public class HtmlFilter extends AbstractMarkupFilter
SUB_FILTER
Constructor and Description |
---|
HtmlFilter() |
Modifier and Type | Method and Description |
---|---|
void |
close()
Close the filter and all used resources.
|
protected PropertyTextUnitPlaceholder |
createPropertyTextUnitPlaceholder(PropertyTextUnitPlaceholder.PlaceholderAccessType type,
java.lang.String name,
java.lang.String value,
Tag tag,
Attribute attribute)
|
ISkeletonWriter |
createSkeletonWriter()
Creates a new ISkeletonWriter object that corresponds to the type of skeleton
this filter uses.
|
protected TextFragment.TagType |
determineTagType(Tag tag)
Filter specific method for determining
TextFragment.TagType |
protected void |
endFilter()
End the current filter processing and send the
Ending Event |
protected TaggedFilterConfiguration |
getConfig()
Get the current
TaggedFilterConfiguration . |
Parameters |
getParameters()
Gets the current parameters for this filter.
|
ExtractionRuleState.ExtractionRule |
getRuleTypeFromStartTag(EndTag endTag,
java.util.EnumSet<TaggedFilterConfiguration.RULE_TYPE> ruleTypes) |
protected java.lang.String |
normalizeAttributeName(java.lang.String attrName,
java.lang.String attrValue,
Tag tag)
Some attribute names are converted to Okapi standards, such as HTML charset to "encoding" and lang to "language".
|
void |
open(RawDocument input,
boolean generateSkeleton)
Start a new
IFilter using the supplied RawDocument . |
protected void |
preProcess(Segment segment)
Do any handling needed before the current Segment is processed.
|
void |
setParameters(IParameters params)
Sets new parameters for this filter.
|
void |
setParametersFromFile(java.io.File config)
Initialize filter parameters from a Java File.
|
void |
setParametersFromString(java.lang.String config)
Initialize filter parameters from a String.
|
void |
setParametersFromURL(java.net.URL config)
Initialize filter parameters from a URL.
|
protected void |
startFilter()
Initialize rule state and parser.
|
addCodeToCurrentTextUnit, addCodeToCurrentTextUnit, addToDocumentPart, addToTextUnit, addToTextUnit, addToTextUnit, addToTextUnit, canStartNewTextUnit, createEventBuilder, createPropertyTextUnitPlaceholders, detectEncoding, disambiguateElementRuleTypes, endDocumentPart, endGroup, endTextUnit, getCurrentDocName, getEventBuilder, getMainAttributeRule, getMainElementRule, getParsedHeader, getRuleState, getTextUnitId, handleCdataSection, handleCharacterEntity, handleComment, handleDocTypeDeclaration, handleDocumentPart, handleEndTag, handleNumericEntity, handleProcessingInstruction, handleServerCommon, handleServerCommonEscaped, handleStartTag, handleText, handleXmlDeclaration, hasNext, isBOM, isDocumentEncoding, isInline, isInsideTextRun, isMatchedTag, isPreserveWhitespace, isUtf8Bom, isUtf8Encoding, isWhiteSpace, next, open, peekTempEvent, postProcessTextUnit, setCurrentDocName, setDocumentPartId, setMimeType, setPreserveWhitespace, setTextUnitMimeType, setTextUnitName, setTextUnitType, startDocumentPart, startGroup, startGroup, startTextUnit, startTextUnit, startTextUnit, startTextUnit, updateEndTagRuleState, updateStartTagRuleState
addConfiguration, addConfiguration, addConfiguration, addConfigurations, cancel, createEndFilterEvent, createFilterWriter, createStartFilterEvent, findConfiguration, getConfiguration, getConfigurations, getDisplayName, getDocumentId, getDocumentName, getEncoderManager, getEncoding, getFilterConfigurationMapper, getFilterWriter, getMimeType, getName, getNewlineType, getParameters, getParametersClassName, getParentId, getSrcLoc, getTrgLoc, isCanceled, isGenerateSkeleton, isMultilingual, removeConfiguration, setDisplayName, setDocumentName, setEncoding, setFilterConfigurationMapper, setFilterWriter, setGenerateSkeleton, setMultilingual, setName, setNewlineType, setOptions, setParentId, setSrcLoc, setTrgLoc
public ISkeletonWriter createSkeletonWriter()
IFilter
createSkeletonWriter
in interface IFilter
createSkeletonWriter
in class AbstractFilter
public void open(RawDocument input, boolean generateSkeleton)
AbstractMarkupFilter
Start a new IFilter using the supplied RawDocument.
public void close()
AbstractMarkupFilter
close
in interface java.lang.AutoCloseable
close
in interface IFilter
close
in class AbstractMarkupFilter
protected void startFilter()
startFilter
in class AbstractMarkupFilter
protected void endFilter()
End the current filter processing and send the Ending Event.
endFilter
in class AbstractMarkupFilter
protected void preProcess(Segment segment)
AbstractMarkupFilter
preProcess
in class AbstractMarkupFilter
public ExtractionRuleState.ExtractionRule getRuleTypeFromStartTag(EndTag endTag, java.util.EnumSet<TaggedFilterConfiguration.RULE_TYPE> ruleTypes)
getRuleTypeFromStartTag
in class AbstractMarkupFilter
protected PropertyTextUnitPlaceholder createPropertyTextUnitPlaceholder(PropertyTextUnitPlaceholder.PlaceholderAccessType type, java.lang.String name, java.lang.String value, Tag tag, Attribute attribute)
AbstractMarkupFilter
createPropertyTextUnitPlaceholder
in class AbstractMarkupFilter
type - PropertyTextUnitPlaceholder.PlaceholderAccessType; is one of TRANSLATABLE, READ_ONLY_PROPERTY, WRITABLE_PROPERTY
name - attribute name
value - attribute value
tag - Jericho Tag which contains the attribute
attribute - attribute as a Jericho Attribute
PropertyTextUnitPlaceholder representing the attribute
protected java.lang.String normalizeAttributeName(java.lang.String attrName, java.lang.String attrValue, Tag tag)
AbstractMarkupFilter
normalizeAttributeName
in class AbstractMarkupFilter
attrName - the attribute name
attrValue - the attribute value
tag - the Jericho Tag that contains the attribute
protected TaggedFilterConfiguration getConfig()
AbstractMarkupFilter
Get the current TaggedFilterConfiguration. A TaggedFilterConfiguration is the result of reading in a YAML
configuration file and converting it into Java Objects.
getConfig
in class AbstractMarkupFilter
TaggedFilterConfiguration
public void setParameters(IParameters params)
IFilter
setParameters
in interface IFilter
setParameters
in class AbstractFilter
params - The new parameters to use.
public Parameters getParameters()
IFilter
getParameters
in interface IFilter
getParameters
in class AbstractFilter
public void setParametersFromURL(java.net.URL config)
config -
public void setParametersFromFile(java.io.File config)
config -
public void setParametersFromString(java.lang.String config)
config -
protected TextFragment.TagType determineTagType(Tag tag)
AbstractMarkupFilter
TextFragment.TagType
determineTagType
in class AbstractMarkupFilter
tag - Jericho Tag start or end tag
TextFragment.TagType
Copyright © 2022. All rights reserved.