|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.xml.sax.helpers.DefaultHandler org.knowceans.corpus.parsers.epg.EpgParser
public class EpgParser
EpgParser parses an EPG HTML file into a TextCorpus.
Field Summary | |
---|---|
private java.util.Vector<EpgDocument> |
allDocs
|
(package private) static java.lang.String[] |
blacklist
Documents containing one of these terms are invalid. |
private java.io.BufferedWriter |
bw
|
private EpgDocument |
curDoc
|
(package private) static java.lang.String |
htmlPattern
|
private int |
nr
|
private java.lang.String |
prevWord
|
private Stemmer |
stem
|
private StopWordFilter |
stop
|
boolean |
useBigrams
|
boolean |
useStemming
|
boolean |
useUnigrams
|
private java.lang.String |
xmlfile
|
(package private) static java.lang.String |
xmlPattern
|
Constructor Summary | |
---|---|
EpgParser()
|
|
EpgParser(java.lang.String stoplist)
|
Method Summary | |
---|---|
private boolean |
blackList(java.lang.String string)
|
private void |
closeOutfile()
|
void |
configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams)
Configures the parser. |
private boolean |
isValid(int mindl)
|
static void |
main(java.lang.String[] argv)
|
private void |
openOutfile()
|
java.util.Vector<EpgDocument> |
parse(java.lang.String file,
int mindl)
Opens the file and parses its content. |
private java.util.Vector<EpgDocument> |
parseDir(java.lang.String sourcefile,
int mindl)
Parse directory by adding each XML file's content sequentially. |
private java.util.Vector<EpgDocument> |
parseString(java.lang.String string,
int mindl,
boolean html)
Parses the string. |
private int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
private void |
setXmlOutput(java.lang.String xmlfile)
|
private void |
writeText(java.lang.String title,
java.lang.String subtitle,
java.lang.String body)
|
Methods inherited from class org.xml.sax.helpers.DefaultHandler |
---|
characters, endDocument, endElement, endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startElement, startPrefixMapping, unparsedEntityDecl, warning |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
static java.lang.String htmlPattern
static java.lang.String xmlPattern
static java.lang.String[] blacklist
private EpgDocument curDoc
private StopWordFilter stop
public boolean useStemming
public boolean useBigrams
public boolean useUnigrams
private java.lang.String prevWord
private int nr
private Stemmer stem
private java.util.Vector<EpgDocument> allDocs
private java.io.BufferedWriter bw
private java.lang.String xmlfile
Constructor Detail |
---|
public EpgParser()
argv
-
public EpgParser(java.lang.String stoplist)
argv
-
Method Detail |
---|
public static void main(java.lang.String[] argv)
private void setXmlOutput(java.lang.String xmlfile)
public void configure(boolean useStemming, boolean useUnigrams, boolean useBigrams)
useStemming
- use stemming
useUnigrams
- use unigrams
useBigrams
- use bigrams
sentencesAsDocs
-
meldungenAsDocs
-
public java.util.Vector<EpgDocument> parse(java.lang.String file, int mindl)
mindl
- minimum required document length
private java.util.Vector<EpgDocument> parseDir(java.lang.String sourcefile, int mindl)
sourcefile
-
mindl
- minimum doc length
private java.util.Vector<EpgDocument> parseString(java.lang.String string, int mindl, boolean html)
string
-
mindl
-
html
-
private boolean blackList(java.lang.String string)
private boolean isValid(int mindl)
private void openOutfile()
private void writeText(java.lang.String title, java.lang.String subtitle, java.lang.String body)
private void closeOutfile()
private int parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
s
-
private java.lang.String removePunct(java.lang.String s)
s
-
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |