|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.xml.sax.helpers.DefaultHandler org.knowceans.corpus.parsers.reuters.ReutersParser
public class ReutersParser
ReutersParser parses the Reuters-21578 dataset into a TextCorpus.
TODO: not completed.
Field Summary | |
---|---|
private java.util.Vector<ReutersDocument> |
allDocs
|
private ReutersDocument |
curDoc
|
private int |
nr
|
private java.lang.String |
prevWord
|
private Stemmer |
stem
|
private StopWordFilter |
stop
|
boolean |
useBigrams
|
boolean |
useStemming
|
boolean |
useUnigrams
|
Constructor Summary | |
---|---|
ReutersParser()
|
|
ReutersParser(java.lang.String stoplist)
|
Method Summary | |
---|---|
void |
configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams)
configure the parser. |
static void |
main(java.lang.String[] argv)
|
java.util.Vector<ReutersDocument> |
parse(java.lang.String file)
opens the file and parses the content |
private java.util.Vector<ReutersDocument> |
parseDir(java.lang.String sourcefile)
Parse directory by adding each XML file's content sequentially. |
private java.util.Vector<ReutersDocument> |
parseString(java.lang.String r)
parses the string |
private int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
Methods inherited from class org.xml.sax.helpers.DefaultHandler |
---|
characters, endDocument, endElement, endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startElement, startPrefixMapping, unparsedEntityDecl, warning |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
private ReutersDocument curDoc
private StopWordFilter stop
public boolean useStemming
public boolean useBigrams
public boolean useUnigrams
private java.lang.String prevWord
private int nr
private Stemmer stem
private java.util.Vector<ReutersDocument> allDocs
Constructor Detail |
---|
public ReutersParser()
argv
-
public ReutersParser(java.lang.String stoplist)
argv
-
Method Detail |
---|
public static void main(java.lang.String[] argv)
public void configure(boolean useStemming, boolean useUnigrams, boolean useBigrams)
useStemming
- use stemming
useUnigrams
- use unigrams
useBigrams
- use bigrams
sentencesAsDocs
-
meldungenAsDocs
-
public java.util.Vector<ReutersDocument> parse(java.lang.String file)
file
-
private java.util.Vector<ReutersDocument> parseDir(java.lang.String sourcefile)
sourcefile
-
private java.util.Vector<ReutersDocument> parseString(java.lang.String r)
r
-
private int parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
s
-
private java.lang.String removePunct(java.lang.String s)
s
-
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |