|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||
java.lang.Object
org.xml.sax.helpers.DefaultHandler
org.knowceans.corpus.parsers.TxtParser
public class TxtParser
TxtParser parses a set of plain text files into a TextCorpus, one file per document. This implementation uses the "old" parser code rather than Lucene or similar libraries.
| Field Summary | |
|---|---|
private java.util.Vector<SimpleDocument> |
allDocs
|
private java.io.BufferedWriter |
bw
|
private int |
nr
|
private java.lang.String |
prevWord
|
private Stemmer |
stem
|
private StopWordFilter |
stop
|
boolean |
useBigrams
|
boolean |
useStemming
|
boolean |
useUnigrams
|
private java.lang.String |
xmlfile
|
| Constructor Summary | |
|---|---|
TxtParser()
|
|
TxtParser(java.lang.String stoplist)
|
|
| Method Summary | |
|---|---|
private void |
closeOutfile()
|
void |
configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams)
configure the parser. |
private boolean |
isValid(java.lang.String sourcefile)
True for txt files. |
static void |
main(java.lang.String[] argv)
|
private void |
openOutfile()
|
java.util.Vector<SimpleDocument> |
parse(java.lang.String file,
int mindl)
opens the file and parses the content as one document |
private java.util.Vector<SimpleDocument> |
parseDir(java.lang.String sourcefile,
int mindl)
Parse a directory by adding each text file's content sequentially. |
private void |
parseString(java.lang.String file,
java.lang.String string,
int mindl)
parses the string |
private int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
private void |
setXmlOutput(java.lang.String xmlfile)
|
private void |
writeText(java.lang.String file,
int id,
java.lang.String text)
|
| Methods inherited from class org.xml.sax.helpers.DefaultHandler |
|---|
characters, endDocument, endElement, endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startElement, startPrefixMapping, unparsedEntityDecl, warning |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
|---|
private StopWordFilter stop
public boolean useStemming
public boolean useBigrams
public boolean useUnigrams
private java.lang.String prevWord
private int nr
private Stemmer stem
private java.util.Vector<SimpleDocument> allDocs
private java.io.BufferedWriter bw
private java.lang.String xmlfile
| Constructor Detail |
|---|
public TxtParser()
argv -
public TxtParser(java.lang.String stoplist)
argv -
| Method Detail |
|---|
public static void main(java.lang.String[] argv)
private void setXmlOutput(java.lang.String xmlfile)
public void configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams)
useStemming - use stemming
useUnigrams - use unigrams
useBigrams - use bigrams
sentencesAsDocs -
meldungenAsDocs -
private java.util.Vector<SimpleDocument> parseDir(java.lang.String sourcefile,
int mindl)
sourcefile -
mindl - minimum doc length
private boolean isValid(java.lang.String sourcefile)
sourcefile -
public java.util.Vector<SimpleDocument> parse(java.lang.String file,
int mindl)
mindl - minimum required document length
private void parseString(java.lang.String file,
java.lang.String string,
int mindl)
string -
mindl -
private void openOutfile()
private void writeText(java.lang.String file,
int id,
java.lang.String text)
private void closeOutfile()
private int parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
s -
private java.lang.String removePunct(java.lang.String s)
s -
|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||