|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||
java.lang.Object
org.xml.sax.helpers.DefaultHandler
org.knowceans.corpus.parsers.dpa.DpaSaxParser2
public class DpaSaxParser2
DpaSaxParser parses a DPA document corpus from a single pseudo-XML file. This version does not validate the content of tags and is robust against encoding problems. Necessary replacements (Xerces will not accept the raw forms): replace & with &amp;, and find all invalid unbalanced tags via the regex "\w+ +<\w" and remove them.
This SAX parser adheres to a stricter XML format (an interim version of the converted XML by Fraunhofer AIS). It uses a complete directory tree as a source of separate XML files.
TODO: set the parser working directory to the data files (currently, ../dpa.dtd in the XML files points to the Eclipse workspace).
| Field Summary | |
|---|---|
private java.util.Vector<DpaDocument> |
allDocs
|
private IptcCategories |
cat
|
private DpaDocument |
curDoc
|
private boolean |
inBody
|
private boolean |
inTitle
|
private java.io.Writer |
out
|
private java.lang.String |
prevWord
|
private java.util.Vector<java.lang.String> |
sentenceBuffer
|
private Stemmer |
stem
|
private StopWordFilter |
stop
|
private java.util.Stack<java.lang.String> |
tagstack
|
boolean |
useBigrams
|
boolean |
useMeldungenAsDocuments
|
private boolean |
usePars
TODO: paragraph processing does not check for "non-paragraph" content, i.e. it crashes for paragraphIndex.size() == 0. --> disabled; not used for experiments, anyway. |
boolean |
useSentencesAsDocuments
|
boolean |
useStemming
|
boolean |
useUnigrams
|
private boolean |
withinSentence
|
| Constructor Summary | |
|---|---|
DpaSaxParser2()
|
|
DpaSaxParser2(java.lang.String stoplist)
|
|
| Method Summary | |
|---|---|
private void |
aggregateSentences(java.lang.String s)
Adds text to the sentenceBuffer. |
void |
characters(char[] buf,
int offset,
int len)
|
private void |
checkParagraph(java.lang.String s)
Checks if a new paragraph is |
void |
configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams,
boolean meldungenAsDocs,
boolean sentencesAsDocs)
Configures the parser. |
private void |
emit(java.lang.String s)
|
void |
endElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName)
|
void |
error(org.xml.sax.SAXParseException e)
|
void |
fatalError(org.xml.sax.SAXParseException e)
|
static void |
main(java.lang.String[] argv)
|
private void |
nl()
|
private java.util.Vector<DpaDocument> |
parse(java.lang.String file)
|
private java.util.Vector<DpaDocument> |
parseDir(java.lang.String sourcefile)
Parse directory by adding each XML file's content sequentially. |
private int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
private java.lang.String |
replaceAbbreviations(java.lang.String s)
|
void |
startElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName,
org.xml.sax.Attributes attrs)
Opening-tag callback for the new DPA dataset (2000). |
| Methods inherited from class org.xml.sax.helpers.DefaultHandler |
|---|
endDocument, endPrefixMapping, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startPrefixMapping, unparsedEntityDecl, warning |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
|---|
private java.io.Writer out
private DpaDocument curDoc
private boolean inBody
private boolean inTitle
private boolean withinSentence
private StopWordFilter stop
public boolean useStemming
public boolean useBigrams
public boolean useUnigrams
public boolean useMeldungenAsDocuments
public boolean useSentencesAsDocuments
private java.lang.String prevWord
private IptcCategories cat
private Stemmer stem
private java.util.Vector<DpaDocument> allDocs
private java.util.Stack<java.lang.String> tagstack
private java.util.Vector<java.lang.String> sentenceBuffer
private boolean usePars
| Constructor Detail |
|---|
public DpaSaxParser2()
argv -
public DpaSaxParser2(java.lang.String stoplist)
argv -
| Method Detail |
|---|
public static void main(java.lang.String[] argv)
public void configure(boolean useStemming,
boolean useUnigrams,
boolean useBigrams,
boolean meldungenAsDocs,
boolean sentencesAsDocs)
useStemming - use stemming
useUnigrams - use unigrams
useBigrams - use bigrams
sentencesAsDocs -
meldungenAsDocs -
private java.util.Vector<DpaDocument> parse(java.lang.String file)
private java.util.Vector<DpaDocument> parseDir(java.lang.String sourcefile)
sourcefile -
private void emit(java.lang.String s)
throws org.xml.sax.SAXException
org.xml.sax.SAXException
private void nl()
throws org.xml.sax.SAXException
org.xml.sax.SAXException
public void startElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName,
org.xml.sax.Attributes attrs)
throws org.xml.sax.SAXException
startElement in interface org.xml.sax.ContentHandler
startElement in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void endElement(java.lang.String namespaceURI,
java.lang.String sName,
java.lang.String qName)
throws org.xml.sax.SAXException
endElement in interface org.xml.sax.ContentHandler
endElement in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void characters(char[] buf,
int offset,
int len)
throws org.xml.sax.SAXException
characters in interface org.xml.sax.ContentHandler
characters in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
private void aggregateSentences(java.lang.String s)
s -
private void checkParagraph(java.lang.String s)
s -
private int parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
s -
private java.lang.String replaceAbbreviations(java.lang.String s)
s -
private java.lang.String removePunct(java.lang.String s)
s -
public void error(org.xml.sax.SAXParseException e)
throws org.xml.sax.SAXException
error in interface org.xml.sax.ErrorHandler
error in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
public void fatalError(org.xml.sax.SAXParseException e)
throws org.xml.sax.SAXException
fatalError in interface org.xml.sax.ErrorHandler
fatalError in class org.xml.sax.helpers.DefaultHandler
org.xml.sax.SAXException
|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||