org.knowceans.corpus.parsers.dpa
Class DpaSaxParser

java.lang.Object
  extended by org.xml.sax.helpers.DefaultHandler
      extended by org.knowceans.corpus.parsers.dpa.DpaSaxParser
All Implemented Interfaces:
org.xml.sax.ContentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler

public class DpaSaxParser
extends org.xml.sax.helpers.DefaultHandler

DpaSaxParser parses a DPA document corpus from a single pseudo-XML file. This version does not validate the content of tags and is robust against encoding problems. Necessary replacements (Xerces will not accept the file otherwise): & -> &amp;; in addition, find all invalid unbalanced tags via the regex "\w+ +<\w" and remove them.

Author:
heinrich

Field Summary
private  java.util.Vector<DpaDocument> allDocs
           
private  DpaCategories cat
           
private  DpaDocument curDoc
           
private  boolean inBody
           
private  java.io.Writer out
           
private  java.lang.String prevWord
           
private  java.util.Vector<java.lang.String> sentenceBuffer
           
private  Stemmer stem
           
private  StopWordFilter stop
           
private  java.util.Stack<java.lang.String> tagstack
           
 boolean useBigrams
           
 boolean useMeldungenAsDocuments
           
 boolean useSentencesAsDocuments
           
 boolean useStemming
           
 boolean useUnigrams
           
private  boolean withinSentence
           
 
Constructor Summary
DpaSaxParser()
           
DpaSaxParser(java.lang.String stoplist)
           
 
Method Summary
private  void aggregateSentences(java.lang.String s)
          Adds text to the sentenceBuffer.
 void characters(char[] buf, int offset, int len)
           
 void configure(boolean useStemming, boolean useUnigrams, boolean useBigrams, boolean meldungenAsDocs, boolean sentencesAsDocs)
          configure the parser.
private  void emit(java.lang.String s)
           
 void endElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName)
           
 void error(org.xml.sax.SAXParseException e)
           
 void fatalError(org.xml.sax.SAXParseException e)
           
static void main(java.lang.String[] argv)
           
private  void nl()
           
private  java.util.Vector<DpaDocument> parse(java.lang.String file)
           
private  int parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
          Parse the given text and add terms to the model.
private  java.lang.String removePunct(java.lang.String s)
          Remove all punctuation
private  java.lang.String replaceAbbreviations(java.lang.String s)
           
 void startElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName, org.xml.sax.Attributes attrs)
           
 
Methods inherited from class org.xml.sax.helpers.DefaultHandler
endDocument, endPrefixMapping, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startPrefixMapping, unparsedEntityDecl, warning
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

out

private java.io.Writer out

curDoc

private DpaDocument curDoc

inBody

private boolean inBody

withinSentence

private boolean withinSentence

stop

private StopWordFilter stop

useStemming

public boolean useStemming

useBigrams

public boolean useBigrams

useUnigrams

public boolean useUnigrams

useMeldungenAsDocuments

public boolean useMeldungenAsDocuments

useSentencesAsDocuments

public boolean useSentencesAsDocuments

prevWord

private java.lang.String prevWord

cat

private DpaCategories cat

stem

private Stemmer stem

allDocs

private java.util.Vector<DpaDocument> allDocs

tagstack

private java.util.Stack<java.lang.String> tagstack

sentenceBuffer

private java.util.Vector<java.lang.String> sentenceBuffer
Constructor Detail

DpaSaxParser

public DpaSaxParser()
This constructor takes no parameters.

DpaSaxParser

public DpaSaxParser(java.lang.String stoplist)
Parameters:
stoplist - the stop word list to use (presumably a file name or path; confirm against the source)
Method Detail

main

public static void main(java.lang.String[] argv)

configure

public void configure(boolean useStemming,
                      boolean useUnigrams,
                      boolean useBigrams,
                      boolean meldungenAsDocs,
                      boolean sentencesAsDocs)
configure the parser.

Parameters:
useStemming - use stemming
useUnigrams - use unigrams
useBigrams - use bigrams
sentencesAsDocs - use sentences as documents
meldungenAsDocs - use Meldungen (German: news reports) as documents

parse

private java.util.Vector<DpaDocument> parse(java.lang.String file)

emit

private void emit(java.lang.String s)
           throws org.xml.sax.SAXException
Throws:
org.xml.sax.SAXException

nl

private void nl()
         throws org.xml.sax.SAXException
Throws:
org.xml.sax.SAXException

startElement

public void startElement(java.lang.String namespaceURI,
                         java.lang.String sName,
                         java.lang.String qName,
                         org.xml.sax.Attributes attrs)
                  throws org.xml.sax.SAXException
Specified by:
startElement in interface org.xml.sax.ContentHandler
Overrides:
startElement in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

endElement

public void endElement(java.lang.String namespaceURI,
                       java.lang.String sName,
                       java.lang.String qName)
                throws org.xml.sax.SAXException
Specified by:
endElement in interface org.xml.sax.ContentHandler
Overrides:
endElement in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

characters

public void characters(char[] buf,
                       int offset,
                       int len)
                throws org.xml.sax.SAXException
Specified by:
characters in interface org.xml.sax.ContentHandler
Overrides:
characters in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

aggregateSentences

private void aggregateSentences(java.lang.String s)
Adds text to the sentenceBuffer. Removes dots from abbreviations and splits at sentence full stops.

Parameters:
s - the text to add to the sentenceBuffer

parseText

private int parseText(java.lang.String s,
                      java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. Stop-word filtering and stemming are applied here.

Parameters:
s - the text to parse
words - the vector to which the terms are added
Returns:
number of terms added to words.

replaceAbbreviations

private java.lang.String replaceAbbreviations(java.lang.String s)
Parameters:
s -
Returns:

removePunct

private java.lang.String removePunct(java.lang.String s)
Remove all punctuation

Parameters:
s - the string from which punctuation is removed
Returns:
the string with all punctuation removed

error

public void error(org.xml.sax.SAXParseException e)
           throws org.xml.sax.SAXException
Specified by:
error in interface org.xml.sax.ErrorHandler
Overrides:
error in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

fatalError

public void fatalError(org.xml.sax.SAXParseException e)
                throws org.xml.sax.SAXException
Specified by:
fatalError in interface org.xml.sax.ErrorHandler
Overrides:
fatalError in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException