org.knowceans.corpus.parsers.igdbib
Class IgdSaxParser

java.lang.Object
  extended by org.xml.sax.helpers.DefaultHandler
      extended by org.knowceans.corpus.parsers.igdbib.IgdSaxParser
All Implemented Interfaces:
org.xml.sax.ContentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler

public class IgdSaxParser
extends org.xml.sax.helpers.DefaultHandler

IgdSaxParser parses an IGD library document corpus from a single pseudo-XML file. This version does not validate the content of tags and is robust against encoding problems. Necessary replacements (Xerces will not accept them): and -> &; in addition, find all invalid unbalanced tags via the regex "\w+ +<\w" and remove them.

Author:
heinrich

Field Summary
private static java.util.Vector<IgdDocument> allDocs
           
private static IgdCategories cat
           
private static IgdDocument curDoc
           
private static boolean inBody
           
private static boolean inSentence
           
private static java.io.Writer out
           
private static java.lang.String prevWord
           
private static Stemmer stem
           
private static StopWordFilter stop
           
private static java.util.Stack<java.lang.String> tagstack
           
 boolean useBigrams
           
 boolean useStemming
           
 
Constructor Summary
IgdSaxParser()
           
 
Method Summary
 void characters(char[] buf, int offset, int len)
           
 void configure(boolean useStemming, boolean useBigrams)
          Configures the parser.
private  void emit(java.lang.String s)
           
 void endElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName)
           
 void error(org.xml.sax.SAXParseException e)
           
 void fatalError(org.xml.sax.SAXParseException e)
           
static void main(java.lang.String[] argv)
           
private  void nl()
           
private  java.util.Vector<IgdDocument> parse(java.lang.String file)
           
private  void parseAuthors(java.lang.String s, java.util.Vector<java.lang.String> authors)
          Parses an author string of the format "surname, givenname (department); surname, ...".
private  void parseKeywords(java.lang.String s, java.util.Vector<java.lang.String> keywords)
          Parses a keyword string of the format "keyword; keyword; ...".
private  void parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
          Parse the given text and add terms to the model.
private  java.lang.String removePunct(java.lang.String s)
          Removes all punctuation.
 void startElement(java.lang.String namespaceURI, java.lang.String sName, java.lang.String qName, org.xml.sax.Attributes attrs)
           
 
Methods inherited from class org.xml.sax.helpers.DefaultHandler
endDocument, endPrefixMapping, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startPrefixMapping, unparsedEntityDecl, warning
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

out

private static java.io.Writer out

curDoc

private static IgdDocument curDoc

inBody

private static boolean inBody

inSentence

private static boolean inSentence

stop

private static StopWordFilter stop

useStemming

public boolean useStemming

useBigrams

public boolean useBigrams

prevWord

private static java.lang.String prevWord

cat

private static IgdCategories cat

stem

private static Stemmer stem

allDocs

private static java.util.Vector<IgdDocument> allDocs

tagstack

private static java.util.Stack<java.lang.String> tagstack
Constructor Detail

IgdSaxParser

public IgdSaxParser()
Parameters:
argv -
Method Detail

main

public static void main(java.lang.String[] argv)

configure

public void configure(boolean useStemming,
                      boolean useBigrams)
Configures the parser.

Parameters:
useStemming - use stemming
useBigrams - use bigrams

parse

private java.util.Vector<IgdDocument> parse(java.lang.String file)

emit

private void emit(java.lang.String s)
           throws org.xml.sax.SAXException
Throws:
org.xml.sax.SAXException

nl

private void nl()
         throws org.xml.sax.SAXException
Throws:
org.xml.sax.SAXException

startElement

public void startElement(java.lang.String namespaceURI,
                         java.lang.String sName,
                         java.lang.String qName,
                         org.xml.sax.Attributes attrs)
                  throws org.xml.sax.SAXException
Specified by:
startElement in interface org.xml.sax.ContentHandler
Overrides:
startElement in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

error

public void error(org.xml.sax.SAXParseException e)
           throws org.xml.sax.SAXException
Specified by:
error in interface org.xml.sax.ErrorHandler
Overrides:
error in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

fatalError

public void fatalError(org.xml.sax.SAXParseException e)
                throws org.xml.sax.SAXException
Specified by:
fatalError in interface org.xml.sax.ErrorHandler
Overrides:
fatalError in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

endElement

public void endElement(java.lang.String namespaceURI,
                       java.lang.String sName,
                       java.lang.String qName)
                throws org.xml.sax.SAXException
Specified by:
endElement in interface org.xml.sax.ContentHandler
Overrides:
endElement in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

characters

public void characters(char[] buf,
                       int offset,
                       int len)
                throws org.xml.sax.SAXException
Specified by:
characters in interface org.xml.sax.ContentHandler
Overrides:
characters in class org.xml.sax.helpers.DefaultHandler
Throws:
org.xml.sax.SAXException

parseKeywords

private void parseKeywords(java.lang.String s,
                           java.util.Vector<java.lang.String> keywords)
Parses a keyword string of the format "keyword; keyword; ...".

Parameters:
s -
keywords -

parseAuthors

private void parseAuthors(java.lang.String s,
                          java.util.Vector<java.lang.String> authors)
Parses an author string of the format "surname, givenname (department); surname, ...".

Parameters:
s -
authors -

parseText

private void parseText(java.lang.String s,
                       java.util.Vector<java.lang.String> words)
Parses the given text and adds terms to the model. Stop-word filtering and stem filtering are located here.

Parameters:
s -

removePunct

private java.lang.String removePunct(java.lang.String s)
Removes all punctuation.

Parameters:
s -
Returns: