org.knowceans.corpus.parsers.epg
Class EpgParser

java.lang.Object
  extended by org.xml.sax.helpers.DefaultHandler
      extended by org.knowceans.corpus.parsers.epg.EpgParser
All Implemented Interfaces:
org.xml.sax.ContentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler

public class EpgParser
extends org.xml.sax.helpers.DefaultHandler

EpgParser parses an EPG HTML file into a TextCorpus.

Author:
heinrich

Field Summary
private  java.util.Vector<EpgDocument> allDocs
           
(package private) static java.lang.String[] blacklist
          Documents containing one of these terms are invalid.
private  java.io.BufferedWriter bw
           
private  EpgDocument curDoc
           
(package private) static java.lang.String htmlPattern
           
private  int nr
           
private  java.lang.String prevWord
           
private  Stemmer stem
           
private  StopWordFilter stop
           
 boolean useBigrams
           
 boolean useStemming
           
 boolean useUnigrams
           
private  java.lang.String xmlfile
           
(package private) static java.lang.String xmlPattern
           
 
Constructor Summary
EpgParser()
           
EpgParser(java.lang.String stoplist)
           
 
Method Summary
private  boolean blackList(java.lang.String string)
           
private  void closeOutfile()
           
 void configure(boolean useStemming, boolean useUnigrams, boolean useBigrams)
          Configures the parser.
private  boolean isValid(int mindl)
           
static void main(java.lang.String[] argv)
           
private  void openOutfile()
           
 java.util.Vector<EpgDocument> parse(java.lang.String file, int mindl)
          Opens the file and parses its content.
private  java.util.Vector<EpgDocument> parseDir(java.lang.String sourcefile, int mindl)
          Parse directory by adding each XML file's content sequentially.
private  java.util.Vector<EpgDocument> parseString(java.lang.String string, int mindl, boolean html)
          Parses the string.
private  int parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
          Parse the given text and add terms to the model.
private  java.lang.String removePunct(java.lang.String s)
          Remove all punctuation
private  void setXmlOutput(java.lang.String xmlfile)
           
private  void writeText(java.lang.String title, java.lang.String subtitle, java.lang.String body)
           
 
Methods inherited from class org.xml.sax.helpers.DefaultHandler
characters, endDocument, endElement, endPrefixMapping, error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, skippedEntity, startDocument, startElement, startPrefixMapping, unparsedEntityDecl, warning
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

htmlPattern

static java.lang.String htmlPattern

xmlPattern

static java.lang.String xmlPattern

blacklist

static java.lang.String[] blacklist
Documents containing one of these terms are invalid.


curDoc

private EpgDocument curDoc

stop

private StopWordFilter stop

useStemming

public boolean useStemming

useBigrams

public boolean useBigrams

useUnigrams

public boolean useUnigrams

prevWord

private java.lang.String prevWord

nr

private int nr

stem

private Stemmer stem

allDocs

private java.util.Vector<EpgDocument> allDocs

bw

private java.io.BufferedWriter bw

xmlfile

private java.lang.String xmlfile
Constructor Detail

EpgParser

public EpgParser()
Parameters:
(none - this constructor takes no arguments; the original comment named a nonexistent parameter argv)

EpgParser

public EpgParser(java.lang.String stoplist)
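Parameters:
stoplist - the stop-word list to use (presumably a file name; the original comment named a nonexistent parameter argv)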
Method Detail

main

public static void main(java.lang.String[] argv)

setXmlOutput

private void setXmlOutput(java.lang.String xmlfile)

configure

public void configure(boolean useStemming,
                      boolean useUnigrams,
                      boolean useBigrams)
Configures the parser.

Parameters:
useStemming - use stemming
useUnigrams - use unigrams
useBigrams - use bigrams
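sentencesAsDocs - (stale entry: not a parameter of the current method signature)
meldungenAsDocs - (stale entry: not a parameter of the current method signature)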

parse

public java.util.Vector<EpgDocument> parse(java.lang.String file,
                                           int mindl)
Opens the file and parses its content.

Parameters:
mindl - minimum required document length
Returns:

parseDir

private java.util.Vector<EpgDocument> parseDir(java.lang.String sourcefile,
                                               int mindl)
Parses a directory by adding each XML file's content sequentially. Document IDs are taken from the docId tag inside each XML document.

Parameters:
sourcefile -
mindl - minimum doc length
Returns:

parseString

private java.util.Vector<EpgDocument> parseString(java.lang.String string,
                                                  int mindl,
                                                  boolean html)
Parses the string.

Parameters:
string -
mindl -
html -
Returns:

blackList

private boolean blackList(java.lang.String string)

isValid

private boolean isValid(int mindl)

openOutfile

private void openOutfile()

writeText

private void writeText(java.lang.String title,
                       java.lang.String subtitle,
                       java.lang.String body)

closeOutfile

private void closeOutfile()

parseText

private int parseText(java.lang.String s,
                      java.util.Vector<java.lang.String> words)
Parses the given text and adds terms to the model. Stop-word filtering and stemming are performed here.

Parameters:
s - the text to parse
words - the vector to which the extracted terms are added
Returns:
number of terms added to words.

removePunct

private java.lang.String removePunct(java.lang.String s)
Remove all punctuation

Parameters:
s -
Returns: