org.knowceans.corpus.xpert
Class XptDb2Corpus

java.lang.Object
  extended by org.knowceans.corpus.xpert.XptDb2Corpus

public final class XptDb2Corpus
extends java.lang.Object

Author:
Gregor Heinrich

Field Summary
private static java.util.HashMap<java.lang.Integer,XptAuthor> allAuthors
           
private static java.util.Vector<XptDocument> allDocs
           
private static IgdCategories cat
           
private  java.sql.Connection con
           
private static XptDocument curDoc
           
private static boolean inSentence
           
private static java.io.Writer out
           
private static java.lang.String prevWord
           
private static Stemmer stem
           
private static StopWordFilter stop
           
 boolean useBigrams
           
 boolean useStemming
           
 
Constructor Summary
XptDb2Corpus(java.lang.String modelFile)
          get properties for indexing and create database connection.
 
Method Summary
 void configure(boolean useStemming, boolean useBigrams)
          configure the db reader.
 java.lang.String convertToEntities(java.lang.String in)
           
 java.lang.String convertToUnicode(java.lang.String in)
           
 XptAuthor getAuthorData(int id)
          returns a single XptAuthor for the id.
 java.util.Vector<java.lang.Integer> getAuthorIds()
          gets all valid author ids from the database.
 XptDocument getProjectData(int pid)
          returns a single XptDocument for the project id.
 java.util.Vector<java.lang.Integer> getProjectIds()
          gets all valid project ids from the database.
static void main(java.lang.String[] args)
           
private  void parseText(java.lang.String s, java.util.Vector<java.lang.String> words)
          Parse the given text and add terms to the model.
private  void read()
           
private  java.lang.String removePunct(java.lang.String s)
          Remove all punctuation
private  AuthorTermCorpus toCorpus()
           
private  void write(AuthorTermCorpus f, java.lang.String corpusname)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

con

private java.sql.Connection con

out

private static java.io.Writer out

curDoc

private static XptDocument curDoc

inSentence

private static boolean inSentence

stop

private static StopWordFilter stop

useStemming

public boolean useStemming

useBigrams

public boolean useBigrams

prevWord

private static java.lang.String prevWord

cat

private static IgdCategories cat

stem

private static Stemmer stem

allDocs

private static java.util.Vector<XptDocument> allDocs

allAuthors

private static java.util.HashMap<java.lang.Integer,XptAuthor> allAuthors
Constructor Detail

XptDb2Corpus

public XptDb2Corpus(java.lang.String modelFile)
get properties for indexing and create database connection.

Method Detail

main

public static void main(java.lang.String[] args)

configure

public void configure(boolean useStemming,
                      boolean useBigrams)
configure the db reader.

Parameters:
useStemming - use stemming
useBigrams - use bigrams

read

private void read()

write

private void write(AuthorTermCorpus f,
                   java.lang.String corpusname)

toCorpus

private AuthorTermCorpus toCorpus()

getProjectIds

public java.util.Vector<java.lang.Integer> getProjectIds()
gets all valid project ids from the database.

Returns:
a vector of all valid project ids

getAuthorIds

public java.util.Vector<java.lang.Integer> getAuthorIds()
gets all valid author ids from the database.

Returns:
a vector of all valid author ids

getProjectData

public XptDocument getProjectData(int pid)
returns a single XptDocument for the project id.

Parameters:
pid - the project id
Throws:
java.sql.SQLException

getAuthorData

public XptAuthor getAuthorData(int id)
returns a single XptAuthor for the id.

Parameters:
id - the author id
Throws:
java.sql.SQLException

parseText

private void parseText(java.lang.String s,
                       java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. Stop-word filtering and stemming are performed here.

Parameters:
s - the text to parse

removePunct

private java.lang.String removePunct(java.lang.String s)
Remove all punctuation

Parameters:
s - the string to remove punctuation from
Returns:
the string with all punctuation removed

convertToEntities

public java.lang.String convertToEntities(java.lang.String in)

convertToUnicode

public java.lang.String convertToUnicode(java.lang.String in)