|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||
java.lang.Object
org.knowceans.corpus.parsers.nips.NipsExtractor
public class NipsExtractor
NipsExtractor uses the XML and BIB files downloaded by NipsDownload to extract abstracts and content of the papers and convert them to an SVMlight-like corpus file.
TODO: this is probably my worst code in years, under the excuse of "lack of time". Restructure and improve!
| Nested Class Summary | |
|---|---|
(package private) class |
NipsExtractor.Entry
|
| Field Summary | |
|---|---|
private boolean |
abstractsonly
|
private AmqCorpus |
amq
|
(package private) java.lang.String[] |
bibkeySourcePatterns
|
(package private) java.lang.String[] |
dirsAndBib
|
(package private) java.lang.String[] |
fileDestPatterns
|
(package private) java.lang.String |
fileroot
|
private boolean |
includerefs
|
private java.lang.String |
prevWord
|
private EnStemmer |
stem
|
private StopWordFilter |
stop
|
boolean |
useBigrams
|
boolean |
useStemming
|
boolean |
useUnigrams
|
| Constructor Summary | |
|---|---|
NipsExtractor()
standard extractor initialises normalisation filters (stoplist and stemmer) |
|
NipsExtractor(java.lang.String stoplist,
boolean stem2,
boolean abstractsonly,
boolean includerefs)
|
|
| Method Summary | |
|---|---|
private java.lang.String |
clean(java.lang.String s)
cleans the string of the most common LaTeX special European characters. |
private AmqCorpus |
createCorpus(java.util.Vector<NipsDocument> docs,
int mindf,
int mintf)
take a parsed NipsDocument and add its entries to the corpus. |
private java.util.Vector<java.lang.String> |
getAuthors(java.lang.String entry)
get the authors from a bibentry |
private java.lang.String |
getTitle(java.lang.String entry)
|
static void |
main(java.lang.String[] args)
|
private void |
normaliseAuthors(java.util.Vector<java.lang.String> authors)
changes all authors to a canonical name, i.e., given names are changed to uppercase initials. |
private NipsExtractor.Entry |
parseBibEntry(java.lang.String s,
int i)
parse a bibentry |
java.util.Map<java.lang.String,NipsExtractor.Entry> |
parseBibtex()
parse the bibtex files and fill the reading map |
private java.util.Vector<NipsDocument> |
parseMap(java.util.Map<java.lang.String,NipsExtractor.Entry> map)
|
private void |
parseTerms(java.util.Vector<NipsDocument> docs,
boolean abstractsOnly)
Convert the NipsDocument, which contains only sections in each of the vector elements, into one that has terms in them and a section index. |
int |
parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
Parse the given text and add terms to the model. |
private java.lang.String |
removePunct(java.lang.String s)
Remove all punctuation |
private java.lang.String |
replaceAbbreviations(java.lang.String s)
|
void |
run(java.lang.String fileroot,
boolean abstractsonly,
int mindf,
int mintf)
|
void |
save(java.lang.String corpusname)
|
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
|---|
private StopWordFilter stop
private EnStemmer stem
java.lang.String fileroot
java.lang.String[] dirsAndBib
java.lang.String[] bibkeySourcePatterns
java.lang.String[] fileDestPatterns
public boolean useStemming
public boolean useBigrams
public boolean useUnigrams
private java.lang.String prevWord
private AmqCorpus amq
private boolean includerefs
private boolean abstractsonly
| Constructor Detail |
|---|
public NipsExtractor()
public NipsExtractor(java.lang.String stoplist,
boolean stem2,
boolean abstractsonly,
boolean includerefs)
stoplist -
stem2 -
abstractsonly -
includerefs -
| Method Detail |
|---|
public static void main(java.lang.String[] args)
public void run(java.lang.String fileroot,
boolean abstractsonly,
int mindf,
int mintf)
public void save(java.lang.String corpusname)
private AmqCorpus createCorpus(java.util.Vector<NipsDocument> docs,
int mindf,
int mintf)
docs -
private void parseTerms(java.util.Vector<NipsDocument> docs,
boolean abstractsOnly)
Processing is done in 3 steps:
docs - list of documents with raw content
abstractsOnly - restricts the corpus generation to abstracts for faster test runs.
public int parseText(java.lang.String s,
java.util.Vector<java.lang.String> words)
s -
private java.lang.String replaceAbbreviations(java.lang.String s)
s -
private java.lang.String removePunct(java.lang.String s)
s -
private void normaliseAuthors(java.util.Vector<java.lang.String> authors)
authors -
public java.util.Map<java.lang.String,NipsExtractor.Entry> parseBibtex()
private NipsExtractor.Entry parseBibEntry(java.lang.String s,
int i)
s - bibtex entry
i - pattern index (different file name convention for each year)
private java.util.Vector<java.lang.String> getAuthors(java.lang.String entry)
entry -
private java.lang.String getTitle(java.lang.String entry)
private java.lang.String clean(java.lang.String s)
s -
private java.util.Vector<NipsDocument> parseMap(java.util.Map<java.lang.String,NipsExtractor.Entry> map)
|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||