|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.knowceans.corpus.TermCorpus
public class TermCorpus
TermCorpus collects terms from different documents and creates a corpus from them with a one-to-one term <-> id assignment.
This variation of TermCorpus provides tracking of document and term frequencies so the corpus must only be evaluated once and can be read several times with different mindf and mintf values. Further, all lists have been changed to ArrayList, which is unsynchronized and faster than Vector.
TODO: Filtering terms by minimum term frequency and document frequency could be done in a TermFilter class, possibly as a generalisation of a stoplist.
Field Summary | |
---|---|
protected ICategories |
cats
|
protected java.util.HashMap<java.lang.Integer,java.lang.Integer> |
curDoc
document |
protected java.util.ArrayList<java.util.Vector<java.lang.Integer>> |
docCategories
store docCategories |
protected java.util.ArrayList<java.lang.Integer> |
docFreqs
each term's document frequency |
protected java.util.ArrayList<java.lang.String> |
docNames
store docNames |
protected java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> |
docTerms
each document's term frequencies termid -> frequency(doc) |
protected java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> |
docTermsFiltered
each document's filtered term frequencies termid -> frequency(doc) (used when splitting a corpus into filtered and unfiltered terms, e.g., via a minimum document frequency). |
boolean |
ignoreFiltered
|
protected int |
maxId
maximum term id. |
protected int |
minDf
|
protected int |
minDl
|
protected int |
minTf
|
protected int |
ndocs
number of documents |
protected int |
nterms
number of unfiltered terms |
protected int |
ntermsTotal
number of terms, filtered and unfiltered |
protected int |
nwords
number of words |
private int |
nwordsFiltered
number of words that have been filtered out. |
protected int |
OFFSET
index offset for terms and documents (only tested with 0) |
protected boolean |
progress
monitor progress |
protected java.util.ArrayList<java.lang.Integer> |
termFreqs
term frequencies |
protected org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> |
termIndex
term indices term->termid |
Constructor Summary | |
---|---|
TermCorpus()
|
|
TermCorpus(EpgCategories categories,
int mindf,
int mintf,
int mindl)
|
|
TermCorpus(ICategories cats)
|
|
TermCorpus(ICategories cats,
int mindf,
int mintf)
DPA corpus initialiser. |
|
TermCorpus(java.lang.String fileroot,
boolean readLowFreq,
ICategories cats)
create an actor-media corpus from files, which means for the corpus root name, all files are read: *.vocab, *.docs, *.actors, *.corpus. |
Method Summary | |
---|---|
void |
add(java.util.Vector<java.lang.String> terms)
add one term vector |
(package private) java.lang.String |
docCategoriesToString(int docIndex)
|
java.lang.String |
docToString(int docIndex,
boolean showFiltered)
Print the document content in order of descending term frequency |
boolean |
finaliseDocument(java.lang.String name,
java.util.Vector<java.lang.Integer> categories)
Finalise the current document with a name (useful to identify documents but uniqueness not required) and its categories (leave null if unused). |
java.util.ArrayList<java.util.Vector<java.lang.Integer>> |
getDocCategories()
Get the categories of all documents. |
java.util.ArrayList<java.lang.String> |
getDocNames()
Get a list of all document names / ids. |
java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> |
getDocTerms()
Get list of document term maps (index->freq) |
java.util.Map<java.lang.Integer,java.lang.Integer> |
getDocTerms(int doc)
Get the document terms as a frequency map id->frequency. |
java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> |
getDocTermsFiltered()
Get list of document term maps (index->freq). |
java.util.Map<java.lang.Integer,java.lang.Integer> |
getDocTermsFiltered(int doc)
Get the document terms as a frequency map id->frequency. |
private java.util.Vector<java.lang.Integer> |
getDocWords(int doc,
java.util.Random rand)
Get the words of document doc as a scrambled sequence. |
int[][] |
getDocWords(java.util.Random rand)
Get the documents as vectors of bag of words, i.e., per document, a scrambled array of term indices is generated. |
int |
getNdocs()
Number of documents in corpus |
int |
getNterms()
Number of terms in corpus |
int |
getNtermsFiltered()
Number of filtered terms in corpus |
int |
getNwords()
Get the number of words (term observations) in the corpus. |
int |
getNwordsFiltered()
Number of words in corpus that are filtered. |
org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> |
getTermIndex()
Get a bijective map term / id |
private void |
greedySet(java.util.ArrayList<java.lang.Integer> list,
int index,
int value)
Set the element of the list at the specified index to value and increases the size of the list if index >= size. |
java.lang.String |
lookup(int term)
look up term for id. |
int |
lookup(java.lang.String term)
look up id for term |
java.lang.String |
lookupDoc(int id)
look up document name for id. |
int |
lookupDoc(java.lang.String name)
look up id for document. |
int[] |
parseQuery(java.lang.String query)
Lookup multiple terms to create a numeric term vector. |
void |
readCorpus(java.lang.String file,
boolean readFiltered)
Read the corpus in the format number of terms, id:freq for each term |
void |
readDocList(java.lang.String file)
Read the vocabulary from a file with format id = termstring (on each line) |
private void |
readDocTerms(java.lang.String file,
java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> data)
Read the document term maps into the array of maps. |
void |
readVocabulary(java.lang.String file,
boolean readFiltered)
reads the vocabulary from a file with line format id = termstring = termfreq docfreq |
int |
reorderCorpus(boolean filterSplit)
Reorder the terms by document frequency and split the corpus in a regular and a low-frequency part. |
int |
reorderCorpus0(boolean filterSplit)
Reorder the vocabulary so indices of regular-freq terms span the interval 1..maxLsa, which can be used to reduce the size of a topic extraction problem. |
void |
writeCorpus(java.lang.String file,
boolean writeFiltered)
Write the complete corpus to a file in the format number of terms, id:freq for each term |
void |
writeDocList(java.lang.String file)
Write the vocabulary in a file with format id = termstring (on each line) |
private void |
writeDocTerms(java.lang.String file,
java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> data,
boolean sorted)
|
void |
writeVocabulary(java.lang.String file,
boolean sort,
boolean writeFiltered)
write the vocabulary in a file with line format id = termstring = termfreq docfreq |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
protected boolean progress
public boolean ignoreFiltered
protected ICategories cats
protected int OFFSET
protected org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> termIndex
protected java.util.ArrayList<java.lang.Integer> termFreqs
protected java.util.ArrayList<java.lang.Integer> docFreqs
protected java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> docTerms
protected java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> docTermsFiltered
protected java.util.HashMap<java.lang.Integer,java.lang.Integer> curDoc
protected java.util.ArrayList<java.lang.String> docNames
protected java.util.ArrayList<java.util.Vector<java.lang.Integer>> docCategories
protected int maxId
protected int ndocs
protected int ntermsTotal
protected int nterms
protected int nwords
private int nwordsFiltered
protected int minDf
protected int minTf
protected int minDl
Constructor Detail |
---|
public TermCorpus(java.lang.String fileroot, boolean readLowFreq, ICategories cats)
fileroot
- the root name of all files to be read into the corpus
readLowFreq
- public TermCorpus()
public TermCorpus(ICategories cats)
public TermCorpus(ICategories cats, int mindf, int mintf)
cats
-
mindf
- use minimum document frequency when reordering
mintf
- use minimum term frequency when reordering
public TermCorpus(EpgCategories categories, int mindf, int mintf, int mindl)
Method Detail |
---|
public void add(java.util.Vector<java.lang.String> terms)
terms
- public boolean finaliseDocument(java.lang.String name, java.util.Vector<java.lang.Integer> categories)
name
- categories
-
public int reorderCorpus(boolean filterSplit)
This is the new implementation of the reorderCorpus() routine, which uses a much clearer design with a table list, my approach to an inline database.
filterSplit
-
public int reorderCorpus0(boolean filterSplit)
filterSplit
- true if the corpus should be split into filtered
terms and unfiltered terms, where filtering occurs according to a
minimum frequency.
public void writeVocabulary(java.lang.String file, boolean sort, boolean writeFiltered) throws java.io.IOException
file
- sort
- sorts the vocabulary in alphabetical order
writeFiltered
- writes a second vocabulary file that has all unique
terms with a 2 added to the file name
java.io.IOException
public void readVocabulary(java.lang.String file, boolean readFiltered) throws java.lang.NumberFormatException, java.io.IOException
file
-
oldformat (note: apparently obsolete — this parameter is not in the current method signature; verify against source)
- read old format id = string, otherwise string = id
readFiltered
- read a second file with filtered terms
java.io.IOException
java.lang.NumberFormatException
private void greedySet(java.util.ArrayList<java.lang.Integer> list, int index, int value)
list
- index
- value
- public void writeDocList(java.lang.String file) throws java.io.IOException
file
-
java.io.IOException
public void readDocList(java.lang.String file) throws java.io.IOException
file
-
java.io.IOException
java.lang.NumberFormatException
public void writeCorpus(java.lang.String file, boolean writeFiltered) throws java.io.IOException
file
- file name of the corpus file
writeFiltered
- if set write the filtered words of a document into a
separate file with a 2 added to file extension.
java.io.IOException
private void writeDocTerms(java.lang.String file, java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> data, boolean sorted) throws java.io.IOException
java.io.IOException
public void readCorpus(java.lang.String file, boolean readFiltered) throws java.lang.NumberFormatException, java.io.IOException
file
- readFiltered
- if set read the unique words of documents (that are
exclusive to this document) from a separate file with a 2 added to
file.
java.lang.NumberFormatException
java.io.IOException
private void readDocTerms(java.lang.String file, java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> data) throws java.lang.NumberFormatException, java.io.IOException
file
- data
- vector of integer->integer maps which is initialised if
argument is null and appended if not.
java.lang.NumberFormatException
java.io.IOException
public java.lang.String lookup(int term)
ITermCorpus
lookup
in interface ITermCorpus
public int lookup(java.lang.String term)
ITermCorpus
lookup
in interface ITermCorpus
public int[] parseQuery(java.lang.String query)
query
-
public int lookupDoc(java.lang.String name)
lookupDoc
in interface ITermCorpus
name
-
public java.lang.String lookupDoc(int id)
lookupDoc
in interface ITermCorpus
id
-
public java.lang.String docToString(int docIndex, boolean showFiltered)
docIndex
- showFiltered
- set if unique terms should be shown
public int[][] getDocWords(java.util.Random rand)
getDocWords
in interface ITermCorpus
rand
- random number generator or null to use standard generator
private java.util.Vector<java.lang.Integer> getDocWords(int doc, java.util.Random rand)
doc
- rand
- random number generator or null to use standard generator
public java.util.Map<java.lang.Integer,java.lang.Integer> getDocTerms(int doc)
ITermCorpus
getDocTerms
in interface ITermCorpus
public java.util.Map<java.lang.Integer,java.lang.Integer> getDocTermsFiltered(int doc)
ITermCorpusFiltered
getDocTermsFiltered
in interface ITermCorpusFiltered
java.lang.String docCategoriesToString(int docIndex)
docIndex
-
public java.util.ArrayList<java.lang.String> getDocNames()
IRandomAccessTermCorpus
getDocNames
in interface IRandomAccessTermCorpus
public java.util.ArrayList<java.util.Vector<java.lang.Integer>> getDocCategories()
public java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> getDocTerms()
IRandomAccessTermCorpus
getDocTerms
in interface IRandomAccessTermCorpus
public java.util.ArrayList<java.util.Map<java.lang.Integer,java.lang.Integer>> getDocTermsFiltered()
IRandomAccessTermCorpusFiltered
getDocTermsFiltered
in interface IRandomAccessTermCorpusFiltered
public org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> getTermIndex()
IRandomAccessTermCorpus
getTermIndex
in interface IRandomAccessTermCorpus
public int getNdocs()
ITermCorpus
getNdocs
in interface ITermCorpus
public int getNterms()
ITermCorpus
getNterms
in interface ITermCorpus
public int getNtermsFiltered()
ITermCorpusFiltered
getNtermsFiltered
in interface ITermCorpusFiltered
public int getNwords()
IRandomAccessTermCorpus
getNwords
in interface IRandomAccessTermCorpus
public int getNwordsFiltered()
ITermCorpusFiltered
getNwordsFiltered
in interface ITermCorpusFiltered
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |