|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.knowceans.corpus.TermCorpusOld
public class TermCorpusOld
TermCorpus collects terms from different documents and creates a corpus from them with a one-to-one term <-> id assignment.
Field Summary | |
---|---|
protected ICategories |
cats
Deprecated. |
protected java.util.HashMap<java.lang.Integer,java.lang.Integer> |
curDoc
Deprecated. document |
boolean |
DEBUG
Deprecated. |
protected java.util.Vector<java.util.Vector<java.lang.Integer>> |
docCategories
Deprecated. store docCategories |
protected java.util.Vector<java.lang.Integer> |
docFreqs
Deprecated. each term's document frequency |
protected java.util.Vector<java.lang.String> |
docNames
Deprecated. store docNames |
protected java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> |
docTerms
Deprecated. each document's term frequencies termid -> frequency(doc) |
protected java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> |
docTermsUnique
Deprecated. each document's unique term frequencies termid -> frequency(doc) (used when splitting a corpus into unique and non-unique terms) |
boolean |
ignoreUnique
Deprecated. |
protected int |
maxId
Deprecated. maximum term id. |
protected int |
minDf
Deprecated. |
protected int |
minTf
Deprecated. |
protected int |
ndocs
Deprecated. number of documents |
protected int |
nterms
Deprecated. number of terms |
protected int |
ntermsNonUnique
Deprecated. number of nonunique terms |
protected int |
nwords
Deprecated. number of words |
protected int |
OFFSET
Deprecated. index offset for terms and documents (only tested with 0) |
protected boolean |
progress
Deprecated. monitor progress |
protected java.util.Vector<java.lang.Integer> |
termFreqs
Deprecated. term frequencies |
protected org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> |
termIndex
Deprecated. term indices term->termid |
Constructor Summary | |
---|---|
TermCorpusOld()
Deprecated. |
|
TermCorpusOld(ICategories cats)
Deprecated. |
|
TermCorpusOld(ICategories cats,
int mindf,
int mintf)
Deprecated. DPA corpus initialiser. |
|
TermCorpusOld(java.lang.String fileroot,
boolean readUnique,
ICategories cats)
Deprecated. create an actor-media corpus from files, which means for the corpus root name, all files are read: *.vocab, *.docs, *.actors, *.corpus. |
Method Summary | |
---|---|
void |
add(java.util.Vector<java.lang.String> terms)
Deprecated. add one term vector |
(package private) java.lang.String |
docCategoriesToString(int docIndex)
Deprecated. |
java.lang.String |
docToString(int docIndex,
boolean showUnique)
Deprecated. prints the document content in order of descending term frequency |
void |
finaliseDocument(java.lang.String a,
java.util.Vector<java.lang.Integer> categories)
Deprecated. finalises the current document with a name (useful to identify documents) and its categories (leave null if unused). |
java.util.Vector<java.util.Vector<java.lang.Integer>> |
getDocCategories()
Deprecated. |
java.util.Vector<java.lang.String> |
getDocNames()
Deprecated. |
java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> |
getDocTerms()
Deprecated. |
java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> |
getDocTermsUnique()
Deprecated. |
int |
getNdocs()
Deprecated. |
int |
getNterms()
Deprecated. |
int |
getNwords()
Deprecated. |
org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> |
getTermIndex()
Deprecated. |
java.lang.String |
lookup(int term)
Deprecated. look up term for id. |
int |
lookup(java.lang.String term)
Deprecated. look up id for term |
java.lang.String |
lookupDoc(int id)
Deprecated. look up document for id. |
(package private) int |
lookupDoc(java.lang.String name)
Deprecated. look up id for document. |
void |
readCorpus(java.lang.String file,
boolean readUnique)
Deprecated. read the corpus in the format number of terms, id:freq for each term |
void |
readDocList(java.lang.String file)
Deprecated. reads the vocabulary from a file with format id = termstring (on each line) |
private void |
readDocTerms(java.lang.String file,
java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> data)
Deprecated. |
void |
readVocabulary(java.lang.String file,
boolean readUnique)
Deprecated. reads the vocabulary from a file with format id = termstring (on each line) |
int |
reorderCorpus(boolean splitUnique)
Deprecated. Reorders the vocabulary so indices of non-unique terms span the interval 1..maxLsa, which can be used to reduce the size of a topic extraction problem. |
void |
setNdocs(int ndocs)
Deprecated. |
void |
setNterms(int nterms)
Deprecated. |
void |
setNwords(int nwords)
Deprecated. |
void |
writeCorpus(java.lang.String file,
boolean writeUnique)
Deprecated. write the complete corpus to a file in the format number of terms, id:freq for each term |
void |
writeDocList(java.lang.String file)
Deprecated. write the vocabulary in a file with format id = termstring (on each line) |
private void |
writeDocTerms(java.lang.String file,
java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> data,
boolean sorted)
Deprecated. |
void |
writeVocabulary(java.lang.String file,
boolean sort,
boolean writeUnique)
Deprecated. write the vocabulary in a file with format id = termstring (on each line) |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
protected boolean progress
public boolean ignoreUnique
protected ICategories cats
protected int OFFSET
protected org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> termIndex
protected java.util.Vector<java.lang.Integer> termFreqs
protected java.util.Vector<java.lang.Integer> docFreqs
protected java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> docTerms
protected java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> docTermsUnique
protected java.util.HashMap<java.lang.Integer,java.lang.Integer> curDoc
protected java.util.Vector<java.lang.String> docNames
protected java.util.Vector<java.util.Vector<java.lang.Integer>> docCategories
protected int maxId
protected int ndocs
protected int nterms
protected int ntermsNonUnique
protected int nwords
protected int minDf
protected int minTf
public boolean DEBUG
Constructor Detail |
---|
public TermCorpusOld(java.lang.String fileroot, boolean readUnique, ICategories cats)
fileroot
- the root name of all files to be read into the corpus
readUnique
-
public TermCorpusOld()
public TermCorpusOld(ICategories cats)
public TermCorpusOld(ICategories cats, int mindf, int mintf)
cats
-
mindf
- use minimum document frequency when reordering
mintf
- use minimum term frequency when reordering
Method Detail |
---|
public void add(java.util.Vector<java.lang.String> terms)
terms
-
public void finaliseDocument(java.lang.String a, java.util.Vector<java.lang.Integer> categories)
public int reorderCorpus(boolean splitUnique)
splitUnique
- true if the corpus should be split into unique
terms and non-unique terms, i.e., terms that occur in more than
one document and are significant for concept extraction.
public void writeVocabulary(java.lang.String file, boolean sort, boolean writeUnique) throws java.io.IOException
file
-
sort
- sorts the vocabulary in alphabetical order
writeUnique
- writes a second vocabulary file that has all unique
terms with a 2 added to the file name
java.io.IOException
public void readVocabulary(java.lang.String file, boolean readUnique) throws java.lang.NumberFormatException, java.io.IOException
file
-
oldformat
- read old format id = string, otherwise string = id
readUnique
- read a second file with unique terms
java.io.IOException
java.lang.NumberFormatException
public void writeDocList(java.lang.String file) throws java.io.IOException
file
-
java.io.IOException
public void readDocList(java.lang.String file) throws java.io.IOException
file
-
java.io.IOException
java.lang.NumberFormatException
public void writeCorpus(java.lang.String file, boolean writeUnique) throws java.io.IOException
file
- file name of the corpus file
writeUnique
- if set, write the unique words of a document (that are
exclusive to this document) into a separate file with a 2 added to
the file name.
java.io.IOException
private void writeDocTerms(java.lang.String file, java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> data, boolean sorted) throws java.io.IOException
java.io.IOException
public void readCorpus(java.lang.String file, boolean readUnique) throws java.lang.NumberFormatException, java.io.IOException
file
-
readUnique
- if set, read the unique words of documents (that are
exclusive to this document) from a separate file with a 2 added to
the file name.
java.lang.NumberFormatException
java.io.IOException
private void readDocTerms(java.lang.String file, java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> data) throws java.lang.NumberFormatException, java.io.IOException
file
-
data
- vector of integer->integer maps which is initialised if the
argument is null and appended to if not.
java.lang.NumberFormatException
java.io.IOException
public java.lang.String lookup(int term)
term
-
public int lookup(java.lang.String term)
term
-
int lookupDoc(java.lang.String name)
name
-
public java.lang.String lookupDoc(int id)
id
-
public java.lang.String docToString(int docIndex, boolean showUnique)
docIndex
-
showUnique
- set if unique terms should be shown
java.lang.String docCategoriesToString(int docIndex)
public java.util.Vector<java.lang.String> getDocNames()
public java.util.Vector<java.util.Vector<java.lang.Integer>> getDocCategories()
public java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> getDocTerms()
public java.util.Vector<java.util.Map<java.lang.Integer,java.lang.Integer>> getDocTermsUnique()
public org.knowceans.map.IBijectiveMap<java.lang.String,java.lang.Integer> getTermIndex()
public int getNdocs()
public void setNdocs(int ndocs)
public int getNterms()
public void setNterms(int nterms)
public int getNwords()
public void setNwords(int nwords)
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |