|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||
java.lang.Object
org.knowceans.corpus.parsers.nips.NipsXmlReader
public class NipsXmlReader
NipsXmlReader parses one XML file converted from NIPS PDF documents using XPDF-based pdftohtml
| Field Summary | |
|---|---|
(package private) java.lang.String |
_bold
|
(package private) java.lang.String |
_fontspec
|
(package private) java.lang.String |
_italics
|
(package private) java.lang.String |
_page
|
(package private) java.lang.String |
_text
|
(package private) java.util.regex.Pattern |
boldpattern
|
(package private) java.util.HashMap<java.lang.String,java.lang.Double> |
fontsizes
|
(package private) java.util.regex.Pattern |
fontspecpattern
|
private boolean |
includerefs
whether to include references in the text |
private boolean |
inrefs
|
(package private) java.util.regex.Pattern |
italicspattern
|
(package private) java.util.regex.Pattern |
pagepattern
|
(package private) java.util.regex.Pattern |
textpattern
|
| Constructor Summary | |
|---|---|
NipsXmlReader()
initialise reader (e.g., compile regex patterns) |
|
| Method Summary | |
|---|---|
private java.lang.String |
clean(java.lang.String in)
cleans the string. |
private java.lang.String |
cleanUp(java.lang.String in)
|
void |
extract(java.lang.String filename,
NipsDocument doc)
|
private java.util.Vector<java.lang.String> |
extractPage(java.lang.StringBuffer content)
extracts the content of a page. |
private java.lang.String[] |
getHead(java.lang.StringBuffer content)
extract title, authors and abstract from (the first page of a) document. |
private java.util.Vector<java.lang.String> |
getPages(java.lang.StringBuffer content)
|
static void |
main(java.lang.String[] args)
|
private void |
processText(java.lang.StringBuffer content,
NipsDocument doc)
|
private java.lang.String |
replaceUmlauts(java.lang.String in)
inserts umlauts. TODO: direct Unicode multi-character replacements |
private int |
setFonts(java.lang.StringBuffer content)
sets the font sizes for the current page |
| Methods inherited from class java.lang.Object |
|---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
|---|
java.lang.String _page
java.lang.String _fontspec
java.lang.String _text
java.lang.String _bold
java.lang.String _italics
java.util.regex.Pattern pagepattern
java.util.regex.Pattern fontspecpattern
java.util.regex.Pattern textpattern
java.util.regex.Pattern boldpattern
java.util.regex.Pattern italicspattern
java.util.HashMap<java.lang.String,java.lang.Double> fontsizes
private boolean inrefs
private boolean includerefs
| Constructor Detail |
|---|
public NipsXmlReader()
| Method Detail |
|---|
public static void main(java.lang.String[] args)
public void extract(java.lang.String filename,
NipsDocument doc)
filename -
doc - document record (existing or will be created)
private void processText(java.lang.StringBuffer content,
NipsDocument doc)
content -
private java.lang.String[] getHead(java.lang.StringBuffer content)
content - the content that the heading data is extracted from. The
extracted parts are stripped from it.
private java.lang.String cleanUp(java.lang.String in)
in -
private int setFonts(java.lang.StringBuffer content)
content -
private java.lang.String replaceUmlauts(java.lang.String in)
in -
private java.lang.String clean(java.lang.String in)
in -
private java.util.Vector<java.lang.String> getPages(java.lang.StringBuffer content)
content -
private java.util.Vector<java.lang.String> extractPage(java.lang.StringBuffer content)
content -
|
||||||||||
| PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
| SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD | |||||||||