/*
 * Decompiled with CFR 0.152.
 */
package org.knowceans.corpus;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Random;
import org.knowceans.corpus.Document;
import org.knowceans.corpus.ICorpus;
import org.knowceans.corpus.ISplitCorpus;
import org.knowceans.corpus.ITermCorpus;
import org.knowceans.util.ArrayUtils;
import org.knowceans.util.RandomSamplers;
import org.knowceans.util.Vectors;

/**
 * Corpus of documents stored as term-id/frequency pairs.
 *
 * <p>The on-disk format handled by {@link #read(String)} and
 * {@link #write(String)} is one document per line. A flat document is
 * {@code N t1:c1 t2:c2 ... tN:cN}. A document with several paragraphs is
 * the concatenation of such groups on a single line, each group prefixed
 * by its own term count.
 *
 * <p>This class is not synchronized.
 */
public class NumCorpus
implements ICorpus,
ITermCorpus,
ISplitCorpus {
    /** Index into the parse tally: number of distinct term ids (largest id seen + 1). */
    private static final int TALLY_TERMS = 0;
    /** Index into the parse tally: total number of word tokens seen. */
    private static final int TALLY_WORDS = 1;

    protected Document[] docs;
    protected int numTerms;
    protected int numDocs;
    protected int numWords;
    /** Enables diagnostic output in {@link #read(String)}. */
    protected boolean debug = false;
    /** Document permutation used by the most recent {@link #split(int, int, Random)}. */
    protected int[] splitperm;
    /** Start offsets of the folds inside {@link #splitperm}; has {@code order + 1} entries. */
    protected int[] splitstarts;
    protected NumCorpus trainCorpus;
    protected NumCorpus testCorpus;
    /** Original document ids; row 0 holds training (or all) documents, row 1 test documents. */
    protected int[][] origDocIds;
    /** Per-document paragraph bounds in term positions, or null if no document has paragraphs. */
    int[][] parbounds;
    /** Per-document paragraph bounds in word positions, filled lazily by {@link #getDocWords(int, Random)}. */
    private int[][] wordparbounds;
    /**
     * Read limit checked after each document; negative means unlimited.
     * Because the check happens after a document has been stored, a limit
     * of 0 still reads one document.
     */
    private int readlimit = -1;
    /** Value subtracted from every term id while reading. */
    static int OFFSET = 0;

    /**
     * Creates a corpus by reading the complete file.
     *
     * @param dataFilename path of the corpus file
     */
    public NumCorpus(String dataFilename) {
        this.read(dataFilename);
    }

    /**
     * Creates a corpus by reading at most {@code readlimit} documents.
     *
     * @param dataFilename path of the corpus file
     * @param readlimit    read limit; negative for unlimited
     */
    public NumCorpus(String dataFilename, int readlimit) {
        this.readlimit = readlimit;
        this.read(dataFilename);
    }

    /** Creates an empty corpus; no fields are initialised. */
    public NumCorpus() {
    }

    /**
     * Creates a corpus around existing documents.
     *
     * @param docs     documents of the corpus (the array is stored, not copied)
     * @param numTerms vocabulary size
     * @param numWords total number of word tokens in {@code docs}
     */
    public NumCorpus(Document[] docs, int numTerms, int numWords) {
        this.numTerms = numTerms;
        this.numWords = numWords;
        this.numDocs = docs.length;
        this.docs = docs;
    }

    /**
     * Reads the corpus from a file, replacing documents and statistics.
     *
     * <p>Lines that are empty or whose first field is {@code 0} are skipped.
     * A line whose first field equals the number of remaining fields is read
     * as a flat document; any other line is read as a sequence of paragraphs.
     *
     * <p>I/O and number-format errors are printed to standard error and leave
     * the previously loaded documents and statistics in place; they are not
     * propagated.
     *
     * @param dataFilename path of the corpus file
     */
    public void read(String dataFilename) {
        if (this.debug) {
            System.out.println("reading data from " + dataFilename);
        }
        BufferedReader br = null;
        try {
            ArrayList<Document> cdocs = new ArrayList<Document>();
            br = new BufferedReader(new FileReader(dataFilename));
            int[] tally = new int[2];
            int nd = 0;
            boolean parmode = false;
            this.parbounds = null;
            // Word-level bounds belong to the previous content.
            this.wordparbounds = null;
            String line;
            while ((line = br.readLine()) != null) {
                String[] fields = line.trim().split("\\s+");
                if (fields[0].equals("") || fields[0].equals("0")) {
                    continue;
                }
                int length = Integer.parseInt(fields[0]);
                if (length == fields.length - 1) {
                    cdocs.add(NumCorpus.parseTerms(fields, 1, length, tally));
                } else {
                    parmode = true;
                    Document pd = new Document();
                    cdocs.add(pd);
                    int nextpar = 0;
                    while (nextpar < fields.length) {
                        length = Integer.parseInt(fields[nextpar]);
                        pd.addDocument(NumCorpus.parseTerms(fields, nextpar + 1, length, tally));
                        nextpar += length + 1;
                    }
                }
                if (nd % 1000 == 0) {
                    System.out.println(nd);
                }
                // Count every stored document. Incrementing inside the
                // short-circuited limit test left the count at 0 whenever
                // no limit was configured.
                ++nd;
                if (this.readlimit >= 0 && nd >= this.readlimit) {
                    break;
                }
            }
            this.numDocs = nd;
            this.numTerms = tally[TALLY_TERMS];
            this.numWords = tally[TALLY_WORDS];
            this.docs = cdocs.toArray(new Document[0]);
            if (parmode) {
                this.parbounds = new int[this.docs.length][];
                for (int m = 0; m < this.docs.length; m++) {
                    this.parbounds[m] = this.docs[m].getParBounds();
                }
            }
            this.origDocIds = new int[2][];
            this.origDocIds[0] = Vectors.range(0, nd - 1);
            if (this.debug) {
                System.out.println("number of docs    : " + nd);
                System.out.println("number of terms   : " + tally[TALLY_TERMS]);
            }
        }
        catch (NumberFormatException e) {
            e.printStackTrace();
        }
        catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            if (br != null) {
                try {
                    br.close();
                }
                catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds a document from {@code length} consecutive {@code term:count}
     * fields and updates the running corpus statistics.
     *
     * <p>A field without a term id or without a count is skipped; its slot
     * in the document keeps the array default.
     *
     * @param fields whitespace-separated fields of one input line
     * @param start  index of the first {@code term:count} field
     * @param length number of fields to consume
     * @param tally  running statistics, indexed by {@link #TALLY_TERMS} and
     *               {@link #TALLY_WORDS}; updated in place
     * @return the populated document
     */
    private static Document parseTerms(String[] fields, int start, int length, int[] tally) {
        Document d = new Document();
        d.setNumTerms(length);
        d.setNumWords(0);
        d.setTerms(new int[length]);
        d.setCounts(new int[length]);
        for (int n = 0; n < length; n++) {
            String[] numbers = fields[start + n].split(":");
            if (numbers.length < 2 || numbers[0].length() == 0 || numbers[1].length() == 0) {
                continue;
            }
            int word = Integer.parseInt(numbers[0]) - OFFSET;
            // Counts may be written as floating-point numbers; they are truncated.
            int count = (int)Float.parseFloat(numbers[1]);
            tally[TALLY_WORDS] += count;
            d.setTerm(n, word);
            d.setCount(n, count);
            d.setNumWords(d.getNumWords() + count);
            if (word >= tally[TALLY_TERMS]) {
                tally[TALLY_TERMS] = word + 1;
            }
        }
        return d;
    }

    /**
     * @return the internal document array (not a copy)
     */
    public Document[] getDocs() {
        return this.docs;
    }

    /**
     * @return array indexed {@code [0][doc]} for term ids and
     *         {@code [1][doc]} for the matching frequencies
     */
    @Override
    public int[][][] getDocTermsFreqs() {
        int total = this.getNumDocs();
        int[][][] documents = new int[2][total][];
        for (int i = 0; i < total; i++) {
            Document d = this.getDoc(i);
            documents[0][i] = d.getTerms();
            documents[1][i] = d.getCounts();
        }
        return documents;
    }

    /**
     * @return per-document paragraph bounds in term positions, or null if
     *         the corpus has no paragraph structure
     */
    public int[][] getDocParBounds() {
        return this.parbounds;
    }

    /**
     * Computes paragraph bounds in word positions for every document.
     *
     * @return per-document cumulative word bounds, or null if the corpus has
     *         no paragraph structure; the entry of a document without
     *         paragraph bounds is null
     */
    public int[][] getDocWordParBounds() {
        if (this.parbounds == null) {
            return null;
        }
        int[][] psnwords = new int[this.numDocs][];
        for (int m = 0; m < this.numDocs; m++) {
            psnwords[m] = this.getDocWordParBounds(m);
        }
        return psnwords;
    }

    /**
     * Converts the term-position paragraph bounds of one document into
     * cumulative word-position bounds.
     *
     * @param m document index
     * @return cumulative word bounds, or null if the document has no
     *         paragraph bounds
     */
    private int[] getDocWordParBounds(int m) {
        Document d = this.docs[m];
        int[] termbounds = d.getParBounds();
        if (termbounds == null) {
            // Flat documents inside a paragraph corpus carry no bounds.
            return null;
        }
        int[] wordbounds = new int[termbounds.length];
        int prevbound = 0;
        int prevwbound = 0;
        for (int j = 0; j < termbounds.length; j++) {
            int parwords = Vectors.sum(Vectors.sub(d.counts, prevbound, termbounds[j] - prevbound));
            wordbounds[j] = parwords + prevwbound;
            prevbound = termbounds[j];
            prevwbound = wordbounds[j];
        }
        return wordbounds;
    }

    /**
     * Merges the paragraphs of every document and drops the paragraph
     * bounds of the corpus. Does nothing if the corpus has no paragraph
     * structure.
     */
    public void mergeDocPars() {
        if (this.parbounds == null) {
            return;
        }
        for (int i = 0; i < this.docs.length; i++) {
            this.docs[i].mergeDocument(null);
        }
        this.parbounds = null;
        this.wordparbounds = null;
    }

    /**
     * @param index document index
     * @return the document at {@code index}
     */
    public Document getDoc(int index) {
        return this.docs[index];
    }

    /**
     * Expands every document into its word sequence.
     *
     * @param rand source of randomness for shuffling, or null to keep term order
     * @return one word array per document
     */
    @Override
    public int[][] getDocWords(Random rand) {
        int total = this.getNumDocs();
        int[][] documents = new int[total][];
        for (int i = 0; i < total; i++) {
            documents[i] = this.getDocWords(i, rand);
        }
        return documents;
    }

    /**
     * @return total number of word tokens in the corpus
     */
    @Override
    public int getNumWords() {
        return this.numWords;
    }

    /**
     * Expands one document into its word sequence, repeating each term id
     * as often as its frequency.
     *
     * <p>For a document with more than one paragraph, shuffling is done
     * within each paragraph so that paragraph order is kept. Otherwise the
     * whole document is shuffled.
     *
     * @param m    document index
     * @param rand source of randomness for shuffling, or null to keep term order
     * @return the words of the document; empty if the document has no words
     */
    @Override
    public int[] getDocWords(int m, Random rand) {
        Document doc = this.docs[m];
        if (this.parbounds == null || this.parbounds[m] == null || this.parbounds[m].length == 1) {
            ArrayList<Integer> all = NumCorpus.expandTerms(doc, 0, doc.getTerms().length);
            if (rand != null) {
                Collections.shuffle(all, rand);
            }
            int[] flat = new int[all.size()];
            for (int i = 0; i < flat.length; i++) {
                flat[i] = all.get(i);
            }
            return flat;
        }
        int[] bounds = this.parbounds[m];
        if (this.wordparbounds == null || this.wordparbounds.length != this.parbounds.length) {
            this.wordparbounds = new int[this.parbounds.length][];
        }
        this.wordparbounds[m] = new int[bounds.length];
        int[] words = new int[Vectors.sum(doc.counts)];
        int nw = 0;
        int tstart = 0;
        for (int s = 0; s < bounds.length; s++) {
            int tend = bounds[s];
            ArrayList<Integer> par = NumCorpus.expandTerms(doc, tstart, tend);
            if (rand != null) {
                Collections.shuffle(par, rand);
            }
            for (int i = 0; i < par.size(); i++) {
                words[nw + i] = par.get(i);
            }
            tstart = tend;
            nw += par.size();
            // Record where this paragraph ends in word positions.
            this.wordparbounds[m][s] = nw;
        }
        return words;
    }

    /**
     * Expands the term positions {@code [from, to)} of a document into a
     * list that repeats each term id as often as its frequency.
     *
     * @param doc  document to expand
     * @param from first term position, inclusive
     * @param to   last term position, exclusive
     * @return the expanded words in term order
     */
    private static ArrayList<Integer> expandTerms(Document doc, int from, int to) {
        ArrayList<Integer> expanded = new ArrayList<Integer>();
        int[] terms = doc.getTerms();
        for (int i = from; i < to; i++) {
            int copies = doc.getCount(i);
            for (int j = 0; j < copies; j++) {
                expanded.add(terms[i]);
            }
        }
        return expanded;
    }

    /**
     * Replaces one document. Corpus statistics are not updated.
     *
     * @param index document index
     * @param doc   new document
     */
    public void setDoc(int index, Document doc) {
        this.docs[index] = doc;
    }

    /**
     * @return number of documents in the corpus
     */
    @Override
    public int getNumDocs() {
        return this.numDocs;
    }

    /**
     * @return vocabulary size of the corpus
     */
    @Override
    public int getNumTerms() {
        return this.numTerms;
    }

    /**
     * @param doc document index
     * @return number of terms recorded for that document
     */
    public int getNumTerms(int doc) {
        return this.docs[doc].getNumTerms();
    }

    /**
     * @param doc document index
     * @return number of words recorded for that document
     */
    public int getNumWords(int doc) {
        return this.docs[doc].getNumWords();
    }

    /**
     * Replaces the document array. Corpus statistics, including the
     * document count, are not updated.
     *
     * @param documents new document array (stored, not copied)
     */
    public void setDocs(Document[] documents) {
        this.docs = documents;
    }

    @Override
    public String toString() {
        return "Corpus {numDocs=" + this.numDocs + " numTerms=" + this.numTerms + "}";
    }

    /**
     * Truncates the corpus to its first {@code ndocs} documents and adjusts
     * the document and word counts. Does nothing beyond printing the current
     * document count if the corpus is not larger than {@code ndocs}.
     *
     * @param ndocs number of documents to keep
     * @param rand  unused; the first {@code ndocs} documents are always kept
     */
    public void reduce(int ndocs, Random rand) {
        System.out.println(this.numDocs + ".");
        if (this.numDocs > ndocs) {
            Document[] docsnew = new Document[ndocs];
            System.arraycopy(this.docs, 0, docsnew, 0, ndocs);
            int words = 0;
            for (int i = 0; i < ndocs; i++) {
                words += docsnew[i].getNumWords();
            }
            this.docs = docsnew;
            this.numDocs = ndocs;
            // Keep the word total in line with the retained documents.
            this.numWords = words;
        }
    }

    /**
     * Splits the corpus into a training and a test corpus for one fold of
     * an {@code order}-fold partition.
     *
     * <p>With a non-null {@code rand} a new document permutation is drawn.
     * With a null {@code rand} the permutation of the previous call is
     * reused; if there is none, or it does not match the current number of
     * documents, documents are taken in corpus order.
     *
     * @param order number of folds; must be positive
     * @param split index of the fold used as test set, in {@code [0, order)}
     * @param rand  source of randomness for a new permutation, or null
     * @throws IllegalArgumentException if {@code order} or {@code split} is out of range
     */
    @Override
    public void split(int order, int split, Random rand) {
        if (order < 1) {
            throw new IllegalArgumentException("order must be positive: " + order);
        }
        if (split < 0 || split >= order) {
            throw new IllegalArgumentException("split must be in [0, " + order + "): " + split);
        }
        if (rand != null) {
            RandomSamplers rs = new RandomSamplers(rand);
            this.splitperm = rs.randPerm(this.numDocs);
        } else if (this.splitperm == null || this.splitperm.length != this.numDocs) {
            this.splitperm = new int[this.numDocs];
            for (int i = 0; i < this.numDocs; i++) {
                this.splitperm[i] = i;
            }
        }
        if (this.splitstarts == null || this.splitstarts.length != order + 1) {
            this.splitstarts = new int[order + 1];
        }
        for (int p = 0; p <= order; p++) {
            this.splitstarts[p] = Math.round((float)this.numDocs * ((float)p / (float)order));
        }
        int mstart = this.splitstarts[split];
        int mend = this.splitstarts[split + 1];
        int Mtest = mend - mstart;
        this.origDocIds = new int[][]{new int[this.numDocs - Mtest], new int[Mtest]};
        Document[] trainDocs = new Document[this.numDocs - Mtest];
        Document[] testDocs = new Document[Mtest];
        int mtrain = 0;
        // Training documents are everything before and after the test fold.
        for (int m = 0; m < mstart; m++, mtrain++) {
            trainDocs[mtrain] = this.docs[this.splitperm[m]];
            this.origDocIds[0][mtrain] = this.splitperm[m];
        }
        for (int m = mend; m < this.numDocs; m++, mtrain++) {
            trainDocs[mtrain] = this.docs[this.splitperm[m]];
            this.origDocIds[0][mtrain] = this.splitperm[m];
        }
        int numTestWords = 0;
        for (int m = 0; m < Mtest; m++) {
            int id = this.splitperm[m + mstart];
            testDocs[m] = this.docs[id];
            this.origDocIds[1][m] = id;
            numTestWords += testDocs[m].getNumWords();
        }
        this.trainCorpus = new NumCorpus(trainDocs, this.numTerms, this.numWords - numTestWords);
        this.testCorpus = new NumCorpus(testDocs, this.numTerms, numTestWords);
    }

    /**
     * @return training corpus of the last split, or null if none was made
     */
    @Override
    public ICorpus getTrainCorpus() {
        return this.trainCorpus;
    }

    /**
     * @return test corpus of the last split, or null if none was made
     */
    @Override
    public ICorpus getTestCorpus() {
        return this.testCorpus;
    }

    /**
     * @return original document ids; row 0 for training (or all) documents,
     *         row 1 for test documents
     */
    @Override
    public int[][] getOrigDocIds() {
        return this.origDocIds;
    }

    /**
     * Writes the corpus to {@code pathbase + ".corpus"} in the format
     * accepted by {@link #read(String)}.
     *
     * @param pathbase output path without the {@code .corpus} extension
     * @throws IOException if the file cannot be written
     */
    public void write(String pathbase) throws IOException {
        BufferedWriter bwcorp = new BufferedWriter(new FileWriter(pathbase + ".corpus"));
        try {
            for (int m = 0; m < this.docs.length; m++) {
                if (m % 100 == 0) {
                    System.out.println(m);
                }
                Document doc = this.docs[m];
                if (doc.getParBounds() == null) {
                    bwcorp.append(Integer.toString(doc.numTerms));
                    NumCorpus.writeTerms(bwcorp, doc, 0, doc.numTerms);
                } else {
                    int prevbound = 0;
                    for (int s = 0; s < doc.parBounds.length; s++) {
                        if (s > 0) {
                            bwcorp.append(" ");
                        }
                        // Each paragraph is prefixed by its own term count,
                        // which is what the reader uses to find the next one.
                        bwcorp.append(Integer.toString(doc.parBounds[s] - prevbound));
                        NumCorpus.writeTerms(bwcorp, doc, prevbound, doc.parBounds[s]);
                        prevbound = doc.parBounds[s];
                    }
                }
                bwcorp.append('\n');
            }
        }
        finally {
            bwcorp.close();
        }
    }

    /**
     * Appends the {@code term:count} pairs at positions {@code [from, to)},
     * each preceded by a single space.
     *
     * @param out  destination writer
     * @param doc  document providing terms and counts
     * @param from first term position, inclusive
     * @param to   last term position, exclusive
     * @throws IOException if writing fails
     */
    private static void writeTerms(BufferedWriter out, Document doc, int from, int to) throws IOException {
        for (int n = from; n < to; n++) {
            out.append(" " + doc.terms[n] + ":" + doc.counts[n]);
        }
    }

    /**
     * Demo: reads a sample corpus, makes a 10-fold split and prints the
     * resulting training and test corpora and the document mapping.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        NumCorpus nc = new NumCorpus("berry95/berry95.corpus");
        nc.split(10, 0, new Random());
        System.out.println("train");
        ICorpus ncc = nc.getTrainCorpus();
        System.out.println(ncc);
        int[][] x = ncc.getDocWords(new Random());
        System.out.println(Vectors.print(x));
        System.out.println("test");
        ncc = nc.getTestCorpus();
        System.out.println(ncc);
        x = ncc.getDocWords(new Random());
        System.out.println(Vectors.print(x));
        System.out.println("document mapping");
        System.out.println(Vectors.print(nc.getOrigDocIds()));
    }
}

