/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.vectorizer;

import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
import com.google.common.collect.Lists;
import java.net.URI;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.RandomDocumentGenerator;
import org.apache.mahout.vectorizer.SparseVectorsFromSequenceFiles;
import org.junit.Before;
import org.junit.Test;

/**
 * Exercises {@link SparseVectorsFromSequenceFiles} on a synthetic corpus in which every
 * document contains the same set of words ({@link #HIGH_DF_WORDS}), and checks whether those
 * words survive in the generated {@code tf-vectors} and {@code tfidf-vectors} outputs.
 *
 * <p>Two scenarios are covered: one where the vectorizer is run with {@code -xs 3} and the
 * words are expected to be zeroed out, and one where it is run with
 * {@code --maxDFPercent 100} and the words are expected to be kept.
 */
@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
public class HighDFWordsPrunerTest extends MahoutTestCase {
    /** Number of random documents written to the input sequence file. */
    private static final int NUM_DOCS = 100;
    /** Words appended to every generated document, so they occur in all of them. */
    private static final String[] HIGH_DF_WORDS = new String[]{"has", "which", "what", "srtyui"};
    private Configuration conf;
    private Path inputPath;

    /**
     * Writes {@link #NUM_DOCS} random documents, each suffixed with all
     * {@link #HIGH_DF_WORDS}, into a {@code Text -> Text} sequence file under the test's
     * temporary directory.
     *
     * @throws Exception if the base-class setup or writing the input file fails
     */
    @Override
    @Before
    public void setUp() throws Exception {
        super.setUp();
        conf = getConfiguration();
        inputPath = getTestTempFilePath("documents/docs.file");
        FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPath, Text.class, Text.class);
        // Close the writer even when an append fails, so the file handle is not leaked.
        try {
            RandomDocumentGenerator gen = new RandomDocumentGenerator();
            for (int i = 0; i < NUM_DOCS; i++) {
                writer.append(new Text("Document::ID::" + i),
                        new Text(enhanceWithHighDFWords(gen.getRandomDocument())));
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Returns {@code initialDoc} with every entry of {@link #HIGH_DF_WORDS} appended,
     * each preceded by a single space.
     *
     * @param initialDoc the document text to extend
     * @return the extended document text
     */
    private static String enhanceWithHighDFWords(String initialDoc) {
        StringBuilder sb = new StringBuilder(initialDoc);
        for (String word : HIGH_DF_WORDS) {
            sb.append(' ').append(word);
        }
        return sb.toString();
    }

    /** Runs the vectorizer with {@code --maxDFPercent 100} and expects the words to be kept. */
    @Test
    public void testHighDFWordsPreserving() throws Exception {
        runTest(false);
    }

    /** Runs the vectorizer with {@code -xs 3} and expects the words to be removed. */
    @Test
    public void testHighDFWordsPruning() throws Exception {
        runTest(true);
    }

    /**
     * Runs {@link SparseVectorsFromSequenceFiles} over the generated input and validates the
     * {@code tf-vectors} and {@code tfidf-vectors} outputs.
     *
     * @param prune {@code true} to pass {@code -xs 3} and expect the high-DF words to be
     *              absent; {@code false} to pass {@code --maxDFPercent 100} and expect them
     *              to be present
     * @throws Exception if the job or the validation fails
     */
    private void runTest(boolean prune) throws Exception {
        Path outputPath = getTestTempFilePath("output");
        List<String> argList = Lists.newLinkedList();
        argList.add("-i");
        argList.add(inputPath.toString());
        argList.add("-o");
        argList.add(outputPath.toString());
        if (prune) {
            // NOTE(review): "-xs" is presumably the max-DF-sigma threshold; confirm against
            // the option definitions in SparseVectorsFromSequenceFiles.
            argList.add("-xs");
            argList.add("3");
        } else {
            argList.add("--maxDFPercent");
            argList.add("100");
        }
        argList.add("-seq");
        argList.add("-nv");
        String[] args = argList.toArray(new String[argList.size()]);

        int exitCode = ToolRunner.run(conf, new SparseVectorsFromSequenceFiles(), args);
        // Fail here with a clear message instead of later on a missing output path.
        assertEquals("SparseVectorsFromSequenceFiles returned a non-zero exit code", 0, exitCode);

        Path dictionary = new Path(outputPath, "dictionary.file-0");
        Path tfVectors = new Path(outputPath, "tf-vectors");
        Path tfidfVectors = new Path(outputPath, "tfidf-vectors");
        int[] highDFWordsDictionaryIndices = getHighDFWordsDictionaryIndices(dictionary);
        validateVectors(tfVectors, highDFWordsDictionaryIndices, prune);
        validateVectors(tfidfVectors, highDFWordsDictionaryIndices, prune);
    }

    /**
     * Looks up the dictionary index of every entry of {@link #HIGH_DF_WORDS}.
     *
     * @param dictionaryPath path of the dictionary sequence file ({@code Text -> IntWritable})
     * @return an array parallel to {@link #HIGH_DF_WORDS} holding each word's dictionary index
     * @throws AssertionError if any of the words is not present in the dictionary
     */
    private int[] getHighDFWordsDictionaryIndices(Path dictionaryPath) {
        int[] highDFWordsDictionaryIndices = new int[HIGH_DF_WORDS.length];
        // Start from -1 so a word missing from the dictionary is not mistaken for index 0.
        Arrays.fill(highDFWordsDictionaryIndices, -1);
        List<String> highDFWordsList = Arrays.asList(HIGH_DF_WORDS);
        for (Pair<Text, IntWritable> record
                : new SequenceFileDirIterable<Text, IntWritable>(dictionaryPath, PathType.GLOB, null, null, true, conf)) {
            int index = highDFWordsList.indexOf(record.getFirst().toString());
            if (index > -1) {
                highDFWordsDictionaryIndices[index] = record.getSecond().get();
            }
        }
        for (int i = 0; i < highDFWordsDictionaryIndices.length; i++) {
            assertTrue("Word '" + HIGH_DF_WORDS[i] + "' was not found in the dictionary",
                    highDFWordsDictionaryIndices[i] >= 0);
        }
        return highDFWordsDictionaryIndices;
    }

    /**
     * Checks every vector under {@code vectorPath}: when {@code prune} is set, each high-DF
     * word's entry must be exactly {@code 0.0}; otherwise each entry must be non-zero.
     *
     * @param vectorPath                   directory holding the vector part files
     * @param highDFWordsDictionaryIndices dictionary indices parallel to {@link #HIGH_DF_WORDS}
     * @param prune                        whether the words are expected to have been pruned
     * @throws Exception if the path cannot be accessed or read
     */
    private void validateVectors(Path vectorPath, int[] highDFWordsDictionaryIndices, boolean prune) throws Exception {
        assertTrue("Path does not exist", vectorPath.getFileSystem(conf).exists(vectorPath));
        for (VectorWritable value
                : new SequenceFileDirValueIterable<VectorWritable>(vectorPath, PathType.LIST, PathFilters.partFilter(), null, true, conf)) {
            // The job is run with "-nv", so each stored vector is cast to NamedVector and unwrapped.
            Vector v = ((NamedVector) value.get()).getDelegate();
            for (int i = 0; i < highDFWordsDictionaryIndices.length; i++) {
                double weight = v.get(highDFWordsDictionaryIndices[i]);
                if (prune) {
                    assertEquals("Found vector for which word '" + HIGH_DF_WORDS[i] + "' is not pruned",
                            0.0, weight, 0.0);
                } else {
                    assertTrue("Found vector for which word '" + HIGH_DF_WORDS[i] + "' is pruned, and shouldn't have been",
                            weight != 0.0);
                }
            }
        }
    }
}

