/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.vectorizer;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.StringTuple;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.Vectorizer;
import org.apache.mahout.vectorizer.VectorizerConfig;
import org.apache.mahout.vectorizer.collocations.llr.CollocDriver;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.term.TFPartialVectorReducer;
import org.apache.mahout.vectorizer.term.TermCountCombiner;
import org.apache.mahout.vectorizer.term.TermCountMapper;
import org.apache.mahout.vectorizer.term.TermCountReducer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class DictionaryVectorizer
extends AbstractJob
implements Vectorizer {
    private static final Logger log = LoggerFactory.getLogger(DictionaryVectorizer.class);
    public static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tf-vectors";
    public static final String MIN_SUPPORT = "min.support";
    public static final String MAX_NGRAMS = "max.ngrams";
    public static final int DEFAULT_MIN_SUPPORT = 2;
    private static final String DICTIONARY_FILE = "dictionary.file-";
    private static final int MAX_CHUNKSIZE = 10000;
    private static final int MIN_CHUNKSIZE = 100;
    private static final String OUTPUT_FILES_PATTERN = "part-*";
    private static final int DICTIONARY_BYTE_OVERHEAD = 4;
    private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
    private static final String DICTIONARY_JOB_FOLDER = "wordcount";

    private DictionaryVectorizer() {
    }

    @Override
    public void createVectors(Path input, Path output, VectorizerConfig config) throws IOException, ClassNotFoundException, InterruptedException {
        DictionaryVectorizer.createTermFrequencyVectors(input, output, config.getTfDirName(), config.getConf(), config.getMinSupport(), config.getMaxNGramSize(), config.getMinLLRValue(), config.getNormPower(), config.isLogNormalize(), config.getNumReducers(), config.getChunkSizeInMegabytes(), config.isSequentialAccess(), config.isNamedVectors());
    }

    public static void createTermFrequencyVectors(Path input, Path output, String tfVectorsFolderName, Configuration baseConf, int minSupport, int maxNGramSize, float minLLRValue, float normPower, boolean logNormalize, int numReducers, int chunkSizeInMegabytes, boolean sequentialAccess, boolean namedVectors) throws IOException, InterruptedException, ClassNotFoundException {
        List<Path> dictionaryChunks;
        Preconditions.checkArgument(normPower == -1.0f || normPower >= 0.0f, "If specified normPower must be nonnegative", Float.valueOf(normPower));
        Preconditions.checkArgument(normPower == -1.0f || normPower > 1.0f && !Double.isInfinite(normPower) || !logNormalize, "normPower must be > 1 and not infinite if log normalization is chosen", Float.valueOf(normPower));
        if (chunkSizeInMegabytes < 100) {
            chunkSizeInMegabytes = 100;
        } else if (chunkSizeInMegabytes > 10000) {
            chunkSizeInMegabytes = 10000;
        }
        if (minSupport < 0) {
            minSupport = 2;
        }
        Path dictionaryJobPath = new Path(output, DICTIONARY_JOB_FOLDER);
        log.info("Creating dictionary from {} and saving at {}", (Object)input, (Object)dictionaryJobPath);
        int[] maxTermDimension = new int[1];
        if (maxNGramSize == 1) {
            DictionaryVectorizer.startWordCounting(input, dictionaryJobPath, baseConf, minSupport);
            dictionaryChunks = DictionaryVectorizer.createDictionaryChunks(dictionaryJobPath, output, baseConf, chunkSizeInMegabytes, maxTermDimension);
        } else {
            CollocDriver.generateAllGrams(input, dictionaryJobPath, baseConf, maxNGramSize, minSupport, minLLRValue, numReducers);
            dictionaryChunks = DictionaryVectorizer.createDictionaryChunks(new Path(new Path(output, DICTIONARY_JOB_FOLDER), "ngrams"), output, baseConf, chunkSizeInMegabytes, maxTermDimension);
        }
        int partialVectorIndex = 0;
        ArrayList<Path> partialVectorPaths = Lists.newArrayList();
        for (Path dictionaryChunk : dictionaryChunks) {
            Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
            partialVectorPaths.add(partialVectorOutputPath);
            DictionaryVectorizer.makePartialVectors(input, baseConf, maxNGramSize, dictionaryChunk, partialVectorOutputPath, maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
        }
        Configuration conf = new Configuration(baseConf);
        Path outputDir = new Path(output, tfVectorsFolderName);
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, conf, normPower, logNormalize, maxTermDimension[0], sequentialAccess, namedVectors, numReducers);
        HadoopUtil.delete(conf, partialVectorPaths);
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     */
    private static List<Path> createDictionaryChunks(Path wordCountPath, Path dictionaryPathBase, Configuration baseConf, int chunkSizeInMegabytes, int[] maxTermDimension) throws IOException {
        ArrayList<Path> chunkPaths = Lists.newArrayList();
        Configuration conf = new Configuration(baseConf);
        FileSystem fs = FileSystem.get(wordCountPath.toUri(), conf);
        long chunkSizeLimit = (long)chunkSizeInMegabytes * 1024L * 1024L;
        int chunkIndex = 0;
        Path chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);
        SequenceFile.Writer dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
        try {
            long currentChunkSize = 0L;
            Path filesPattern = new Path(wordCountPath, OUTPUT_FILES_PATTERN);
            int i = 0;
            for (Pair record : new SequenceFileDirIterable(filesPattern, PathType.GLOB, null, null, true, conf)) {
                if (currentChunkSize > chunkSizeLimit) {
                    Closeables.close(dictWriter, false);
                    chunkPath = new Path(dictionaryPathBase, DICTIONARY_FILE + ++chunkIndex);
                    chunkPaths.add(chunkPath);
                    dictWriter = new SequenceFile.Writer(fs, conf, chunkPath, Text.class, IntWritable.class);
                    currentChunkSize = 0L;
                }
                Writable key = (Writable)record.getFirst();
                int fieldSize = 4 + key.toString().length() * 2 + 4;
                currentChunkSize += (long)fieldSize;
                dictWriter.append(key, new IntWritable(i++));
            }
            maxTermDimension[0] = i;
        }
        finally {
            Closeables.close(dictWriter, false);
        }
        return chunkPaths;
    }

    private static void makePartialVectors(Path input, Configuration baseConf, int maxNGramSize, Path dictionaryFilePath, Path output, int dimension, boolean sequentialAccess, boolean namedVectors, int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
        conf.setInt("vector.dimension", dimension);
        conf.setBoolean("vector.sequentialAccess", sequentialAccess);
        conf.setBoolean("vector.named", namedVectors);
        conf.setInt(MAX_NGRAMS, maxNGramSize);
        DistributedCache.setCacheFiles(new URI[]{dictionaryFilePath.toUri()}, conf);
        Job job = new Job(conf);
        job.setJobName("DictionaryVectorizer::MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath);
        job.setJarByClass(DictionaryVectorizer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(StringTuple.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VectorWritable.class);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        job.setMapperClass(Mapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setReducerClass(TFPartialVectorReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(numReducers);
        HadoopUtil.delete(conf, output);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }
    }

    private static void startWordCounting(Path input, Path output, Configuration baseConf, int minSupport) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
        conf.setInt(MIN_SUPPORT, minSupport);
        Job job = new Job(conf);
        job.setJobName("DictionaryVectorizer::WordCount: input-folder: " + input);
        job.setJarByClass(DictionaryVectorizer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, input);
        FileOutputFormat.setOutputPath(job, output);
        job.setMapperClass(TermCountMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setCombinerClass(TermCountCombiner.class);
        job.setReducerClass(TermCountReducer.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        HadoopUtil.delete(conf, output);
        boolean succeeded = job.waitForCompletion(true);
        if (!succeeded) {
            throw new IllegalStateException("Job failed!");
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        this.addInputOption();
        this.addOutputOption();
        this.addOption("tfDirName", "tf", "The folder to store the TF calculations", "tfDirName");
        this.addOption("minSupport", "s", "(Optional) Minimum Support. Default Value: 2", "2");
        this.addOption("maxNGramSize", "ng", "(Optional) The maximum size of ngrams to create (2 = bigrams, 3 = trigrams, etc) Default Value:1");
        this.addOption("minLLR", "ml", "(Optional)The minimum Log Likelihood Ratio(Float)  Default is 1.0");
        this.addOption("norm", "n", "The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  Must be greater or equal to 0.  The default is not to normalize");
        this.addOption("logNormalize", "lnorm", "(Optional) Whether output vectors should be logNormalize. If set true else false", "false");
        this.addOption(DefaultOptionCreator.numReducersOption().create());
        this.addOption("chunkSize", "chunk", "The chunkSize in MegaBytes. 100-10000 MB", "100");
        this.addOption(DefaultOptionCreator.methodOption().create());
        this.addOption("namedVector", "nv", "(Optional) Whether output vectors should be NamedVectors. If set true else false", "false");
        if (this.parseArguments(args) == null) {
            return -1;
        }
        String tfDirName = this.getOption("tfDirName", "tfDir");
        int minSupport = this.getInt("minSupport", 2);
        int maxNGramSize = this.getInt("maxNGramSize", 1);
        float minLLRValue = this.getFloat("minLLR", 1.0f);
        float normPower = this.getFloat("norm", -1.0f);
        boolean logNormalize = this.hasOption("logNormalize");
        int numReducers = this.getInt("maxRed");
        int chunkSizeInMegs = this.getInt("chunkSize", 100);
        boolean sequential = this.hasOption("sequential");
        boolean namedVecs = this.hasOption("namedVectors");
        DictionaryVectorizer.createTermFrequencyVectors(this.getInputPath(), this.getOutputPath(), tfDirName, this.getConf(), minSupport, maxNGramSize, minLLRValue, normPower, logNormalize, numReducers, chunkSizeInMegs, sequential, namedVecs);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new DictionaryVectorizer(), args);
    }
}

