package org.apache.mahout.vectorizer;

import com.ibm.icu.impl.locale.LanguageTag;
import java.util.List;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.lucene.AnalyzerUtils;
import org.apache.mahout.math.hadoop.stats.BasicStats;
import org.apache.mahout.vectorizer.collocations.llr.LLRReducer;
import org.apache.mahout.vectorizer.tfidf.TFIDFConverter;
import org.jboss.netty.handler.codec.rtsp.RtspHeaders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/vectorizer/SparseVectorsFromSequenceFiles.class */
/**
 * Command-line driver that tokenizes the documents found under an input path and turns them
 * into term-frequency (TF) or TF-IDF vectors under an output path.
 *
 * <p>The pipeline, as wired in {@link #run(String[])}, is:
 * <ol>
 *   <li>tokenize via {@code DocumentProcessor.tokenizeDocuments},</li>
 *   <li>build TF vectors via {@code DictionaryVectorizer.createTermFrequencyVectors},</li>
 *   <li>optionally compute document frequencies and prune high-DF terms via
 *       {@code TFIDFConverter.calculateDF} and {@code HighDFWordsPruner.pruneVectors},</li>
 *   <li>optionally convert to TF-IDF via {@code TFIDFConverter.processTfIdf}.</li>
 * </ol>
 */
public final class SparseVectorsFromSequenceFiles extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(SparseVectorsFromSequenceFiles.class);

    /** Status returned when help is requested or the command line cannot be parsed. */
    private static final int EXIT_USAGE = -1;

    /**
     * Program entry point; delegates to {@link ToolRunner}. The status returned by the tool is
     * not propagated to the JVM exit code.
     *
     * @param strArr raw command-line arguments
     * @throws Exception anything thrown by {@link #run(String[])}
     */
    public static void main(String[] strArr) throws Exception {
        ToolRunner.run(new SparseVectorsFromSequenceFiles(), strArr);
    }

    /**
     * Parses the command line and runs the vectorization pipeline.
     *
     * @param strArr command-line arguments (see the option descriptions built below)
     * @return {@code 0} when the pipeline completes; {@code -1} when help was requested or the
     *         command line is invalid
     * @throws Exception on non-numeric option values (other than the n-gram size, which falls
     *         back to its default), an unloadable analyzer class, or any failure raised by the
     *         underlying jobs
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();
        GroupBuilder groupBuilder = new GroupBuilder();

        DefaultOption inputOpt = DefaultOptionCreator.inputOption().create();
        DefaultOption outputOpt = DefaultOptionCreator.outputOption().create();
        DefaultOption minSupportOpt = optionBuilder.withLongName("minSupport")
                .withArgument(argumentBuilder.withName("minSupport").withMinimum(1).withMaximum(1).create())
                .withDescription("(Optional) Minimum Support. Default Value: 2")
                .withShortName("s")
                .create();
        DefaultOption analyzerNameOpt = optionBuilder.withLongName("analyzerName")
                .withArgument(argumentBuilder.withName("analyzerName").withMinimum(1).withMaximum(1).create())
                .withDescription("The class name of the analyzer")
                .withShortName("a")
                .create();
        DefaultOption chunkSizeOpt = optionBuilder.withLongName("chunkSize")
                .withArgument(argumentBuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
                .withDescription("The chunkSize in MegaBytes. Default Value: 100MB")
                .withShortName("chunk")
                .create();
        DefaultOption weightOpt = optionBuilder.withLongName("weight").withRequired(false)
                .withArgument(argumentBuilder.withName("weight").withMinimum(1).withMaximum(1).create())
                .withDescription("The kind of weight to use. Currently TF or TFIDF. Default: TFIDF")
                .withShortName("wt")
                .create();
        DefaultOption minDFOpt = optionBuilder.withLongName("minDF").withRequired(false)
                .withArgument(argumentBuilder.withName("minDF").withMinimum(1).withMaximum(1).create())
                .withDescription("The minimum document frequency.  Default is 1")
                .withShortName("md")
                .create();
        // Short names are spelled out as literals rather than borrowed from unrelated libraries.
        DefaultOption maxDFPercentOpt = optionBuilder.withLongName("maxDFPercent").withRequired(false)
                .withArgument(argumentBuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create())
                .withDescription("The max percentage of docs for the DF.  Can be used to remove really high frequency terms. Expressed as an integer between 0 and 100. Default is 99.  If maxDFSigma is also set, it will override this value.")
                .withShortName("x")
                .create();
        DefaultOption maxDFSigmaOpt = optionBuilder.withLongName("maxDFSigma").withRequired(false)
                .withArgument(argumentBuilder.withName("maxDFSigma").withMinimum(1).withMaximum(1).create())
                .withDescription("What portion of the tf (tf-idf) vectors to be used, expressed in times the standard deviation (sigma) of the document frequencies of these vectors. Can be used to remove really high frequency terms. Expressed as a double value. Good value to be specified is 3.0. In case the value is less than 0 no vectors will be filtered out. Default is -1.0.  Overrides maxDFPercent")
                .withShortName("xs")
                .create();
        DefaultOption minLLROpt = optionBuilder.withLongName(LLRReducer.MIN_LLR).withRequired(false)
                .withArgument(argumentBuilder.withName(LLRReducer.MIN_LLR).withMinimum(1).withMaximum(1).create())
                .withDescription("(Optional)The minimum Log Likelihood Ratio(Float)  Default is 1.0")
                .withShortName("ml")
                .create();
        DefaultOption numReducersOpt = optionBuilder.withLongName("numReducers")
                .withArgument(argumentBuilder.withName("numReducers").withMinimum(1).withMaximum(1).create())
                .withDescription("(Optional) Number of reduce tasks. Default Value: 1")
                .withShortName("nr")
                .create();
        DefaultOption normOpt = optionBuilder.withLongName("norm").withRequired(false)
                .withArgument(argumentBuilder.withName("norm").withMinimum(1).withMaximum(1).create())
                .withDescription("The norm to use, expressed as either a float or \"INF\" if you want to use the Infinite norm.  Must be greater or equal to 0.  The default is not to normalize")
                .withShortName("n")
                .create();
        DefaultOption logNormalizeOpt = optionBuilder.withLongName("logNormalize").withRequired(false)
                .withDescription("(Optional) Whether output vectors should be logNormalize. If set true else false")
                .withShortName("lnorm")
                .create();
        DefaultOption maxNGramSizeOpt = optionBuilder.withLongName("maxNGramSize").withRequired(false)
                .withArgument(argumentBuilder.withName("ngramSize").withMinimum(1).withMaximum(1).create())
                .withDescription("(Optional) The maximum size of ngrams to create (2 = bigrams, 3 = trigrams, etc) Default Value:1")
                .withShortName("ng")
                .create();
        DefaultOption sequentialAccessOpt = optionBuilder.withLongName("sequentialAccessVector").withRequired(false)
                .withDescription("(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false")
                .withShortName("seq")
                .create();
        DefaultOption namedVectorOpt = optionBuilder.withLongName("namedVector").withRequired(false)
                .withDescription("(Optional) Whether output vectors should be NamedVectors. If set true else false")
                .withShortName("nv")
                .create();
        DefaultOption overwriteOpt = optionBuilder.withLongName(DefaultOptionCreator.OVERWRITE_OPTION).withRequired(false)
                .withDescription("If set, overwrite the output directory")
                .withShortName("ow")
                .create();
        DefaultOption helpOpt = optionBuilder.withLongName("help")
                .withDescription("Print out help")
                .withShortName("h")
                .create();

        // The registration order below is kept exactly as it was.
        Group group = groupBuilder.withName("Options")
                .withOption(minSupportOpt)
                .withOption(analyzerNameOpt)
                .withOption(chunkSizeOpt)
                .withOption(outputOpt)
                .withOption(inputOpt)
                .withOption(minDFOpt)
                .withOption(maxDFSigmaOpt)
                .withOption(maxDFPercentOpt)
                .withOption(weightOpt)
                .withOption(normOpt)
                .withOption(minLLROpt)
                .withOption(numReducersOpt)
                .withOption(maxNGramSizeOpt)
                .withOption(overwriteOpt)
                .withOption(helpOpt)
                .withOption(sequentialAccessOpt)
                .withOption(namedVectorOpt)
                .withOption(logNormalizeOpt)
                .create();

        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            parser.setHelpOption(helpOpt);
            CommandLine cmdLine = parser.parse(strArr);
            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return EXIT_USAGE;
            }

            Path inputDir = new Path((String) cmdLine.getValue(inputOpt));
            Path outputDir = new Path((String) cmdLine.getValue(outputOpt));

            int chunkSize = intOption(cmdLine, chunkSizeOpt, 100);
            int minSupport = intOption(cmdLine, minSupportOpt, 2);

            // A malformed n-gram size is tolerated: warn and keep the default of 1.
            int maxNGramSize = 1;
            try {
                maxNGramSize = intOption(cmdLine, maxNGramSizeOpt, 1);
            } catch (NumberFormatException e) {
                log.warn("Could not parse ngram size option", e);
            }
            log.info("Maximum n-gram size is: {}", maxNGramSize);

            float minLLRValue = 1.0f;
            if (cmdLine.hasOption(minLLROpt)) {
                minLLRValue = Float.parseFloat(cmdLine.getValue(minLLROpt).toString());
            }
            log.info("Minimum LLR value: {}", minLLRValue);

            int reduceTasks = intOption(cmdLine, numReducersOpt, 1);
            log.info("Number of reduce tasks: {}", reduceTasks);

            Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
            if (cmdLine.hasOption(analyzerNameOpt)) {
                String analyzerClassName = cmdLine.getValue(analyzerNameOpt).toString();
                analyzerClass = Class.forName(analyzerClassName).asSubclass(Analyzer.class);
                // The created analyzer is discarded; presumably this is a fail-fast check that
                // the class can actually be instantiated before any job is launched.
                AnalyzerUtils.createAnalyzer(analyzerClass);
            }

            // TF-IDF unless "tf" is requested explicitly; anything else is a usage error.
            boolean processIdf = true;
            if (cmdLine.hasOption(weightOpt)) {
                String weight = cmdLine.getValue(weightOpt).toString();
                if ("tf".equalsIgnoreCase(weight)) {
                    processIdf = false;
                } else if (!"tfidf".equalsIgnoreCase(weight)) {
                    throw new OptionException(weightOpt);
                }
            }

            int minDf = intOption(cmdLine, minDFOpt, 1);
            int maxDFPercent = intOption(cmdLine, maxDFPercentOpt, 99);

            double maxDFSigma = -1.0d;
            if (cmdLine.hasOption(maxDFSigmaOpt)) {
                maxDFSigma = Double.parseDouble(cmdLine.getValue(maxDFSigmaOpt).toString());
            }

            float norm = -1.0f;
            if (cmdLine.hasOption(normOpt)) {
                String normText = cmdLine.getValue(normOpt).toString();
                norm = "INF".equals(normText) ? Float.POSITIVE_INFINITY : Float.parseFloat(normText);
            }

            boolean logNormalize = cmdLine.hasOption(logNormalizeOpt);
            boolean sequentialAccessOutput = cmdLine.hasOption(sequentialAccessOpt);
            boolean namedVectors = cmdLine.hasOption(namedVectorOpt);

            // Delete the old output only after every option has been validated, so that a bad
            // command line cannot destroy existing results.
            if (cmdLine.hasOption(overwriteOpt)) {
                HadoopUtil.delete(getConf(), outputDir);
            }

            log.info("Tokenizing documents in {}", inputDir);
            Configuration conf = getConf();
            Path tokenizedPath = new Path(outputDir, DocumentProcessor.TOKENIZED_DOCUMENT_OUTPUT_FOLDER);
            DocumentProcessor.tokenizeDocuments(inputDir, analyzerClass, tokenizedPath, conf);

            boolean shouldPrune = maxDFSigma >= 0.0d || maxDFPercent > 0;
            String tfDirName = shouldPrune
                    ? "tf-vectors-toprune"
                    : DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER;

            // When TF-IDF vectors are produced, normalization is left to processTfIdf below, so
            // the TF and pruning stages are run without it (-1 norm, no log-normalization).
            float tfNorm = processIdf ? -1.0f : norm;
            boolean tfLogNormalize = !processIdf && logNormalize;

            log.info("Creating Term Frequency Vectors");
            DictionaryVectorizer.createTermFrequencyVectors(tokenizedPath, outputDir, tfDirName, conf,
                    minSupport, maxNGramSize, minLLRValue, tfNorm, tfLogNormalize, reduceTasks, chunkSize,
                    sequentialAccessOutput, namedVectors);

            Pair<Long[], List<Path>> docFrequenciesFeatures = null;
            if (shouldPrune || processIdf) {
                log.info("Calculating IDF");
                docFrequenciesFeatures =
                        TFIDFConverter.calculateDF(new Path(outputDir, tfDirName), outputDir, conf, chunkSize);
            }

            long maxDF = maxDFPercent;
            if (shouldPrune) {
                // NOTE(review): element [1] is used as the total vector (document) count - confirm
                // against TFIDFConverter.calculateDF.
                long vectorCount = docFrequenciesFeatures.getFirst()[1].longValue();
                if (maxDFSigma >= 0.0d) {
                    // A sigma setting overrides the percentage: derive the percentage from the
                    // standard deviation of the document frequencies.
                    Path dfDir = new Path(outputDir, TFIDFConverter.WORDCOUNT_OUTPUT_FOLDER);
                    Path stdCalcDir = new Path(outputDir, HighDFWordsPruner.STD_CALC_DIR);
                    double stdDev = BasicStats.stdDevForGivenMean(dfDir, stdCalcDir, 0.0d, conf);
                    maxDF = (int) (100.0d * maxDFSigma * stdDev / vectorCount);
                }
                // Explicit narrowing cast: the float product cannot be assigned to a long as is.
                long maxDFThreshold = (long) (vectorCount * (maxDF / 100.0f));

                Path tfDir = new Path(outputDir, tfDirName);
                Path prunedTFDir = new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER);
                Path prunedPartialTFDir = new Path(outputDir, "tf-vectors-partial");
                log.info("Pruning");
                HighDFWordsPruner.pruneVectors(tfDir, prunedTFDir, prunedPartialTFDir, maxDFThreshold, minDf,
                        conf, docFrequenciesFeatures, tfNorm, tfLogNormalize, reduceTasks);
                HadoopUtil.delete(new Configuration(conf), tfDir);
            }

            if (processIdf) {
                TFIDFConverter.processTfIdf(
                        new Path(outputDir, DictionaryVectorizer.DOCUMENT_VECTOR_OUTPUT_FOLDER), outputDir, conf,
                        docFrequenciesFeatures, minDf, maxDF, norm, logNormalize, sequentialAccessOutput,
                        namedVectors, reduceTasks);
            }
            return 0;
        } catch (OptionException e) {
            log.error("Exception", e);
            CommandLineUtil.printHelp(group);
            // An invalid command line is a failure; report it the same way as the help path.
            return EXIT_USAGE;
        }
    }

    /**
     * Reads an integer-valued option.
     *
     * @param cmdLine parsed command line
     * @param option option to look up
     * @param defaultValue value returned when the option is absent
     * @return the parsed option value, or {@code defaultValue} when the option was not given
     * @throws NumberFormatException if the option is present but is not a valid integer
     */
    private static int intOption(CommandLine cmdLine, DefaultOption option, int defaultValue) {
        if (!cmdLine.hasOption(option)) {
            return defaultValue;
        }
        return Integer.parseInt(cmdLine.getValue(option).toString());
    }
}
