package org.apache.mahout.vectorizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.ClassUtils;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
import org.apache.mahout.vectorizer.encoders.LuceneTextValueEncoder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/vectorizer/EncodedVectorsFromSequenceFiles.class */
/**
 * Command-line job that reads its options, validates them, and hands the input and
 * output paths to {@link SimpleTextEncodingVectorizer#createVectors} together with a
 * {@link VectorizerConfig} built from those options.
 *
 * <p>Supported options (in addition to the standard input/output/analyzer/overwrite
 * options): {@code sequentialAccessVector}, {@code namedVector},
 * {@link EncodingMapper#CARDINALITY}, {@link EncodingMapper#ENCODER_FIELD_NAME} and
 * {@link EncodingMapper#ENCODER_CLASS}.
 */
public final class EncodedVectorsFromSequenceFiles extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(EncodedVectorsFromSequenceFiles.class);

    /** Long name of the flag requesting sequential-access output vectors. */
    private static final String SEQUENTIAL_ACCESS_OPTION = "sequentialAccessVector";

    /** Long name of the flag requesting named output vectors. */
    private static final String NAMED_VECTOR_OPTION = "namedVector";

    /**
     * Short flag for the cardinality option. Kept as a literal owned by this class so the
     * command-line interface does not depend on a constant from an unrelated library.
     */
    private static final String CARDINALITY_SHORT_NAME = "c";

    /** Cardinality used when the option is not supplied. */
    private static final int DEFAULT_CARDINALITY = 5000;

    /** Encoder construction argument used when the option is not supplied. */
    private static final String DEFAULT_ENCODER_FIELD_NAME = "text";

    /**
     * Entry point: runs this job through {@link ToolRunner} with a fresh {@link Configuration}.
     *
     * @param strArr command-line arguments
     * @throws Exception if the job fails
     */
    public static void main(String[] strArr) throws Exception {
        ToolRunner.run(new Configuration(), new EncodedVectorsFromSequenceFiles(), strArr);
    }

    /**
     * Registers the command-line options, parses and validates them, and then runs the
     * vectorizer.
     *
     * <p>All option validation happens before the output location is deleted for the
     * overwrite option, so an invalid argument never destroys existing output.
     *
     * @param strArr command-line arguments
     * @return {@code -1} if {@code parseArguments} returns {@code null}, otherwise {@code 0}
     * @throws IllegalArgumentException if the cardinality is not a positive integer
     *         (a non-numeric value surfaces as {@link NumberFormatException})
     * @throws Exception if option handling, encoder instantiation, output deletion or
     *         vectorization fails
     */
    public int run(String[] strArr) throws Exception {
        addInputOption();
        addOutputOption();
        addOption(DefaultOptionCreator.analyzerOption().create());
        addOption(buildOption(SEQUENTIAL_ACCESS_OPTION, "seq", "(Optional) Whether output vectors should be SequentialAccessVectors. If set true else false", false, false, null));
        addOption(buildOption(NAMED_VECTOR_OPTION, "nv", "Create named vectors using the key.  False by default", false, false, null));
        addOption(EncodingMapper.CARDINALITY, CARDINALITY_SHORT_NAME, "The cardinality to use for creating the vectors.  Default is 5000", String.valueOf(DEFAULT_CARDINALITY));
        addOption(EncodingMapper.ENCODER_FIELD_NAME, "en", "The name of the encoder to be passed to the FeatureVectorEncoder constructor.  Default is text.  Note this is not the class name of a FeatureValueEncoder, but is instead the construction argument.", DEFAULT_ENCODER_FIELD_NAME);
        addOption(EncodingMapper.ENCODER_CLASS, "ec", "The class name of the encoder to be used. Default is " + LuceneTextValueEncoder.class.getName(), LuceneTextValueEncoder.class.getName());
        addOption(DefaultOptionCreator.overwriteOption().create());
        if (parseArguments(strArr) == null) {
            return -1;
        }

        Path inputPath = getInputPath();
        Path outputPath = getOutputPath();
        Configuration conf = getConf();

        // Resolve and validate every option first; nothing destructive has happened yet.
        Class<? extends Analyzer> analyzerClass = getAnalyzerClassFromOption();
        boolean sequentialAccess = hasOption(SEQUENTIAL_ACCESS_OPTION);
        boolean namedVectors = hasOption(NAMED_VECTOR_OPTION);
        int cardinality = resolveCardinality();

        String encoderFieldName = DEFAULT_ENCODER_FIELD_NAME;
        if (hasOption(EncodingMapper.ENCODER_FIELD_NAME)) {
            encoderFieldName = getOption(EncodingMapper.ENCODER_FIELD_NAME);
        }

        String encoderClassName = LuceneTextValueEncoder.class.getName();
        if (hasOption(EncodingMapper.ENCODER_CLASS)) {
            encoderClassName = getOption(EncodingMapper.ENCODER_CLASS);
            // The instance is discarded: this is a trial construction with a single String
            // argument so that an unusable encoder class is reported here, up front.
            ClassUtils.instantiateAs(encoderClassName, FeatureVectorEncoder.class,
                    new Class<?>[] {String.class}, new Object[] {encoderFieldName});
        }

        // Only now, with all arguments known to be usable, remove existing output.
        if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
            HadoopUtil.delete(conf, outputPath);
        }

        VectorizerConfig config = new VectorizerConfig(conf, analyzerClass.getName(), encoderClassName,
                encoderFieldName, sequentialAccess, namedVectors, cardinality);
        new SimpleTextEncodingVectorizer().createVectors(inputPath, outputPath, config);
        return 0;
    }

    /**
     * Reads the cardinality option, falling back to {@link #DEFAULT_CARDINALITY} when absent.
     *
     * @return a strictly positive cardinality
     * @throws IllegalArgumentException if the supplied value is zero or negative
     *         (a non-numeric value surfaces as {@link NumberFormatException})
     */
    private int resolveCardinality() {
        if (!hasOption(EncodingMapper.CARDINALITY)) {
            return DEFAULT_CARDINALITY;
        }
        int cardinality = Integer.parseInt(getOption(EncodingMapper.CARDINALITY));
        if (cardinality <= 0) {
            // A vector dimension must be positive; fail with a clear message instead of
            // passing an unusable size on to the vectorizer.
            throw new IllegalArgumentException(
                    "--" + EncodingMapper.CARDINALITY + " must be a positive integer, got " + cardinality);
        }
        return cardinality;
    }
}
