package org.apache.mahout.vectorizer.tfidf;

import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.term.TermDocumentCountMapper;
import org.apache.mahout.vectorizer.term.TermDocumentCountReducer;

/* loaded from: input_file:org/apache/mahout/vectorizer/tfidf/TFIDFConverter.class */
/**
 * Static entry points that turn term-frequency vectors into TF-IDF vectors with Hadoop jobs.
 *
 * <p>The work is split in two public steps:
 * <ol>
 *   <li>{@link #calculateDF} runs a document-frequency counting job and splits its output into
 *       size-bounded frequency chunk files;</li>
 *   <li>{@link #processTfIdf} runs one partial-vector job per chunk file and then merges the
 *       partial vectors into the final output folder.</li>
 * </ol>
 *
 * <p>This class is not instantiable.
 */
public final class TFIDFConverter {
    public static final String VECTOR_COUNT = "vector.count";
    public static final String FEATURE_COUNT = "feature.count";
    public static final String MIN_DF = "min.df";
    public static final String MAX_DF = "max.df";
    private static final String DOCUMENT_VECTOR_OUTPUT_FOLDER = "tfidf-vectors";
    private static final String FREQUENCY_FILE = "frequency.file-";
    private static final int MAX_CHUNKSIZE = 10000;
    private static final int MIN_CHUNKSIZE = 100;
    private static final String OUTPUT_FILES_PATTERN = "part-*";
    private static final int SEQUENCEFILE_BYTE_OVERHEAD = 45;
    private static final String VECTOR_OUTPUT_FOLDER = "partial-vectors-";
    public static final String WORDCOUNT_OUTPUT_FOLDER = "df-count";

    /** Value written to {@code io.serializations} for both jobs launched by this class. */
    private static final String JOB_SERIALIZATIONS =
        "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization";

    /**
     * Size estimate charged per frequency record when deciding to roll over to a new chunk file:
     * the per-record overhead constant plus a 4-byte int key and an 8-byte long value (57 bytes).
     */
    private static final int RECORD_SIZE_ESTIMATE = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;

    /** Sentinel accepted for {@code normPower} that bypasses both range checks. */
    private static final float NORM_POWER_UNSET = -1.0f;

    private TFIDFConverter() {
    }

    /**
     * Builds partial vectors for every frequency chunk and merges them into
     * {@code <output>/tfidf-vectors}; the intermediate partial-vector folders are deleted afterwards.
     *
     * @param input input path handed to each partial-vector job
     * @param output base folder; partial vectors go to {@code partial-vectors-N} below it
     * @param baseConf configuration used as the template for the jobs and passed to the merger
     * @param datasetFeatures as returned by {@link #calculateDF}: element 0 of the array is the
     *     feature count, element 1 the vector count; the list holds the frequency chunk files
     * @param minDf stored in the job configuration under {@link #MIN_DF}
     * @param maxDf stored in the job configuration under {@link #MAX_DF}
     * @param normPower {@code -1.0f}, or a non-negative value; when {@code logNormalize} is set it
     *     must additionally be greater than 1 and finite
     * @param logNormalize whether log normalization is requested (forwarded to the merger)
     * @param sequentialAccessOutput stored under {@code PartialVectorMerger.SEQUENTIAL_ACCESS}
     * @param namedVector stored under {@code PartialVectorMerger.NAMED_VECTOR}
     * @param numReducers forwarded as the last argument of
     *     {@code PartialVectorMerger.mergePartialVectors} (presumably the reducer count)
     * @throws IllegalArgumentException if {@code normPower} violates the constraints above
     * @throws IllegalStateException if one of the partial-vector jobs fails
     */
    public static void processTfIdf(Path input, Path output, Configuration baseConf, Pair<Long[], List<Path>> datasetFeatures, int minDf, long maxDf, float normPower, boolean logNormalize, boolean sequentialAccessOutput, boolean namedVector, int numReducers) throws IOException, InterruptedException, ClassNotFoundException {
        Preconditions.checkArgument(normPower == NORM_POWER_UNSET || normPower >= 0.0f,
            "If specified normPower must be nonnegative", normPower);
        Preconditions.checkArgument(
            normPower == NORM_POWER_UNSET || (normPower > 1.0f && !Float.isInfinite(normPower)) || !logNormalize,
            "normPower must be > 1 and not infinite if log normalization is chosen", normPower);

        Long[] counts = datasetFeatures.getFirst();
        List<Path> partialVectorPaths = Lists.newArrayList();
        int partialVectorIndex = 0;
        for (Path dictionaryChunk : datasetFeatures.getSecond()) {
            Path partialVectorOutputPath = new Path(output, VECTOR_OUTPUT_FOLDER + partialVectorIndex++);
            partialVectorPaths.add(partialVectorOutputPath);
            makePartialVectors(input, baseConf, counts[0], counts[1], minDf, maxDf, dictionaryChunk,
                partialVectorOutputPath, sequentialAccessOutput, namedVector);
        }

        // The merger receives the caller's configuration; only the cleanup uses a private copy.
        Configuration conf = new Configuration(baseConf);
        Path outputDir = new Path(output, DOCUMENT_VECTOR_OUTPUT_FOLDER);
        PartialVectorMerger.mergePartialVectors(partialVectorPaths, outputDir, baseConf, normPower, logNormalize,
            counts[0].intValue(), sequentialAccessOutput, namedVector, numReducers);
        HadoopUtil.delete(conf, partialVectorPaths);
    }

    /**
     * Runs the document-frequency counting job into {@code <output>/df-count} and splits the result
     * into frequency chunk files below {@code output}.
     *
     * @param input input path of the counting job
     * @param output base folder for the count output and the chunk files
     * @param baseConf configuration used as the template for the job
     * @param chunkSizeInMegabytes requested chunk size; clamped to the range [100, 10000]
     * @return the feature count and vector count (array elements 0 and 1) together with the list of
     *     chunk files that were written
     * @throws IllegalStateException if the counting job fails
     */
    public static Pair<Long[], List<Path>> calculateDF(Path input, Path output, Configuration baseConf, int chunkSizeInMegabytes) throws IOException, InterruptedException, ClassNotFoundException {
        int clampedChunkSize = Math.max(MIN_CHUNKSIZE, Math.min(MAX_CHUNKSIZE, chunkSizeInMegabytes));
        Path wordCountPath = new Path(output, WORDCOUNT_OUTPUT_FOLDER);
        startDFCounting(input, wordCountPath, baseConf);
        return createDictionaryChunks(wordCountPath, output, baseConf, clampedChunkSize);
    }

    /**
     * Copies the {@code part-*} records found under {@code featureCountPath} into numbered
     * {@code frequency.file-N} sequence files, starting a new file whenever the running size
     * estimate of the current one exceeds the chunk limit.
     *
     * <p>Records with a non-negative key are copied. A record keyed {@code -1} is not copied; its
     * value becomes the vector count. Other negative keys are ignored. Every record read, copied
     * or not, adds to the size estimate.
     *
     * @return array element 0 is the largest key seen plus one, element 1 is the vector count
     *     ({@code Long.MAX_VALUE} if no {@code -1} record was seen); the list holds every chunk
     *     file created, in order
     */
    private static Pair<Long[], List<Path>> createDictionaryChunks(Path featureCountPath, Path dictionaryPathBase, Configuration baseConf, int chunkSizeInMegabytes) throws IOException {
        List<Path> chunkPaths = Lists.newArrayList();
        Configuration conf = new Configuration(baseConf);
        FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

        // Multiply as long: in int arithmetic any size of 2048 MB or more overflows, and a
        // negative limit would open a new chunk file for every single record.
        long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;

        int chunkIndex = 0;
        Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
        chunkPaths.add(chunkPath);
        SequenceFile.Writer freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);

        // Guava's close idiom: only let a close() failure propagate when nothing else went wrong,
        // so that it can never hide the exception that actually broke the loop.
        boolean threw = true;
        try {
            long currentChunkSize = 0;
            long maxTermId = 0;
            long vectorCount = Long.MAX_VALUE;
            Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
            for (Pair<IntWritable, LongWritable> record
                : new SequenceFileDirIterable<IntWritable, LongWritable>(filesPattern, PathType.GLOB, null, null, true, conf)) {
                if (currentChunkSize > chunkSizeLimit) {
                    Closeables.close(freqWriter, false);
                    chunkIndex++;
                    chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
                    chunkPaths.add(chunkPath);
                    freqWriter = new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
                    currentChunkSize = 0;
                }
                currentChunkSize += RECORD_SIZE_ESTIMATE;

                IntWritable key = record.getFirst();
                LongWritable value = record.getSecond();
                if (key.get() >= 0) {
                    freqWriter.append(key, value);
                } else if (key.get() == -1) {
                    vectorCount = value.get();
                }
                maxTermId = Math.max(key.get(), maxTermId);
            }

            Long[] counts = {maxTermId + 1, vectorCount};
            Pair<Long[], List<Path>> result = new Pair<Long[], List<Path>>(counts, chunkPaths);
            threw = false;
            return result;
        } finally {
            Closeables.close(freqWriter, threw);
        }
    }

    /**
     * Configures and runs one partial-vector job for a single frequency chunk and blocks until it
     * finishes. The chunk file is registered as the job's only distributed-cache file, and any
     * existing {@code output} is deleted before the job starts.
     *
     * @throws IllegalStateException if the job does not complete successfully
     */
    private static void makePartialVectors(Path input, Configuration baseConf, long featureCount, long vectorCount, int minDf, long maxDf, Path dictionaryFilePath, Path output, boolean sequentialAccess, boolean namedVector) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set(CommonConfigurationKeysPublic.IO_SERIALIZATIONS_KEY, JOB_SERIALIZATIONS);
        conf.setLong(FEATURE_COUNT, featureCount);
        conf.setLong(VECTOR_COUNT, vectorCount);
        conf.setInt(MIN_DF, minDf);
        conf.setLong(MAX_DF, maxDf);
        conf.setBoolean(PartialVectorMerger.SEQUENTIAL_ACCESS, sequentialAccess);
        conf.setBoolean(PartialVectorMerger.NAMED_VECTOR, namedVector);
        DistributedCache.setCacheFiles(new URI[] {dictionaryFilePath.toUri()}, conf);

        Job job = new Job(conf);
        job.setJobName(": MakePartialVectors: input-folder: " + input + ", dictionary-file: " + dictionaryFilePath.toString());
        job.setJarByClass(TFIDFConverter.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, input);
        // The base Mapper class is used as-is; all the work happens in the reducer.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(TFIDFPartialVectorReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(VectorWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, output);

        HadoopUtil.delete(conf, output);
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed!");
        }
    }

    /**
     * Configures and runs the document-frequency counting job and blocks until it finishes.
     * {@code TermDocumentCountReducer} serves as both combiner and reducer, and any existing
     * {@code output} is deleted before the job starts.
     *
     * @throws IllegalStateException if the job does not complete successfully
     */
    private static void startDFCounting(Path input, Path output, Configuration baseConf) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration(baseConf);
        conf.set(CommonConfigurationKeysPublic.IO_SERIALIZATIONS_KEY, JOB_SERIALIZATIONS);

        Job job = new Job(conf);
        job.setJobName("VectorTfIdf Document Frequency Count running over input: " + input);
        job.setJarByClass(TFIDFConverter.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, input);
        job.setMapperClass(TermDocumentCountMapper.class);
        job.setCombinerClass(TermDocumentCountReducer.class);
        job.setReducerClass(TermDocumentCountReducer.class);

        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LongWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, output);

        HadoopUtil.delete(conf, output);
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed!");
        }
    }
}
