package org.apache.mahout.utils;

import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.BitSet;
import org.apache.commons.cli2.OptionException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.jet.random.sampling.RandomSampler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/utils/SplitInput.class */
/**
 * Job that splits every file of an input directory into a training part and a test part.
 *
 * <p>In sequential mode each input file is read row by row (text lines, or key/value records
 * when {@code sequenceFiles} is set) and every row is written to a file of the same name in
 * either the training or the test output directory. The test rows are either one contiguous
 * run (its length given by {@code testSplitSize} or {@code testSplitPct}, its start by
 * {@code splitLocation}) or a random selection (given by {@code randomSelectionSize} or
 * {@code randomSelectionPct}). In map-reduce mode the work is delegated to
 * {@code SplitInputJob}.
 *
 * <p>All sizes and percentages use {@code -1} as the "not set" marker; {@link #validate()}
 * requires exactly one of the four selection settings to be positive.
 */
public class SplitInput extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(SplitInput.class);

    /** Number of rows held back as a contiguous test block; -1 when unset. */
    private int testSplitSize = -1;
    /** Percentage (0-100) of rows held back as a contiguous test block; -1 when unset. */
    private int testSplitPct = -1;
    /** Start of the contiguous test block as a percentage of the input size (0=start, 100=end). */
    private int splitLocation = 100;
    /** Number of rows randomly selected as test data; -1 when unset. */
    private int testRandomSelectionSize = -1;
    /** Percentage (0-100) of rows randomly selected as test data; -1 when unset. */
    private int testRandomSelectionPct = -1;
    /** Percentage of the data kept in map-reduce mode; handed to SplitInputJob unchanged. */
    private int keepPct = 100;
    /** Character encoding used to read and write text files. */
    private Charset charset = Charsets.UTF_8;
    /** True when the input files are SequenceFiles rather than text files. */
    private boolean useSequence;
    /** True when the split is delegated to the map-reduce job. */
    private boolean useMapRed;
    private Path inputDirectory;
    private Path trainingOutputDirectory;
    private Path testOutputDirectory;
    private Path mapRedOutputDirectory;
    /** Optional listener notified after each file has been split; may be null. */
    private SplitCallback callback;

    /** Listener invoked by {@link SplitInput#splitFile(Path)} once a file has been split. */
    public interface SplitCallback {
        /**
         * Reports the outcome of splitting one file.
         *
         * @param path the input file that was split
         * @param i    number of rows counted in the input file
         * @param i2   number of rows written to the training output
         * @param i3   number of rows written to the test output
         * @param i4   row offset after which the contiguous test block starts (0 for random selection)
         */
        void splitComplete(Path path, int i, int i2, int i3, int i4);
    }

    /**
     * Parses the command line and, if it is valid, splits the configured input directory.
     *
     * @param strArr the command-line arguments
     * @return always 0, including when the arguments could not be parsed
     */
    public int run(String[] strArr) throws Exception {
        if (parseArgs(strArr)) {
            splitDirectory();
        }
        return 0;
    }

    /** Command-line entry point; runs this job through {@link ToolRunner}. */
    public static void main(String[] strArr) throws Exception {
        ToolRunner.run(new Configuration(), new SplitInput(), strArr);
    }

    /**
     * Registers the supported options, parses the arguments and configures this instance.
     *
     * @param strArr the command-line arguments
     * @return true when the job is fully configured; false when parsing failed or an option
     *         constraint was violated (help is printed in the latter case)
     */
    private boolean parseArgs(String[] strArr) throws Exception {
        addInputOption();
        addOption("trainingOutput", "tr", "The training data output directory", false);
        addOption("testOutput", "te", "The test data output directory", false);
        addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
        addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
        addOption("splitLocation", "sl", "Location for start of test data expressed as a percentage of the input file size (0=start, 50=middle, 100=end", false);
        addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
        addOption("randomSelectionPct", "rp", "Percentage of items to be randomly selected as test data when using mapreduce mode", false);
        addOption("charset", "c", "The name of the character encoding of the input files (not needed if using SequenceFiles)", false);
        addOption(buildOption("sequenceFiles", "seq", "Set if the input files are sequence files.  Default is false", false, false, "false"));
        addOption(DefaultOptionCreator.methodOption().create());
        addOption(DefaultOptionCreator.overwriteOption().create());
        addOption("keepPct", "k", "The percentage of total data to keep in map-reduce mode, the rest will be ignored.  Default is 100%", false);
        addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);
        if (parseArguments(strArr) == null) {
            return false;
        }

        try {
            this.inputDirectory = getInputPath();
            this.useMapRed = "mapreduce".equalsIgnoreCase(getOption("method"));

            if (this.useMapRed) {
                if (!hasOption("randomSelectionPct")) {
                    throw new OptionException(getCLIOption("randomSelectionPct"), "must set randomSelectionPct when mapRed option is used");
                }
                if (!hasOption("mapRedOutputDir")) {
                    throw new OptionException(getCLIOption("mapRedOutputDir"), "mapRedOutputDir must be set when mapRed option is used");
                }
                this.mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
                if (hasOption("keepPct")) {
                    this.keepPct = Integer.parseInt(getOption("keepPct"));
                }
                if (hasOption("overwrite")) {
                    HadoopUtil.delete(getConf(), this.mapRedOutputDirectory);
                }
            } else {
                if (!hasOption("trainingOutput") || !hasOption("testOutput")) {
                    throw new OptionException(getCLIOption("trainingOutput"), "trainingOutput and testOutput must be set if mapRed option is not used");
                }
                if (!hasOption("testSplitSize") && !hasOption("testSplitPct") && !hasOption("randomSelectionPct") && !hasOption("randomSelectionSize")) {
                    throw new OptionException(getCLIOption("testSplitSize"), "must set one of test split size/percentage or randomSelectionSize/percentage");
                }
                this.trainingOutputDirectory = new Path(getOption("trainingOutput"));
                this.testOutputDirectory = new Path(getOption("testOutput"));
                FileSystem fileSystem = this.trainingOutputDirectory.getFileSystem(getConf());
                if (hasOption("overwrite")) {
                    HadoopUtil.delete(fileSystem.getConf(), this.trainingOutputDirectory);
                    HadoopUtil.delete(fileSystem.getConf(), this.testOutputDirectory);
                }
                // validate() below requires both output directories to exist.
                fileSystem.mkdirs(this.trainingOutputDirectory);
                fileSystem.mkdirs(this.testOutputDirectory);
            }

            if (hasOption("charset")) {
                this.charset = Charset.forName(getOption("charset"));
            }
            if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
                throw new OptionException(getCLIOption("testSplitPct"), "must have either split size or split percentage option, not BOTH");
            }
            if (hasOption("testSplitSize")) {
                setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
            }
            if (hasOption("testSplitPct")) {
                setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
            }
            if (hasOption("splitLocation")) {
                setSplitLocation(Integer.parseInt(getOption("splitLocation")));
            }
            if (hasOption("randomSelectionSize")) {
                setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
            }
            if (hasOption("randomSelectionPct")) {
                setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
            }
            this.useSequence = hasOption("sequenceFiles");
            validate();
            return true;
        } catch (OptionException e) {
            log.error("Command-line option Exception", e);
            CommandLineUtil.printHelp(getGroup());
            return false;
        }
    }

    /** Splits the configured input directory using this job's configuration. */
    public void splitDirectory() throws IOException, ClassNotFoundException, InterruptedException {
        splitDirectory(this.inputDirectory);
    }

    /**
     * Splits the given directory using this job's configuration.
     *
     * @param path the directory whose files are split
     */
    public void splitDirectory(Path path) throws IOException, ClassNotFoundException, InterruptedException {
        splitDirectory(getConf(), path);
    }

    /**
     * Splits every regular file directly inside {@code path}; sub-directories are skipped.
     * In map-reduce mode the whole directory is handed to {@code SplitInputJob} instead.
     *
     * @param configuration configuration used to resolve the file system (and passed to the job)
     * @param path          the directory whose files are split
     * @throws IOException if {@code path} does not exist or is not a directory
     */
    public void splitDirectory(Configuration configuration, Path path) throws IOException, ClassNotFoundException, InterruptedException {
        FileSystem fileSystem = path.getFileSystem(configuration);
        FileStatus inputStatus = fileSystem.getFileStatus(path);
        if (inputStatus == null) {
            throw new IOException(path + " does not exist");
        }
        if (!inputStatus.isDir()) {
            throw new IOException(path + " is not a directory");
        }
        if (this.useMapRed) {
            SplitInputJob.run(configuration, path, this.mapRedOutputDirectory, this.keepPct, this.testRandomSelectionPct);
            return;
        }
        for (FileStatus fileStatus : fileSystem.listStatus(path, PathFilters.logsCRCFilter())) {
            if (!fileStatus.isDir()) {
                splitFile(fileStatus.getPath());
            }
        }
    }

    /**
     * Splits a single file into a training file and a test file of the same name, written to
     * the training and test output directories respectively, then notifies the callback.
     *
     * @param path the file to split
     * @throws IOException              if {@code path} does not exist, is a directory, or I/O fails
     * @throws IllegalArgumentException if the settings are invalid or the requested test block
     *                                  does not fit into the file
     */
    public void splitFile(Path path) throws IOException {
        FileSystem fileSystem = path.getFileSystem(getConf());
        FileStatus inputStatus = fileSystem.getFileStatus(path);
        if (inputStatus == null) {
            throw new IOException(path + " does not exist");
        }
        if (inputStatus.isDir()) {
            throw new IOException(path + " is a directory");
        }
        validate();

        Path testOutputFile = new Path(this.testOutputDirectory, path.getName());
        Path trainingOutputFile = new Path(this.trainingOutputDirectory, path.getName());

        // Rows are routed by their 1-based position, so the total must be counted in the same
        // unit: records for sequence files, text lines otherwise.
        int lineCount = this.useSequence
            ? countSequenceRecords(path, fileSystem.getConf())
            : countLines(fileSystem, path, this.charset);
        log.info("{} has {} lines", path.getName(), lineCount);

        int testSplitStart = 0;
        int testSize = this.testSplitSize;
        BitSet randomSelection = null;

        if (this.testRandomSelectionPct > 0 || this.testRandomSelectionSize > 0) {
            testSize = this.testRandomSelectionSize;
            if (this.testRandomSelectionPct > 0) {
                testSize = percentOf(lineCount, this.testRandomSelectionPct);
            }
            // Cannot select more rows than the file contains.
            testSize = Math.min(testSize, lineCount);
            log.info("{} test split size is {} based on random selection percentage {}", new Object[] {path.getName(), testSize, this.testRandomSelectionPct});

            // Draw testSize distinct indices from [0, lineCount - 1] and shift them to the
            // 1-based row positions used while copying.
            long[] selected = new long[testSize];
            RandomSampler.sample(testSize, lineCount, testSize, 0L, selected, 0, RandomUtils.getRandom());
            randomSelection = new BitSet(lineCount + 1);
            for (long index : selected) {
                randomSelection.set((int) index + 1);
            }
        } else {
            if (this.testSplitPct > 0) {
                testSize = percentOf(lineCount, this.testSplitPct);
                log.info("{} test split size is {} based on percentage {}", new Object[] {path.getName(), testSize, this.testSplitPct});
            } else {
                log.info("{} test split size is {}", path.getName(), testSize);
            }
            if (this.splitLocation > 0) {
                testSplitStart = percentOf(lineCount, this.splitLocation);
                // Pull the start back so that the whole test block fits before the end of file.
                if (lineCount - testSplitStart < testSize) {
                    testSplitStart = lineCount - testSize;
                }
                log.info("{} test split start is {} based on split location {}", new Object[] {path.getName(), testSplitStart, this.splitLocation});
            }
            if (testSplitStart < 0) {
                throw new IllegalArgumentException("test split size for " + path + " is too large, it would produce an empty training set from the initial set of " + lineCount + " examples");
            }
            if (lineCount - testSize < testSize) {
                log.warn("Test set size for {} may be too large, {} is larger than the number of lines remaining in the training set: {}", new Object[] {path, testSize, lineCount - testSize});
            }
        }

        RowRouter router = new RowRouter(randomSelection, testSplitStart, testSize);
        if (this.useSequence) {
            copySequenceFile(fileSystem, path, trainingOutputFile, testOutputFile, router);
        } else {
            copyTextFile(fileSystem, path, trainingOutputFile, testOutputFile, router);
        }

        log.info("file: {}, input: {} train: {}, test: {} starting at {}", new Object[] {path.getName(), lineCount, router.trainingCount, router.testCount, testSplitStart});
        if (this.callback != null) {
            this.callback.splitComplete(path, lineCount, router.trainingCount, router.testCount, testSplitStart);
        }
    }

    /** Copies every record of a sequence file to the training or test writer chosen by the router. */
    private static void copySequenceFile(FileSystem fileSystem, Path input, Path trainingOutputFile,
                                         Path testOutputFile, RowRouter router) throws IOException {
        SequenceFileIterator<Writable, Writable> records = null;
        SequenceFile.Writer trainingWriter = null;
        SequenceFile.Writer testWriter = null;
        try {
            Configuration conf = fileSystem.getConf();
            records = new SequenceFileIterator<Writable, Writable>(input, false, conf);
            trainingWriter = SequenceFile.createWriter(fileSystem, conf, trainingOutputFile, records.getKeyClass(), records.getValueClass());
            testWriter = SequenceFile.createWriter(fileSystem, conf, testOutputFile, records.getKeyClass(), records.getValueClass());
            while (records.hasNext()) {
                SequenceFile.Writer target = router.nextGoesToTest() ? testWriter : trainingWriter;
                Pair<Writable, Writable> record = records.next();
                target.append(record.getFirst(), record.getSecond());
            }
        } finally {
            // Closed exactly once, after the whole file is copied; close() tolerates nulls left
            // behind when opening one of the resources failed.
            Closeables.close(records, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }

    /** Copies every line of a text file to the training or test writer chosen by the router. */
    private void copyTextFile(FileSystem fileSystem, Path input, Path trainingOutputFile,
                              Path testOutputFile, RowRouter router) throws IOException {
        BufferedReader reader = null;
        OutputStreamWriter trainingWriter = null;
        OutputStreamWriter testWriter = null;
        try {
            reader = new BufferedReader(new InputStreamReader(fileSystem.open(input), this.charset));
            trainingWriter = new OutputStreamWriter(fileSystem.create(trainingOutputFile), this.charset);
            testWriter = new OutputStreamWriter(fileSystem.create(testOutputFile), this.charset);
            String line;
            while ((line = reader.readLine()) != null) {
                OutputStreamWriter target = router.nextGoesToTest() ? testWriter : trainingWriter;
                target.write(line);
                target.write('\n');
            }
        } finally {
            // Closed exactly once, after the whole file is copied; close() tolerates nulls left
            // behind when opening one of the resources failed.
            Closeables.close(reader, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }

    /** Counts the key/value records of a sequence file. */
    private static int countSequenceRecords(Path path, Configuration conf) throws IOException {
        int recordCount = 0;
        SequenceFileIterator<Writable, Writable> records = new SequenceFileIterator<Writable, Writable>(path, true, conf);
        try {
            while (records.hasNext()) {
                records.next();
                recordCount++;
            }
        } finally {
            Closeables.close(records, true);
        }
        return recordCount;
    }

    /** Returns {@code percent}% of {@code total}, rounded half up, without int overflow. */
    private static int percentOf(int total, int percent) {
        return (int) Math.round(total * (double) percent / 100.0d);
    }

    /**
     * Decides, row by row, whether the next input row belongs to the test or the training
     * output, and keeps the running totals for both.
     */
    private static final class RowRouter {
        /** 1-based positions selected for the test set, or null for a contiguous test block. */
        private final BitSet randomSelection;
        private final int testSplitStart;
        private final int testSplitSize;
        private int position;
        private int trainingCount;
        private int testCount;

        RowRouter(BitSet randomSelection, int testSplitStart, int testSplitSize) {
            this.randomSelection = randomSelection;
            this.testSplitStart = testSplitStart;
            this.testSplitSize = testSplitSize;
        }

        /** Advances to the next row and reports whether it goes to the test output. */
        boolean nextGoesToTest() {
            position++;
            boolean candidate = randomSelection != null
                ? randomSelection.get(position)
                : position > testSplitStart;
            // Once the test set is full, the remaining candidates fall back to training.
            if (candidate && testCount < testSplitSize) {
                testCount++;
                return true;
            }
            trainingCount++;
            return false;
        }
    }

    /** @return the number of rows held back as a contiguous test block, or -1 when unset */
    public int getTestSplitSize() {
        return this.testSplitSize;
    }

    /** @param i number of rows held back as a contiguous test block (>= 1), or -1 to unset */
    public void setTestSplitSize(int i) {
        this.testSplitSize = i;
    }

    /** @return the percentage of rows held back as a contiguous test block, or -1 when unset */
    public int getTestSplitPct() {
        return this.testSplitPct;
    }

    /** @param i percentage (0-100) of rows held back as a contiguous test block, or -1 to unset */
    public void setTestSplitPct(int i) {
        this.testSplitPct = i;
    }

    /** @param i percentage of the data to keep in map-reduce mode */
    public void setKeepPct(int i) {
        this.keepPct = i;
    }

    /** @param z true to delegate the split to the map-reduce job */
    public void setUseMapRed(boolean z) {
        this.useMapRed = z;
    }

    /** @param path output directory handed to the map-reduce job */
    public void setMapRedOutputDirectory(Path path) {
        this.mapRedOutputDirectory = path;
    }

    /** @return the start of the test block as a percentage of the input size */
    public int getSplitLocation() {
        return this.splitLocation;
    }

    /** @param i start of the test block as a percentage of the input size (0-100), or -1 */
    public void setSplitLocation(int i) {
        this.splitLocation = i;
    }

    /** @return the character encoding used for text files */
    public Charset getCharset() {
        return this.charset;
    }

    /** @param charset the character encoding used for text files */
    public void setCharset(Charset charset) {
        this.charset = charset;
    }

    /** @return the directory whose files are split */
    public Path getInputDirectory() {
        return this.inputDirectory;
    }

    /** @param path the directory whose files are split */
    public void setInputDirectory(Path path) {
        this.inputDirectory = path;
    }

    /** @return the directory receiving the training part of each file */
    public Path getTrainingOutputDirectory() {
        return this.trainingOutputDirectory;
    }

    /** @param path the directory receiving the training part of each file */
    public void setTrainingOutputDirectory(Path path) {
        this.trainingOutputDirectory = path;
    }

    /** @return the directory receiving the test part of each file */
    public Path getTestOutputDirectory() {
        return this.testOutputDirectory;
    }

    /** @param path the directory receiving the test part of each file */
    public void setTestOutputDirectory(Path path) {
        this.testOutputDirectory = path;
    }

    /** @return the listener notified after each file split, or null */
    public SplitCallback getCallback() {
        return this.callback;
    }

    /** @param splitCallback the listener notified after each file split; may be null */
    public void setCallback(SplitCallback splitCallback) {
        this.callback = splitCallback;
    }

    /** @return the number of rows randomly selected as test data, or -1 when unset */
    public int getTestRandomSelectionSize() {
        return this.testRandomSelectionSize;
    }

    /** @param i number of rows randomly selected as test data, or -1 to unset */
    public void setTestRandomSelectionSize(int i) {
        this.testRandomSelectionSize = i;
    }

    /** @return the percentage of rows randomly selected as test data, or -1 when unset */
    public int getTestRandomSelectionPct() {
        return this.testRandomSelectionPct;
    }

    /** @param i percentage (0-100) of rows randomly selected as test data, or -1 to unset */
    public void setTestRandomSelectionPct(int i) {
        this.testRandomSelectionPct = i;
    }

    /**
     * Checks that the current settings are usable: every value is within its range, exactly
     * one of the four test-selection settings is positive, and (outside map-reduce mode) both
     * output directories exist and are directories.
     *
     * @throws IllegalArgumentException if any check fails
     * @throws IOException              if the output directories cannot be inspected
     */
    public void validate() throws IOException {
        Preconditions.checkArgument(this.testSplitSize >= 1 || this.testSplitSize == -1, "Invalid testSplitSize: " + this.testSplitSize + ". Must be: testSplitSize >= 1 or testSplitSize = -1");
        Preconditions.checkArgument((this.splitLocation >= 0 && this.splitLocation <= 100) || this.splitLocation == -1, "Invalid splitLocation percentage: " + this.splitLocation + ". Must be: 0 <= splitLocation <= 100 or splitLocation = -1");
        Preconditions.checkArgument((this.testSplitPct >= 0 && this.testSplitPct <= 100) || this.testSplitPct == -1, "Invalid testSplitPct percentage: " + this.testSplitPct + ". Must be: 0 <= testSplitPct <= 100 or testSplitPct = -1");
        Preconditions.checkArgument((this.testRandomSelectionPct >= 0 && this.testRandomSelectionPct <= 100) || this.testRandomSelectionPct == -1, "Invalid testRandomSelectionPct percentage: " + this.testRandomSelectionPct + ". Must be: 0 <= testRandomSelectionPct <= 100 or testRandomSelectionPct = -1");
        Preconditions.checkArgument(this.trainingOutputDirectory != null || this.useMapRed, "No training output directory was specified");
        Preconditions.checkArgument(this.testOutputDirectory != null || this.useMapRed, "No test output directory was specified");

        int selectionsSet = 0;
        if (this.testSplitSize > 0) {
            selectionsSet++;
        }
        if (this.testSplitPct > 0) {
            selectionsSet++;
        }
        if (this.testRandomSelectionSize > 0) {
            selectionsSet++;
        }
        if (this.testRandomSelectionPct > 0) {
            selectionsSet++;
        }
        Preconditions.checkArgument(selectionsSet == 1, "Exactly one of testSplitSize, testSplitPct, testRandomSelectionSize, testRandomSelectionPct should be set");

        // The output directories are only used by the sequential implementation.
        if (this.useMapRed) {
            return;
        }
        FileSystem fileSystem = this.trainingOutputDirectory.getFileSystem(getConf());
        FileStatus trainingStatus = fileSystem.getFileStatus(this.trainingOutputDirectory);
        Preconditions.checkArgument(trainingStatus != null && trainingStatus.isDir(), "%s is not a directory", this.trainingOutputDirectory);
        FileStatus testStatus = fileSystem.getFileStatus(this.testOutputDirectory);
        Preconditions.checkArgument(testStatus != null && testStatus.isDir(), "%s is not a directory", this.testOutputDirectory);
    }

    /**
     * Counts the lines of a text file as delimited by {@link BufferedReader#readLine()}.
     *
     * @param fileSystem the file system holding the file
     * @param path       the file to read
     * @param charset    the character encoding of the file
     * @return the number of lines; 0 for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    public static int countLines(FileSystem fileSystem, Path path, Charset charset) throws IOException {
        int lineCount = 0;
        BufferedReader reader = new BufferedReader(new InputStreamReader(fileSystem.open(path), charset));
        try {
            while (reader.readLine() != null) {
                lineCount++;
            }
        } finally {
            // Close once, after the whole file has been read (also on failure and for empty files).
            Closeables.close(reader, true);
        }
        return lineCount;
    }
}
