/*
 * Decompiled with CFR 0.152.
 */
package org.apache.mahout.utils;

import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.io.Closeables;
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.BitSet;
import java.util.Random;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.HadoopUtil;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterator;
import org.apache.mahout.math.jet.random.sampling.RandomSampler;
import org.apache.mahout.utils.SplitInputJob;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SplitInput
extends AbstractJob {
    private static final Logger log = LoggerFactory.getLogger(SplitInput.class);
    private int testSplitSize = -1;
    private int testSplitPct = -1;
    private int splitLocation = 100;
    private int testRandomSelectionSize = -1;
    private int testRandomSelectionPct = -1;
    private int keepPct = 100;
    private Charset charset = Charsets.UTF_8;
    private boolean useSequence;
    private boolean useMapRed;
    private Path inputDirectory;
    private Path trainingOutputDirectory;
    private Path testOutputDirectory;
    private Path mapRedOutputDirectory;
    private SplitCallback callback;

    /**
     * Tool entry point: parses the command line and, if parsing succeeds, splits the
     * configured input directory.
     *
     * @param args command-line arguments
     * @return always {@code 0}, including when the arguments could not be parsed
     * @throws Exception if option parsing, validation or the split itself fails
     */
    public int run(String[] args) throws Exception {
        if (!parseArgs(args)) {
            // Nothing to do: parseArgs already reported the problem (or printed help).
            return 0;
        }
        splitDirectory();
        return 0;
    }

    /**
     * Command-line launcher: runs a new {@code SplitInput} through {@link ToolRunner} with a
     * fresh {@link Configuration}. The value returned by the tool is discarded.
     *
     * @param args command-line arguments, forwarded unchanged
     * @throws Exception if the tool fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        SplitInput job = new SplitInput();
        ToolRunner.run(conf, job, args);
    }

    /**
     * Registers the command-line options, parses {@code args} and copies the parsed values
     * into this job's fields.
     *
     * <p>In map-reduce mode {@code randomSelectionPct} and {@code mapRedOutputDir} are
     * mandatory. Otherwise {@code trainingOutput} and {@code testOutput} are mandatory, at
     * least one split-size option must be present, and both output directories are created
     * (after being deleted first when {@code overwrite} is given).
     *
     * @param args command-line arguments
     * @return {@code true} when the job is fully configured; {@code false} when
     *         {@code parseArguments} returned {@code null} or an {@link OptionException} was
     *         raised (in which case the error is logged and the help text printed)
     * @throws Exception if a numeric option cannot be parsed, a file-system call fails, or
     *         {@link #validate()} rejects the resulting configuration
     */
    private boolean parseArgs(String[] args) throws Exception {
        addInputOption();
        addOption("trainingOutput", "tr", "The training data output directory", false);
        addOption("testOutput", "te", "The test data output directory", false);
        addOption("testSplitSize", "ss", "The number of documents held back as test data for each category", false);
        addOption("testSplitPct", "sp", "The % of documents held back as test data for each category", false);
        addOption("splitLocation", "sl", "Location for start of test data expressed as a percentage of the input file size (0=start, 50=middle, 100=end", false);
        addOption("randomSelectionSize", "rs", "The number of items to be randomly selected as test data ", false);
        addOption("randomSelectionPct", "rp", "Percentage of items to be randomly selected as test data when using mapreduce mode", false);
        addOption("charset", "c", "The name of the character encoding of the input files (not needed if using SequenceFiles)", false);
        addOption(buildOption("sequenceFiles", "seq", "Set if the input files are sequence files.  Default is false", false, false, "false"));
        addOption(DefaultOptionCreator.methodOption().create());
        addOption(DefaultOptionCreator.overwriteOption().create());
        addOption("keepPct", "k", "The percentage of total data to keep in map-reduce mode, the rest will be ignored.  Default is 100%", false);
        addOption("mapRedOutputDir", "mro", "Output directory for map reduce jobs", false);

        if (parseArguments(args) == null) {
            return false;
        }

        try {
            inputDirectory = getInputPath();
            useMapRed = getOption("method").equalsIgnoreCase("mapreduce");

            if (useMapRed) {
                if (!hasOption("randomSelectionPct")) {
                    throw new OptionException(getCLIOption("randomSelectionPct"), "must set randomSelectionPct when mapRed option is used");
                }
                if (!hasOption("mapRedOutputDir")) {
                    throw new OptionException(getCLIOption("mapRedOutputDir"), "mapRedOutputDir must be set when mapRed option is used");
                }
                mapRedOutputDirectory = new Path(getOption("mapRedOutputDir"));
                if (hasOption("keepPct")) {
                    keepPct = Integer.parseInt(getOption("keepPct"));
                }
                if (hasOption("overwrite")) {
                    HadoopUtil.delete(getConf(), mapRedOutputDirectory);
                }
            } else {
                if (!hasOption("trainingOutput") || !hasOption("testOutput")) {
                    throw new OptionException(getCLIOption("trainingOutput"), "trainingOutput and testOutput must be set if mapRed option is not used");
                }
                if (!(hasOption("testSplitSize") || hasOption("testSplitPct") || hasOption("randomSelectionPct") || hasOption("randomSelectionSize"))) {
                    throw new OptionException(getCLIOption("testSplitSize"), "must set one of test split size/percentage or randomSelectionSize/percentage");
                }
                trainingOutputDirectory = new Path(getOption("trainingOutput"));
                testOutputDirectory = new Path(getOption("testOutput"));
                FileSystem trainingFs = trainingOutputDirectory.getFileSystem(getConf());
                if (hasOption("overwrite")) {
                    HadoopUtil.delete(trainingFs.getConf(), trainingOutputDirectory);
                    HadoopUtil.delete(trainingFs.getConf(), testOutputDirectory);
                }
                trainingFs.mkdirs(trainingOutputDirectory);
                // Resolve the test directory's own file system: the two outputs may live on
                // different file systems, and a FileSystem rejects paths that are not its own.
                FileSystem testFs = testOutputDirectory.getFileSystem(getConf());
                testFs.mkdirs(testOutputDirectory);
            }

            if (hasOption("charset")) {
                charset = Charset.forName(getOption("charset"));
            }
            if (hasOption("testSplitSize") && hasOption("testSplitPct")) {
                throw new OptionException(getCLIOption("testSplitPct"), "must have either split size or split percentage option, not BOTH");
            }
            if (hasOption("testSplitSize")) {
                setTestSplitSize(Integer.parseInt(getOption("testSplitSize")));
            }
            if (hasOption("testSplitPct")) {
                setTestSplitPct(Integer.parseInt(getOption("testSplitPct")));
            }
            if (hasOption("splitLocation")) {
                setSplitLocation(Integer.parseInt(getOption("splitLocation")));
            }
            if (hasOption("randomSelectionSize")) {
                setTestRandomSelectionSize(Integer.parseInt(getOption("randomSelectionSize")));
            }
            if (hasOption("randomSelectionPct")) {
                setTestRandomSelectionPct(Integer.parseInt(getOption("randomSelectionPct")));
            }
            useSequence = hasOption("sequenceFiles");
        } catch (OptionException e) {
            log.error("Command-line option Exception", e);
            CommandLineUtil.printHelp(getGroup());
            return false;
        }

        validate();
        return true;
    }

    /**
     * Splits the input directory currently configured on this job.
     *
     * @throws IOException if a file-system operation fails
     * @throws ClassNotFoundException propagated from the underlying split
     * @throws InterruptedException propagated from the underlying split
     */
    public void splitDirectory() throws IOException, ClassNotFoundException, InterruptedException {
        splitDirectory(inputDirectory);
    }

    /**
     * Splits the given directory using this job's own configuration.
     *
     * @param inputDir directory whose files are to be split
     * @throws IOException if a file-system operation fails
     * @throws ClassNotFoundException propagated from the underlying split
     * @throws InterruptedException propagated from the underlying split
     */
    public void splitDirectory(Path inputDir) throws IOException, ClassNotFoundException, InterruptedException {
        splitDirectory(getConf(), inputDir);
    }

    /**
     * Splits every file directly inside {@code inputDir}.
     *
     * <p>In map-reduce mode the work is handed to {@link SplitInputJob}. Otherwise each entry
     * accepted by {@code PathFilters.logsCRCFilter()} that is not itself a directory is passed
     * to {@link #splitFile(Path)}; sub-directories are skipped, not recursed into.
     *
     * @param conf configuration used to resolve the file system of {@code inputDir} and, in
     *        map-reduce mode, passed on to the job
     * @param inputDir directory whose files are to be split
     * @throws IOException if {@code inputDir} does not exist, is not a directory, or a
     *         file-system operation fails
     * @throws ClassNotFoundException propagated from the map-reduce job
     * @throws InterruptedException propagated from the map-reduce job
     */
    public void splitDirectory(Configuration conf, Path inputDir) throws IOException, ClassNotFoundException, InterruptedException {
        FileSystem fs = inputDir.getFileSystem(conf);

        // Look the status up once and reuse it for both checks.
        FileStatus inputStatus = fs.getFileStatus(inputDir);
        if (inputStatus == null) {
            throw new IOException(inputDir + " does not exist");
        }
        if (!inputStatus.isDir()) {
            throw new IOException(inputDir + " is not a directory");
        }

        if (useMapRed) {
            SplitInputJob.run(conf, inputDir, mapRedOutputDirectory, keepPct, testRandomSelectionPct);
            return;
        }

        for (FileStatus inputFile : fs.listStatus(inputDir, PathFilters.logsCRCFilter())) {
            if (!inputFile.isDir()) {
                splitFile(inputFile.getPath());
            }
        }
    }

    /**
     * Splits a single input file into a training file and a test file. Both outputs carry the
     * input file's name and are written to the training and test output directories.
     *
     * <p>The input is read as text (one record per line) or, when sequence-file mode is on,
     * as a sequence file (one record per key/value pair). Records are numbered from 1.
     * With random selection (by percentage or by size) the test records are the ones whose
     * position was drawn by the sampler. Otherwise the test set is a contiguous run starting
     * after {@code testSplitStart}, which is derived from the split location. In both modes
     * at most {@code testSplitSize} records go to the test output; all others go to training.
     *
     * <p>When a callback is registered it is invoked once the split is complete.
     *
     * @param inputFile the file to split
     * @throws IOException if {@code inputFile} does not exist, is a directory, or an I/O
     *         operation fails
     * @throws IllegalArgumentException if {@link #validate()} rejects the configuration or
     *         the requested test split is larger than the input
     */
    public void splitFile(Path inputFile) throws IOException {
        Configuration conf = getConf();
        FileSystem fs = inputFile.getFileSystem(conf);
        FileStatus inputStatus = fs.getFileStatus(inputFile);
        if (inputStatus == null) {
            throw new IOException(inputFile + " does not exist");
        }
        if (inputStatus.isDir()) {
            throw new IOException(inputFile + " is a directory");
        }
        validate();

        Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
        Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

        // Sequence files are sized in records; counting text lines in a binary file would
        // yield a number unrelated to what the split loop below iterates over.
        int lineCount = useSequence
            ? countRecords(inputFile, fs.getConf())
            : countLines(fs, inputFile, charset);
        log.info("{} has {} lines", inputFile.getName(), lineCount);

        int testSplitStart = 0;
        int testSplitSize = this.testSplitSize;
        BitSet randomSel = null;

        if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
            testSplitSize = this.testRandomSelectionSize;
            if (testRandomSelectionPct > 0) {
                testSplitSize = percentOf(lineCount, testRandomSelectionPct);
            }
            log.info("{} test split size is {} based on random selection percentage {}", new Object[]{inputFile.getName(), testSplitSize, testRandomSelectionPct});
            long[] ridx = new long[testSplitSize];
            // NOTE(review): the sampler is given lineCount - 1 as its population size and the
            // drawn values are shifted by one below; this looks like it can never pick the
            // last record - confirm against RandomSampler.sample before changing.
            RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0L, ridx, 0, RandomUtils.getRandom());
            randomSel = new BitSet(lineCount);
            for (long idx : ridx) {
                randomSel.set((int) idx + 1);
            }
        } else {
            if (testSplitPct > 0) {
                testSplitSize = percentOf(lineCount, testSplitPct);
                log.info("{} test split size is {} based on percentage {}", new Object[]{inputFile.getName(), testSplitSize, testSplitPct});
            } else {
                log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
            }
            if (splitLocation > 0) {
                testSplitStart = percentOf(lineCount, splitLocation);
                if (lineCount - testSplitStart < testSplitSize) {
                    // Not enough records after the requested location: pull the start back.
                    testSplitStart = lineCount - testSplitSize;
                }
                log.info("{} test split start is {} based on split location {}", new Object[]{inputFile.getName(), testSplitStart, splitLocation});
            }
            if (testSplitStart < 0) {
                throw new IllegalArgumentException("test split size for " + inputFile + " is too large, it would produce an " + "empty training set from the initial set of " + lineCount + " examples");
            }
            if (lineCount - testSplitSize < testSplitSize) {
                log.warn("Test set size for {} may be too large, {} is larger than the number of lines remaining in the training set: {}", new Object[]{inputFile, testSplitSize, lineCount - testSplitSize});
            }
        }

        int trainCount = 0;
        int testCount = 0;

        if (!useSequence) {
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
            Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
            Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);
            try {
                String line;
                int pos = 0;
                while ((line = reader.readLine()) != null) {
                    pos++;
                    // Use the sampled positions whenever random selection is active (by
                    // percentage or by size); otherwise take the contiguous tail.
                    boolean selected = randomSel != null ? randomSel.get(pos) : pos > testSplitStart;
                    Writer writer;
                    if (selected && testCount < testSplitSize) {
                        testCount++;
                        writer = testWriter;
                    } else {
                        trainCount++;
                        writer = trainingWriter;
                    }
                    writer.write(line);
                    writer.write('\n');
                }
            } finally {
                Closeables.close(reader, true);
                Closeables.close(trainingWriter, false);
                Closeables.close(testWriter, false);
            }
        } else {
            SequenceFileIterator<Writable, Writable> iterator =
                new SequenceFileIterator<Writable, Writable>(inputFile, false, fs.getConf());
            SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile, iterator.getKeyClass(), iterator.getValueClass());
            SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile, iterator.getKeyClass(), iterator.getValueClass());
            try {
                int pos = 0;
                while (iterator.hasNext()) {
                    pos++;
                    boolean selected = randomSel != null ? randomSel.get(pos) : pos > testSplitStart;
                    SequenceFile.Writer writer;
                    if (selected && testCount < testSplitSize) {
                        testCount++;
                        writer = testWriter;
                    } else {
                        trainCount++;
                        writer = trainingWriter;
                    }
                    Pair<Writable, Writable> pair = iterator.next();
                    writer.append(pair.getFirst(), pair.getSecond());
                }
            } finally {
                Closeables.close(iterator, true);
                Closeables.close(trainingWriter, false);
                Closeables.close(testWriter, false);
            }
        }

        log.info("file: {}, input: {} train: {}, test: {} starting at {}", new Object[]{inputFile.getName(), lineCount, trainCount, testCount, testSplitStart});
        if (callback != null) {
            callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
        }
    }

    /**
     * Returns {@code pct} percent of {@code total}, rounded to the nearest integer. The
     * product is formed in {@code long} arithmetic so that large totals do not overflow.
     */
    private static int percentOf(int total, int pct) {
        return Math.round((float) ((long) total * pct) / 100.0f);
    }

    /** Counts the key/value records of a sequence file by iterating over it once. */
    private static int countRecords(Path inputFile, Configuration conf) throws IOException {
        SequenceFileIterator<Writable, Writable> records =
            new SequenceFileIterator<Writable, Writable>(inputFile, true, conf);
        int count = 0;
        try {
            while (records.hasNext()) {
                records.next();
                count++;
            }
        } finally {
            Closeables.close(records, true);
        }
        return count;
    }

    /** @return the number of records held back as test data, or {@code -1} if unset */
    public int getTestSplitSize() {
        return testSplitSize;
    }

    /** @param testSplitSize the number of records to hold back as test data */
    public void setTestSplitSize(int testSplitSize) {
        this.testSplitSize = testSplitSize;
    }

    /** @return the percentage of records held back as test data, or {@code -1} if unset */
    public int getTestSplitPct() {
        return testSplitPct;
    }

    /** @param testSplitPct the percentage of records to hold back as test data */
    public void setTestSplitPct(int testSplitPct) {
        this.testSplitPct = testSplitPct;
    }

    /** @param keepPct the keep-percentage handed to the map-reduce split job */
    public void setKeepPct(int keepPct) {
        this.keepPct = keepPct;
    }

    /** @param useMapRed {@code true} to split with the map-reduce job instead of locally */
    public void setUseMapRed(boolean useMapRed) {
        this.useMapRed = useMapRed;
    }

    /** @param mapRedOutputDirectory the output directory handed to the map-reduce split job */
    public void setMapRedOutputDirectory(Path mapRedOutputDirectory) {
        this.mapRedOutputDirectory = mapRedOutputDirectory;
    }

    /** @return where the test split starts, as a percentage of the input size */
    public int getSplitLocation() {
        return splitLocation;
    }

    /** @param splitLocation where the test split starts, as a percentage of the input size */
    public void setSplitLocation(int splitLocation) {
        this.splitLocation = splitLocation;
    }

    /** @return the character set used to read and write text files */
    public Charset getCharset() {
        return charset;
    }

    /** @param charset the character set used to read and write text files */
    public void setCharset(Charset charset) {
        this.charset = charset;
    }

    /** @return the directory whose files are split */
    public Path getInputDirectory() {
        return inputDirectory;
    }

    /** @param inputDir the directory whose files are split */
    public void setInputDirectory(Path inputDir) {
        inputDirectory = inputDir;
    }

    /** @return the directory that receives the training files */
    public Path getTrainingOutputDirectory() {
        return trainingOutputDirectory;
    }

    /** @param trainingOutputDir the directory that receives the training files */
    public void setTrainingOutputDirectory(Path trainingOutputDir) {
        trainingOutputDirectory = trainingOutputDir;
    }

    /** @return the directory that receives the test files */
    public Path getTestOutputDirectory() {
        return testOutputDirectory;
    }

    /** @param testOutputDir the directory that receives the test files */
    public void setTestOutputDirectory(Path testOutputDir) {
        testOutputDirectory = testOutputDir;
    }

    /** @return the callback notified after each file is split, or {@code null} if none */
    public SplitCallback getCallback() {
        return callback;
    }

    /** @param callback the callback to notify after each file is split; may be {@code null} */
    public void setCallback(SplitCallback callback) {
        this.callback = callback;
    }

    /** @return the number of records randomly selected as test data, or {@code -1} if unset */
    public int getTestRandomSelectionSize() {
        return testRandomSelectionSize;
    }

    /** @param testRandomSelectionSize the number of records to select randomly as test data */
    public void setTestRandomSelectionSize(int testRandomSelectionSize) {
        this.testRandomSelectionSize = testRandomSelectionSize;
    }

    /** @return the percentage of records randomly selected as test data, or {@code -1} if unset */
    public int getTestRandomSelectionPct() {
        return testRandomSelectionPct;
    }

    /** @param randomSelectionPct the percentage of records to select randomly as test data */
    public void setTestRandomSelectionPct(int randomSelectionPct) {
        testRandomSelectionPct = randomSelectionPct;
    }

    /**
     * Checks that the current configuration is usable.
     *
     * <ul>
     *   <li>{@code testSplitSize} is {@code -1} or at least 1;</li>
     *   <li>{@code splitLocation}, {@code testSplitPct} and {@code testRandomSelectionPct} are
     *       each {@code -1} or within 0..100;</li>
     *   <li>outside map-reduce mode, both output directories are set;</li>
     *   <li>exactly one of the four split-size settings is positive;</li>
     *   <li>outside map-reduce mode, both output directories exist and are directories.</li>
     * </ul>
     *
     * @throws IllegalArgumentException if any of the checks fails
     * @throws IOException if the status of an output directory cannot be obtained
     */
    public void validate() throws IOException {
        Preconditions.checkArgument(testSplitSize >= 1 || testSplitSize == -1, "Invalid testSplitSize", testSplitSize);
        Preconditions.checkArgument(splitLocation >= 0 && splitLocation <= 100 || splitLocation == -1, "Invalid splitLocation percentage", splitLocation);
        Preconditions.checkArgument(testSplitPct >= 0 && testSplitPct <= 100 || testSplitPct == -1, "Invalid testSplitPct percentage", testSplitPct);
        Preconditions.checkArgument(testRandomSelectionPct >= 0 && testRandomSelectionPct <= 100 || testRandomSelectionPct == -1, "Invalid testRandomSelectionPct percentage", testRandomSelectionPct);
        Preconditions.checkArgument(trainingOutputDirectory != null || useMapRed, "No training output directory was specified");
        Preconditions.checkArgument(testOutputDirectory != null || useMapRed, "No test output directory was specified");

        // The four ways of sizing the test set are mutually exclusive.
        int count = 0;
        if (testSplitSize > 0) {
            count++;
        }
        if (testSplitPct > 0) {
            count++;
        }
        if (testRandomSelectionSize > 0) {
            count++;
        }
        if (testRandomSelectionPct > 0) {
            count++;
        }
        Preconditions.checkArgument(count == 1, "Exactly one of testSplitSize, testSplitPct, testRandomSelectionSize, testRandomSelectionPct should be set");

        if (!useMapRed) {
            Configuration conf = getConf();

            FileSystem trainingFs = trainingOutputDirectory.getFileSystem(conf);
            FileStatus trainingOutputDirStatus = trainingFs.getFileStatus(trainingOutputDirectory);
            Preconditions.checkArgument(trainingOutputDirStatus != null && trainingOutputDirStatus.isDir(), "%s is not a directory", trainingOutputDirectory);

            // Each directory is checked on its own file system; the two need not share one.
            FileSystem testFs = testOutputDirectory.getFileSystem(conf);
            FileStatus testOutputDirStatus = testFs.getFileStatus(testOutputDirectory);
            Preconditions.checkArgument(testOutputDirStatus != null && testOutputDirStatus.isDir(), "%s is not a directory", testOutputDirectory);
        }
    }

    /**
     * Counts the lines of a text file.
     *
     * @param fs file system used to open {@code inputFile}
     * @param inputFile the file to read
     * @param charset character set used to decode the file
     * @return the number of lines returned by {@link BufferedReader#readLine()}
     * @throws IOException if the file cannot be opened or read
     */
    public static int countLines(FileSystem fs, Path inputFile, Charset charset) throws IOException {
        BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
        int total = 0;
        try {
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                total++;
            }
        } finally {
            // Errors while closing the reader are swallowed.
            Closeables.close(in, true);
        }
        return total;
    }

    public static interface SplitCallback {
        public void splitComplete(Path var1, int var2, int var3, int var4, int var5);
    }
}

