package org.apache.mahout.clustering.streaming.tools;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.Closeables;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.commons.cli2.util.HelpFormatter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.shell.Test;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.mahout.clustering.ClusteringUtils;
import org.apache.mahout.clustering.iterator.ClusterWritable;
import org.apache.mahout.clustering.streaming.mapreduce.CentroidWritable;
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.distance.DistanceMeasure;
import org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterable;
import org.apache.mahout.math.Centroid;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.math.stats.OnlineSummarizer;

/* loaded from: input_file:org/apache/mahout/clustering/streaming/tools/ClusterQualitySummarizer.class */
/**
 * Command-line job that reads cluster centroids and data points from sequence files, prints
 * per-cluster distance summaries to standard output, writes the same summaries as CSV rows to an
 * output file, and finally prints the Dunn and Davies-Bouldin indices for one or two sets of
 * centroids.
 */
public class ClusterQualitySummarizer extends AbstractJob {
    /** Path of the CSV file the per-cluster summaries are written to ({@code --output}). */
    private String outputFile;
    /** Writer over {@link #outputFile}; opened and closed by {@link #run(String[])}. */
    private PrintWriter fileOut;
    /** Glob of sequence files holding the training vectors ({@code --input}). */
    private String trainFile;
    /** Optional glob of sequence files holding the test vectors ({@code --testInput}). */
    private String testFile;
    /** Glob of sequence files holding the first set of centroids ({@code --centroids}). */
    private String centroidFile;
    /** Optional glob of sequence files holding a second set of centroids ({@code --centroidsCompare}). */
    private String centroidCompareFile;
    /** Whether {@link #centroidFile} is read through the ClusterWritable reader. */
    private boolean mahoutKMeansFormat;
    /** Whether {@link #centroidCompareFile} is read through the ClusterWritable reader. */
    private boolean mahoutKMeansFormatCompare;
    /** Distance measure used for every summary and index computed by this tool. */
    private final DistanceMeasure distanceMeasure = new SquaredEuclideanDistanceMeasure();

    /**
     * Prints the given summaries to standard output and appends them to this job's CSV file.
     *
     * @param list one summarizer per cluster, indexed by cluster number
     * @param str  label written in the last CSV column of every row
     */
    public void printSummaries(List<OnlineSummarizer> list, String str) {
        printSummaries(list, str, this.fileOut);
    }

    /**
     * Prints one line per cluster to standard output and, when a writer is supplied, one CSV row
     * per cluster holding mean, standard deviation, the five quartiles, the count and the label.
     * Clusters with fewer than two data points only get a console notice and no CSV row.
     *
     * @param list        one summarizer per cluster, indexed by cluster number
     * @param str         label written in the last CSV column of every row
     * @param printWriter destination for the CSV rows; may be {@code null} to skip CSV output
     */
    public static void printSummaries(List<OnlineSummarizer> list, String str, PrintWriter printWriter) {
        double maxDistance = 0.0d;
        for (int cluster = 0; cluster < list.size(); cluster++) {
            OnlineSummarizer summarizer = list.get(cluster);
            if (summarizer.getCount() > 1) {
                maxDistance = Math.max(maxDistance, summarizer.getMax());
                System.out.printf("Average distance in cluster %d [%d]: %f\n",
                    cluster, summarizer.getCount(), summarizer.getMean());
                if (printWriter != null) {
                    // Locale.ROOT keeps '.' as the decimal separator; a locale that formats
                    // decimals with ',' would otherwise break the comma-delimited rows.
                    printWriter.printf(Locale.ROOT, "%d,%f,%f,%f,%f,%f,%f,%f,%d,%s\n",
                        cluster,
                        summarizer.getMean(),
                        summarizer.getSD(),
                        summarizer.getQuartile(0),
                        summarizer.getQuartile(1),
                        summarizer.getQuartile(2),
                        summarizer.getQuartile(3),
                        summarizer.getQuartile(4),
                        summarizer.getCount(),
                        str);
                }
            } else {
                System.out.printf("Cluster %d is has %d data point. Need atleast 2 data points in a cluster for OnlineSummarizer.\n",
                    cluster, summarizer.getCount());
            }
        }
        System.out.printf("Num clusters: %d; maxDistance: %f\n", list.size(), maxDistance);
    }

    /**
     * Parses the command line, writes the per-cluster CSV summaries for the train (and optional
     * test) vectors, then prints the Dunn and Davies-Bouldin indices for the first and, if
     * given, the second set of centroids.
     *
     * @param strArr command-line arguments
     * @return 0 on success; -1 when the arguments cannot be parsed or an {@link IOException}
     *         is caught while producing the output
     * @throws IOException if closing the output file fails
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws IOException {
        if (!parseArgs(strArr)) {
            return -1;
        }
        Configuration conf = new Configuration();
        try {
            this.fileOut = new PrintWriter(new FileOutputStream(this.outputFile));
            this.fileOut.printf("cluster,distance.mean,distance.sd,distance.q0,distance.q1,distance.q2,distance.q3,distance.q4,count,is.train\n");

            // Centroids: the first set is mandatory, the comparison set is optional.
            List<Centroid> centroids = readCentroids(this.centroidFile, this.mahoutKMeansFormat, conf);
            List<Centroid> centroidsCompare = null;
            if (this.centroidCompareFile != null) {
                centroidsCompare = readCentroids(this.centroidCompareFile, this.mahoutKMeansFormatCompare, conf);
            }

            // Training vectors are summarized on their own and labelled "train" in the CSV.
            Iterable<Vector> trainDatapoints = readVectors(this.trainFile, conf);
            Iterable<Vector> datapoints = trainDatapoints;
            printSummaries(
                ClusteringUtils.summarizeClusterDistances(trainDatapoints, centroids, this.distanceMeasure),
                "train");

            // Test vectors, when present, get their own "test" rows and join the combined set.
            if (this.testFile != null) {
                Iterable<Vector> testDatapoints = readVectors(this.testFile, conf);
                printSummaries(
                    ClusteringUtils.summarizeClusterDistances(testDatapoints, centroids, this.distanceMeasure),
                    "test");
                datapoints = Iterables.concat(trainDatapoints, testDatapoints);
            }

            // The quality indices are computed over all data points (train plus test).
            List<OnlineSummarizer> summaries =
                ClusteringUtils.summarizeClusterDistances(datapoints, centroids, this.distanceMeasure);
            List<OnlineSummarizer> compareSummaries = null;
            if (centroidsCompare != null) {
                compareSummaries =
                    ClusteringUtils.summarizeClusterDistances(datapoints, centroidsCompare, this.distanceMeasure);
            }

            System.out.printf("[Dunn Index] First: %f",
                ClusteringUtils.dunnIndex(centroids, this.distanceMeasure, summaries));
            if (compareSummaries != null) {
                System.out.printf(" Second: %f\n",
                    ClusteringUtils.dunnIndex(centroidsCompare, this.distanceMeasure, compareSummaries));
            } else {
                System.out.printf("\n");
            }

            System.out.printf("[Davies-Bouldin Index] First: %f",
                ClusteringUtils.daviesBouldinIndex(centroids, this.distanceMeasure, summaries));
            if (compareSummaries != null) {
                System.out.printf(" Second: %f\n",
                    ClusteringUtils.daviesBouldinIndex(centroidsCompare, this.distanceMeasure, compareSummaries));
            } else {
                System.out.printf("\n");
            }
            return 0;
        } catch (IOException e) {
            System.out.println(e.getMessage());
            // Report the failure through the return code instead of claiming success.
            return -1;
        } finally {
            // Closeables.close tolerates a null writer (e.g. when opening the file failed).
            Closeables.close(this.fileOut, false);
        }
    }

    /**
     * Loads all centroids matched by a glob into a list.
     *
     * @param path                  glob of sequence files to read
     * @param clusterWritableFormat {@code true} to read the values through the ClusterWritable
     *                              reader, {@code false} to use the CentroidWritable reader
     * @param conf                  Hadoop configuration used to open the files
     * @return the centroids in iteration order
     */
    private static List<Centroid> readCentroids(String path, boolean clusterWritableFormat, Configuration conf) {
        Path centroidPath = new Path(path);
        if (clusterWritableFormat) {
            return Lists.newArrayList(IOUtils.getCentroidsFromClusterWritableIterable(
                new SequenceFileDirValueIterable<ClusterWritable>(centroidPath, PathType.GLOB, conf)));
        }
        return Lists.newArrayList(IOUtils.getCentroidsFromCentroidWritableIterable(
            new SequenceFileDirValueIterable<CentroidWritable>(centroidPath, PathType.GLOB, conf)));
    }

    /**
     * Builds an iterable over the vectors stored in the sequence files matched by a glob.
     *
     * @param path glob of sequence files to read
     * @param conf Hadoop configuration used to open the files
     * @return an iterable over the stored vectors
     */
    private static Iterable<Vector> readVectors(String path, Configuration conf) {
        return IOUtils.getVectorsFromVectorWritableIterable(
            new SequenceFileDirValueIterable<VectorWritable>(new Path(path), PathType.GLOB, conf));
    }

    /**
     * Declares the command-line options, parses the arguments and stores the results in this
     * job's fields.
     *
     * @param strArr command-line arguments
     * @return {@code false} when the parser returns no command line (the caller then aborts),
     *         {@code true} otherwise
     */
    private boolean parseArgs(String[] strArr) {
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();

        DefaultOption help = optionBuilder.withLongName("help")
            .withDescription("print this list")
            .create();
        DefaultOption inputFileOption = optionBuilder.withLongName("input")
            .withShortName("i")
            .withRequired(true)
            .withArgument(argumentBuilder.withName("input").withMaximum(1).create())
            .withDescription("where to get seq files with the vectors (training set)")
            .create();
        DefaultOption testInputFileOption = optionBuilder.withLongName("testInput")
            .withShortName("itest")
            .withArgument(argumentBuilder.withName("testInput").withMaximum(1).create())
            .withDescription("where to get seq files with the vectors (test set)")
            .create();
        DefaultOption centroidsFileOption = optionBuilder.withLongName("centroids")
            .withShortName("c")
            .withRequired(true)
            .withArgument(argumentBuilder.withName("centroids").withMaximum(1).create())
            .withDescription("where to get seq files with the centroids (from Mahout KMeans or StreamingKMeansDriver)")
            .create();
        DefaultOption centroidsCompareFileOption = optionBuilder.withLongName("centroidsCompare")
            .withShortName("cc")
            .withRequired(false)
            .withArgument(argumentBuilder.withName("centroidsCompare").withMaximum(1).create())
            .withDescription("where to get seq files with the second set of centroids (from Mahout KMeans or StreamingKMeansDriver)")
            .create();
        DefaultOption outputFileOption = optionBuilder.withLongName("output")
            .withShortName("o")
            .withRequired(true)
            .withArgument(argumentBuilder.withName("output").withMaximum(1).create())
            .withDescription("where to dump the CSV file with the results")
            .create();
        // NOTE(review): only the presence of the two format options below is read; the
        // "numpoints" argument name looks copied from another tool - confirm before renaming.
        DefaultOption mahoutKMeansFormatOption = optionBuilder.withLongName("mahoutkmeansformat")
            .withShortName("mkm")
            .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
            .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
            .create();
        DefaultOption mahoutKMeansCompareFormatOption = optionBuilder.withLongName("mahoutkmeansformatCompare")
            .withShortName("mkmc")
            .withDescription("if set, read files as (IntWritable, ClusterWritable) pairs")
            .withArgument(argumentBuilder.withName("numpoints").withMaximum(1).create())
            .create();

        Group options = new GroupBuilder()
            .withOption(help)
            .withOption(inputFileOption)
            .withOption(testInputFileOption)
            .withOption(outputFileOption)
            .withOption(centroidsFileOption)
            .withOption(centroidsCompareFileOption)
            .withOption(mahoutKMeansFormatOption)
            .withOption(mahoutKMeansCompareFormatOption)
            .create();

        Parser parser = new Parser();
        parser.setHelpOption(help);
        parser.setHelpTrigger("--help");
        parser.setGroup(options);
        parser.setHelpFormatter(new HelpFormatter(" ", "", " ", 150));

        CommandLine cmdLine = parser.parseAndHelp(strArr);
        if (cmdLine == null) {
            return false;
        }

        this.trainFile = (String) cmdLine.getValue(inputFileOption);
        if (cmdLine.hasOption(testInputFileOption)) {
            this.testFile = (String) cmdLine.getValue(testInputFileOption);
        }
        this.centroidFile = (String) cmdLine.getValue(centroidsFileOption);
        if (cmdLine.hasOption(centroidsCompareFileOption)) {
            this.centroidCompareFile = (String) cmdLine.getValue(centroidsCompareFileOption);
        }
        this.outputFile = (String) cmdLine.getValue(outputFileOption);
        if (cmdLine.hasOption(mahoutKMeansFormatOption)) {
            this.mahoutKMeansFormat = true;
        }
        if (cmdLine.hasOption(mahoutKMeansCompareFormatOption)) {
            this.mahoutKMeansFormatCompare = true;
        }
        return true;
    }

    /**
     * Command-line entry point; runs the summarizer with the given arguments.
     *
     * @param strArr command-line arguments
     * @throws IOException if {@link #run(String[])} fails to close the output file
     */
    public static void main(String[] strArr) throws IOException {
        new ClusterQualitySummarizer().run(strArr);
    }
}
