package org.apache.mahout.classifier;

import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.io.Closeables;
import com.google.common.io.Files;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Random;
import org.apache.commons.io.Charsets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.util.Version;
import org.apache.mahout.common.RandomUtils;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;

/* loaded from: input_file:org/apache/mahout/classifier/NewsgroupHelper.class */
/**
 * Turns a newsgroup message file into a hashed feature vector.
 *
 * <p>Each instance owns its own random generator, analyzer and encoders. Tokens are counted per
 * document, and the per-document counts are folded into a sparse vector of {@link #FEATURES}
 * dimensions together with a constant bias term.
 */
public final class NewsgroupHelper {
    /**
     * Formats used to render the synthetic date token, from least to most specific: empty
     * string, month and year, full timestamp.
     *
     * <p>NOTE(review): {@link SimpleDateFormat} is not synchronized and these instances are
     * shared statically — confirm that callers never encode from several threads at once.
     */
    private static final SimpleDateFormat[] DATE_FORMATS = {
        new SimpleDateFormat("", Locale.ENGLISH),
        new SimpleDateFormat("MMM-yyyy", Locale.ENGLISH),
        new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ENGLISH)
    };

    /** Cardinality of the vectors produced by {@link #encodeFeatureVector}. */
    public static final int FEATURES = 10000;

    /** Base timestamp in seconds since the epoch (1997-01-15 00:01:00 GMT). */
    private static final long DATE_REFERENCE = 853286460;

    /** Thirty days, in seconds. */
    private static final long MONTH = 30L * 24 * 3600;

    /** Seven days, in seconds. */
    private static final long WEEK = 7L * 24 * 3600;

    private final Random rand = RandomUtils.getRandom();
    private final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
    private final FeatureVectorEncoder encoder = new StaticWordValueEncoder(DocMaker.BODY_FIELD);
    private final FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");

    /** Returns the encoder used for word features. */
    public FeatureVectorEncoder getEncoder() {
        return this.encoder;
    }

    /** Returns the encoder used for the constant bias feature. */
    public FeatureVectorEncoder getBias() {
        return this.bias;
    }

    /** Returns the random generator used to jitter the synthetic date. */
    public Random getRandom() {
        return this.rand;
    }

    /**
     * Reads one message file and encodes its word counts as a sparse vector.
     *
     * <p>A synthetic date is computed as {@code DATE_REFERENCE + i * MONTH} plus a random offset
     * of up to one week, formatted with {@code DATE_FORMATS[i2 % 3]}, and its tokens are counted
     * first. Header lines (everything up to the first empty line) are then scanned; the
     * {@code From:}, {@code Subject:}, {@code Keywords:} and {@code Summary:} headers and their
     * continuation lines (lines starting with a space) are counted only when {@code i2 < 6}.
     * The remainder of the file is counted only when {@code i2 < 3}.
     *
     * @param file     message file, read as UTF-8
     * @param i        number of months by which the synthetic date is shifted from the reference
     * @param i2       selector for the date format and for which parts of the message are
     *                 counted; presumably non-negative, since a negative value would produce a
     *                 negative array index — verify against callers
     * @param multiset running counts that every {@link #countWords} call adds to
     * @return a vector of cardinality {@link #FEATURES} holding the bias term and, for each
     *         distinct word, {@code log1p(count)}
     * @throws IOException if the file cannot be read or tokenized
     */
    public Vector encodeFeatureVector(File file, int i, int i2, Multiset<String> multiset) throws IOException {
        // Seconds are converted to milliseconds for java.util.Date. The random draw happens
        // before the file is opened, exactly as before, so the generator sequence is unchanged.
        long dateMillis = (long) (1000.0d * (DATE_REFERENCE + (i * MONTH) + (WEEK * this.rand.nextDouble())));
        Multiset<String> words = ConcurrentHashMultiset.create();

        try (BufferedReader reader = Files.newReader(file, Charsets.UTF_8)) {
            String line = reader.readLine();
            Reader dateText = new StringReader(DATE_FORMATS[i2 % 3].format(new Date(dateMillis)));
            countWords(this.analyzer, words, dateText, multiset);

            // Header section: ends at the first empty line or at end of file.
            while (line != null && !line.isEmpty()) {
                boolean countHeader = (line.startsWith("From:")
                        || line.startsWith("Subject:")
                        || line.startsWith("Keywords:")
                        || line.startsWith("Summary:")) && i2 < 6;
                do {
                    if (countHeader) {
                        countWords(this.analyzer, words, new StringReader(line), multiset);
                    }
                    line = reader.readLine();
                    // The null check matters: readLine() returns null at end of file, which
                    // happens when a message has headers but no blank separator line.
                } while (line != null && line.startsWith(" "));
            }

            if (i2 < 3) {
                // Tokenize whatever is left in the reader after the header section.
                countWords(this.analyzer, words, reader, multiset);
            }
        }

        Vector vector = new RandomAccessSparseVector(FEATURES);
        this.bias.addToVector("", 1.0d, vector);
        for (String word : words.elementSet()) {
            this.encoder.addToVector(word, Math.log1p(words.count(word)), vector);
        }
        return vector;
    }

    /**
     * Tokenizes {@code reader} with {@code analyzer} and adds every token to {@code collection}.
     *
     * <p>After tokenizing, the whole of {@code collection} is added to {@code multiset} — that
     * includes elements that were already in {@code collection} before this call, not only the
     * tokens just read. NOTE(review): this is the existing behaviour and is preserved; confirm
     * whether it is intended.
     *
     * <p>The token stream is always closed, even if tokenizing fails. An {@link IOException}
     * raised by the close itself is swallowed by {@code Closeables.close(stream, true)}.
     *
     * @param analyzer   analyzer that produces the token stream
     * @param collection receives one entry per token
     * @param reader     text to tokenize
     * @param multiset   receives all elements of {@code collection} once tokenizing is done
     * @throws IOException if the token stream fails while being reset, advanced or ended
     */
    public static void countWords(Analyzer analyzer, Collection<String> collection, Reader reader, Multiset<String> multiset) throws IOException {
        TokenStream tokenStream = analyzer.tokenStream("text", reader);
        try {
            // addAttribute returns the attribute instance, so it can be fetched once up front.
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                collection.add(term.toString());
            }
            multiset.addAll(collection);
            tokenStream.end();
        } finally {
            Closeables.close(tokenStream, true);
        }
    }
}
