/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.tools.arc;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang3.time.StopWatch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.tools.arc.ArcInputFormat;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Command-line tool that runs a MapReduce job over a directory of ARC files
 * and writes the result as a new segment below a given output directory.
 *
 * <p>The job reads its input through {@link ArcInputFormat}, maps every record
 * with {@link ArcSegmentCreatorMapper} and writes through
 * {@link FetcherOutputFormat}.
 */
public class ArcSegmentCreator extends Configured implements Tool {

    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Content metadata key under which the mapper stores the third header field of a record. */
    public static final String URL_VERSION = "arc.url.version";

    /**
     * Key used both as job configuration property (written by
     * {@link #createSegments}) and as metadata entry (written by the mapper)
     * to carry the segment name.
     */
    private static final String SEGMENT_NAME_KEY = "nutch.segment.name";

    /**
     * Formatter for segment names. {@link SimpleDateFormat} is not thread-safe;
     * it is only touched from the synchronized {@link #generateSegmentName()}.
     */
    private static final SimpleDateFormat SEGMENT_NAME_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss");

    /** Creates an instance without configuration; callers are expected to set one before running. */
    public ArcSegmentCreator() {
    }

    /**
     * Creates an instance bound to the given configuration.
     *
     * @param conf configuration handed to {@link #setConf(Configuration)}
     */
    public ArcSegmentCreator(Configuration conf) {
        this.setConf(conf);
    }

    /**
     * Generates a segment name from the current time, formatted as
     * {@code yyyyMMddHHmmss}.
     *
     * <p>The method sleeps for one second before formatting; presumably this
     * keeps consecutive calls from producing the same second-resolution name.
     * If the sleep is interrupted the interrupt status is restored so callers
     * can still observe it, and the name is generated immediately.
     *
     * @return the generated segment name
     */
    public static synchronized String generateSegmentName() {
        try {
            Thread.sleep(1000L);
        } catch (InterruptedException e) {
            // Do not swallow the interruption: re-assert the flag for the caller.
            Thread.currentThread().interrupt();
        }
        return SEGMENT_NAME_FORMAT.format(new Date());
    }

    /** No-op; this tool holds no resources that need releasing. */
    public void close() {
    }

    /**
     * Logs a failed conversion of a single record at INFO level.
     *
     * @param url the URL of the record that failed
     * @param t the failure cause; its stack trace is included in the message
     */
    private static void logError(Text url, Throwable t) {
        LOG.info("Conversion of {} failed with: {}", url, StringUtils.stringifyException(t));
    }

    /**
     * Configures and runs the conversion job, blocking until it completes.
     *
     * @param arcFiles input path containing the ARC files
     * @param segmentsOutDir directory below which the new segment directory
     *        (named by {@link #generateSegmentName()}) is created
     * @throws IOException if the job fails with an I/O error
     * @throws InterruptedException if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be resolved
     * @throws RuntimeException if the job completes unsuccessfully
     */
    public void createSegments(Path arcFiles, Path segmentsOutDir)
            throws IOException, InterruptedException, ClassNotFoundException {
        StopWatch stopWatch = new StopWatch();
        stopWatch.start();
        LOG.info("ArcSegmentCreator: starting");
        LOG.info("ArcSegmentCreator: arc files dir: {}", arcFiles);

        Job job = Job.getInstance(this.getConf(), "Nutch ArcSegmentCreator: " + arcFiles);
        Configuration conf = job.getConfiguration();

        // The mapper reads the segment name back from the job configuration.
        String segName = ArcSegmentCreator.generateSegmentName();
        conf.set(SEGMENT_NAME_KEY, segName);

        FileInputFormat.addInputPath(job, arcFiles);
        job.setInputFormatClass(ArcInputFormat.class);
        job.setJarByClass(ArcSegmentCreator.class);
        job.setMapperClass(ArcSegmentCreatorMapper.class);
        FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
        job.setOutputFormatClass(FetcherOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NutchWritable.class);

        try {
            boolean success = job.waitForCompletion(true);
            if (!success) {
                String message = NutchJob.getJobFailureLogMessage("ArcSegmentCreator", job);
                LOG.error(message);
                throw new RuntimeException(message);
            }
        } catch (IOException | ClassNotFoundException | InterruptedException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }

        stopWatch.stop();
        // Exactly one placeholder for the single argument; a second "{}" would be printed literally.
        LOG.info("ArcSegmentCreator: finished, elapsed: {} ms", stopWatch.getTime(TimeUnit.MILLISECONDS));
    }

    /**
     * Command-line entry point; exits the JVM with the tool's return code.
     *
     * @param args see {@link #run(String[])}
     * @throws Exception if {@link ToolRunner} fails to run the tool
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args);
        System.exit(res);
    }

    /**
     * Runs the tool.
     *
     * @param args {@code <arcFiles> <segmentsOutDir>}
     * @return 0 on success, -1 on wrong usage or when the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>";
        if (args.length < 2) {
            System.err.println(usage);
            return -1;
        }
        Path arcFiles = new Path(args[0]);
        Path segmentsOutDir = new Path(args[1]);
        try {
            this.createSegments(arcFiles, segmentsOutDir);
            return 0;
        } catch (Exception e) {
            LOG.error("ArcSegmentCreator: {}", StringUtils.stringifyException(e));
            return -1;
        }
    }

    /**
     * Mapper turning one input record (header line as key, raw bytes as value)
     * into the crawl datum, content and parse entries of a segment.
     */
    public static class ArcSegmentCreatorMapper
            extends Mapper<Text, BytesWritable, Text, NutchWritable> {

        /** Content metadata key under which the third header field of a record is stored. */
        public static final String URL_VERSION = "arc.url.version";

        private Configuration conf;
        private URLFilters urlFilters;
        private ScoringFilters scfilters;
        private ParseUtil parseUtil;
        private URLNormalizers normalizers;
        private int interval;

        /**
         * Updates the datum, parses the content (if any) and emits datum,
         * content and all parse results.
         *
         * <p>Nothing is written to the context when {@code content} is
         * {@code null}; only the datum's status, fetch time and protocol
         * status are updated in that case.
         *
         * @param context mapper context receiving the output records
         * @param segmentName segment name stored in content and parse metadata
         * @param key URL of the record
         * @param datum crawl datum to update and emit
         * @param content content to parse and emit; may be {@code null}
         * @param pstatus protocol status stored in the datum metadata; may be {@code null}
         * @param status status code to set on the datum
         * @return the parse status of the entry matching the content URL, or
         *         {@code null} if there is no content or no such parse entry
         * @throws InterruptedException if writing to the context is interrupted
         */
        private ParseStatus output(Context context, String segmentName, Text key, CrawlDatum datum,
                Content content, ProtocolStatus pstatus, int status) throws InterruptedException {
            datum.setStatus(status);
            datum.setFetchTime(System.currentTimeMillis());
            if (pstatus != null) {
                datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
            }
            if (content == null) {
                return null;
            }

            content.getMetadata().set(SEGMENT_NAME_KEY, segmentName);

            // Scoring failures are not fatal for the record.
            try {
                this.scfilters.passScoreBeforeParsing(key, datum, content);
            } catch (Exception e) {
                LOG.warn("Couldn't pass score, url {} ({})", key, e);
            }

            ParseResult parseResult = null;
            try {
                parseResult = this.parseUtil.parse(content);
            } catch (Exception e) {
                LOG.warn("Error parsing: {}: {}", key, StringUtils.stringifyException(e));
            }

            // Without a parse result the signature is computed against an empty parse.
            if (parseResult == null) {
                byte[] emptyParseSignature = SignatureFactory.getSignature(this.conf)
                        .calculate(content, new ParseStatus().getEmptyParse(this.conf));
                datum.setSignature(emptyParseSignature);
            }

            try {
                context.write(key, new NutchWritable(datum));
                context.write(key, new NutchWritable(content));
                if (parseResult != null) {
                    for (Map.Entry<Text, Parse> entry : parseResult) {
                        Text url = entry.getKey();
                        Parse parse = entry.getValue();
                        ParseStatus parseStatus = parse.getData().getStatus();
                        if (!parseStatus.isSuccess()) {
                            LOG.warn("Error parsing: {}: {}", key, parseStatus);
                            parse = parseStatus.getEmptyParse(this.conf);
                        }

                        byte[] signature = SignatureFactory.getSignature(this.conf).calculate(content, parse);
                        Metadata contentMeta = parse.getData().getContentMeta();
                        contentMeta.set(SEGMENT_NAME_KEY, segmentName);
                        contentMeta.set("nutch.content.digest", StringUtil.toHexString(signature));
                        contentMeta.set("_ftk_", Long.toString(datum.getFetchTime()));

                        // NOTE(review): the datum was already written above, so this
                        // presumably only affects the in-memory object - confirm intended.
                        if (url.equals(key)) {
                            datum.setSignature(signature);
                        }

                        try {
                            this.scfilters.passScoreAfterParsing(url, content, parse);
                        } catch (Exception e) {
                            LOG.warn("Couldn't pass score, url {} ({})", key, e);
                        }

                        context.write(url, new NutchWritable(
                                new ParseImpl(new ParseText(parse.getText()), parse.getData(), parse.isCanonical())));
                    }
                }
            } catch (IOException e) {
                LOG.error("ArcSegmentCreator caught:{}", StringUtils.stringifyException(e));
            }

            if (parseResult != null && !parseResult.isEmpty()) {
                Parse p = parseResult.get(content.getUrl());
                if (p != null) {
                    return p.getData().getStatus();
                }
            }
            return null;
        }

        /**
         * Initializes filters, normalizers, parser and the default fetch
         * interval from the job configuration.
         *
         * @param context mapper context providing the configuration
         */
        @Override
        public void setup(Context context) {
            this.conf = context.getConfiguration();
            this.urlFilters = new URLFilters(this.conf);
            this.scfilters = new ScoringFilters(this.conf);
            this.parseUtil = new ParseUtil(this.conf);
            this.normalizers = new URLNormalizers(this.conf, "fetcher");
            this.interval = this.conf.getInt("db.fetch.interval.default", 2592000);
        }

        /**
         * Converts a single record.
         *
         * <p>The key is split on whitespace; field 0 is used as URL, field 2 is
         * stored under {@link #URL_VERSION} and field 3 is used as content
         * type. Records whose URL starts with {@code filedesc://}, records
         * with fewer than four header fields, and records whose URL is
         * rejected by normalization/filtering are skipped.
         *
         * @param key header line of the record
         * @param bytes raw content of the record
         * @param context mapper context receiving the output records
         * @throws IOException declared by the overridden method
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        public void map(Text key, BytesWritable bytes, Context context)
                throws IOException, InterruptedException {
            String[] headers = key.toString().split("\\s+");
            if (headers.length > 0 && headers[0].startsWith("filedesc://")) {
                LOG.info("Ignoring file header: {}", headers[0]);
                return;
            }
            if (headers.length < 4) {
                // Indexing fields 2 and 3 below would otherwise throw and fail the task.
                LOG.warn("Skipping record with malformed header: {}", key);
                return;
            }

            String rawUrl = headers[0];
            String version = headers[2];
            String contentType = headers[3];
            LOG.info("Processing: {}", rawUrl);

            Text url = new Text();
            // 2: initial datum status - presumably CrawlDatum.STATUS_DB_FETCHED, inlined by the decompiler; confirm.
            CrawlDatum datum = new CrawlDatum(2, this.interval, 1.0f);
            String segmentName = this.conf.get(SEGMENT_NAME_KEY);

            String urlStr;
            try {
                urlStr = this.normalizers.normalize(rawUrl, "fetcher");
                urlStr = this.urlFilters.filter(urlStr);
            } catch (Exception e) {
                // Log the URL from the header; the Text "url" has not been populated yet.
                LOG.warn("Skipping {}: {}", rawUrl, e);
                urlStr = null;
            }
            if (urlStr == null) {
                return;
            }

            url.set(urlStr);
            try {
                // copyBytes() yields exactly the valid bytes; getBytes() exposes the
                // backing array, which may be longer than the record's length.
                Content content = new Content(urlStr, urlStr, bytes.copyBytes(), contentType,
                        new Metadata(), this.conf);
                content.getMetadata().set(URL_VERSION, version);
                // 33: presumably CrawlDatum.STATUS_FETCH_SUCCESS; confirm.
                this.output(context, segmentName, url, datum, content, ProtocolStatus.STATUS_SUCCESS, 33);
                context.progress();
            } catch (Throwable t) {
                ArcSegmentCreator.logError(url, t);
                // 34: presumably CrawlDatum.STATUS_FETCH_RETRY; confirm.
                this.output(context, segmentName, url, datum, null, null, 34);
            }
        }
    }
}

