/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.parse.js;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.Parser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Extracts outlinks from JavaScript.
 *
 * <p>Used as an {@link HtmlParseFilter}, it walks the parsed DOM and scans the
 * text of {@code <script>} elements, attributes whose name starts with
 * {@code on}, and {@code href} attributes containing {@code javascript:}.
 * Used as a {@link Parser}, it treats the whole content as one script.
 *
 * <p>Link detection is purely textual: every quoted string literal that
 * contains no whitespace and looks like a URI (see {@link #URI_PATTERN}) is
 * turned into an {@link Outlink}. No JavaScript is executed.
 */
public class JSParseFilter implements HtmlParseFilter, Parser {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /** Maximum number of characters of the script used as the parse title. */
    private static final int MAX_TITLE_LEN = 80;

    /**
     * Matches a quoted string literal: an opening single or double quote
     * (optionally preceded by backslashes), a body without whitespace or
     * quotes (group 2), and the same delimiter again.
     */
    private static final Pattern STRING_PATTERN = Pattern.compile(
            "(\\\\*(?:\"|'))([^\\s\"']+?)(?:\\1)", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /**
     * Heuristic for "looks like a URI": the text must contain a {@code /} or
     * {@code .} with non-whitespace characters on both sides.
     */
    private static final Pattern URI_PATTERN = Pattern.compile(
            "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)", Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

    /** Configuration handed in through {@link #setConf(Configuration)}. */
    private Configuration conf;

    /**
     * Adds outlinks found in the document's JavaScript to the parse stored
     * under the content URL.
     *
     * <p>When at least one link is found, the entry for
     * {@code content.getUrl()} is replaced by one with the same status, title,
     * text and metadata, whose outlinks are the new links followed by the
     * previously known ones.
     *
     * @param content     the fetched content; supplies the URL and base URL
     * @param parseResult the parse result to extend
     * @param metaTags    HTML meta tags (only passed through to the DOM walk)
     * @param doc         the parsed DOM to scan
     * @return the same {@code parseResult} instance, possibly updated
     */
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
        Parse parse = parseResult.get(content.getUrl());
        if (parse == null) {
            // Nothing to extend; previously this ended in a NullPointerException
            // as soon as a link was found.
            return parseResult;
        }
        List<Outlink> outlinks = new ArrayList<>();
        this.walk(doc, parse, metaTags, content.getBaseUrl(), outlinks);
        if (outlinks.isEmpty()) {
            return parseResult;
        }
        ParseData existing = parse.getData();
        // JavaScript links first, then the links the parse already had.
        outlinks.addAll(Arrays.asList(existing.getOutlinks()));
        Outlink[] merged = outlinks.toArray(new Outlink[0]);
        ParseData parseData = new ParseData(existing.getStatus(), existing.getTitle(), merged,
                existing.getContentMeta(), existing.getParseMeta());
        parseResult.put(content.getUrl(), new ParseText(parse.getText()), parseData);
        return parseResult;
    }

    /**
     * Recursively scans a DOM subtree for JavaScript and collects its links.
     *
     * @param n        the node to inspect
     * @param parse    not read here; passed on unchanged to recursive calls
     * @param metaTags not read here; passed on unchanged to recursive calls
     * @param base     base URL used to resolve relative links
     * @param outlinks accumulator that receives every link found
     */
    private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List<Outlink> outlinks) {
        if (n instanceof Element) {
            if (n.getNodeName().equalsIgnoreCase("script")) {
                NodeList parts = n.getChildNodes();
                if (parts.getLength() > 0) {
                    StringBuilder script = new StringBuilder();
                    for (int i = 0; i < parts.getLength(); ++i) {
                        if (i > 0) {
                            script.append('\n');
                        }
                        String value = parts.item(i).getNodeValue();
                        // Skip null values instead of appending the text "null".
                        if (value != null) {
                            script.append(value);
                        }
                    }
                    outlinks.addAll(Arrays.asList(this.getJSLinks(script.toString(), "", base)));
                    // The script body has been consumed; do not descend into it.
                    return;
                }
            } else {
                NamedNodeMap attrs = n.getAttributes();
                int len = attrs.getLength();
                for (int i = 0; i < len; ++i) {
                    Node attr = attrs.item(i);
                    String value = attr.getNodeValue();
                    if (value == null) {
                        continue;
                    }
                    String attrName = attr.getNodeName();
                    boolean eventHandler = attrName.startsWith("on");
                    // Locale.ROOT keeps the comparison independent of the default
                    // locale (e.g. Turkish lower-cases 'I' to a dotless i).
                    boolean scriptHref = attrName.equalsIgnoreCase("href")
                            && value.toLowerCase(Locale.ROOT).contains("javascript:");
                    if (eventHandler || scriptHref) {
                        outlinks.addAll(Arrays.asList(this.getJSLinks(value, "", base)));
                    }
                }
            }
        }
        NodeList children = n.getChildNodes();
        for (int i = 0; i < children.getLength(); ++i) {
            this.walk(children.item(i), parse, metaTags, base, outlinks);
        }
    }

    /**
     * Parses the content as a standalone script.
     *
     * <p>The bytes are decoded as UTF-8 rather than with the platform default
     * charset, so results do not depend on the host. The title is the first
     * line of the script, cut to at most {@link #MAX_TITLE_LEN} characters.
     *
     * @param c the content to parse
     * @return a parse result holding the script text and the links found in it
     */
    public ParseResult getParse(Content c) {
        String script = new String(c.getContent(), StandardCharsets.UTF_8);
        Outlink[] outlinks = this.getJSLinks(script, "", c.getUrl());
        int firstLineEnd = script.indexOf('\n');
        if (firstLineEnd == -1) {
            firstLineEnd = script.length();
        }
        String title = script.substring(0, Math.min(firstLineEnd, MAX_TITLE_LEN));
        ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, c.getMetadata());
        return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
    }

    /**
     * Extracts links from the quoted string literals of a piece of script.
     *
     * <p>Literals starting with {@code www.} get {@code http://} prepended;
     * all others are resolved against {@code base}. A literal that cannot be
     * resolved or turned into an {@link Outlink} is logged and skipped, and
     * extraction continues with the next literal.
     *
     * @param plainText the script text; {@code null} yields an empty array
     * @param anchor    anchor text assigned to every extracted link
     * @param base      base URL for relative links; if it cannot be parsed,
     *                  only links that are valid without a base are kept
     * @return the extracted links, never {@code null}
     */
    private Outlink[] getJSLinks(String plainText, String anchor, String base) {
        if (plainText == null) {
            return new Outlink[0];
        }
        URL baseURL = null;
        try {
            baseURL = new URL(base);
        }
        catch (Exception e) {
            // Deliberately broad: a bad (or null) base must not stop extraction.
            LOG.error("error assigning base URL", e);
        }
        List<Outlink> outlinks = new ArrayList<>();
        Matcher matcher = STRING_PATTERN.matcher(plainText);
        while (matcher.find()) {
            String candidate = matcher.group(2);
            if (!URI_PATTERN.matcher(candidate).matches()) {
                continue;
            }
            String url;
            if (candidate.startsWith("www.")) {
                url = "http://" + candidate;
            } else {
                try {
                    url = new URL(baseURL, candidate).toString();
                }
                catch (MalformedURLException ex) {
                    LOG.trace(" - failed URL parse '{}' and baseURL '{}'", candidate, baseURL, ex);
                    continue;
                }
            }
            url = url.replace("&amp;", "&");
            LOG.trace(" - outlink from JS: '{}'", url);
            try {
                outlinks.add(new Outlink(url, anchor));
            }
            catch (Exception ex) {
                // Drop only this link. Previously one failure here ended the
                // loop and discarded every later link in the script.
                LOG.error(" - invalid or malformed URL", ex);
            }
        }
        return outlinks.toArray(new Outlink[0]);
    }

    /**
     * Command-line check: reads a script file as UTF-8 and prints the
     * extracted outlinks.
     *
     * @param args {@code file.js baseURL}
     * @throws Exception if the file cannot be read
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
            return;
        }
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the file even when reading fails.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(args[0]), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        JSParseFilter parseFilter = new JSParseFilter();
        parseFilter.setConf(NutchConfiguration.create());
        Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
        System.out.println("Outlinks extracted: " + links.length);
        for (Outlink link : links) {
            System.out.println(" - " + String.valueOf(link));
        }
    }

    /**
     * Stores the configuration.
     *
     * @param conf the configuration to keep
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Returns the stored configuration.
     *
     * @return the configuration last passed to {@link #setConf(Configuration)},
     *         or {@code null} if none was set
     */
    public Configuration getConf() {
        return this.conf;
    }
}

