/*
 * Copyright 2009-2009 the Fess Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package jp.sf.fess.transformer;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jp.sf.fess.db.exentity.CrawlingConfig;
import jp.sf.fess.helper.CrawlingConfigHelper;
import jp.sf.fess.helper.CrawlingSessionHelper;
import jp.sf.fess.helper.PathMappingHelper;

import org.apache.commons.lang.StringUtils;
import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.entity.ResponseData;
import org.seasar.robot.transformer.impl.XpathTransformer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * {@link XpathTransformer} that appends the Fess-specific fields (segment,
 * host, site, url, cache, tstamp, anchor, mimetype, contentLength,
 * lastModified, type and id) to the data produced for a crawled document.
 */
public class FessXpathTransformer extends XpathTransformer {
    private static final Logger logger = LoggerFactory
            .getLogger(FessXpathTransformer.class);

    /**
     * XPath expression selecting the node whose text content is stored in
     * the "cache" field.
     */
    public String cacheXpath = "/HTML/BODY";

    /**
     * XPath expression selecting the anchor attributes whose values are
     * collected into the "anchor" field.
     */
    public String anchorXpath = "//A/@href";

    /**
     * Records the value in {@code dataMap} under {@code key} and returns the
     * result-data body built by the matching two-argument
     * {@code getResultDataBody} overload.
     *
     * @param dataMap map collecting every field added so far; it is later
     *            used to generate the document id
     * @param key field name
     * @param value field value; a {@link List} is handled as a list of
     *            strings, any other non-null value is rendered with
     *            {@code toString()}, and {@code null} is passed through
     * @return the result-data body for this field
     */
    protected String getResultDataBody(Map<String, Object> dataMap, String key,
            Object value) {
        dataMap.put(key, value);
        if (value instanceof List) {
            // The element type cannot be verified at runtime; callers in
            // this class only ever pass List<String>.
            @SuppressWarnings("unchecked")
            List<String> values = (List<String>) value;
            return getResultDataBody(key, values);
        }
        // A typed local keeps overload resolution on the (String, String)
        // variant even when the value is null.
        String text = value != null ? value.toString() : null;
        return getResultDataBody(key, text);
    }

    /**
     * Builds the additional field data for the given response.
     *
     * @param responseData crawled response supplying the session id, url,
     *            mime type, content length and last-modified date
     * @param document parsed document used for the cache text and anchors
     * @return the concatenated result-data bodies of all added fields
     */
    @Override
    protected String getAdditionalData(ResponseData responseData,
            Document document) {
        CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
                .getComponent("crawlingSessionHelper");
        String sessionId = crawlingSessionHelper
                .getCanonicalSessionId(responseData.getSessionId());
        PathMappingHelper pathMappingHelper = SingletonS2Container
                .getComponent("pathMappingHelper");
        String url = pathMappingHelper.replaceUrl(sessionId, responseData
                .getUrl());
        StringBuilder buf = new StringBuilder(1000);

        // Every field except "id" is also recorded here so that the id can
        // be generated from the collected values at the end.
        Map<String, Object> dataMap = new HashMap<String, Object>();

        // digest, content and title are not added by this method.
        // segment
        buf.append(getResultDataBody(dataMap, "segment", sessionId));
        // TODO boost
        // host
        buf.append(getResultDataBody(dataMap, "host", getHost(url)));
        // site
        buf.append(getResultDataBody(dataMap, "site", getSite(url)));
        // url
        buf.append(getResultDataBody(dataMap, "url", url));
        // cache
        // TODO performance issue. all content is needed??
        buf.append(getResultDataBody(dataMap, "cache", getSingleNodeValue(
                document, cacheXpath)));
        // tstamp
        buf.append(getResultDataBody(dataMap, "tstamp", Long
                .toString(System.currentTimeMillis())));
        // anchor (resolved against the original, unmapped URL)
        buf.append(getResultDataBody(dataMap, "anchor", getAnchorList(document,
                responseData.getUrl())));
        // mimetype
        buf.append(getResultDataBody(dataMap, "mimetype", responseData
                .getMimeType()));
        // contentLength
        buf.append(getResultDataBody(dataMap, "contentLength", Long
                .toString(responseData.getContentLength())));
        // lastModified: omitted when the response carries no date, instead
        // of failing the whole transformation with a NullPointerException.
        Date lastModified = responseData.getLastModified();
        if (lastModified != null) {
            buf.append(getResultDataBody(dataMap, "lastModified", Long
                    .toString(lastModified.getTime())));
        }
        // type
        CrawlingConfigHelper crawlingConfigHelper = SingletonS2Container
                .getComponent("crawlingConfigHelper");
        CrawlingConfig crawlingConfig = crawlingConfigHelper
                .getCrawlingConfig(responseData.getSessionId());
        List<String> typeList = new ArrayList<String>();
        for (String type : crawlingConfig.getBrowserTypeValues()) {
            typeList.add(type);
        }
        buf.append(getResultDataBody(dataMap, "type", typeList));
        // TODO date
        // TODO lang
        // id (generated from the collected fields, not stored in dataMap)
        buf.append(getResultDataBody("id", crawlingSessionHelper
                .generateId(dataMap)));

        return buf.toString();
    }

    /**
     * Returns the text content of the first node matching {@code xpath}.
     *
     * @param document document to search
     * @param xpath XPath expression to evaluate
     * @return the node's text content, or {@code null} when nothing matches
     *         or the expression cannot be evaluated
     */
    protected String getSingleNodeValue(Document document, String xpath) {
        Node node = null;
        try {
            node = getXPathAPI().selectSingleNode(document, xpath);
        } catch (Exception e) {
            // Best effort: a failing expression yields no value, but the
            // cause is kept in the log.
            logger.warn("Could not parse a value of " + xpath, e);
        }
        return node != null ? node.getTextContent() : null;
    }

    /**
     * Returns the text content of every node matching {@code xpath}, each
     * followed by a newline.
     *
     * @param document document to search
     * @param xpath XPath expression to evaluate
     * @return the joined text; on failure, whatever was collected before the
     *         error (possibly an empty string)
     */
    protected String getMultipleNodeValue(Document document, String xpath) {
        StringBuilder buf = new StringBuilder(100);
        try {
            NodeList nodeList = getXPathAPI().selectNodeList(document, xpath);
            for (int i = 0; i < nodeList.getLength(); i++) {
                buf.append(nodeList.item(i).getTextContent());
                buf.append("\n");
            }
        } catch (Exception e) {
            // Best effort: keep what was collected, but log the cause.
            logger.warn("Could not parse a value of " + xpath, e);
        }
        return buf.toString();
    }

    /**
     * Collects the normalized absolute URLs of the anchors selected by
     * {@link #anchorXpath}.
     *
     * @param document document to scan for anchors
     * @param currentUrl URL used to resolve relative anchors when the
     *            document declares no base href
     * @return the anchor URLs found; empty when none are found, and partial
     *         when parsing fails part-way
     */
    protected List<String> getAnchorList(Document document, String currentUrl) {
        List<String> anchorList = new ArrayList<String>();
        String baseHref = getBaseHref(document);
        try {
            URL baseUrl = new URL(baseHref != null ? baseHref : currentUrl);
            NodeList list = getXPathAPI().selectNodeList(document, anchorXpath);
            for (int i = 0; i < list.getLength(); i++) {
                String attrValue = list.item(i).getTextContent();
                if (!isValidPath(attrValue)) {
                    continue;
                }
                try {
                    URL childUrl = new URL(baseUrl, attrValue);
                    String u = normalizeUrl(childUrl.toString());
                    if (StringUtil.isNotBlank(u)) {
                        anchorList.add(u);
                    }
                } catch (MalformedURLException e) {
                    // A single broken link must not abort the scan; skip it
                    // but leave a trace for troubleshooting.
                    if (logger.isDebugEnabled()) {
                        logger.debug("Ignored a malformed anchor: "
                                + attrValue, e);
                    }
                }
            }
        } catch (Exception e) {
            logger.warn("Could not parse anchor tags.", e);
        }
        return anchorList;
    }

    /**
     * Extracts the part of the URL between the scheme separator and the
     * first following slash.
     *
     * @param url URL to inspect
     * @return that part of the URL, or an empty string when {@code url} is
     *         blank
     */
    protected String getHost(String url) {
        if (StringUtil.isBlank(url)) {
            return ""; // empty
        }

        String rest = stripScheme(url);
        int pathStart = rest.indexOf('/');
        return pathStart >= 0 ? rest.substring(0, pathStart) : rest;
    }

    /**
     * Returns the URL without its scheme and query string, abbreviated to
     * 50 characters with {@link StringUtils#abbreviate(String, int)}.
     *
     * @param url URL to inspect
     * @return the abbreviated site string, or an empty string when
     *         {@code url} is blank
     */
    protected String getSite(String url) {
        if (StringUtil.isBlank(url)) {
            return ""; // empty
        }

        String rest = stripScheme(url);
        int queryStart = rest.indexOf('?');
        if (queryStart >= 0) {
            rest = rest.substring(0, queryStart);
        }

        return StringUtils.abbreviate(rest, 50);
    }

    /**
     * Removes everything up to and including the first "://" from the URL;
     * returns the URL unchanged when it contains no such separator.
     */
    private static String stripScheme(String url) {
        int schemeEnd = url.indexOf("://");
        return schemeEnd >= 0 ? url.substring(schemeEnd + 3) : url;
    }
}
