/*
 * Copyright 2009-2009 the Fess Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package jp.sf.fess.transformer;

import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jp.sf.fess.Constants;
import jp.sf.fess.FessSystemException;
import jp.sf.fess.db.exentity.CrawlingConfig;
import jp.sf.fess.helper.CrawlingConfigHelper;
import jp.sf.fess.helper.CrawlingSessionHelper;
import jp.sf.fess.helper.PathMappingHelper;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.RobotCrawlAccessException;
import org.seasar.robot.entity.ExtractData;
import org.seasar.robot.entity.ResponseData;
import org.seasar.robot.entity.ResultData;
import org.seasar.robot.extractor.Extractor;
import org.seasar.robot.extractor.ExtractorFactory;
import org.seasar.robot.transformer.impl.XpathTransformer;

/**
 * Transformer that extracts text from a crawled file response and serializes
 * it, together with a set of index fields (segment, content, title, url,
 * mimetype, ...), into the result data returned to the crawler.
 * <p>
 * The text is obtained from the {@link Extractor} that the
 * <code>extractorFactory</code> component returns for the response MIME type.
 */
public class FessFileTransformer extends XpathTransformer {
    /**
     * Encoding name, UTF-8 by default.
     * NOTE(review): this field is not read anywhere in this class; the output
     * is encoded with <code>charsetName</code> (declared outside this class) —
     * confirm whether it is still needed.
     */
    public String encoding = Constants.UTF_8;

    /** Title used when no text could be extracted from the document. */
    public String noTitleLabel = "No title.";

    /**
     * Number of bytes by which the UTF-8 encoded title may exceed the
     * requested width before {@link #abbreviate(String, int)} shortens it
     * further.
     */
    public int abbreviationMarginLength = 10;

    /**
     * Stores the field in <code>dataMap</code> and returns its serialized
     * form.
     *
     * @param dataMap map collecting every field of the document
     * @param key field name
     * @param value field value; a {@link List} is treated as a list of
     *            strings, anything else is cast to {@link String}
     * @return the serialized field, as produced by the two-argument
     *         <code>getResultDataBody</code> overloads
     */
    protected String getResultDataBody(Map<String, Object> dataMap, String key,
            Object value) {
        dataMap.put(key, value);
        if (value instanceof List) {
            // Callers in this class only ever pass List<String>.
            @SuppressWarnings("unchecked")
            List<String> values = (List<String>) value;
            return getResultDataBody(key, values);
        }
        return getResultDataBody(key, (String) value);
    }

    /**
     * Extracts the text of the response and builds the result data holding
     * all index fields.
     *
     * @param responseData the crawled response; must have a response body
     * @return the result data, encoded with <code>charsetName</code>
     * @throws RobotCrawlAccessException if there is no response body, the
     *             text cannot be extracted, or <code>charsetName</code> is
     *             not a supported encoding
     * @throws FessSystemException if the <code>extractorFactory</code>
     *             component is not available
     */
    @Override
    public ResultData transform(ResponseData responseData) {
        if (responseData == null || responseData.getResponseBody() == null) {
            throw new RobotCrawlAccessException("No response body.");
        }

        String content = extractFileText(responseData);

        ResultData resultData = new ResultData();
        resultData.setTransformerName(getName());

        CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
                .getComponent("crawlingSessionHelper");
        String sessionId = crawlingSessionHelper
                .getCanonicalSessionId(responseData.getSessionId());
        PathMappingHelper pathMappingHelper = SingletonS2Container
                .getComponent("pathMappingHelper");
        String url = pathMappingHelper.replaceUrl(sessionId, responseData
                .getUrl());

        StringBuilder buf = new StringBuilder(1000);
        buf.append(getResultDataHeader());

        Map<String, Object> dataMap = new HashMap<String, Object>();

        // Collapse every run of whitespace into a single space.
        String body = content.replaceAll("\\s+", " ");
        boolean hasContent = StringUtil.isNotBlank(content);

        // segment
        buf.append(getResultDataBody(dataMap, "segment", sessionId));
        // digest
        buf.append(getResultDataBody(dataMap, "digest", ""));
        // content
        buf.append(getResultDataBody(dataMap, "content", hasContent ? body
                : ""));
        // title: the beginning of the text, or the fallback label
        buf.append(getResultDataBody(dataMap, "title",
                hasContent ? abbreviate(body, 50) : noTitleLabel));
        // host (currently the full url)
        buf.append(getResultDataBody(dataMap, "host", url));
        // site (currently the full url)
        buf.append(getResultDataBody(dataMap, "site", url));
        // url
        buf.append(getResultDataBody(dataMap, "url", url));
        // cache
        // TODO performance issue. Is the whole content really needed?
        buf.append(getResultDataBody(dataMap, "cache", body));
        // tstamp
        buf.append(getResultDataBody(dataMap, "tstamp", Long
                .toString(new Date().getTime())));
        // TODO anchor
        buf.append(getResultDataBody(dataMap, "anchor", ""));
        // mimetype
        buf.append(getResultDataBody(dataMap, "mimetype", responseData
                .getMimeType()));
        // contentLength
        buf.append(getResultDataBody(dataMap, "contentLength", Long
                .toString(responseData.getContentLength())));
        // lastModified: omitted when the response carries no date, instead
        // of failing the whole document with a NullPointerException
        Date lastModified = responseData.getLastModified();
        if (lastModified != null) {
            buf.append(getResultDataBody(dataMap, "lastModified", Long
                    .toString(lastModified.getTime())));
        }
        // type
        CrawlingConfigHelper crawlingConfigHelper = SingletonS2Container
                .getComponent("crawlingConfigHelper");
        CrawlingConfig crawlingConfig = crawlingConfigHelper
                .getCrawlingConfig(responseData.getSessionId());
        List<String> typeList = new ArrayList<String>();
        for (String type : crawlingConfig.getBrowserTypeValues()) {
            typeList.add(type);
        }
        buf.append(getResultDataBody(dataMap, "type", typeList));
        // TODO date
        // TODO lang
        // TODO boost
        // id: generated from the collected fields, so it is not itself
        // stored in dataMap
        buf.append(getResultDataBody("id", crawlingSessionHelper
                .generateId(dataMap)));

        buf.append(getResultDataFooter());

        try {
            resultData.setData(buf.toString().getBytes(charsetName));
        } catch (UnsupportedEncodingException e) {
            throw new RobotCrawlAccessException("Invalid charsetName: "
                    + charsetName, e);
        }
        resultData.setEncoding(charsetName);

        return resultData;
    }

    /**
     * Reads the response body with the extractor registered for its MIME
     * type and returns the extracted content followed by all extracted
     * metadata values, separated by single spaces. The response stream is
     * always closed once extraction has been attempted.
     *
     * @param responseData the crawled response
     * @return the extracted text, never <code>null</code>
     * @throws RobotCrawlAccessException (WARN level) if extraction fails
     * @throws FessSystemException if the <code>extractorFactory</code>
     *             component is not available
     */
    private String extractFileText(ResponseData responseData) {
        ExtractorFactory extractorFactory = SingletonS2Container
                .getComponent("extractorFactory");
        if (extractorFactory == null) {
            throw new FessSystemException("Could not find extractorFactory.");
        }
        Extractor extractor = extractorFactory.getExtractor(responseData
                .getMimeType());
        InputStream in = responseData.getResponseBody();
        Map<String, String> params = new HashMap<String, String>();
        params.put(ExtractData.RESOURCE_NAME_KEY,
                getResourceName(responseData));
        params.put(ExtractData.CONTENT_TYPE, responseData.getMimeType());
        StringBuilder contentBuf = new StringBuilder(1000);
        try {
            ExtractData extractData = extractor.getText(in, params);
            String text = extractData.getContent();
            // StringBuilder.append(null) would add the literal "null".
            if (text != null) {
                contentBuf.append(text);
            }
            for (String key : extractData.getKeySet()) {
                String[] values = extractData.getValues(key);
                if (values != null) {
                    // Keep the previous word and the first metadata value
                    // from being fused into one token.
                    if (contentBuf.length() > 0) {
                        contentBuf.append(' ');
                    }
                    contentBuf.append(StringUtils.join(values, ' '));
                }
            }
        } catch (Exception e) {
            // Also covers a null extractor (no extractor for the MIME type).
            RobotCrawlAccessException rcae = new RobotCrawlAccessException(
                    "Could not get a text from " + responseData.getUrl(), e);
            rcae.setLogLevel(RobotCrawlAccessException.WARN);
            throw rcae;
        } finally {
            IOUtils.closeQuietly(in);
        }
        return contentBuf.toString();
    }

    /**
     * Abbreviates <code>str</code> to at most <code>maxWidth</code>
     * characters. If the UTF-8 encoding of the result is still longer than
     * <code>maxWidth + abbreviationMarginLength</code> bytes (typically
     * multi-byte text), the string is abbreviated to half the width instead.
     *
     * @param str the string to shorten, may be <code>null</code>
     * @param maxWidth maximum number of characters, at least 4
     * @return the abbreviated string, or <code>null</code> if
     *         <code>str</code> is <code>null</code>
     */
    protected String abbreviate(String str, int maxWidth) {
        String newStr = StringUtils.abbreviate(str, maxWidth);
        if (newStr == null) {
            return null;
        }
        try {
            if (newStr.getBytes(Constants.UTF_8).length > maxWidth
                    + abbreviationMarginLength) {
                // StringUtils.abbreviate rejects widths below 4.
                newStr = StringUtils.abbreviate(str, Math
                        .max(maxWidth / 2, 4));
            }
        } catch (UnsupportedEncodingException e) {
            // Best effort: keep the character-based abbreviation.
        }
        return newStr;
    }

    /**
     * Returns the decoded last path segment of the response URL, used as the
     * resource name hint for the extractor.
     *
     * @param responseData the crawled response
     * @return the decoded name; the undecoded name if it cannot be decoded
     *         with the response charset; or <code>null</code> if the URL or
     *         the charset is missing
     */
    private String getResourceName(ResponseData responseData) {
        String name = responseData.getUrl();
        String enc = responseData.getCharSet();

        if (name == null || enc == null) {
            return null;
        }

        // Drop trailing slashes, then keep what follows the last slash.
        name = name.replaceAll("/+$", "");
        int idx = name.lastIndexOf("/");
        if (idx >= 0) {
            name = name.substring(idx + 1);
        }
        try {
            return URLDecoder.decode(name, enc);
        } catch (UnsupportedEncodingException e) {
            return name;
        } catch (IllegalArgumentException e) {
            // Malformed escape such as a literal '%' in the file name.
            return name;
        }
    }
}
