/*
 * Copyright 2009-2009 the Fess Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package jp.sf.fess.helper;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.annotation.Resource;

import jp.sf.fess.Constants;
import jp.sf.fess.FessSystemException;
import jp.sf.fess.db.exentity.WebCrawlingConfig;
import jp.sf.fess.interval.FessIntervalController;
import jp.sf.fess.service.WebCrawlingConfigService;
import jp.sf.fess.solr.IndexUpdater;
import jp.sf.fess.solr.SolrServerGroup;
import jp.sf.fess.util.FessProperties;

import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.S2Robot;
import org.seasar.robot.S2RobotContext;
import org.seasar.robot.client.S2RobotClient;
import org.seasar.robot.client.http.CommonsHttpClient;
import org.seasar.robot.db.exbhv.AccessResultBhv;
import org.seasar.robot.service.DataService;
import org.seasar.robot.service.UrlQueueService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Runs the web crawl for every registered {@link WebCrawlingConfig} and
 * hands the resulting crawl sessions to the index updater.
 */
public class WebIndexHelper implements Serializable {

    private static final long serialVersionUID = 1L;

    private static final Logger logger = LoggerFactory
            .getLogger(WebIndexHelper.class);

    private static final String[] EMPTY_STRINGS = new String[0];

    @Resource
    protected FessProperties solrServerProperties;

    @Resource
    protected FessProperties crawlerProperties;

    @Resource
    protected DataService dataService;

    @Resource
    protected UrlQueueService urlQueueService;

    @Resource
    protected WebCrawlingConfigService webCrawlingConfigService;

    @Resource
    protected CrawlingConfigHelper crawlingConfigHelper;

    @Resource
    protected AccessResultBhv accessResultBhv;

    /**
     * Max access count applied to a crawling config that does not define
     * its own value.
     */
    public long maxAccessCount = 100000;

    /**
     * Crawls all web crawling configs, runs the index updater over the
     * produced sessions, stores the execution statistics under
     * {@code sessionId} and finally clears the per-session configs, the URL
     * queue and the crawled data.
     * <p>
     * Each config is crawled under its own session id of the form
     * {@code sessionId + "-" + n}, where {@code n} is the 1-based position
     * of the config in the list. A config without target URLs is skipped;
     * the remaining configs are still crawled.
     *
     * @param sessionId       base session id of this crawling run
     * @param solrServerGroup server group passed to the index updater
     * @throws FessSystemException if the robot's client is not a
     *                             {@link CommonsHttpClient}
     */
    public void crawl(String sessionId, SolrServerGroup solrServerGroup) {
        List<WebCrawlingConfig> configList = webCrawlingConfigService
                .getAllWebCrawlingConfigList();

        if (configList.isEmpty()) {
            // nothing
            if (logger.isInfoEnabled()) {
                logger.info("No crawling target urls.");
            }
            return;
        }

        boolean multiprocessCrawling = crawlerProperties.getProperty(
                Constants.MULTIPROCESS_CRAWLING_PROPERTY, Constants.TRUE)
                .equals(Constants.TRUE);

        long startTime = System.currentTimeMillis();

        int count = 0;
        List<String> sessionIdList = new ArrayList<String>();
        List<S2Robot> s2RobotList = new ArrayList<S2Robot>();
        for (WebCrawlingConfig webCrawlingConfig : configList) {
            // The counter advances for skipped configs as well, so the
            // session id suffix always matches the position in the list.
            count++;
            String sid = sessionId + "-" + count;

            // Check the urls before anything is registered for this config:
            // a config without urls must only be skipped, not abort the
            // crawling of the configs that follow it.
            String urlsStr = webCrawlingConfig.getUrls();
            if (StringUtil.isBlank(urlsStr)) {
                logger.warn("No target urls. Skipped");
                continue;
            }

            crawlingConfigHelper.setCrawlingConfig(sid, webCrawlingConfig);

            // create s2robot
            S2Robot s2Robot = SingletonS2Container.getComponent(S2Robot.class);
            s2Robot.setSessionId(sid);
            sessionIdList.add(sid);

            // interval time
            int intervalTime = webCrawlingConfig.getIntervalTime() != null ? webCrawlingConfig
                    .getIntervalTime()
                    : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
            ((FessIntervalController) s2Robot.getIntervalController())
                    .setDelayMillisForWaitingNewUrl(intervalTime);

            // num of threads
            S2RobotContext robotContext = s2Robot.getRobotContext();
            int numOfThread = webCrawlingConfig.getNumOfThread() != null ? webCrawlingConfig
                    .getNumOfThread()
                    : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
            robotContext.setNumOfThread(numOfThread);

            // depth (-1 is passed when the config defines no depth)
            int depth = webCrawlingConfig.getDepth() != null ? webCrawlingConfig
                    .getDepth()
                    : -1;
            robotContext.setMaxDepth(depth);

            // max count
            long maxCount = webCrawlingConfig.getMaxAccessCount() != null ? webCrawlingConfig
                    .getMaxAccessCount()
                    : maxAccessCount;
            robotContext.setMaxAccessCount(maxCount);

            // user agent: set on the client the factory returns for an
            // http url. instanceof is false for null, so a missing client
            // is rejected here as well.
            // NOTE(review): a config without user agent assigns null here
            // - confirm the client copes with that.
            String userAgent = webCrawlingConfig.getUserAgent();
            S2RobotClient robotClient = s2Robot.getClientFactory().getClient(
                    "http://s2robot/");
            if (!(robotClient instanceof CommonsHttpClient)) {
                throw new FessSystemException(
                        "S2RobotClient is an invalid instance: " + robotClient);
            }
            ((CommonsHttpClient) robotClient).userAgent = userAgent;

            // set urls (splitting on single whitespace characters yields
            // empty tokens for "\r\n" or repeated blanks; ignore them)
            for (String u : splitByWhitespace(urlsStr)) {
                if (StringUtil.isNotBlank(u)) {
                    s2Robot.addUrl(u);
                }
            }

            // set included urls (the filter list may be unset)
            for (String u : splitByWhitespace(webCrawlingConfig
                    .getIncludedUrls())) {
                if (StringUtil.isNotBlank(u)) {
                    s2Robot.addIncludeFilter(u);
                }
            }

            // set excluded urls (the filter list may be unset)
            for (String u : splitByWhitespace(webCrawlingConfig
                    .getExcludedUrls())) {
                if (StringUtil.isNotBlank(u)) {
                    s2Robot.addExcludeFilter(u);
                }
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Crawling " + urlsStr);
            }

            s2Robot.setBackground(multiprocessCrawling);

            // crawl
            s2Robot.execute();

            if (multiprocessCrawling) {
                // remembered so that termination can be awaited below
                s2RobotList.add(s2Robot);
            }
        }

        // run index update
        IndexUpdater indexUpdater = SingletonS2Container
                .getComponent("indexUpdater");
        indexUpdater.setSessionIdList(sessionIdList);
        indexUpdater.setSolrServerGroup(solrServerGroup);
        indexUpdater.setDaemon(true);
        indexUpdater.start();

        if (multiprocessCrawling) {
            for (S2Robot s2Robot : s2RobotList) {
                s2Robot.awaitTermination();
            }
        }

        Map<String, String> infoMap = new HashMap<String, String>();

        long execTime = System.currentTimeMillis() - startTime;
        infoMap.put(Constants.WEB_CRAWLING_EXEC_TIME, Long.toString(execTime));
        if (logger.isInfoEnabled()) {
            logger.info("[EXEC TIME] crawling time: " + execTime + "ms");
        }

        // tell the updater that crawling is over, then wait for it
        indexUpdater.setFinishCrawling(true);
        try {
            indexUpdater.join();
        } catch (InterruptedException e) {
            logger.warn("Interrupted index update.", e);
            // keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
        }

        infoMap.put(Constants.WEB_INDEX_EXEC_TIME, Long.toString(indexUpdater
                .getExecuteTime()));
        infoMap.put(Constants.WEB_INDEX_SIZE, Long.toString(indexUpdater
                .getDocumentSize()));

        // store info map
        CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
                .getComponent("crawlingSessionHelper");
        crawlingSessionHelper.put(sessionId, infoMap);

        for (String sid : sessionIdList) {
            // remove config
            crawlingConfigHelper.setCrawlingConfig(sid, null);
        }

        // clear queue
        urlQueueService.deleteAll();

        // clear
        dataService.deleteAll();

    }

    /**
     * Splits a whitespace-separated list into its tokens.
     *
     * @param value the raw list; may be {@code null}
     * @return the tokens, which may include empty strings, or an empty
     *         array when {@code value} is {@code null}
     */
    private static String[] splitByWhitespace(String value) {
        return value == null ? EMPTY_STRINGS : value.split("\\s");
    }

}
