/*
 * Copyright 2009-2010 the Fess Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package jp.sf.fess.helper;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.annotation.Resource;

import jp.sf.fess.Constants;
import jp.sf.fess.db.exentity.RequestHeader;
import jp.sf.fess.db.exentity.WebAuthentication;
import jp.sf.fess.db.exentity.WebCrawlingConfig;
import jp.sf.fess.interval.FessIntervalController;
import jp.sf.fess.service.FailureUrlService;
import jp.sf.fess.service.RequestHeaderService;
import jp.sf.fess.service.WebAuthenticationService;
import jp.sf.fess.service.WebCrawlingConfigService;
import jp.sf.fess.solr.IndexUpdater;
import jp.sf.fess.solr.SolrServerGroup;
import jp.sf.fess.util.FessProperties;

import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.S2Robot;
import org.seasar.robot.S2RobotContext;
import org.seasar.robot.client.http.Authentication;
import org.seasar.robot.client.http.HcHttpClient;
import org.seasar.robot.service.DataService;
import org.seasar.robot.service.UrlFilterService;
import org.seasar.robot.service.UrlQueueService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class WebIndexHelper implements Serializable {

    private static final long serialVersionUID = 1L;

    private static final Logger logger = LoggerFactory
            .getLogger(WebIndexHelper.class);

    @Resource
    protected FessProperties crawlerProperties;

    @Resource
    public WebCrawlingConfigService webCrawlingConfigService;

    @Resource
    protected WebAuthenticationService webAuthenticationService;

    @Resource
    protected RequestHeaderService requestHeaderService;

    @Resource
    public FailureUrlService failureUrlService;

    @Resource
    protected CrawlingConfigHelper crawlingConfigHelper;

    public long maxAccessCount = 100000;

    public long crawlingExecutionInterval = Constants.DEFAULT_CRAWLING_EXECUTION_INTERVAL;

    public int indexUpdaterPriority = Thread.MAX_PRIORITY;

    public int crawlerPriority = Thread.NORM_PRIORITY;

    private List<S2Robot> s2RobotList = Collections
            .synchronizedList(new ArrayList<S2Robot>());

    public void crawl(String sessionId, SolrServerGroup solrServerGroup) {
        List<WebCrawlingConfig> configList = webCrawlingConfigService
                .getAllWebCrawlingConfigList();

        if (configList.isEmpty()) {
            // nothing
            if (logger.isInfoEnabled()) {
                logger.info("No crawling target urls.");
            }
            return;
        }

        int multiprocessCrawlingCount = 5;
        String value = crawlerProperties.getProperty(
                Constants.CRAWLING_THREAD_COUNT_PROPERTY, "5");
        try {
            multiprocessCrawlingCount = Integer.parseInt(value);
        } catch (NumberFormatException e) {
            // NOP
        }

        long commitPerCount = Constants.DEFAULT_COMMIT_PER_COUNT;
        value = crawlerProperties.getProperty(
                Constants.COMMIT_PER_COUNT_PROPERTY,
                Long.toString(Constants.DEFAULT_COMMIT_PER_COUNT));
        try {
            commitPerCount = Long.parseLong(value);
        } catch (NumberFormatException e) {
            // NOP
        }

        long startTime = System.currentTimeMillis();

        int count = 0;
        List<String> sessionIdList = new ArrayList<String>();
        s2RobotList.clear();
        crawlingConfigHelper.init();
        List<String> s2RobotStatusList = new ArrayList<String>();
        for (WebCrawlingConfig webCrawlingConfig : configList) {
            count++;
            String sid = sessionId + "-" + count;

            crawlingConfigHelper.setCrawlingConfig(sid, webCrawlingConfig);

            // create s2robot
            S2Robot s2Robot = SingletonS2Container.getComponent(S2Robot.class);
            s2Robot.setSessionId(sid);
            sessionIdList.add(sid);

            String urlsStr = webCrawlingConfig.getUrls();
            if (StringUtil.isBlank(urlsStr)) {
                logger.warn("No target urls. Skipped");
                break;
            }

            // interval time
            int intervalTime = webCrawlingConfig.getIntervalTime() != null ? webCrawlingConfig
                    .getIntervalTime()
                    : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
            ((FessIntervalController) s2Robot.getIntervalController())
                    .setDelayMillisForWaitingNewUrl(intervalTime);

            String includedUrlsStr = webCrawlingConfig.getIncludedUrls();
            String excludedUrlsStr = webCrawlingConfig.getExcludedUrls();

            // num of threads
            S2RobotContext robotContext = s2Robot.getRobotContext();
            int numOfThread = webCrawlingConfig.getNumOfThread() != null ? webCrawlingConfig
                    .getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
            robotContext.setNumOfThread(numOfThread);

            // depth
            int depth = webCrawlingConfig.getDepth() != null ? webCrawlingConfig
                    .getDepth() : -1;
            robotContext.setMaxDepth(depth);

            // max count
            long maxCount = webCrawlingConfig.getMaxAccessCount() != null ? webCrawlingConfig
                    .getMaxAccessCount() : maxAccessCount;
            robotContext.setMaxAccessCount(maxCount);

            // HttpClient Parameters
            Map<String, Object> paramMap = new HashMap<String, Object>();
            s2Robot.getClientFactory().setInitParameterMap(paramMap);

            String userAgent = webCrawlingConfig.getUserAgent();
            paramMap.put(HcHttpClient.USER_AGENT_PROPERTY, userAgent);

            List<WebAuthentication> webAuthList = webAuthenticationService
                    .getWebAuthenticationList(webCrawlingConfig.getId());
            List<Authentication> basicAuthList = new ArrayList<Authentication>();
            for (WebAuthentication webAuth : webAuthList) {
                basicAuthList.add(webAuth.getAuthentication());
            }
            paramMap.put(HcHttpClient.BASIC_AUTHENTICATIONS_PROPERTY,
                    basicAuthList.toArray(new Authentication[basicAuthList
                            .size()]));

            // request header
            List<RequestHeader> requestHeaderList = requestHeaderService
                    .getRequestHeaderList(webCrawlingConfig.getId());
            List<org.seasar.robot.client.http.RequestHeader> rhList = new ArrayList<org.seasar.robot.client.http.RequestHeader>();
            for (RequestHeader requestHeader : requestHeaderList) {
                rhList.add(requestHeader.getS2RobotRequestHeader());
            }
            paramMap.put(
                    HcHttpClient.REQUERT_HEADERS_PROPERTY,
                    rhList.toArray(new org.seasar.robot.client.http.RequestHeader[rhList
                            .size()]));

            // set urls
            String[] urls = urlsStr.split("[\r\n]");
            for (String u : urls) {
                if (StringUtil.isNotBlank(u)) {
                    String urlValue = u.trim();
                    if (!urlValue.startsWith("#")) {
                        s2Robot.addUrl(urlValue);
                    }
                }
            }

            // set included urls
            String[] includedUrls = includedUrlsStr.split("[\r\n]");
            for (String u : includedUrls) {
                if (StringUtil.isNotBlank(u)) {
                    String urlValue = u.trim();
                    if (!urlValue.startsWith("#")) {
                        s2Robot.addIncludeFilter(urlValue);
                    }
                }
            }

            // set excluded urls
            String[] excludedUrls = excludedUrlsStr.split("[\r\n]");
            for (String u : excludedUrls) {
                if (StringUtil.isNotBlank(u)) {
                    String urlValue = u.trim();
                    if (!urlValue.startsWith("#")) {
                        s2Robot.addExcludeFilter(urlValue);
                    }
                }
            }

            // failure url
            List<String> excludedUrlList = failureUrlService
                    .getExcludedUrlList(webCrawlingConfig.getId(), null);
            if (excludedUrlList != null) {
                for (String u : excludedUrlList) {
                    if (StringUtil.isNotBlank(u)) {
                        String urlValue = u.trim();
                        s2Robot.addExcludeFilter(urlValue);
                    }
                }
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Crawling " + urlsStr);
            }

            s2Robot.setBackground(true);
            s2Robot.setThreadPriority(crawlerPriority);

            s2RobotList.add(s2Robot);
            s2RobotStatusList.add(Constants.READY);
        }

        // run index update
        IndexUpdater indexUpdater = SingletonS2Container
                .getComponent("indexUpdater");
        indexUpdater.setName("IndexUpdater");
        indexUpdater.setPriority(indexUpdaterPriority);
        indexUpdater.setSessionIdList(sessionIdList);
        indexUpdater.setSolrServerGroup(solrServerGroup);
        indexUpdater.setDaemon(true);
        indexUpdater.setCommitPerCount(commitPerCount);
        indexUpdater.start();

        SystemHelper systemHelper = SingletonS2Container
                .getComponent("systemHelper");

        int startedCrawlerNum = 0;
        int activeCrawlerNum = 0;
        while (startedCrawlerNum < s2RobotList.size()) {
            // Force to stop crawl
            if (systemHelper.isForceStop()) {
                for (S2Robot s2Robot : s2RobotList) {
                    s2Robot.stop();
                }
                break;
            }

            if (activeCrawlerNum < multiprocessCrawlingCount) {
                // start crawling
                s2RobotList.get(startedCrawlerNum).execute();
                s2RobotStatusList.set(startedCrawlerNum, Constants.RUNNING);
                startedCrawlerNum++;
                activeCrawlerNum++;
                try {
                    Thread.sleep(crawlingExecutionInterval);
                } catch (InterruptedException e) {
                    // NOP
                }
                continue;
            }

            // check status
            for (int i = 0; i < startedCrawlerNum; i++) {
                if (!s2RobotList.get(i).getRobotContext().isRunning()
                        && s2RobotStatusList.get(i).equals(Constants.RUNNING)) {
                    s2RobotList.get(i).awaitTermination();
                    s2RobotStatusList.set(i, Constants.DONE);
                    String sid = s2RobotList.get(i).getRobotContext()
                            .getSessionId();
                    indexUpdater.addFinishedSessionId(sid);
                    activeCrawlerNum--;
                }
            }
            try {
                Thread.sleep(crawlingExecutionInterval);
            } catch (InterruptedException e) {
                // NOP
            }
        }

        boolean finishedAll = false;
        while (!finishedAll) {
            finishedAll = true;
            for (int i = 0; i < s2RobotList.size(); i++) {
                s2RobotList.get(i).awaitTermination(crawlingExecutionInterval);
                if (!s2RobotList.get(i).getRobotContext().isRunning()
                        && s2RobotStatusList.get(i).equals(Constants.RUNNING)) {
                    s2RobotStatusList.set(i, Constants.DONE);
                    String sid = s2RobotList.get(i).getRobotContext()
                            .getSessionId();
                    indexUpdater.addFinishedSessionId(sid);
                }
                if (!s2RobotStatusList.get(i).equals(Constants.DONE)) {
                    finishedAll = false;
                }
            }
        }
        s2RobotList.clear();
        s2RobotStatusList.clear();

        Map<String, String> infoMap = new HashMap<String, String>();

        long execTime = System.currentTimeMillis() - startTime;
        infoMap.put(Constants.WEB_CRAWLING_EXEC_TIME, Long.toString(execTime));
        if (logger.isInfoEnabled()) {
            logger.info("[EXEC TIME] crawling time: " + execTime + "ms");
        }

        indexUpdater.setFinishCrawling(true);
        try {
            indexUpdater.join();
        } catch (InterruptedException e) {
            logger.warn("Interrupted index update.", e);
        }

        infoMap.put(Constants.WEB_INDEX_EXEC_TIME,
                Long.toString(indexUpdater.getExecuteTime()));
        infoMap.put(Constants.WEB_INDEX_SIZE,
                Long.toString(indexUpdater.getDocumentSize()));

        // store info map
        CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
                .getComponent("crawlingSessionHelper");
        crawlingSessionHelper.put(sessionId, infoMap);

        for (String sid : sessionIdList) {
            // remove config
            crawlingConfigHelper.setCrawlingConfig(sid, null);
        }

        // clear url filter
        UrlFilterService urlFilterService = SingletonS2Container
                .getComponent(UrlFilterService.class);
        urlFilterService.deleteAll();

        // clear queue
        UrlQueueService urlQueueService = SingletonS2Container
                .getComponent(UrlQueueService.class);
        urlQueueService.deleteAll();

        // clear
        DataService dataService = SingletonS2Container
                .getComponent(DataService.class);
        dataService.deleteAll();

    }

    /**
     * Stop processes.
     * (This method is not MT-safe.)
     * 
     */
    public void stopProcesses() {
        synchronized (s2RobotList) {
            for (S2Robot s2Robot : s2RobotList) {
                s2Robot.stop();
            }
        }
    }

}
