
require 'rubygems'
require 'hpricot'
require 'benchmark'
require 'pp'
require 'time' # provides Time.parse, used when reading Last-Modified headers

require File.dirname(__FILE__) + '/../lib/simplecrawler'
# require 'simplecrawler'

# Directory path constant. It is not referenced anywhere else in this script
# as shown -- presumably intended as the place to store crawled data; TODO confirm.
SAVE_DIR = './data'
# Start URL handed to SimpleCrawler::Crawler.new below. The commented-out
# assignments are alternative targets; enable exactly one SITE at a time.
SITE = "http://localhost"
#SITE = "http://google.co.jp/" 
#SITE = "http://homepage2.nifty.com/youichi_kato/src.html"
#SITE = "http://homepage2.nifty.com/youichi_kato/index.html"
#SITE = "http://b.hatena.ne.jp/entrylist"
#SITE = "http://help.yahoo.co.jp/help/jp/auct"
#SITE = "http://help.yahoo.co.jp"

# Crawl SITE, print a two-line summary for every document the crawler yields,
# and finally print the Benchmark timing of the whole run.
#
# NOTE: the outer block must stay in brace form. With do...end the block would
# bind to `puts` instead of `Benchmark.measure`.
puts Benchmark.measure {
  # Set up a new crawler starting at SITE.
  sc = SimpleCrawler::Crawler.new(SITE)

  # Singleton override on this crawler instance: make `log` a no-op
  # (presumably this silences the crawler's own logging -- confirm in the lib).
  def sc.log(message)
  end

  # Singleton override on this crawler instance: for hrefs of the form
  #   javascript:opennew('a','b','c');
  # return the first quoted argument; every other href is returned unchanged.
  def sc.href_filter(href)
    if /javascript:opennew\('(.*)','(.*)','(.*)'\);/ =~ href
      Regexp.last_match(1)
    else
      href
    end
  end

  # Crawler settings. Their exact semantics are defined by the SimpleCrawler
  # library, not by this script -- the names suggest a page-count limit, a
  # link-depth limit, a binary-download switch and URL patterns to skip.
  sc.maxcount = 1000
  sc.maxnest = 3
  sc.load_binary_data = true
  sc.skip_patterns = ['.*\/manual\/']
  #sc.include_patterns = ['jp\/auct']

  # The crawler yields a Document object for each visited page.
  num = 0
  sc.crawl do |document|
    # -1 marks a document for which no data was loaded.
    size = document.data.nil? ? -1 : document.data.size

    # Prefer the Last-Modified header; otherwise use the fetch time.
    last_modified = document.headers && document.headers["last-modified"]
    time =
      if last_modified
        begin
          Time.parse(last_modified)
        rescue ArgumentError
          # An unparseable header on one page must not abort the whole crawl.
          document.fetched_at
        end
      else
        document.fetched_at
      end

    num += 1
    # Pass the URI as an argument, never inside the format string: URLs
    # routinely contain percent-escapes (e.g. %20) that printf would otherwise
    # treat as format directives.
    printf("%5d: %s\n", num, document.uri.to_s)
    puts "\tnest:#{document.nest} #{time}\tsize:#{size}"
  end
}
