#-*- coding: utf-8; -*-

# 2009-01-25 katoy 
#  Scan and cache all imported files for XBRL-instance data.
#
# ruby 1.8.7
# libxml-ruby (0.9.8)

require 'rubygems'
require 'xml/libxml'
require 'open-uri'
require 'pathname'
require 'fileutils'
require 'pp'
require 'benchmark'
require 'digest/md5'
require 'set'

# Walks an XBRL document and every schema / linkbase it references,
# accumulating an MD5 digest, a file count and a byte count over all
# documents read. Documents addressed by an http(s) URL are stored under
# CACHE the first time they are fetched and served from there afterwards.
class FileCache
  CACHE = './cache'   # Cache folder. Do not append a trailing '/'.

  # Scheme prefix of a remote document; the scheme itself is captured so it
  # can become the first directory level below CACHE.
  REMOTE_PREFIX = %r{\A(https?):/}

  # "namespace-uri:local-name" of each element that references another
  # document, mapped to the attribute carrying that reference.
  LINK_ATTRIBUTES = {
    'http://www.w3.org/2001/XMLSchema:import'       => 'schemaLocation',
    'http://www.xbrl.org/2003/linkbase:schemaRef'   => 'xlink:href',
    'http://www.xbrl.org/2003/linkbase:linkbaseRef' => 'xlink:href'
  }.freeze

  attr_reader :md5     # Digest::MD5 fed with the content of every document read
  attr_reader :count   # number of documents read
  attr_reader :size    # total number of bytes read
  attr_reader :readed  # Set of normalized paths already visited

  def initialize
    @md5 = Digest::MD5.new
    @count = 0
    @size = 0
    @readed = Set.new
  end

  # Reads the document at +path+ (resolved against +dir+ by
  # Utils.normalize_path), updates the statistics, then recursively visits
  # every document it references. A document already visited is skipped.
  #
  # I/O errors while loading the document itself propagate to the caller.
  # Any StandardError raised while scanning it (including a failure to load a
  # referenced document) is reported on stdout and terminates the process
  # with status 1.
  def visit(path, dir = '')
    from_path = Utils.normalize_path(path, dir)
    return if @readed.include?(from_path)

    parent = Utils.parent(from_path)
    cache_file = cache_path_for(from_path)
    data = read_document(from_path, cache_file)

    @readed << from_path
    @md5 << data
    @count += 1
    @size += data.bytesize   # bytes, not characters: the total is reported as bytes

    reader = nil
    begin
      reader = XML::Reader.string(data)
      while reader.read
        attribute = LINK_ATTRIBUTES["#{reader.namespace_uri}:#{reader.local_name}"]
        href = attribute && reader[attribute]
        visit(Utils.normalize_path(href, parent), parent) if href
      end
    rescue StandardError => e
      puts "---- error reading #{from_path} (#{cache_file})"
      puts "     #{e.class}: #{e.message}"
      exit 1
    ensure
      reader.close if reader
    end
  end

  private

  # True when +path+ names a document that must be fetched over http(s).
  def remote?(path)
    !(path =~ REMOTE_PREFIX).nil?
  end

  # File the content of +from_path+ is read from: the path itself for a local
  # document, or its location below CACHE for a remote one
  # (e.g. "http://host/a.xsd" -> "./cache/http/host/a.xsd").
  def cache_path_for(from_path)
    return from_path unless remote?(from_path)

    CACHE + from_path.sub(REMOTE_PREFIX, '/\1')
  end

  # Returns the raw content of the document. A remote document missing from
  # the cache is downloaded and stored in the cache before being returned.
  def read_document(from_path, cache_file)
    if cache_file == from_path || File.exist?(cache_file)
      return File.open(cache_file, 'rb') { |src| src.read }
    end

    # URI#open is provided by open-uri; Kernel#open no longer accepts URLs on
    # recent Ruby versions.
    data = URI.parse(from_path).open { |src| src.read }

    cache_dir = File.dirname(cache_file)
    Utils.multi_mkdir(cache_dir) unless File.exist?(cache_dir)
    # Binary mode keeps the cached bytes identical to the downloaded ones.
    File.open(cache_file, 'wb') { |dest| dest.write(data) }
    data
  end
end

# Path helpers shared by the scanner. Paths are plain strings: either an
# absolute local path or an http(s) URL.
class Utils
  # Prefix identifying a path as an http(s) URL.
  URL_PATTERN = %r{\Ahttps?://}

  # Returns +path+ as a Pathname with "." / ".." segments and duplicate
  # slashes removed (purely lexical; the file system is not consulted).
  def self.cleanpath(path)
    Pathname.new(path).cleanpath
  end

  # Returns everything before the last '/' of +full_path+.
  # A file directly below the root yields '/', and a path containing no '/'
  # yields '' (which normalize_path treats as the current directory).
  def self.parent(full_path)
    slash = full_path.rindex('/')
    return '' if slash.nil?
    return '/' if slash.zero?

    full_path[0, slash]
  end

  # Creates +mkpath+ together with any missing parent directory.
  # An empty path is ignored.
  def self.multi_mkdir(mkpath)
    return if mkpath.empty?

    FileUtils.mkdir_p(mkpath)
  end

  # Resolves +path+ to a normalized absolute string.
  #
  # * +path+ is an http(s) URL: the URL itself, with its path cleaned.
  # * +dir+ is an http(s) URL: +path+ resolved against that directory; a
  #   +path+ starting with '/' is resolved against the host instead.
  # * otherwise: a local path (an optional "file:" prefix is stripped)
  #   expanded against +dir+, or against the current directory when +dir+
  #   is ''.
  def self.normalize_path(path, dir = '')
    if path =~ URL_PATTERN
      normalize_url(URI.parse(path))
    elsif dir =~ URL_PATTERN
      reference = path.start_with?('/') ? URI.join(dir, path).to_s : "#{dir}/#{path}"
      normalize_url(URI.parse(reference))
    else
      path = path[5..-1] if path.start_with?('file:')
      dir = File.expand_path('.') if dir == ''
      cleanpath(File.expand_path(path, dir)).to_s
    end
  end

  # Rebuilds a URL from its scheme, host, port (only when it is not the
  # scheme's default) and cleaned path. Query and fragment are dropped.
  def self.normalize_url(uri)
    port = uri.port && uri.port != uri.default_port ? ":#{uri.port}" : ''
    # Pathname#cleanpath turns '' into '.', so an empty URL path maps to '/'.
    path = uri.path.to_s.empty? ? '/' : cleanpath(uri.path).to_s
    "#{uri.scheme}://#{uri.host}#{port}#{path}"
  end
  private_class_method :normalize_url
end

# See http://www.sec.gov
# SEC - adobe
# pat = "http://www.sec.gov/Archives/edgar/data/796343/000079634308000007/adbe-20080916.xml"
# SEC - microsoft
# pat = "http://www.sec.gov/Archives/edgar/data/789019/000119312508215214/msft-20080930.xml"

# pat = "file:/Users/youichikato/work/www/xbrl.org/XBRL-CONF/Common/instance/397-00-ConsistentInstance-valid.xbrl"
# pat = "/Users/youichikato/work/www/xbrl.org/XBRL-CONF/**/*.xbrl"
# pat = "/Users/youichikato/NetBeansProjects/ruby-xbrl/Edinet/sample/**/*.xbrl"
# pat = "../data/X99001-000/jpfr-asr-X99001-000-2008-03-31-01-2008-06-27.xbrl"
pat = "../data/msft/msft-20080930.xml"
# pat = "../data/td-net/081220090203088072/tdnet-qcedjpfr-33500-2008-11-30-01-2009-02-20.xbrl"
# pat = ARGV[0]

# Scans the document(s) selected by +pat+ and prints the combined MD5 digest,
# the file/byte totals and the elapsed time.
# +pat+ is either an http(s) URL or a glob pattern for local files.
# NOTE: the braces are required; with do/end the block would bind to `puts`.
puts Benchmark.measure {
  fscan = FileCache.new

  if pat.nil?
    puts "--No specified XBRL instance."
    exit 1
  elsif pat =~ %r{\Ahttps?:}
    # A URL names exactly one document; it must not be globbed.
    fscan.visit(Utils.normalize_path(pat))
  else
    files = Dir.glob(pat)
    # Make an empty match visible instead of silently reporting 0 files.
    puts "--No file matches #{pat}" if files.empty?
    files.each { |f| fscan.visit(Utils.normalize_path(f)) }
  end

  pp fscan.md5.hexdigest
  pp "readed  #{fscan.count} files. #{fscan.size} bytes."
}
