#!/usr/bin/perl -w -C

eval 'exec /usr/bin/perl -w -C -S $0 ${1+"$@"}'
    if 0; # not running under some shell

=pod

=head1 NAME

tv_grab_ja - Grab TV listings for Japan.

=head1 SYNOPSIS

tv_grab_ja --help

tv_grab_ja [--config-file FILE] --configure

tv_grab_ja [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--quiet]

tv_grab_ja --list-channels

=head1 DESCRIPTION

Output TV listings for several channels available in Japan.
The grabber relies on parsing HTML so it might stop working at any
time.

First run B<tv_grab_ja --configure> to choose, which channels you want
to download. Then running B<tv_grab_ja> with no arguments will output
listings in XML format to standard output.

tv_grab_ja always grab 7 days of listings.

B<--configure> Prompt for which channels,
and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_ja.conf>.  This is the file written by
B<--configure> and read when grabbing.

B<--output FILE> write to FILE rather than standard output.

B<--quiet> suppress the progress messages normally written to standard
error.

B<--list-channels> write output giving <channel> elements for every
channel available (ignoring the config file), but no programmes.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Takeru Komoriya<komoriya@paken.org>
Based on tv_grab_fi by Matti Airas and tv_grab_sn by Stefan G:orling.

=head1 BUGS

The data source may not suit recommended XMLTV DTD format.

=cut

######################################################################
# initializations

use strict;
use XMLTV::Version '$Id: tv_grab_ja,v 1.3 2004/01/21 14:30:43 hiro Exp $ ';
use Getopt::Long;
use Date::Manip;
use HTML::TreeBuilder;
use HTML::Entities; # parse entities
use IO::File;
use LWP::UserAgent;

use XMLTV;
use XMLTV::Memoize;
use XMLTV::Ask;
use XMLTV::Config_file;
use XMLTV::Mode;
use Text::Kakasi;

use utf8;
use Encode qw(from_to);

# Todo: perhaps we should internationalize messages and docs?
use XMLTV::Usage <<END
$0: get Japanese television listings in XMLTV format
To configure: $0 --configure [--config-file FILE]
To grab listings: $0 [--config-file FILE] [--output FILE] [--days N]
        [--offset N] [--quiet]
To list channels: $0 --list-channels
END
  ;

# Use Term::ProgressBar if installed.
use constant Have_bar => eval { require Term::ProgressBar; 1 };

# Attributes of the root element in output.
my $HEAD = { 'source-info-url'     => 'http://www.ontvjapan.com/',
	     'source-data-url'     => "http://www.ontvjapan.com/program/",
	     'generator-info-name' => 'XMLTV',
	     'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
	   };

# The timezone in Japan.
my $TZ="+0900";

# default language
my $LANG="ja_JP.UTF-8";

# base URL of source data
my $urlbase = "http://www.ontvjapan.com/program";

# xmltv channel id extension
my $channel_ext = ".ontvjapan.com";

# Global channel data.
our @ch_all;

# region data
my %regions = ("0001" => '北海道',
	       "0101" => '青森',
	       "0102" => '秋田',
	       "0103" => '岩手',
	       "0104" => '山形',
	       "0105" => '宮城',
	       "0106" => '福島',
	       "0002" => '東京',
	       "0201" => '埼玉',
	       "0202" => '千葉',
	       "0203" => '神奈川', 
	       "0204" => '群馬',
	       "0205" => '栃木',
	       "0206" => '茨城',
	       "0301" => '山梨',
	       "0302" => '新潟',
	       "0303" => '長野',
	       "0401" => '静岡',
	       "0003" => '愛知',
	       "0402" => '岐阜',
	       "0403" => '三重',
	       "0501" => '富山',
	       "0502" => '石川',
	       "0503" => '福井',
	       "0004" => '大阪',
	       "0601" => '京都',
	       "0602" => '兵庫',
	       "0603" => '奈良',
	       "0604" => '和歌山',
	       "0605" => '滋賀',
	       "0701" => '岡山',
	       "0702" => '広島',
	       "0703" => '鳥取',
	       "0704" => '島根',
	       "0705" => '山口',
	       "0801" => '香川',
	       "0802" => '徳島',
	       "0803" => '愛媛',
	       "0804" => '高知',
	       "0005" => '福岡',
	       "0901" => '佐賀',
	       "0902" => '鹿児島',
	       "0903" => '宮崎',
	       "0904" => '大分',
	       "0905" => '熊本',
	       "0906" => '長崎',
	       "0A01" => '沖縄');

# region id
my $regionid = "0002";

######################################################################
# get options

XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
my ($opt_help, $opt_output,
    $opt_configure, $opt_config_file, $opt_quiet,
    $opt_list_channels);
$opt_quiet  = 0; # default
GetOptions('help'          => \$opt_help,
	   'configure'     => \$opt_configure,
	   'config-file=s' => \$opt_config_file,
	   'output=s'      => \$opt_output,
	   'quiet'         => \$opt_quiet,
	   'list-channels' => \$opt_list_channels,
	  )
  or usage(0);
usage(1) if $opt_help;

my $mode = XMLTV::Mode::mode('grab', # default
			     $opt_configure => 'configure',
			     $opt_list_channels => 'list-channels',
			    );

# File that stores which channels to download.
my $config_file
  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_ja', $opt_quiet);

my @config_lines; # used only in grab mode
if ($mode eq 'configure') {
    XMLTV::Config_file::check_no_overwrite($config_file);
}
elsif ($mode eq 'grab' or $mode eq 'list-channels') {
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}
else { die }

######################################################################
# write configuration

if ($mode eq 'configure') {
    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";

    # Ask region
    my $qs = "\n";
    my $cr = 0;
    my @rids = sort keys %regions;
    foreach my $rid (@rids) {
	$qs .= $rid . ":" . $regions{$rid};
	$cr++;
	if ($cr == 5) {
	    $qs .= "\n";
	    $cr = 0;
	} else {
	    $qs .= "\t";
	}
    }
    $qs .= "\n地域番号を指定してください(CATVの場合はREADME.CATVを参照): ";
    $regionid = ask($qs);

    if (not defined $regionid) {
	print("正しい地域番号を入力してください.\n");
	exit();
    }
    print CONF "region $regionid\n";

    my @channels = get_channels();

    # Ask about each channel.
    print("各放送局のチャンネル番号と略称を入力してください．\n");
    print("追加したくないチャンネルについては，そのままEnterを押してください.\n");
    foreach my $ch (@channels) {
	my $id = $ch->{id};
	my $name = $ch->{name};
	my $callsign = $ch->{callsign};
	my $number = ask("「$name」のチャンネル番号: ");
	my $newcs = ask("「$name」の略称[$callsign]: ") if $number;
	$callsign = $newcs if $number and $newcs ne '';
	# Print a config line, but comment it out if channel not wanted.
	print CONF '#' if not $number;
	print CONF "channel $id $callsign $number $name\n";

    }

    close CONF or warn "cannot close $config_file: $!";
    print("設定完了.\n");

    exit();
}

# Read channels from configuration, push them to @ch_all
my $line_num = 1;
foreach (@config_lines) {
    ++ $line_num;
    next if not defined;
    if (/^channel:?\s+(\S+)\s(\S+)\s(\S+)\s(\S+)/) {
	my $ch_id = $1;
	my $ch_callsign = $2;
	my $ch_number = $3;
	my $ch_name = $4;
	$ch_name =~ s/\s*$//;

	push @ch_all, { 'id' => $ch_id,
			'display-name' => [ [ $ch_name, $LANG ] ],
			'number' => $ch_number,
			'call-sign' => $ch_callsign };
    }
    elsif (/^region:?\s+(\S+)/) {
	$regionid = $1;
    }
    else {
	warn "$config_file:$line_num: bad line\n";
    }
}

# Not configuration, we must be writing something, either full
# listings or just channels.
#
die if $mode ne 'grab' and $mode ne 'list-channels';

# Options to be used for XMLTV::Writer.
my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'UTF8';
my $writer = new XMLTV::Writer(%w_args);
$writer->start($HEAD);

######################################################################
# Channel listings
if ($mode eq 'list-channels') {
    # Write channels mode.
    $writer->write_channel($_) foreach @ch_all;
    $writer->end();
    exit();
}

# We are producing full listings.
die if $mode ne 'grab';

######################################################################
# begin main program

my $ch_number = @ch_all;
die "チャンネルが設定されていません．まず --configure オプションをつけて設定をしてください．\n"
  if $ch_number == 0;

# the order in which we fetch the channels matters
foreach (@ch_all) {
    $writer->write_channel($_);
}

# This progress bar is for both downloading and parsing.
my $bar = new Term::ProgressBar('番組表を取得中', scalar @ch_all)
  if Have_bar && not $opt_quiet;
foreach (@ch_all) {
    foreach (process_table($_->{'id'})) {
	$writer->write_programme($_);
    }
    update $bar if Have_bar && not $opt_quiet;
}
$writer->end();

######################################################################
# subroutine definitions

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
	*t = sub {};
	*d = sub { '' };
    }
    else {
	*t = \&Log::TraceMessages::t;
	*d = \&Log::TraceMessages::d;
	Log::TraceMessages::check_argv();
    }
}

####
# process_table: fetch a URL and process it
#
# arguments:
#    id of channel
# returns: list of program hashes to write
#
sub process_table {
    my ($channel_id) = @_;

    my $data = get_table($channel_id);

    # parse the page to a document object
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($data);
    my @data = parse_program_table($channel_id, $tree);

    return @data;
}

# parse program table
sub parse_program_table {
    my ($channel, $tree) = @_;
    t "parse_program_table() ENTRY for tree: $tree";
    my @data;

    # <a>タグで囲まれ，hrefに'detail.php3'が含まれた要素を取得
    my @elems = $tree->look_down( '_tag','a',
				  'href',qr/detail.php3/);

    # 要素ごとに解析して，番組情報を@dataにpush
    foreach my $elem (@elems) {
	t 'doing elem: ' . d $elem;
	# get program infomation
	my $h = get_content($elem);
	next if not defined $h;
	$h->{channel} = $channel;
	push @data, $h;
    }
    return @data;
}

sub get_content {
    my ($elem) = @_;
    my $p;

    # 番組名
    my $title = '';
    foreach $p ($elem->content_list()) {
	if (ref $p) {
	    # ニュース/天気のアイコン
	    my $src = $p->attr('src');
	    if ( $src =~ m/n.gif/ ) {
		$title .= "[Ｎ]";
	    } elsif ( $src =~ m/w.gif/ ) {
		$title .= "[天]";
	    }
	} else {
	    # タイトル
	    from_to($p, "euc-jp", "utf8"); # UTF-8への変換
	    utf8::decode($p);
	    $title .= $p;
	}
    }
    utf8::encode($title);
    return undef if $title eq '';

    # 番組内容
    my $desc = '';
    foreach $p ($elem->right()) {
	if (not ref $p) {
	    $p =~ s/^\s+//;
	    last if ($p =~ m/^\d\d:\d\d/); # 時刻が入っていたら次の番組
	    from_to($p, "euc-jp", "utf8"); # UTF-8への変換
	    $desc .= $p;
	}
    }

    # 放送日
    my $hsid = $elem->attr('href');
    $hsid =~ m/hsid=(\d\d\d\d\d\d\d\d)/;
    my $date = $1;
    return undef if not defined $date;

    # 放送時間とジャンル
    my $time_genre = $elem->attr('title');
    $time_genre =~ m/^(\d\d:\d\d)-(\d\d:\d\d)\s*(\S*)/;
    my $starttime = $1;
    my $endtime = $2;
    my $genre = $3;
    from_to($genre, "euc-jp", "utf8"); # UTF-8への変換
    $genre =~ s/\/.*$//;   # '/'以降を削除
    return undef if (not defined $starttime) or (not defined $endtime);

    # 番組情報のまとめ
    my $r;
    $r->{title}=[ [ $title, $LANG ] ];
    $r->{category}=[ [ $genre, $LANG ] ] if $genre ne '';
    $r->{desc}=[ [ $desc, $LANG ] ] if $desc ne '';

    # 開始/終了時間
    $starttime =~ s/://;
    $endtime =~ s/://;

    # OnTVは番組終了時間がAM5:00を越えると日付が切り替わる
    my $startdate = $date;
    my $enddate = $date;
    if ($endtime >= '0000' and $endtime < '0500') {
	$enddate = UnixDate(DateCalc($date,"+ 1 day"), '%Q');
	if ($starttime < $endtime) {
	    $startdate = $enddate;
	}
    }

    $r->{start}= $startdate . $starttime . "00 " . $TZ;
    $r->{stop}= $enddate . $endtime . "00 " . $TZ;

    # タイトルの読み(ひらがな)
    utf8::decode($title);
    $title =~ s/\[.+\]//g;   # [ニュース]等を除去
    $title =~ s/◇//g;       # '◇'を除去
    $title =~ s/\d//g;       # 時間を表す数字は除去
    $title =~ s/\s+//g;      # 空白を除去
    utf8::encode($title);
    Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-JH', '-KH', '-aE');
    from_to($title, "utf8", "euc-jp"); # いったんEUCに戻す
    my $readstr = '';
    if ($title ne '') {
	$readstr=Text::Kakasi::do_kakasi($title);
	from_to($readstr, "euc-jp", "utf8"); # UTF-8への変換
	if (defined $readstr and $readstr ne '') {
	    $r->{'title-readstr'} = [ [$readstr , $LANG ] ];
	}
    }
    Text::Kakasi::close_kanwadict();

    t "TITLE:$title($readstr) $date $starttime-$endtime DESC:$desc GENRE:$genre\n";

    return $r;
}

# get channel listing
sub get_channels {
    my $bar = new Term::ProgressBar('チャンネル情報を取得中', 1)
      if Have_bar && not $opt_quiet;

    my @channels;
    my $local_data = get_channel_table();

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($local_data);

    # channels elements are specially formatted <a> tags
    # with href="gridChannel.php..."
    my @ch_a_elems = $tree->look_down(_tag => 'a');

    # 全角 -> 半角変換
    Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-Ea');

    # get channels
    while (@ch_a_elems) {
	my $elem = shift @ch_a_elems;
	my $href = $elem->attr('href');
	if ($href =~ m/^gridChannel.php\?tikicd=${regionid}&ch=(\d\d\d\d)/) {
	    my $channel_id = $1;
	    $channel_id .= $channel_ext;    # this is xmltv channel id
	    my $callsign = ($elem->content_list)[0];
	    $callsign = Text::Kakasi::do_kakasi($callsign);
	    $callsign =~ s/[\(\)]//g;
	    $elem = shift @ch_a_elems;
	    my $channel = ($elem->content_list)[0];
	    # convert to UTF-8
	    from_to($channel, "euc-jp", "utf8");
	    from_to($callsign, "euc-jp", "utf8");
	    utf8::decode($channel);
	    utf8::decode($callsign);
	    push @channels, { 'id' => $channel_id,
			      'name' => $channel,
			      'callsign' => $callsign };
        }
    }
    Text::Kakasi::close_kanwadict();

    my $chnum = @channels;
    die "チャンネル情報の取得に失敗しました\n"
    . "ネットワークの接続と地域番号を確認してください" if $chnum == 0;
    update $bar if Have_bar && not $opt_quiet;
    return @channels;
}

# get time table
sub get_table {
    my ($channelid) = @_;
    my $ext = quotemeta $channel_ext;
    $channelid =~ s/$ext//;
    my $url = "$urlbase/gridChannel.php?tikicd=${regionid}&ch=${channelid}&genre=all";
    my $content = get_html($url);
    return $content;
}

# get channel table
sub get_channel_table {
    # request channel data
    my $url = "$urlbase/gridChannel.php?tikicd=${regionid}";
    my $content = get_html($url);
    return $content;
}

# get requested page
sub get_html {
    my ($url) = @_;
    my $ua = LWP::UserAgent->new;

    # Be nice to the server.  Technically we don't need to do this
    # after the very last fetch, but sleeping every time is simpler.
    sleep(rand 3);

    t "getting URL: $url";
    my $request = HTTP::Request->new('GET', $url);
    my $res = $ua->request($request);
    if ($res->is_success) {
	return $res->content;
    }

    # FIXME commonize this
    local $SIG{__WARN__} = sub {
	warn "$url: $_[0]";
    };
    local $SIG{__DIE__} = sub {
	die "$url: $_[0]";
    };
    
    die "could not fetch $url, aborting\n"
}
