#!/usr/bin/perl

eval 'exec /usr/bin/perl  -S $0 ${1+"$@"}'
    if 0; # not running under some shell

=pod

=head1 NAME

tv_grab_ch - Grab TV listings for Switzerland (from www.fernsehen.ch webpage).

=head1 SYNOPSIS

tv_grab_ch --help

tv_grab_ch [--config-file FILE] --configure [--gui OPTION]

tv_grab_ch [--config-file FILE] [--output FILE] [--quiet]
           [--days N] [--offset N]

tv_grab_ch --list-channels

=head1 DESCRIPTION

Output TV listings for several channels available in Switzerland and
(partly) central Europe. 
The data comes from www.fernsehen.ch. The grabber relies on
parsing HTML so it might stop working at any time.

First run B<tv_grab_ch --configure> to choose which channels
you want to download. Then running B<tv_grab_ch> with no
arguments will output listings in XML format to standard output.

B<--configure> Ask for each available channel whether to download
and write the configuration file.

B<--config-file FILE> Set the name of the configuration file, the
default is B<~/.xmltv/tv_grab_ch.conf>.  This is the file 
written by B<--configure> and read when grabbing.

B<--gui OPTION> Use this option to enable a graphical interface to be used.
OPTION may be 'Tk', or left blank for the best available choice.
Additional allowed values of OPTION are 'Term' for normal terminal output
(default) and 'TermNoProgressBar' to disable the use of Term::ProgressBar.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days.  The default is fourteen.

B<--offset N> start N days in the future.  The default is zero,
i.e. start with today.

B<--quiet> suppress the progress messages normally written to standard
error.

B<--list-channels> write output giving <channel> elements for every
channel available (ignoring the config file), but no programmes.

B<--help> print a help message and exit.

Some channels, for example SAT.1 and RTL-2, sometimes broadcast
different programmes in different regions, i.e. the programmes broadcast
in Switzerland differ from the programmes shown in Germany.
Unfortunately fernsehen.ch tends to supply both programmes in one
channel listing, making it hard for the grabber to decide which
programme to write out. By default it writes the first one; however,
you can override this in the configuration file. Simply add a line
containing B<last> followed by the channel id, e.g. "last RT2" for RTL-2.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Stefan Siegl <stesie@brokenpipe.de>. Inspired by tv_grab_fi by Matti Airas.

=head1 BUGS

If you happen to find a bug, you're requested to send a mail to me
at B<stesie@brokenpipe.de> or to one of the XMLTV mailing lists, see webpages
at http://sourceforge.net/projects/xmltv/.

=cut

use warnings;
use strict;
use Date::Manip;
use XMLTV::Version '$Id: tv_grab_ch.in,v 1.3 2006/01/08 10:55:02 epaepa Exp $ ';
use Getopt::Long;
use HTML::TreeBuilder;
use HTML::Entities;
use URI::Escape;
use XMLTV;
use XMLTV::Ask;
use XMLTV::ProgressBar;
use XMLTV::DST;
use XMLTV::Config_file;
use XMLTV::Mode;
use XMLTV::Get_nice;
use XMLTV::Memoize;
use XMLTV::Usage <<END
$0: get Swiss television listings from www.fernsehen.ch in XMLTV format
To configure: $0 --configure [--config-file FILE] [--gui OPTION]
To grab data: $0 [--config-file FILE] [--output FILE] [--quiet]
                 [--days N] [--offset N]
Channel List: $0 --list-channels
END
  ;

# Tracing helpers t() and d(): taken from Log::TraceMessages when that
# module can be loaded.  Otherwise t() becomes a sub that does nothing
# and d() a sub that returns the empty string.
BEGIN {
    if (eval { require Log::TraceMessages; 1 }) {
        *t = \&Log::TraceMessages::t;
        *d = \&Log::TraceMessages::d;
    }
    else {
        *t = sub {};
        *d = sub { '' };
    }
}



#-- our own prototypes first ...
#-- (forward declarations: these subs are defined further down in this
#-- file but are called from the main program before their definitions,
#-- so their prototypes have to be known up here)
sub get_channels();
sub channel_id($);
sub get_page($);
sub grab_channel($);



#-- attributes of xmltv root element
#-- (handed to $writer->start(); 'source-data-url' is also the page that
#-- get_channels() downloads to find the available channels)
my $head = { 
    'source-data-url'      => 'http://www.fernsehen.ch/sender/',
    'source-info-url'      => 'http://www.fernsehen.ch/',
    'generator-info-name'  => 'XMLTV',
    'generator-info-url'   => 'http://membled.com/work/apps/xmltv/',
};



#-- The timezone fernsehen.ch lives in is CET/CEST; $TZ is the offset that
#-- is passed to parse_local_date() and date_to_local() when the start and
#-- stop times of a programme are built.  $lang is the language code that
#-- is attached to display names, titles and descriptions.
#--
#-- These are plain lexicals on purpose.  "my constant $TZ" is Perl's typed
#-- lexical syntax ("my CLASS $var"): it does not make the variable constant
#-- and it only compiles while some other module happens to have loaded the
#-- "constant" package.
my $TZ = "+0100";
my $lang = "de";



#-- Command line handling.  The undocumented --cache option is dealt with
#-- first, by XMLTV::Memoize.
XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');



#-- option values; only --days, --offset and --quiet have a default
my ($opt_configure, $opt_config_file, $opt_gui, $opt_output);
my ($opt_list_channels, $opt_help, $opt_share);
my $opt_days   = 14;
my $opt_offset = 0;
my $opt_quiet  = 0;

#-- an unparsable command line prints the usage text as a failure ...
usage(0) unless GetOptions(
    'configure'      => \$opt_configure,
    'config-file=s'  => \$opt_config_file,
    'gui:s'          => \$opt_gui,
    'output=s'       => \$opt_output,
    'days=i'         => \$opt_days,
    'offset=i'       => \$opt_offset,
    'quiet'          => \$opt_quiet,
    'list-channels'  => \$opt_list_channels,
    'help'           => \$opt_help,
    'share=s'        => \$opt_share,
);

#-- ... while --help prints it as a success
if ($opt_help) {
    usage(1);
}

XMLTV::Ask::init($opt_gui);

#-- make sure offset+days arguments are within range
if ($opt_offset < 0 or $opt_days < 0) {
    die "neither offset nor days may be negative";
}


#-- the window of time we are asked to grab: it opens at midnight today
#-- plus --offset days and is --days days long
my $grab_start = DateCalc("00:00:00", "+ $opt_offset days");
my $grab_stop = DateCalc($grab_start, "+ $opt_days days");


#-- which of the three modes are we running in?
my $mode = XMLTV::Mode::mode(
    'grab',    # used when no mode option was given
    $opt_configure     => 'configure',
    $opt_list_channels => 'list-channels',
);



#-- initialize config file support
my $config_file = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_ch', $opt_quiet);
my @config_lines;

if ($mode ne 'configure' and $mode ne 'grab' and $mode ne 'list-channels') {
    die("never heard of XMLTV mode $mode, sorry :-(");
}

if ($mode eq 'configure') {
    #-- we are about to write the file, so check it may be overwritten
    XMLTV::Config_file::check_no_overwrite($config_file);
}
else {
    #-- 'grab' and 'list-channels' both read the existing configuration
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}



#-- hey, we can't live without channel data, so let's get those now!
#-- %channels maps the site's channel ids to what get_channels() found as
#-- the content of the matching link (the name shown to the user).
#--
#-- The progress bar only exists when --quiet was not given.  $bar is
#-- declared on its own line because perlsyn documents the behaviour of
#-- "my $x = ... if CONDITION" as undefined.
my $bar;
$bar = XMLTV::ProgressBar->new('getting list of channels', 1)
    if not $opt_quiet;

my %channels = get_channels();
if (not $opt_quiet) {
    $bar->update();
    $bar->finish();
}



# Location of the share/ directory that holds the channel mapping file.
# The $SHARE_DIR assignment below is the line that gets rewritten when this
# script is generated by tv_grab_ch.PL.  In the tv_grab_ch.in source the
# variable is undef, which makes us look in the current directory (handy
# during development).  Whatever was set here, --share overrides it
# (useful for testing).
#
my $SHARE_DIR='/var/tmp/xmltv-0.5.42-4m.mo5-root-builduser/usr/share/xmltv'; # by grab/ch/tv_grab_ch.PL
if (defined $opt_share) {
    $SHARE_DIR = $opt_share;
}
my $OUR_SHARE_DIR = '.';
$OUR_SHARE_DIR = "$SHARE_DIR/tv_grab_ch" if defined $SHARE_DIR;

# Read the file with channel mappings.  Each line has two or three
# colon-separated fields: XMLTV id, fernsehen.ch id and, optionally,
# "first" or "last" (defaulting to "first").
my $CHANNEL_NAMES_FILE = "$OUR_SHARE_DIR/channel_ids";
$CHANNEL_NAMES_FILE =~ tr!/!/!s;    # squeeze runs of slashes into one
my (%chid_mapping, %seen, %first_last);
my $line_num = 0;
for my $entry (XMLTV::Config_file::read_lines($CHANNEL_NAMES_FILE, 1)) {
    $line_num++;
    next if not defined $entry;

    my $where = "$CHANNEL_NAMES_FILE:$line_num";
    my @fields = split m/:/, $entry;
    my $field_count = scalar @fields;
    die "$where: wrong number of fields"
      unless $field_count == 2 or $field_count == 3;

    my ($xmltv_id, $fernsehen_ch_id, $writelast) = @fields;

    # a repeated site id is only warned about; the later line wins
    if (defined $chid_mapping{$fernsehen_ch_id}) {
        warn "$where: fernsehen.ch id $fernsehen_ch_id seen already\n";
    }
    $chid_mapping{$fernsehen_ch_id} = $xmltv_id;

    $writelast = "first" if not defined $writelast;
    unless ($writelast eq "first" or $writelast eq "last") {
        die "$where: first-last selector of $fernsehen_ch_id `$writelast' invalid";
    }
    $first_last{$fernsehen_ch_id} = $writelast;

    if ($seen{$xmltv_id}++) {
        warn "$where: XMLTV id $xmltv_id seen already\n";
    }
}

#-- site ids of the channels the user asked for, in config file order
my @requests;

#-- read our configuration file now.  Recognised lines are
#--   channel ID         grab this channel
#--   map SITE-ID ID     override the XMLTV id from the channel_ids file
#--   first ID / last ID which of two overlapping programmes to keep
#-- Entries that are undef in @config_lines are skipped.
#--
#-- $line is the 1-based number of the entry being looked at.  It starts at
#-- zero because it is incremented before it is used, the same way
#-- $line_num is handled for the channel_ids file; starting at one made
#-- every "bad line" warning point at the following line.
my $line = 0;
foreach my $config_line (@config_lines) {
    $line++;
    next unless defined $config_line;

    if ($config_line =~ /^channel:?\s+(\S+)/) {
        my $site_id = $1;
        unless (defined($channels{$site_id})) {
            warn("\nConfigured channel $site_id not available anymore. \nPlease reconfigure tv_grab_ch.\n");
            next;
        }
        push @requests, $site_id;
    }
    elsif ($config_line =~ /^map:?\s+(\S+)\s+(\S+)/) {
        # Override anything set in the channel_ids file.
        $chid_mapping{$1} = $2;
    }
    elsif ($config_line =~ /^first:?\s+(\S+)/) {
        $first_last{$1} = "first";
    }
    elsif ($config_line =~ /^last:?\s+(\S+)/) {
        $first_last{$1} = "last";
    }
    else {
        warn "$config_file:$line: bad line\n";
    }
}

#-- if we're requested to do so, write out a new config file ...
#-- Every available channel gets a "channel ID #name" line; channels the
#-- user declined are written too, but commented out.  The script exits
#-- once the file has been written.
if ($mode eq 'configure') {
    # Three-argument open with a lexical handle, so that no character in
    # the file name can be taken for an open mode or a pipe.
    open(my $config_fh, '>', $config_file)
      or die("cannot write to $config_file, due to: $!");

    #-- now let's annoy the user, sorry, I meant ask ..
    my @chs = sort keys %channels;
    my @names = map { $channels{$_} } @chs;
    my @qs = map { "add channel $_?" } @names;
    my @want = ask_many_boolean(1, @qs);

    foreach my $site_id (@chs) {
        my $w = shift @want;
        my $chname = shift @names;

        # an undefined answer means no more input could be read
        warn("cannot read input, stopping to ask questions ..."), last if not defined $w;

        print $config_fh '#' if not $w; #- comment line out if user answered 'no'

        # The display name is stored after the id as a trailing comment;
        # it makes it a lot easier to pick the channel to comment or
        # uncomment when editing the config file by hand.
        print $config_fh "channel $site_id #$chname\n";
    }

    close $config_fh or warn "unable to nicely close the config file: $!";
    say("Finished configuration.");

    exit();
}



#-- well, we don't have to write a config file, so, probably it's some xml stuff :)
#-- if not, let's go dying ...
die unless($mode eq 'grab' or $mode eq 'list-channels');

#-- arguments for the XMLTV writer; without --output no OUTPUT handle is
#-- passed to it at all
my %writer_args;
if (defined $opt_output) {
    # File name and mode are passed separately, so that no character in
    # the name given to --output can be taken for part of the open mode.
    my $handle = IO::File->new($opt_output, '>');
    die "cannot write to output file, $opt_output: $!" unless (defined $handle);
    $writer_args{'OUTPUT'} = $handle;
}

$writer_args{'encoding'} = 'ISO-8859-1';



#-- create our writer object and emit the root element
my $writer = XMLTV::Writer->new(%writer_args);
$writer->start($head);



#-- --list-channels: one <channel> element per available channel (in hash
#-- order), no programmes, then we are done
if ($mode eq 'list-channels') {
    for my $site_id (keys %channels) {
        $writer->write_channel({
            'id'           => channel_id($site_id),
            'display-name' => [[ $channels{$site_id}, $lang ]],
        });
    }

    $writer->end();
    exit();
}



#-- only grabbing is left as a reason for being here
$mode eq 'grab' or die;
@requests or die "No channels specified, run me with --configure flag\n";



#-- write out <channel> tags, one per configured channel
for my $site_id (@requests) {
    $writer->write_channel({
        'id'           => channel_id($site_id),
        'display-name' => [[ $channels{$site_id}, $lang ]],
    });
}


# The listing pages give day and month of a programme but not the year, so
# the year has to be guessed.  Current year and month are kept globally for
# that; extract_start_stop() needs them for every programme.
my ($cur_year, $cur_month) = ParseDate('now') =~ m/(....)(..)/;


#-- write out <programme> tags, channel by channel
if (not $opt_quiet) {
    $bar = XMLTV::ProgressBar->new('grabbing', scalar(@requests));
}

for my $channel (@requests) {
    grab_channel($channel);
    $bar->update() unless $opt_quiet;
}

$bar->finish() unless $opt_quiet;

#-- hey, looks like we've finished ...
$writer->end();



#-- channel_id($s) :: turn site channel id into an xmltv id
#-- The id comes from %chid_mapping when there is an entry for $s, else it
#-- is "$s.fernsehen.ch".  Either way it is lowercased, and an id starting
#-- with a digit gets a capital "C" put in front of it.
sub channel_id($) {
    my $site_id = shift;

    my $mapped = $chid_mapping{$site_id};
    my $xmltv_id = lc(defined($mapped) ? $mapped : "$site_id.fernsehen.ch");

    return $xmltv_id =~ /^\d/ ? "C$xmltv_id" : $xmltv_id;
}


#-- extract_start_stop($text, \%show)
#--
#-- Parse the text of the time column of a listing row, which has to
#-- contain "<day>. <month name> ... <hh>:<mm> - <hh>:<mm>", and store the
#-- formatted start and stop times in $show->{start} and $show->{stop}.
#--
#-- Returns the list ($start, $stop) as produced by parse_local_date().
#-- Dies when the text cannot be parsed or names an unknown month.
sub extract_start_stop($$) {
    my ($text, $show) = @_;

    # \xe4 (a-umlaut) belongs to the month name class: without it only the
    # "M" of "M\xe4rz" is captured and every listing for March is rejected
    # as an unknown month.
    my($day_num, $month_txt, $start_hour, $start_min, $stop_hour, $stop_min) = 
        $text =~ m/(\d+)\. ([A-Za-z&;\xe4]+).*?(\d{1,2}):(\d{1,2}) - (\d{1,2}):(\d{1,2})/ 
        or die "unable to extract start- and stoptimes from $text\n";

    # March is listed in every spelling we might get to see: without the
    # umlaut, as an HTML entity and with the literal ISO-8859-1 character.
    my $months = {
        'Januar' => '01', 'Februar' => '02', 'Mrz' => '03',
        'M&auml;rz' => '03', "M\xe4rz" => '03',
        'April' => '04', 'Mai' => '05', 'Juni' => '06',
        'Juli' => '07', 'August' => '08', 'September' => '09',
        'Oktober' => '10', 'November' => '11', 'Dezember' => '12',
        };

    my $month = $months->{$month_txt};
    die "never heard of month '$month_txt'" unless(defined($month));

    # grabbed website doesn't serve data before 'now', therefore if month is
    # before month of 'now', it must be the next year
    my $year = $cur_year + ($month < $cur_month ? 1 : 0);
    my $day = ParseDate("$year-$month-$day_num 00:00:00");

    # $start and $stop are first built as seconds since midnight of $day.
    # A show starting before 05:00 is put on the following calendar day,
    # and its stop time is counted from that same day.
    my $start = $start_hour < 5 ? 86400 : 0;
    my $stop = $start;

    $start += $start_hour * 3600 + $start_min * 60;
    $stop += $stop_hour * 3600 + $stop_min * 60;

    $stop += 86400
        if($stop < $start); # bump stop to next day if before start ...

    $start = parse_local_date(DateCalc($day, "+ $start seconds"), $TZ);
    my ($start_base, $start_tz) = @{date_to_local($start, $TZ)};
    $show->{"start"} = UnixDate($start_base, '%q') . " $start_tz";

    $stop = parse_local_date(DateCalc($day, "+ $stop seconds"), $TZ);
    my ($stop_base, $stop_tz) = @{date_to_local($stop, $TZ)};
    $show->{"stop"} = UnixDate($stop_base, '%q') . " $stop_tz";

    return ( $start, $stop );
}


#-- adjust_limit($limit, $start, $delta) :: guess a better "limit" value
#--
#-- $limit is the limit the current page was requested with, $start the
#-- start time of a show found on that page and $delta the date delta
#-- from $start to $grab_start (worked out here when undef is passed).
#--
#-- The time between now and $start divided by $limit is taken as the
#-- average length of a show; dividing $delta by that length gives the
#-- number of shows the limit is presumably off by.  The correction is
#-- brought to a multiple of ten and is never smaller than ten shows in
#-- either direction.  Returns the new limit, which is never negative.
sub adjust_limit($$$) {
    my ($limit, $start, $delta) = @_;

    $delta = DateCalc($start, $grab_start)
        unless(defined($delta));
    my $delta_s = Delta_Format($delta, 0, "%st");

    my $request_delta = DateCalc("now", $start);
    my $request_delta_s = Delta_Format($request_delta, 0, "%st");

    # With a limit of zero, or a show that starts this very second, there
    # is no average show length to divide by.  Leave the correction at
    # zero then and let the minimum step below take over, rather than
    # dying from a division by zero.
    my $correction = 0;
    if($limit && $request_delta_s) {
        my $avg_show_length = $request_delta_s / $limit;
        $correction = int($delta_s / $avg_show_length);
        $correction -= $correction % 10;
    }

    unless($correction) { $correction = ($delta_s > 0 ? 1 : -1) * 10; }

    $limit += $correction;
    $limit = 0 if($limit < 0);
    return $limit;
}

#-- grab_channel($channel) :: download the listing pages of one channel and
#-- write its programmes out through $writer.
#--
#-- $channel is the site's channel id, as used for the "sender" parameter
#-- of the listing URL.  The return value is of no use.
#--
#-- Pages are requested with a "limit" parameter that is always a multiple
#-- of ten; a page that yields exactly ten shows makes us ask for the next
#-- one.  The first limit is only a guess (--offset times 27 shows a day).
#-- While $shooting is true that guess gets corrected with adjust_limit()
#-- until a page close to $grab_start is found; from there on pages are
#-- read until a show starts after $grab_stop.
sub grab_channel($) {
    my ($start, $laststop, $stop);
    my $channel = shift;
    my $shooting = 1; # we need to guess at which show to start ...
                      #    == 1 means allow either forward or backward
                      #    == 2 is allow backw. only (when already stepped back)
    my $limit = $opt_offset * 27; # assume 27 shows a day (average)
    my %programmes;   # shows to write out, keyed by start time

  grab_channel_loop:
    my $tb = HTML::TreeBuilder->new();
    my $got = 0;
    $limit -= $limit % 10; # only use limits that are dividable by 10!!
    #print STDERR "downloading: /sender/senderprogramm.php3?sender=$channel&limit=$limit\n";
    my $url = "http://fernsehen.ch/sender/senderprogramm.php3?sender=$channel&limit=$limit";
    $tb->parse(get_page($url)) or die "cannot parse content of $url\n";
    $tb->eof;

    for my $listing ($tb->look_down('_tag' => 'td', 'height' => 230, 'xpos' => 150)) {
        # number of shows the site claims to have found for this channel
        my ($total_shows) =
            $listing->as_text() =~ m/Anzahl gefundene Sendungen: (\d+)/;

        unless(defined($total_shows)) {
            warn("no data available for channel $channel, sorry.\n");
            $tb->delete(); # don't leave the parse tree behind
            return;
        }

        foreach my $row ($listing->look_down('_tag' => 'tr')) {
            my %show;
            die unless ref($row) eq 'HTML::Element';

            $show{channel} = channel_id($channel);

            # well, fernsehen.ch's webpage uses three columns, where the
            # data is structured as follows:
            # col 1: channel name
            # col 2: start and stop time
            # col 3: description, etc.

            my @tds = $row->content_list();
            next if(scalar(@tds) < 3); #- hm, colspanning? must be the header.

            # col 2: start and stop time
            ($start, $stop) = extract_start_stop($tds[1]->as_text(), \%show);

            # test $start if we're still in shooting mode!
            if($shooting && $limit) {
                my $r = Date_Cmp($start, $grab_start);

                if($r > 0) {
                    # starttime of this show is beyond grab_start, therefore
                    # we need to go further back ...
                    $limit = adjust_limit($limit, $start, undef);
                    $shooting = 2; # don't allow stepping forward ...
                    $tb->delete();
                    goto grab_channel_loop;
                }
                elsif($r < 0 && $shooting == 1) {
                    # the limit we're at is too early
                    my $delta = DateCalc($start, $grab_start);

                    # adjust forward only if we're more than one day behind
                    unless($delta =~ m/^\+0:0:0:0:/) {
                        my $newlimit = adjust_limit($limit, $start, $delta);

                        if($newlimit > $total_shows) {
                            warn "no information for '$channel' available.";
                            $shooting = 0; # give up.
                            last;
                        }

                        # don't adjust if it's probably either this or
                        # the next page
                        if($newlimit > $limit + 10) {
                            $limit = $newlimit;
                            $tb->delete();
                            goto grab_channel_loop;
                        }
                    }
                }
            }

            $shooting = 0; # looks good ...

            # col 3: description, etc ...
            my $title_a = $tds[2]->look_down('_tag' => 'a');
            my $title = ($title_a->content_list())[0];
            $title =~ s/\226/*/g; # replace 0x96 separator by '*'
            $show{title} = [[ $title, $lang ]];

            # scan for features (as mentioned by included icons) ...
            my %audio;
            my %video;
            foreach my $img (@{$row->extract_links('img')}) {
                my ($link) = @$img;

                my($feature) = $link =~ m/(\w+)\.gif/;

                # an image that is no .gif icon tells us nothing; skip it
                # instead of comparing an undefined value below
                next unless defined($feature);

                if($feature eq "disc") { next }
                elsif($feature eq "fgstereo") {
                    $audio{present} = 1;
                    $audio{stereo} = 'stereo';
                }
                elsif($feature eq "amazonkl") { next }
                elsif($feature eq "1stern") {
                    $show{q(star-rating)} = [ "1 / 4" ];
                }
                elsif($feature eq "2sterne") { 
                    $show{q(star-rating)} = [ "2 / 4" ];
                }
                elsif($feature eq "3sterne") { 
                    $show{q(star-rating)} = [ "3 / 4" ];
                }
                elsif($feature eq "4sterne") {
                    $show{q(star-rating)} = [ "4 / 4" ];
                }
                elsif($feature eq "fgohr") {
                    # TODO
                }
                elsif($feature eq "fglive") {
                    # TODO
                }
                elsif($feature eq "fgsw") {
                    $video{present} = 1;
                    $video{colour} = 0;
                }
                elsif($feature eq "fgbreitb") {
                    $video{present} = 1;
                    $video{aspect} = "16:9";
                }
                else { warn "unknown feature $feature" }
            }

            $show{video} = \%video if(scalar(keys(%video)));
            $show{audio} = \%audio if(scalar(keys(%audio)));

            # get descriptive text: it is taken from the last <font> tag
            # of the column; a column without one has no description
            my @font_tags = $tds[2]->look_down('_tag' => 'font');
            my $desc = @font_tags ? $font_tags[-1]->as_text() : '';

            $desc =~ s/^\s*//; $desc =~ s/\s*$//; # strip whitespace
            $desc =~ s/\226/*/g; # replace 0x96 separator by '*'

            # "Figur:" lists the names of the characters played by the
            # actors mentioned below.  There is no place to write those
            # out yet (TODO), so they are only cut off the description.
            $desc =~ s/Figur: (.+)$//;

            if($desc =~ s/Regie: (.+)$//) {
                $show{credits}{director} = [ $1 ];
            }

            if($desc =~ s/Darsteller: (.+)//) {
                my @actors = split(m/\s*,\s*/, $1);
                $show{credits}{actor} = \@actors;
            }
            
            $show{desc} = [[ $desc, $lang ]] if(length($desc));

            if(defined($laststop) && Date_Cmp($start, $laststop) < 0) {
                # there's already a programme in this timeslot, this
                # sometimes happens since fernsehen.ch supplies the programme
                # for Switzerland and Germany.

                # a channel without a first/last setting is handled like
                # one set to "first"
                my $strategy = defined($first_last{$channel})
                    ? $first_last{$channel} : "first";

                if(($strategy eq "last")
                   && (defined($programmes{$start}))) {
                    # the user wants the latter show written out ...
                    #print STDERR "using LAST strategy at $start/$channel\n";
                   
                    # eliminate already stored shows first
                    foreach my $when (keys(%programmes)) {
                        next if Date_Cmp($when, $start) < 0;

                        #print STDERR "eliminating $when/$channel\n";
                        delete $programmes{$when};
                    }
                    
                    $programmes{$start} = \%show;
                    $laststop = $stop;
                } 
                #else {
                #    print STDERR "using FIRST strategy at $start/$channel\n";
                #}
            }
            else {
                $programmes{$start} = \%show;
                $laststop = $stop;
            }

            $got ++;
        }

        if(! $got && $shooting) {
            # Nothing on this page while still guessing: jump to the last
            # page of the listing.  If that is the page we have just
            # looked at there is nothing more to try; asking for it again
            # would only request the same page forever.
            my $last_page = $total_shows - 10;
            $last_page = 0
                if($last_page < 0);
            $last_page -= $last_page % 10;

            if($last_page != $limit) {
                $limit = $last_page;
                $tb->delete();
                goto grab_channel_loop;
            }

            warn "no information for '$channel' available.";
            $shooting = 0; # give up.
        }
    }

    $tb->delete();

    if(($got == 10) && (Date_Cmp($start, $grab_stop) <= 0)) {
        # starttime not beyond grab_stop, therefore grab another bunch ...
        $limit += 10;
        goto grab_channel_loop;
    }

    # write out what we have collected, ordered by start time key
    foreach my $when (sort(keys(%programmes))) {
        $writer->write_programme($programmes{$when});
    }

}



#-- get_channels() :: download the page named by 'source-data-url' and
#-- return a hash with one entry per channel link found on it.  The key is
#-- the URI-escaped value of the link's "sender=" parameter, the value the
#-- first content item of the link.
sub get_channels() {
    my $url = $head->{q(source-data-url)};

    my $tree = HTML::TreeBuilder->new();
    $tree->parse(get_page($url)) or die "cannot parse content of $url\n";
    $tree->eof;

    my %found;

    # the channel links are looked for in the table cells with this
    # background colour
    for my $cell ($tree->look_down('_tag' => 'td', 'bgcolor' => '#f2f4ec')) {
        next if ref($cell) ne "HTML::Element";

        for my $anchor (@{ $cell->extract_links('a') }) {
            my ($link, $element) = @$anchor;

            my ($site_id) = $link =~ m/sender=(\S+)/
                or die "unable to extract chid from $link";

            $found{uri_escape($site_id)} = ($element->content_list())[0];
        }
    }

    $tree->delete;
    return %found;
}



#-- get_page($url) :: download $url with get_nice() and return what it got.
#--
#-- A failed download is tried once more, with a "retry=N" parameter added
#-- to the URL.  Dies when the second attempt fails as well.  While we are
#-- in here every die message is prefixed with the URL.
sub get_page($) {
    my $url = shift;
    my $retry = 0;

    local $SIG{__DIE__} = sub { die "\n$url: $_[0]" };

    # The retry parameter starts the query string when the URL has none
    # (as is the case for the channel list page), otherwise it is appended
    # to the existing one.  Always using '&' turned a URL without query
    # string into the address of a different document.
    my $separator = ($url =~ m/\?/) ? '&' : '?';

    while($retry < 2) {
        my $request = $url . ($retry ? "${separator}retry=$retry" : "");
        my $got;
        my $ok = eval { $got = get_nice($request); 1 };
        $retry ++;

        next unless $ok; # unable to download, doesn't look too good for us.
        return $got;
    }

    die "cannot grab webpage $url (tried $retry times). giving up. sorry";
}




###### -*- emacs is great ###
# Local Variables:
# mode: Perl
# indent-tabs-mode: nil
# end: 
#############################
