#!/usr/bin/perl -w

=pod

=head1 NAME

tv_grab_nl_wolf - Grab TV listings for Holland.

=head1 SYNOPSIS

tv_grab_nl_wolf --help

tv_grab_nl_wolf [--output FILE] [--days N] [--offset N] [--quiet]

=head1 DESCRIPTION

Output TV listings for several channels available in the Netherlands.
The data comes from Carlo de WolfE<39>s site exporting listings in
XMLTV format.  The default is to grab as many days as possible from
the current day onwards.

B<--output FILE> write to FILE rather than standard output.

B<--days N> grab N days starting from today, rather than as many as
possible.

B<--offset N> start grabbing N days from today, rather than starting
today.  N may be negative.

B<--quiet> suppress the progress messages normally written to standard
error.

B<--help> print a help message and exit.

=head1 SEE ALSO

L<xmltv(5)>, L<http://wolf.xs4all.nl/xmltv/>

=head1 AUTHOR

Ed Avis, ed@membled.com

=head1 BUGS

The grabber fetches data from the site with little processing.  This
means that if the upstream data is incorrect, the output of the
grabber will be wrong.  There are some fixes to correct errors such as
stop < start, but others may creep in undetected.

=cut

use strict;
use XMLTV::Version '$Id: tv_grab_nl_wolf,v 1.16 2006/01/08 10:50:40 epaepa Exp $ ';

# We work by inheriting from XMLTV::Grab_XML and overriding certain
# methods.
#
use XMLTV::Grab_XML;
package Grab_XML_nl_wolf;
use base 'XMLTV::Grab_XML';

# No need to be nice.
$XMLTV::Get_nice::Delay = 0;

use Date::Manip;
use XMLTV::TZ qw(tz_to_num);

# Memoize one routine if possible.
eval { require Memoize };
unless ($@) {
    for ('tz_to_num') {
	Memoize::memoize($_) or warn "cannot memoize $_";
    }
}

# Todo: perhaps we should internationalize messages and docs?
sub country( $ ) {
    my $pkg = shift;
    return 'the Netherlands';
}

my $URL_HOST = 'http://wolf.xs4all.nl';
my $URL_DIR = '/xmltv/';
my $url_base = "$URL_HOST$URL_DIR";

# Returns a hash mapping YYYMMDD to URL.
sub urls_by_date( $$$ ) {
    my $pkg = shift;
    my $index = $pkg->get($url_base);
    die "could not get index page $url_base, aborting\n"
      if not defined $index;

    # Use a horrible method to read the directory listing; we want the
    # filename and size.  Any method we choose will break if the
    # format changes.
    #
    my @lines = split /\n/, $index;
    my @got;
    while (@lines >= 2) {
	if ($lines[0] =~ m!<a href="/xmltv/(.+?)"><tt>\1</tt></a></td>!) {
	    my $filename = $1;
	    if ($lines[1] =~ m!<td align="right"><tt>(\S+) kb</tt></td>!) {
		my $size_kb = $1;
		push @got, [ $filename, $size_kb ];
	    }
	}
	shift @lines;
    }
    die "saw no download links in $url_base\n" if not @got;
    my @urls;
    foreach (@got) {
	my ($filename, $size_kb) = @$_;
	if ($size_kb ne '0.0' and $size_kb ne '0') {
	    push @urls, "/xmltv/$filename";
	}
	else {
	    warn "$filename has zero size on site, not downloading\n";
	}
    }

    my %by_date;
    foreach (@urls) {
	s/^$URL_DIR//;
	if (/^tv-(\d{8})\./) {
	    # We know that the same date can occur twice, we just take
	    # the first occurrence.
	    #
	    $by_date{$1} = "$url_base/$_"
	      unless defined $by_date{$1};
	}
    }
    return %by_date;
}

# Fix up programmes which have stop time before their start time.  We
# assume the start time is correct.
#
my $warned_empty_desc;
sub xml_from_data( $$ ) {
    my $pkg = shift;
    my $xml = shift;
    if ($xml =~ s!<desc/>!!g) {
	warn "removed empty <desc/> elements from upstream data\n"
	  unless $warned_empty_desc++;
    }
    return $pkg->remove_early_stop_times($xml);
}

Grab_XML_nl_wolf->go();
