#!/usr/bin/perl -w

=pod

=head1 NAME

tv_grab_br - Captura da programao Brasileira.

=head1 SYNOPSIS

tv_grab_br --help

tv_grab_br [--config-file FILE] --configure [--gui OPTION]

tv_grab_br [--config-file FILE] [--output FILE] [--days N]
           [--offset N] [--quiet]

tv_grab_br --list-channels

=head1 DESCRIPTION

Script desenvolvido para capturar a programao da TV brasileira a
partir do site www.tvmagazine.com.br. Este script le algumas pginas do
site e a interpreta. Sendo assim, ele pode parar de funcionar a qualquer
momento.

NOTA: Observamos que de tempos em tempos o site fica sem informaes,
assim o script roda e no baixa nenhum programa, veja que este no  um
problema do script e sim do site, por tanto, antes de fazer qualquer
comentrio sobre este assunto, verifique se o site
(www.tvmagazine.com.br) est com as informaes disponibilizadas.

Primeiro execute B<tv_grab_br --configure> para escolher os canais que 
voc deseja obter. Depois execute B<tv_grab_br> sem argumentos para obter
a lista de programas em XML.

B<--configure> Pergunta quais canais deseja obter,
e escreve a configurao.

B<--config-file FILE> Muda o nome do arquivo de configurao,
o padro  B<~/.xmltv/tv_grab_br.conf>.  este  o arquivo gerado
pelo comando B<--configure>. 

B<--output FILE> cria um arquivo de sada.

B<--days N> obtenha N dias.  O padro  3.

B<--offset N> Obtem a partir do dia N.

B<--quiet> No mostra a sada (output)

B<--help> Imprime informaes de ajuda e sai.

=head1 SEE ALSO

L<xmltv(5)>.

=head1 AUTHOR

Ronaldo Richieri <rrichieri@yahoo.com.br>
Marcelo Toledo   <marcelo@marcelotoledo.org>

baseado nos diversos grabbers disponiveis.

=head1 BUGS

=cut

# Author's TODOs & thoughts
#
# get the icons of each grabbed channel from the website
#
# get actors names


######################################################################
# initializations

use strict;
use XMLTV::Version '$Id: tv_grab_br,v 1.2 2006/01/08 10:55:02 epaepa Exp $ ';
use Getopt::Long;
use Date::Manip;
use HTML::TreeBuilder;
use HTML::Entities; # parse entities
use IO::File;

use XMLTV;
use XMLTV::Memoize;
use XMLTV::ProgressBar;
use XMLTV::Ask;
use XMLTV::Config_file;
use XMLTV::DST;
use XMLTV::Get_nice;
use XMLTV::Mode;
use XMLTV::Usage <<END
$0: obtem a programao da televiso brasileira no formato XMLTV
Para configurar: $0 --configure [--config-file FILE]
Para obter: $0 [--config-file FILE] [--output FILE] [--days N]
               [--offset N] [--quiet]
Para listar os canais disponveis: $0 --list-channels
END
  ;

# Attributes of the root element in output.
my $HEAD = { 'source-info-url'     => 'http://www.tvmagazine.com.br/conteudo/guiavm.asp',
	     'source-data-url'     => "http://www.tvmagazine.com.br/conteudo/guiavm.asp",
	     'generator-info-name' => 'XMLTV',
	     'generator-info-url'  => 'http://membled.com/work/apps/xmltv/',
	   };

# how many times to try downloading a page?
my $MAX_RETRY = 5;
		   
# Whether zero-length programmes should be included in the output.
my $WRITE_ZERO_LENGTH = 0;

# default language
my $LANG="pt_BR";

# Global channel_data
our @ch_all;

######################################################################
# get options

# Get options, including undocumented --cache option.
XMLTV::Memoize::check_argv('XMLTV::Get_nice::get_nice_aux');
my ($opt_days, $opt_offset, $opt_help, $opt_output,
    $opt_configure, $opt_config_file, $opt_gui,
    $opt_quiet, $opt_list_channels);
$opt_days  = 3; # default
$opt_offset = 0; # default
$opt_quiet  = 0; # default
GetOptions('days=i'        => \$opt_days,
	   'offset=i'      => \$opt_offset,
	   'help'          => \$opt_help,
	   'configure'     => \$opt_configure,
	   'config-file=s' => \$opt_config_file,
           'gui:s'         => \$opt_gui,
	   'output=s'      => \$opt_output,
	   'quiet'         => \$opt_quiet,
	   'list-channels' => \$opt_list_channels
	  )
  or usage(0);
die 'number of days must not be negative'
  if (defined $opt_days && $opt_days < 0);
usage(1) if $opt_help;

XMLTV::Ask::init($opt_gui);

my $mode = XMLTV::Mode::mode('grab', # default
			     $opt_configure => 'configure',
			     $opt_list_channels => 'list-channels',
			    );

# File that stores which channels to download.
my $config_file
  = XMLTV::Config_file::filename($opt_config_file, 'tv_grab_br', $opt_quiet);

my @config_lines; # used only in grab mode
if ($mode eq 'configure') {
    XMLTV::Config_file::check_no_overwrite($config_file);
}
elsif ($mode eq 'grab') {
    @config_lines = XMLTV::Config_file::read_lines($config_file);
}
elsif ($mode eq 'list-channels') {
    # Config file not used.
}
else { die }

# Whatever we are doing, we need the channels data.
my %channels = get_channels(); # sets @ch_all
my @channels;

######################################################################
# write configuration

if ($mode eq 'configure') {
    open(CONF, ">$config_file") or die "cannot write to $config_file: $!";

    # Ask about each channel.
    my @chs = sort keys %channels;
    my @names = map { $channels{$_} } @chs;
    my @qs = map { "Adicionar o canal $_?" } @names;
    my @want = ask_many_boolean(1, @qs);
    foreach (@chs) {
	my $w = shift @want;
	warn("cannot read input, stopping channel questions"), last
	  if not defined $w;
	# No need to print to user - XMLTV::Ask is verbose enough.

	# Print a config line, but comment it out if channel not wanted.
	print CONF '#' if not $w;
	my $name = shift @names;
	print CONF "channel $_ $name\n";
    }

    close CONF or warn "cannot close $config_file: $!";
    say("Finished configuration.");

    exit();
}


# Not configuration, we must be writing something, either full
# listings or just channels.
#
die if $mode ne 'grab' and $mode ne 'list-channels';

# Options to be used for XMLTV::Writer.
my %w_args;
if (defined $opt_output) {
    my $fh = new IO::File(">$opt_output");
    die "cannot write to $opt_output: $!" if not defined $fh;
    $w_args{OUTPUT} = $fh;
}
$w_args{encoding} = 'ISO-8859-1';
my $writer = new XMLTV::Writer(%w_args);
$writer->start($HEAD);

if ($mode eq 'list-channels') {
    $writer->write_channel($_) foreach @ch_all;
    $writer->end();
    exit();
}

######################################################################
# We are producing full listings.
die if $mode ne 'grab';

# Read configuration
my $line_num = 1;
foreach (@config_lines) {
    ++ $line_num;
    next if not defined;
    if (/^channel:?\s+(\S+)\s+([^\#]+)/) {
	my $ch_did = $1;
	my $ch_name = $2;
	$ch_name =~ s/\s*$//;
	push @channels, $ch_did;
	$channels{$ch_did} = $ch_name;
    }
    else {
	warn "$config_file:$line_num: bad line\n";
    }
}

######################################################################
# begin main program

# Get a page using this agent.
sub get_page( $ ) {
    my $url = shift;
    # For Memoize s sake make extra sure of scalar context
    #return scalar get_page_aux($url);
    #return get_nice($url);
    return scalar get_page_aux($url);
}

# Curious function to deal with the Get_nice API which does not offer an internal retry mode.
# Awful, but it seems to work.
# It works well, and it is mandatory with the telepoche website... Sorry for the ugly code...
sub get_page_aux {
    my $url = shift;
    my $retry = $MAX_RETRY;
    my $got = '';
    my $sleep = 0;

GET:
    # Sleep 1 second after 1 pass
    sleep $sleep;
    $sleep = 1;
    # Call the get_nice API
    eval { $got = get_nice($url) };
    # Then check the return string of the get_nice function
    goto GET if $@ and $@ =~ /could not fetch/ and --$retry;

    #die "Can\'t download $url !!! Check you internet connection." if $retry == 0;
    return $got;
}

# Assume the listings source uses CET (see BUGS above).
my $now = DateCalc(ParseDate('now'), "$opt_offset days");
die "Voc no especificou nenhum canal, execute com --configure\n"
  if not keys %channels;
my @to_get;


# the order in which we fetch the channels matters
foreach my $ch_did (@channels) {
    my $ch_name=$channels{$ch_did};
    my $ch_xid="$ch_did.tvmagazine.com.br";
    $writer->write_channel({ id => $ch_xid,
		    		'display-name' => [ [ $ch_name ] ] });
    my $day=UnixDate($now,'%Q');
    for (my $i=0;$i<$opt_days;$i++) {
        push @to_get, [ $day, $ch_xid, $ch_did ];
        #for each day
        $day=nextday($day); die if not defined $day;
    }
}

# This progress bar is for both downloading and parsing.  Maybe
# they could be separate.
#
my $bar = new XMLTV::ProgressBar('getting listings', scalar @to_get)
  if not $opt_quiet;
foreach (@to_get) {
	foreach (process_table($_->[0], $_->[1], $_->[2])) {
		$writer->write_programme($_);
	}
	update $bar if not $opt_quiet;
}
$writer->end();

$bar->finish() if not $opt_quiet;

######################################################################
# subroutine definitions

# Use Log::TraceMessages if installed.
BEGIN {
    eval { require Log::TraceMessages };
    if ($@) {
	*t = sub {};
	*d = sub { '' };
    }
    else {
	*t = \&Log::TraceMessages::t;
	*d = \&Log::TraceMessages::d;
	Log::TraceMessages::check_argv();
    }
}



#Trim usado para limpar os espaos em branco de algumas variveis
sub trim {
 my $string = shift(@_);
 $string =~ s/^\s+//;
 $string =~ s/\s+$//; 
 return $string;
}

####
# process_table: fetch a URL and process it
#
# arguments:
#    Date::Manip object giving the day to grab
#    xmltv id of channel
#    cplus.es id of channel
#
# returns: list of the programme hashes to write
#
sub process_table {
    my ($date, $ch_xmltv_id, $ch_es_id) = @_;

    $ch_es_id =~ s/\+/\%2B/gi;
    my $today = UnixDate($date, '%d/%m/%Y');
    my $today_dia = UnixDate($date, '%d');
    my $today_mes = UnixDate($date, '%m');
    my $today_ano = UnixDate($date, '%Y');
        
    my $url = "http://www.tvmagazine.com.br/conteudo/mostra.asp?tipo=5&escolhadia=$today_dia&escolhames=$today_mes&escolhaano=$today_ano&escolhahora=&escolhacanal=$ch_es_id";
    t $url;

    my $data;
    $data = get_page($url);
    if (not defined $data) {
        return ();
    }

    local $SIG{__WARN__} = sub {
	warn "$url: $_[0]";
    };

    # parse the page to a document object
    my $tree = HTML::TreeBuilder->new();
    $tree->parse($data) or die "cannot parse content of $url\n";
    $tree->eof;
    
    my @program_data = get_program_data($tree);
    my $bump_start_day=0;

    my @r;
    while (@program_data) {
	my $cur = shift @program_data;
	my $next = shift @program_data;
	unshift @program_data,$next if $next;
	
	push @r, make_programme_hash($date, $ch_xmltv_id, $ch_es_id, $cur, $next);
	if (!$bump_start_day && bump_start_day($cur,$next)) {
	    $bump_start_day=1;
	    $date = UnixDate(DateCalc($date,"+ 1 day"),'%Q');
	}
    }
    return @r;
}

sub make_programme_hash {
    my ($date, $ch_xmltv_id, $ch_es_id, $cur, $next) = @_;

    my %prog;

    $prog{channel}=$ch_xmltv_id;
    
    # Limpa espaos no final do ttulo:
    $cur->{title}=~ s/^\s+//;
    $cur->{title}=~ s/\s+$//;
    $prog{title}=[ [ $cur->{title}, $LANG ] ];
    
    # Verifica se h subttulo e limpa espaos no final
    if (defined $cur->{subtitle}){    
	$cur->{subtitle}=~ s/^\s+//;
	$cur->{subtitle}=~ s/\s+$//;
	$prog{"sub-title"}=[ [ $cur->{subtitle}, $LANG ] ];
    }
    
    # Verifica se h categoria e limpa espaos indesejados
    if (defined $cur->{category}) {
	$cur->{category}=~ s/^\s+//;
	$cur->{category}=~ s/\s+$//;
	$prog{category}=[ [ $cur->{category}, $LANG ] ];
    }

    $prog{start}=utc_offset("$date $cur->{time}", '-0300');
    if (not defined $prog{start}) {
	warn "bad time string: $cur->{time}";
	return undef;
    }

    # Verifica se h sinopse e limpa espaos no final dela
    if (defined $cur->{desc}) {
	$cur->{desc}=~ s/^\s+//;
	$cur->{desc}=~ s/\s+$//;
	$prog{desc}=[ [ $cur->{desc}, $LANG ] ]
    }
    
    if (defined $cur->{diretor}) {
	$cur->{diretor}=~ s/^\s+//;
	$cur->{diretor}=~ s/\s+$//;
	$prog{credits}->{director}=[$cur->{diretor}];
    }
    
    if (defined $cur->{elenco}) {
    	foreach my $ator(@{$cur->{elenco}}) {
    		push @{$prog{credits}->{actor}}, $ator;
    	};
    }	
    
    

    return \%prog;
}
sub bump_start_day {
    my ($cur,$next) = @_;
    if (!defined($next)) {
	return undef;
    }
    my $start = UnixDate($cur->{time},'%H:%M');
    my $stop = UnixDate($next->{time},'%H:%M');
    if (Date_Cmp($start,$stop)>0) {
	return 1;
    } else {
	return 0;
    }
}


#
# program data is split as follows:
sub get_program_data {
    my ($tree) = @_;
    my @data;

   # Ache a tabela com id tblProgramacao
    my $result = $tree->look_down('_tag', 'td', 
                                  sub { 
                                      $_[0]->as_text =~ /Procura/; 
                                  } 
                                  );
    my @tabelas;
    if (defined $result) { 
        @tabelas = $result->content_list;
    } else { 
        return ();
    }
			
   my $tabela = $tabelas[0]->look_down('_tag', 'td',
    			sub {
				$_[0]->as_text =~ /\d\d:\d\d:\d\d/;
    			}
			);
			

   

   # Cada linha desta tabela  um programa
   my @programas = $tabela->look_down('_tag', 'tr');

      
   # Para cada programa encontrado, faa:
   foreach my $programa (@programas){
   	# Procura pelo horrio
	my $horario = $programa->look_down(
		'_tag', 'td',
		sub {
			$_[0]->as_text =~ /\d\d:\d\d:\d\d/;
    		}
  	);

	my $p_stime = $horario->as_text();

	
	# Procura pelo ttulo do programa
	my $titulo = $horario->right;
	
	my $p_title = $titulo->as_text();
	
	# Define a categoria a princpio como em branco
	my $p_category;
	
	# Define a sinopse
	my $p_desc;
	
	# Define diretor
	my $p_diretor;
	
	# Define o elenco
	my @p_atores;
	
	# Ok, vamos para os detalhes do programa 
	# os quais no encontramos na pgina principal.
	#  necessrio ver se h uma pgina de detalhes e acess-la
	
	# Para cada programa, vefique se h um link de detalhes
	my $detalhes = $programa->look_down('_tag', 'a');
        my $detalhes_pagina;
	if (defined $detalhes){
            my @detalhes_array = split("'",$detalhes->attr('href'));
            my $detalhes_url = "http://www.tvmagazine.com.br".$detalhes_array[1];            
            $detalhes_pagina = get_page($detalhes_url);
            my $detalhes_tree = HTML::TreeBuilder->new();
            $detalhes_tree->parse($detalhes_pagina)
	      or die "cannot parse content of $detalhes_url\n";
	    $detalhes_tree->eof;
            
            # Procura pela categoria
            my @detalhes_cata = split(":",$detalhes_tree->look_down('_tag','font',
                                                                    sub { $_[0]->as_text =~ m{nero}}
                                                                    )->as_text);
            my $detalhes_cat = $detalhes_cata[1];
            
            
            if (defined $detalhes_cat) {
                if (not ($detalhes_cat eq "")) {
                    $p_category=$detalhes_cat;
                } else {
                    undef $p_category;
                }
            }
            
            # Procura pela sinopse (nem todos possuem)
            my $detalhes_sin = $detalhes_tree->look_down('_tag','small')->as_text;
            
            if (defined $detalhes_sin) {
                if (not (trim($detalhes_sin) eq "")) {
                    $p_desc=trim($detalhes_sin);
                } else {
                    undef $p_desc;
                }
                
            }
            
            # Procura pelo Diretor (nem todos possuem)
            my @detalhes_dira = split(":",$detalhes_tree->look_down('_tag','font',
                                                                    sub { $_[0]->as_text =~ m{Diretor}}
                                                                    )->as_text);
            my $detalhes_dir = trim($detalhes_dira[1]);
            
            
            if (defined $detalhes_dir) {
                if (not ($detalhes_dir eq "")) {
                    $p_diretor=$detalhes_dir;
                } else {
                    undef $p_diretor;
                }
                
            }
            
            # Procura pelo elenco (nem todos possuem)
            my @detalhes_elea = split(":",$detalhes_tree->look_down('_tag','font',
                                                                    sub { $_[0]->as_text =~ m{Elenco}}
                                                                    )->as_text); 
            my @detalhes_ele = split(",",$detalhes_elea[1]);
            if (not trim($detalhes_elea[1]) eq "") {
                foreach my $ator(@detalhes_ele) {
                    push @p_atores, trim($ator);
                }
            } else {
                undef @p_atores;
            }	
        }
	
	
        my %h = (       time =>         $p_stime,
                        # category=>      $p_category,
                        title=>         $p_title,
                        # subtitle=>      $p_subtitle,
			);
	$h{category} = $p_category if defined $p_category;
	$h{desc} = $p_desc if defined $p_desc;
	$h{diretor} = $p_diretor if defined $p_diretor;
	@{$h{elenco}} = @p_atores; 
        push @data, \%h;
	}
   return @data;
}


# get channel listing (Pega os canais)
sub get_channels {
    my $bar = new XMLTV::ProgressBar('getting list of channels', 1)
	if not $opt_quiet;
    my %channels;
    my $url="http://www.tvmagazine.com.br/conteudo/guiavm.asp";
    t $url;
    my $local_data = get_page($url);
    die "could not get channel listing $url, aborting\n"
      if not defined $local_data;

    my $tree = HTML::TreeBuilder->new();
    $tree->parse($local_data) or die "cannot parse content of $url\n";
    $tree->eof;
    my @menus = $tree->find_by_tag_name("_tag"=>"select");
    
    foreach my $elem (@menus) {
	my $cname = $elem->attr('name');
	
	# Procura na pgina pelo o <select name="lstCanais">
	if ($cname eq "escolhacanal") {
	    my @ocanals = $elem->find_by_tag_name("_tag"=>"option");
	    @ocanals = sort @ocanals;
	    foreach my $opt (@ocanals) {
		t $opt->attr('value');
		t $opt->attr('text');
		
		# Se o select box no estiver em branco,  um canal
		if (not $opt->attr('value') eq "") {
		    my $channel_id = $opt->attr('value');
		    my @children=$opt->content_list;
		    my $channel_name=$children[0];
		    if (length $channel_id eq 1) {
			$channel_id = "0" . $channel_id
		    }
		    $channels{$channel_id}=$channel_name;
		    push @ch_all, { 'display-name' => [ [ $channel_name,
							  $LANG ] ],
				    'id'=> "$channel_id.tvmagazine.com.br" };
		}
	    }
	}
    }
    die "no channels could be found" if not keys %channels;
    update $bar if not $opt_quiet;
    $bar->finish() if not $opt_quiet;
    return %channels;
}


# Bump a YYYYMMDD date by one.
sub nextday {
    my $d = shift;
    my $p = ParseDate($d);
    my $n = DateCalc($p, '+ 1 day');
    return UnixDate($n, '%Q');
}

# tv_grab_br ends here.
