# $Id: Hash.pm,v 1.21 2004/04/02 13:31:34 abs Exp $

$Text::CSV::Hash::VERSION = '0.18';
##++
##
##     Copyright (c) 2001,2002 David Brownlee. All Rights Reserved
##
##     E-Mail: <abs@mono.org>
##
##     Permission  to  use,  copy, and distribute is hereby granted,
##     providing that the above copyright notice and this permission
##     appear in all copies and in supporting documentation.
##--

=head1 NAME

Text::CSV::Hash - hash based CSV file handing, including comparisons

=head1 SYNOPSIS

Text::CSV::Hash provides simple hash based methods for loading,
saving, and comparing CSV files. Will handle standard quoting and
entries with newlines.

=head1 DESCRIPTION

=head2 Methods

=over 4

=item $csv->config()

Allows various configuration options to be set. Should be passed a hash
with any of the following:

=over 4

=item lookupscores

hash reference to default lookup fields and scores - see lookup() for more details. Load will fail if any are missing.

=item mapfields

reference to hash that maps column names on load. Mapping happens before column names are checked against $lookupscores.

=item max_data_rows

maximum number of data rows to read. Useful for quickly reading the start
of a csv file.

=item name_row_offset

offset to row containing column names

=item quote

character used to quote fields. Defaults to '"'.

=item separator

field separator. Defaults to ','.

=item skip_blank_lines

true/false - are blank (containing no non-whitespace) lines in the file ignored

=item trim

true/false - trim whitespace from start and end of each field.

=back

=item $csv->dump($row, $row2)

If called with one csv row reference displays each field label and
its values.  If called with two, displays the second's value where
it differs from the first.

=item $csv->labels()

Return the current set of standard labels for current csv as an array.

=item $csv->labels_add(@labels)

Add @labels to the set of standard labels for all csv rows. The
standard labels are used by dump(), save() etc.

=item $csv->labels_set(@labels)

Set the standard labels for all csv rows to @labels. The standard
labels are used by dump(), save() etc.

=item $csv->linenolabel

The name of the field containing the file linenumber for each row. This is set
by $csv->read automatically. It defaults to 'lineno', but if that conflicts
with a label in the file it will prepend '_' until a unique name is found.

=item $csv->load($file, $conf);

Load csv file '$file'. If $conf is present it can specify any of the
options valid for $csv->config.

The first line of the file is expected to contain column labels, except
where overridden by name_row_offset or skip_blank_lines.

Returns undef on error and error value can be displayed
with $csv->error().

=item $csv->lookup($row, $minscore, $lookupscores);

Lookup row $row in csv. Other parameters are optional, but if specified:

=over 4

=item $minscore - Minimum score for matching fields. Defaults to 1.

=item $lookupscores - Fields and scores for lookups. Defaults to $lookupscores passed to load().

=back

lookup() is intended to handle the case where some fields may
mismatch, and multiple match() calls may be needed to find a suitable
row. If it is unable to match all fields in $lookupscores it will
cycle through all possible combnations of fields in $lookupscores
starting with the highest possible score and working down. The
first combination that matches will be used. Any match with a score
lower than $minscore will be rejected. For example, if the csv file
has the fields 'first name', 'last name', 'country' and you are
willing to accept a mismatch of either 'first name' or 'country',
you could use

	$lookupscores = { 'last name' => 2,
			  'first name' => 1,
			  'country' => 1 };
	$minscore = 3;

Lookup returns a list of three items ($match, $ambig, $mismatch);

=over 4

=item $match - reference to exact match if only one match found.

=item $ambig - reference to array of ambiguous matches if more than
one match found.

=item $mismatch - reference to array of field names that mismatched
if not exact match.

=back

If you do not care about details of ambiguous matches or mismatched fields
you can treat them as boolean or just ignore them, as in
($match) = $csv->lookup($row);

=item $csv->match($row, @fields);

Return a list of references to rows in $csv which match $row for all fields
in @fields. Typically $row would be from a different csv object. match() builds
a hash cache internally to speedup repeated lookups on the same csv object.

If the returned list has one entry it is an exact match, more than one
indicates an ambiguous match, and zero no matches.

For more sophisticated matching see lookup().

=item new Text::CSV::Hash;

Create a new $csv object. If passed a reference to an array of hashes, set
the labels to the keys in the first entry, and the rows to the contents of
the array.

=item $csv->rows()

Returns the reference to an array of all rows in the csv. Each row
is a hash of label => value. If passed a reference to an array of hashes
will set the rows to the contents of the array.

=item $csv->save($file, $conf);

Save csv to file. If $map was specified on load, saved column names are the
values after mapping. If $file is not specified it will default to the file
used in load().

Returns undef on error and error value can be displayed  
with $csv->error().

If present, $conf can specify characteristics of the file. Note these will
default to those set when the file was loaded.

=over 4

=item quote - charater used to quote fields. Defaults to '"'.

=item separator - field separator. Defaults to ','.

=back

=back

=head1 EXAMPLE1

    # Sample script to load file.csv, double the 'cost' columns, and save
    my($csv);

    $csv = new Text::CSV::Hash;

    $csv->load('file.csv') || die $csv->error();
    foreach my $row (@{$csv->rows()})
	{ $row->{cost} *= 2; }
    $csv->save() || die $csv->error();

=head1 EXAMPLE2

    # Sample script to produce merged versions of file1.csv and file2.csv
    #   - matching on 'first name', 'last name', and 'country' columns
    #   - add two columns 'results', and 'mismatches'
    #   - allowing for any one of the three fields to mismatch
    #   - trimming spaces from each field.

    my($csv1, $csv2, %merge_match);

    $csv1 = new Text::CSV::Hash;
    $csv2 = new Text::CSV::Hash;

    %merge_match = ('first name' => 1, 'last name' => 1, 'country' => 1);

    $csv1->load('file1.csv', {trim => 1}, \%lookups) || die $csv1->error();
    $csv2->load('file2.csv', {trim => 1}, \%lookups) || die $csv2->error();

    $csv2->labels_add('results', 'mismatches');

    foreach my $row2 (@{$csv2->rows()})
	{
	my($match1, $ambig, $mismatches, $resfield, $misfield);

	($match1, $ambig, $mismatches) = $csv1->lookup($row2, 2);
	if ($ambig)
	    { $resfield = 'Ambiguous'; }
	elsif (!$match1)
	    { $resfield = 'Not Found'; }
	elsif ($mismatches)
	    {
	    $resfield = "Mismatch ".join(' + ', @${mismatches});
	    foreach my $mis (@${mismatches})
		{ $misfield .= qq#("$row2->{$mis}" - "$match1->{$mis}") #; }
	    chop $misfield;
	    }
	else
	    { $resfield = 'Match all'; }

	$row2->{results} = $resfield;
	$row2->{mismatches} = $misfield;

	if ($match1)
	    {
	    # Merge fields both ways

	    foreach($csv1->labels())
		{ $row2->{$_} ||= $match1->{$_}; }

	    foreach($csv2->labels())
		{ $match1->{$_} ||= $csv2->{$_}; }
	    }
	}

    $csv1->labels_add($csv2->labels());
    $csv1->save('merged_'.$csv1->{filename}) || die $csv1->error();

    $csv2->labels_add($csv1->labels());
    $csv2->save('merged_'.$csv2->{filename}) || die $csv2->error();

=cut


use strict;

package Text::CSV::Hash;

sub config
    {
    my($self) = shift;
    $self->{_conf} = $_[0];
    %{$self->{_conf}};
    }

sub dump
    {
    my($self) = shift;
    my($row, $row2) = @_;
    my($label_len, $row_len, @labels);

    @labels = ($self->{_linenolabel}, @{$self->{_labels}});
    $label_len = fmtlen(@labels);

    if ($row2)
	{
	my(@row);
	foreach my $label (@labels)
	    { push(@row, $row->{$label}); }
	$row_len = fmtlen(@row);
	}
    foreach my $label (@labels)
	{
	if ($row2 && $row2->{$label} ne $row->{$label})
	    { printf("%".$label_len."s = %-".$row_len."s : %s\n", $label,
					    $row->{$label}, $row2->{$label}); }
	else
	    { printf("%".$label_len."s = $row->{$label}\n", $label); }
	}
    }

sub error
    {
    my($self) = shift;
    $self->{_error};
    }

sub generate_hash # Intended for internal use
    {
    my($self) = shift;
    my(@fields) = @_;
    my(%hash);

    foreach my $row (@{$self->{_rows}})
	{ push(@{$hash{key_field($row, @fields)}}, $row); }
    \%hash;
    }

sub labels
    {
    my($self) = shift;
    @{$self->{_labels}};
    }

sub labels_add
    {
    my($self) = shift;
    my(%chk);
    foreach my $label (@{$self->{_labels}})
	{ $chk{$label} = 1; }
    foreach my $label (@_)
	{ $chk{$label} or push(@{$self->{_labels}}, $label); }
    @{$self->{_labels}};
    }

sub labels_set
    {
    my($self) = shift;
    my(@labels) = @_;
    $self->{_labels} = \@labels;
    @{$self->{_labels}};
    }

sub linenolabel
    {
    my $self = shift;
    $self->{_linenolabel};
    }

sub new
    {
    my $class = shift;
    my $self = {};
    bless $self, $class;
    if (@_)
	{
	my $list = $_[0];
	$self->labels_set(keys %{$list->[0]});
	$self->rows($list);
	}
    $self;
    }

sub csv_line_get # Intended for internal use
    {
    my($fh) = @_;
    my($line);

    $line = <$fh>;
    if (defined $line)
	{
	$line =~ s/\r$//;
	chomp $line;
	}
    $line;
    }

sub csv_line_parse # Intended for internal use
    {
    my($fh, $conf, $results) = @_;
    my($line, $field, @work, @fields);

    unless ($line = csv_line_get($fh))
	{ return; }
    @work = split($conf->{separator}, $line);
    while (@work)
	{
	$field = shift @work;
	if (substr($field, 0, 1) eq '"')
	    {
	    if ($field eq '""')
		{ $field = ''; }
	    else
		{
		substr($field, 0, 1, '');
		for (;;)
		    {
		    # Trailing ", but not as part of any number of ""
		    if (substr($field, -1) eq '"' &&
				($field !~ /("+)"$/ || ! (length($1)%2)))
			{
			chop $field;
			last;
			}
		    if (@work)
			{ $field .= $conf->{separator}.shift @work; }
		    else
			{
			@work = split($conf->{separator}, csv_line_get($fh));
			@work || last;	# Malformed file
			$field .= "\n".shift @work;
			}
		    }
		$field =~ s/""/"/g;
		}
	    }
	if ($conf->{trim})
	    {
	    $field =~ s/^\s+//;
	    $field =~ s/\s+$//;
	    }
	push(@fields, $field);
	}
    if ($results)
	{ @{$results} = @fields; }
    1;
    }

sub entries
    {
    my($self) = shift;
    $self->rows(@_);
    }

sub rows
    {
    my($self) = shift;
    if (@_)
	{ $self->{_rows} = $_[0]; }
    else
	{ $self->{_rows}; }
    }

sub load
    {
    my($self) = shift;
    my($file, $conf) = @_;
    my(@labels, @values, @rows, $linenolabel, $lineno, %restrictfields);

    $conf = $self->mergeconf($conf);

    if (!open(FILE, $file))
	{
	$self->{_error} = "Unable to open file '$file'";
	return(undef);
	}

    if ($conf->{restrictfields})
	{
	foreach my $label (@{$conf->{restrictfields}})
	    { $restrictfields{$label} = 1; }
	}
    if ($conf->{name_row_offset})
	{
	my($loop);
	for ($loop = 1 ; $loop < $conf->{name_row_offset} ; ++$loop)
	    { csv_line_parse(*FILE, $conf); }
	}
    my @rawlabels;
    while (csv_line_parse(*FILE, $conf, \@rawlabels) &&
				    $conf->{skip_blank_lines} && !@rawlabels)
	{ }
    foreach my $label (@rawlabels)
	{
	push(@labels, $conf->{mapfields}{$label} ?$conf->{mapfields}{$label}
						 :$label);
	}
    $linenolabel = 'lineno';
    while (grep($_ eq $linenolabel, @labels))
	{ $linenolabel = '_'.$linenolabel; }

    if ($conf->{lookupscores})
	{
	foreach my $label (keys %{$conf->{lookupscores}})
	    {
	    if (!grep($_ eq $label, @labels))
		{
		$self->{_error} = "Missing label '$label'";
		return(undef);
		}
	    }
	}

    $lineno = 1;
    while (!eof(FILE))
	{
	my(%row);

	if ($conf->{max_data_rows} && $lineno > $conf->{max_data_rows})
	    { last; }
	++$lineno;
	csv_line_parse(*FILE, $conf, \@values);
	$conf->{skip_blank_lines} && !@values && next;
	foreach my $label (@labels)
	    {
	    $_ = shift @values;
	    $conf->{restrictfields} && !$restrictfields{$label} && next;
	    if (!defined $_)
		{ $_ = ''; }

	    $row{$label} = $_;
	    }
	$row{$linenolabel} = $lineno;
	push(@rows, \%row);
	}
    close(FILE);

    if ($conf->{restrictfields})
	{ @labels = grep($restrictfields{$_}, @labels); }

    $self->{filename} = $file;
    $self->{_linenolabel} = $linenolabel;
    $self->{_lookupscores} = $conf->{lookupscores};
    $self->{_rows} = \@rows;
    $self->{_labels} = \@labels;
    1;
    }

sub lookup
    {
    my($self) = shift;
    my($row, $minscore, $lookupscores) = @_;
    my($pass, $match, @matchlist, @mismatch, @ambig, @fields);

    $lookupscores ||= $self->{_lookupscores};

    $minscore ||= 1;
    @fields = sort {$lookupscores->{$b} <=> $lookupscores->{$a}}
							keys %{$lookupscores};

    for ($pass = (1 << @fields) - 1 ; $pass > 0 ; --$pass)
	{
	my(@passfields, $bit, $fieldscore);
	$fieldscore = 0;
	for ($bit = 0 ; $bit < @fields ; ++$bit)
	    {
	    if ($pass & (1<<$bit))
		{
		push(@passfields, $fields[$bit]);
		$fieldscore += $lookupscores->{$fields[$bit]};
		}
	    }
	if ($fieldscore < $minscore)
	    { next; }
	if (@matchlist = $self->match($row, @passfields))
	    {
	    if (@matchlist > 1)
		{ @ambig = @matchlist; }
	    else
		{
		$match = $matchlist[0];
		for ($bit = 0 ; $bit < @fields ; ++$bit)
		    {
		    unless ($pass & (1<<$bit))
			{ push(@mismatch, $fields[$bit]); }
		    }
		last;
		}
	    }
	}
    ($match, (!$match && @ambig) ?\@ambig :undef, @mismatch ?\@mismatch :undef);
    }

sub match
    {
    my($self) = shift;
    my($row, @fields) = @_;
    my($key, $keylabel);

    my(%chk);
    foreach my $label (@{$self->{_labels}})
	{ $chk{$label} = 1; }
    foreach my $label (@fields)
	{ defined $chk{$label} || die("Unknown field '$label'"); }
    $key = key_field($row, @fields);
    $keylabel = join(',', @fields);
    if (!$self->{hash}{$keylabel})
	{ $self->{hash}{$keylabel} = generate_hash($self, @fields); }
    if ($self->{hash}{$keylabel}{$key})
	{ return(@{$self->{hash}{$keylabel}{$key}}); }
    ();
    }

sub save
    {
    my($self) = shift;
    my($file, $conf) = @_;

    $conf = $self->mergeconf($conf);

    $file ||= $self->{filename};

    if (!open(FILE, ">$file"))
	{
	$self->{_error} = "Unable to write '$file': $!";
	return(undef);
	}
    print FILE save_quote($conf, @{$self->{_labels}});
    foreach my $row (@{$self->{_rows}})
	{
	my(@values);
	foreach my $label (@{$self->{_labels}})
	    { push(@values, $row->{$label}); }
	print FILE save_quote($conf, @values);
	}
    close(FILE);
    }

sub save_quote # Intended for internal use
    {
    my $conf = shift;

    my ($line, $quote, $match);
    $quote = $conf->{quote};
    $match = "[$conf->{quote}$conf->{separator}\n]";
    foreach my $val (@_)
	{
	if (defined $val)
	    {
	    $_ = $val;
	    if ( s/$quote/$quote$quote/g || /$match/)
		{ $_ = $quote.$_.$quote; }
	    $line .= $_;
	    }
	$line .= $conf->{separator};
	}
    chop $line;
    $line . "\n";
    }

sub mergeconf
    {
    my $self = shift;
    my $conf = $_[0];

    if ($conf)
	{
	foreach my $tag (keys %{$conf})
	    { $self->{_conf}{$tag} = $conf->{$tag}; }
	}
    $self->{_conf}{separator} ||= ',';
    unless (exists $self->{_conf}{quote})
	{ $self->{_conf}{quote} = '"'; }
    $self->{_conf};
    }

sub fmtlen
    {
    my($len);

    $len = 0;
    foreach my $val (@_)
	{ (length($val) > $len) && ($len = length($val)); }
    $len;
    }

sub key_field
    {
    my($row, @fields) = @_;
    my($key);

    foreach my $field (@fields)
	{ $key .= $row->{$field}.','; }
    chop $key;
    $key;
    }

1;
