#!/usr/pkg/bin/perl
#
# perl grep equivalent-wrapper supporting utf-8 and thai in particular
#
use warnings;
use strict;
use Encode;
use Getopt::Std;

use utf8;
use open qw/:std :utf8/;

# Option flags filled in by Getopt::Std::getopts, one $opt_X per switch.
our ( $opt_h, $opt_i, $opt_l, $opt_n, $opt_q, $opt_v );

# getopts returns false (after warning on stderr) when it meets a switch
# it does not know.  Remember that so a typo such as "-r" is reported as
# a usage error instead of being silently ignored.
my $options_ok = getopts('hilnqv');

if ( $opt_h ) {
    # an explicit request for help always succeeds
    usage();
    exit 0;
} elsif ( ! $options_ok ) {
    # usage error: same status grep uses for errors
    usage();
    exit 2;
}

# The first remaining argument is the pattern.  A missing pattern and an
# empty one are both treated as "no pattern given".  The declaration is
# deliberately unconditional: "my $x = ... if COND" has undefined
# behaviour in perl.
my $pattern = defined $ARGV[0] ? decode('UTF-8', $ARGV[0]) : '';
unless ( length( $pattern ) ) {
    usage();
    exit 1;
}

# Normalise the switches to plain 0/1 flags with readable names.
my $opt_filesonly = ( defined $opt_l ? 1 : 0 );
my $opt_ignorecase = ( defined $opt_i ? 1 : 0 );
my $opt_linenum = ( defined $opt_n ? 1 : 0 );
my $opt_quiet = ( defined $opt_q ? 1 : 0 );
my $opt_invert = ( defined $opt_v ? 1 : 0 );

# rest of args should be filenames; decode them like the pattern so they
# print correctly through the utf-8 output layer
my ( undef, @files ) = @ARGV;
@files = map { decode('UTF-8', $_ ) } @files;

#
# usage
#
# Print the manual-page style help text to the currently selected output
# handle (STDOUT unless changed).  Takes no arguments and does not exit;
# callers decide the exit status.  The heredoc is single-quoted so that
# the regexp examples containing backslashes and dollar signs are printed
# verbatim.
#
sub usage {
    print <<'EOF';

NAME
    tgrep - print lines matching a pattern, supports utf-8 characters
    and some thai character classes using perl regexp matching.

SYNOPSIS
    tgrep [options] PATTERN [FILE] [FILE2]

DESCRIPTION
    tgrep (thai grep) is similar to grep, in that it searches files or
    stdin for lines matching a pattern.  It uses perl to support utf-8
    characters, and therefore the patterns are perl regexp patterns.
    It supports a few simple homegrown character classes:

    [:thai:]          match any thai unicode value
    [:thaiconsonant:] match thai consonant including ฤ ฦ
    [:thaidigit:]     match thai number ๐๑๒๓๔๕๖๗๘๙ 
    [:thaitonemark:]  match thai tonemark ่้๊๋
    [:thaivowel:]     match thai vowel symbols ะัา ำิีึืุูเแโใไๅ็
                      does not include consonants that function as vowels
    [:thaivowelfull:] same as [:thaivowel:] plus อรวยฤฦๅ used to form
                      vowel diacritics and diphthongs
    [:thaimisc:]      match misc thai symbols ฯๆฺ฿์ํ๎๏๚๛

OPTIONS
    -h  print help or usage

    -i  ignore case

    -l  suppress normal output, only print filenames that match

    -n  prefix each line of output with the line number of the file

    -q  quiet mode, don't print out matches

    -v  invert match or print lines not matching pattern

ENVIRONMENT
     You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8
     setting for this program to work, e.g. for csh-type shells:
          setenv LC_CTYPE en_US.UTF-8
         
EXIT STATUS
    Similar to grep, returns 0 when matching line found, 1 otherwise.
    If an error occurs, exit with 2 unless -q (quiet) option and a
    match is found

EXAMPLES
    search for 'ก' in a utf-8 text file
    $ tgrep ก file.txt

    use perl regexp to match any line with thai utf-8 characters
    $ tgrep '\p{InThai}' somefile.txt

    use perl regexp unicode values to match thai numbers
    $ tgrep '^[\x{0e50}-\x{0e59}]+$' other.file

    match lines with a thai number
    $ tgrep '[:thaidigit:]' afile.txt

NOTES
    grep(1) also can be used to match thai characters with unicode
    escapes, for example
       egrep "["$'\u0e01'-$'\u0e5b'"]" file.txt
    would match thai unicode chars in a bash-type shell.

SEE ALSO
    grep(1), perl(1), perlre(1), locale(1), ugrep(1)

BUGS
    Only utf-8 encodings are supported.
    The character classes used by this program ([:thai*:]) are not
    standard or supported by other programs.
    Quoting perl regular expression can sometimes be difficult from
    within the shell.

EOF
    return;
}

# handle convenience character classes
#
# Each entry maps a homegrown class name to the regexp text that replaces
# every "[:name:]" token in the user's pattern.  Code points are written
# as \x{...} escapes (left for the regexp compiler to interpret) rather
# than as literal thai text, so that no stray character -- such as a
# space inserted to keep combining marks readable -- can end up inside a
# class and make it match far more than intended.
my $thai_vowel_members =
    '\x{0e30}-\x{0e39}'       # sara a .. sara uu
  . '\x{0e40}-\x{0e45}'       # sara e .. lakkhangyao
  . '\x{0e47}';               # maitaikhu
my $thai_full_vowel_class =
    '[' . $thai_vowel_members
  . '\x{0e2d}\x{0e23}\x{0e27}\x{0e22}\x{0e24}\x{0e26}'   # o ang, ro rua, wo waen, yo yak, ru, lu
  . ']';

my @thai_classes = (
    [ 'thai',          '\p{InThai}' ],
    # chars between ko kai & ho nokhuk inclusive
    [ 'thaiconsonant', '[\x{0e01}-\x{0e2e}]' ],
    [ 'thaidigit',     '[\x{0e50}-\x{0e59}]' ],
    [ 'thaitonemark',  '[\x{0e48}-\x{0e4b}]' ],
    [ 'thaivowel',     '[' . $thai_vowel_members . ']' ],
    # both spellings are accepted: one is what the help text has
    # advertised, the other is what the program has always expanded
    [ 'thaivowelfull', $thai_full_vowel_class ],
    [ 'thaifullvowel', $thai_full_vowel_class ],
    # paiyannoi, maiyamok, baht, thanthakhat .. fongman, phinthu,
    # angkhankhu, khomut
    [ 'thaimisc',
      '[\x{0e2f}\x{0e46}\x{0e3f}\x{0e4c}-\x{0e4f}\x{0e3a}\x{0e5a}\x{0e5b}]' ],
);

foreach my $class ( @thai_classes ) {
    my ( $name, $expansion ) = @{ $class };
    # quotemeta so the brackets and colons of the token match literally
    my $token = quotemeta( '[:' . $name . ':]' );
    $pattern =~ s/$token/$expansion/g;
}

# Compile once up front.  The pattern comes straight from the command
# line, so a syntax error is a user error: report it and exit with the
# documented error status instead of dying with perl's default status.
my $qpattern = eval {
    $opt_ignorecase ? qr/$pattern/iu : qr/$pattern/u;
};
unless ( defined $qpattern ) {
    warn "Invalid pattern: $@";
    exit 2;
}
#print "pattern \"$pattern\"\n";
#print "qpattern \"$qpattern\"\n";

# if no file args or just "-", assume stdin
push @files, "/dev/stdin" if ! @files;
@files = map { $_ eq "-" ? "/dev/stdin" : $_ } @files;

# Search every file in turn, printing matches according to the options,
# then exit with grep-compatible status.
my $match_found = 0;
my $error_occurred = 0;

FILE:
foreach my $file ( @files ) {
    my $info;
    # Three-argument open: the name is only ever a path to read.  With
    # the two-argument form a name such as ">out" or "cmd |" would be
    # taken as a mode or a pipe, and surrounding blanks would be dropped.
    unless ( open $info, '<', $file ) {
        warn "Could not open $file: $!";
        $error_occurred = 1;
        next FILE;
    }

    # Prefixes are only shown for real files: the name when several
    # inputs were given, the line number when -n was requested.
    my $print_filename =  ( scalar ( @files ) > 1 ) && $file ne "/dev/stdin";
    my $print_linenum  = $opt_linenum && $file ne "/dev/stdin";

    LINE:
    while ( my $line = <$info> ) {
        # a line is selected when it matches, or, with -v, when it
        # does not
        my $matched = ( $line =~ $qpattern ) ? 1 : 0;
        next LINE unless ( $matched xor $opt_invert );

        $match_found = 1;

        # -q needs nothing more than the fact that something matched
        last LINE if $opt_quiet;

        if ( $opt_filesonly ) {
            # -l: name the file once and move on
            print $file, "\n";
            last LINE;
        }

        # $. is the number of the line just read from $info; it starts
        # again from 1 for each file because every handle is closed
        my $line_num = $.;
        print $file, ":" if $print_filename;
        print $line_num, ":" if $print_linenum;
        # normalise the line ending so a final line without a newline
        # is still terminated
        chomp($line);
        print $line, "\n";
    }

    unless ( close $info ) {
        warn "Could not close $file: $!";
        $error_occurred = 1;
    }

    # With -q the answer is already known after the first match (the
    # exit status is 0 whatever happens next), so skip remaining files.
    last FILE if $opt_quiet && $match_found;
}

# exit with same error codes as grep: an error yields 2 unless -q was
# given and a match was found; otherwise 0 for a match and 1 for none
if ( $error_occurred && ! ( $match_found && $opt_quiet ) ) {
    exit 2;
}
exit ( $match_found ? 0 : 1 );
