#!/usr/pkg/bin/perl
#
# split thai text or file into longest (or smallest) possible words
# based on words in a dictionary file. 
#
use strict;
use warnings;
use Encode;
use Getopt::Std;

use utf8;
use open qw/:std :utf8/;

our ($opt_a, $opt_d, $opt_h, $opt_l, $opt_s);

# global dictionary of thai words
my %tdict = ();

# tmp results for recursions
my @tmp_combos = ();

#
# read thai dictionary into hash.  each line in dictionary should be a
# thai word. returns number of words read.
#
sub read_dictionary {
    # default dictionary
    my $fname = "/usr/pkg/share/split-thai/words";
    $fname = $opt_d if defined $opt_d;

    die "error reading dictionary $fname" unless -f $fname && -s $fname;
    open(my $fh, '<', $fname) or die "error opening $fname: $!";
    while ( <$fh> ) {
	chomp;
	$tdict{$_} = 1;
    }
    close $fh or die "error closing $fname : $!";

    # add punctuation and misc so parser won't choke
    my @misc_words =
	( "~", "`", "!", "@", "#", "\$", "%", "^", "&", "*", "(", ")",
	  "-", "_", "=", "+", "\\", "|", "{", "}", "[", "]",
	  ";", ":", "'", "\"", "<", ">", ".", ",", "/", "?",
	  "ๆ", "ฯาฯ", "ฯ", "฿", "๏", "๚", "๛" );

    foreach ( @misc_words ) {
	$tdict{$_} = 1;
    }
    
    my $size = keys %tdict;
    return $size;
}

#
# find all possible word breaks for a string,
# returns an array of strings
#
sub word_break {
    my $str = shift @_;

    # return string as is unless it contains thai
    return ( $str ) unless $str =~ m/\p{InThai}/;
    
    # add numbers to dictionary entries so numbers will be parsed
    my @numbers = ( $str =~ m/([0123456789๑๒๓๔๕๖๗๘๙๐,.]+)/g );
    foreach ( @numbers ) {
	$tdict{$_} = 1;
    }

    # add any english looking words 
    my @eng_words = ( $str =~ m/([a-zA-Z]+)/g );
    foreach ( @eng_words ) {
	$tdict{$_} = 1;
    }

    # clear any whitespace
    $str =~ s/\s+//g;
    
    # clear any previous word breaks
    @tmp_combos = ();
    word_break_util( $str, length($str), "");

    # filter results depending on args
    my @filtered = filter_combos( @tmp_combos );
    return @filtered;
}

#
# recursive function to find all possible word combos of string based
# on words in dictionary.  adapted from
# https://www.geeksforgeeks.org/word-break-problem-using-backtracking
#
# saves combos to global list
#
sub word_break_util{
    my $str = shift @_;
    my $n = shift @_;
    my $result = shift @_;

    for ( my $i=1; $i <= $n; $i++ ) {
	my $prefix = substr($str, 0, $i);

	if ( exists $tdict{$prefix} ) {
	    if ( $i == $n ) {
		$result .= $prefix;
		#print $result, "\n";
		push @tmp_combos, $result;
		return;
	    }
	    word_break_util( substr($str, $i, $n-$i),
			     $n-$i,$result . $prefix . " ");
	}
    }
}

#
# filter results depending on input args
#
sub filter_combos {
    my @swords = @_;
    
    # do nothing if keeping all or no words given
    if ( ! @swords ||
	 $opt_a && ( ! $opt_l && ! $opt_s ) ) {
	return @swords;
    }

    # get count of words for each str, make hash str => word-count
    my %combos = map { $_ => scalar( split / /, $_ ) } @swords;

    # sort by least or most words
    my @scombos = ();
    foreach my $str ( sort
		      { $opt_s ? $combos{$b} <=> $combos{$a} :
			    $combos{$a} <=> $combos{$b} } keys %combos ) {
	push @scombos, $str;
    }

    # only one permutation found, not listing more than one permutation,
    # or only one "best" match
    if ( scalar ( @scombos ) < 2 ||
	 ! $opt_a ||
	 $combos{$scombos[0]} != $combos{$scombos[1]} ) {
	return ( $scombos[0] );
    }

    # more than one best match and want them listed, so return "best" matches
    my $mark = $combos{$scombos[0]};
    my @best_set = ();
    foreach ( @scombos ) {
	if ( $combos{$_} == $mark ) {
	    push @best_set, $_;
	} else {
	    last;
	}
    }
    die "could not find best set" unless @best_set;
    
    return @best_set;
}

#
# word break a file line-by-line, print to stdout,
# filename as arg
#
sub break_file {
    my $fname = shift @_;

    open(my $fh, '<', $fname) or die "error opening $fname: $!";
    while ( my $line = <$fh> ) {
	chomp $line;

	if ( $line !~ m/\p{InThai}/ ) {
	    print $line, "\n";
	    next;
	}
	my @swords = word_break( $line );

	if ( @swords ) {
	    # print out any leading spaces
	    print "$1" if $line =~ m/^(\s+)/;
	    foreach ( @swords ) {
		print $_, "\n";
	    }
	} else {
	    print $line, "\n";
	}
    }
    close $fh or die "error closing $fname : $!";
    return 0;
}

#
# usage
#
sub usage {
print <<"EOF";

NAME
     st-wordbreak - print out possible word breaks for utf-8 Thai
     text, file, or stdin based on words in dictionary file

SYNOPSIS
     st-wordbreak [options] file|[text1 text2 text3]

OPTIONS
     -a   list all combinations possible (per line if breaking a file)

     -d   [dictionary-name]
          use dictionary given instead of default

     -h   print out help message

     -l   longest words (or fewest words), lists one "best" match.
          this is the default option.  if -a arg also given, include
          all the "best" matches, which could be more than one

     -s   smallest words (or most words), lists one "best" match.
          if -a arg also given, include all the "best" matches,
          which could be more than one

ENVIRONMENT
     You may need to set LC_CTYPE, LC_ALL, or other LC_* to a utf-8
     setting for this to program to work

EXIT STATUS
     Return 0 if no errors, non-zero if errors or couldn't split up
     string.

BUGS
     Only utf-8 encodings are supported.
     Uses recursion so may not handle extremely long strings.
     Good splitting results are very dependent on dictionary data.  If
     word not in dictionary, may not be able to split at all.

EOF
}

getopts('ad:hls');

if ( $opt_h ) {
    usage();
    exit 0;
}

die "invalid args" if $opt_l && $opt_s;

# default is to split into largest words
$opt_l = 1 if ! $opt_a && ! $opt_l && ! $opt_s;

# read remaining args, convert to utf8
my $argc = scalar( @ARGV );
my @dargv = map { decode('UTF-8', $_ ) } @ARGV;

# read word list
read_dictionary();

# reading stdin or a file?
my $textname;
if ( $argc == 0 || ($argc == 1 && $dargv[0] eq "-") ) {
    $textname = "/dev/stdin";
} elsif ( $argc == 1 && -e $dargv[0] ) {
    $textname = $dargv[0];
}

# splitting a file
if ( $textname ) {
    break_file( $textname );
} else {
    # splitting text args
    my $str = join(' ', @dargv);
    my @swords = word_break( $str );

    if ( ! @swords ) {
	# could not parse, print original inputs
	print join(' ', @dargv), "\n";
	exit 1;
    } else {
	foreach ( @swords ) {
	    print $_, "\n";
	}
	exit 0;
    }
}
