#!perl

# UnicodeCharClassify.pl
# Last update: 2016.06.20
# (c) 2016 JOJO

#Usage: perl scintilla/scripts/UnicodeCharClassify.pl > scintilla/src/UnicodeCharClassifyData

use utf8;
use warnings;
use strict;
use File::Basename;

# When true, every emitted table entry is followed by a C++ comment
# showing its code point and class name.
my $DEBUG = 0;

# The Unicode Character Database file is looked up in the directory of this
# script, e.g. 'scintilla/scripts/UnicodeData.txt'.
my $UnicodeData = join '/', File::Basename::dirname($0), 'UnicodeData.txt';

# Character classes. Names and numeric values follow
# scintilla/src/CharClassify.h : enum cc { ccSpace, ccNewLine, ccWord, ccPunctuation };
my @scc = qw(ccSpace ccNewLine ccWord ccPunctuation);
my ($ccSpace, $ccNewLine, $ccWord, $ccPunctuation) = (0 .. $#scc);

# Each table entry is packed as (code point << $BITSHIFT) | class,
# so $BITMASK (the low $BITSHIFT bits set) selects the class part.
my $BITSHIFT = 8;
my $BITMASK  = ~(~0 << $BITSHIFT);

# Class of the most recently emitted entry; -1 matches no class, so the
# first record is always emitted.
my $icc0 = -1;

# Maps a General Category (field 2 of UnicodeData.txt) to a character class.
# Categories not listed here are classified as ccPunctuation.
# Currently not mapped (disabled candidates for ccWord): the mark categories
# Mc, Me, Mn and connector punctuation Pc (_‿⁀⁔︳︴﹍﹎﹏＿).
my %class_of_category = (
	Ll => $ccWord,
	Lm => $ccWord,	# 々〱〲〳〴〵〻ゝゞーヽヾｰﾞﾟ
	Lo => $ccWord,	# ぁあぃいぅうぇえぉおゐゑゔゕゖゟァアィイゥウェエォヰヱヴヵヶヷヸヹヺヿｧｨｩｪｫｱｲｳｴｵ
	Lt => $ccWord,	# ῼ
	Lu => $ccWord,

	Nd => $ccWord,	# 0123456789０１２３４５６７８９
	Nl => $ccWord,	# ⅠⅡⅢⅰⅱⅲ
	No => $ccWord,	# ²³¹¼½¾①②③⑴⑵⑶

	Zl => $ccSpace,
	Zp => $ccSpace,	# U+2029 PARAGRAPH SEPARATOR
	Zs => $ccSpace,	# U+0020<SPACE> U+00A0<NBSP> U+3000<　> ...(17)
);

# Returns the character class for one code point.
#   $icp - numeric code point
#   $vgc - its General Category abbreviation (e.g. 'Lu', 'Zs')
# The explicit per-code-point rules take priority over the category lookup;
# they follow scintilla/src/CharClassify.cxx,
# CharClassify::SetDefaultCharClasses(bool includeWordClass).
sub classify_code_point {
	my ($icp, $vgc) = @_;

	return $ccNewLine if $icp == 0x000A || $icp == 0x000D;	# (Cc) <LF>, <CR>
	return $ccSpace   if $icp <= 0x0020;			# <TAB><FF><BS><ESC>...
	# U+0085 <NEL> is not special-cased as a newline (that rule is disabled).
	return $ccWord    if $icp == 0x005F || $icp == 0xFF3F;	# (Pc) '_' and '＿'

	return exists $class_of_category{$vgc}
		? $class_of_category{$vgc}
		: $ccPunctuation;
}

open my $fh, '<:encoding(UTF-8)', $UnicodeData
	or die "Cannot open '$UnicodeData': $!\n";

print <<"END_HEADER";
/*
 * http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 */
static const int SHIFT = $BITSHIFT;
static const int MASK = $BITMASK;
static const int data[] = {
END_HEADER

# Only class changes are emitted: a record produces a table entry when its
# class differs from the class of the previously emitted entry, so every
# entry marks the first code point of a run sharing one class.
while (my $line = <$fh>) {
	# 0000;<control>;Cc;0;BN;;;;;N;NULL;;;;
	next unless $line =~ /\S/;	# tolerate blank (e.g. trailing) lines
	chomp $line;

	# 0. Code value, 2. General Category
	my ($vcp, undef, $vgc) = split /;/, $line;
	die "$UnicodeData:$.: malformed record (expected 'code;name;category;...'): $line\n"
		unless defined $vcp && $vcp =~ /\A[0-9A-Fa-f]+\z/
		    && defined $vgc && length $vgc;

	my $icp = hex $vcp;
	my $icc = classify_code_point($icp, $vgc);

	next if $icc == $icc0;

	# Pack the code point above the low $BITSHIFT bits that hold the class.
	my $v = $icp << $BITSHIFT | $icc;
	print "$v,";
	print "\t// 0x$vcp, CharClassify::$scc[$icc]" if $DEBUG;
	print "\n";

	$icc0 = $icc;
}
print "};\n";
close $fh or warn "Cannot close '$UnicodeData': $!\n";
