# ==============================================================================
# Extract canonical decompositions and ordering from Unicode data file
#
# Copyright (c) 2012-2013 by the developers. See the LICENSE file for details.


# ==============================================================================
# Represent extracted data as initialized C array

BEGIN \
{
   # Cast that precedes every canonical combining class value in the table.
   # It is spliced into the format strings of the record rule and the END rule
   ccc_type = "(unsigned char)"

   # Banner comment at the top of the generated C source
   printf("%s\n",
          "/* Unicode canonical decompositions" \
          " and ordering rules created by build1.mk */")

   # Open the array definition (it is closed again by the END rule)
   printf("static const struct uc_cdc  uc_cdc_table[] =\n{\n")
}

END \
{
   # Last table entry is a sentinel: codepoint -1, combining class 0 and
   # both decomposition slots set to -1
   printf("%s", "   /* Codepoint -1 is the end marker */\n")
   printf("   { -1L, " ccc_type " 0, -1L, -1L }\n")

   # Close the array initializer and finish the generated file
   printf("%s%s", "};\n", "\n\n/* EOF */\n")
}


# ==============================================================================
# Process lines

{
   # AWK counts fields from one, the Unicode standard from zero: $6 is the
   # decomposition field (Unicode field 5), $4 the canonical combining class
   # NOTE(review): the field separator is not set inside this script,
   # presumably the caller supplies it on the command line -- confirm
   canon = extract_canon_decomp($6)

   if("" == canon)
   {
      # Without a canonical decomposition the codepoint is listed only if
      # its canonical combining class is nonzero
      if("0" == $4)  { next }

      # Both decomposition slots get the "does not exist" value
      canon = "-1L, -1L"
   }

   # Format: { Codepoint, Ccc, Decomposition (1 or 2 codepoints) }
   printf("   { 0x%sL, " ccc_type " %s, %s },\n", $1, $4, canon)
   next
}


# ==============================================================================
# Extract codepoints from decomposition field

# Convert the decomposition field of one record into C initializer text.
#
# Parameter:
#    s   Content of the decomposition field (may be empty)
#
# Return value:
#    "0x<cp1>L, 0x<cp2>L"   if two runs of uppercase hex digits are found
#    "0x<cp1>L, -1L"        if only one run is found
#    ""                     if s is empty, starts with a "<...>" tag or
#                           contains no uppercase hex digits
#
# dc and remainder are local variables. AWK has no other way to declare
# locals than to list them as additional parameters; callers must pass
# only s. Without this both names would be globals that are overwritten
# by every call.
function extract_canon_decomp(s,    dc, remainder) \
{
   dc = ""

   # An empty field means "as is" decomposition, a leading <tag> marks a
   # potential compatibility decomposition: both are ignored
   if("" == s || match(s, /^<.*>/))  { return(dc) }

   # First codepoint (nothing to extract if there is no hex number at all)
   if(!match(s, /[0-9A-F]+/))  { return(dc) }
   dc = "0x" substr(s, RSTART, RLENGTH) "L"

   # Search for the second codepoint only behind the first one
   remainder = substr(s, RSTART + RLENGTH)
   if(match(remainder, /[0-9A-F]+/))
   {
      dc = dc ", 0x" substr(remainder, RSTART, RLENGTH) "L"
   }
   else
   {
      # Set second codepoint to -1 if it does not exist
      dc = dc ", -1L"
   }
   return(dc)
}


# EOF
