#!/usr/bin/perl
#
# usage: make-precompose.h.pl UnicodeData.txt > precompose.h
#
# (c) 2008-2011 by HAT <hat@fa2.so-net.ne.jp>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

# See
# http://www.unicode.org/Public/UNIDATA/UCD.html
# http://www.unicode.org/reports/tr15/
# http://www.unicode.org/Public/*/ucd/UnicodeData*.txt
# http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

# Overview
#
# Reads UnicodeData.txt, keeps every decomposition mapping that carries no
# "<tag>" and contains a space, and prints a C header on standard output
# holding four tables:
#
#   precompositions[]     rows ordered by (base, comb)
#   decompositions[]      rows ordered by replacement code point
#   precompositions_sp[]  the same for code points above U+FFFF, with every
#   decompositions_sp[]   value written as a UTF-16 surrogate pair
#
# Rows that must not be used are still printed, but wrapped in a C comment.
#
# Everything is kept in memory: no scratch files (compose.TEMP, *.SORT) are
# written and no external sort(1)/cat(1) is run.

use strict;
use warnings;

# Inclusive code point ranges whose rows are emitted commented out
# (the original author labelled this exclusion "AFP 3.x Spec").
my @EXCLUDED_RANGES = (
    [ 0x2000,  0x2FFF  ],
    [ 0xFE30,  0xFE4F  ],
    [ 0x2F800, 0x2FA1F ],
);

# Text that opens and closes one table row: a live row, and a row that is
# wrapped in a C comment.
my @LIVE_ROW     = (" { ", " }, ");
my @DISABLED_ROW = ("/*{ ", " },*/ ");

die "usage: $0 UnicodeData.txt > precompose.h\n" if @ARGV < 1;
my $unicode_data = $ARGV[0];

# read ---------------------------------------------------------------------

# One hash per emitted row: code, base, comb (numeric code points),
# live (false when the row is commented out) and text (the formatted line).
my @bmp_rows = ();
my @sp_rows  = ();

open(my $in, '<', $unicode_data)
    or die "$0: cannot open $unicode_data: $!\n";

while (my $line = <$in>) {
    chomp $line;

    # Fields 0, 1 and 5 of a record: code point, name, decomposition mapping.
    my ($code_hex, $name, $mapping) = (split(/;/, $line))[0, 1, 5];
    next unless defined $mapping;

    # Skip empty mappings, tagged ("<...>") mappings and single code points.
    next if $mapping eq "" || $mapping =~ /</ || $mapping !~ / /;

    # Only the first two code points of the mapping are used.
    my ($base_hex, $comb_hex) = $mapping =~ /\A([0-9A-Fa-f]+) ([0-9A-Fa-f]+)/;
    if (!defined $comb_hex) {
        warn sprintf("%s: %s line %d: unexpected decomposition \"%s\", skipped\n",
                     $0, $unicode_data, $., $mapping);
        next;
    }

    my $code     = hex($code_hex);
    my $base     = hex($base_hex);
    my $comb     = hex($comb_hex);
    my $excluded = in_excluded_range($code);

    if ($code > 0xFFFF) {
        my ($open, $close) = $excluded ? @DISABLED_ROW : @LIVE_ROW;
        push @sp_rows, {
            code => $code,
            base => $base,
            comb => $comb,
            live => !$excluded,
            text => sprintf("%s0x%04X%04X, 0x%04X%04X, 0x%04X%04X%s/* %s */\n",
                            $open,
                            surrogate_pair($code),
                            surrogate_pair($base),
                            surrogate_pair($comb),
                            $close, $name),
        };
    }

    # Every mapping also gets a row in the 32-bit table; it is live only
    # for a non-excluded code point that is not above U+FFFF.
    my $bmp_live = ($code <= 0xFFFF) && !$excluded;
    my ($open, $close) = $bmp_live ? @LIVE_ROW : @DISABLED_ROW;
    push @bmp_rows, {
        code => $code,
        base => $base,
        comb => $comb,
        live => $bmp_live,
        text => sprintf("%s0x%08X, 0x%08X, 0x%08X%s/* %s */\n",
                        $open, $code, $base, $comb, $close, $name),
    };
}

close($in) or die "$0: error reading $unicode_data: $!\n";

# macros for BMP (PRECOMP_COUNT, DECOMP_COUNT, MAXCOMBLEN) -----------------

my @bmp_live   = grep { $_->{live} } @bmp_rows;
my $comp_count = scalar @bmp_live;

# Hangul's maxcomblen is already 2. That is, VT.
my $maxcomblen = max_chain_length(\@bmp_live, 2, 1, 1);

# macros for SP (PRECOMP_SP_COUNT, DECOMP_SP_COUNT, MAXCOMBSPLEN) ----------

my @sp_live       = grep { $_->{live} } @sp_rows;
my $comp_sp_count = scalar @sp_live;

# One character takes 2 code units (a D8xx DCxx pair), so lengths here
# start at 2 and grow by 2.
my $maxcombsplen = max_chain_length(\@sp_live, 2, 2, 2);

# macro for buffer length (COMBBUFLEN) -------------------------------------

my $combbuflen = ($maxcomblen > $maxcombsplen) ? $maxcomblen : $maxcombsplen;

# print --------------------------------------------------------------------

print "/* DO NOT EDIT BY HAND!!! */\n";
print "/* This file is generated by */\n";
printf("/* contrib/shell_utils/make-precompose.h.pl %s */\n", $unicode_data);
print "\n";
printf("/* %s is got from */\n", $unicode_data);
print "/* http://www.unicode.org/Public/UNIDATA/UnicodeData.txt */\n";
print "\n";

print "#define SBASE 0xAC00\n";
print "#define LBASE 0x1100\n";
print "#define VBASE 0x1161\n";
print "#define TBASE 0x11A7\n";
print "#define LCOUNT 19\n";
print "#define VCOUNT 21\n";
print "#define TCOUNT 28\n";
print "#define NCOUNT 588 /* (VCOUNT * TCOUNT) */\n";
print "#define SCOUNT 11172 /* (LCOUNT * NCOUNT) */\n";
print "\n";

printf("#define PRECOMP_COUNT %d\n", $comp_count);
printf("#define DECOMP_COUNT %d\n", $comp_count);
printf("#define MAXCOMBLEN %d\n", $maxcomblen);
print "\n";
printf("#define PRECOMP_SP_COUNT %d\n", $comp_sp_count);
printf("#define DECOMP_SP_COUNT %d\n", $comp_sp_count);
printf("#define MAXCOMBSPLEN %d\n", $maxcombsplen);
print "\n";
printf("#define COMBBUFLEN %d /* max(MAXCOMBLEN,MAXCOMBSPLEN) */\n", $combbuflen);
print "\n";

print_table("precompositions",    "",    by_base_and_comb(@bmp_rows));
print_table("decompositions",     "",    by_code(@bmp_rows));
print_table("precompositions_sp", "_sp", by_base_and_comb(@sp_rows));
print_table("decompositions_sp",  "_sp", by_code(@sp_rows));

print "/* EOF */\n";

# Buffered write errors (disk full, closed pipe) only surface at close.
close(STDOUT) or die "$0: error writing output: $!\n";

exit 0;

# helpers ------------------------------------------------------------------

# Returns 1 when $code lies inside one of @EXCLUDED_RANGES, else 0.
sub in_excluded_range {
    my ($code) = @_;
    for my $range (@EXCLUDED_RANGES) {
        return 1 if $range->[0] <= $code && $code <= $range->[1];
    }
    return 0;
}

# Returns the (high, low) UTF-16 surrogate values for $code.
sub surrogate_pair {
    my ($code) = @_;
    my $high = 0xD800 - (0x10000 >> 10) + ($code >> 10);
    my $low  = 0xDC00 + ($code & 0x3FF);
    return ($high, $low);
}

# Returns the longest decomposition chain found among the live rows.
#
#   $live_rows  array ref of row hashes (only code and base are read)
#   $floor      smallest value ever returned
#   $start      length assigned to a row whose base does not decompose
#   $step       amount added each time the base decomposes once more
#
# A chain is followed by looking the base up as the code of another live
# row, repeatedly, until it is no longer found.
sub max_chain_length {
    my ($live_rows, $floor, $start, $step) = @_;

    # code point -> base; the first row wins if a code point repeats.
    my %base_of;
    for my $row (@$live_rows) {
        next if exists $base_of{ $row->{code} };
        $base_of{ $row->{code} } = $row->{base};
    }

    my $longest = $floor;
    for my $row (@$live_rows) {
        my $length = $start;
        my $base   = $row->{base};
        my %visited;
        while (exists $base_of{$base}) {
            # A loop in the data would otherwise never terminate.
            die sprintf("%s: decomposition cycle at U+%04X\n", $0, $base)
                if $visited{$base}++;
            $length += $step;
            $base = $base_of{$base};
        }
        $longest = $length if $length > $longest;
    }
    return $longest;
}

# Returns the rows ordered by base, then comb: the order the precomposition
# tables are printed in.  All values are printed as fixed-width upper-case
# hex, so this numeric order is also the byte order of the printed keys.
sub by_base_and_comb {
    my @rows = @_;
    return sort {
           $a->{base} <=> $b->{base}
        || $a->{comb} <=> $b->{comb}
        || $a->{text} cmp $b->{text}
    } @rows;
}

# Returns the rows ordered by replacement code point: the order the
# decomposition tables are printed in.
sub by_code {
    my @rows = @_;
    return sort {
           $a->{code} <=> $b->{code}
        || $a->{text} cmp $b->{text}
    } @rows;
}

# Prints one C table definition.
#
#   $table   name of the C array
#   $suffix  appended to the three struct member names ("" or "_sp")
#   @rows    row hashes, already in output order
sub print_table {
    my ($table, $suffix, @rows) = @_;

    print "static const struct {\n";
    print " unsigned int replacement" . $suffix . ";\n";
    print " unsigned int base" . $suffix . ";\n";
    print " unsigned int comb" . $suffix . ";\n";
    print "} " . $table . "[] = {\n";
    print map { $_->{text} } @rows;
    print "};\n";
    print "\n";
    return;
}

# EOF