#!/usr/bin/perl
#
# usage: make-casetable.pl <infile> <outfile1> <outfile2>
#        make-casetable.pl UnicodeData.txt utf16_casetable.h utf16_case.c
#
# (c) 2011 by HAT <hat@fa2.so-net.ne.jp>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#

# See
# http://www.unicode.org/reports/tr44/
# http://www.unicode.org/Public/UNIDATA/UnicodeData.txt

# One block has 64 chars.
#
# BMP
#  block     0 = dummy
#  block     1 = U+0000 - U+003F
#  block     2 = U+0040 - U+007F
#  .....
#  block  1024 = U+FFC0 - U+FFFF
#  block  1025 = dummy
#
# Surrogate Pair
#  block  1024 = dummy
#  block  1025 = U+010000 - U+01003F
#  block  1026 = U+010040 - U+01007F
#  .....
#  block 17408 = U+10FFC0 - U+10FFFF
#  block 17409 = dummy
#
# The dummy blocks exist only so that edge detection can look one block
# before the first and one block after the last real block.
# A block is "enabled" when it contains at least one character that has
# a mapping for the requested case.

use strict;
use warnings;

use constant BLOCK_SIZE      => 64;       # characters per block
use constant BMP_FIRST_BLOCK => 1;        # U+0000 - U+003F
use constant BMP_LAST_BLOCK  => 1024;     # U+FFC0 - U+FFFF
use constant SP_FIRST_BLOCK  => 1025;     # U+010000 - U+01003F
use constant SP_LAST_BLOCK   => 17408;    # U+10FFC0 - U+10FFFF

# 16 blocks * 64 chars = 1024 chars = the range covered by one high
# surrogate.  A surrogate-pair table must never cross such a group,
# because the generated C code indexes the table with (val - first pair)
# and pair values are not contiguous across a high-surrogate change.
use constant SP_GROUP_MASK => 0xF;

# 0-based UnicodeData.txt field that holds the simple mapping per case.
my %MAPPING_FIELD_OF = (
    upper => 12,    # Simple_Uppercase_Mapping
    lower => 13,    # Simple_Lowercase_Mapping
);

# Decoration lines around the comment of each generated C function.
my $RULE_OPEN  = "/*******************************************************************\n";
my $RULE_CLOSE = "*******************************************************************/\n";

if (@ARGV != 3) {
    die "usage: $0 <infile> <outfile1> <outfile2>\n";
}

my ($unicode_data_path, $header_path, $source_path) = @ARGV;

open(my $header_fh, '>', $header_path)
    or die "$0: cannot open '$header_path' for writing: $!\n";
open(my $source_fh, '>', $source_path)
    or die "$0: cannot open '$source_path' for writing: $!\n";

# Both generated files start with the same "do not edit" banner.
for my $out ($header_fh, $source_fh) {
    print  {$out} "/*\n";
    print  {$out} "DO NOT EDIT BY HAND!!!\n";
    print  {$out} "\n";
    print  {$out} "This file is generated by\n";
    printf {$out} " contrib/shell_utils/make-casetable.pl %s %s %s\n",
        $unicode_data_path, $header_path, $source_path;
    print  {$out} "\n";
    printf {$out} "%s is got from\n", $unicode_data_path;
    print  {$out} "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    print  {$out} "*/\n";
    print  {$out} "\n";
}

print  {$source_fh} "#include <stdint.h>\n";
print  {$source_fh} "#include <atalk/unicode.h>\n";
printf {$source_fh} "#include \"%s\"\n", $header_path;
print  {$source_fh} "\n";

make_array('upper');
make_array('lower');

print {$header_fh} "/* EOF */\n";
print {$source_fh} "/* EOF */\n";

# Buffered write errors only surface at close time, so check it.
close($header_fh) or die "$0: error closing '$header_path': $!\n";
close($source_fh) or die "$0: error closing '$source_path': $!\n";


###########################################################################
# make_array($case)
#
# Generate the lookup tables (written to the header file) and the
# to<case>_w() / to<case>_sp() C functions (written to the source file)
# for one case conversion.
#
#   $case - "upper" or "lower"; anything else is a fatal error.
#
# The Unicode data file named on the command line is read once per call.
sub make_array {
    my ($case) = @_;

    if (!defined $case || !exists $MAPPING_FIELD_OF{$case}) {
        die "$0: make_array: case must be 'upper' or 'lower'\n";
    }

    my ($entry_of, $bmp_enabled, $sp_enabled) = _read_case_mappings($case);

    _emit_bmp_tables($case, $entry_of, $bmp_enabled);
    _emit_sp_tables($case, $entry_of, $sp_enabled);

    return;
}

# Read the Unicode data file and collect every character that has a
# simple mapping for $case.
#
# Returns three references:
#   - hash:  code point => [ mapped code point, character name ]
#   - array: BMP block number => 1 if the block is enabled, else 0
#   - array: supplementary block number => 1 if enabled, else 0
#
# Only characters with a mapping are stored; identity entries are
# produced on the fly while printing, so no per-code-point table has to
# be preallocated.
sub _read_case_mappings {
    my ($case) = @_;

    my $field_index = $MAPPING_FIELD_OF{$case};

    my %entry_of;
    my @bmp_enabled = (0) x (BMP_LAST_BLOCK + 2);    # includes both dummies
    my @sp_enabled  = (0) x (SP_LAST_BLOCK + 2);     # index SP_LAST_BLOCK+1 is a dummy

    $bmp_enabled[1] = 1;    # ASCII block is forcibly included
    $bmp_enabled[2] = 1;    # in the array for Speed-Up.

    open(my $in, '<', $unicode_data_path)
        or die "$0: cannot open '$unicode_data_path' for reading: $!\n";

    while (my $line = <$in>) {
        chomp $line;

        # LIMIT -1 keeps trailing empty fields; without it the mapping
        # fields are undef on every line that ends in ";;;".
        my @fields = split /;/, $line, -1;

        my $mapping = $fields[$field_index];
        next if !defined $mapping || $mapping eq '';

        my $code = hex($fields[0]);

        # NOTE(review): a mapping that leaves the plane group of its source
        # (BMP <-> supplementary) would not fit the generated table types;
        # this is not checked here -- confirm the input never contains one.
        $entry_of{$code} = [ hex($mapping), $fields[1] ];

        my $block = ($code >> 6) + 1;    # 64 chars per block, block 0 is a dummy
        if ($code <= 0xFFFF) {
            $bmp_enabled[$block] = 1;
        }
        else {
            $sp_enabled[$block] = 1;
        }
    }

    close($in) or die "$0: error closing '$unicode_data_path': $!\n";

    return (\%entry_of, \@bmp_enabled, \@sp_enabled);
}

# Return (mapped code point, character name) for $char.  Characters
# without a mapping map to themselves and have an empty name.
sub _lookup {
    my ($entry_of, $char) = @_;

    my $entry = $entry_of->{$char};
    return $entry ? @{$entry} : ($char, '');
}

# Pack the UTF-16 surrogate pair of a code point into one 32-bit value:
# high surrogate in the upper 16 bits, low surrogate in the lower 16.
sub _surrogate_pair {
    my ($code) = @_;

    my $high = 0xD800 - (0x10000 >> 10) + ($code >> 10);
    my $low  = 0xDC00 + ($code & 0x3FF);

    return ($high << 16) + $low;
}

# Emit one uint16_t table per run of consecutive enabled BMP blocks and
# the to<case>_w() function that dispatches into those tables.
sub _emit_bmp_tables {
    my ($case, $entry_of, $enabled) = @_;

    print  {$source_fh} $RULE_OPEN;
    printf {$source_fh} " Convert a wide character to %s case.\n", $case;
    print  {$source_fh} $RULE_CLOSE;
    printf {$source_fh} "ucs2_t to%s_w(ucs2_t val)\n", $case;
    print  {$source_fh} "{\n";

    my $table_no = 1;
    my $block_start;

    for my $block (BMP_FIRST_BLOCK .. BMP_LAST_BLOCK) {
        next if !$enabled->[$block];

        # rising edge: previous block is disabled
        if (!$enabled->[$block - 1]) {
            $block_start = $block;
        }

        # falling edge: next block is disabled; otherwise the run goes on
        next if $enabled->[$block + 1];

        my $char_start = ($block_start - 1) * BLOCK_SIZE;
        my $char_end   = $block * BLOCK_SIZE - 1;

        printf {$header_fh} "static const uint16_t %s_table_%d[%d] = {\n",
            $case, $table_no, $char_end - $char_start + 1;

        for my $char ($char_start .. $char_end) {
            my ($mapped, $name) = _lookup($entry_of, $char);
            printf {$header_fh} " 0x%04X, /*U+%04X*/ /*%s*/\n",
                $mapped, $char, $name;
        }

        print {$header_fh} "};\n";
        print {$header_fh} "\n";

        if ($char_start == 0x0000) {
            # val is unsigned, so no lower bound check is needed
            printf {$source_fh} " if ( val <= 0x%04X)\n", $char_end;
            printf {$source_fh} " return %s_table_%d[val];\n",
                $case, $table_no;
        }
        else {
            printf {$source_fh} " if ( val >= 0x%04X && val <= 0x%04X)\n",
                $char_start, $char_end;
            printf {$source_fh} " return %s_table_%d[val-0x%04X];\n",
                $case, $table_no, $char_start;
        }
        print {$source_fh} "\n";

        $table_no++;
    }

    print {$source_fh} "\treturn (val);\n";
    print {$source_fh} "}\n";
    print {$source_fh} "\n";

    return;
}

# Emit one uint32_t table per run of consecutive enabled supplementary
# blocks and the to<case>_sp() function that dispatches into them.
# Runs are additionally split at every 16-block group boundary (see
# SP_GROUP_MASK).
sub _emit_sp_tables {
    my ($case, $entry_of, $enabled) = @_;

    print  {$source_fh} $RULE_OPEN;
    printf {$source_fh} " Convert a surrogate pair to %s case.\n", $case;
    print  {$source_fh} $RULE_CLOSE;
    printf {$source_fh} "uint32_t to%s_sp(uint32_t val)\n", $case;
    print  {$source_fh} "{\n";

    my $table_no = 1;
    my $block_start;

    for my $block (SP_FIRST_BLOCK .. SP_LAST_BLOCK) {
        next if !$enabled->[$block];

        # position of this block inside its 16-block group (0 .. 15)
        my $group_offset = ($block - 1) & SP_GROUP_MASK;

        # rising edge: previous block disabled, or first block of a group
        if (!$enabled->[$block - 1] || $group_offset == 0) {
            $block_start = $block;
        }

        # falling edge: last block of a group, or next block disabled.
        # The mask is applied before the comparison; "& 0xF == 0xF"
        # without parentheses tests only the lowest bit and closes a
        # table after every even block, producing overlapping tables.
        next if $group_offset != SP_GROUP_MASK && $enabled->[$block + 1];

        my $char_start = ($block_start - 1) * BLOCK_SIZE;
        my $char_end   = $block * BLOCK_SIZE - 1;

        printf {$header_fh} "static const uint32_t %s_table_sp_%d[%d] = {\n",
            $case, $table_no, $char_end - $char_start + 1;

        for my $char ($char_start .. $char_end) {
            my ($mapped, $name) = _lookup($entry_of, $char);
            printf {$header_fh}
                " 0x%08X, /*0x%08X*/ /*U+%06X*/ /*U+%06X*/ /*%s*/\n",
                _surrogate_pair($mapped), _surrogate_pair($char),
                $mapped, $char, $name;
        }

        print {$header_fh} "};\n";
        print {$header_fh} "\n";

        my $pair_start = _surrogate_pair($char_start);
        my $pair_end   = _surrogate_pair($char_end);

        printf {$source_fh} " if ( val >= 0x%08X && val <= 0x%08X)\n",
            $pair_start, $pair_end;
        printf {$source_fh} " return %s_table_sp_%d[val-0x%08X];\n",
            $case, $table_no, $pair_start;
        print  {$source_fh} "\n";

        $table_no++;
    }

    print {$source_fh} "\treturn (val);\n";
    print {$source_fh} "}\n";
    print {$source_fh} "\n";

    return;
}

# EOF