# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is machine-generated by lib/unicore/mktables from the Unicode
# database, Version 15.0.0.  Any changes made here will be lost!
# (Any fix applied here must therefore also be made in lib/unicore/mktables.)


# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
# This file is for internal use by core Perl only.  The format and even the
# name or existence of this file are subject to change without notice.  Don't
# use it directly.  Use Unicode::UCD to access the Unicode character data
# base.



package charnames;

# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The following
# routines can be used to translate between name and code point and vice
# versa.

{ # Closure

    # Strictures are scoped to this closure so that they cannot leak into
    # any code that textually follows it.
    use strict;
    use warnings;

    # Matches a legal code point: 4-6 hex digits.  If there are 6, the first
    # two must be 10; if there are 5, the first must not be a 0.  Written
    # this way to decrease backtracking.  The first regex allows the code
    # point to be at the end of a word, but to work properly, the word
    # shouldn't end with a valid hex character.  The second one won't match
    # a code point at the end of a word, and doesn't have the run-on issue.
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # In the following hash, the keys are the bases of names which include
    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
    # of each key is another hash which is used to get the low and high ends
    # for each range of code points that apply to the name.  The 'low' and
    # 'high' arrays are parallel and sorted in ascending order.
    my %names_ending_in_code_point = (
        'CJK COMPATIBILITY IDEOGRAPH' => {
            'high' => [ 64109, 64217, 195101 ],
            'low'  => [ 63744, 64112, 194560 ],
        },
        'CJK UNIFIED IDEOGRAPH' => {
            'high' => [
                19903,
                40959,
                173791,
                177977,
                178205,
                183969,
                191456,
                201546,
                205743,
            ],
            'low' => [
                13312,
                19968,
                131072,
                173824,
                177984,
                178208,
                183984,
                196608,
                201552,
            ],
        },
        'KHITAN SMALL SCRIPT CHARACTER' => {
            'high' => [ 101589 ],
            'low'  => [ 101120 ],
        },
        'NUSHU CHARACTER' => {
            'high' => [ 111355 ],
            'low'  => [ 110960 ],
        },
        'TANGUT IDEOGRAPH' => {
            'high' => [ 100343 ],
            'low'  => [ 94208 ],
        },
        'TANGUT IDEOGRAPH SUPPLEMENT' => {
            'high' => [ 101640 ],
            'low'  => [ 101632 ],
        },
    );

    # The following hash is a copy of the previous one, except is for loose
    # matching, so each name has blanks and dashes squeezed out.
    my %loose_names_ending_in_code_point = (
        'CJKCOMPATIBILITYIDEOGRAPH' => {
            'high' => [ 64109, 64217, 195101 ],
            'low'  => [ 63744, 64112, 194560 ],
        },
        'CJKUNIFIEDIDEOGRAPH' => {
            'high' => [
                19903,
                40959,
                173791,
                177977,
                178205,
                183969,
                191456,
                201546,
                205743,
            ],
            'low' => [
                13312,
                19968,
                131072,
                173824,
                177984,
                178208,
                183984,
                196608,
                201552,
            ],
        },
        'KHITANSMALLSCRIPTCHARACTER' => {
            'high' => [ 101589 ],
            'low'  => [ 101120 ],
        },
        'NUSHUCHARACTER' => {
            'high' => [ 111355 ],
            'low'  => [ 110960 ],
        },
        'TANGUTIDEOGRAPH' => {
            'high' => [ 100343 ],
            'low'  => [ 94208 ],
        },
        'TANGUTIDEOGRAPHSUPPLEMENT' => {
            'high' => [ 101640 ],
            'low'  => [ 101632 ],
        },
    );

    # The 'legal' value of each entry in the array below.  Each string
    # starts with a newline followed by a blank, and then the dash, digits
    # and letters.
    my $legal_cjk_unified   = "\n -0123456789ABCDEFGHIJKNOPRU";
    my $legal_cjk_compat    = "\n -0123456789ABCDEFGHIJKLMOPRTY";
    my $legal_tangut        = "\n -0123456789ABCDEFGHINOPRTU";
    my $legal_khitan        = "\n -0123456789ABCDEFHIKLMNPRST";
    my $legal_tangut_suppl  = "\n -0123456789ABCDEFGHILMNOPRSTU";
    my $legal_nushu         = "\n -0123456789ABCDEFHNRSTU";

    # And the following array gives the inverse mapping from code points to
    # names.  Lowest code points are first.  It is deliberately a package
    # variable (declared with 'our') so that it stays reachable from outside
    # this closure.
    our @code_points_ending_in_code_point = (
        {
            'high'  => 19903,
            'legal' => $legal_cjk_unified,
            'low'   => 13312,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 40959,
            'legal' => $legal_cjk_unified,
            'low'   => 19968,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 64109,
            'legal' => $legal_cjk_compat,
            'low'   => 63744,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
        },
        {
            'high'  => 64217,
            'legal' => $legal_cjk_compat,
            'low'   => 64112,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
        },
        {
            'high'  => 100343,
            'legal' => $legal_tangut,
            'low'   => 94208,
            'name'  => 'TANGUT IDEOGRAPH',
        },
        {
            'high'  => 101589,
            'legal' => $legal_khitan,
            'low'   => 101120,
            'name'  => 'KHITAN SMALL SCRIPT CHARACTER',
        },
        {
            'high'  => 101640,
            'legal' => $legal_tangut_suppl,
            'low'   => 101632,
            'name'  => 'TANGUT IDEOGRAPH SUPPLEMENT',
        },
        {
            'high'  => 111355,
            'legal' => $legal_nushu,
            'low'   => 110960,
            'name'  => 'NUSHU CHARACTER',
        },
        {
            'high'  => 173791,
            'legal' => $legal_cjk_unified,
            'low'   => 131072,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 177977,
            'legal' => $legal_cjk_unified,
            'low'   => 173824,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 178205,
            'legal' => $legal_cjk_unified,
            'low'   => 177984,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 183969,
            'legal' => $legal_cjk_unified,
            'low'   => 178208,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 191456,
            'legal' => $legal_cjk_unified,
            'low'   => 183984,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 195101,
            'legal' => $legal_cjk_compat,
            'low'   => 194560,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
        },
        {
            'high'  => 201546,
            'legal' => $legal_cjk_unified,
            'low'   => 196608,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
        {
            'high'  => 205743,
            'legal' => $legal_cjk_unified,
            'low'   => 201552,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
        },
    );

    # Is exportable, make read-only
    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);

    # Convert from code point to Jamo short name for use in composing Hangul
    # syllable names
    my %Jamo = (
        # Leading consonants
        4352 => 'G',
        4353 => 'GG',
        4354 => 'N',
        4355 => 'D',
        4356 => 'DD',
        4357 => 'R',
        4358 => 'M',
        4359 => 'B',
        4360 => 'BB',
        4361 => 'S',
        4362 => 'SS',
        4363 => '',
        4364 => 'J',
        4365 => 'JJ',
        4366 => 'C',
        4367 => 'K',
        4368 => 'T',
        4369 => 'P',
        4370 => 'H',

        # Vowels
        4449 => 'A',
        4450 => 'AE',
        4451 => 'YA',
        4452 => 'YAE',
        4453 => 'EO',
        4454 => 'E',
        4455 => 'YEO',
        4456 => 'YE',
        4457 => 'O',
        4458 => 'WA',
        4459 => 'WAE',
        4460 => 'OE',
        4461 => 'YO',
        4462 => 'U',
        4463 => 'WEO',
        4464 => 'WE',
        4465 => 'WI',
        4466 => 'YU',
        4467 => 'EU',
        4468 => 'YI',
        4469 => 'I',

        # Trailing consonants
        4520 => 'G',
        4521 => 'GG',
        4522 => 'GS',
        4523 => 'N',
        4524 => 'NJ',
        4525 => 'NH',
        4526 => 'D',
        4527 => 'L',
        4528 => 'LG',
        4529 => 'LM',
        4530 => 'LB',
        4531 => 'LS',
        4532 => 'LT',
        4533 => 'LP',
        4534 => 'LH',
        4535 => 'M',
        4536 => 'B',
        4537 => 'BS',
        4538 => 'S',
        4539 => 'SS',
        4540 => 'NG',
        4541 => 'J',
        4542 => 'C',
        4543 => 'K',
        4544 => 'T',
        4545 => 'P',
        4546 => 'H',
    );

    # Leading consonant (can be null)
    my %Jamo_L = (
        ''   => 11,
        'B'  => 7,
        'BB' => 8,
        'C'  => 14,
        'D'  => 3,
        'DD' => 4,
        'G'  => 0,
        'GG' => 1,
        'H'  => 18,
        'J'  => 12,
        'JJ' => 13,
        'K'  => 15,
        'M'  => 6,
        'N'  => 2,
        'P'  => 17,
        'R'  => 5,
        'S'  => 9,
        'SS' => 10,
        'T'  => 16,
    );

    # Vowel
    my %Jamo_V = (
        'A'   => 0,
        'AE'  => 1,
        'E'   => 5,
        'EO'  => 4,
        'EU'  => 18,
        'I'   => 20,
        'O'   => 8,
        'OE'  => 11,
        'U'   => 13,
        'WA'  => 9,
        'WAE' => 10,
        'WE'  => 15,
        'WEO' => 14,
        'WI'  => 16,
        'YA'  => 2,
        'YAE' => 3,
        'YE'  => 7,
        'YEO' => 6,
        'YI'  => 19,
        'YO'  => 12,
        'YU'  => 17,
    );

    # Optional trailing consonant
    my %Jamo_T = (
        'B'  => 17,
        'BS' => 18,
        'C'  => 23,
        'D'  => 7,
        'G'  => 1,
        'GG' => 2,
        'GS' => 3,
        'H'  => 27,
        'J'  => 22,
        'K'  => 24,
        'L'  => 8,
        'LB' => 11,
        'LG' => 9,
        'LH' => 15,
        'LM' => 10,
        'LP' => 14,
        'LS' => 12,
        'LT' => 13,
        'M'  => 16,
        'N'  => 4,
        'NG' => 21,
        'NH' => 6,
        'NJ' => 5,
        'P'  => 26,
        'S'  => 19,
        'SS' => 20,
        'T'  => 25,
    );

    # Computed re that splits up a Hangul name into LVT or LV syllables
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";

    # These constant names and values were taken from the Unicode standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables.
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $SCount = 11172;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $NCount = $VCount * $TCount;

    # Translate an algorithmically-determinable character name into its code
    # point.
    #
    #   $name   the character name to look up
    #   $loose  true to use loose matching; $name must in that case be input
    #           as upper case with all blanks and dashes squeezed out
    #
    # Returns the code point equivalent to the input name, or nothing (undef
    # in scalar context) if the name is not one of the specially handled
    # ones.
    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # A Hangul syllable name must *begin* with the fixed prefix.  The
        # prefix is anchored so that text preceding it cannot be glued onto
        # the syllable part (e.g. "GHANGUL SYLLABLE A" must not be taken for
        # "HANGUL SYLLABLE GA").
        my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE
                                   : $HANGUL_SYLLABLE;
        if ($name =~ s/\A\Q$hangul_prefix\E//) {
            return if $name !~ /^$syllable_re$/;

            # The names L, V and T follow the Unicode standard's algorithm.
            my $L = $Jamo_L{$1};
            my $V = $Jamo_V{$2};
            my $T = (defined $3) ? $Jamo_T{$3} : 0;
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.  The captures
        # are copied immediately, before any later match can clobber them.
        my ($base, $hex_digits, $names_ref);
        if ($loose) {
            return if $name !~ /^ (.*?) ($run_on_code_point_re) $/x;
            ($base, $hex_digits) = ($1, $2);
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {
            return if $name !~ /^ (.*) ($code_point_re) $/x;
            ($base, $hex_digits) = ($1, $2);

            # In strict matching a dash must separate base and code point.
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }
        my $code_point = CORE::hex $hex_digits;

        # Name must be one of the ones which has the code point in it.
        my $ranges = $names_ref->{$base}
            or return;

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are in ascending
        # order, so the search can stop at the first one that starts above
        # the code point.
        for my $i (0 .. $#{ $ranges->{'low'} }) {
            return if $ranges->{'low'}[$i]  > $code_point;
            next   if $ranges->{'high'}[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    # Translate a code point into its algorithmically-determinable name.
    #
    #   $code_point  the numeric code point to look up
    #
    # Returns the name of the code point if algorithmically determinable;
    # nothing (undef in scalar context) if not.
    sub code_point_to_name_special {
        my $code_point = shift;

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount - 1) {
            use integer;    # The algorithm needs truncating division
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # Lexical, so that the result doesn't leak into a package
            # variable.
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
            $name .= $Jamo{$T} if $T != $TBase;
            return $name;
        }

        # Look through list of these code points for one in range.  The
        # list is in ascending order, so stop at the first range that starts
        # above the code point.
        foreach my $hash (@code_points_ending_in_code_point) {
            return if $code_point < $hash->{'low'};
            if ($code_point <= $hash->{'high'}) {
                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
            }
        }
        return;     # None found
    }
} # End closure

1;