1# !!!!!!!   DO NOT EDIT THIS FILE   !!!!!!!
2# This file is machine-generated by lib/unicore/mktables from the Unicode
3# database, Version 15.0.0.  Any changes made here will be lost!
4
5
6# !!!!!!!   INTERNAL PERL USE ONLY   !!!!!!!
7# This file is for internal use by core Perl only.  The format and even the
8# name or existence of this file are subject to change without notice.  Don't
9# use it directly.  Use Unicode::UCD to access the Unicode character data
10# base.
11
12
13
14package charnames;
15
16# This module contains machine-generated tables and code for the
17# algorithmically-determinable Unicode character names.  The following
18# routines can be used to translate between name and code point and vice versa
19
20{ # Closure
21
22    # Matches legal code point.  4-6 hex numbers, If there are 6, the first
23    # two must be 10; if there are 5, the first must not be a 0.  Written this
24    # way to decrease backtracking.  The first regex allows the code point to
25    # be at the end of a word, but to work properly, the word shouldn't end
26    # with a valid hex character.  The second one won't match a code point at
27    # the end of a word, and doesn't have the run-on issue
28    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
29    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;
30
31    # In the following hash, the keys are the bases of names which include
32    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
33    # of each key is another hash which is used to get the low and high ends
34    # for each range of code points that apply to the name.
35    my %names_ending_in_code_point = (
36'CJK COMPATIBILITY IDEOGRAPH' =>
37{
38'high' =>
39[
4064109,
4164217,
42195101,
43],
44'low' =>
45[
4663744,
4764112,
48194560,
49],
50},
51'CJK UNIFIED IDEOGRAPH' =>
52{
53'high' =>
54[
5519903,
5640959,
57173791,
58177977,
59178205,
60183969,
61191456,
62201546,
63205743,
64],
65'low' =>
66[
6713312,
6819968,
69131072,
70173824,
71177984,
72178208,
73183984,
74196608,
75201552,
76],
77},
78'KHITAN SMALL SCRIPT CHARACTER' =>
79{
80'high' =>
81[
82101589,
83],
84'low' =>
85[
86101120,
87],
88},
89'NUSHU CHARACTER' =>
90{
91'high' =>
92[
93111355,
94],
95'low' =>
96[
97110960,
98],
99},
100'TANGUT IDEOGRAPH' =>
101{
102'high' =>
103[
104100343,
105],
106'low' =>
107[
10894208,
109],
110},
111'TANGUT IDEOGRAPH SUPPLEMENT' =>
112{
113'high' =>
114[
115101640,
116],
117'low' =>
118[
119101632,
120],
121},
122
123    );
124
125    # The following hash is a copy of the previous one, except is for loose
126    # matching, so each name has blanks and dashes squeezed out
127    my %loose_names_ending_in_code_point = (
128'CJKCOMPATIBILITYIDEOGRAPH' =>
129{
130'high' =>
131[
13264109,
13364217,
134195101,
135],
136'low' =>
137[
13863744,
13964112,
140194560,
141],
142},
143'CJKUNIFIEDIDEOGRAPH' =>
144{
145'high' =>
146[
14719903,
14840959,
149173791,
150177977,
151178205,
152183969,
153191456,
154201546,
155205743,
156],
157'low' =>
158[
15913312,
16019968,
161131072,
162173824,
163177984,
164178208,
165183984,
166196608,
167201552,
168],
169},
170'KHITANSMALLSCRIPTCHARACTER' =>
171{
172'high' =>
173[
174101589,
175],
176'low' =>
177[
178101120,
179],
180},
181'NUSHUCHARACTER' =>
182{
183'high' =>
184[
185111355,
186],
187'low' =>
188[
189110960,
190],
191},
192'TANGUTIDEOGRAPH' =>
193{
194'high' =>
195[
196100343,
197],
198'low' =>
199[
20094208,
201],
202},
203'TANGUTIDEOGRAPHSUPPLEMENT' =>
204{
205'high' =>
206[
207101640,
208],
209'low' =>
210[
211101632,
212],
213},
214
215    );
216
217    # And the following array gives the inverse mapping from code points to
218    # names.  Lowest code points are first
219    @code_points_ending_in_code_point = (
220
221{
222'high' => 19903,
223'legal' =>
224'
225 -0123456789ABCDEFGHIJKNOPRU',
226'low' => 13312,
227'name' => 'CJK UNIFIED IDEOGRAPH',
228},
229{
230'high' => 40959,
231'legal' =>
232'
233 -0123456789ABCDEFGHIJKNOPRU',
234'low' => 19968,
235'name' => 'CJK UNIFIED IDEOGRAPH',
236},
237{
238'high' => 64109,
239'legal' =>
240'
241 -0123456789ABCDEFGHIJKLMOPRTY',
242'low' => 63744,
243'name' => 'CJK COMPATIBILITY IDEOGRAPH',
244},
245{
246'high' => 64217,
247'legal' =>
248'
249 -0123456789ABCDEFGHIJKLMOPRTY',
250'low' => 64112,
251'name' => 'CJK COMPATIBILITY IDEOGRAPH',
252},
253{
254'high' => 100343,
255'legal' =>
256'
257 -0123456789ABCDEFGHINOPRTU',
258'low' => 94208,
259'name' => 'TANGUT IDEOGRAPH',
260},
261{
262'high' => 101589,
263'legal' =>
264'
265 -0123456789ABCDEFHIKLMNPRST',
266'low' => 101120,
267'name' => 'KHITAN SMALL SCRIPT CHARACTER',
268},
269{
270'high' => 101640,
271'legal' =>
272'
273 -0123456789ABCDEFGHILMNOPRSTU',
274'low' => 101632,
275'name' => 'TANGUT IDEOGRAPH SUPPLEMENT',
276},
277{
278'high' => 111355,
279'legal' =>
280'
281 -0123456789ABCDEFHNRSTU',
282'low' => 110960,
283'name' => 'NUSHU CHARACTER',
284},
285{
286'high' => 173791,
287'legal' =>
288'
289 -0123456789ABCDEFGHIJKNOPRU',
290'low' => 131072,
291'name' => 'CJK UNIFIED IDEOGRAPH',
292},
293{
294'high' => 177977,
295'legal' =>
296'
297 -0123456789ABCDEFGHIJKNOPRU',
298'low' => 173824,
299'name' => 'CJK UNIFIED IDEOGRAPH',
300},
301{
302'high' => 178205,
303'legal' =>
304'
305 -0123456789ABCDEFGHIJKNOPRU',
306'low' => 177984,
307'name' => 'CJK UNIFIED IDEOGRAPH',
308},
309{
310'high' => 183969,
311'legal' =>
312'
313 -0123456789ABCDEFGHIJKNOPRU',
314'low' => 178208,
315'name' => 'CJK UNIFIED IDEOGRAPH',
316},
317{
318'high' => 191456,
319'legal' =>
320'
321 -0123456789ABCDEFGHIJKNOPRU',
322'low' => 183984,
323'name' => 'CJK UNIFIED IDEOGRAPH',
324},
325{
326'high' => 195101,
327'legal' =>
328'
329 -0123456789ABCDEFGHIJKLMOPRTY',
330'low' => 194560,
331'name' => 'CJK COMPATIBILITY IDEOGRAPH',
332},
333{
334'high' => 201546,
335'legal' =>
336'
337 -0123456789ABCDEFGHIJKNOPRU',
338'low' => 196608,
339'name' => 'CJK UNIFIED IDEOGRAPH',
340},
341{
342'high' => 205743,
343'legal' =>
344'
345 -0123456789ABCDEFGHIJKNOPRU',
346'low' => 201552,
347'name' => 'CJK UNIFIED IDEOGRAPH',
348},
349,
350
351    );
352
353    # Is exportable, make read-only
354    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);
355
356    # Convert from code point to Jamo short name for use in composing Hangul
357    # syllable names
358    my %Jamo = (
3594352 => 'G',
3604353 => 'GG',
3614354 => 'N',
3624355 => 'D',
3634356 => 'DD',
3644357 => 'R',
3654358 => 'M',
3664359 => 'B',
3674360 => 'BB',
3684361 => 'S',
3694362 => 'SS',
3704363 => '',
3714364 => 'J',
3724365 => 'JJ',
3734366 => 'C',
3744367 => 'K',
3754368 => 'T',
3764369 => 'P',
3774370 => 'H',
3784449 => 'A',
3794450 => 'AE',
3804451 => 'YA',
3814452 => 'YAE',
3824453 => 'EO',
3834454 => 'E',
3844455 => 'YEO',
3854456 => 'YE',
3864457 => 'O',
3874458 => 'WA',
3884459 => 'WAE',
3894460 => 'OE',
3904461 => 'YO',
3914462 => 'U',
3924463 => 'WEO',
3934464 => 'WE',
3944465 => 'WI',
3954466 => 'YU',
3964467 => 'EU',
3974468 => 'YI',
3984469 => 'I',
3994520 => 'G',
4004521 => 'GG',
4014522 => 'GS',
4024523 => 'N',
4034524 => 'NJ',
4044525 => 'NH',
4054526 => 'D',
4064527 => 'L',
4074528 => 'LG',
4084529 => 'LM',
4094530 => 'LB',
4104531 => 'LS',
4114532 => 'LT',
4124533 => 'LP',
4134534 => 'LH',
4144535 => 'M',
4154536 => 'B',
4164537 => 'BS',
4174538 => 'S',
4184539 => 'SS',
4194540 => 'NG',
4204541 => 'J',
4214542 => 'C',
4224543 => 'K',
4234544 => 'T',
4244545 => 'P',
4254546 => 'H',
426
427    );
428
429    # Leading consonant (can be null)
430    my %Jamo_L = (
431'' => 11,
432'B' => 7,
433'BB' => 8,
434'C' => 14,
435'D' => 3,
436'DD' => 4,
437'G' => 0,
438'GG' => 1,
439'H' => 18,
440'J' => 12,
441'JJ' => 13,
442'K' => 15,
443'M' => 6,
444'N' => 2,
445'P' => 17,
446'R' => 5,
447'S' => 9,
448'SS' => 10,
449'T' => 16,
450
451    );
452
453    # Vowel
454    my %Jamo_V = (
455'A' => 0,
456'AE' => 1,
457'E' => 5,
458'EO' => 4,
459'EU' => 18,
460'I' => 20,
461'O' => 8,
462'OE' => 11,
463'U' => 13,
464'WA' => 9,
465'WAE' => 10,
466'WE' => 15,
467'WEO' => 14,
468'WI' => 16,
469'YA' => 2,
470'YAE' => 3,
471'YE' => 7,
472'YEO' => 6,
473'YI' => 19,
474'YO' => 12,
475'YU' => 17,
476
477    );
478
479    # Optional trailing consonant
480    my %Jamo_T = (
481'B' => 17,
482'BS' => 18,
483'C' => 23,
484'D' => 7,
485'G' => 1,
486'GG' => 2,
487'GS' => 3,
488'H' => 27,
489'J' => 22,
490'K' => 24,
491'L' => 8,
492'LB' => 11,
493'LG' => 9,
494'LH' => 15,
495'LM' => 10,
496'LP' => 14,
497'LS' => 12,
498'LT' => 13,
499'M' => 16,
500'N' => 4,
501'NG' => 21,
502'NH' => 6,
503'NJ' => 5,
504'P' => 26,
505'S' => 19,
506'SS' => 20,
507'T' => 25,
508
509    );
510
511    # Computed re that splits up a Hangul name into LVT or LV syllables
512    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;
513
514    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
515    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
516
517    # These constants names and values were taken from the Unicode standard,
518    # version 5.1, section 3.12.  They are used in conjunction with Hangul
519    # syllables
520    my $SBase = 0xAC00;
521    my $LBase = 0x1100;
522    my $VBase = 0x1161;
523    my $TBase = 0x11A7;
524    my $SCount = 11172;
525    my $LCount = 19;
526    my $VCount = 21;
527    my $TCount = 28;
528    my $NCount = $VCount * $TCount;
529
530    sub name_to_code_point_special {
531        my ($name, $loose) = @_;
532
533        # Returns undef if not one of the specially handled names; otherwise
534        # returns the code point equivalent to the input name
535        # $loose is non-zero if to use loose matching, 'name' in that case
536        # must be input as upper case with all blanks and dashes squeezed out.
537
538        if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
539            || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
540        {
541            return if $name !~ qr/^$syllable_re$/;
542            my $L = $Jamo_L{$1};
543            my $V = $Jamo_V{$2};
544            my $T = (defined $3) ? $Jamo_T{$3} : 0;
545            return ($L * $VCount + $V) * $TCount + $T + $SBase;
546        }
547
548        # Name must end in 'code_point' for this to handle.
549        return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
550                   || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
551
552        my $base = $1;
553        my $code_point = CORE::hex $2;
554        my $names_ref;
555
556        if ($loose) {
557            $names_ref = \%loose_names_ending_in_code_point;
558        }
559        else {
560            return if $base !~ s/-$//;
561            $names_ref = \%names_ending_in_code_point;
562        }
563
564        # Name must be one of the ones which has the code point in it.
565        return if ! $names_ref->{$base};
566
567        # Look through the list of ranges that apply to this name to see if
568        # the code point is in one of them.
569        for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
570            return if $names_ref->{$base}{'low'}->[$i] > $code_point;
571            next if $names_ref->{$base}{'high'}->[$i] < $code_point;
572
573            # Here, the code point is in the range.
574            return $code_point;
575        }
576
577        # Here, looked like the name had a code point number in it, but
578        # did not match one of the valid ones.
579        return;
580    }
581
582    sub code_point_to_name_special {
583        my $code_point = shift;
584
585        # Returns the name of a code point if algorithmically determinable;
586        # undef if not
587
588        # If in the Hangul range, calculate the name based on Unicode's
589        # algorithm
590        if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
591            use integer;
592            my $SIndex = $code_point - $SBase;
593            my $L = $LBase + $SIndex / $NCount;
594            my $V = $VBase + ($SIndex % $NCount) / $TCount;
595            my $T = $TBase + $SIndex % $TCount;
596            $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
597            $name .= $Jamo{$T} if $T != $TBase;
598            return $name;
599        }
600
601        # Look through list of these code points for one in range.
602        foreach my $hash (@code_points_ending_in_code_point) {
603            return if $code_point < $hash->{'low'};
604            if ($code_point <= $hash->{'high'}) {
605                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
606            }
607        }
608        return;            # None found
609    }
610} # End closure
611
6121;
613