1package charnames;
2use strict;
3use warnings;
4use Carp;
5use File::Spec;
6our $VERSION = '1.03';
7
8use bytes ();		# for $bytes::hint_bits
9$charnames::hint_bits = 0x20000; # HINT_LOCALIZE_HH
10
11my %alias1 = (
12		# Icky 3.2 names with parentheses.
13		'LINE FEED'		=> 'LINE FEED (LF)',
14		'FORM FEED'		=> 'FORM FEED (FF)',
15		'CARRIAGE RETURN'	=> 'CARRIAGE RETURN (CR)',
16		'NEXT LINE'		=> 'NEXT LINE (NEL)',
17		# Convenience.
18		'LF'			=> 'LINE FEED (LF)',
19		'FF'			=> 'FORM FEED (FF)',
20		'CR'			=> 'CARRIAGE RETURN (CR)',
21		'NEL'			=> 'NEXT LINE (NEL)',
22	        # More convenience.  For futher convencience,
23	        # it is suggested some way using using the NamesList
24		# aliases is implemented.
25	        'ZWNJ'			=> 'ZERO WIDTH NON-JOINER',
26	        'ZWJ'			=> 'ZERO WIDTH JOINER',
27		'BOM'			=> 'BYTE ORDER MARK',
28	    );
29
30my %alias2 = (
31		# Pre-3.2 compatibility (only for the first 256 characters).
32		'HORIZONTAL TABULATION'	=> 'CHARACTER TABULATION',
33		'VERTICAL TABULATION'	=> 'LINE TABULATION',
34		'FILE SEPARATOR'	=> 'INFORMATION SEPARATOR FOUR',
35		'GROUP SEPARATOR'	=> 'INFORMATION SEPARATOR THREE',
36		'RECORD SEPARATOR'	=> 'INFORMATION SEPARATOR TWO',
37		'UNIT SEPARATOR'	=> 'INFORMATION SEPARATOR ONE',
38		'PARTIAL LINE DOWN'	=> 'PARTIAL LINE FORWARD',
39		'PARTIAL LINE UP'	=> 'PARTIAL LINE BACKWARD',
40	    );
41
42my %alias3 = (
43		# User defined aliasses. Even more convenient :)
44	    );
45my $txt;
46
47sub alias (@)
48{
49  @_ or return %alias3;
50  my $alias = ref $_[0] ? $_[0] : { @_ };
51  @alias3{keys %$alias} = values %$alias;
52} # alias
53
54sub alias_file ($)
55{
56  my ($arg, $file) = @_;
57  if (-f $arg && File::Spec->file_name_is_absolute ($arg)) {
58    $file = $arg;
59  }
60  elsif ($arg =~ m/^\w+$/) {
61    $file = "unicore/${arg}_alias.pl";
62  }
63  else {
64    croak "Charnames alias files can only have identifier characters";
65  }
66  if (my @alias = do $file) {
67    @alias == 1 && !defined $alias[0] and
68      croak "$file cannot be used as alias file for charnames";
69    @alias % 2 and
70      croak "$file did not return a (valid) list of alias pairs";
71    alias (@alias);
72    return (1);
73  }
74  0;
75} # alias_file
76
77# This is not optimized in any way yet
78sub charnames
79{
80  my $name = shift;
81
82  if (exists $alias1{$name}) {
83    $name = $alias1{$name};
84  }
85  elsif (exists $alias2{$name}) {
86    require warnings;
87    warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
88    $name = $alias2{$name};
89  }
90  elsif (exists $alias3{$name}) {
91    $name = $alias3{$name};
92  }
93
94  my $ord;
95  my @off;
96  my $fname;
97
98  if ($name eq "BYTE ORDER MARK") {
99    $fname = $name;
100    $ord = 0xFEFF;
101  } else {
102    ## Suck in the code/name list as a big string.
103    ## Lines look like:
104    ##     "0052\t\tLATIN CAPITAL LETTER R\n"
105    $txt = do "unicore/Name.pl" unless $txt;
106
107    ## @off will hold the index into the code/name string of the start and
108    ## end of the name as we find it.
109
110    ## If :full, look for the name exactly
111    if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
112      @off = ($-[0], $+[0]);
113    }
114
115    ## If we didn't get above, and :short allowed, look for the short name.
116    ## The short name is like "greek:Sigma"
117    unless (@off) {
118      if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
119	my ($script, $cname) = ($1, $2);
120	my $case = $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
121	if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
122	  @off = ($-[0], $+[0]);
123	}
124      }
125    }
126
127    ## If we still don't have it, check for the name among the loaded
128    ## scripts.
129    if (not @off) {
130      my $case = $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL";
131      for my $script (@{$^H{charnames_scripts}}) {
132	if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
133	  @off = ($-[0], $+[0]);
134	  last;
135	}
136      }
137    }
138
139    ## If we don't have it by now, give up.
140    unless (@off) {
141      carp "Unknown charname '$name'";
142      return "\x{FFFD}";
143    }
144
145    ##
146    ## Now know where in the string the name starts.
147    ## The code, in hex, is before that.
148    ##
149    ## The code can be 4-6 characters long, so we've got to sort of
150    ## go look for it, just after the newline that comes before $off[0].
151    ##
152    ## This would be much easier if unicore/Name.pl had info in
153    ## a name/code order, instead of code/name order.
154    ##
155    ## The +1 after the rindex() is to skip past the newline we're finding,
156    ## or, if the rindex() fails, to put us to an offset of zero.
157    ##
158    my $hexstart = rindex($txt, "\n", $off[0]) + 1;
159
160    ## we know where it starts, so turn into number -
161    ## the ordinal for the char.
162    $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
163  }
164
165  if ($^H & $bytes::hint_bits) {	# "use bytes" in effect?
166    use bytes;
167    return chr $ord if $ord <= 255;
168    my $hex = sprintf "%04x", $ord;
169    if (not defined $fname) {
170      $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
171    }
172    croak "Character 0x$hex with name '$fname' is above 0xFF";
173  }
174
175  no warnings 'utf8'; # allow even illegal characters
176  return pack "U", $ord;
177} # charnames
178
179sub import
180{
181  shift; ## ignore class name
182
183  if (not @_) {
184    carp("`use charnames' needs explicit imports list");
185  }
186  $^H |= $charnames::hint_bits;
187  $^H{charnames} = \&charnames ;
188
189  ##
190  ## fill %h keys with our @_ args.
191  ##
192  my ($promote, %h, @args) = (0);
193  while (@_ and $_ = shift) {
194    if ($_ eq ":alias") {
195      @_ or
196	croak ":alias needs an argument in charnames";
197      my $alias = shift;
198      if (ref $alias) {
199	ref $alias eq "HASH" or
200	  croak "Only HASH reference supported as argument to :alias";
201	alias ($alias);
202	next;
203      }
204      if ($alias =~ m{:(\w+)$}) {
205	$1 eq "full" || $1 eq "short" and
206	  croak ":alias cannot use existing pragma :$1 (reversed order?)";
207	alias_file ($1) and $promote = 1;
208	next;
209      }
210      alias_file ($alias);
211      next;
212    }
213    if (m/^:/ and ! ($_ eq ":full" || $_ eq ":short")) {
214      warn "unsupported special '$_' in charnames";
215      next;
216    }
217    push @args, $_;
218  }
219  @args == 0 && $promote and @args = (":full");
220  @h{@args} = (1) x @args;
221
222  $^H{charnames_full} = delete $h{':full'};
223  $^H{charnames_short} = delete $h{':short'};
224  $^H{charnames_scripts} = [map uc, keys %h];
225
226  ##
227  ## If utf8? warnings are enabled, and some scripts were given,
228  ## see if at least we can find one letter of each script.
229  ##
230  if (warnings::enabled('utf8') && @{$^H{charnames_scripts}}) {
231    $txt = do "unicore/Name.pl" unless $txt;
232
233    for my $script (@{$^H{charnames_scripts}}) {
234      if (not $txt =~ m/\t\t$script (?:CAPITAL |SMALL )?LETTER /) {
235	warnings::warn('utf8',  "No such script: '$script'");
236      }
237    }
238  }
239} # import
240
241# this comes actually from Unicode::UCD, but it avoids the
242# overhead of loading it
243sub _getcode {
244    my $arg = shift;
245
246    if ($arg =~ /^[1-9]\d*$/) {
247	return $arg;
248    } elsif ($arg =~ /^(?:[Uu]\+|0[xX])?([[:xdigit:]]+)$/) {
249	return hex($1);
250    }
251
252    return;
253}
254
255my %viacode;
256
257sub viacode
258{
259  if (@_ != 1) {
260    carp "charnames::viacode() expects one argument";
261    return ()
262  }
263
264  my $arg = shift;
265  my $code = _getcode($arg);
266
267  my $hex;
268
269  if (defined $code) {
270    $hex = sprintf "%04X", $arg;
271  } else {
272    carp("unexpected arg \"$arg\" to charnames::viacode()");
273    return;
274  }
275
276  if ($code > 0x10FFFF) {
277    carp sprintf "Unicode characters only allocated up to U+10FFFF (you asked for U+%X)", $hex;
278    return;
279  }
280
281  return $viacode{$hex} if exists $viacode{$hex};
282
283  $txt = do "unicore/Name.pl" unless $txt;
284
285  if ($txt =~ m/^$hex\t\t(.+)/m) {
286    return $viacode{$hex} = $1;
287  } else {
288    return;
289  }
290} # viacode
291
292my %vianame;
293
294sub vianame
295{
296  if (@_ != 1) {
297    carp "charnames::vianame() expects one name argument";
298    return ()
299  }
300
301  my $arg = shift;
302
303  return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
304
305  return $vianame{$arg} if exists $vianame{$arg};
306
307  $txt = do "unicore/Name.pl" unless $txt;
308
309  my $pos = index $txt, "\t\t$arg\n";
310  if ($[ <= $pos) {
311    my $posLF = rindex $txt, "\n", $pos;
312    (my $code = substr $txt, $posLF + 1, 6) =~ tr/\t//d;
313    return $vianame{$arg} = hex $code;
314
315    # If $pos is at the 1st line, $posLF must be $[ - 1 (not found);
316    # then $posLF + 1 equals to $[ (at the beginning of $txt).
317    # Otherwise $posLF is the position of "\n";
318    # then $posLF + 1 must be the position of the next to "\n"
319    # (the beginning of the line).
320    # substr($txt, $posLF + 1, 6) may be "0000\t\t", "00A1\t\t",
321    # "10300\t", "100000", etc. So we can get the code via removing TAB.
322  } else {
323    return;
324  }
325} # vianame
326
327
3281;
329__END__
330
331=head1 NAME
332
333charnames - define character names for C<\N{named}> string literal escapes
334
335=head1 SYNOPSIS
336
337  use charnames ':full';
338  print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
339
340  use charnames ':short';
341  print "\N{greek:Sigma} is an upper-case sigma.\n";
342
343  use charnames qw(cyrillic greek);
344  print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
345
346  use charnames ":full", ":alias" => {
347    e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
348  };
349  print "\N{e_ACUTE} is a small letter e with an acute.\n";
350
351  use charnames ();
352  print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
353  printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
354
355=head1 DESCRIPTION
356
357Pragma C<use charnames> supports arguments C<:full>, C<:short>, script
358names and customized aliases.  If C<:full> is present, for expansion of
359C<\N{CHARNAME}>, the string C<CHARNAME> is first looked up in the list of
360standard Unicode character names.  If C<:short> is present, and
361C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
362as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
363with script name arguments, then for C<\N{CHARNAME}> the name
364C<CHARNAME> is looked up as a letter in the given scripts (in the
365specified order). Customized aliases are explained in L</CUSTOM ALIASES>.
366
367For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
368this pragma looks for the names
369
370  SCRIPTNAME CAPITAL LETTER CHARNAME
371  SCRIPTNAME SMALL LETTER CHARNAME
372  SCRIPTNAME LETTER CHARNAME
373
374in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
375then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
376is ignored.
377
378Note that C<\N{...}> is compile-time, it's a special form of string
379constant used inside double-quoted strings: in other words, you cannot
380use variables inside the C<\N{...}>.  If you want similar run-time
381functionality, use charnames::vianame().
382
383For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
384as of Unicode 3.1, there are no official Unicode names but you can use
385instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth).  In
386Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
387has been updated, see L</ALIASES>.  Also note that the U+UU80, U+0081,
388U+0084, and U+0099 do not have names even in ISO 6429.
389
390Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
391is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
392
393=head1 CUSTOM TRANSLATORS
394
395The mechanism of translation of C<\N{...}> escapes is general and not
396hardwired into F<charnames.pm>.  A module can install custom
397translations (inside the scope which C<use>s the module) with the
398following magic incantation:
399
400    use charnames ();		# for $charnames::hint_bits
401    sub import {
402	shift;
403	$^H |= $charnames::hint_bits;
404	$^H{charnames} = \&translator;
405    }
406
407Here translator() is a subroutine which takes C<CHARNAME> as an
408argument, and returns text to insert into the string instead of the
409C<\N{CHARNAME}> escape.  Since the text to insert should be different
410in C<bytes> mode and out of it, the function should check the current
411state of C<bytes>-flag as in:
412
413    use bytes ();			# for $bytes::hint_bits
414    sub translator {
415	if ($^H & $bytes::hint_bits) {
416	    return bytes_translator(@_);
417	}
418	else {
419	    return utf8_translator(@_);
420	}
421    }
422
423=head1 CUSTOM ALIASES
424
425This version of charnames supports three mechanisms of adding local
426or customized aliases to standard Unicode naming conventions (:full)
427
428=head2 Anonymous hashes
429
430    use charnames ":full", ":alias" => {
431        e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
432        };
433    my $str = "\N{e_ACUTE}";
434
435=head2 Alias file
436
437    use charnames ":full", ":alias" => "pro";
438
439    will try to read "unicore/pro_alias.pl" from the @INC path. This
440    file should return a list in plain perl:
441
442    (
443    A_GRAVE         => "LATIN CAPITAL LETTER A WITH GRAVE",
444    A_CIRCUM        => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
445    A_DIAERES       => "LATIN CAPITAL LETTER A WITH DIAERESIS",
446    A_TILDE         => "LATIN CAPITAL LETTER A WITH TILDE",
447    A_BREVE         => "LATIN CAPITAL LETTER A WITH BREVE",
448    A_RING          => "LATIN CAPITAL LETTER A WITH RING ABOVE",
449    A_MACRON        => "LATIN CAPITAL LETTER A WITH MACRON",
450    );
451
452=head2 Alias shortcut
453
454    use charnames ":alias" => ":pro";
455
456    works exactly the same as the alias pairs, only this time,
457    ":full" is inserted automatically as first argument (if no
458    other argument is given).
459
460=head1 charnames::viacode(code)
461
462Returns the full name of the character indicated by the numeric code.
463The example
464
465    print charnames::viacode(0x2722);
466
467prints "FOUR TEARDROP-SPOKED ASTERISK".
468
469Returns undef if no name is known for the code.
470
471This works only for the standard names, and does not yet apply
472to custom translators.
473
474Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
475SPACE", not "BYTE ORDER MARK".
476
477=head1 charnames::vianame(name)
478
479Returns the code point indicated by the name.
480The example
481
482    printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
483
484prints "2722".
485
486Returns undef if the name is unknown.
487
488This works only for the standard names, and does not yet apply
489to custom translators.
490
491=head1 ALIASES
492
493A few aliases have been defined for convenience: instead of having
494to use the official names
495
496    LINE FEED (LF)
497    FORM FEED (FF)
498    CARRIAGE RETURN (CR)
499    NEXT LINE (NEL)
500
501(yes, with parentheses) one can use
502
503    LINE FEED
504    FORM FEED
505    CARRIAGE RETURN
506    NEXT LINE
507    LF
508    FF
509    CR
510    NEL
511
512One can also use
513
514    BYTE ORDER MARK
515    BOM
516
517and
518
519    ZWNJ
520    ZWJ
521
522for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
523
524For backward compatibility one can use the old names for
525certain C0 and C1 controls
526
527    old                         new
528
529    HORIZONTAL TABULATION       CHARACTER TABULATION
530    VERTICAL TABULATION         LINE TABULATION
531    FILE SEPARATOR              INFORMATION SEPARATOR FOUR
532    GROUP SEPARATOR             INFORMATION SEPARATOR THREE
533    RECORD SEPARATOR            INFORMATION SEPARATOR TWO
534    UNIT SEPARATOR              INFORMATION SEPARATOR ONE
535    PARTIAL LINE DOWN           PARTIAL LINE FORWARD
536    PARTIAL LINE UP             PARTIAL LINE BACKWARD
537
538but the old names in addition to giving the character
539will also give a warning about being deprecated.
540
541=head1 ILLEGAL CHARACTERS
542
543If you ask by name for a character that does not exist, a warning is
544given and the Unicode I<replacement character> "\x{FFFD}" is returned.
545
546If you ask by code for a character that does not exist, no warning is
547given and C<undef> is returned.  (Though if you ask for a code point
548past U+10FFFF you do get a warning.)
549
550=head1 BUGS
551
552Since evaluation of the translation function happens in a middle of
553compilation (of a string literal), the translation function should not
554do any C<eval>s or C<require>s.  This restriction should be lifted in
555a future version of Perl.
556
557=cut
558