encoding.pm revision 1.1
1# $Id: encoding.pm,v 2.8 2009/02/15 17:44:13 dankogai Exp $
2package encoding;
3our $VERSION = '2.6_01';
4
5use Encode;
6use strict;
7use warnings;
8
9sub DEBUG () { 0 }
10
11BEGIN {
12    if ( ord("A") == 193 ) {
13        require Carp;
14        Carp::croak("encoding: pragma does not support EBCDIC platforms");
15    }
16}
17
18our $HAS_PERLIO = 0;
19eval { require PerlIO::encoding };
20unless ($@) {
21    $HAS_PERLIO = ( PerlIO::encoding->VERSION >= 0.02 );
22}
23
24sub _exception {
25    my $name = shift;
26    $] > 5.008 and return 0;    # 5.8.1 or higher then no
27    my %utfs = map { $_ => 1 }
28      qw(utf8 UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE
29      UTF-32 UTF-32BE UTF-32LE);
30    $utfs{$name} or return 0;    # UTFs or no
31    require Config;
32    Config->import();
33    our %Config;
34    return $Config{perl_patchlevel} ? 0 : 1    # maintperl then no
35}
36
37sub in_locale { $^H & ( $locale::hint_bits || 0 ) }
38
39sub _get_locale_encoding {
40    my $locale_encoding;
41
42    # I18N::Langinfo isn't available everywhere
43    eval {
44        require I18N::Langinfo;
45        I18N::Langinfo->import(qw(langinfo CODESET));
46        $locale_encoding = langinfo( CODESET() );
47    };
48
49    my $country_language;
50
51    no warnings 'uninitialized';
52
53    if ( (not $locale_encoding) && in_locale() ) {
54        if ( $ENV{LC_ALL} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
55            ( $country_language, $locale_encoding ) = ( $1, $2 );
56        }
57        elsif ( $ENV{LANG} =~ /^([^.]+)\.([^.@]+)(@.*)?$/ ) {
58            ( $country_language, $locale_encoding ) = ( $1, $2 );
59        }
60
61        # LANGUAGE affects only LC_MESSAGES only on glibc
62    }
63    elsif ( not $locale_encoding ) {
64        if (   $ENV{LC_ALL} =~ /\butf-?8\b/i
65            || $ENV{LANG} =~ /\butf-?8\b/i )
66        {
67            $locale_encoding = 'utf8';
68        }
69
70        # Could do more heuristics based on the country and language
71        # parts of LC_ALL and LANG (the parts before the dot (if any)),
72        # since we have Locale::Country and Locale::Language available.
73        # TODO: get a database of Language -> Encoding mappings
74        # (the Estonian database at http://www.eki.ee/letter/
75        # would be excellent!) --jhi
76    }
77    if (   defined $locale_encoding
78        && lc($locale_encoding) eq 'euc'
79        && defined $country_language )
80    {
81        if ( $country_language =~ /^ja_JP|japan(?:ese)?$/i ) {
82            $locale_encoding = 'euc-jp';
83        }
84        elsif ( $country_language =~ /^ko_KR|korean?$/i ) {
85            $locale_encoding = 'euc-kr';
86        }
87        elsif ( $country_language =~ /^zh_CN|chin(?:a|ese)$/i ) {
88            $locale_encoding = 'euc-cn';
89        }
90        elsif ( $country_language =~ /^zh_TW|taiwan(?:ese)?$/i ) {
91            $locale_encoding = 'euc-tw';
92        }
93        else {
94            require Carp;
95            Carp::croak(
96                "encoding: Locale encoding '$locale_encoding' too ambiguous"
97            );
98        }
99    }
100
101    return $locale_encoding;
102}
103
104sub import {
105    my $class = shift;
106    my $name  = shift;
107    if ( $name eq ':_get_locale_encoding' ) {    # used by lib/open.pm
108        my $caller = caller();
109        {
110            no strict 'refs';
111            *{"${caller}::_get_locale_encoding"} = \&_get_locale_encoding;
112        }
113        return;
114    }
115    $name = _get_locale_encoding() if $name eq ':locale';
116    my %arg = @_;
117    $name = $ENV{PERL_ENCODING} unless defined $name;
118    my $enc = find_encoding($name);
119    unless ( defined $enc ) {
120        require Carp;
121        Carp::croak("encoding: Unknown encoding '$name'");
122    }
123    $name = $enc->name;    # canonize
124    unless ( $arg{Filter} ) {
125        DEBUG and warn "_exception($name) = ", _exception($name);
126        _exception($name) or ${^ENCODING} = $enc;
127        $HAS_PERLIO or return 1;
128    }
129    else {
130        defined( ${^ENCODING} ) and undef ${^ENCODING};
131
132        # implicitly 'use utf8'
133        require utf8;      # to fetch $utf8::hint_bits;
134        $^H |= $utf8::hint_bits;
135        eval {
136            require Filter::Util::Call;
137            Filter::Util::Call->import;
138            filter_add(
139                sub {
140                    my $status = filter_read();
141                    if ( $status > 0 ) {
142                        $_ = $enc->decode( $_, 1 );
143                        DEBUG and warn $_;
144                    }
145                    $status;
146                }
147            );
148        };
149        $@ eq '' and DEBUG and warn "Filter installed";
150    }
151    defined ${^UNICODE} and ${^UNICODE} != 0 and return 1;
152    for my $h (qw(STDIN STDOUT)) {
153        if ( $arg{$h} ) {
154            unless ( defined find_encoding( $arg{$h} ) ) {
155                require Carp;
156                Carp::croak(
157                    "encoding: Unknown encoding for $h, '$arg{$h}'");
158            }
159            eval { binmode( $h, ":raw :encoding($arg{$h})" ) };
160        }
161        else {
162            unless ( exists $arg{$h} ) {
163                eval {
164                    no warnings 'uninitialized';
165                    binmode( $h, ":raw :encoding($name)" );
166                };
167            }
168        }
169        if ($@) {
170            require Carp;
171            Carp::croak($@);
172        }
173    }
174    return 1;    # I doubt if we need it, though
175}
176
177sub unimport {
178    no warnings;
179    undef ${^ENCODING};
180    if ($HAS_PERLIO) {
181        binmode( STDIN,  ":raw" );
182        binmode( STDOUT, ":raw" );
183    }
184    else {
185        binmode(STDIN);
186        binmode(STDOUT);
187    }
188    if ( $INC{"Filter/Util/Call.pm"} ) {
189        eval { filter_del() };
190    }
191}
192
1931;
194__END__
195
196=pod
197
198=head1 NAME
199
200encoding - allows you to write your script in non-ascii or non-utf8
201
202=head1 SYNOPSIS
203
204  use encoding "greek";  # Perl like Greek to you?
205  use encoding "euc-jp"; # Jperl!
206
207  # or you can even do this if your shell supports your native encoding
208
209  perl -Mencoding=latin2 -e'...' # Feeling centrally European?
210  perl -Mencoding=euc-kr -e'...' # Or Korean?
211
212  # more control
213
214  # A simple euc-cn => utf-8 converter
215  use encoding "euc-cn", STDOUT => "utf8";  while(<>){print};
216
217  # "no encoding;" supported (but not scoped!)
218  no encoding;
219
220  # an alternate way, Filter
221  use encoding "euc-jp", Filter=>1;
222  # now you can use kanji identifiers -- in euc-jp!
223
224  # switch on locale -
225  # note that this probably means that unless you have a complete control
226  # over the environments the application is ever going to be run, you should
227  # NOT use the feature of encoding pragma allowing you to write your script
228  # in any recognized encoding because changing locale settings will wreck
229  # the script; you can of course still use the other features of the pragma.
230  use encoding ':locale';
231
232=head1 ABSTRACT
233
234Let's start with a bit of history: Perl 5.6.0 introduced Unicode
235support.  You could apply C<substr()> and regexes even to complex CJK
236characters -- so long as the script was written in UTF-8.  But back
237then, text editors that supported UTF-8 were still rare and many users
238instead chose to write scripts in legacy encodings, giving up a whole
239new feature of Perl 5.6.
240
241Rewind to the future: starting from perl 5.8.0 with the B<encoding>
242pragma, you can write your script in any encoding you like (so long
243as the C<Encode> module supports it) and still enjoy Unicode support.
244This pragma achieves that by doing the following:
245
246=over
247
248=item *
249
250Internally converts all literals (C<q//,qq//,qr//,qw///, qx//>) from
251the encoding specified to utf8.  In Perl 5.8.1 and later, literals in
252C<tr///> and C<DATA> pseudo-filehandle are also converted.
253
254=item *
255
256Changing PerlIO layers of C<STDIN> and C<STDOUT> to the encoding
257 specified.
258
259=back
260
261=head2 Literal Conversions
262
263You can write code in EUC-JP as follows:
264
265  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
266               #<-char-><-char->   # 4 octets
267  s/\bCamel\b/$Rakuda/;
268
269And with C<use encoding "euc-jp"> in effect, it is the same thing as
270the code in UTF-8:
271
272  my $Rakuda = "\x{99F1}\x{99DD}"; # two Unicode Characters
273  s/\bCamel\b/$Rakuda/;
274
275=head2 PerlIO layers for C<STD(IN|OUT)>
276
277The B<encoding> pragma also modifies the filehandle layers of
278STDIN and STDOUT to the specified encoding.  Therefore,
279
280  use encoding "euc-jp";
281  my $message = "Camel is the symbol of perl.\n";
282  my $Rakuda = "\xF1\xD1\xF1\xCC"; # Camel in Kanji
283  $message =~ s/\bCamel\b/$Rakuda/;
284  print $message;
285
286Will print "\xF1\xD1\xF1\xCC is the symbol of perl.\n",
287not "\x{99F1}\x{99DD} is the symbol of perl.\n".
288
289You can override this by giving extra arguments; see below.
290
291=head2 Implicit upgrading for byte strings
292
293By default, if strings operating under byte semantics and strings
294with Unicode character data are concatenated, the new string will
295be created by decoding the byte strings as I<ISO 8859-1 (Latin-1)>.
296
297The B<encoding> pragma changes this to use the specified encoding
298instead.  For example:
299
300    use encoding 'utf8';
301    my $string = chr(20000); # a Unicode string
302    utf8::encode($string);   # now it's a UTF-8 encoded byte string
303    # concatenate with another Unicode string
304    print length($string . chr(20000));
305
306Will print C<2>, because C<$string> is upgraded as UTF-8.  Without
307C<use encoding 'utf8';>, it will print C<4> instead, since C<$string>
308is three octets when interpreted as Latin-1.
309
310=head2 Side effects
311
312If the C<encoding> pragma is in scope then the lengths returned are
313calculated from the length of C<$/> in Unicode characters, which is not
314always the same as the length of C<$/> in the native encoding.
315
316This pragma affects utf8::upgrade, but not utf8::downgrade.
317
318=head1 FEATURES THAT REQUIRE 5.8.1
319
320Some of the features offered by this pragma requires perl 5.8.1.  Most
321of these are done by Inaba Hiroto.  Any other features and changes
322are good for 5.8.0.
323
324=over
325
326=item "NON-EUC" doublebyte encodings
327
328Because perl needs to parse script before applying this pragma, such
329encodings as Shift_JIS and Big-5 that may contain '\' (BACKSLASH;
330\x5c) in the second byte fails because the second byte may
331accidentally escape the quoting character that follows.  Perl 5.8.1
332or later fixes this problem.
333
334=item tr//
335
336C<tr//> was overlooked by Perl 5 porters when they released perl 5.8.0
337See the section below for details.
338
339=item DATA pseudo-filehandle
340
341Another feature that was overlooked was C<DATA>.
342
343=back
344
345=head1 USAGE
346
347=over 4
348
349=item use encoding [I<ENCNAME>] ;
350
351Sets the script encoding to I<ENCNAME>.  And unless ${^UNICODE}
352exists and non-zero, PerlIO layers of STDIN and STDOUT are set to
353":encoding(I<ENCNAME>)".
354
355Note that STDERR WILL NOT be changed.
356
357Also note that non-STD file handles remain unaffected.  Use C<use
358open> or C<binmode> to change layers of those.
359
360If no encoding is specified, the environment variable L<PERL_ENCODING>
361is consulted.  If no encoding can be found, the error C<Unknown encoding
362'I<ENCNAME>'> will be thrown.
363
364=item use encoding I<ENCNAME> [ STDIN =E<gt> I<ENCNAME_IN> ...] ;
365
366You can also individually set encodings of STDIN and STDOUT via the
367C<< STDIN => I<ENCNAME> >> form.  In this case, you cannot omit the
368first I<ENCNAME>.  C<< STDIN => undef >> turns the IO transcoding
369completely off.
370
371When ${^UNICODE} exists and non-zero, these options will completely
372ignored.  ${^UNICODE} is a variable introduced in perl 5.8.1.  See
373L<perlrun> see L<perlvar/"${^UNICODE}"> and L<perlrun/"-C"> for
374details (perl 5.8.1 and later).
375
376=item use encoding I<ENCNAME> Filter=E<gt>1;
377
378This turns the encoding pragma into a source filter.  While the
379default approach just decodes interpolated literals (in qq() and
380qr()), this will apply a source filter to the entire source code.  See
381L</"The Filter Option"> below for details.
382
383=item no encoding;
384
385Unsets the script encoding. The layers of STDIN, STDOUT are
386reset to ":raw" (the default unprocessed raw stream of bytes).
387
388=back
389
390=head1 The Filter Option
391
392The magic of C<use encoding> is not applied to the names of
393identifiers.  In order to make C<${"\x{4eba}"}++> ($human++, where human
394is a single Han ideograph) work, you still need to write your script
395in UTF-8 -- or use a source filter.  That's what 'Filter=>1' does.
396
397What does this mean?  Your source code behaves as if it is written in
398UTF-8 with 'use utf8' in effect.  So even if your editor only supports
399Shift_JIS, for example, you can still try examples in Chapter 15 of
400C<Programming Perl, 3rd Ed.>.  For instance, you can use UTF-8
401identifiers.
402
403This option is significantly slower and (as of this writing) non-ASCII
404identifiers are not very stable WITHOUT this option and with the
405source code written in UTF-8.
406
407=head2 Filter-related changes at Encode version 1.87
408
409=over
410
411=item *
412
413The Filter option now sets STDIN and STDOUT like non-filter options.
414And C<< STDIN=>I<ENCODING> >> and C<< STDOUT=>I<ENCODING> >> work like
415non-filter version.
416
417=item *
418
419C<use utf8> is implicitly declared so you no longer have to C<use
420utf8> to C<${"\x{4eba}"}++>.
421
422=back
423
424=head1 CAVEATS
425
426=head2 NOT SCOPED
427
428The pragma is a per script, not a per block lexical.  Only the last
429C<use encoding> or C<no encoding> matters, and it affects
430B<the whole script>.  However, the <no encoding> pragma is supported and
431B<use encoding> can appear as many times as you want in a given script.
432The multiple use of this pragma is discouraged.
433
434By the same reason, the use this pragma inside modules is also
435discouraged (though not as strongly discouraged as the case above.
436See below).
437
438If you still have to write a module with this pragma, be very careful
439of the load order.  See the codes below;
440
441  # called module
442  package Module_IN_BAR;
443  use encoding "bar";
444  # stuff in "bar" encoding here
445  1;
446
447  # caller script
448  use encoding "foo"
449  use Module_IN_BAR;
450  # surprise! use encoding "bar" is in effect.
451
452The best way to avoid this oddity is to use this pragma RIGHT AFTER
453other modules are loaded.  i.e.
454
455  use Module_IN_BAR;
456  use encoding "foo";
457
458=head2 DO NOT MIX MULTIPLE ENCODINGS
459
460Notice that only literals (string or regular expression) having only
461legacy code points are affected: if you mix data like this
462
463    \xDF\x{100}
464
465the data is assumed to be in (Latin 1 and) Unicode, not in your native
466encoding.  In other words, this will match in "greek":
467
468    "\xDF" =~ /\x{3af}/
469
470but this will not
471
472    "\xDF\x{100}" =~ /\x{3af}\x{100}/
473
474since the C<\xDF> (ISO 8859-7 GREEK SMALL LETTER IOTA WITH TONOS) on
475the left will B<not> be upgraded to C<\x{3af}> (Unicode GREEK SMALL
476LETTER IOTA WITH TONOS) because of the C<\x{100}> on the left.  You
477should not be mixing your legacy data and Unicode in the same string.
478
479This pragma also affects encoding of the 0x80..0xFF code point range:
480normally characters in that range are left as eight-bit bytes (unless
481they are combined with characters with code points 0x100 or larger,
482in which case all characters need to become UTF-8 encoded), but if
483the C<encoding> pragma is present, even the 0x80..0xFF range always
484gets UTF-8 encoded.
485
486After all, the best thing about this pragma is that you don't have to
487resort to \x{....} just to spell your name in a native encoding.
488So feel free to put your strings in your encoding in quotes and
489regexes.
490
491=head2 tr/// with ranges
492
493The B<encoding> pragma works by decoding string literals in
494C<q//,qq//,qr//,qw///, qx//> and so forth.  In perl 5.8.0, this
495does not apply to C<tr///>.  Therefore,
496
497  use encoding 'euc-jp';
498  #....
499  $kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/;
500  #           -------- -------- -------- --------
501
502Does not work as
503
504  $kana =~ tr/\x{3041}-\x{3093}/\x{30a1}-\x{30f3}/;
505
506=over
507
508=item Legend of characters above
509
510  utf8     euc-jp   charnames::viacode()
511  -----------------------------------------
512  \x{3041} \xA4\xA1 HIRAGANA LETTER SMALL A
513  \x{3093} \xA4\xF3 HIRAGANA LETTER N
514  \x{30a1} \xA5\xA1 KATAKANA LETTER SMALL A
515  \x{30f3} \xA5\xF3 KATAKANA LETTER N
516
517=back
518
519This counterintuitive behavior has been fixed in perl 5.8.1.
520
521=head3 workaround to tr///;
522
523In perl 5.8.0, you can work around as follows;
524
525  use encoding 'euc-jp';
526  #  ....
527  eval qq{ \$kana =~ tr/\xA4\xA1-\xA4\xF3/\xA5\xA1-\xA5\xF3/ };
528
529Note the C<tr//> expression is surrounded by C<qq{}>.  The idea behind
530is the same as classic idiom that makes C<tr///> 'interpolate'.
531
532   tr/$from/$to/;            # wrong!
533   eval qq{ tr/$from/$to/ }; # workaround.
534
535Nevertheless, in case of B<encoding> pragma even C<q//> is affected so
536C<tr///> not being decoded was obviously against the will of Perl5
537Porters so it has been fixed in Perl 5.8.1 or later.
538
539=head1 EXAMPLE - Greekperl
540
541    use encoding "iso 8859-7";
542
543    # \xDF in ISO 8859-7 (Greek) is \x{3af} in Unicode.
544
545    $a = "\xDF";
546    $b = "\x{100}";
547
548    printf "%#x\n", ord($a); # will print 0x3af, not 0xdf
549
550    $c = $a . $b;
551
552    # $c will be "\x{3af}\x{100}", not "\x{df}\x{100}".
553
554    # chr() is affected, and ...
555
556    print "mega\n"  if ord(chr(0xdf)) == 0x3af;
557
558    # ... ord() is affected by the encoding pragma ...
559
560    print "tera\n" if ord(pack("C", 0xdf)) == 0x3af;
561
562    # ... as are eq and cmp ...
563
564    print "peta\n" if "\x{3af}" eq  pack("C", 0xdf);
565    print "exa\n"  if "\x{3af}" cmp pack("C", 0xdf) == 0;
566
567    # ... but pack/unpack C are not affected, in case you still
568    # want to go back to your native encoding
569
570    print "zetta\n" if unpack("C", (pack("C", 0xdf))) == 0xdf;
571
572=head1 KNOWN PROBLEMS
573
574=over
575
576=item literals in regex that are longer than 127 bytes
577
578For native multibyte encodings (either fixed or variable length),
579the current implementation of the regular expressions may introduce
580recoding errors for regular expression literals longer than 127 bytes.
581
582=item EBCDIC
583
584The encoding pragma is not supported on EBCDIC platforms.
585(Porters who are willing and able to remove this limitation are
586welcome.)
587
588=item format
589
590This pragma doesn't work well with format because PerlIO does not
591get along very well with it.  When format contains non-ascii
592characters it prints funny or gets "wide character warnings".
593To understand it, try the code below.
594
595  # Save this one in utf8
596  # replace *non-ascii* with a non-ascii string
597  my $camel;
598  format STDOUT =
599  *non-ascii*@>>>>>>>
600  $camel
601  .
602  $camel = "*non-ascii*";
603  binmode(STDOUT=>':encoding(utf8)'); # bang!
604  write;              # funny
605  print $camel, "\n"; # fine
606
607Without binmode this happens to work but without binmode, print()
608fails instead of write().
609
610At any rate, the very use of format is questionable when it comes to
611unicode characters since you have to consider such things as character
612width (i.e. double-width for ideographs) and directions (i.e. BIDI for
613Arabic and Hebrew).
614
615=item Thread safety
616
617C<use encoding ...> is not thread-safe (i.e., do not use in threaded
618applications).
619
620=back
621
622=head2 The Logic of :locale
623
624The logic of C<:locale> is as follows:
625
626=over 4
627
628=item 1.
629
630If the platform supports the langinfo(CODESET) interface, the codeset
631returned is used as the default encoding for the open pragma.
632
633=item 2.
634
635If 1. didn't work but we are under the locale pragma, the environment
636variables LC_ALL and LANG (in that order) are matched for encodings
637(the part after C<.>, if any), and if any found, that is used
638as the default encoding for the open pragma.
639
640=item 3.
641
642If 1. and 2. didn't work, the environment variables LC_ALL and LANG
643(in that order) are matched for anything looking like UTF-8, and if
644any found, C<:utf8> is used as the default encoding for the open
645pragma.
646
647=back
648
649If your locale environment variables (LC_ALL, LC_CTYPE, LANG)
650contain the strings 'UTF-8' or 'UTF8' (case-insensitive matching),
651the default encoding of your STDIN, STDOUT, and STDERR, and of
652B<any subsequent file open>, is UTF-8.
653
654=head1 HISTORY
655
656This pragma first appeared in Perl 5.8.0.  For features that require
6575.8.1 and better, see above.
658
659The C<:locale> subpragma was implemented in 2.01, or Perl 5.8.6.
660
661=head1 SEE ALSO
662
663L<perlunicode>, L<Encode>, L<open>, L<Filter::Util::Call>,
664
665Ch. 15 of C<Programming Perl (3rd Edition)>
666by Larry Wall, Tom Christiansen, Jon Orwant;
667O'Reilly & Associates; ISBN 0-596-00027-8
668
669=cut
670