1#
2# $Id: Encode.pm,v 3.19 2022/08/04 04:42:30 dankogai Exp $
3#
4package Encode;
5use strict;
6use warnings;
# Compile-time debug switch: true when the PERL_ENCODE_DEBUG environment
# variable holds a true value.  Used as `DEBUG and warn ...`.
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
our $VERSION;
BEGIN {
    # Build "MAJOR.MINOR" from the digit groups of the Revision keyword
    # (3 and 19 give "3.19").  Done inside BEGIN so the value exists before
    # the loader call below runs.
    $VERSION = sprintf "%d.%02d", q$Revision: 3.19 $ =~ /(\d+)/g;
    require XSLoader;
    # Load the compiled part of this package, handing $VERSION to the loader.
    XSLoader::load( __PACKAGE__, $VERSION );
}
14
15use Exporter 5.57 'import';
16
17use Carp ();
18our @CARP_NOT = qw(Encode::Encoder);
19
20# Public, encouraged API is exported by default
21
# Names imported by a plain `use Encode;`.
our @EXPORT = qw(
  decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  encodings  find_encoding find_mime_encoding clone_encoding
);
# Generic bitmask constants that can be combined into a CHECK value.
our @FB_FLAGS = qw(
  DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);
# The FB_XXX fallback constants.
our @FB_CONSTS = qw(
  FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);
# Names exported on request only: helper functions plus every constant
# from @FB_FLAGS and @FB_CONSTS.
our @EXPORT_OK = (
    qw(
      _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
      is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
      ),
    @FB_FLAGS, @FB_CONSTS,
);

# Export tags:
#   :all          - everything in @EXPORT and @EXPORT_OK
#   :default      - the same list as @EXPORT
#   :fallbacks    - the FB_XXX constants only
#   :fallback_all - the FB_XXX constants plus the generic bitmask constants
our %EXPORT_TAGS = (
    all          => [ @EXPORT,    @EXPORT_OK ],
    default      => [ @EXPORT ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);
48
49# Documentation moved after __END__ for speed - NI-S
50
# True when "A" has ordinal 193, i.e. on an EBCDIC platform.
our $ON_EBCDIC = ( ord("A") == 193 );

use Encode::Alias ();
use Encode::MIME::Name;

use Storable;

# Make a %Encoding package variable to allow a certain amount of cheating
# %Encoding maps an encoding name to its encoding object; define_encoding()
# stores into it and getEncoding() reads from it.
our %Encoding;
# %ExtModule maps an encoding name to the name of the module that
# getEncoding() loads on demand to provide that encoding.
our %ExtModule;
require Encode::Config;
#  See
#  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
#  to find why sig handlers inside eval{} are disabled.
# Optional local configuration: a failure to load it is ignored, because
# neither the eval result nor $@ is inspected afterwards.
eval {
    local $SIG{__DIE__};
    local $SIG{__WARN__};
    # Work on a private copy of @INC and drop a trailing '.', so the
    # current directory is not searched for Encode::ConfigLocal.
    local @INC = @INC;
    pop @INC if @INC && $INC[-1] eq '.';
    require Encode::ConfigLocal;
};
72
# Encode->encodings([SELECTOR ...])
#
# Returns the names of known encodings, sorted case-insensitively, with the
# internal names "Internal", "Unicode" and "Guess" filtered out.
#
#   * no selector    - names currently present in %Encoding
#   * ":all"         - those plus every name listed in %ExtModule
#   * module name(s) - those plus the %ExtModule names provided by each
#                      given module; "Encode::" is prepended to any
#                      argument that contains no "::"
#
# The ":all" selector is read from the second argument, i.e. the sub
# expects to be called as a class method.
sub encodings {
    my $selector = $_[1] || '';
    my %found    = %Encoding;

    if ( $selector eq ":all" ) {
        %found = ( %Encoding, %ExtModule );
    }
    else {
        my @modules = map { m/::/ ? $_ : "Encode::$_" } @_;
        for my $module (@modules) {
            DEBUG and warn $module;
            $found{$_} = $module
              for grep { $ExtModule{$_} eq $module } keys %ExtModule;
        }
    }

    return sort { lc $a cmp lc $b }
      grep { !/^(?:Internal|Unicode|Guess)$/o } keys %found;
}
91
# perlio_ok(ENCODING)
#
# Reports whether ENCODING is usable through PerlIO.  ENCODING is either a
# reference, which is used as the encoding object as-is, or a name, which
# is resolved with find_encoding().
#
# Returns whatever the object's own perlio_ok() method returns.  Returns 0
# when the name cannot be resolved or the object has no perlio_ok() method.
sub perlio_ok {
    my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );

    # An unknown name resolves to nothing; answer "no" rather than dying
    # on a method call against an undefined value.
    defined $obj or return 0;

    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
97
# define_encoding(OBJECT, CANONICAL_NAME [, ALIAS ...])
#
# Registers OBJECT in %Encoding under CANONICAL_NAME.  When the lowercase
# form of the name differs from the name itself, that lowercase form is
# defined as an alias first; every further argument is then defined as an
# alias as well.  The object's class is added to the CARP_NOT lists of
# Encode and Encode::Encoding unless it is already there.
#
# Returns OBJECT.
sub define_encoding {
    my ( $obj, $name, @aliases ) = @_;

    $Encoding{$name} = $obj;

    # The lowercase spelling goes first, ahead of the caller's aliases.
    my $lower = lc($name);
    unshift @aliases, $lower if $lower ne $name;
    for my $alias (@aliases) {
        define_alias( $alias, $obj );
    }

    my $class = ref($obj);
    for my $carp_not ( \@Encode::CARP_NOT, \@Encode::Encoding::CARP_NOT ) {
        next if grep { $_ eq $class } @$carp_not;
        push @$carp_not, $class;
    }

    return $obj;
}
113
# CLASS->getEncoding(NAME [, SKIP_EXTERNAL])
#
# Resolves NAME to an encoding object.  Returns nothing (undef in scalar
# context, the empty list in list context) when NAME is undefined or cannot
# be resolved.
#
# Lookup order:
#   1. NAME is a reference that can('renew'): it is returned unchanged.
#   2. %Encoding, by NAME with all whitespace removed, then by its
#      lowercase form.
#   3. CLASS->find_alias(), by name, then by the lowercase form.
#   4. Unless SKIP_EXTERNAL is true: the module listed in %ExtModule for
#      the name (or its lowercase form) is loaded, and %Encoding is
#      consulted again under both spellings.
sub getEncoding {
    my ( $class, $name, $skip_external ) = @_;

    defined($name) or return;

    # Hand an encoding object straight back.  This is tested before $name
    # is treated as a string, so an object is never stringified by the
    # whitespace cleanup below.
    ref($name) && $name->can('renew') and return $name;

    $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796

    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
            # Turn Some::Module into Some/Module.pm for require.
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a load failure just means "not found".
            eval { require $mod; };

            # The module may have been selected through either spelling,
            # so look the freshly registered encoding up under both.
            exists $Encoding{$name} and return $Encoding{$name};
            exists $Encoding{$lc}   and return $Encoding{$lc};
        }
    }
    return;
}
141
# HACK: These two functions must be defined in Encode and because of
# cyclic dependency between Encode and Encode::Alias, Exporter does not work
#
# find_alias forwards to Encode::Alias::find_alias.  `goto &sub` hands the
# current @_ to the target and replaces this call, so the target returns
# directly to our caller.
sub find_alias {
    goto &Encode::Alias::find_alias;
}
# define_alias forwards to Encode::Alias::define_alias.  `goto &sub` hands
# the current @_ to the target and replaces this call, so the target
# returns directly to our caller.
sub define_alias {
    goto &Encode::Alias::define_alias;
}
150
# find_encoding(NAME [, SKIP_EXTERNAL])
#
# Function-style front end to getEncoding(): resolves NAME to an encoding
# object through this package and returns whatever getEncoding() returns.
# SKIP_EXTERNAL is passed through unchanged.
sub find_encoding($;$) {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding( $name, $skip_external );
}
155
# find_mime_encoding(MIME_NAME [, SKIP_EXTERNAL])
#
# Translates MIME_NAME with Encode::MIME::Name::get_encode_name() and then
# resolves the translated name with find_encoding().  SKIP_EXTERNAL is
# passed through unchanged.
sub find_mime_encoding($;$) {
    my $mime_name     = shift;
    my $skip_external = shift;

    my $name = Encode::MIME::Name::get_encode_name($mime_name);
    return find_encoding( $name, $skip_external );
}
161
# resolve_alias(NAME)
#
# Returns the name() of the encoding object that NAME resolves to, or
# nothing when find_encoding() cannot resolve NAME.
sub resolve_alias($) {
    my ($alias) = @_;
    my $obj = find_encoding($alias);
    return unless defined $obj;
    return $obj->name;
}
167
# clone_encoding(NAME)
#
# Returns a deep copy (Storable::dclone) of the encoding object that NAME
# resolves to, or nothing when find_encoding() does not yield a reference.
sub clone_encoding($) {
    my ($name) = @_;
    my $obj = find_encoding($name);
    return ref($obj) ? Storable::dclone($obj) : ();
}
173
# Run the one-time initializer.  onBOOT is not defined in this file;
# presumably it comes from the compiled code loaded by XSLoader::load in
# the BEGIN block -- TODO confirm.
onBOOT;
175
# When $ON_EBCDIC is true, register a pure-Perl encoding object under the
# name 'Unicode'.  Its decode() and encode() translate a string one
# character at a time between Unicode and native ordinals.  The package
# and its subs are compiled regardless; only the registration is
# conditional.
if ($ON_EBCDIC) {
    package Encode::UTF_EBCDIC;
    use parent 'Encode::Encoding';

    Encode::define_encoding(
        bless( { Name => "UTF_EBCDIC" }, "Encode::UTF_EBCDIC" ),
        'Unicode'
    );

    # $obj->decode(STRING [, CHECK])
    # Maps every character's ordinal through utf8::unicode_to_native and
    # returns the resulting string.  A true CHECK empties the caller's
    # STRING variable.
    sub decode {
        my ( undef, $str, $chk ) = @_;
        my $res = join '',
          map { chr( utf8::unicode_to_native( ord($_) ) ) } split //, $str;
        $_[1] = '' if $chk;
        return $res;
    }

    # $obj->encode(STRING [, CHECK])
    # Maps every character's ordinal through utf8::native_to_unicode and
    # returns the resulting string.  A true CHECK empties the caller's
    # STRING variable.
    sub encode {
        my ( undef, $str, $chk ) = @_;
        my $res = join '',
          map { chr( utf8::native_to_unicode( ord($_) ) ) } split //, $str;
        $_[1] = '' if $chk;
        return $res;
    }
}
206
{
    # https://rt.cpan.org/Public/Bug/Display.html?id=103253
    # Declare Encode::Encoding as the parent class of Encode::XS, so that
    # Encode::XS objects inherit its methods.
    package Encode::XS;
    use parent 'Encode::Encoding';
}
212
{
    package Encode::utf8;
    use parent 'Encode::Encoding';

    # Register the two UTF-8 flavours, each under its own Name:
    # 'utf8' and 'utf-8-strict' (the latter flagged with strict_utf8).
    for my $spec (
        { Name => 'utf8' },
        { Name => 'utf-8-strict', strict_utf8 => 1 },
      )
    {
        Encode::define_encoding( bless( $spec, __PACKAGE__ ), $spec->{Name} );
    }

    # $obj->cat_decode($dst, $src, $pos, $trm [, $chk])
    #
    # Appends to $dst the part of $src that starts at offset $pos and runs
    # through the next occurrence of $trm (inclusive), and moves $pos just
    # past that terminator; returns 1.  When $trm does not occur, appends
    # the rest of $src, sets $pos to the length of $src and returns ''.
    # $dst and $pos are updated in the caller's variables.  All offsets
    # and lengths are byte-based (`use bytes`).  $chk is currently ignored.
    sub cat_decode {
        my ( $dst_ref, $src_ref, $pos_ref ) = ( \$_[1], \$_[2], \$_[3] );
        my ( $start, $terminator ) = @_[ 3, 4 ];
        use bytes;

        my $hit = index( $$src_ref, $terminator, $start );
        if ( $hit < 0 ) {
            $$dst_ref .= substr( $$src_ref, $start );
            $$pos_ref = length($$src_ref);
            return '';
        }

        my $end = $hit + length($terminator);
        $$dst_ref .= substr( $$src_ref, $start, $end - $start );
        $$pos_ref = $end;
        return 1;
    }
}
241
2421;
243
244__END__
245
246=head1 NAME
247
248Encode - character encodings in Perl
249
250=head1 SYNOPSIS
251
252    use Encode qw(decode encode);
253    $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
254    $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
255
256=head2 Table of Contents
257
258Encode consists of a collection of modules whose details are too extensive
259to fit in one document.  This one itself explains the top-level APIs
260and general topics at a glance.  For other topics and more details,
261see the documentation for these modules:
262
263=over 2
264
265=item L<Encode::Alias> - Alias definitions to encodings
266
267=item L<Encode::Encoding> - Encode Implementation Base Class
268
269=item L<Encode::Supported> - List of Supported Encodings
270
271=item L<Encode::CN> - Simplified Chinese Encodings
272
273=item L<Encode::JP> - Japanese Encodings
274
275=item L<Encode::KR> - Korean Encodings
276
277=item L<Encode::TW> - Traditional Chinese Encodings
278
279=back
280
281=head1 DESCRIPTION
282
283The C<Encode> module provides the interface between Perl strings
284and the rest of the system.  Perl strings are sequences of
285I<characters>.
286
287The repertoire of characters that Perl can represent is a superset of those
288defined by the Unicode Consortium. On most platforms the ordinal
289values of a character as returned by C<ord(I<S>)> is the I<Unicode
290codepoint> for that character. The exceptions are platforms where
291the legacy encoding is some variant of EBCDIC rather than a superset
292of ASCII; see L<perlebcdic>.
293
294During recent history, data is moved around a computer in 8-bit chunks,
295often called "bytes" but also known as "octets" in standards documents.
296Perl is widely used to manipulate data of many types: not only strings of
297characters representing human or computer languages, but also "binary"
298data, being the machine's representation of numbers, pixels in an image, or
299just about anything.
300
301When Perl is processing "binary data", the programmer wants Perl to
302process "sequences of bytes". This is not a problem for Perl: because a
303byte has 256 possible values, it easily fits in Perl's much larger
304"logical character".
305
306This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
307explain the I<why>.
308
309=head2 TERMINOLOGY
310
311=head3 character
312
313A character in the range 0 .. 2**32-1 (or more);
314what Perl's strings are made of.
315
316=head3 byte
317
318A character in the range 0..255;
319a special case of a Perl character.
320
321=head3 octet
322
3238 bits of data, with ordinal values 0..255;
324term for bytes passed to or from a non-Perl context, such as a disk file,
325standard I/O stream, database, command-line argument, environment variable,
326socket etc.
327
328=head1 THE PERL ENCODING API
329
330=head2 Basic methods
331
332=head3 encode
333
334  $octets  = encode(ENCODING, STRING[, CHECK])
335
336Encodes the scalar value I<STRING> from Perl's internal form into
337I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
338canonical name or an alias.  For encoding names and aliases, see
339L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
340
341B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
342on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
343left unchanged.
344
345For example, to convert a string from Perl's internal format into
346ISO-8859-1, also known as Latin1:
347
348  $octets = encode("iso-8859-1", $string);
349
350B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
351$octets I<might not be equal to> $string.  Though both contain the
352same data, the UTF8 flag for $octets is I<always> off.  When you
353encode anything, the UTF8 flag on the result is always off, even when it
354contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
355
356If the $string is C<undef>, then C<undef> is returned.
357
358C<str2bytes> may be used as an alias for C<encode>.
359
360=head3 decode
361
362  $string = decode(ENCODING, OCTETS[, CHECK])
363
364This function returns the string that results from decoding the scalar
365value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
366Perl's internal form.  As with encode(),
367I<ENCODING> can be either a canonical name or an alias. For encoding names
368and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
369Malformed Data">.
370
371B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
372on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
373left unchanged.
374
375For example, to convert ISO-8859-1 data into a string in Perl's
376internal format:
377
378  $string = decode("iso-8859-1", $octets);
379
380B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
381I<might not be equal to> $octets.  Though both contain the same data, the
382UTF8 flag for $string is on.  See L</"The UTF8 flag">
383below.
384
If $octets is C<undef>, then C<undef> is returned.
386
387C<bytes2str> may be used as an alias for C<decode>.
388
389=head3 find_encoding
390
391  [$obj =] find_encoding(ENCODING)
392
393Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
C<undef> if no matching I<ENCODING> is found.  The returned object is
395what does the actual encoding or decoding.
396
397  $string = decode($name, $bytes);
398
399is in fact
400
401    $string = do {
402        $obj = find_encoding($name);
403        croak qq(encoding "$name" not found) unless ref $obj;
404        $obj->decode($bytes);
405    };
406
407with more error checking.
408
You can therefore save time by reusing this object as follows:
410
411    my $enc = find_encoding("iso-8859-1");
412    while(<>) {
413        my $string = $enc->decode($_);
414        ... # now do something with $string;
415    }
416
417Besides L</decode> and L</encode>, other methods are
418available as well.  For instance, C<name()> returns the canonical
419name of the encoding object.
420
421  find_encoding("latin1")->name; # iso-8859-1
422
423See L<Encode::Encoding> for details.
424
425=head3 find_mime_encoding
426
427  [$obj =] find_mime_encoding(MIME_ENCODING)
428
Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts
the same as C<find_encoding()>, except that the C<mime_name()> of the
returned object must match I<MIME_ENCODING>.  So, unlike C<find_encoding()>,
canonical names and aliases are not used when searching for the object.
433
434    find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING>
435    find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
436    find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive
    find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not valid I<MIME_ENCODING>
438
439=head3 from_to
440
441  [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
442
443Converts I<in-place> data between two encodings. The data in $octets
444must be encoded as octets and I<not> as characters in Perl's internal
445format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
446encoding:
447
448  from_to($octets, "iso-8859-1", "cp1250");
449
450and to convert it back:
451
452  from_to($octets, "cp1250", "iso-8859-1");
453
454Because the conversion happens in place, the data to be
455converted cannot be a string constant: it must be a scalar variable.
456
457C<from_to()> returns the length of the converted string in octets on success,
458and C<undef> on error.
459
460B<CAVEAT>: The following operations may look the same, but are not:
461
462  from_to($data, "iso-8859-1", "UTF-8"); #1
463  $data = decode("iso-8859-1", $data);  #2
464
465Both #1 and #2 make $data consist of a completely valid UTF-8 string,
466but only #2 turns the UTF8 flag on.  #1 is equivalent to:
467
468  $data = encode("UTF-8", decode("iso-8859-1", $data));
469
470See L</"The UTF8 flag"> below.
471
472Also note that:
473
474  from_to($octets, $from, $to, $check);
475
476is equivalent to:
477
478  $octets = encode($to, decode($from, $octets), $check);
479
480Yes, it does I<not> respect the $check during decoding.  It is
481deliberately done that way.  If you need minute control, use C<decode>
482followed by C<encode> as follows:
483
484  $octets = encode($to, decode($from, $octets, $check_from), $check_to);
485
486=head3 encode_utf8
487
488  $octets = encode_utf8($string);
489
490B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
491Do not use it for data exchange.
492Unless you want Perl's older "lax" mode, prefer
493C<$octets = encode("UTF-8", $string)>.
494
495Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
496$string are encoded in Perl's internal format, and the result is returned
497as a sequence of octets.  Because all possible characters in Perl have a
498(loose, not strict) utf8 representation, this function cannot fail.
499
500=head3 decode_utf8
501
502  $string = decode_utf8($octets [, CHECK]);
503
504B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
505Do not use it for data exchange.
506Unless you want Perl's older "lax" mode, prefer
507C<$string = decode("UTF-8", $octets [, CHECK])>.
508
509Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
510The sequence of octets represented by $octets is decoded
511from (loose, not strict) utf8 into a sequence of logical characters.
Because not all sequences of octets are valid (even loose, non-strict) utf8,
it is quite possible for this function to fail.
514For CHECK, see L</"Handling Malformed Data">.
515
516B<CAVEAT>: the input I<$octets> might be modified in-place depending on
517what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
518left unchanged.
519
520=head2 Listing available encodings
521
522  use Encode;
523  @list = Encode->encodings();
524
525Returns a list of canonical names of available encodings that have already
526been loaded.  To get a list of all available encodings including those that
527have not yet been loaded, say:
528
529  @all_encodings = Encode->encodings(":all");
530
531Or you can give the name of a specific module:
532
533  @with_jp = Encode->encodings("Encode::JP");
534
535When "C<::>" is not in the name, "C<Encode::>" is assumed.
536
537  @ebcdic = Encode->encodings("EBCDIC");
538
539To find out in detail which encodings are supported by this package,
540see L<Encode::Supported>.
541
542=head2 Defining Aliases
543
544To add a new alias to a given encoding, use:
545
546  use Encode;
547  use Encode::Alias;
548  define_alias(NEWNAME => ENCODING);
549
550After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
551I<ENCODING> may be either the name of an encoding or an
552I<encoding object>.
553
554Before you do that, first make sure the alias is nonexistent using
555C<resolve_alias()>, which returns the canonical name thereof.
556For example:
557
558  Encode::resolve_alias("latin1") eq "iso-8859-1" # true
559  Encode::resolve_alias("iso-8859-12")   # false; nonexistent
560  Encode::resolve_alias($name) eq $name  # true if $name is canonical
561
562C<resolve_alias()> does not need C<use Encode::Alias>; it can be
563imported via C<use Encode qw(resolve_alias)>.
564
565See L<Encode::Alias> for details.
566
567=head2 Finding IANA Character Set Registry names
568
569The canonical name of a given encoding does not necessarily agree with
570IANA Character Set Registry, commonly seen as C<< Content-Type:
571text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
572works, but sometimes it does not, most notably with "utf-8-strict".
573
574As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
575
576  use Encode;
577  my $enc = find_encoding("UTF-8");
578  warn $enc->name;      # utf-8-strict
579  warn $enc->mime_name; # UTF-8
580
581See also:  L<Encode::Encoding>
582
583=head1 Encoding via PerlIO
584
585If your perl supports C<PerlIO> (which is the default), you can use a
586C<PerlIO> layer to decode and encode directly via a filehandle.  The
587following two examples are fully identical in functionality:
588
589  ### Version 1 via PerlIO
590    open(INPUT,  "< :encoding(shiftjis)", $infile)
591        || die "Can't open < $infile for reading: $!";
592    open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
        || die "Can't open > $outfile for writing: $!";
594    while (<INPUT>) {   # auto decodes $_
595        print OUTPUT;   # auto encodes $_
596    }
597    close(INPUT)   || die "can't close $infile: $!";
598    close(OUTPUT)  || die "can't close $outfile: $!";
599
600  ### Version 2 via from_to()
601    open(INPUT,  "< :raw", $infile)
602        || die "Can't open < $infile for reading: $!";
603    open(OUTPUT, "> :raw",  $outfile)
        || die "Can't open > $outfile for writing: $!";
605
606    while (<INPUT>) {
607        from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
608        print OUTPUT;   # emit raw (but properly encoded) data
609    }
610    close(INPUT)   || die "can't close $infile: $!";
611    close(OUTPUT)  || die "can't close $outfile: $!";
612
613In the first version above, you let the appropriate encoding layer
614handle the conversion.  In the second, you explicitly translate
615from one encoding to the other.
616
617Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
618to see whether your encoding is supported by C<PerlIO> by invoking the
619C<perlio_ok> method on it:
620
621  Encode::perlio_ok("hz");             # false
622  find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
623
624  use Encode qw(perlio_ok);            # imported upon request
625  perlio_ok("euc-jp")
626
627Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
628except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
629L<Encode::Encoding> and L<Encode::PerlIO>.
630
631=head1 Handling Malformed Data
632
633The optional I<CHECK> argument tells C<Encode> what to do when
634encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
635(== 0) is assumed.
636
637As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
638see below.
639
640B<NOTE:> Not all encodings support this feature.
641Some encodings ignore the I<CHECK> argument.  For example,
642L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
643
644=head2 List of I<CHECK> values
645
646=head3 FB_DEFAULT
647
648  I<CHECK> = Encode::FB_DEFAULT ( == 0)
649
650If I<CHECK> is 0, encoding and decoding replace any malformed character
651with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
652When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
653used.  If the data is supposed to be UTF-8, an optional lexical warning of
654warning category C<"utf8"> is given.
655
656=head3 FB_CROAK
657
658  I<CHECK> = Encode::FB_CROAK ( == 1)
659
660If I<CHECK> is 1, methods immediately die with an error
661message.  Therefore, when I<CHECK> is 1, you should trap
662exceptions with C<eval{}>, unless you really want to let it C<die>.
663
664=head3 FB_QUIET
665
666  I<CHECK> = Encode::FB_QUIET
667
668If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
669return the portion of the data that has been processed so far when an
670error occurs. The data argument is overwritten with everything
671after that point; that is, the unprocessed portion of the data.  This is
672handy when you have to call C<decode> repeatedly in the case where your
673source data may contain partial multi-byte character sequences,
674(that is, you are reading with a fixed-width buffer). Here's some sample
675code to do exactly that:
676
677    my($buffer, $string) = ("", "");
678    while (read($fh, $buffer, 256, length($buffer))) {
679        $string .= decode($encoding, $buffer, Encode::FB_QUIET);
680        # $buffer now contains the unprocessed partial character
681    }
682
683=head3 FB_WARN
684
685  I<CHECK> = Encode::FB_WARN
686
687This is the same as C<FB_QUIET> above, except that instead of being silent
688on errors, it issues a warning.  This is handy for when you are debugging.
689
B<CAVEAT>: All warnings from the Encode module are reported, independently of
L<pragma warnings|warnings> settings. If you want to follow the settings of
lexical warnings configured by L<pragma warnings|warnings>, also add the
check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available
since Encode version 2.99.
695
696=head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
697
698=over 2
699
700=item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
701
702=item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
703
704=item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
705
706=back
707
708For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
709C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
710
711When you decode, C<\xI<HH>> is inserted for a malformed character, where
712I<HH> is the hex representation of the octet that could not be decoded to
713utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
714the Unicode code point (in any number of hex digits) of the character that
715cannot be found in the character repertoire of the encoding.
716
717The HTML/XML character reference modes are about the same. In place of
718C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
719XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
720
721In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
722
723=head3 The bitmask
724
725These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
726constants are laid out.  You can import the C<FB_I<XXX>> constants via
727C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
728constants via C<use Encode qw(:fallback_all)>.
729
730                     FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
731 DIE_ON_ERR    0x0001             X
732 WARN_ON_ERR   0x0002                               X
733 RETURN_ON_ERR 0x0004                      X        X
734 LEAVE_SRC     0x0008                                        X
735 PERLQQ        0x0100                                        X
736 HTMLCREF      0x0200
737 XMLCREF       0x0400
738
739=head3 LEAVE_SRC
740
741  Encode::LEAVE_SRC
742
743If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
744source string to encode() or decode() will be overwritten in place.
745If you're not interested in this, then bitwise-OR it with the bitmask.
746
747=head2 coderef for CHECK
748
749As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
750ordinal value of the unmapped character as an argument and returns
751octets that represent the fallback character.  For instance:
752
753  $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
754
755Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
756
757Fallback for C<decode> must return decoded string (sequence of characters)
758and takes a list of ordinal values as its arguments. So for
759example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
760a fallback for bytes that are not valid UTF-8, you could write
761
762    $str = decode 'UTF-8', $octets, sub {
763        my $tmp = join '', map chr, @_;
764        return decode 'ISO-8859-15', $tmp;
765    };
766
767=head1 Defining Encodings
768
769To define a new encoding, use:
770
771    use Encode qw(define_encoding);
772    define_encoding($object, CANONICAL_NAME [, alias...]);
773
774I<CANONICAL_NAME> will be associated with I<$object>.  The object
775should provide the interface described in L<Encode::Encoding>.
776If more than two arguments are provided, additional
777arguments are considered aliases for I<$object>.
778
779See L<Encode::Encoding> for details.
780
781=head1 The UTF8 flag
782
Before the introduction of Unicode support in Perl, the C<eq> operator
784just compared the strings represented by two scalars. Beginning with
785Perl 5.8, C<eq> compares two strings with simultaneous consideration of
786I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
787I<Programming Perl, 3rd ed.>
788
789=over 2
790
791=item Goal #1:
792
793Old byte-oriented programs should not spontaneously break on the old
794byte-oriented data they used to work on.
795
796=item Goal #2:
797
798Old byte-oriented programs should magically start working on the new
799character-oriented data when appropriate.
800
801=item Goal #3:
802
803Programs should run just as fast in the new character-oriented mode
804as in the old byte-oriented mode.
805
806=item Goal #4:
807
808Perl should remain one language, rather than forking into a
809byte-oriented Perl and a character-oriented Perl.
810
811=back
812
813When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
born yet; many features documented in the book remained unimplemented for a
815long time.  Perl 5.8 corrected much of this, and the introduction of the
816UTF8 flag is one of them.  You can think of there being two fundamentally
817different kinds of strings and string-operations in Perl: one a
818byte-oriented mode  for when the internal UTF8 flag is off, and the other a
819character-oriented mode for when the internal UTF8 flag is on.
820
821This UTF8 flag is not visible in Perl scripts, exactly for the same reason
822you cannot (or rather, you I<don't have to>) see whether a scalar contains
823a string, an integer, or a floating-point number.   But you can still peek
824and poke these if you will.  See the next section.
825
826=head2 Messing with Perl's Internals
827
828The following API uses parts of Perl's internals in the current
829implementation.  As such, they are efficient but may change in a future
830release.
831
832=head3 is_utf8
833
834  is_utf8(STRING [, CHECK])
835
836[INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
837If I<CHECK> is true, also checks whether I<STRING> contains well-formed
838UTF-8.  Returns true if successful, false otherwise.
839
840Typically only necessary for debugging and testing.  Don't use this flag as
841a marker to distinguish character and binary data, that should be decided
842for each variable when you write your code.
843
844B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
845I<STRING> is UTF-8 encoded and vice-versa.
846
847As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
848
849=head3 _utf8_on
850
851  _utf8_on(STRING)
852
853[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
854is I<not> checked for containing only well-formed UTF-8.  Do not use this
855unless you I<know with absolute certainty> that the STRING holds only
856well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
857don't treat the return value as indicating success or failure), or C<undef>
858if I<STRING> is not a string.
859
860B<NOTE>: For security reasons, this function does not work on tainted values.
861
862=head3 _utf8_off
863
864  _utf8_off(STRING)
865
866[INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
867frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
868I<STRING> is not a string.  Do not treat the return value as indicative of
869success or failure, because that isn't what it means: it is only the
870previous setting.
871
872B<NOTE>: For security reasons, this function does not work on tainted values.
873
874=head1 UTF-8 vs. utf8 vs. UTF8
875
876  ....We now view strings not as sequences of bytes, but as sequences
877  of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
878  computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
879
880That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
881first conceived by Ken Thompson when he invented it. However, thanks to
882later revisions to the applicable standards, official UTF-8 is now rather
883stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
884to cover only 21 bits instead of 32 or 64 bits) and some sequences
are not allowed, like those used in surrogate pairs, the 32 non-character
886code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
887(0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
888
889The former default in which Perl would always use a loose interpretation of
890UTF-8 has now been overruled:
891
892  From: Larry Wall <larry@wall.org>
893  Date: December 04, 2004 11:51:58 JST
894  To: perl-unicode@perl.org
895  Subject: Re: Make Encode.pm support the real UTF-8
896  Message-Id: <20041204025158.GA28754@wall.org>
897
898  On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
899  : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
900  : but "UTF-8" is the name of the standard and should give the
901  : corresponding behaviour.
902
903  For what it's worth, that's how I've always kept them straight in my
904  head.
905
906  Also for what it's worth, Perl 6 will mostly default to strict but
907  make it easy to switch back to lax.
908
909  Larry
910
911Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
912sense, which is conservative and strict and security-conscious, whereas
913B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
914lax.  C<Encode> version 2.10 or later thus groks this subtle but critically
915important distinction between C<"UTF-8"> and C<"utf8">.
916
917  encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
918  encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
919
920This distinction is also important for decoding. In the following,
921C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
922C<$s> thus stores an invalid Unicode code point:
923
924  $s = decode("utf8", "\xf8\x88\x80\x80\x80");
925
926C<"UTF-8">, by contrast, will either coerce the input to something valid:
927
928    $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD
929
930.. or croak:
931
932    decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);
933
In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical
name C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
937
938  find_encoding("UTF-8")->name # is 'utf-8-strict'
939  find_encoding("utf-8")->name # ditto. names are case insensitive
940  find_encoding("utf_8")->name # ditto. "_" are treated as "-"
941  find_encoding("UTF8")->name  # is 'utf8'.
942
943Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
944whether a string is internally encoded as "utf8", also without a hyphen.
945
946=head1 SEE ALSO
947
948L<Encode::Encoding>,
949L<Encode::Supported>,
950L<Encode::PerlIO>,
951L<encoding>,
952L<perlebcdic>,
953L<perlfunc/open>,
L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
955L<utf8>,
956the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
957
958=head1 MAINTAINER
959
960This project was originated by the late Nick Ing-Simmons and later
961maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
962for a full list of people involved.  For any questions, send mail to
963I<< <perl-unicode@perl.org> >> so that we can all share.
964
965While Dan Kogai retains the copyright as a maintainer, credit
966should go to all those involved.  See AUTHORS for a list of those
967who submitted code to the project.
968
969=head1 COPYRIGHT
970
971Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
972
973This library is free software; you can redistribute it and/or modify
974it under the same terms as Perl itself.
975
976=cut
977