1package Encode::MIME::Header;
2use strict;
3use warnings;
4
5our $VERSION = do { my @r = ( q$Revision: 2.29 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
6
7use Carp ();
8use Encode ();
9use MIME::Base64 ();
10
# Attributes shared by every encoding object; the variants below override
# only the entries that differ.
my %seed = (
    decode_b => 1,          # decode 'B' encoded-words?
    decode_q => 1,          # decode 'Q' encoded-words?
    encode   => 'B',        # variant produced when encoding: 'B' or 'Q'
    charset  => 'UTF-8',    # charset used when encoding
    bpl      => 75,         # bytes per line
);

# One blessed hash per encoding name: defaults first, overrides second.
my @objs = map { bless( +{ %seed, %{$_} }, __PACKAGE__ ) } (
    { Name => 'MIME-Header' },
    { Name => 'MIME-B', decode_q => 0 },
    { Name => 'MIME-Q', decode_b => 0, encode => 'Q' },
);

# Register each object with Encode under its own Name.
for my $encoding (@objs) {
    Encode::define_encoding( $encoding, $encoding->{Name} );
}
40
41use parent qw(Encode::Encoding);
42
# Constant capability flags: needs_lines is always true, perlio_ok is always
# false (presumably overriding the Encode::Encoding defaults -- confirm there).
sub needs_lines { return 1 }
sub perlio_ok   { return 0 }
45
# Encoded-word grammar (RFC 2047, with the RFC 2231 "*language" suffix).
my $re_charset      = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
my $re_language     = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
my $re_encoding     = qr/[QqBb]/;
my $re_encoded_text = qr/[^\?]*/;
my $re_encoded_word = qr/=\?$re_charset(?:\*$re_language)?\?$re_encoding\?$re_encoded_text\?=/;

# Capturing variants: (charset, language, "encoding?text") and
# (charset, language, encoding, text).
my $re_capture_encoded_word       = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding\?$re_encoded_text)\?=/;
my $re_capture_encoded_word_split = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/;

# Strict mode additionally validates the payload: well-formed Base64 for
# 'B' words and valid quoted-printable codes for 'Q' words.
my $re_encoding_strict_b     = qr/[Bb]/;
my $re_encoding_strict_q     = qr/[Qq]/;
my $re_encoded_text_strict_b = qr/(?:[0-9A-Za-z\+\/]{4})*(?:[0-9A-Za-z\+\/]{2}==|[0-9A-Za-z\+\/]{3}=|)/;
# NOTE: the literal part is printable US-ASCII except ?, =, SPACE and TAB
my $re_encoded_text_strict_q = qr/(?:[\x21-\x3C\x3E\x40-\x7E]|=[0-9A-Fa-f]{2})*/;
my $re_encoded_word_strict         = qr/=\?$re_charset(?:\*$re_language)?\?(?:$re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;
my $re_capture_encoded_word_strict = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;

# Any single line terminator: CRLF, lone CR or lone LF.
my $re_newline = qr/(?:\r\n|[\r\n])/;

# Strict mode requires encoded words to be delimited by spaces or tabs
# (or a folded newline); only inside comments may the separator between
# a word and the comment's round brackets be omitted.
my $re_word_begin_strict = qr/(?:(?:[ \t]|\A)\(?|(?:[^\\]|\A)\)\()/;
my $re_word_sep_strict   = qr/(?:$re_newline?[ \t])+/;
my $re_word_end_strict   = qr/(?:\)\(|\)?(?:$re_newline?[ \t]|\z))/;

# A run of consecutive encoded words: (leading context)(words)(trailing).
my $re_match        = qr/()((?:$re_encoded_word\s*)*$re_encoded_word)()/;
my $re_match_strict = qr/($re_word_begin_strict)((?:$re_encoded_word_strict$re_word_sep_strict)*$re_encoded_word_strict)(?=$re_word_end_strict)/;

# One encoded word together with the separator that follows it.
my $re_capture        = qr/$re_capture_encoded_word(?:\s*)?/;
my $re_capture_strict = qr/$re_capture_encoded_word_strict$re_word_sep_strict?/;

# Set to a true value to make decode() use the strict patterns above.
our $STRICT_DECODE = 0;
78
# Decode the MIME encoded-words found in $str and return the resulting string.
#
# Parameters:
#   $obj - encoding object; its decode_b / decode_q entries select which
#          encoded-word variants ('B' and/or 'Q') this object may decode
#   $str - header text to decode; undef yields undef
#   $chk - optional Encode CHECK value (flags or code reference)
#
# The input is consumed one logical (possibly folded) line at a time.  Text
# outside encoded words is copied to the output with CR/LF removed from
# folded lines; the terminator that ends a logical line is kept.
#
# When an encoded word cannot be decoded (unknown charset, or a variant this
# object does not handle) and $chk is a plain flag value:
#   DIE_ON_ERR    - croak
#   WARN_ON_ERR   - carp
#   RETURN_ON_ERR - stop; the unprocessed remainder stays in $str
# Without RETURN_ON_ERR the original encoded word is copied to the output,
# separated from preceding text by a space.  If the charset decoder itself
# fails, RETURN_ON_ERR stops as above, otherwise the raw encoded text is
# copied to the output.
#
# Side effect: when $chk is a true non-reference value without LEAVE_SRC, the
# caller's string ($_[1]) is replaced by whatever was left unprocessed (empty
# unless decoding stopped early).
#
# Behaviour depends on $STRICT_DECODE: in non strict mode missing Base64
# padding is repaired and adjacent encoded words with the same charset,
# language and encoding are merged before decoding.
sub decode($$;$) {
    my ($obj, $str, $chk) = @_;
    return undef unless defined $str;

    my $re_match_decode = $STRICT_DECODE ? $re_match_strict : $re_match;
    my $re_capture_decode = $STRICT_DECODE ? $re_capture_strict : $re_capture;

    my $stop = 0;
    my $output = substr($str, 0, 0); # to propagate taintedness

    # decode each line separately, match whole continuous folded line at one call
    # (the replacement value is what is put back into $str: empty when the
    # line was fully processed, the unprocessed rest when stopping)
    1 while not $stop and $str =~ s{^((?:[^\r\n]*(?:$re_newline[ \t])?)*)($re_newline)?}{

        my $line = $1;
        my $sep = defined $2 ? $2 : '';

        # nothing matched at all: the input is exhausted
        $stop = 1 unless length($line) or length($sep);

        # in non strict mode append missing '=' padding characters for b words
        # fixes below concatenation of consecutive encoded mime words
        1 while not $STRICT_DECODE and $line =~ s/(=\?$re_charset(?:\*$re_language)?\?[Bb]\?)((?:[^\?]{4})*[^\?]{1,3})(\?=)/$1.$2.('='x(4-length($2)%4)).$3/se;

        # NOTE: this code partially could break $chk support
        # in non strict mode concat consecutive encoded mime words with same charset, language and encoding
        # fixes breaking inside multi-byte characters
        1 while not $STRICT_DECODE and $line =~ s/$re_capture_encoded_word_split\s*=\?\1\2\?\3\?($re_encoded_text)\?=/=\?$1$2\?$3\?$4$5\?=/so;

        # process sequence of encoded MIME words at once
        1 while not $stop and $line =~ s{^(.*?)$re_match_decode}{

            # plain text before the words plus the strict-mode word prefix
            my $begin = $1 . $2;
            my $words = $3;

            $begin =~ tr/\r\n//d;
            $output .= $begin;

            # decode one MIME word
            1 while not $stop and $words =~ s{^(.*?)($re_capture_decode)}{

                $output .= $1;
                my $orig = $2;
                my $charset = $3;
                # LIMIT 2 keeps an empty encoded text as '' (without it split
                # drops the trailing empty field and $text would be undef,
                # causing uninitialized-value warnings below)
                my ($mime_enc, $text) = split /\?/, $5, 2;

                $text =~ tr/\r\n//d;

                my $enc = Encode::find_mime_encoding($charset);

                # in non strict mode allow also perl encoding aliases
                if ( not defined $enc and not $STRICT_DECODE ) {
                    # make sure that decoded string will be always strict UTF-8
                    $charset = 'UTF-8' if lc($charset) eq 'utf8';
                    $enc = Encode::find_encoding($charset);
                }

                # every branch ends with the replacement value for this word:
                # the original word when stopping, otherwise nothing
                if ( not defined $enc ) {
                    Carp::croak qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
                    Carp::carp qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
                    $stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                    $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
                    $stop ? $orig : '';
                } else {
                    if ( uc($mime_enc) eq 'B' and $obj->{decode_b} ) {
                        my $decoded = _decode_b($enc, $text, $chk);
                        $stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                        $output .= (defined $decoded ? $decoded : $text) unless $stop;
                        $stop ? $orig : '';
                    } elsif ( uc($mime_enc) eq 'Q' and $obj->{decode_q} ) {
                        my $decoded = _decode_q($enc, $text, $chk);
                        $stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                        $output .= (defined $decoded ? $decoded : $text) unless $stop;
                        $stop ? $orig : '';
                    } else {
                        Carp::croak qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
                        Carp::carp qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
                        $stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                        $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
                        $stop ? $orig : '';
                    }
                }

            }se;

            # whatever is left of the word sequence is plain text
            if ( not $stop ) {
                $output .= $words;
                $words = '';
            }

            $words;

        }se;

        # remainder of the line after the last encoded word
        if ( not $stop ) {
            $line =~ tr/\r\n//d;
            $output .= $line . $sep;
            $line = '';
            $sep = '';
        }

        $line . $sep;

    }se;

    $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
    return $output;
}
185
# Decode the Base64 text of one 'B' encoded-word and hand the resulting
# octets to _decode_octets() for charset decoding.
#   $enc  - encoding object for the word's charset
#   $text - Base64 payload
#   $chk  - CHECK value passed through to _decode_octets()
sub _decode_b {
    my ($enc, $text, $chk) = @_;

    # strict mode: the payload is decoded in one piece
    return _decode_octets($enc, MIME::Base64::decode($text), $chk)
        if $STRICT_DECODE;

    # MIME::Base64::decode ignores everything after a '=' padding character,
    # so in non strict mode cut the text after each run of padding and
    # decode the pieces one by one
    my $octets = '';
    for my $piece ( split /(?<==)(?=[^=])/, $text ) {
        $octets .= MIME::Base64::decode($piece);
    }
    return _decode_octets($enc, $octets, $chk);
}
195
# Decode the text of one 'Q' encoded-word: '_' stands for a space and "=XX"
# for the octet with hexadecimal value XX.  The resulting octets go to
# _decode_octets() for charset decoding.
sub _decode_q {
    my ($enc, $text, $chk) = @_;
    $text =~ tr/_/ /;
    $text =~ s/=([0-9A-Fa-f]{2})/pack('C', hex($1))/eg;
    return _decode_octets($enc, $text, $chk);
}
202
# Decode $octets with the encoding object $enc.
#   $chk - CHECK value; undef is treated as 0.  For a plain flag value the
#          LEAVE_SRC bit is cleared before calling the decoder, so that
#          the local $octets can be inspected afterwards.
# Returns the decoded string, or undef when $chk is a true non-reference
# value and $octets is not empty after decoding.
sub _decode_octets {
    my ($enc, $octets, $chk) = @_;

    $chk = 0 if not defined $chk;
    if ( not ref $chk and $chk ) {
        $chk &= ~Encode::LEAVE_SRC;
    }

    my $decoded = $enc->decode($octets, $chk);

    # leftover input signals that the decoder did not consume everything
    my $incomplete = ( not ref $chk and $chk and $octets ne '' );
    return $incomplete ? undef : $decoded;
}
211
# Encode $str into folded MIME encoded-words.
#   $obj - encoding object
#   $str - string to encode; undef yields undef
#   $chk - optional Encode CHECK value
# When $chk is a true non-reference value without LEAVE_SRC, the caller's
# string ($_[1]) is replaced by $str as left behind by _encode_string().
sub encode($$;$) {
    my ($obj, $str, $chk) = @_;
    defined $str or return undef;

    # $str is passed as a variable: _encode_string() may write the part it
    # did not process back into it
    my $words  = $obj->_encode_string($str, $chk);
    my $folded = $obj->_fold_line($words);

    if ( not ref $chk and $chk and !($chk & Encode::LEAVE_SRC) ) {
        $_[1] = $str;
    }

    # the empty substr() is appended to propagate taintedness
    return $folded . substr($str, 0, 0);
}
219
# Fold $line so that, where whitespace allows it, each output line holds at
# most $obj->{bpl} characters.  A fold replaces nothing: CRLF is inserted in
# front of the whitespace character at which the line is broken.  A stretch
# without whitespace that is longer than the limit is emitted unbroken.
sub _fold_line {
    my ($obj, $line) = @_;
    my $limit = $obj->{bpl};
    my @pieces;

    while ( length($line) ) {
        # longest prefix within the limit that ends at whitespace or at the
        # end of the string
        if ( $line =~ s/^(.{0,$limit})(\s|\z)// ) {
            my ($chunk, $ws) = ($1, $2);
            push @pieces, $chunk;
            push @pieces, "\r\n" . $ws if length($line);
            next;
        }

        # too long to fit: break at the first whitespace instead
        if ( $line =~ s/(\s)(.*)$// ) {
            my ($ws, $rest) = ($1, $2);
            push @pieces, $line;
            $line = $rest;
            push @pieces, "\r\n" . $ws if length($line);
            next;
        }

        # no whitespace left at all
        push @pieces, $line;
        last;
    }

    return join('', @pieces);
}
241
# Encode $str character by character into the object's charset and pack the
# octets into MIME encoded-words, none longer than the word limit
# ($obj->{bpl}, capped at 76).  The words are returned joined by one space.
#   $chk - optional Encode CHECK value; for plain flag values LEAVE_SRC is
#          added for the per-character encoder calls
# If a character encodes to nothing, it is put back in front of the
# remaining input and encoding stops there.  When $chk is a true
# non-reference value without LEAVE_SRC, the caller's string ($_[1]) is
# replaced by the unprocessed remainder.
sub _encode_string {
    my ($obj, $str, $chk) = @_;

    my $max_len = $obj->{bpl};
    $max_len = 76 if $max_len > 76;

    my $codec = Encode::find_mime_encoding($obj->{charset});

    my $codec_chk = defined $chk ? $chk : 0;
    if ( not ref $codec_chk and $codec_chk ) {
        $codec_chk |= Encode::LEAVE_SRC;
    }

    my @words;
    my $pending = '';    # octets collected for the word being built

    while (1) {
        my $chr = substr($str, 0, 1, '');
        last unless length($chr);

        my $seq = $codec->encode($chr, $codec_chk);
        if ( not length($seq) ) {
            substr($str, 0, 0, $chr);
            last;
        }

        # close the current word if this character would not fit into it
        if ( $obj->_encoded_word_len($pending . $seq) > $max_len ) {
            push @words, $obj->_encode_word($pending);
            $pending = '';
        }
        $pending .= $seq;
    }

    if ( length($pending) ) {
        push @words, $obj->_encode_word($pending);
    }

    if ( not ref $chk and $chk and !($chk & Encode::LEAVE_SRC) ) {
        $_[1] = $str;
    }

    return join(' ', @words);
}
267
# Build one encoded-word, "=?charset?encoding?text?=", from $octets using
# the object's charset and its 'B' or 'Q' variant.
sub _encode_word {
    my ($obj, $octets) = @_;
    my ($charset, $variant) = @{$obj}{qw(charset encode)};

    my $payload;
    if ( $variant eq 'B' ) {
        $payload = _encode_b($octets);
    }
    else {
        $payload = _encode_q($octets);
    }

    return join('?', '=', $charset, $variant, $payload, '=');
}
275
# Length of the encoded-word that _encode_word() would produce for $octets:
# the fixed "=?charset?encoding??=" frame plus the encoded text length.
sub _encoded_word_len {
    my ($obj, $octets) = @_;
    my ($charset, $variant) = @{$obj}{qw(charset encode)};

    my $payload_len;
    if ( $variant eq 'B' ) {
        $payload_len = _encoded_b_len($octets);
    }
    else {
        $payload_len = _encoded_q_len($octets);
    }

    my $frame = "=?$charset?$variant??=";
    return length($frame) + $payload_len;
}
283
# Base64-encode $octets into one unbroken string: the empty second argument
# is passed as the line ending so that no newlines are added.
sub _encode_b {
    my $octets = shift;
    my $eol    = '';
    return MIME::Base64::encode($octets, $eol);
}
288
# Number of characters the Base64 encoding of $octets occupies: every
# started group of 3 octets becomes 4 characters (padding included).
#
# int() is required: without it the division is fractional and the result
# is too large (e.g. 6.67 instead of 4 for three octets, 2.67 for none).
# For the objects registered by this module (UTF-8, bpl 75) the word split
# point is the same with either formula.
sub _encoded_b_len {
    my ($octets) = @_;
    return int( ( length($octets) + 2 ) / 3 ) * 4;
}
293
# Any character that may not appear literally in 'Q' encoded text.  SPACE is
# listed as allowed here because it is turned into '_' separately.
my $re_invalid_q_char = qr/[^0-9A-Za-z !*+\-\/]/;

# 'Q'-encode $octets: every disallowed character becomes "=XX" (uppercase
# hex of its value) and every space becomes '_'.
sub _encode_q {
    my ($octets) = @_;
    $octets =~ s/($re_invalid_q_char)/join('', map { sprintf('=%02X', $_) } unpack('C*', $1))/eg;
    $octets =~ tr/ /_/;
    return $octets;
}
304
# Number of characters the 'Q' encoding of $octets occupies: each character
# matched by $re_invalid_q_char takes 3 characters ("=XX"), every other
# character takes 1.
sub _encoded_q_len {
    my ($octets) = @_;
    my @escaped = $octets =~ /$re_invalid_q_char/g;
    return length($octets) + 2 * scalar(@escaped);
}
310
3111;
312__END__
313
314=head1 NAME
315
316Encode::MIME::Header -- MIME encoding for an unstructured email header
317
318=head1 SYNOPSIS
319
320    use Encode qw(encode decode);
321
322    my $mime_str = encode("MIME-Header", "Sample:Text \N{U+263A}");
323    # $mime_str is "=?UTF-8?B?U2FtcGxlOlRleHQg4pi6?="
324
325    my $mime_q_str = encode("MIME-Q", "Sample:Text \N{U+263A}");
326    # $mime_q_str is "=?UTF-8?Q?Sample=3AText_=E2=98=BA?="
327
328    my $str = decode("MIME-Header",
329        "=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\r\n " .
330        "=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?="
331    );
332    # $str is "If you can read this you understand the example."
333
334    use Encode qw(decode :fallbacks);
335    use Encode::MIME::Header;
336    local $Encode::MIME::Header::STRICT_DECODE = 1;
337    my $strict_string = decode("MIME-Header", $mime_string, FB_CROAK);
338    # use strict decoding and croak on errors
339
340=head1 ABSTRACT
341
342This module implements L<RFC 2047|https://tools.ietf.org/html/rfc2047> MIME
343encoding for an unstructured field body of the email header.  It can also be
344used for L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token.  However,
345it cannot be used directly for the whole header with the field name or for the
346structured header fields like From, To, Cc, Message-Id, etc...  There are 3
347encoding names supported by this module: C<MIME-Header>, C<MIME-B> and
348C<MIME-Q>.
349
350=head1 DESCRIPTION
351
352Decode method takes an unstructured field body of the email header (or
353L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token) as its input and
354decodes each MIME encoded-word from input string to a sequence of bytes
355according to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
356L<RFC 2231|https://tools.ietf.org/html/rfc2231>.  Subsequently, each sequence
357of bytes with the corresponding MIME charset is decoded with
358L<the Encode module|Encode> and finally, one output string is returned.  Text
359parts of the input string which do not contain MIME encoded-word stay
360unmodified in the output string.  Folded newlines between two consecutive MIME
361encoded-words are discarded, others are preserved in the output string.
362C<MIME-B> can decode Base64 variant, C<MIME-Q> can decode Quoted-Printable
363variant and C<MIME-Header> can decode both of them.  If L<Encode module|Encode>
364does not support particular MIME charset or chosen variant then an action based
365on L<CHECK flags|Encode/Handling Malformed Data> is performed (by default, the
366MIME encoded-word is not decoded).
367
368Encode method takes a scalar string as its input and uses
369L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for encoding it to UTF-8
370bytes.  Then a sequence of UTF-8 bytes is encoded into MIME encoded-words
371(C<MIME-Header> and C<MIME-B> use a Base64 variant while C<MIME-Q> uses a
372Quoted-Printable variant) where each MIME encoded-word is limited to 75
373characters.  MIME encoded-words are separated by C<CRLF SPACE> and joined to
374one output string.  Output string is suitable for unstructured field body of
375the email header.
376
377Both encode and decode methods propagate
378L<CHECK flags|Encode/Handling Malformed Data> when encoding and decoding the
379MIME charset.
380
381=head1 BUGS
382
383Versions prior to 2.22 (part of Encode 2.83) have a malfunctioning decoder
384and encoder.  The MIME encoder infamously inserted additional spaces or
385discarded white spaces between consecutive MIME encoded-words, which led to
386invalid MIME headers produced by this module.  The MIME decoder had a tendency
387to discard white spaces, incorrectly interpret data or attempt to decode Base64
388MIME encoded-words as Quoted-Printable.  These problems were fixed in version
2.22.  It is highly recommended not to use any version prior to 2.22!
390
391Versions prior to 2.24 (part of Encode 2.87) ignored
392L<CHECK flags|Encode/Handling Malformed Data>.  The MIME encoder used
393L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for input Unicode
strings which could lead to invalid UTF-8 sequences.  The MIME decoder also used
395L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> decoder and additionally
396called the decode method with a C<Encode::FB_PERLQQ> flag (thus user-specified
397L<CHECK flags|Encode/Handling Malformed Data> were ignored).  Moreover, it
automatically croaked when a MIME encoded-word contained an unknown encoding.
399Since version 2.24, this module uses
400L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder and decoder.  And
401L<CHECK flags|Encode/Handling Malformed Data> are correctly propagated.
402
403Since version 2.22 (part of Encode 2.83), the MIME encoder should be fully
404compliant to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
405L<RFC 2231|https://tools.ietf.org/html/rfc2231>.  Due to the aforementioned
406bugs in previous versions of the MIME encoder, there is a I<less strict>
407compatible mode for the MIME decoder which is used by default.  It should be
408able to decode MIME encoded-words encoded by pre 2.22 versions of this module.
409However, note that this is not correct according to
410L<RFC 2047|https://tools.ietf.org/html/rfc2047>.
411
In the default I<not strict> mode, the MIME decoder attempts to decode every substring
413which looks like a MIME encoded-word.  Therefore, the MIME encoded-words do not
414need to be separated by white space.  To enforce a correct I<strict> mode, set
415variable C<$Encode::MIME::Header::STRICT_DECODE> to 1 e.g. by localizing:
416
417  use Encode::MIME::Header;
418  local $Encode::MIME::Header::STRICT_DECODE = 1;
419
420=head1 AUTHORS
421
422Pali E<lt>pali@cpan.orgE<gt>
423
424=head1 SEE ALSO
425
426L<Encode>,
427L<RFC 822|https://tools.ietf.org/html/rfc822>,
428L<RFC 2047|https://tools.ietf.org/html/rfc2047>,
429L<RFC 2231|https://tools.ietf.org/html/rfc2231>
430
431=cut
432