package Encode::MIME::Header;
use strict;
use warnings;

our $VERSION = do { my @r = ( q$Revision: 2.29 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };

use Carp ();
use Encode ();
use MIME::Base64 ();

# Default attributes copied into every encoding object defined below.
my %seed = (
    decode_b => 1,       # decodes 'B' encoding ?
    decode_q => 1,       # decodes 'Q' encoding ?
    encode   => 'B',     # encode with 'B' or 'Q' ?
    charset  => 'UTF-8', # encode charset
    bpl      => 75,      # bytes per line
);

# The three encoding objects registered with Encode.  They differ only in
# which encodings they decode and which one they use when encoding.
my @objs;

push @objs, bless {
    %seed,
    Name     => 'MIME-Header',
} => __PACKAGE__;

push @objs, bless {
    %seed,
    decode_q => 0,
    Name     => 'MIME-B',
} => __PACKAGE__;

push @objs, bless {
    %seed,
    decode_b => 0,
    encode   => 'Q',
    Name     => 'MIME-Q',
} => __PACKAGE__;

Encode::define_encoding($_, $_->{Name}) foreach @objs;

use parent qw(Encode::Encoding);

# Constant-returning methods overriding the Encode::Encoding parent class.
sub needs_lines { 1 }
sub perlio_ok   { 0 }

# RFC 2047 and RFC 2231 grammar
my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
my $re_encoding = qr/[QqBb]/;
my $re_encoded_text = qr/[^\?]*/;
my $re_encoded_word = qr/=\?$re_charset(?:\*$re_language)?\?$re_encoding\?$re_encoded_text\?=/;
my $re_capture_encoded_word = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding\?$re_encoded_text)\?=/;
my $re_capture_encoded_word_split = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/;

# in strict mode check also for valid base64 characters and also for valid quoted printable codes
my $re_encoding_strict_b = qr/[Bb]/;
my $re_encoding_strict_q = qr/[Qq]/;
my $re_encoded_text_strict_b = qr/(?:[0-9A-Za-z\+\/]{4})*(?:[0-9A-Za-z\+\/]{2}==|[0-9A-Za-z\+\/]{3}=|)/;
my $re_encoded_text_strict_q = qr/(?:[\x21-\x3C\x3E\x40-\x7E]|=[0-9A-Fa-f]{2})*/; # NOTE: first part are printable US-ASCII except ?, =, SPACE and TAB
my $re_encoded_word_strict = qr/=\?$re_charset(?:\*$re_language)?\?(?:$re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;
my $re_capture_encoded_word_strict = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;

my $re_newline = qr/(?:\r\n|[\r\n])/;

# in strict mode encoded words must be always separated by spaces or tabs (or folded newline)
# except in comments when separator between words and comment round brackets can be omitted
my $re_word_begin_strict = qr/(?:(?:[ \t]|\A)\(?|(?:[^\\]|\A)\)\()/;
my $re_word_sep_strict = qr/(?:$re_newline?[ \t])+/;
my $re_word_end_strict = qr/(?:\)\(|\)?(?:$re_newline?[ \t]|\z))/;

# Both match patterns expose three groups: word prefix, the run of encoded
# words, and (non-strict only) an empty trailing group.
my $re_match = qr/()((?:$re_encoded_word\s*)*$re_encoded_word)()/;
my $re_match_strict = qr/($re_word_begin_strict)((?:$re_encoded_word_strict$re_word_sep_strict)*$re_encoded_word_strict)(?=$re_word_end_strict)/;

# Both capture patterns expose: charset, "*language" (possibly empty) and
# "encoding?encoded-text" of one encoded word, plus trailing separators.
my $re_capture = qr/$re_capture_encoded_word(?:\s*)?/;
my $re_capture_strict = qr/$re_capture_encoded_word_strict$re_word_sep_strict?/;

# When true, decode() uses the strict patterns and skips the relaxed-mode
# repairs (padding fix-up, joining of adjacent words, perl charset aliases).
our $STRICT_DECODE = 0;

# decode($obj, $str [, $chk])
#
# Decodes the MIME encoded-words found in $str and returns the resulting
# string; text outside encoded-words is copied to the output.  Returns undef
# when $str is undefined.
#
# $obj->{decode_b} and $obj->{decode_q} select which of the 'B' and 'Q'
# encodings this object decodes.
#
# $chk is the Encode CHECK value.  When it is a true non-reference value:
#   - DIE_ON_ERR makes an unknown charset or unsupported encoding croak,
#   - WARN_ON_ERR makes them carp,
#   - RETURN_ON_ERR stops decoding at the offending encoded-word,
#   - unless LEAVE_SRC is set, the caller's $str argument is overwritten
#     with the part of the input that was not consumed.
#
# The work is done by three nested "1 while s{...}{...}e" loops (folded line,
# run of encoded words, single encoded word).  Each replacement block returns
# the text to leave behind: empty when it was consumed, the original text
# when $stop was raised.
sub decode($$;$) {
    my ($obj, $str, $chk) = @_;
    return undef unless defined $str;

    my $re_match_decode = $STRICT_DECODE ? $re_match_strict : $re_match;
    my $re_capture_decode = $STRICT_DECODE ? $re_capture_strict : $re_capture;

    my $stop = 0;
    my $output = substr($str, 0, 0); # to propagate taintedness

    # decode each line separately, match whole continuous folded line at one call
    1 while not $stop and $str =~ s{^((?:[^\r\n]*(?:$re_newline[ \t])?)*)($re_newline)?}{

        my $line = $1;
        my $sep = defined $2 ? $2 : '';

        # nothing matched at all: the input is exhausted
        $stop = 1 unless length($line) or length($sep);

        # in non strict mode append missing '=' padding characters for b words
        # fixes below concatenation of consecutive encoded mime words
        1 while not $STRICT_DECODE and $line =~ s/(=\?$re_charset(?:\*$re_language)?\?[Bb]\?)((?:[^\?]{4})*[^\?]{1,3})(\?=)/$1.$2.('='x(4-length($2)%4)).$3/se;

        # NOTE: this code partially could break $chk support
        # in non strict mode concat consecutive encoded mime words with same charset, language and encoding
        # fixes breaking inside multi-byte characters
        1 while not $STRICT_DECODE and $line =~ s/$re_capture_encoded_word_split\s*=\?\1\2\?\3\?($re_encoded_text)\?=/=\?$1$2\?$3\?$4$5\?=/so;

        # process sequence of encoded MIME words at once
        1 while not $stop and $line =~ s{^(.*?)$re_match_decode}{

            # $1 is the plain text before the words, $2 the word prefix
            # captured by the match pattern, $3 the run of encoded words
            my $begin = $1 . $2;
            my $words = $3;

            $begin =~ tr/\r\n//d;
            $output .= $begin;

            # decode one MIME word
            1 while not $stop and $words =~ s{^(.*?)($re_capture_decode)}{

                # $2 is the whole encoded word, $3 its charset and
                # $5 the "encoding?encoded-text" part
                $output .= $1;
                my $orig = $2;
                my $charset = $3;
                my ($mime_enc, $text) = split /\?/, $5;

                $text =~ tr/\r\n//d;

                my $enc = Encode::find_mime_encoding($charset);

                # in non strict mode allow also perl encoding aliases
                if ( not defined $enc and not $STRICT_DECODE ) {
                    # make sure that decoded string will be always strict UTF-8
                    $charset = 'UTF-8' if lc($charset) eq 'utf8';
                    $enc = Encode::find_encoding($charset);
                }

                if ( not defined $enc ) {
                    Carp::croak qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
                    Carp::carp qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
                    $stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                    $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
                    $stop ? $orig : '';
                } else {
                    if ( uc($mime_enc) eq 'B' and $obj->{decode_b} ) {
                        my $decoded = _decode_b($enc, $text, $chk);
                        $stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                        # a failed decode without RETURN_ON_ERR passes the raw encoded text through
                        $output .= (defined $decoded ? $decoded : $text) unless $stop;
                        $stop ? $orig : '';
                    } elsif ( uc($mime_enc) eq 'Q' and $obj->{decode_q} ) {
                        my $decoded = _decode_q($enc, $text, $chk);
                        $stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                        $output .= (defined $decoded ? $decoded : $text) unless $stop;
                        $stop ? $orig : '';
                    } else {
                        # encoding is valid but disabled for this object
                        Carp::croak qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
                        Carp::carp qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
                        $stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
                        $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
                        $stop ? $orig : '';
                    }
                }

            }se;

            if ( not $stop ) {
                $output .= $words;
                $words = '';
            }

            $words;

        }se;

        if ( not $stop ) {
            $line =~ tr/\r\n//d;
            $output .= $line . $sep;
            $line = '';
            $sep = '';
        }

        $line . $sep;

    }se;

    $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
    return $output;
}

# _decode_b($enc, $text, $chk)
#
# Base64-decodes $text and decodes the resulting octets with the encoding
# object $enc.  Returns whatever _decode_octets() returns.
sub _decode_b {
    my ($enc, $text, $chk) = @_;
    # MIME::Base64::decode ignores everything after a '=' padding character
    # in non strict mode split string after each sequence of padding characters and decode each substring
    my $octets = $STRICT_DECODE ?
        MIME::Base64::decode($text) :
        join('', map { MIME::Base64::decode($_) } split /(?<==)(?=[^=])/, $text);
    return _decode_octets($enc, $octets, $chk);
}

# _decode_q($enc, $text, $chk)
#
# Undoes the 'Q' encoding of $text ('_' becomes a space, '=XX' becomes the
# octet with hex value XX) and decodes the resulting octets with $enc.
# Returns whatever _decode_octets() returns.
sub _decode_q {
    my ($enc, $text, $chk) = @_;
    $text =~ s/_/ /go;
    $text =~ s/=([0-9A-Fa-f]{2})/pack('C', hex($1))/ego;
    return _decode_octets($enc, $text, $chk);
}

# _decode_octets($enc, $octets, $chk)
#
# Decodes $octets with $enc->decode and returns the decoded string.
# Returns undef when $chk is a true non-reference value and $octets is
# not empty after the call.
sub _decode_octets {
    my ($enc, $octets, $chk) = @_;
    $chk = 0 unless defined $chk;
    # LEAVE_SRC is cleared so that the decoder may consume $octets in place;
    # the leftover check below relies on that (see Encode's CHECK documentation)
    $chk &= ~Encode::LEAVE_SRC if not ref $chk and $chk;
    my $output = $enc->decode($octets, $chk);
    return undef if not ref $chk and $chk and $octets ne '';
    return $output;
}

# encode($obj, $str [, $chk])
#
# Encodes $str into MIME encoded-words and folds the result into lines.
# Returns undef when $str is undefined.  When $chk is a true non-reference
# value without LEAVE_SRC, the caller's $str argument is overwritten with
# the part of the input that was not encoded.
sub encode($$;$) {
    my ($obj, $str, $chk) = @_;
    return undef unless defined $str;
    my $output = $obj->_fold_line($obj->_encode_string($str, $chk));
    $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
    return $output . substr($str, 0, 0); # to propagate taintedness
}

# _fold_line($obj, $line)
#
# Folds $line by inserting "\r\n" in front of whitespace characters and
# returns the folded string.
sub _fold_line {
    my ($obj, $line) = @_;
    my $bpl = $obj->{bpl};
    my $output = '';

    while ( length($line) ) {
        if ( $line =~ s/^(.{0,$bpl})(\s|\z)// ) {
            # at most $bpl characters followed by whitespace or end of string
            $output .= $1;
            $output .= "\r\n" . $2 if length($line);
        } elsif ( $line =~ s/(\s)(.*)$// ) {
            # leading token is longer than $bpl: emit it whole and break at
            # the first whitespace after it
            $output .= $line;
            $line = $2;
            $output .= "\r\n" . $1 if length($line);
        } else {
            # no whitespace left to break at
            $output .= $line;
            last;
        }
    }

    return $output;
}

# _encode_string($obj, $str, $chk)
#
# Encodes $str character by character with the object's charset and packs
# the octets into encoded words, starting a new word whenever the next
# character would push the current one past the word length limit.
# Returns the encoded words joined by single spaces.  Encoding stops at the
# first character for which the encoder returns an empty string; that
# character is put back into $str.  When $chk is a true non-reference value
# without LEAVE_SRC, the caller's $str argument receives what is left.
sub _encode_string {
    my ($obj, $str, $chk) = @_;
    # NOTE(review): RFC 2047 limits an encoded-word to 75 characters while
    # the cap applied here is 76 -- confirm the intent.
    my $wordlen = $obj->{bpl} > 76 ? 76 : $obj->{bpl};
    my $enc = Encode::find_mime_encoding($obj->{charset});
    my $enc_chk = $chk;
    $enc_chk = 0 unless defined $enc_chk;
    # $chr is a temporary, so the encoder never needs to modify its source
    $enc_chk |= Encode::LEAVE_SRC if not ref $enc_chk and $enc_chk;
    my @result = ();
    my $octets = '';
    while ( length( my $chr = substr($str, 0, 1, '') ) ) {
        my $seq = $enc->encode($chr, $enc_chk);
        if ( not length($seq) ) {
            substr($str, 0, 0, $chr);
            last;
        }
        if ( $obj->_encoded_word_len($octets . $seq) > $wordlen ) {
            push @result, $obj->_encode_word($octets);
            $octets = '';
        }
        $octets .= $seq;
    }
    length($octets) and push @result, $obj->_encode_word($octets);
    $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
    return join(' ', @result);
}

# _encode_word($obj, $octets)
#
# Returns $octets wrapped as one encoded word "=?charset?encoding?text?=",
# using the object's charset and its 'B' or 'Q' encoding.
sub _encode_word {
    my ($obj, $octets) = @_;
    my $charset = $obj->{charset};
    my $encode = $obj->{encode};
    my $text = $encode eq 'B' ? _encode_b($octets) : _encode_q($octets);
    return "=?$charset?$encode?$text?=";
}

# _encoded_word_len($obj, $octets)
#
# Returns the length of the encoded word that _encode_word() would build
# for $octets, without building it.
sub _encoded_word_len {
    my ($obj, $octets) = @_;
    my $charset = $obj->{charset};
    my $encode = $obj->{encode};
    my $text_len = $encode eq 'B' ? _encoded_b_len($octets) : _encoded_q_len($octets);
    return length("=?$charset?$encode??=") + $text_len;
}

# _encode_b($octets)
#
# Returns $octets Base64-encoded without any line ending.
sub _encode_b {
    my ($octets) = @_;
    return MIME::Base64::encode($octets, '');
}

# _encoded_b_len($octets)
#
# Returns the length of _encode_b($octets).  Base64 emits four characters
# for every started group of three octets, so the group count has to be
# truncated to an integer; plain floating point division would report
# fractional lengths (for example 5.33 for two octets instead of 4).
sub _encoded_b_len {
    my ($octets) = @_;
    return int( ( length($octets) + 2 ) / 3 ) * 4;
}

# Octets that the 'Q' encoder has to write as '=XX'.
my $re_invalid_q_char = qr/[^0-9A-Za-z !*+\-\/]/;

# _encode_q($octets)
#
# Returns $octets 'Q'-encoded: every octet matching $re_invalid_q_char is
# written as '=XX' (upper-case hex) and every space as '_'.
sub _encode_q {
    my ($octets) = @_;
    $octets =~ s{($re_invalid_q_char)}{
        join('', map { sprintf('=%02X', $_) } unpack('C*', $1))
    }egox;
    $octets =~ s/ /_/go;
    return $octets;
}

# _encoded_q_len($octets)
#
# Returns the length of _encode_q($octets): three characters for every
# octet that needs escaping and one for every other octet.
sub _encoded_q_len {
    my ($octets) = @_;
    my $invalid_count = () = $octets =~ /$re_invalid_q_char/sgo;
    return ( $invalid_count * 3 ) + ( length($octets) - $invalid_count );
}

1;
__END__

=head1 NAME

Encode::MIME::Header -- MIME encoding for an unstructured email header

=head1 SYNOPSIS

    use Encode qw(encode decode);

    my $mime_str = encode("MIME-Header", "Sample:Text \N{U+263A}");
    # $mime_str is "=?UTF-8?B?U2FtcGxlOlRleHQg4pi6?="

    my $mime_q_str = encode("MIME-Q", "Sample:Text \N{U+263A}");
    # $mime_q_str is "=?UTF-8?Q?Sample=3AText_=E2=98=BA?="

    my $str = decode("MIME-Header",
        "=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\r\n " .
        "=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?="
    );
    # $str is "If you can read this you understand the example."

    use Encode qw(decode :fallbacks);
    use Encode::MIME::Header;
    local $Encode::MIME::Header::STRICT_DECODE = 1;
    my $strict_string = decode("MIME-Header", $mime_string, FB_CROAK);
    # use strict decoding and croak on errors

=head1 ABSTRACT

This module implements L<RFC 2047|https://tools.ietf.org/html/rfc2047> MIME
encoding for an unstructured field body of the email header. It can also be
used for the L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token.
However, it cannot be used directly for the whole header with the field name
or for the structured header fields like From, To, Cc, Message-Id, etc...
There are 3 encoding names supported by this module: C<MIME-Header>,
C<MIME-B> and C<MIME-Q>.

=head1 DESCRIPTION

The decode method takes an unstructured field body of the email header (or
L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token) as its input and
decodes each MIME encoded-word from the input string to a sequence of bytes
according to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Subsequently, each sequence
of bytes with the corresponding MIME charset is decoded with
L<the Encode module|Encode> and finally, one output string is returned. Text
parts of the input string which do not contain a MIME encoded-word stay
unmodified in the output string. Folded newlines between two consecutive MIME
encoded-words are discarded, others are preserved in the output string.
C<MIME-B> can decode the Base64 variant, C<MIME-Q> can decode the
Quoted-Printable variant and C<MIME-Header> can decode both of them.
If the L<Encode module|Encode> does not support a particular MIME charset or
the chosen variant, then an action based on
L<CHECK flags|Encode/Handling Malformed Data> is performed (by default, the
MIME encoded-word is not decoded).

The encode method takes a scalar string as its input and uses the
L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for encoding it to UTF-8
bytes. Then a sequence of UTF-8 bytes is encoded into MIME encoded-words
(C<MIME-Header> and C<MIME-B> use a Base64 variant while C<MIME-Q> uses a
Quoted-Printable variant) where each MIME encoded-word is limited to 75
characters. MIME encoded-words are separated by C<CRLF SPACE> and joined to
one output string. The output string is suitable for an unstructured field
body of the email header.

Both encode and decode methods propagate
L<CHECK flags|Encode/Handling Malformed Data> when encoding and decoding the
MIME charset.

=head1 BUGS

Versions prior to 2.22 (part of Encode 2.83) have a malfunctioning decoder
and encoder. The MIME encoder infamously inserted additional spaces or
discarded white spaces between consecutive MIME encoded-words, which led to
invalid MIME headers produced by this module. The MIME decoder had a tendency
to discard white spaces, incorrectly interpret data or attempt to decode Base64
MIME encoded-words as Quoted-Printable. These problems were fixed in version
2.22. It is highly recommended not to use any version prior to 2.22!

Versions prior to 2.24 (part of Encode 2.87) ignored
L<CHECK flags|Encode/Handling Malformed Data>. The MIME encoder used a
L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for input Unicode
strings which could lead to invalid UTF-8 sequences. The MIME decoder also
used a L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> decoder and
additionally called the decode method with a C<Encode::FB_PERLQQ> flag (thus
user-specified L<CHECK flags|Encode/Handling Malformed Data> were ignored).
Moreover, it automatically croaked when a MIME encoded-word contained an
unknown encoding. Since version 2.24, this module uses the
L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder and decoder, and
L<CHECK flags|Encode/Handling Malformed Data> are correctly propagated.

Since version 2.22 (part of Encode 2.83), the MIME encoder should be fully
compliant with L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Due to the aforementioned
bugs in previous versions of the MIME encoder, there is a I<less strict>
compatible mode for the MIME decoder which is used by default. It should be
able to decode MIME encoded-words encoded by pre-2.22 versions of this module.
However, note that this is not correct according to
L<RFC 2047|https://tools.ietf.org/html/rfc2047>.

In the default I<not strict> mode the MIME decoder attempts to decode every
substring which looks like a MIME encoded-word. Therefore, the MIME
encoded-words do not need to be separated by white space. To enforce the
correct I<strict> mode, set the variable
C<$Encode::MIME::Header::STRICT_DECODE> to 1, e.g. by localizing:

    use Encode::MIME::Header;
    local $Encode::MIME::Header::STRICT_DECODE = 1;

=head1 AUTHORS

Pali E<lt>pali@cpan.orgE<gt>

=head1 SEE ALSO

L<Encode>,
L<RFC 822|https://tools.ietf.org/html/rfc822>,
L<RFC 2047|https://tools.ietf.org/html/rfc2047>,
L<RFC 2231|https://tools.ietf.org/html/rfc2231>

=cut