1# Common subroutines and constants, called by .t files in this directory that 2# deal with UTF-8 3 4# The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl 5# because that uses the same functions we are testing here. So UTF-EBCDIC 6# strings are hard-coded as I8 strings in this file instead, and we use the 7# translation functions to/from I8 from that file instead. 8 9sub isASCII { ord "A" == 65 } 10 11sub display_bytes_no_quotes { 12 use bytes; 13 my $string = shift; 14 return join("", map { 15 ($_ =~ /[[:print:]]/) 16 ? $_ 17 : sprintf("\\x%02x", ord $_) 18 } split "", $string) 19} 20 21sub display_bytes { 22 return '"' . display_bytes_no_quotes(shift) . '"'; 23} 24 25sub output_warnings(@) { 26 my @list = @_; 27 if (@list) { 28 diag "The warnings were:\n" . join "\n", map { chomp; $_ } @list; 29 } 30 else { 31 diag "No warnings were raised"; 32 } 33} 34 35sub start_byte_to_cont($) { 36 37 # Extract the code point information from the input UTF-8 start byte, and 38 # return a continuation byte containing the same information. This is 39 # used in constructing an overlong malformation from valid input. 40 41 my $byte = shift; 42 my $len = test_UTF8_SKIP($byte); 43 if ($len < 2) { 44 die "start_byte_to_cont() is expecting a UTF-8 variant"; 45 } 46 47 $byte = ord native_to_I8($byte); 48 49 # Copied from utf8.h. This gets rid of the leading 1 bits. 50 $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2))); 51 52 $byte |= (isASCII) ? 0x80 : 0xA0; 53 return I8_to_native(chr $byte); 54} 55 56$::is64bit = length sprintf("%x", ~0) > 8; 57 58$::lowest_continuation = (isASCII) ? 0x80 : 0xA0; 59 60$::I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte 61 62 63$::max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence 64 # representing a single code point 65 66# Copied from utf8.h 67$::UTF8_ALLOW_EMPTY = 0x0001; 68$::UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY; 69$::UTF8_ALLOW_CONTINUATION = 0x0002; 70$::UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION; 71$::UTF8_ALLOW_NON_CONTINUATION = 0x0004; 72$::UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION; 73$::UTF8_ALLOW_SHORT = 0x0008; 74$::UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT; 75$::UTF8_ALLOW_LONG = 0x0010; 76$::UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020; 77$::UTF8_GOT_LONG = $UTF8_ALLOW_LONG; 78$::UTF8_ALLOW_OVERFLOW = 0x0080; 79$::UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW; 80$::UTF8_DISALLOW_SURROGATE = 0x0100; 81$::UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE; 82$::UTF8_WARN_SURROGATE = 0x0200; 83$::UTF8_DISALLOW_NONCHAR = 0x0400; 84$::UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR; 85$::UTF8_WARN_NONCHAR = 0x0800; 86$::UTF8_DISALLOW_SUPER = 0x1000; 87$::UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER; 88$::UTF8_WARN_SUPER = 0x2000; 89$::UTF8_DISALLOW_PERL_EXTENDED = 0x4000; 90$::UTF8_GOT_PERL_EXTENDED = $UTF8_DISALLOW_PERL_EXTENDED; 91$::UTF8_WARN_PERL_EXTENDED = 0x8000; 92$::UTF8_CHECK_ONLY = 0x10000; 93$::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000; 94 95$::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE 96 = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE; 97$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE 98 = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR; 99$::UTF8_WARN_ILLEGAL_C9_INTERCHANGE 100 = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE; 101$::UTF8_WARN_ILLEGAL_INTERCHANGE 102 = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR; 103 104# Test uvchr_to_utf8(). 105$::UNICODE_WARN_SURROGATE = 0x0001; 106$::UNICODE_WARN_NONCHAR = 0x0002; 107$::UNICODE_WARN_SUPER = 0x0004; 108$::UNICODE_WARN_PERL_EXTENDED = 0x0008; 109$::UNICODE_DISALLOW_SURROGATE = 0x0010; 110$::UNICODE_DISALLOW_NONCHAR = 0x0020; 111$::UNICODE_DISALLOW_SUPER = 0x0040; 112$::UNICODE_DISALLOW_PERL_EXTENDED = 0x0080; 113