| 1 | # Common subroutines and constants, called by .t files in this directory that |
| 2 | # deal with UTF-8 |
| 3 | |
| 4 | # The test files can't use byte_utf8a_to_utf8n() from t/charset_tools.pl |
| 5 | # because that uses the same functions we are testing here. So UTF-EBCDIC |
| 6 | # strings are hard-coded as I8 strings in this file instead, and we use the |
| 7 | # translation functions to/from I8 from that file instead. |
| 8 | |
| 9 | sub isASCII { ord "A" == 65 } |
| 10 | |
| 11 | sub display_bytes_no_quotes { |
| 12 | use bytes; |
| 13 | my $string = shift; |
| 14 | return join("", map { |
| 15 | ($_ =~ /[[:print:]]/) |
| 16 | ? $_ |
| 17 | : sprintf("\\x%02x", ord $_) |
| 18 | } split "", $string) |
| 19 | } |
| 20 | |
| 21 | sub display_bytes { |
| 22 | return '"' . display_bytes_no_quotes(shift) . '"'; |
| 23 | } |
| 24 | |
| 25 | sub output_warnings(@) { |
| 26 | my @list = @_; |
| 27 | if (@list) { |
| 28 | diag "The warnings were:\n" . join "\n", map { chomp; $_ } @list; |
| 29 | } |
| 30 | else { |
| 31 | diag "No warnings were raised"; |
| 32 | } |
| 33 | } |
| 34 | |
| 35 | sub start_byte_to_cont($) { |
| 36 | |
| 37 | # Extract the code point information from the input UTF-8 start byte, and |
| 38 | # return a continuation byte containing the same information. This is |
| 39 | # used in constructing an overlong malformation from valid input. |
| 40 | |
| 41 | my $byte = shift; |
| 42 | my $len = test_UTF8_SKIP($byte); |
| 43 | if ($len < 2) { |
| 44 | die "start_byte_to_cont() is expecting a UTF-8 variant"; |
| 45 | } |
| 46 | |
| 47 | $byte = ord native_to_I8($byte); |
| 48 | |
| 49 | # Copied from utf8.h. This gets rid of the leading 1 bits. |
| 50 | $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2))); |
| 51 | |
| 52 | $byte |= (isASCII) ? 0x80 : 0xA0; |
| 53 | return I8_to_native(chr $byte); |
| 54 | } |
| 55 | |
| 56 | $::is64bit = length sprintf("%x", ~0) > 8; |
| 57 | |
| 58 | $::lowest_continuation = (isASCII) ? 0x80 : 0xA0; |
| 59 | |
| 60 | $::I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte |
| 61 | |
| 62 | |
| 63 | $::max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence |
| 64 | # representing a single code point |
| 65 | |
| 66 | # Copied from utf8.h |
| 67 | $::UTF8_ALLOW_EMPTY = 0x0001; |
| 68 | $::UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY; |
| 69 | $::UTF8_ALLOW_CONTINUATION = 0x0002; |
| 70 | $::UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION; |
| 71 | $::UTF8_ALLOW_NON_CONTINUATION = 0x0004; |
| 72 | $::UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION; |
| 73 | $::UTF8_ALLOW_SHORT = 0x0008; |
| 74 | $::UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT; |
| 75 | $::UTF8_ALLOW_LONG = 0x0010; |
| 76 | $::UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020; |
| 77 | $::UTF8_GOT_LONG = $UTF8_ALLOW_LONG; |
| 78 | $::UTF8_ALLOW_OVERFLOW = 0x0080; |
| 79 | $::UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW; |
| 80 | $::UTF8_DISALLOW_SURROGATE = 0x0100; |
| 81 | $::UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE; |
| 82 | $::UTF8_WARN_SURROGATE = 0x0200; |
| 83 | $::UTF8_DISALLOW_NONCHAR = 0x0400; |
| 84 | $::UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR; |
| 85 | $::UTF8_WARN_NONCHAR = 0x0800; |
| 86 | $::UTF8_DISALLOW_SUPER = 0x1000; |
| 87 | $::UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER; |
| 88 | $::UTF8_WARN_SUPER = 0x2000; |
| 89 | $::UTF8_DISALLOW_PERL_EXTENDED = 0x4000; |
| 90 | $::UTF8_GOT_PERL_EXTENDED = $UTF8_DISALLOW_PERL_EXTENDED; |
| 91 | $::UTF8_WARN_PERL_EXTENDED = 0x8000; |
| 92 | $::UTF8_CHECK_ONLY = 0x10000; |
| 93 | $::UTF8_NO_CONFIDENCE_IN_CURLEN_ = 0x20000; |
| 94 | |
| 95 | $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE |
| 96 | = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE; |
| 97 | $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE |
| 98 | = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR; |
| 99 | $::UTF8_WARN_ILLEGAL_C9_INTERCHANGE |
| 100 | = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE; |
| 101 | $::UTF8_WARN_ILLEGAL_INTERCHANGE |
| 102 | = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR; |
| 103 | |
| 104 | # Test uvchr_to_utf8(). |
| 105 | $::UNICODE_WARN_SURROGATE = 0x0001; |
| 106 | $::UNICODE_WARN_NONCHAR = 0x0002; |
| 107 | $::UNICODE_WARN_SUPER = 0x0004; |
| 108 | $::UNICODE_WARN_PERL_EXTENDED = 0x0008; |
| 109 | $::UNICODE_DISALLOW_SURROGATE = 0x0010; |
| 110 | $::UNICODE_DISALLOW_NONCHAR = 0x0020; |
| 111 | $::UNICODE_DISALLOW_SUPER = 0x0040; |
| 112 | $::UNICODE_DISALLOW_PERL_EXTENDED = 0x0080; |