+ sub calculate_table_definition
+ {
+ # Returns a human-readable string showing some or all of the code
+ # points matched by this table. The string will include a
+ # bracketed-character class for all characters matched in the 00-FF
+ # range, and the first few ranges matched beyond that.
+ my $max_ranges = 6;
+
+ my $self = shift;
+ my $definition = $self->definition || "";
+
+ # Skip this if already have a definition.
+ return $definition if $definition;
+
+ my $lows_string = ""; # The string representation of the 0-FF
+ # characters
+ my $string_range = ""; # The string rep. of the above FF ranges
+ my $range_count = 0; # How many ranges in $string_rage
+
+ my @lows_invlist; # The inversion list of the 0-FF code points
+ my $first_non_control = ord(" "); # Everything below this is a
+ # control, on ASCII or EBCDIC
+ my $max_table_code_point = $self->max;
+
+ # On ASCII platforms, the range 80-FF contains no printables.
+ my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
+
+
+ # Look through the first few ranges matched by this table.
+ $self->reset_each_range; # Defensive programming
+ while (defined (my $range = $self->each_range())) {
+ my $start = $range->start;
+ my $end = $range->end;
+
+ # Accumulate an inversion list of the 00-FF code points
+ if ($start < 256 && ($start > 0 || $end < 256)) {
+ push @lows_invlist, $start;
+ push @lows_invlist, 1 + (($end < 256) ? $end : 255);
+
+ # Get next range if there are more ranges below 256
+ next if $end < 256 && $end < $max_table_code_point;
+
+ # If the range straddles the 255/256 boundary, we split it
+ # there. We already added above the low portion to the
+ # inversion list
+ $start = 256 if $end > 256;
+ }
+
+ # Here, @lows_invlist contains the code points below 256, and
+ # there is no other range, or the current one starts at or above
+ # 256. Generate the [char class] for the 0-255 ones.
+ while (@lows_invlist) {
+
+ # If this range (necessarily the first one, by the way) starts
+ # at 0 ...
+ if ($lows_invlist[0] == 0) {
+
+ # If it ends within the block of controls, that means that
+ # some controls are in it and some aren't. Since Unicode
+ # properties pretty much only know about a few of the
+ # controls, like \n, \t, this means that its one of them
+ # that isn't in the range. Complement the inversion list
+ # which will likely cause these to be output using their
+ # mnemonics, hence being clearer.
+ if ($lows_invlist[1] < $first_non_control) {
+ $lows_string .= '^';
+ shift @lows_invlist;
+ push @lows_invlist, 256;
+ }
+ elsif ($lows_invlist[1] <= $highest_printable) {
+
+ # Here, it extends into the printables block. Split
+ # into two ranges so that the controls are separate.
+ $lows_string .= sprintf "\\x00-\\x%02x",
+ $first_non_control - 1;
+ $lows_invlist[0] = $first_non_control;
+ }
+ }
+
+ # If the range completely contains the printables, don't
+ # individually spell out the printables.
+ if ( $lows_invlist[0] <= $first_non_control
+ && $lows_invlist[1] > $highest_printable)
+ {
+ $lows_string .= sprintf "\\x%02x-\\x%02x",
+ $lows_invlist[0], $lows_invlist[1] - 1;
+ shift @lows_invlist;
+ shift @lows_invlist;
+ next;
+ }
+
+ # Here, the range may include some but not all printables.
+ # Look at each one individually
+ foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
+ my $char = chr $ord;
+
+ # If there is already something in the list, an
+ # alphanumeric char could be the next in sequence. If so,
+ # we start or extend a range. That is, we could have so
+ # far something like 'a-c', and the next char is a 'd', so
+ # we change it to 'a-d'. We use native_to_unicode()
+ # because a-z on EBCDIC means 26 chars, and excludes the
+ # gap ones.
+ if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
+ my $prev = substr($lows_string, -1);
+ if ( $prev !~ /[[:alnum:]]/
+ || utf8::native_to_unicode(ord $prev) + 1
+ != utf8::native_to_unicode(ord $char))
+ {
+ # Not extending the range
+ $lows_string .= $char;
+ }
+ elsif ( length $lows_string > 1
+ && substr($lows_string, -2, 1) eq '-')
+ {
+ # We had a sequence like '-c' and the current
+ # character is 'd'. Extend the range.
+ substr($lows_string, -1, 1) = $char;
+ }
+ else {
+ # We had something like 'd' and this is 'e'.
+ # Start a range.
+ $lows_string .= "-$char";
+ }
+ }
+ elsif ($char =~ /[[:graph:]]/) {
+
+ # We output a graphic char as-is, preceded by a
+ # backslash if it is a metacharacter
+ $lows_string .= '\\'
+ if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
+ $lows_string .= $char;
+ } # Otherwise use mnemonic for any that have them
+ elsif ($char =~ /[\a]/) {
+ $lows_string .= '\a';
+ }
+ elsif ($char =~ /[\b]/) {
+ $lows_string .= '\b';
+ }
+ elsif ($char eq "\e") {
+ $lows_string .= '\e';
+ }
+ elsif ($char eq "\f") {
+ $lows_string .= '\f';
+ }
+ elsif ($char eq "\cK") {
+ $lows_string .= '\cK';
+ }
+ elsif ($char eq "\n") {
+ $lows_string .= '\n';
+ }
+ elsif ($char eq "\r") {
+ $lows_string .= '\r';
+ }
+ elsif ($char eq "\t") {
+ $lows_string .= '\t';
+ }
+ else {
+
+ # Here is a non-graphic without a mnemonic. We use \x
+ # notation. But if the ordinal of this is one above
+ # the previous, create or extend the range
+ my $hex_representation = sprintf("%02x", ord $char);
+ if ( length $lows_string >= 4
+ && substr($lows_string, -4, 2) eq '\\x'
+ && hex(substr($lows_string, -2)) + 1 == ord $char)
+ {
+ if ( length $lows_string >= 5
+ && substr($lows_string, -5, 1) eq '-'
+ && ( length $lows_string == 5
+ || substr($lows_string, -6, 1) ne '\\'))
+ {
+ substr($lows_string, -2) = $hex_representation;
+ }
+ else {
+ $lows_string .= '-\\x' . $hex_representation;
+ }
+ }
+ else {
+ $lows_string .= '\\x' . $hex_representation;
+ }
+ }
+ }
+ }
+
+ # Done with assembling the string of all lows. If there are only
+ # lows in the property, are completely done.
+ if ($max_table_code_point < 256) {
+ $self->reset_each_range;
+ last;
+ }
+
+ # Otherwise, quit if reached max number of non-lows ranges. If
+ # there are lows, count them as one unit towards the maximum.
+ $range_count++;
+ if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
+ $string_range .= " ...";
+ $self->reset_each_range;
+ last;
+ }
+
+ # Otherwise add this range.
+ $string_range .= ", " if $string_range ne "";
+ if ($start == $end) {
+ $string_range .= sprintf("U+%04X", $start);
+ }
+ elsif ($end >= $MAX_WORKING_CODEPOINT) {
+ $string_range .= sprintf("U+%04X..infinity", $start);
+ }
+ else {
+ $string_range .= sprintf("U+%04X..%04X",
+ $start, $end);
+ }
+ }
+
+ # Done with all the ranges we're going to look at. Assemble the
+ # definition from the lows + non-lows.
+
+ if ($lows_string ne "" || $string_range ne "") {
+ if ($lows_string ne "") {
+ $definition .= "[$lows_string]";
+ $definition .= ", " if $string_range;
+ }
+ $definition .= $string_range;
+ }
+
+ return $definition;
+ }
+