no warnings 'surrogate'; # surrogates can be inputs to this
use charnames ();
-our $VERSION = '0.44';
+our $VERSION = '0.49';
require Exporter;
=over
-=item B<*> If you use this C<I> mapping
+=item Z<>B<*> If you use this C<I> mapping
the result is case-insensitive,
but dotless and dotted I's are not distinguished
-=item B<*> If you exclude this C<I> mapping
+=item Z<>B<*> If you exclude this C<I> mapping
the result is not fully case-insensitive, but
dotless and dotted I's are distinguished
prints:
0, 1114112
-An empty list is returned if the input is unknown; the number of elements in
+If the input is unknown, C<undef> is returned in scalar context, and an empty
+list in list context. When the input is known, the number of elements in
the list is returned if called in scalar context.
L<perluniprops|perluniprops/Properties accessible through \p{} and \P{}> gives
our %loose_defaults;
our $MAX_UNICODE_CODEPOINT;
-sub prop_invlist ($) {
+sub prop_invlist ($;$) {
my $prop = $_[0];
+
+ # Undocumented way to get at Perl internal properties
+ my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
+
return if ! defined $prop;
require "utf8_heavy.pl";
|| ref $swash eq ""
|| $swash->{'BITS'} != 1
|| $swash->{'USER_DEFINED'}
- || $prop =~ /^\s*_/;
+ || (! $internal_ok && $prop =~ /^\s*_/);
if ($swash->{'EXTRAS'}) {
carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
sub _search_invlist {
# Find the range in the inversion list which contains a code point; that
- # is, find i such that l[i] <= code_point < l[i+1]
+ # is, find i such that l[i] <= code_point < l[i+1]. Returns undef if no
+ # such i.
# If this is ever made public, could use to speed up .t specials. Would
# need to use code point argument, as in other functions in this pm
# Verify non-neg numeric XXX
my $max_element = @$list_ref - 1;
- return if ! $max_element < 0; # Undef if list is empty.
+
+ # Return undef if list is empty or requested item is before the first element.
+ return if $max_element < 0;
+ return if $code_point < $list_ref->[0];
# Short cut something at the far-end of the table. This also allows us to
# refer to element [$i+1] without fear of being out-of-bounds in the loop
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of code points required to present
languages, while block is more of an artifact of the Unicode code point
-numbering and separation into blocks of (mostly) 256 code points.
+numbering and separation into blocks of consecutive code points (so far the
+size of a block is some multiple of 16, like 128 or 256).
For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and