This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Implement \p{Name=/.../} wildcards
authorKarl Williamson <khw@cpan.org>
Sat, 15 Feb 2020 21:28:32 +0000 (14:28 -0700)
committerKarl Williamson <khw@cpan.org>
Wed, 11 Mar 2020 15:00:04 +0000 (09:00 -0600)
This commit adds wildcard subpatterns for the Name and Name Aliases
properties.

13 files changed:
charclass_invlists.h
embed.fnc
embed.h
lib/_charnames.pm
lib/charnames.t
lib/unicore/mktables
lib/unicore/uni_keywords.pl
pod/perldelta.pod
pod/perlunicode.pod
proto.h
regcharclass.h
regcomp.c
uni_keywords.h

index 9768196..dc34d40 100644 (file)
@@ -419864,7 +419864,7 @@ static const U8 WB_table[23][23] = {
  * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
  * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
  * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
+ * 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
  * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
  * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
  * 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
index 4386da6..73a0402 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1946,6 +1946,10 @@ ERS      |REGEXP*|compile_wildcard|NN const char * subpattern|const STRLEN len\
 ES     |I32    |execute_wildcard|NN REGEXP * const prog|NN char* stringarg \
                                |NN char* strend|NN char* strbeg \
                                |SSize_t minend |NN SV* screamer|U32 nosave
+ES     |bool   |handle_names_wildcard                                      \
+                               |NN const char * wname                      \
+                               |const STRLEN wname_len                     \
+                               |NN SV ** prop_definition
 ES     |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
                                |NN SV** invlist
 Ei     |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state   \
diff --git a/embed.h b/embed.h
index 2aab13f..c550f88 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
 #define grok_bslash_N(a,b,c,d,e,f,g)   S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
 #define handle_named_backref(a,b,c,d)  S_handle_named_backref(aTHX_ a,b,c,d)
+#define handle_names_wildcard(a,b,c)   S_handle_names_wildcard(aTHX_ a,b,c)
 #define handle_possible_posix(a,b,c,d,e)       S_handle_possible_posix(aTHX_ a,b,c,d,e)
 #define handle_regex_sets(a,b,c,d,e)   S_handle_regex_sets(aTHX_ a,b,c,d,e)
 #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j)      S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)
index 4c706f4..b7b7400 100644 (file)
@@ -641,6 +641,14 @@ sub _loose_regcomp_lookup {
                     );
 }
 
+sub _get_names_info {
+  # For use only by regcomp.c to compile \p{name=/.../}
+  populate_txt() unless $txt;
+
+
+  return ( \$txt, \@charnames::code_points_ending_in_code_point );
+}
+
 sub import
 {
   shift; ## ignore class name
index 2bcf13d..3301929 100644 (file)
@@ -10,8 +10,8 @@ my $run_slow_tests = $ENV{PERL_RUN_SLOW_TESTS} || 0;
 my $RUN_SLOW_TESTS_EVERY_CODE_POINT = 100;
 
 # If $ENV{PERL_RUN_SLOW_TESTS} is at least 1 and less than the number above,
-# all code points with names are tested.  If it is at least that number, all
-# 1,114,112 Unicode code points are tested.
+# all code points with names are tested, including wildcard search names.  If
+# it is at least that number, all 1,114,112 Unicode code points are tested.
 
 # Because \N{} is compile time, any warnings will get generated before
 # execution, so have to have an array, and arrange things so no warning
@@ -114,6 +114,7 @@ sub get_loose_name ($) { # Modify name to stress the loose tests.
 }
 
 sub test_vianame ($$$) {
+    CORE::state $wildcard_count = 0;
 
     # Run the vianame tests on a code point, both loose and full
 
@@ -126,23 +127,54 @@ sub test_vianame ($$$) {
     # Get a copy of the name modified to stress the loose tests.
     my $loose_name = get_loose_name($name);
 
+    my $right_anchor;
+
     # Switch loose and full in vianame vs string_vianame half the time
     if (rand() < .5) {
         use charnames ":full";
         $all_pass &= is(charnames::vianame($name), $i, "Verify vianame(\"$name\") is 0x$hex");
         use charnames ":loose";
         $all_pass &= is(charnames::string_vianame($loose_name), chr($i), "Verify string_vianame(\"$loose_name\") is chr(0x$hex)");
+        $right_anchor = '\\Z';
     }
     else {
         use charnames ":loose";
         $all_pass &= is(charnames::vianame($loose_name), $i, "Verify vianame(\"$loose_name\") is 0x$hex");
         use charnames ":full";
         $all_pass &= is(charnames::string_vianame($name), chr($i), "Verify string_vianame(\"$name\") is chr(0x$hex)");
+        $right_anchor = '\\z';
     }
 
+    my $left_anchor = (rand() < .5) ? '^' : '\\A';
+
     # \p{name=} is always loose matching
     $all_pass &= like(chr($i), qr/^\p{name=$loose_name}$/, "Verify /\p{name=$loose_name}/ matches chr(0x$hex)");
 
+    $wildcard_count++;
+
+    # Because wildcard name matching is so real-time intensive, do it less
+    # frequently than the others
+    if ($wildcard_count >= 10) {
+        $wildcard_count = 0;
+
+        # A few control characters have anomalous names containing
+        # parentheses, which need to be escaped.
+        my $name_ref = \$name;
+        my $mod_name;
+        if ($i <= 0x85) {   # NEL in ASCII; affected controls are lower than
+                            # this in EBCDIC
+            $mod_name = $name =~ s/([()])/\\$1/gr;
+            $name_ref = \$mod_name;
+        }
+
+        # We anchor the name, randomly with the possible anchors.
+        my $assembled = $left_anchor. $$name_ref . $right_anchor;
+
+        # \p{name=/.../} is always full matching
+        $all_pass &= like(chr($i), qr!^\p{name=/$assembled/}!,
+                          "Verify /\p{name=/$assembled/} matches chr(0x$hex)");
+    }
+
     return $all_pass;
 }
 
@@ -352,6 +384,10 @@ is(charnames::viacode("U+00000000000FEED"), "ARABIC LETTER WAW ISOLATED FORM", '
 
 test_vianame(0x116C, "116C", "HANGUL JUNGSEONG OE");
 test_vianame(0x1180, "1180", "HANGUL JUNGSEONG O-E");
+like(chr(0x59C3), qr/\p{name=\/\ACJK UNIFIED IDEOGRAPH-59C3\z\/}/,
+         'Verify name wildcards delimitters can be escaped');
+like(chr(0xD800), qr!\p{name=/\A\z/}!,
+                                'Verify works on matching an empty name');
 
 {
     no warnings 'deprecated';
index d1fb8e4..a4f1483 100644 (file)
@@ -17897,9 +17897,12 @@ $loose_names
 
     # And the following array gives the inverse mapping from code points to
     # names.  Lowest code points are first
-    my \@code_points_ending_in_code_point = (
+    \@code_points_ending_in_code_point = (
 $code_points_ending_in_code_point
     );
+
+    # Is exportable, make read-only
+    Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
 END
     # Earlier releases didn't have Jamos.  No sense outputting
     # them unless will be used.
index 3a21ccf..92995a7 100644 (file)
 # baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
 # 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
 # 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
-# 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
+# 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
 # 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
 # 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
 # 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl
index 4b8d709..58445c5 100644 (file)
@@ -31,6 +31,17 @@ here, but most should go in the L</Performance Enhancements> section.
 
 See L<https://www.unicode.org/versions/Unicode13.0.0/> for details.
 
+=head2 It is now possible to write C<qr/\p{Name=...}/>, or
+C<qr!\p{na=/(SMILING|GRINNING) FACE/}!>
+
+The Unicode Name property is now accessible in regular expression
+patterns, as an alternative to C<\N{...}>.
+A comparison of the two methods is given in
+L<perlunicode/Comparison of \N{...} and \p{name=...}>.
+
+The second example above shows that wildcard subpatterns are also usable
+in this property.  See L<perlunicode/Wildcards in Property Values>.
+
 =head1 Security
 
 XXX Any security-related notices go here.  In particular, any security
index 4aa259e..fb446d6 100644 (file)
@@ -1066,11 +1066,6 @@ example,
 
 would match the same things.
 
-A warning is issued if none of the legal values for a property are
-matched by your pattern.  It's likely that a future release will raise a
-warning if your pattern ends up causing every possible code point to
-match.
-
 Another example that shows that within C<\p{...}>, C</x> isn't needed to
 have spaces:
 
@@ -1078,18 +1073,51 @@ have spaces:
 
 To be safe, we should have anchored the above example, to prevent
 matches for something like C<Hebrew_Braille>, but there aren't
-any script names like that.
+any script names like that, so far.
+A warning is issued if none of the legal values for a property are
+matched by your pattern.  It's likely that a future release will raise a
+warning if your pattern ends up causing every possible code point to
+match.
+
+Starting in 5.32, the Name and Name Aliases properties are allowed to be
+matched.  They are considered to be a single combination property, just
+as has long been the case for C<\N{}>.  Loose matching doesn't work in
+exactly the same way for these as it does for the values of other
+properties.  The rules are given in
+L<https://www.unicode.org/reports/tr44/tr44-24.html#UAX44-LM2>.  As a
+result, Perl doesn't try loose matching for you, like it does in other
+properties.  All letters in names are uppercase, but you can add C<(?i)>
+to your subpattern to ignore case.  If you're uncertain where a blank
+is, you can use C< ?> in your subpattern.  No character name contains an
+underscore, so don't bother trying to match one.  The use of hyphens is
+particularly problematic; refer to the above link.  But note that, as of
+Unicode 13.0, the only script in modern usage which has weirdnesses with
+these is Tibetan; also the two Korean characters U+116C HANGUL JUNGSEONG
+OE and U+1180 HANGUL JUNGSEONG O-E.  Unicode makes no promises to not
+add hyphen-problematic names in the future.
+
+Using wildcards on these is resource intensive, given the hundreds of
+thousands of legal names that must be checked against.
+
+An example of using Name property wildcards is
+
+ qr!\p{name=/(SMILING|GRINNING) FACE/}!
+
+Another is
+
+ qr/(?[ \p{name=\/CJK\/} - \p{ideographic} ])/
+
+which is the 200-ish (as of Unicode 13.0) CJK characters that aren't
+ideographs.
 
-There are certain properties that it doesn't currently work with.  These
-are:
+There are certain properties that wildcard subpatterns don't currently
+work with.  These are:
 
  Bidi Mirroring Glyph
  Bidi Paired Bracket
  Case Folding
  Decomposition Mapping
  Equivalent Unified Ideograph
- Name
- Name Alias
  Lowercase Mapping
  NFKC Case Fold
  Titlecase Mapping
diff --git a/proto.h b/proto.h
index 6d1e7f1..63b67ea 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -5721,6 +5721,9 @@ PERL_STATIC_INLINE regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRE
 #define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF  \
        assert(pRExC_state); assert(flagp); assert(parse_start)
 #endif
+STATIC bool    S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition);
+#define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \
+       assert(wname); assert(prop_definition)
 STATIC int     S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only);
 #define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \
        assert(pRExC_state); assert(s)
index 38bce8d..68a5883 100644 (file)
  * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
  * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
  * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
+ * 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
  * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
  * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
  * f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl
index df01e9a..059dd03 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -23701,16 +23701,19 @@ S_parse_uniprop_string(pTHX_
                  * but it must be punctuation */
             && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
         {
-            /* Find the property.  The table includes the equals sign, so we
-             * use 'j' as-is */
-            table_index = do_uniprop_match(lookup_name, j);
-            if (table_index) {
-                const char * const * prop_values
-                                                = get_prop_values(table_index);
+            bool special_property = memEQs(lookup_name, j - 1, "name")
+                                 || memEQs(lookup_name, j - 1, "na");
+            if (! special_property) {
+                /* Find the property.  The table includes the equals sign, so
+                 * we use 'j' as-is */
+                table_index = do_uniprop_match(lookup_name, j);
+            }
+            if (special_property || table_index) {
                 REGEXP * subpattern_re;
                 char open = name[i++];
                 char close;
                 const char * pos_in_brackets;
+                const char * const * prop_values;
                 bool escaped = 0;
 
                 /* Backslash => delimitter is the character following.  We
@@ -23744,6 +23747,35 @@ S_parse_uniprop_string(pTHX_
                     packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
                     "The Unicode property wildcards feature is experimental");
 
+                if (special_property) {
+                    const char * error_msg;
+                    const char * revised_name = name + i;
+                    Size_t revised_name_len = name_len - (i + 1 + escaped);
+
+                    /* Currently, the only 'special_property' is name, which we
+                     * lookup in _charnames.pm */
+
+                    if (! load_charnames(newSVpvs("placeholder"),
+                                         revised_name, revised_name_len,
+                                         &error_msg))
+                    {
+                        sv_catpv(msg, error_msg);
+                        goto append_name_to_msg;
+                    }
+
+                    /* Farm this out to a function just to make the current
+                     * function less unwieldy */
+                    if (handle_names_wildcard(revised_name, revised_name_len,
+                                &prop_definition))
+                    {
+                        return prop_definition;
+                    }
+
+                    goto failed;
+                }
+
+                prop_values = get_prop_values(table_index);
+
                 /* Now create and compile the wildcard subpattern.  Use /i
                  * because the property values are supposed to match with case
                  * ignored. */
@@ -23838,11 +23870,6 @@ S_parse_uniprop_string(pTHX_
              * error.  Otherwise run the pattern against every code point in
              * the ssc.  The ssc is kind of like tr18's 3.9 Possible Match Sets
              * And it might be good to create an API to return the ssc.
-             *
-             * For the name properties, a new function could be created in
-             * charnames which essentially does the same thing as above,
-             * sharing Name.pl with the other charname functions.  Don't know
-             * about loose name matching, or algorithmically determined names.
              * Decomposition.pl similarly.
              *
              * It might be that a new pattern modifier would have to be
@@ -23932,6 +23959,7 @@ S_parse_uniprop_string(pTHX_
 
             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
             if (character_len < SvCUR(character)) {
+                /* Temporarily, named sequences aren't handled */
                 goto failed;
             }
 
@@ -24850,6 +24878,386 @@ S_parse_uniprop_string(pTHX_
     }
 }
 
+STATIC bool
+S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
+                              const STRLEN wname_len, /* Its length */
+                              SV ** prop_definition)
+{
+    /* Deal with Name property wildcard subpatterns; returns TRUE if there were
+     * any matches, adding them to prop_definition */
+
+    dSP;
+
+    CV * get_names_info;        /* entry to charnames.pm to get info we need */
+    SV * names_string;          /* Contains all character names, except algo */
+    SV * algorithmic_names;     /* Contains info about algorithmically
+                                   generated character names */
+    REGEXP * subpattern_re;     /* The user's pattern to match with */
+    struct regexp * prog;       /* The compiled pattern */
+    char * all_names_start;     /* lib/unicore/Name.pl string of every
+                                   (non-algorithmic) character name */
+    char * cur_pos;             /* We match, effectively using /gc; this is
+                                   where we are now */
+    bool found_matches = FALSE; /* Did any name match so far? */
+    SV * empty;                 /* For matching zero length names */
+    SV * must;                  /* What substring, if any, must be in a name
+                                   for the subpattern to match */
+    SV * syllable_name = NULL;  /* For Hangul syllables */
+    const char hangul_prefix[] = "HANGUL SYLLABLE ";
+    const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
+
+    /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
+     * syllable name, and these are immutable and guaranteed by the Unicode
+     * standard to never be extended */
+    const STRLEN syl_max_len = hangul_prefix_len + 7;
+
+    IV i;
+
+    PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
+
+    /* Make sure _charnames is loaded.  (The parameters give context
+     * for any errors generated */
+    get_names_info = get_cv("_charnames::_get_names_info", 0);
+    if (! get_names_info) {
+        Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
+    }
+
+    /* Get the charnames data */
+    PUSHSTACKi(PERLSI_OVERLOAD);
+    ENTER ;
+    SAVETMPS;
+    save_re_context();
+
+    PUSHMARK(SP) ;
+    PUTBACK;
+
+    /* Special _charnames entry point that returns the info this routine
+     * requires */
+    call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
+
+    SPAGAIN ;
+
+    /* Data structure for names which end in their very own code points */
+    algorithmic_names = POPs;
+    SvREFCNT_inc_simple_void_NN(algorithmic_names);
+
+    /* The lib/unicore/Name.pl string */
+    names_string = POPs;
+    SvREFCNT_inc_simple_void_NN(names_string);
+
+    PUTBACK ;
+    FREETMPS ;
+    LEAVE ;
+    POPSTACK;
+
+    if (   ! SvROK(names_string)
+        || ! SvROK(algorithmic_names))
+    {   /* Perhaps should panic instead XXX */
+        SvREFCNT_dec(names_string);
+        SvREFCNT_dec(algorithmic_names);
+        return FALSE;
+    }
+
+    names_string = sv_2mortal(SvRV(names_string));
+    all_names_start = SvPVX(names_string);
+    cur_pos = all_names_start;
+
+    algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
+
+    /* Compile the subpattern consisting of the name being looked for */
+    subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
+    must = re_intuit_string(subpattern_re);
+    prog = ReANY(subpattern_re);
+
+    /* If only nothing is matched, skip to where empty names are looked for */
+    if (prog->maxlen == 0) {
+        goto check_empty;
+    }
+
+    /* And match against the string of all names /gc.  Don't even try if it
+     * must match a character not found in any name. */
+    if ( ! must
+        || SvCUR(must) == 0
+        || strspn(SvPVX(must), "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()")
+                                                              == SvCUR(must))
+    {
+        while (execute_wildcard(subpattern_re,
+                                cur_pos,
+                                SvEND(names_string),
+                                all_names_start, 0,
+                                names_string,
+                                0))
+        { /* Here, matched. */
+
+            /* Note the string entries look like
+             *      00001\nSTART OF HEADING\n\n
+             * so we could match anywhere in that string.  We have to rule out
+             * matching a code point line */
+            char * this_name_start = all_names_start
+                                                + RX_OFFS(subpattern_re)->start;
+            char * this_name_end   = all_names_start
+                                                + RX_OFFS(subpattern_re)->end;
+            char * cp_start;
+            char * cp_end;
+            UV cp;
+
+            /* If matched nothing, advance to next possible match */
+            if (this_name_start == this_name_end) {
+                cur_pos = (char *) memchr(this_name_end + 1, '\n',
+                                          SvEND(names_string) - this_name_end);
+                if (cur_pos == NULL) {
+                    break;
+                }
+            }
+            else {
+                /* Position the next match to start beyond the current returned
+                 * entry */
+                cur_pos = (char *) memchr(this_name_end, '\n',
+                                          SvEND(names_string) - this_name_end);
+            }
+
+            /* Back up to the \n just before the beginning of the character. */
+            cp_end = (char *) my_memrchr(all_names_start,
+                                         '\n',
+                                         this_name_start - all_names_start);
+
+            /* If we didn't find a \n, it means it matched somewhere in the
+             * initial '00000' in the string, so isn't a real match */
+            if (cp_end == NULL) {
+                continue;
+            }
+
+            this_name_start = cp_end + 1;   /* The name starts just after */
+            cp_end--;                       /* the \n, and the code point */
+                                            /* ends just before it */
+
+            /* All code points are 5 digits long */
+            cp_start = cp_end - 4;
+
+            /* Except for the first line in the string, the sequence before the
+             * code point is \n\n.  If that isn't the case here, we didn't
+             * match the name of a character.  (We could have matched a named
+             * sequence, not currently handled */
+            if (      cp_start > all_names_start + 1
+                && (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n'))
+            {
+                continue;
+            }
+
+            /* Calculate the code point from its 5 digits */
+            cp = (XDIGIT_VALUE(cp_start[0]) << 16)
+               + (XDIGIT_VALUE(cp_start[1]) << 12)
+               + (XDIGIT_VALUE(cp_start[2]) << 8)
+               + (XDIGIT_VALUE(cp_start[3]) << 4)
+               +  XDIGIT_VALUE(cp_start[4]);
+
+            /* We matched!  Add this to the list */
+            *prop_definition = add_cp_to_invlist(*prop_definition, cp);
+            found_matches = TRUE;
+        } /* End of loop through the non-algorithmic names string */
+    }
+
+    /* There are also character names not in 'names_string'.  These are
+     * algorithmically generatable.  Try this pattern on each possible one.
+     * (khw originally planned to leave this out given the large number of
+     * matches attempted; but the speed turned out to be quite acceptable
+     *
+     * There are plenty of opportunities to optimize to skip many of the tests.
+     * beyond the rudimentary ones already here */
+
+    /* First see if the subpattern matches any of the algorithmic generatable
+     * Hangul syllable names.
+     *
+     * We know none of these syllable names will match if the input pattern
+     * requires more bytes than any syllable has, or if the input pattern only
+     * matches an empty name, or if the pattern has something it must match and
+     * one of the characters in that isn't in any Hangul syllable. */
+    if (    prog->minlen <= (SSize_t) syl_max_len
+        &&  prog->maxlen > 0
+        && ( ! must
+            || SvCUR(must) == 0
+            || strspn(SvPVX(must), "\n ABCDEGHIJKLMNOPRSTUWY") == SvCUR(must)))
+    {
+        /* These constants, names, values, and algorithm are adapted from the
+         * Unicode standard, version 5.1, section 3.12, and should never
+         * change. */
+        const char * JamoL[] = {
+            "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
+            "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
+        };
+        const int LCount = C_ARRAY_LENGTH(JamoL);
+
+        const char * JamoV[] = {
+            "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
+            "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
+            "I"
+        };
+        const int VCount = C_ARRAY_LENGTH(JamoV);
+
+        const char * JamoT[] = {
+            "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
+            "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
+            "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
+        };
+        const int TCount = C_ARRAY_LENGTH(JamoT);
+
+        int L, V, T;
+
+        /* This is the initial Hangul syllable code point; each time through the
+         * inner loop, it maps to the next higher code point.  For more info,
+         * see the Hangul syllable section of the Unicode standard. */
+        int cp = 0xAC00;
+
+        syllable_name = sv_2mortal(newSV(syl_max_len));
+        sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
+
+        for (L = 0; L < LCount; L++) {
+            for (V = 0; V < VCount; V++) {
+                for (T = 0; T < TCount; T++) {
+
+                    /* Truncate back to the prefix, which is unvarying */
+                    SvCUR_set(syllable_name, hangul_prefix_len);
+
+                    sv_catpv(syllable_name, JamoL[L]);
+                    sv_catpv(syllable_name, JamoV[V]);
+                    sv_catpv(syllable_name, JamoT[T]);
+
+                    if (execute_wildcard(subpattern_re,
+                                SvPVX(syllable_name),
+                                SvEND(syllable_name),
+                                SvPVX(syllable_name), 0,
+                                syllable_name,
+                                0))
+                    {
+                        *prop_definition = add_cp_to_invlist(*prop_definition,
+                                                             cp);
+                        found_matches = TRUE;
+                    }
+
+                    cp++;
+                }
+            }
+        }
+    }
+
+    /* The rest of the algorithmically generatable names are of the form
+     * "PREFIX-code_point".  The prefixes and the code point limits of each
+     * were returned to us in the array 'algorithmic_names' from data in
+     * lib/unicore/Name.pm.  'code_point' in the name is expressed in hex. */
+    for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
+        IV j;
+
+        /* Each element of the array is a hash, giving the details for the
+         * series of names it covers.  There is the base name of the characters
+         * in the series, and the low and high code points in the series.  And,
+         * for optimization purposes a string containing all the legal
+         * characters that could possibly be in a name in this series. */
+        HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
+        SV * prefix = * hv_fetchs(this_series, "name", 0);
+        IV low = SvIV(* hv_fetchs(this_series, "low", 0));
+        IV high = SvIV(* hv_fetchs(this_series, "high", 0));
+        char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
+
+        /* Pre-allocate an SV with enough space */
+        SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
+                                                        SvPVX(prefix)));
+        if (high >= 0x10000) {
+            sv_catpvs(algo_name, "0");
+        }
+
+        /* This series can be skipped entirely if the pattern requires
+         * something longer than any name in the series, or can only match an
+         * empty name, or contains a character not found in any name in the
+         * series */
+        if (    prog->minlen <= (SSize_t) SvCUR(algo_name)
+            &&  prog->maxlen > 0
+            && ( ! must
+                || SvCUR(must) == 0
+                || strspn(SvPVX(must), legal) == SvCUR(must)))
+        {
+            for (j = low; j <= high; j++) { /* For each code point in the series */
+
+                /* Get its name, and see if it matches the subpattern */
+                Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
+                                     (unsigned) j);
+
+                if (execute_wildcard(subpattern_re,
+                                    SvPVX(algo_name),
+                                    SvEND(algo_name),
+                                    SvPVX(algo_name), 0,
+                                    algo_name,
+                                    0))
+                {
+                    *prop_definition = add_cp_to_invlist(*prop_definition, j);
+                    found_matches = TRUE;
+                }
+            }
+        }
+    }
+
+  check_empty:
+    /* Finally, see if the subpattern matches an empty string */
+    empty = newSVpvs("");
+    if (execute_wildcard(subpattern_re,
+                         SvPVX(empty),
+                         SvEND(empty),
+                         SvPVX(empty), 0,
+                         empty,
+                         0))
+    {
+        /* Many code points have empty names.  Currently these are the \p{GC=C}
+         * ones, minus CC and CF */
+
+        SV * empty_names_ref = get_prop_definition(UNI_C);
+        SV * empty_names = invlist_clone(empty_names_ref, NULL);
+
+        SV * subtract = get_prop_definition(UNI_CC);
+
+        _invlist_subtract(empty_names, subtract, &empty_names);
+        SvREFCNT_dec_NN(empty_names_ref);
+        SvREFCNT_dec_NN(subtract);
+
+        subtract = get_prop_definition(UNI_CF);
+        _invlist_subtract(empty_names, subtract, &empty_names);
+        SvREFCNT_dec_NN(subtract);
+
+        _invlist_union(*prop_definition, empty_names, prop_definition);
+        found_matches = TRUE;
+        SvREFCNT_dec_NN(empty_names);
+    }
+    SvREFCNT_dec_NN(empty);
+
+#if 0
+    /* If we ever were to accept aliases for, say private use names, we would
+     * need to do something fancier to find empty names.  The code below works
+     * (at the time it was written), and is slower than the above */
+    const char empties_pat[] = "^.";
+    if (strNE(name, empties_pat)) {
+        SV * empty = newSVpvs("");
+        if (execute_wildcard(subpattern_re,
+                    SvPVX(empty),
+                    SvEND(empty),
+                    SvPVX(empty), 0,
+                    empty,
+                    0))
+        {
+            SV * empties = NULL;
+
+            (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
+
+            _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
+            SvREFCNT_dec_NN(empties);
+
+            found_matches = TRUE;
+        }
+        SvREFCNT_dec_NN(empty);
+    }
+#endif
+
+    SvREFCNT_dec_NN(subpattern_re);
+    return found_matches;
+}
+
 /*
  * ex: set ts=8 sts=4 sw=4 et:
  */
index f6c6408..a3e99ad 100644 (file)
@@ -7537,7 +7537,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
  * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt
  * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt
  * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt
- * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables
+ * 62a198b1430be086ac577285f5cbc0c2bde043a8ba469d85b256f1e191aa997d lib/unicore/mktables
  * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version
  * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
  * 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl