This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add named sequences to Unicode wildcard name capabilites
authorKarl Williamson <khw@cpan.org>
Fri, 20 Mar 2020 04:13:30 +0000 (22:13 -0600)
committerKarl Williamson <khw@cpan.org>
Fri, 20 Mar 2020 13:44:31 +0000 (07:44 -0600)
Prior to this commit, specifying a named sequence would result in a
mostly unhelpful fatal error message.  This makes their use legal.

This is also the beginning of allowing Unicode string properties, which
are a new thing in the (still draft) Unicode requirements for regular
expression parsing, UTS 18.  Full compliance will have to come later.

embed.fnc
embed.h
lib/charnames.t
pod/perldiag.pod
pod/perlunicode.pod
proto.h
regcomp.c

index 09fce2a..98a5e38 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1933,6 +1933,7 @@ ES        |SV *   |parse_uniprop_string|NN const char * const name            \
                                     |const bool to_fold                    \
                                     |const bool runtime                    \
                                     |const bool deferrable                 \
+                                    |NULLOK AV ** strings                  \
                                     |NN bool * user_defined_ptr            \
                                     |NN SV * msg                           \
                                     |const STRLEN level
@@ -1954,7 +1955,8 @@ ES        |I32    |execute_wildcard|NN REGEXP * const prog|NN char* stringarg \
 ES     |bool   |handle_names_wildcard                                      \
                                |NN const char * wname                      \
                                |const STRLEN wname_len                     \
-                               |NN SV ** prop_definition
+                               |NN SV ** prop_definition                   \
+                               |NN AV ** strings
 ES     |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
                                |NN SV** invlist
 ES     |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state   \
diff --git a/embed.h b/embed.h
index a76a43f..8418e8c 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b)
 #define grok_bslash_N(a,b,c,d,e,f,g)   S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g)
 #define handle_named_backref(a,b,c,d)  S_handle_named_backref(aTHX_ a,b,c,d)
-#define handle_names_wildcard(a,b,c)   S_handle_names_wildcard(aTHX_ a,b,c)
+#define handle_names_wildcard(a,b,c,d) S_handle_names_wildcard(aTHX_ a,b,c,d)
 #define handle_possible_posix(a,b,c,d,e)       S_handle_possible_posix(aTHX_ a,b,c,d,e)
 #define handle_regex_sets(a,b,c,d,e)   S_handle_regex_sets(aTHX_ a,b,c,d,e)
 #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j)      S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)
 #define nextchar(a)            S_nextchar(aTHX_ a)
 #define output_posix_warnings(a,b)     S_output_posix_warnings(aTHX_ a,b)
 #define parse_lparen_question_flags(a) S_parse_lparen_question_flags(aTHX_ a)
-#define parse_uniprop_string(a,b,c,d,e,f,g,h,i)        S_parse_uniprop_string(aTHX_ a,b,c,d,e,f,g,h,i)
+#define parse_uniprop_string(a,b,c,d,e,f,g,h,i,j)      S_parse_uniprop_string(aTHX_ a,b,c,d,e,f,g,h,i,j)
 #define populate_ANYOF_from_invlist(a,b)       S_populate_ANYOF_from_invlist(aTHX_ a,b)
 #define reg(a,b,c,d)           S_reg(aTHX_ a,b,c,d)
 #define reg2Lanode(a,b,c,d)    S_reg2Lanode(aTHX_ a,b,c,d)
index d611ae5..eb98c45 100644 (file)
@@ -1338,6 +1338,10 @@ is("\N{U+1D0C5}", "\N{BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS}", 'V
             my $loose_name = get_loose_name($name);
             use charnames ":loose";
             is(charnames::string_vianame($loose_name), $utf8, "Verify string_vianame(\"$loose_name\") is the proper utf8");
+
+            like($utf8, qr/^\p{name=$name}$/, "Verify /\p{name=$name}\$/ is the proper utf8");
+            like($utf8, qr/^\p{name=$loose_name}$/, "Verify /\p{name=$loose_name}\$/ is the proper utf8");
+            like($utf8, qr!^\p{name=/\A$name\z/}!, "Verify /\p{name=/$\A$name\z/} is the proper utf8");
             #diag("$name, $utf8");
         }
         close $fh;
index fdb2ba6..28e2d55 100644 (file)
@@ -3151,6 +3151,20 @@ an arbitrary reference was blessed into the "version" class.
 =item In '(*VERB...)', the '(' and '*' must be adjacent in regex;
 marked by S<<-- HERE> in m/%s/
 
+=item Inverting a character class which contains a multi-character
+sequence is illegal in regex; marked by <-- HERE in m/%s/
+
+(F) You wrote something like
+
+ qr/\P{name=KATAKANA LETTER AINU P}/
+ qr/[^\p{name=KATAKANA LETTER AINU P}]/
+
+This name actually evaluates to a sequence of two Katakana characters,
+not just a single one, and it is illegal to try to take the complement
+of a sequence.  (Mathematically it would mean any sequence of characters
+from 0 to infinity in length that weren't these two in a row, and that
+is likely not of any real use.)
+
 (F) The two-character sequence C<"(*"> in this context in a regular
 expression pattern should be an indivisible token, with nothing
 intervening between the C<"("> and the C<"*">, but you separated them.
index fb446d6..fa1710d 100644 (file)
@@ -938,7 +938,7 @@ summarizes the differences between these two:
  can interpolate    only with eval       yes            [1]
  custom names            yes             no             [2]
  name aliases            yes             yes            [3]
- named sequences         yes           not yet          [4]
+ named sequences         yes             yes            [4]
  name value parsing     exact       Unicode loose       [5]
 
 =over
@@ -965,10 +965,6 @@ Some characters have multiple names (synonyms).
 Some particular sequences of characters are given a single name, in
 addition to their individual ones.
 
-It is planned to add support for named sequences to the C<\p{...}> form
-before 5.32; in the meantime, an accurate but not fully informative
-message is generated if use of one of these is attempted.
-
 =item [5]
 
 Exact name value matching means you have to specify case, hyphens,
@@ -1079,11 +1075,11 @@ matched by your pattern.  It's likely that a future release will raise a
 warning if your pattern ends up causing every possible code point to
 match.
 
-Starting in 5.32, the Name and Name Aliases properties are allowed to be
-matched.  They are considered to be a single combination property, just
-as has long been the case for C<\N{}>.  Loose matching doesn't work in
-exactly the same way for these as it does for the values of other
-properties.  The rules are given in
+Starting in 5.32, the Name, Name Aliases, and Named Sequences properties
+are allowed to be matched.  They are considered to be a single
+combination property, just as has long been the case for C<\N{}>.  Loose
+matching doesn't work in exactly the same way for these as it does for
+the values of other properties.  The rules are given in
 L<https://www.unicode.org/reports/tr44/tr44-24.html#UAX44-LM2>.  As a
 result, Perl doesn't try loose matching for you, like it does in other
 properties.  All letters in names are uppercase, but you can add C<(?i)>
diff --git a/proto.h b/proto.h
index 543bfbd..17a6f6e 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -5756,9 +5756,9 @@ STATIC bool       S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nod
 STATIC regnode_offset  S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * parse_start, char ch);
 #define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF  \
        assert(pRExC_state); assert(flagp); assert(parse_start)
-STATIC bool    S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition);
+STATIC bool    S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition, AV ** strings);
 #define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \
-       assert(wname); assert(prop_definition)
+       assert(wname); assert(prop_definition); assert(strings)
 STATIC int     S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only);
 #define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \
        assert(pRExC_state); assert(s)
@@ -5817,7 +5817,7 @@ STATIC void       S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_w
 STATIC void    S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state);
 #define PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS   \
        assert(pRExC_state)
-STATIC SV *    S_parse_uniprop_string(pTHX_ const char * const name, Size_t name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, bool * user_defined_ptr, SV * msg, const STRLEN level);
+STATIC SV *    S_parse_uniprop_string(pTHX_ const char * const name, Size_t name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, AV ** strings, bool * user_defined_ptr, SV * msg, const STRLEN level);
 #define PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING  \
        assert(name); assert(user_defined_ptr); assert(msg)
 STATIC void    S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr);
index 00b13f3..5c6ed3f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -17670,6 +17670,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     /* If set TRUE, the property is user-defined as opposed to
                      * official Unicode */
                     bool user_defined = FALSE;
+                    AV * strings = NULL;
 
                     SV * prop_definition = parse_uniprop_string(
                                             name, n, UTF, FOLD,
@@ -17680,6 +17681,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                              * this call */
                                             ! cBOOL(ret_invlist),
 
+                                            &strings,
                                             &user_defined,
                                             msg,
                                             0 /* Base level */
@@ -17697,7 +17699,55 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                     SvCUR(msg), SvPVX(msg)));
                     }
 
-                    if (! is_invlist(prop_definition)) {
+                    assert(prop_definition || strings);
+
+                    if (strings) {
+                        if (! RExC_in_multi_char_class) {
+                            if (invert ^ (value == 'P')) {
+                                RExC_parse = e + 1;
+                                vFAIL("Inverting a character class which contains"
+                                    " a multi-character sequence is illegal");
+                            }
+
+                            /* For each multi-character string ... */
+                            while (av_tindex(strings) >= 0) {
+                                /* ... Each entry is itself an array of code
+                                * points. */
+                                AV * this_string = (AV *) av_shift( strings);
+                                STRLEN cp_count = av_tindex(this_string) + 1;
+                                SV * final = newSV(cp_count * 4);
+                                SvPVCLEAR(final);
+
+                                /* Create another string of sequences of \x{...} */
+                                while (av_tindex(this_string) >= 0) {
+                                    SV * character = av_shift(this_string);
+                                    UV cp = SvUV(character);
+
+                                    if (cp > 255) {
+                                        REQUIRE_UTF8(flagp);
+                                    }
+                                    Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
+                                                                        cp);
+                                    SvREFCNT_dec_NN(character);
+                                }
+                                SvREFCNT_dec_NN(this_string);
+
+                                /* And add that to the list of such things */
+                                multi_char_matches
+                                            = add_multi_match(multi_char_matches,
+                                                            final,
+                                                            cp_count);
+                            }
+                        }
+                        SvREFCNT_dec_NN(strings);
+                    }
+
+                    if (! prop_definition) {    /* If we got only a string,
+                                                   this iteration didn't really
+                                                   find a character */
+                        element_count--;
+                    }
+                    else if (! is_invlist(prop_definition)) {
 
                         /* Here, the definition isn't known, so we have gotten
                          * returned a string that will be evaluated if and when
@@ -23334,6 +23384,7 @@ S_handle_user_defined_property(pTHX_
         this_definition = parse_uniprop_string(s0, s - s0,
                                                is_utf8, to_fold, runtime,
                                                deferrable,
+                                               NULL,
                                                user_defined_ptr, msg,
                                                (name_len == 0)
                                                 ? level /* Don't increase level
@@ -23523,6 +23574,8 @@ S_parse_uniprop_string(pTHX_
     const bool runtime,         /* TRUE if this is being called at run time */
     const bool deferrable,      /* TRUE if it's ok for the definition to not be
                                    known at this call */
+    AV ** strings,              /* To return string property values, like named
+                                   sequences */
     bool *user_defined_ptr,     /* Upon return from this function it will be
                                    set to TRUE if any component is a
                                    user-defined property */
@@ -23773,7 +23826,8 @@ S_parse_uniprop_string(pTHX_
                     /* Farm this out to a function just to make the current
                      * function less unwieldy */
                     if (handle_names_wildcard(revised_name, revised_name_len,
-                                &prop_definition))
+                                              &prop_definition,
+                                              strings))
                     {
                         return prop_definition;
                     }
@@ -23822,6 +23876,7 @@ S_parse_uniprop_string(pTHX_
                                                            to_fold,
                                                            runtime,
                                                            deferrable,
+                                                           NULL,
                                                            user_defined_ptr,
                                                            msg,
                                                            level + 1);
@@ -23951,12 +24006,36 @@ S_parse_uniprop_string(pTHX_
             }
 
             cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
-            if (character_len < SvCUR(character)) {
-                /* Temporarily, named sequences aren't handled */
-                goto failed;
+            if (character_len == SvCUR(character)) {
+                prop_definition = add_cp_to_invlist(NULL, cp);
+            }
+            else {
+                AV * this_string;
+
+                /* First of the remaining characters in the string. */
+                char * remaining = SvPVX(character) + character_len;
+
+                if (strings == NULL) {
+                    goto failed;    /* XXX Perhaps a specific msg instead, like
+                                       'not available here' */
+                }
+
+                if (*strings == NULL) {
+                    *strings = newAV();
+                }
+
+                this_string = newAV();
+                av_push(this_string, newSVuv(cp));
+
+                do {
+                    cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
+                    av_push(this_string, newSVuv(cp));
+                    remaining += character_len;
+                } while (remaining < SvEND(character));
+
+                av_push(*strings, (SV *) this_string);
             }
 
-            prop_definition = add_cp_to_invlist(NULL, cp);
             return prop_definition;
         }
 
@@ -24874,7 +24953,8 @@ S_parse_uniprop_string(pTHX_
 STATIC bool
 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
                               const STRLEN wname_len, /* Its length */
-                              SV ** prop_definition)
+                              SV ** prop_definition,
+                              AV ** strings)
 {
     /* Deal with Name property wildcard subpatterns; returns TRUE if there were
      * any matches, adding them to prop_definition */
@@ -24992,7 +25072,9 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
                                                 + RX_OFFS(subpattern_re)->end;
             char * cp_start;
             char * cp_end;
-            UV cp;
+            UV cp = 0;      /* Silences some compilers */
+            AV * this_string = NULL;
+            bool is_multi = FALSE;
 
             /* If matched nothing, advance to next possible match */
             if (this_name_start == this_name_end) {
@@ -25027,26 +25109,69 @@ S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
             /* All code points are 5 digits long */
             cp_start = cp_end - 4;
 
+            /* This shouldn't happen, as we found a \n, and the first \n is
+             * further along than what we subtracted */
+            assert(cp_start >= all_names_start);
+
+            if (cp_start == all_names_start) {
+                *prop_definition = add_cp_to_invlist(*prop_definition, 0);
+                continue;
+            }
+
+            /* If the character is a blank, we either have a named sequence, or
+             * something is wrong */
+            if (*(cp_start - 1) == ' ') {
+                cp_start = (char *) my_memrchr(all_names_start,
+                                               '\n',
+                                               cp_start - all_names_start);
+                cp_start++;
+            }
+
+            assert(cp_start != NULL && cp_start >= all_names_start + 2);
+
             /* Except for the first line in the string, the sequence before the
              * code point is \n\n.  If that isn't the case here, we didn't
              * match the name of a character.  (We could have matched a named
              * sequence, not currently handled */
-            if (      cp_start > all_names_start + 1
-                && (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n'))
-            {
+            if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
                 continue;
             }
 
-                /* Calculate the code point from its 5 digits */
+            /* We matched!  Add this to the list */
+            found_matches = TRUE;
+
+            /* Loop through all the code points in the sequence */
+            while (cp_start < cp_end) {
+
+                /* Calculate this code point from its 5 digits */
                 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
                    + (XDIGIT_VALUE(cp_start[1]) << 12)
                    + (XDIGIT_VALUE(cp_start[2]) << 8)
                    + (XDIGIT_VALUE(cp_start[3]) << 4)
                    +  XDIGIT_VALUE(cp_start[4]);
 
-            /* We matched!  Add this to the list */
-            *prop_definition = add_cp_to_invlist(*prop_definition, cp);
-            found_matches = TRUE;
+                cp_start += 6;  /* Go past any blank */
+
+                if (cp_start < cp_end || is_multi) {
+                    if (this_string == NULL) {
+                        this_string = newAV();
+                    }
+
+                    is_multi = TRUE;
+                    av_push(this_string, newSVuv(cp));
+                }
+            }
+
+            if (is_multi) { /* Was more than one code point */
+                if (*strings == NULL) {
+                    *strings = newAV();
+                }
+
+                av_push(*strings, (SV *) this_string);
+            }
+            else {  /* Only a single code point */
+                *prop_definition = add_cp_to_invlist(*prop_definition, cp);
+            }
         } /* End of loop through the non-algorithmic names string */
     }