This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Add wrappers for cmplng/xctng wildcard subpatterns
authorKarl Williamson <khw@cpan.org>
Thu, 13 Feb 2020 04:07:27 +0000 (21:07 -0700)
committerKarl Williamson <khw@cpan.org>
Thu, 20 Feb 2020 05:09:48 +0000 (22:09 -0700)
This is in preparation for being called from more than one place.

It has the salubrious effect that the wrapping we do around the user's
supplied pattern is no longer visible in the Debug output of that
pattern.

embed.fnc
embed.h
pod/perldelta.pod
pod/perlunicode.pod
proto.h
regcomp.c

index 0cb88ab..935f75a 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1919,6 +1919,11 @@ EiR      |SV*    |invlist_contents|NN SV* const invlist              \
                                 |const bool traditional_style
 EixRT  |UV     |invlist_lowest|NN SV* const invlist
 #ifndef PERL_EXT_RE_BUILD
+ERS    |REGEXP*|compile_wildcard|NN const char * name|const STRLEN len     \
+                                |const bool ignore_case
+ES     |I32    |execute_wildcard|NN REGEXP * const prog|NN char* stringarg \
+                               |NN char* strend|NN char* strbeg \
+                               |SSize_t minend |NN SV* screamer|U32 nosave
 EiRT   |UV*    |_invlist_array_init    |NN SV* const invlist|const bool will_have_0
 EiRT   |UV     |invlist_max    |NN SV* const invlist
 EiRT   |IV*    |get_invlist_previous_index_addr|NN SV* invlist
diff --git a/embed.h b/embed.h
index a6cd2b2..3315325 100644 (file)
--- a/embed.h
+++ b/embed.h
 #    if defined(PERL_IN_REGCOMP_C)
 #define _append_range_to_invlist(a,b,c)        S__append_range_to_invlist(aTHX_ a,b,c)
 #define _invlist_array_init    S__invlist_array_init
+#define compile_wildcard(a,b,c)        S_compile_wildcard(aTHX_ a,b,c)
+#define execute_wildcard(a,b,c,d,e,f,g)        S_execute_wildcard(aTHX_ a,b,c,d,e,f,g)
 #define get_invlist_previous_index_addr        S_get_invlist_previous_index_addr
 #define invlist_clear(a)       S_invlist_clear(aTHX_ a)
 #define invlist_max            S_invlist_max
index 3bc0faf..06cab07 100644 (file)
@@ -551,6 +551,11 @@ interpolated into another.  The heuristics previously used have been
 replaced by a reliable method, and hence the diagnostics generated have
 changed.  See L</Diagnostics>.
 
+=item *
+The debug display (say by specifying C<-Dr> or S<C<use re>> (with
+appropriate options) of compiled Unicode propery wildcard subpatterns no
+longer has extraneous output.
+
 =back
 
 =head1 Known Problems
index b9be192..761b0db 100644 (file)
@@ -1032,12 +1032,8 @@ This feature is not available when the left-hand side is prefixed by
 C<Is_>, nor for any form that is marked as "Discouraged" in
 L<perluniprops/Discouraged>.
 
-Perl wraps your pattern with C<(?iaa: ... )>.  This is because nothing
-outside ASCII can match the Unicode property values available in this
-release, and they should match caselessly.  If your pattern has a syntax
-error, this wrapping will be shown in the error message, even though you
-didn't specify it yourself.  This could be confusing if you don't know
-about this.
+By default, your pattern is matched case-insensitively, as if C</i> had
+been specified.  You can change this by saying C<(?-i)> in your pattern.
 
 This experimental feature has been added to begin to implement
 L<https://www.unicode.org/reports/tr18/#Wildcard_Properties>.  Using it
diff --git a/proto.h b/proto.h
index cb314c2..441533f 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -4256,6 +4256,14 @@ PERL_STATIC_INLINE UV*   S__invlist_array_init(SV* const invlist, const bool will_
        assert(invlist)
 #endif
 
+STATIC REGEXP* S_compile_wildcard(pTHX_ const char * name, const STRLEN len, const bool ignore_case)
+                       __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_COMPILE_WILDCARD      \
+       assert(name)
+
+STATIC I32     S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char* strend, char* strbeg, SSize_t minend, SV* screamer, U32 nosave);
+#define PERL_ARGS_ASSERT_EXECUTE_WILDCARD      \
+       assert(prog); assert(stringarg); assert(strend); assert(strbeg); assert(screamer)
 #ifndef PERL_NO_INLINE_FUNCTIONS
 PERL_STATIC_INLINE IV* S_get_invlist_previous_index_addr(SV* invlist)
                        __attribute__warn_unused_result__;
index fa23d38..7682d27 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -22814,6 +22814,42 @@ S_get_extended_utf8_msg(pTHX_ const UV cp)
 
 #  endif
 
+STATIC REGEXP *
+S_compile_wildcard(pTHX_ const char * name, const STRLEN len,
+                         const bool ignore_case)
+{
+    U32 flags = PMf_MULTILINE;
+    REGEXP * subpattern_re;
+
+    PERL_ARGS_ASSERT_COMPILE_WILDCARD;
+
+    if (ignore_case) {
+        flags |= PMf_FOLD;
+    }
+    set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
+
+    subpattern_re = re_op_compile_wrapper(sv_2mortal(newSVpvn(name, len)),
+                                        /* Like in op.c, we copy the compile
+                                         * time pm flags to the rx ones */
+                                        (flags & RXf_PMf_COMPILETIME), flags);
+
+    assert(subpattern_re);  /* Should have died if didn't compile successfully */
+    return subpattern_re;
+}
+
+STATIC I32
+S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
+        char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
+{
+    I32 result;
+
+    PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
+
+    result = pregexec(prog, stringarg, strend, strbeg, minend, screamer, nosave);
+
+    return result;
+}
+
 SV *
 Perl_handle_user_defined_property(pTHX_
 
@@ -23410,8 +23446,6 @@ Perl_parse_uniprop_string(pTHX_
             if (table_index) {
                 const char * const * prop_values
                                             = UNI_prop_value_ptrs[table_index];
-                SV * subpattern;
-                Size_t subpattern_len;
                 REGEXP * subpattern_re;
                 char open = name[i++];
                 char close;
@@ -23455,14 +23489,10 @@ Perl_parse_uniprop_string(pTHX_
                  * pattern fails to compile, our added text to the user's
                  * pattern will be displayed to the user, which is not so
                  * desirable. */
-                subpattern_len = name_len - i - 1 - escaped;
-                subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
-                                              (unsigned) subpattern_len,
-                                              name + i);
-                subpattern = sv_2mortal(subpattern);
-                subpattern_re = re_compile(subpattern, 0);
-                assert(subpattern_re);  /* Should have died if didn't compile
-                                         successfully */
+                subpattern_re = compile_wildcard(name + i,
+                                                 name_len - i - 1 - escaped,
+                                                 TRUE /* /i */
+                                                );
 
                 /* For each legal property value, see if the supplied pattern
                  * matches it. */
@@ -23471,7 +23501,7 @@ Perl_parse_uniprop_string(pTHX_
                     const Size_t len = strlen(entry);
                     SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
 
-                    if (pregexec(subpattern_re,
+                    if (execute_wildcard(subpattern_re,
                                  (char *) entry,
                                  (char *) entry + len,
                                  (char *) entry, 0,