X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/02f811ddf7c78de43affe51e3337e3a4be99f9df..c2c7bda0885d19e01f0d998f22d2248d7663e957:/regexec.c

diff --git a/regexec.c b/regexec.c
index 5beed03..c88f467 100644
--- a/regexec.c
+++ b/regexec.c
@@ -37,9 +37,6 @@
 #include "re_top.h"
 #endif
 
-#define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
-      "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
-
 /*
  * pregcomp and pregexec -- regsub and regerror are not used in perl
  *
@@ -86,6 +83,12 @@
 #include "invlist_inline.h"
 #include "unicode_constants.h"
 
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG            \
+ "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale"
+
+static const char utf8_locale_required[] =
+      "Use of (?[ ]) for non-UTF-8 locale is wrong.  Assuming a UTF-8 locale";
+
 #ifdef DEBUGGING
 /* At least one required character in the target string is expressible only in
  * UTF-8. */
@@ -489,7 +492,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
     }
     else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
         return isFOO_lc(classnum,
-                        TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
+                        EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
     }
 
     _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
@@ -1481,7 +1484,7 @@ STMT_START {
         } else {                                                                    \
             uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags);   \
             len = UTF8SKIP(uc);                                                     \
-            skiplen = UNISKIP( uvc );                                               \
+            skiplen = UVCHR_SKIP( uvc );                                            \
             foldlen -= skiplen;                                                     \
             uscan = foldbuf + skiplen;                                              \
         }                                                                           \
@@ -1498,7 +1501,7 @@ STMT_START {
         } else {                                                                    \
             len = 1;                                                                \
             uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags);             \
-            skiplen = UNISKIP( uvc );                                               \
+            skiplen = UVCHR_SKIP( uvc );                                            \
             foldlen -= skiplen;                                                     \
             uscan = foldbuf + skiplen;                                              \
         }                                                                           \
@@ -1766,7 +1769,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
 #define getSB_VAL_CP(cp)                                                       \
           _generic_GET_BREAK_VAL_CP(                                           \
                                     PL_SB_invlist,                             \
-                                    Sentence_Break_invmap,                     \
+                                    _Perl_SB_invmap,                     \
                                     (cp))
 
 /* Returns the SB value for the first code point in the UTF-8 encoded string
@@ -1778,7 +1781,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
 #define getWB_VAL_CP(cp)                                                       \
           _generic_GET_BREAK_VAL_CP(                                           \
                                     PL_WB_invlist,                             \
-                                    Word_Break_invmap,                         \
+                                    _Perl_WB_invmap,                         \
                                     (cp))
 
 /* Returns the WB value for the first code point in the UTF-8 encoded string
@@ -1822,7 +1825,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     switch (OP(c)) {
     case ANYOFL:
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+        if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+            Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+        }
+
         /* FALLTHROUGH */
+    case ANYOFD:
     case ANYOF:
         if (utf8_target) {
             REXEC_FBC_UTF8_CLASS_SCAN(
@@ -2320,7 +2329,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                                                 classnum)))
                         || (UTF8_IS_DOWNGRADEABLE_START(*s)
                             && to_complement ^ cBOOL(
-                                _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+                                _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
                                                                       *(s + 1)),
                                               classnum))))
                     {
@@ -5377,7 +5386,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                             l++;
                         }
                         else {
-                            if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+                            if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
                             {
                                 sayNO;
                             }
@@ -5401,7 +5410,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                             s++;
                         }
                         else {
-                            if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+                            if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
                             {
                                 sayNO;
                             }
@@ -5729,7 +5738,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
 	case ANYOFL:  /*  /[abc]/l      */
             _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+            if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE)
+            {
+              Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+            }
             /* FALLTHROUGH */
+	case ANYOFD:  /*   /[abc]/d       */
 	case ANYOF:  /*   /[abc]/       */
             if (NEXTCHR_IS_EOS)
                 sayNO;
@@ -5768,7 +5783,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
                 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
-                                           (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+                                           (U8) EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                             *(locinput + 1))))))
                 {
                     sayNO;
@@ -5849,7 +5864,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
             else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
                 if (! (to_complement
-                       ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+                       ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
                                                                *(locinput + 1)),
                                              FLAGS(scan)))))
                 {
@@ -8126,7 +8141,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
 
                 /* Target isn't utf8; convert the character in the UTF-8
                  * pattern to non-UTF8, and do a simple loop */
-                c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
+                c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
                 while (scan < loceol && UCHARAT(scan) == c) {
                     scan++;
                 }
@@ -8243,7 +8258,12 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
     }
     case ANYOFL:
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+        if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+            Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+        }
         /* FALLTHROUGH */
+    case ANYOFD:
     case ANYOF:
 	if (utf8_target) {
 	    while (hardcount < max
@@ -8365,7 +8385,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     }
                     else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
                         if (! (to_complement
-                              ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+                              ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
                                                                      *(scan + 1)),
                                                     classnum))))
                         {
@@ -8586,7 +8606,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
 		 * UTF8_ALLOW_FFFF */
 	if (c_len == (STRLEN)-1)
 	    Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
-        if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+        if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) {
             _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
         }
     }
@@ -8595,7 +8615,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
     if (c < NUM_ANYOF_CODE_POINTS) {
 	if (ANYOF_BITMAP_TEST(n, c))
 	    match = TRUE;
-	else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
+	else if ((flags
+                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+                  && OP(n) == ANYOFD
 		  && ! utf8_target
 		  && ! isASCII(c))
 	{
@@ -8698,7 +8720,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
 	}
 
         if (UNICODE_IS_SUPER(c)
-            && (flags & ANYOF_WARN_SUPER)
+            && (flags
+               & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+            && OP(n) != ANYOFD
             && ckWARN_d(WARN_NON_UNICODE))
         {
             Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),