This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove relics of regex swash use
[perl5.git] / regexec.c
index 13db8fd..b612f04 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -504,7 +504,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
      * This just calls isFOO_lc on the code point for the character if it is in
      * the range 0-255.  Outside that range, all characters use Unicode
      * rules, ignoring any locale.  So use the Unicode function if this class
-     * requires a swash, and use the Unicode macro otherwise. */
+     * requires an inversion list, and use the Unicode macro otherwise. */
 
     PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
 
@@ -531,130 +531,6 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
     return FALSE; /* Things like CNTRL are always below 256 */
 }
 
-STATIC char *
-S_find_next_ascii(char * s, const char * send, const bool utf8_target)
-{
-    /* Returns the position of the first ASCII byte in the sequence between 's'
-     * and 'send-1' inclusive; returns 'send' if none found */
-
-    PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
-
-#ifndef EBCDIC
-
-    if ((STRLEN) (send - s) >= PERL_WORDSIZE
-
-                            /* This term is wordsize if subword; 0 if not */
-                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
-
-                            /* 'offset' */
-                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
-    {
-
-        /* Process per-byte until reach word boundary.  XXX This loop could be
-         * eliminated if we knew that this platform had fast unaligned reads */
-        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
-            if (isASCII(*s)) {
-                return s;
-            }
-            s++;    /* khw didn't bother creating a separate loop for
-                       utf8_target */
-        }
-
-        /* Here, we know we have at least one full word to process.  Process
-         * per-word as long as we have at least a full word left */
-        do {
-            PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
-            if (complemented & PERL_VARIANTS_WORD_MASK)  {
-
-#  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
-     || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
-
-                s += _variant_byte_number(complemented);
-                return s;
-
-#  else   /* If weird byte order, drop into next loop to do byte-at-a-time
-           checks. */
-
-                break;
-#  endif
-            }
-
-            s += PERL_WORDSIZE;
-
-        } while (s + PERL_WORDSIZE <= send);
-    }
-
-#endif
-
-    /* Process per-character */
-    if (utf8_target) {
-        while (s < send) {
-            if (isASCII(*s)) {
-                return s;
-            }
-            s += UTF8SKIP(s);
-        }
-    }
-    else {
-        while (s < send) {
-            if (isASCII(*s)) {
-                return s;
-            }
-            s++;
-        }
-    }
-
-    return s;
-}
-
-STATIC char *
-S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
-{
-    /* Returns the position of the first non-ASCII byte in the sequence between
-     * 's' and 'send-1' inclusive; returns 'send' if none found */
-
-#ifdef EBCDIC
-
-    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
-
-    if (utf8_target) {
-        while (s < send) {
-            if ( ! isASCII(*s)) {
-                return s;
-            }
-            s += UTF8SKIP(s);
-        }
-    }
-    else {
-        while (s < send) {
-            if ( ! isASCII(*s)) {
-                return s;
-            }
-            s++;
-        }
-    }
-
-    return s;
-
-#else
-
-    const U8 * next_non_ascii = NULL;
-
-    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
-    PERL_UNUSED_ARG(utf8_target);
-
-    /* On ASCII platforms invariants and ASCII are identical, so if the string
-     * is entirely invariants, there is no non-ASCII character */
-    return (is_utf8_invariant_string_loc((U8 *) s,
-                                         (STRLEN) (send - s),
-                                         &next_non_ascii))
-            ? (char *) send
-            : (char *) next_non_ascii;
-
-#endif
-
-}
-
 STATIC U8 *
 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
 {
@@ -2273,6 +2149,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                    (U8) ARG(c), FLAGS(c)));
         break;
 
+    case ANYOFH:
+        if (utf8_target) REXEC_FBC_CLASS_SCAN(TRUE,
+                      reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
+        break;
+
     case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
         assert(! is_utf8_pat);
        /* FALLTHROUGH */
@@ -2303,7 +2184,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
     case EXACTF:   /* This node only generated for non-utf8 patterns */
         assert(! is_utf8_pat);
         if (utf8_target) {
-            utf8_fold_flags = 0;
             goto do_exactf_utf8;
         }
         fold_array = PL_fold;
@@ -2320,7 +2200,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         folder = foldEQ_locale;
         goto do_exactf_non_utf8;
 
-    case EXACTFU_SS:
+    case EXACTFUP:      /* Problematic even though pattern isn't UTF-8.  Use
+                           full functionality normally not done except for
+                           UTF-8 */
         assert(! is_utf8_pat);
         goto do_exactf_utf8;
 
@@ -2343,7 +2225,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
     case EXACTFU:
         if (is_utf8_pat || utf8_target) {
-            utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
+            utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
             goto do_exactf_utf8;
         }
 
@@ -2351,7 +2233,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          * so we don't have to worry here about this single special case
          * in the Latin1 range */
         fold_array = PL_fold_latin1;
-        folder = foldEQ_latin1;
+        folder = foldEQ_latin1_s2_folded;
 
         /* FALLTHROUGH */
 
@@ -2806,22 +2688,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         );
         break;
 
-    case ASCII:
-        REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
-        break;
-
-    case NASCII:
-        if (utf8_target) {
-            REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
-                                                            utf8_target));
-        }
-        else {
-            REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
-                                                            utf8_target));
-        }
-
-        break;
-
     /* The argument to all the POSIX node types is the class number to pass to
      * _generic_isCC() to build a mask for searching in PL_charclass[] */
 
@@ -4549,7 +4415,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
                 int i;
 
                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
-                    if (isASCII(*s)) {
+                    if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
                         *(d++) = (U8) toFOLD_LC(*s);
                         s++;
                     }
@@ -4580,7 +4446,28 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
         }
         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
-            if (c1 > 255) {
+
+            if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                && OP(text_node) == EXACTFL
+                && UNLIKELY(   c1 == 'i' || c1 == 'I'
+                            || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
+                            || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
+            {   /* Hard-coded Turkish locale rules for these 4 characters
+                   override normal rules */
+                if (c1 == 'i') {
+                    c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
+                }
+                else if (c1 == 'I') {
+                    c2 = LATIN_SMALL_LETTER_DOTLESS_I;
+                }
+                else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+                    c2 = 'i';
+                }
+                else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
+                    c2 = 'I';
+                }
+            }
+            else if (c1 > 255) {
                 const unsigned int * remaining_folds;
                 unsigned int first_fold;
 
@@ -4660,7 +4547,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
                         assert(! is_utf8_pat);
                         /* FALLTHROUGH */
                     case EXACTFAA:
-                    case EXACTFU_SS:
+                    case EXACTFUP:
                     case EXACTFU:
                         c2 = PL_fold_latin1[c1];
                         break;
@@ -6406,7 +6293,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             }
             fold_utf8_flags =  FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
                                              | FOLDEQ_S2_FOLDS_SANE;
-           folder = foldEQ_latin1;
+           folder = foldEQ_latin1_s2_folded;
            fold_array = PL_fold_latin1;
            goto do_exactf;
 
@@ -6418,13 +6305,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
            fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
            goto do_exactf;
 
-       case EXACTFU_SS:         /*  /\x{df}/iu   */
+        case EXACTFUP:          /*  /foo/iu, and something is problematic in
+                                    'foo' so can't take shortcuts. */
             assert(! is_utf8_pat);
-            /* FALLTHROUGH */
+            folder = foldEQ_latin1;
+           fold_array = PL_fold_latin1;
+           fold_utf8_flags = 0;
+           goto do_exactf;
+
        case EXACTFU:            /*  /abc/iu      */
-           folder = foldEQ_latin1;
+            folder = foldEQ_latin1_s2_folded;
            fold_array = PL_fold_latin1;
-           fold_utf8_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
+           fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
            goto do_exactf;
 
         case EXACTFAA_NO_TRIE:   /* This node only generated for non-utf8
@@ -6459,7 +6351,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
 
            if (   utf8_target
                 || is_utf8_pat
-                || state_num == EXACTFU_SS
+                || state_num == EXACTFUP
                 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
             {
              /* Either target or the pattern are utf8, or has the issue where
@@ -6813,19 +6705,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             goto increment_locinput;
             break;
 
-        case ASCII:
-            if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
-                sayNO;
-            }
-
-            locinput++;     /* ASCII is always single byte */
-            break;
-
-        case NASCII:
-            if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
+        case ANYOFH:
+            if (   ! utf8_target
+                ||   NEXTCHR_IS_EOS
+               || ! reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
+                                                                   utf8_target))
+            {
                 sayNO;
             }
-
             goto increment_locinput;
             break;
 
@@ -7438,8 +7325,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                PL_op = NULL;
 
                 re_sv = NULL;
-               if (logical == 0)        /*   (?{})/   */
-                   sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
+               if (logical == 0) {       /*   (?{})/   */
+                    SV *replsv = save_scalar(PL_replgv);
+                    sv_setsv(replsv, ret); /* $^R */
+                    SvSETMAGIC(replsv);
+                }
                else if (logical == 1) { /*   /(?(?{...})X|Y)/    */
                    sw = cBOOL(SvTRUE_NN(ret));
                    logical = 0;
@@ -7614,9 +7504,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             {
                 /* preserve $^R across LEAVE's. See Bug 121070. */
                 SV *save_sv= GvSV(PL_replgv);
+                SV *replsv;
                 SvREFCNT_inc(save_sv);
                 regcpblow(ST.cp); /* LEAVE in disguise */
-                sv_setsv(GvSV(PL_replgv), save_sv);
+                /* don't move this initialization up */
+                replsv = GvSV(PL_replgv);
+                sv_setsv(replsv, save_sv);
+                SvSETMAGIC(replsv);
                 SvREFCNT_dec(save_sv);
             }
            cur_eval = ST.prev_eval;
@@ -9084,8 +8978,10 @@ NULL
          * see code related to PL_replgv elsewhere in this file.
          * Yves
          */
-       if (oreplsv != GvSV(PL_replgv))
+       if (oreplsv != GvSV(PL_replgv)) {
            sv_setsv(oreplsv, GvSV(PL_replgv));
+            SvSETMAGIC(oreplsv);
+        }
     }
     result = 1;
     goto final_exit;
@@ -9355,12 +9251,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         goto do_exactf;
 
     case EXACTFU:
-       if (reginfo->is_utf8_pat) {
-            utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
-        }
+        utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
         /* FALLTHROUGH */
 
-    case EXACTFU_SS:
+    case EXACTFUP:
 
       do_exactf: {
         int c1, c2;
@@ -9491,28 +9385,16 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
        }
         break;
 
-    case ASCII:
-        if (utf8_target && loceol - scan > max) {
-            loceol = scan + max;
+    case ANYOFH:
+        if (utf8_target) while (   hardcount < max
+                                && scan < loceol
+                                && reginclass(prog, p, (U8*)scan, (U8*) loceol,
+                                                                  TRUE))
+        {
+            scan += UTF8SKIP(scan);
+            hardcount++;
         }
-
-        scan = find_next_non_ascii(scan, loceol, utf8_target);
-       break;
-
-    case NASCII:
-       if (utf8_target) {
-           while (     hardcount < max
-                   &&   scan < loceol
-                  && ! isASCII_utf8_safe(scan, loceol))
-           {
-               scan += UTF8SKIP(scan);
-               hardcount++;
-           }
-       }
-        else {
-            scan = find_next_ascii(scan, loceol, utf8_target);
-       }
-       break;
+        break;
 
     /* The argument (FLAGS) to all the POSIX node types is the class number */
 
@@ -9738,27 +9620,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
     return(c);
 }
 
-
-#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
-/*
-- regclass_swash - prepare the utf8 swash.  Wraps the shared core version to
-create a copy so that changes the caller makes won't change the shared one.
-If <altsvp> is non-null, will return NULL in it, for back-compat.
- */
-SV *
-Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
-{
-    PERL_ARGS_ASSERT_REGCLASS_SWASH;
-
-    if (altsvp) {
-        *altsvp = NULL;
-    }
-
-    return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
-}
-
-#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
-
 /*
  - reginclass - determine if a character falls into a character class
  
@@ -9806,7 +9667,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
     }
 
     /* If this character is potentially in the bitmap, check it */
-    if (c < NUM_ANYOF_CODE_POINTS) {
+    if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) {
        if (ANYOF_BITMAP_TEST(n, c))
            match = TRUE;
        else if ((flags
@@ -9907,9 +9768,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                          && IN_UTF8_CTYPE_LOCALE)))
         {
             SV* only_utf8_locale = NULL;
-           SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
-                                                       &only_utf8_locale, NULL);
-           if (sw) {
+           SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE,
+                                                   0, &only_utf8_locale, NULL);
+           if (definition) {
                 U8 utf8_buffer[2];
                U8 * utf8_p;
                if (utf8_target) {
@@ -9920,7 +9781,27 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                    utf8_p = utf8_buffer;
                }
 
-               if (swash_fetch(sw, utf8_p, TRUE)) {
+                /* Turkish locales have these hard-coded rules overriding
+                 * normal ones */
+                if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                    && isALPHA_FOLD_EQ(*p, 'i'))
+                {
+                    if (*p == 'i') {
+                        if (_invlist_contains_cp(definition,
+                                       LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
+                        {
+                            match = TRUE;
+                        }
+                    }
+                    else if (*p == 'I') {
+                        if (_invlist_contains_cp(definition,
+                                                LATIN_SMALL_LETTER_DOTLESS_I))
+                        {
+                            match = TRUE;
+                        }
+                    }
+                }
+                else if (_invlist_contains_cp(definition, c)) {
                    match = TRUE;
                 }
            }
@@ -9929,6 +9810,25 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
             }
        }
 
+        /* In a Turkic locale under folding, hard-code the I i case pair
+         * matches */
+        if (     UNLIKELY(PL_in_utf8_turkic_locale)
+            && ! match
+            &&   (flags & ANYOFL_FOLD)
+            &&   utf8_target)
+        {
+            if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+               if (ANYOF_BITMAP_TEST(n, 'i')) {
+                    match = TRUE;
+                }
+            }
+            else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
+               if (ANYOF_BITMAP_TEST(n, 'I')) {
+                    match = TRUE;
+                }
+            }
+        }
+
         if (UNICODE_IS_SUPER(c)
             && (flags
                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)