This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add Rodolfo Carvalho to AUTHORS
[perl5.git] / regcomp.c
index 4e35933..70e9e2f 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1387,8 +1387,8 @@ is the recommended Unicode-aware way of saying
               scan += len;                                                   \
               len = 0;                                                       \
            } else {                                                          \
-               uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
-               uvc = to_uni_fold( uvc, foldbuf, &foldlen );                  \
+               len = UTF8SKIP(uc);\
+               uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
                foldlen -= UNISKIP( uvc );                                    \
                scan = foldbuf + UNISKIP( uvc );                              \
            }                                                                 \
@@ -2647,13 +2647,13 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
     }
     
 #ifdef DEBUGGING
-    /* Allow dumping */
+    /* Allow dumping but overwriting the collection of skipped
+     * ops and/or strings with fake optimized ops */
     n = scan + NODE_SZ_STR(scan);
     while (n <= stop) {
-        if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
-            OP(n) = OPTIMIZED;
-            NEXT_OFF(n) = 0;
-        }
+       OP(n) = OPTIMIZED;
+       FLAGS(n) = 0;
+       NEXT_OFF(n) = 0;
         n++;
     }
 #endif
@@ -4523,7 +4523,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     struct regexp *r;
     register regexp_internal *ri;
     STRLEN plen;
-    char  *exp;
+    char* VOL exp;
     char* xend;
     regnode *scan;
     I32 flags;
@@ -4553,7 +4553,14 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
 
     DEBUG_r(if (!PL_colorset) reginitcolors());
 
-    RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+    exp = SvPV(pattern, plen);
+
+    if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
+       RExC_utf8 = RExC_orig_utf8 = 0;
+    }
+    else {
+       RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+    }
     RExC_uni_semantics = 0;
     RExC_contains_locale = 0;
 
@@ -4565,12 +4572,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
     }
 
     if (jump_ret == 0) {    /* First time through */
-       exp = SvPV(pattern, plen);
        xend = exp + plen;
-       /* ignore the utf8ness if the pattern is 0 length */
-       if (plen == 0) {
-           RExC_utf8 = RExC_orig_utf8 = 0;
-       }
 
         DEBUG_COMPILE_r({
             SV *dsv= sv_newmortal();
@@ -4602,7 +4604,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
         -- dmq */
         DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
            "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
+        exp = (char*)Perl_bytes_to_utf8(aTHX_
+                                       (U8*)SvPV_nomg(pattern, plen),
+                                       &len);
         xend = exp + len;
         RExC_orig_utf8 = RExC_utf8 = 1;
         SAVEFREEPV(exp);
@@ -5955,6 +5959,8 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
 
     *get_invlist_len_addr(invlist) = len;
 
+    assert(len <= SvLEN(invlist));
+
     SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
     /* If the list contains U+0000, that element is part of the header,
      * and should not be counted as part of the array.  It will contain
@@ -5964,9 +5970,9 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
      *           TO_INTERNAL_SIZE(len
      *                            - (*get_invlist_zero_addr(inv_list) ^ 1)));
      * But, this is only valid if len is not 0.  The consequences of not doing
-     * this is that the memory allocation code may think that the 1 more UV
-     * is being used than actually is, and so might do an unnecessary grow.
-     * That seems worth not bothering to make this the precise amount.
+     * this is that the memory allocation code may think that 1 more UV is
+     * being used than actually is, and so might do an unnecessary grow.  That
+     * seems worth not bothering to make this the precise amount.
      *
      * Note that when inverting, SvCUR shouldn't change */
 }
@@ -6129,10 +6135,9 @@ Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV
        invlist_set_len(invlist, len - 1);
     }
 }
-#endif
 
-STATIC void
-S_invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
+void
+Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
 {
     /* Take the union of two inversion lists and point 'result' to it.  If
      * 'result' on input points to one of the two lists, the reference count to
@@ -6171,7 +6176,7 @@ S_invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
      */
     UV count = 0;
 
-    PERL_ARGS_ASSERT_INVLIST_UNION;
+    PERL_ARGS_ASSERT__INVLIST_UNION;
 
     /* If either one is empty, the union is the other one */
     len_a = invlist_len(a);
@@ -6321,8 +6326,8 @@ S_invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
     return;
 }
 
-STATIC void
-S_invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
+void
+Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
 {
     /* Take the intersection of two inversion lists and point 'i' to it.  If
      * 'i' on input points to one of the two lists, the reference count to that
@@ -6357,7 +6362,7 @@ S_invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
      */
     UV count = 0;
 
-    PERL_ARGS_ASSERT_INVLIST_INTERSECTION;
+    PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
 
     /* If either one is empty, the intersection is null */
     len_a = invlist_len(a);
@@ -6490,6 +6495,8 @@ S_invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
     return;
 }
 
+#endif
+
 STATIC SV*
 S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
 {
@@ -6524,7 +6531,7 @@ S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
     range_invlist = _new_invlist(2);
     _append_range_to_invlist(range_invlist, start, end);
 
-    invlist_union(invlist, range_invlist, &invlist);
+    _invlist_union(invlist, range_invlist, &invlist);
 
     /* The temporary can be freed */
     SvREFCNT_dec(range_invlist);
@@ -6537,8 +6544,9 @@ S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
     return add_range_to_invlist(invlist, cp, cp);
 }
 
-PERL_STATIC_INLINE void
-S_invlist_invert(pTHX_ SV* const invlist)
+#ifndef PERL_IN_XSUB_RE
+void
+Perl__invlist_invert(pTHX_ SV* const invlist)
 {
     /* Complement the input inversion list.  This adds a 0 if the list didn't
      * have a zero; removes it otherwise.  As described above, the data
@@ -6546,7 +6554,7 @@ S_invlist_invert(pTHX_ SV* const invlist)
 
     UV* len_pos = get_invlist_len_addr(invlist);
 
-    PERL_ARGS_ASSERT_INVLIST_INVERT;
+    PERL_ARGS_ASSERT__INVLIST_INVERT;
 
     /* The inverse of matching nothing is matching everything */
     if (*len_pos == 0) {
@@ -6565,6 +6573,45 @@ S_invlist_invert(pTHX_ SV* const invlist)
     }
 }
 
+void
+Perl__invlist_invert_prop(pTHX_ SV* const invlist)
+{
+    /* Complement the input inversion list (which must be a Unicode property,
+     * all of which don't match above the Unicode maximum code point.)  And
+     * Perl has chosen to not have the inversion match above that either.  This
+     * adds a 0x110000 if the list didn't end with it, and removes it if it did
+     */
+
+    UV len;
+    UV* array;
+
+    PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
+
+    _invlist_invert(invlist);
+
+    len = invlist_len(invlist);
+
+    if (len != 0) { /* If empty do nothing */
+       array = invlist_array(invlist);
+       if (array[len - 1] != PERL_UNICODE_MAX + 1) {
+           /* Add 0x110000.  First, grow if necessary */
+           len++;
+           if (invlist_max(invlist) < len) {
+               invlist_extend(invlist, len);
+               array = invlist_array(invlist);
+           }
+           invlist_set_len(invlist, len);
+           array[len - 1] = PERL_UNICODE_MAX + 1;
+       }
+       else {  /* Remove the 0x110000 */
+           invlist_set_len(invlist, len - 1);
+       }
+    }
+
+    return;
+}
+#endif
+
 PERL_STATIC_INLINE SV*
 S_invlist_clone(pTHX_ SV* const invlist)
 {
@@ -6580,13 +6627,14 @@ S_invlist_clone(pTHX_ SV* const invlist)
     return new_invlist;
 }
 
-STATIC void
-S_invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
+#ifndef PERL_IN_XSUB_RE
+void
+Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
 {
     /* Point result to an inversion list which consists of all elements in 'a'
      * that aren't also in 'b' */
 
-    PERL_ARGS_ASSERT_INVLIST_SUBTRACT;
+    PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
 
     /* Subtracting nothing retains the original */
     if (invlist_len(b) == 0) {
@@ -6598,8 +6646,8 @@ S_invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
        }
     } else {
        SV *b_copy = invlist_clone(b);
-       invlist_invert(b_copy); /* Everything not in 'b' */
-       invlist_intersection(a, b_copy, result);    /* Everything in 'a' not in
+       _invlist_invert(b_copy);        /* Everything not in 'b' */
+       _invlist_intersection(a, b_copy, result);    /* Everything in 'a' not in
                                                       'b' */
        SvREFCNT_dec(b_copy);
     }
@@ -6610,6 +6658,7 @@ S_invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
 
     return;
 }
+#endif
 
 PERL_STATIC_INLINE UV*
 S_get_invlist_iter_addr(pTHX_ SV* invlist)
@@ -9555,7 +9604,7 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
     }
 }
 
-/* No locale test, and always Unicode semantics */
+/* No locale test, and always Unicode semantics, no ignore-case differences */
 #define _C_C_T_NOLOC_(NAME,TEST,WORD)                                          \
 ANYOF_##NAME:                                                                  \
        for (value = 0; value < 256; value++)                                  \
@@ -9575,8 +9624,11 @@ case ANYOF_N##NAME:                                                            \
 /* Like the above, but there are differences if we are in uni-8-bit or not, so
  * there are two tests passed in, to use depending on that. There aren't any
  * cases where the label is different from the name, so no need for that
- * parameter */
-#define _C_C_T_(NAME, TEST_8, TEST_7, WORD)                                    \
+ * parameter.
+ * Sets 'what' to WORD which is the property name for non-bitmap code points;
+ * But, uses FOLD_WORD instead if /i has been selected, to allow a different
+ * property name */
+#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD)                         \
 ANYOF_##NAME:                                                                  \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);                               \
     else if (UNI_SEMANTICS) {                                                  \
@@ -9593,7 +9645,12 @@ ANYOF_##NAME:                                                                  \
         }                                                                      \
     }                                                                          \
     yesno = '+';                                                               \
-    what = WORD;                                                               \
+    if (FOLD) {                                                                \
+        what = FOLD_WORD;                                                      \
+    }                                                                          \
+    else {                                                                     \
+        what = WORD;                                                           \
+    }                                                                          \
     break;                                                                     \
 case ANYOF_N##NAME:                                                            \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);                              \
@@ -9625,7 +9682,12 @@ case ANYOF_N##NAME:                                                            \
        }                                                                      \
     }                                                                          \
     yesno = '!';                                                               \
-    what = WORD;                                                               \
+    if (FOLD) {                                                                \
+        what = FOLD_WORD;                                                      \
+    }                                                                          \
+    else {                                                                     \
+        what = WORD;                                                           \
+    }                                                                          \
     break
 
 STATIC U8
@@ -10183,20 +10245,20 @@ parseit:
                 * --jhi */
                switch ((I32)namedclass) {
                
-               case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
-               case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
-               case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
-               case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
-               case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
-               case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
-               case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
-               case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
-               case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
-               case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
+               case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
+               case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
+               case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
+               case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
+               case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
+               case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
+               case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
+               case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
+               case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
+               case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
                 /* \s, \w match all unicode if utf8. */
-                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
-                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
-               case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
+                case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
+                case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
+               case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
                case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
                case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
                case ANYOF_ASCII:
@@ -10262,7 +10324,7 @@ parseit:
                }
                if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
                    /* Strings such as "+utf8::isWord\n" */
-                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
+                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
                }
 
                continue;
@@ -10392,7 +10454,11 @@ parseit:
                if (! PL_utf8_tofold) {
                    U8 dummy[UTF8_MAXBYTES+1];
                    STRLEN dummy_len;
-                   to_utf8_fold((U8*) "A", dummy, &dummy_len);
+
+                   /* This particular string is above \xff in both UTF-8 and
+                    * UTFEBCDIC */
+                   to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+                   assert(PL_utf8_tofold); /* Verify that worked */
                }
                PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
            }
@@ -10402,7 +10468,7 @@ parseit:
            * be checked.  Get the intersection of this class and all the
            * possible characters that are foldable.  This can quickly narrow
            * down a large class */
-       invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
+       _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
 
        /* Now look at the foldable characters in this class individually */
        invlist_iterinit(fold_intersection);
@@ -10536,7 +10602,7 @@ parseit:
     /* Combine the two lists into one. */
     if (l1_fold_invlist) {
        if (nonbitmap) {
-           invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
+           _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
            SvREFCNT_dec(l1_fold_invlist);
        }
        else {
@@ -10566,8 +10632,8 @@ parseit:
        && SvCUR(listsv) == initial_listsv_len)
     {
        if (! nonbitmap) {
-       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
-           ANYOF_BITMAP(ret)[value] ^= 0xFF;
+           for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+               ANYOF_BITMAP(ret)[value] ^= 0xFF;
            /* The inversion means that everything above 255 is matched */
            ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
        }
@@ -10576,7 +10642,7 @@ parseit:
             * individually and add it to the list to get rid of from those
             * things not in the bitmap */
            SV *remove_list = _new_invlist(2);
-           invlist_invert(nonbitmap);
+           _invlist_invert(nonbitmap);
            for (value = 0; value < 256; ++value) {
                if (ANYOF_BITMAP_TEST(ret, value)) {
                    ANYOF_BITMAP_CLEAR(ret, value);
@@ -10586,7 +10652,7 @@ parseit:
                    ANYOF_BITMAP_SET(ret, value);
                }
            }
-           invlist_subtract(nonbitmap, remove_list, &nonbitmap);
+           _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
            SvREFCNT_dec(remove_list);
        }
 
@@ -10657,17 +10723,24 @@ parseit:
            else {
                op = EXACT;
            }
-       }   /* else 2 chars in the bit map: the folds of each other */
-       else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
-
-           /* To join adjacent nodes, they must be the exact EXACTish type.
-            * Try to use the most likely type, by using EXACTFU if the regex
-            * calls for them, or is required because the character is
-            * non-ASCII */
-           op = EXACTFU;
        }
-       else {    /* Otherwise, more likely to be EXACTF type */
-           op = EXACTF;
+       else {   /* else 2 chars in the bit map: the folds of each other */
+
+           /* Use the folded value, which for the cases where we get here,
+            * is just the lower case of the current one (which may resolve to
+            * itself, or to the other one */
+           value = toLOWER_LATIN1(value);
+           if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
+
+               /* To join adjacent nodes, they must be the exact EXACTish
+                * type.  Try to use the most likely type, by using EXACTFU if
+                * the regex calls for them, or is required because the
+                * character is non-ASCII */
+               op = EXACTFU;
+           }
+           else {    /* Otherwise, more likely to be EXACTF type */
+               op = EXACTF;
+           }
        }
 
        ret = reg_node(pRExC_state, op);