This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlpodspec and perlpod rewrite, draft 3 "final"
[perl5.git] / regcomp.c
index 2b9fbbb..a223533 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
 #define PERL_IN_REGCOMP_C
 #include "perl.h"
 
-#ifdef PERL_IN_XSUB_RE
-#  if defined(PERL_CAPI) || defined(PERL_OBJECT)
-#    include "XSUB.h"
-#  endif
-#else
+#ifndef PERL_IN_XSUB_RE
 #  include "INTERN.h"
 #endif
 
@@ -263,7 +259,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define        FAIL(msg)                                                             \
     STMT_START {                                                             \
         char *ellipses = "";                                                 \
-        unsigned len = strlen(RExC_precomp);                                \
+        IV len = RExC_end - RExC_precomp;                                \
                                                                              \
        if (!SIZE_ONLY)                                                      \
            SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
@@ -285,7 +281,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define        FAIL2(pat,msg)                                                        \
     STMT_START {                                                             \
         char *ellipses = "";                                                 \
-        unsigned len = strlen(RExC_precomp);                                \
+        IV len = RExC_end - RExC_precomp;                                \
                                                                              \
        if (!SIZE_ONLY)                                                      \
            SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx);                 \
@@ -305,7 +301,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  */
 #define        Simple_vFAIL(m)                                                      \
     STMT_START {                                                             \
-      unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+      IV offset = RExC_parse - RExC_precomp; \
                                                                              \
       Perl_croak(aTHX_ "%s" REPORT_LOCATION,               \
                 m, (int)offset, RExC_precomp, RExC_precomp + offset);     \
@@ -326,7 +322,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  */
 #define        Simple_vFAIL2(m,a1)                                                  \
     STMT_START {                                                             \
-      unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+      IV offset = RExC_parse - RExC_precomp; \
                                                                              \
       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1,       \
                  (int)offset, RExC_precomp, RExC_precomp + offset);       \
@@ -348,7 +344,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  */
 #define        Simple_vFAIL3(m, a1, a2)                                             \
     STMT_START {                                                             \
-      unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+      IV offset = RExC_parse - RExC_precomp; \
                                                                              \
       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2,   \
                  (int)offset, RExC_precomp, RExC_precomp + offset);       \
@@ -369,7 +365,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  */
 #define        Simple_vFAIL4(m, a1, a2, a3)                                         \
     STMT_START {                                                             \
-      unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+      IV offset = RExC_parse - RExC_precomp; \
                                                                              \
       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
                  (int)offset, RExC_precomp, RExC_precomp + offset);       \
@@ -380,7 +376,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  */
 #define        Simple_vFAIL5(m, a1, a2, a3, a4)                                     \
     STMT_START {                                                             \
-      unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \
+      IV offset = RExC_parse - RExC_precomp; \
       S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
                  (int)offset, RExC_precomp, RExC_precomp + offset);       \
     } STMT_END
@@ -388,14 +384,14 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 
 #define        vWARN(loc,m)                                                         \
     STMT_START {                                                             \
-        unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));          \
+        IV offset = loc - RExC_precomp;          \
        Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
                 m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
     } STMT_END                                                               \
 
 #define        vWARNdep(loc,m)                                                         \
     STMT_START {                                                             \
-        unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));          \
+        IV offset = loc - RExC_precomp;          \
         int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED;  \
        Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\
                 m, (int)offset, RExC_precomp, RExC_precomp + offset);          \
@@ -404,7 +400,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 
 #define        vWARN2(loc, m, a1)                                                   \
     STMT_START {                                                             \
-        unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));          \
+        IV offset = loc - RExC_precomp;          \
        Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
                  a1,                                                         \
                 (int)offset, RExC_precomp, RExC_precomp + offset);        \
@@ -412,7 +408,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 
 #define        vWARN3(loc, m, a1, a2)                                               \
     STMT_START {                                                             \
-      unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc));        \
+      IV offset = loc - RExC_precomp;        \
        Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,                    \
                  a1, a2,                                                     \
                 (int)offset, RExC_precomp, RExC_precomp + offset);        \
@@ -420,7 +416,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 
 #define        vWARN4(loc, m, a1, a2, a3)                                           \
     STMT_START {                                                             \
-      unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));            \
+      IV offset = loc - RExC_precomp;            \
        Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
                  a1, a2, a3,                                                 \
                 (int)offset, RExC_precomp, RExC_precomp + offset);        \
@@ -429,7 +425,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 /* used for the parse_flags section for (?c) -- japhy */
 #define        vWARN5(loc, m, a1, a2, a3, a4)                                       \
   STMT_START {                                                   \
-      unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc));   \
+      IV offset = loc - RExC_precomp;   \
         Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,      \
                  a1, a2, a3, a4,                                 \
                  (int)offset, RExC_precomp, RExC_precomp + offset);  \
@@ -483,7 +479,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
 
-static void clear_re(pTHXo_ void *r);
+static void clear_re(pTHX_ void *r);
 
 /* Mark that we cannot extend a found fixed substring at this point.
    Updata the longest found anchored substring and the longest found
@@ -971,6 +967,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
            regnode *oscan = scan;
            struct regnode_charclass_class this_class;
            struct regnode_charclass_class *oclass = NULL;
+           I32 next_is_eval = 0;
 
            switch (PL_regkind[(U8)OP(scan)]) {
            case WHILEM:                /* End of (?:...)* . */
@@ -1016,6 +1013,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                    scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
                }
                scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
+               next_is_eval = (OP(scan) == EVAL);
              do_curly:
                if (flags & SCF_DO_SUBSTR) {
                    if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
@@ -1077,6 +1075,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg
                if (!scan)              /* It was not CURLYX, but CURLY. */
                    scan = next;
                if (ckWARN(WARN_REGEXP)
+                      /* ? quantifier ok, except for (?{ ... }) */
+                   && (next_is_eval || !(mincount == 0 && maxcount == 1))
                    && (minnext == 0) && (deltanext == 0)
                    && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
                    && maxcount <= REG_INFTY/3) /* Complement check for big count */
@@ -1666,7 +1666,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     if (exp == NULL)
        FAIL("NULL regexp argument");
 
-    RExC_utf8 = 0;
+    /* XXXX This looks very suspicious... */
+    if (pm->op_pmdynflags & PMdf_CMP_UTF8)
+        RExC_utf8 = 1;
+    else
+        RExC_utf8 = 0;
 
     RExC_precomp = exp;
     DEBUG_r(if (!PL_colorset) reginitcolors());
@@ -1742,6 +1746,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
     RExC_rx = r;
 
     /* Second pass: emit code. */
+    RExC_flags16 = pm->op_pmflags;     /* don't let top level (?i) bleed */
     RExC_parse = exp;
     RExC_end = xend;
     RExC_naughty = 0;
@@ -2874,9 +2879,10 @@ tryagain:
                  /* a lovely hack--pretend we saw [\pX] instead */
                    RExC_end = strchr(RExC_parse, '}');
                    if (!RExC_end) {
+                       U8 c = (U8)*RExC_parse;
                        RExC_parse += 2;
                        RExC_end = oldregxend;
-                       vFAIL("Missing right brace on \\p{}");
+                       vFAIL2("Missing right brace on \\%c{}", c);
                    }
                    RExC_end++;
                }
@@ -2956,6 +2962,8 @@ tryagain:
            register char *p;
            char *oldp, *s;
            STRLEN numlen;
+           STRLEN ulen;
+           U8 tmpbuf[UTF8_MAXLEN*2+1];
 
             parse_start = RExC_parse - 1;
 
@@ -3034,8 +3042,10 @@ tryagain:
                                vFAIL("Missing right brace on \\x{}");
                            }
                            else {
-                               numlen = 1;     /* allow underscores */
-                               ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
+                                I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+                                    | PERL_SCAN_DISALLOW_PREFIX;
+                                numlen = e - p - 1;
+                               ender = grok_hex(p + 1, &numlen, &flags, NULL);
                                if (ender > 0xff)
                                    RExC_utf8 = 1;
                                /* numlen is generous */
@@ -3047,8 +3057,9 @@ tryagain:
                            }
                        }
                        else {
-                           numlen = 0;         /* disallow underscores */
-                           ender = (UV)scan_hex(p, 2, &numlen);
+                            I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+                           numlen = 2;
+                           ender = grok_hex(p, &numlen, &flags, NULL);
                            p += numlen;
                        }
                        break;
@@ -3061,8 +3072,9 @@ tryagain:
                    case '5': case '6': case '7': case '8':case '9':
                        if (*p == '0' ||
                          (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
-                           numlen = 0;         /* disallow underscores */
-                           ender = (UV)scan_oct(p, 3, &numlen);
+                            I32 flags = 0;
+                           numlen = 3;
+                           ender = grok_oct(p, &numlen, &flags, NULL);
                            p += numlen;
                        }
                        else {
@@ -3076,7 +3088,7 @@ tryagain:
                        /* FALL THROUGH */
                    default:
                        if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
-                           vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
+                           vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
                        goto normal_default;
                    }
                    break;
@@ -3094,10 +3106,8 @@ tryagain:
                if (RExC_flags16 & PMf_EXTENDED)
                    p = regwhite(p, RExC_end);
                if (UTF && FOLD) {
-                   if (LOC)
-                       ender = toLOWER_LC_uvchr(ender);
-                   else
-                       ender = toLOWER_uni(ender);
+                   toLOWER_uni(ender, tmpbuf, &ulen);
+                   ender = utf8_to_uvchr(tmpbuf, 0);
                }
                if (ISMULT2(p)) { /* Back off on ?+*. */
                    if (len)
@@ -3412,22 +3422,38 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
            case 'p':
            case 'P':
                if (*RExC_parse == '{') {
+                   U8 c = (U8)value;
                    e = strchr(RExC_parse++, '}');
                     if (!e)
-                        vFAIL("Missing right brace on \\p{}");
+                        vFAIL2("Missing right brace on \\%c{}", c);
+                   while (isSPACE(UCHARAT(RExC_parse)))
+                       RExC_parse++;
+                    if (e == RExC_parse)
+                        vFAIL2("Empty \\%c{}", c);
                    n = e - RExC_parse;
+                   while (isSPACE(UCHARAT(RExC_parse + n - 1)))
+                       n--;
                }
                else {
                    e = RExC_parse;
                    n = 1;
                }
                if (!SIZE_ONLY) {
+                   if (UCHARAT(RExC_parse) == '^') {
+                        RExC_parse++;
+                        n--;
+                        value = value == 'p' ? 'P' : 'p'; /* toggle */
+                        while (isSPACE(UCHARAT(RExC_parse))) {
+                             RExC_parse++;
+                             n--;
+                        }
+                   }
                    if (value == 'p')
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      "+utf8::%.*s\n", (int)n, RExC_parse);
+                        Perl_sv_catpvf(aTHX_ listsv,
+                                       "+utf8::%.*s\n", (int)n, RExC_parse);
                    else
-                       Perl_sv_catpvf(aTHX_ listsv,
-                                      "!utf8::%.*s\n", (int)n, RExC_parse);
+                        Perl_sv_catpvf(aTHX_ listsv,
+                                       "!utf8::%.*s\n", (int)n, RExC_parse);
                }
                RExC_parse = e + 1;
                ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
@@ -3441,18 +3467,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
            case 'a':   value = ASCII_TO_NATIVE('\007');break;
            case 'x':
                if (*RExC_parse == '{') {
+                    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+                        | PERL_SCAN_DISALLOW_PREFIX;
                    e = strchr(RExC_parse++, '}');
                     if (!e)
                         vFAIL("Missing right brace on \\x{}");
-                   numlen = 1;         /* allow underscores */
-                   value = (UV)scan_hex(RExC_parse,
-                                        e - RExC_parse,
-                                        &numlen);
+
+                   numlen = e - RExC_parse;
+                   value = grok_hex(RExC_parse, &numlen, &flags, NULL);
                    RExC_parse = e + 1;
                }
                else {
-                   numlen = 0;         /* disallow underscores */
-                   value = (UV)scan_hex(RExC_parse, 2, &numlen);
+                    I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+                   numlen = 2;
+                   value = grok_hex(RExC_parse, &numlen, &flags, NULL);
                    RExC_parse += numlen;
                }
                break;
@@ -3462,10 +3490,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
-               numlen = 0;             /* disallow underscores */
-               value = (UV)scan_oct(--RExC_parse, 3, &numlen);
+            {
+                I32 flags = 0;
+               numlen = 3;
+               value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
                RExC_parse += numlen;
                break;
+            }
            default:
                if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
                    vWARN2(RExC_parse,
@@ -4730,6 +4761,7 @@ Perl_save_re_context(pTHX)
     SAVEVPTR(PL_reg_re);               /* from regexec.c */
     SAVEPPTR(PL_reg_ganch);            /* from regexec.c */
     SAVESPTR(PL_reg_sv);               /* from regexec.c */
+    SAVEI8(PL_reg_match_utf8);         /* from regexec.c */
     SAVEVPTR(PL_reg_magic);            /* from regexec.c */
     SAVEI32(PL_reg_oldpos);                    /* from regexec.c */
     SAVEVPTR(PL_reg_oldcurpm);         /* from regexec.c */
@@ -4741,14 +4773,8 @@ Perl_save_re_context(pTHX)
 #endif
 }
 
-#ifdef PERL_OBJECT
-#include "XSUB.h"
-#undef this
-#define this pPerl
-#endif
-
 static void
-clear_re(pTHXo_ void *r)
+clear_re(pTHX_ void *r)
 {
     ReREFCNT_dec((regexp *)r);
 }