X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/5700b71cd8484a5485a9fb0bdde355ded1194121..8a93676d2b6d9cfcd46e9efcc3c94cc624b3b332:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 2b9fbbb..a223533 100644 --- a/regcomp.c +++ b/regcomp.c @@ -83,11 +83,7 @@ #define PERL_IN_REGCOMP_C #include "perl.h" -#ifdef PERL_IN_XSUB_RE -# if defined(PERL_CAPI) || defined(PERL_OBJECT) -# include "XSUB.h" -# endif -#else +#ifndef PERL_IN_XSUB_RE # include "INTERN.h" #endif @@ -263,7 +259,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define FAIL(msg) \ STMT_START { \ char *ellipses = ""; \ - unsigned len = strlen(RExC_precomp); \ + IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ @@ -285,7 +281,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define FAIL2(pat,msg) \ STMT_START { \ char *ellipses = ""; \ - unsigned len = strlen(RExC_precomp); \ + IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ @@ -305,7 +301,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL(m) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -326,7 +322,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL2(m,a1) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -348,7 +344,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL3(m, a1, a2) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -369,7 +365,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL4(m, a1, a2, a3) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -380,7 +376,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, */ #define Simple_vFAIL5(m, a1, a2, a3, a4) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + IV offset = RExC_parse - RExC_precomp; \ S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END @@ -388,14 +384,14 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN(loc,m) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END \ #define vWARNdep(loc,m) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ int warn_cat = ckWARN(WARN_REGEXP) ? WARN_REGEXP : WARN_DEPRECATED; \ Perl_warner(aTHX_ warn_cat, "%s" REPORT_LOCATION,\ m, (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -404,7 +400,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN2(loc, m, a1) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ a1, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -412,7 +408,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN3(loc, m, a1, a2) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ a1, a2, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -420,7 +416,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define vWARN4(loc, m, a1, a2, a3) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ a1, a2, a3, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -429,7 +425,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* used for the parse_flags section for (?c) -- japhy */ #define vWARN5(loc, m, a1, a2, a3, a4) \ STMT_START { \ - unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + IV offset = loc - RExC_precomp; \ Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ a1, a2, a3, a4, \ (int)offset, RExC_precomp, RExC_precomp + offset); \ @@ -483,7 +479,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1]) #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)]) -static void clear_re(pTHXo_ void *r); +static void clear_re(pTHX_ void *r); /* Mark that we cannot extend a found fixed substring at this point. Updata the longest found anchored substring and the longest found @@ -971,6 +967,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg regnode *oscan = scan; struct regnode_charclass_class this_class; struct regnode_charclass_class *oclass = NULL; + I32 next_is_eval = 0; switch (PL_regkind[(U8)OP(scan)]) { case WHILEM: /* End of (?:...)* . */ @@ -1016,6 +1013,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX); } scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + next_is_eval = (OP(scan) == EVAL); do_curly: if (flags & SCF_DO_SUBSTR) { if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */ @@ -1077,6 +1075,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, reg if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; if (ckWARN(WARN_REGEXP) + /* ? quantifier ok, except for (?{ ... }) */ + && (next_is_eval || !(mincount == 0 && maxcount == 1)) && (minnext == 0) && (deltanext == 0) && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ @@ -1666,7 +1666,11 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - RExC_utf8 = 0; + /* XXXX This looks very suspicious... */ + if (pm->op_pmdynflags & PMdf_CMP_UTF8) + RExC_utf8 = 1; + else + RExC_utf8 = 0; RExC_precomp = exp; DEBUG_r(if (!PL_colorset) reginitcolors()); @@ -1742,6 +1746,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) RExC_rx = r; /* Second pass: emit code. */ + RExC_flags16 = pm->op_pmflags; /* don't let top level (?i) bleed */ RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; @@ -2874,9 +2879,10 @@ tryagain: /* a lovely hack--pretend we saw [\pX] instead */ RExC_end = strchr(RExC_parse, '}'); if (!RExC_end) { + U8 c = (U8)*RExC_parse; RExC_parse += 2; RExC_end = oldregxend; - vFAIL("Missing right brace on \\p{}"); + vFAIL2("Missing right brace on \\%c{}", c); } RExC_end++; } @@ -2956,6 +2962,8 @@ tryagain: register char *p; char *oldp, *s; STRLEN numlen; + STRLEN ulen; + U8 tmpbuf[UTF8_MAXLEN*2+1]; parse_start = RExC_parse - 1; @@ -3034,8 +3042,10 @@ tryagain: vFAIL("Missing right brace on \\x{}"); } else { - numlen = 1; /* allow underscores */ - ender = (UV)scan_hex(p + 1, e - p - 1, &numlen); + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; + numlen = e - p - 1; + ender = grok_hex(p + 1, &numlen, &flags, NULL); if (ender > 0xff) RExC_utf8 = 1; /* numlen is generous */ @@ -3047,8 +3057,9 @@ tryagain: } } else { - numlen = 0; /* disallow underscores */ - ender = (UV)scan_hex(p, 2, &numlen); + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + numlen = 2; + ender = grok_hex(p, &numlen, &flags, NULL); p += numlen; } break; @@ -3061,8 +3072,9 @@ tryagain: case '5': case '6': case '7': case '8':case '9': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { - numlen = 0; /* disallow underscores */ - ender = (UV)scan_oct(p, 3, &numlen); + I32 flags = 0; + numlen = 3; + ender = grok_oct(p, &numlen, &flags, NULL); p += numlen; } else { @@ -3076,7 +3088,7 @@ tryagain: /* FALL THROUGH */ default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) - vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); + vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p)); goto normal_default; } break; @@ -3094,10 +3106,8 @@ tryagain: if (RExC_flags16 & PMf_EXTENDED) p = regwhite(p, RExC_end); if (UTF && FOLD) { - if (LOC) - ender = toLOWER_LC_uvchr(ender); - else - ender = toLOWER_uni(ender); + toLOWER_uni(ender, tmpbuf, &ulen); + ender = utf8_to_uvchr(tmpbuf, 0); } if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) @@ -3412,22 +3422,38 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) case 'p': case 'P': if (*RExC_parse == '{') { + U8 c = (U8)value; e = strchr(RExC_parse++, '}'); if (!e) - vFAIL("Missing right brace on \\p{}"); + vFAIL2("Missing right brace on \\%c{}", c); + while (isSPACE(UCHARAT(RExC_parse))) + RExC_parse++; + if (e == RExC_parse) + vFAIL2("Empty \\%c{}", c); n = e - RExC_parse; + while (isSPACE(UCHARAT(RExC_parse + n - 1))) + n--; } else { e = RExC_parse; n = 1; } if (!SIZE_ONLY) { + if (UCHARAT(RExC_parse) == '^') { + RExC_parse++; + n--; + value = value == 'p' ? 'P' : 'p'; /* toggle */ + while (isSPACE(UCHARAT(RExC_parse))) { + RExC_parse++; + n--; + } + } if (value == 'p') - Perl_sv_catpvf(aTHX_ listsv, - "+utf8::%.*s\n", (int)n, RExC_parse); + Perl_sv_catpvf(aTHX_ listsv, + "+utf8::%.*s\n", (int)n, RExC_parse); else - Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", (int)n, RExC_parse); + Perl_sv_catpvf(aTHX_ listsv, + "!utf8::%.*s\n", (int)n, RExC_parse); } RExC_parse = e + 1; ANYOF_FLAGS(ret) |= ANYOF_UNICODE; @@ -3441,18 +3467,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) case 'a': value = ASCII_TO_NATIVE('\007');break; case 'x': if (*RExC_parse == '{') { + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; e = strchr(RExC_parse++, '}'); if (!e) vFAIL("Missing right brace on \\x{}"); - numlen = 1; /* allow underscores */ - value = (UV)scan_hex(RExC_parse, - e - RExC_parse, - &numlen); + + numlen = e - RExC_parse; + value = grok_hex(RExC_parse, &numlen, &flags, NULL); RExC_parse = e + 1; } else { - numlen = 0; /* disallow underscores */ - value = (UV)scan_hex(RExC_parse, 2, &numlen); + I32 flags = PERL_SCAN_DISALLOW_PREFIX; + numlen = 2; + value = grok_hex(RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; } break; @@ -3462,10 +3490,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - numlen = 0; /* disallow underscores */ - value = (UV)scan_oct(--RExC_parse, 3, &numlen); + { + I32 flags = 0; + numlen = 3; + value = grok_oct(--RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; break; + } default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) vWARN2(RExC_parse, @@ -4730,6 +4761,7 @@ Perl_save_re_context(pTHX) SAVEVPTR(PL_reg_re); /* from regexec.c */ SAVEPPTR(PL_reg_ganch); /* from regexec.c */ SAVESPTR(PL_reg_sv); /* from regexec.c */ + SAVEI8(PL_reg_match_utf8); /* from regexec.c */ SAVEVPTR(PL_reg_magic); /* from regexec.c */ SAVEI32(PL_reg_oldpos); /* from regexec.c */ SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ @@ -4741,14 +4773,8 @@ Perl_save_re_context(pTHX) #endif } -#ifdef PERL_OBJECT -#include "XSUB.h" -#undef this -#define this pPerl -#endif - static void -clear_re(pTHXo_ void *r) +clear_re(pTHX_ void *r) { ReREFCNT_dec((regexp *)r); }