#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
- ((*s) == '{' && regcurly(s)))
+ ((*s) == '{' && regcurly(s, FALSE)))
#ifdef SPSTART
#undef SPSTART /* dratted cpp namespace... */
* REGNODE_SIMPLE */
#define SIMPLE 0x02
#define SPSTART 0x04 /* Starts with * or + */
-#define TRYAGAIN 0x08 /* Weeded out a declaration. */
-#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
+#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
+#define TRYAGAIN 0x10 /* Weeded out a declaration. */
+#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
-/* If not already in utf8, do a longjmp back to the beginning */
-#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
#define REQUIRE_UTF8 STMT_START { \
- if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
+ if (!UTF) { \
+ *flagp = RESTART_UTF8; \
+ return NULL; \
+ } \
} STMT_END
/* This converts the named class defined in regcomp.h to its equivalent class
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+#define vFAIL4(m,a1,a2,a3) STMT_START { \
+ if (!SIZE_ONLY) \
+ SAVEFREESV(RExC_rx_sv); \
+ Simple_vFAIL4(m, a1, a2, a3); \
+} STMT_END
+
+/* m is not necessarily a "literal string", in this macro */
+#define reg_warn_non_literal_string(loc, m) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
+ m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
#define ckWARNreg(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+#define vWARN_dep(loc, m) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
+#define ckWARNdep(loc,m) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
+ m REPORT_LOCATION, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
#define ckWARNregdep(loc,m) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
data->offset_float_min = l ? data->last_start_min : data->pos_min;
data->offset_float_max = (l
? data->last_start_max
- : data->pos_min + data->pos_delta);
+ : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
data->offset_float_max = I32_MAX;
if (data->flags & SF_BEFORE_EOL)
/* OR char bitmap and class bitmap separately */
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
cl->bitmap[i] |= or_with->bitmap[i];
- ANYOF_CLASS_OR(or_with, cl);
+ if (or_with->flags & ANYOF_CLASS) {
+ ANYOF_CLASS_OR(or_with, cl);
+ }
}
else { /* XXXX: logic is complicated, leave it along for a moment. */
cl_anything(pRExC_state, cl);
stopparen, recursed, NULL, f,depth+1);
if (min1 > minnext)
min1 = minnext;
- if (max1 < minnext + deltanext)
- max1 = minnext + deltanext;
- if (deltanext == I32_MAX)
+ if (deltanext == I32_MAX) {
is_inf = is_inf_internal = 1;
+ max1 = I32_MAX;
+ } else if (max1 < minnext + deltanext)
+ max1 = minnext + deltanext;
scan = next;
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
pars++;
min1 = 0;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += min1;
- data->pos_delta += max1 - min1;
+ if (data->pos_delta >= I32_MAX - (max1 - min1))
+ data->pos_delta = I32_MAX;
+ else
+ data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
data->longest = &(data->longest_float);
}
min += min1;
- delta += max1 - min1;
+ if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
+ delta = I32_MAX;
+ else
+ delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
cl_or(pRExC_state, data->start_class, &accum);
if (min1) {
}
min += minnext * mincount;
- is_inf_internal |= ((maxcount == REG_INFTY
- && (minnext + deltanext) > 0)
- || deltanext == I32_MAX);
+ is_inf_internal |= deltanext == I32_MAX
+ || (maxcount == REG_INFTY && minnext + deltanext > 0);
is_inf |= is_inf_internal;
- delta += (minnext + deltanext) * maxcount - minnext * mincount;
+ if (is_inf)
+ delta = I32_MAX;
+ else
+ delta += (minnext + deltanext) * maxcount - minnext * mincount;
/* Try powerful optimization CURLYX => CURLYN. */
if ( OP(oscan) == CURLYX && data
}
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
- data->pos_delta += - counted * deltanext +
+#if 0
+PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
+ counted, deltanext, I32_MAX, minnext, maxcount, mincount);
+if (deltanext != I32_MAX)
+PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
+#endif
+ if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
+ data->pos_delta = I32_MAX;
+ else
+ data->pos_delta += - counted * deltanext +
(minnext + deltanext) * maxcount - minnext * mincount;
if (mincount != maxcount) {
/* Cannot extend fixed substrings found inside
NEXT_OFF(oscan) += NEXT_OFF(next);
}
continue;
- default: /* REF, ANYOFV, and CLUMP only? */
+ default: /* REF, and CLUMP only? */
if (flags & SCF_DO_SUBSTR) {
SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
data->longest = &(data->longest_float);
if (min1 > (I32)(minnext + trie->minlen))
min1 = minnext + trie->minlen;
- if (max1 < (I32)(minnext + deltanext + trie->maxlen))
- max1 = minnext + deltanext + trie->maxlen;
- if (deltanext == I32_MAX)
+ if (deltanext == I32_MAX) {
is_inf = is_inf_internal = 1;
+ max1 = I32_MAX;
+ } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
+ max1 = minnext + deltanext + trie->maxlen;
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
pars++;
NULL, NULL, rx_flags, 0);
}
+
+/* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
+ * blocks, recalculate the indices. Update pat_p and plen_p in-place to
+ * point to the realloced string and length.
+ *
+ * This is essentially a copy of Perl_bytes_to_utf8() with the code index
+ * stuff added */
+
+static void
+S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
+ char **pat_p, STRLEN *plen_p, int num_code_blocks)
+{
+ U8 *const src = (U8*)*pat_p;
+ U8 *dst;
+ int n=0;
+ STRLEN s = 0, d = 0;
+ bool do_end = 0;
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
+
+ Newx(dst, *plen_p * 2 + 1, U8);
+
+ while (s < *plen_p) {
+ const UV uv = NATIVE_TO_ASCII(src[s]);
+ if (UNI_IS_INVARIANT(uv))
+ dst[d] = (U8)UTF_TO_NATIVE(uv);
+ else {
+ dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
+ dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
+ }
+ if (n < num_code_blocks) {
+ if (!do_end && pRExC_state->code_blocks[n].start == s) {
+ pRExC_state->code_blocks[n].start = d;
+ assert(dst[d] == '(');
+ do_end = 1;
+ }
+ else if (do_end && pRExC_state->code_blocks[n].end == s) {
+ pRExC_state->code_blocks[n].end = d;
+ assert(dst[d] == ')');
+ do_end = 0;
+ n++;
+ }
+ }
+ s++;
+ d++;
+ }
+ dst[d] = '\0';
+ *plen_p = d;
+ *pat_p = (char*) dst;
+ SAVEFREEPV(*pat_p);
+ RExC_orig_utf8 = RExC_utf8 = 1;
+}
+
+
+
+/* S_concat_pat(): concatenate a list of args to the pattern string pat,
+ * while recording any code block indices, and handling overloading,
+ * nested qr// objects etc. If pat is null, it will allocate a new
+ * string, or just return the first arg, if there's only one.
+ *
+ * Returns the malloced/updated pat.
+ * patternp and pat_count is the array of SVs to be concatted;
+ * oplist is the optional list of ops that generated the SVs;
+ * recompile_p is a pointer to a boolean that will be set if
+ * the regex will need to be recompiled.
+ * delim, if non-null is an SV that will be inserted between each element
+ */
+
+static SV*
+S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
+ SV *pat, SV ** const patternp, int pat_count,
+ OP *oplist, bool *recompile_p, SV *delim)
+{
+ SV **svp;
+ int n = 0;
+ bool use_delim = FALSE;
+ bool alloced = FALSE;
+
+ /* if we know we have at least two args, create an empty string,
+ * then concatenate args to that. For no args, return an empty string */
+ if (!pat && pat_count != 1) {
+ pat = newSVpvn("", 0);
+ SAVEFREESV(pat);
+ alloced = TRUE;
+ }
+
+ for (svp = patternp; svp < patternp + pat_count; svp++) {
+ SV *sv;
+ SV *rx = NULL;
+ STRLEN orig_patlen = 0;
+ bool code = 0;
+ SV *msv = use_delim ? delim : *svp;
+
+ /* if we've got a delimiter, we go round the loop twice for each
+ * svp slot (except the last), using the delimiter the second
+ * time round */
+ if (use_delim) {
+ svp--;
+ use_delim = FALSE;
+ }
+ else if (delim)
+ use_delim = TRUE;
+
+ if (SvTYPE(msv) == SVt_PVAV) {
+ /* we've encountered an interpolated array within
+ * the pattern, e.g. /...@a..../. Expand the list of elements,
+ * then recursively append elements.
+ * The code in this block is based on S_pushav() */
+
+ AV *const av = (AV*)msv;
+ const I32 maxarg = AvFILL(av) + 1;
+ SV **array;
+
+ if (oplist) {
+ assert(oplist->op_type == OP_PADAV
+ || oplist->op_type == OP_RV2AV);
+ oplist = oplist->op_sibling;;
+ }
+
+ if (SvRMAGICAL(av)) {
+ U32 i;
+
+ Newx(array, maxarg, SV*);
+ SAVEFREEPV(array);
+ for (i=0; i < (U32)maxarg; i++) {
+ SV ** const svp = av_fetch(av, i, FALSE);
+ array[i] = svp ? *svp : &PL_sv_undef;
+ }
+ }
+ else
+ array = AvARRAY(av);
+
+ pat = S_concat_pat(aTHX_ pRExC_state, pat,
+ array, maxarg, NULL, recompile_p,
+ /* $" */
+ GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
+
+ continue;
+ }
+
+
+ /* we make the assumption here that each op in the list of
+ * op_siblings maps to one SV pushed onto the stack,
+ * except for code blocks, with have both an OP_NULL and
+ * and OP_CONST.
+ * This allows us to match up the list of SVs against the
+ * list of OPs to find the next code block.
+ *
+ * Note that PUSHMARK PADSV PADSV ..
+ * is optimised to
+ * PADRANGE PADSV PADSV ..
+ * so the alignment still works. */
+
+ if (oplist) {
+ if (oplist->op_type == OP_NULL
+ && (oplist->op_flags & OPf_SPECIAL))
+ {
+ assert(n < pRExC_state->num_code_blocks);
+ pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
+ pRExC_state->code_blocks[n].block = oplist;
+ pRExC_state->code_blocks[n].src_regex = NULL;
+ n++;
+ code = 1;
+ oplist = oplist->op_sibling; /* skip CONST */
+ assert(oplist);
+ }
+ oplist = oplist->op_sibling;;
+ }
+
+ /* apply magic and QR overloading to arg */
+
+ SvGETMAGIC(msv);
+ if (SvROK(msv) && SvAMAGIC(msv)) {
+ SV *sv = AMG_CALLunary(msv, regexp_amg);
+ if (sv) {
+ if (SvROK(sv))
+ sv = SvRV(sv);
+ if (SvTYPE(sv) != SVt_REGEXP)
+ Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
+ msv = sv;
+ }
+ }
+
+ /* try concatenation overload ... */
+ if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
+ (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
+ {
+ sv_setsv(pat, sv);
+ /* overloading involved: all bets are off over literal
+ * code. Pretend we haven't seen it */
+ pRExC_state->num_code_blocks -= n;
+ n = 0;
+ }
+ else {
+ /* ... or failing that, try "" overload */
+ while (SvAMAGIC(msv)
+ && (sv = AMG_CALLunary(msv, string_amg))
+ && sv != msv
+ && !( SvROK(msv)
+ && SvROK(sv)
+ && SvRV(msv) == SvRV(sv))
+ ) {
+ msv = sv;
+ SvGETMAGIC(msv);
+ }
+ if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
+ msv = SvRV(msv);
+
+ if (pat) {
+ /* this is a partially unrolled
+ * sv_catsv_nomg(pat, msv);
+ * that allows us to adjust code block indices if
+ * needed */
+ STRLEN slen, dlen;
+ char *dst = SvPV_force_nomg(pat, dlen);
+ const char *src = SvPV_flags_const(msv, slen, 0);
+ orig_patlen = dlen;
+ if (SvUTF8(msv) && !SvUTF8(pat)) {
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
+ sv_setpvn(pat, dst, dlen);
+ SvUTF8_on(pat);
+ }
+ sv_catpvn_nomg(pat, src, slen);
+ rx = msv;
+ }
+ else
+ pat = msv;
+
+ if (code)
+ pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
+ }
+
+ /* extract any code blocks within any embedded qr//'s */
+ if (rx && SvTYPE(rx) == SVt_REGEXP
+ && RX_ENGINE((REGEXP*)rx)->op_comp)
+ {
+
+ RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
+ if (ri->num_code_blocks) {
+ int i;
+ /* the presence of an embedded qr// with code means
+ * we should always recompile: the text of the
+ * qr// may not have changed, but it may be a
+ * different closure than last time */
+ *recompile_p = 1;
+ Renew(pRExC_state->code_blocks,
+ pRExC_state->num_code_blocks + ri->num_code_blocks,
+ struct reg_code_block);
+ pRExC_state->num_code_blocks += ri->num_code_blocks;
+
+ for (i=0; i < ri->num_code_blocks; i++) {
+ struct reg_code_block *src, *dst;
+ STRLEN offset = orig_patlen
+ + ReANY((REGEXP *)rx)->pre_prefix;
+ assert(n < pRExC_state->num_code_blocks);
+ src = &ri->code_blocks[i];
+ dst = &pRExC_state->code_blocks[n];
+ dst->start = src->start + offset;
+ dst->end = src->end + offset;
+ dst->block = src->block;
+ dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
+ src->src_regex
+ ? src->src_regex
+ : (REGEXP*)rx);
+ n++;
+ }
+ }
+ }
+ }
+ /* avoid calling magic multiple times on a single element e.g. =~ $qr */
+ if (alloced)
+ SvSETMAGIC(pat);
+
+ return pat;
+}
+
+
+
/* see if there are any run-time code blocks in the pattern.
* False positives are allowed */
static bool
-S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
- U32 pm_flags, char *pat, STRLEN plen)
+S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
+ char *pat, STRLEN plen)
{
int n = 0;
STRLEN s;
- /* avoid infinitely recursing when we recompile the pattern parcelled up
- * as qr'...'. A single constant qr// string can't have have any
- * run-time component in it, and thus, no runtime code. (A non-qr
- * string, however, can, e.g. $x =~ '(?{})') */
- if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
- return 0;
-
for (s = 0; s < plen; s++) {
if (n < pRExC_state->num_code_blocks
&& s == pRExC_state->code_blocks[n].start)
SAVETMPS;
save_re_context();
PUSHSTACKi(PERLSI_REQUIRE);
- /* this causes the toker to collapse \\ into \ when parsing
- * qr''; normally only q'' does this. It also alters hints
- * handling */
- PL_reg_state.re_reparsing = TRUE;
- eval_sv(sv, G_SCALAR);
+ /* G_RE_REPARSING causes the toker to collapse \\ into \ when
+ * parsing qr''; normally only q'' does this. It also alters
+ * hints handling */
+ eval_sv(sv, G_SCALAR|G_RE_REPARSING);
SvREFCNT_dec_NN(sv);
SPAGAIN;
qr_ref = POPs;
S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
{
/* This is the common code for setting up the floating and fixed length
- * string data extracted from Perlre_op_compile() below. Returns a boolean
+ * string data extracted from Perl_re_op_compile() below. Returns a boolean
* as to whether succeeded or not */
I32 t,ml;
REGEXP *
Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
- OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
+ OP *expr, const regexp_engine* eng, REGEXP *old_re,
bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
{
dVAR;
struct regexp *r;
regexp_internal *ri;
STRLEN plen;
- char * VOL exp;
- char* xend;
+ char *exp;
regnode *scan;
I32 flags;
I32 minlen = 0;
U32 rx_flags;
- SV * VOL pat;
- SV * VOL code_blocksv = NULL;
+ SV *pat;
+ SV *code_blocksv = NULL;
+ SV** new_patternp = patternp;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
I32 sawlookahead = 0;
I32 sawplus = 0;
I32 sawopen = 0;
- bool used_setjump = FALSE;
regex_charset initial_charset = get_regex_charset(orig_rx_flags);
- bool code_is_utf8 = 0;
- bool VOL recompile = 0;
+ bool recompile = 0;
bool runtime_code = 0;
- U8 jump_ret = 0;
- dJMPENV;
scan_data_t data;
RExC_state_t RExC_state;
RExC_state_t * const pRExC_state = &RExC_state;
#ifdef TRIE_STUDY_OPT
- int restudied;
+ int restudied = 0;
RExC_state_t copyRExC_state;
#endif
GET_RE_DEBUG_FLAGS_DECL;
if (expr && (expr->op_type == OP_LIST ||
(expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
-
- /* is the source UTF8, and how many code blocks are there? */
+ /* allocate code_blocks if needed */
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
- if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
- code_is_utf8 = 1;
- else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
- /* count of DO blocks */
- ncode++;
- }
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
+ if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
+ ncode++; /* count of DO blocks */
if (ncode) {
pRExC_state->num_code_blocks = ncode;
Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
}
}
- if (pat_count) {
- /* handle a list of SVs */
+ if (!pat_count) {
+ /* compile-time pattern with just OP_CONSTs and DO blocks */
- SV **svp;
+ int n;
+ OP *o;
- /* apply magic and RE overloading to each arg */
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- SV *rx = *svp;
- SvGETMAGIC(rx);
- if (SvROK(rx) && SvAMAGIC(rx)) {
- SV *sv = AMG_CALLunary(rx, regexp_amg);
- if (sv) {
- if (SvROK(sv))
- sv = SvRV(sv);
- if (SvTYPE(sv) != SVt_REGEXP)
- Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
- *svp = sv;
- }
- }
- }
+ /* find how many CONSTs there are */
+ assert(expr);
+ n = 0;
+ if (expr->op_type == OP_CONST)
+ n = 1;
+ else
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST)
+ n++;
+ }
- if (pat_count > 1) {
- /* concat multiple args and find any code block indexes */
+ /* fake up an SV array */
- OP *o = NULL;
- int n = 0;
- bool utf8 = 0;
- STRLEN orig_patlen = 0;
+ assert(!new_patternp);
+ Newx(new_patternp, n, SV*);
+ SAVEFREEPV(new_patternp);
+ pat_count = n;
- if (pRExC_state->num_code_blocks) {
- o = cLISTOPx(expr)->op_first;
- assert( o->op_type == OP_PUSHMARK
- || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK)
- || o->op_type == OP_PADRANGE);
- o = o->op_sibling;
- }
+ n = 0;
+ if (expr->op_type == OP_CONST)
+ new_patternp[n] = cSVOPx_sv(expr);
+ else
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST)
+ new_patternp[n++] = cSVOPo_sv;
+ }
- pat = newSVpvn("", 0);
- SAVEFREESV(pat);
-
- /* determine if the pattern is going to be utf8 (needed
- * in advance to align code block indices correctly).
- * XXX This could fail to be detected for an arg with
- * overloading but not concat overloading; but the main effect
- * in this obscure case is to need a 'use re eval' for a
- * literal code block */
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- if (SvUTF8(*svp))
- utf8 = 1;
- }
- if (utf8)
- SvUTF8_on(pat);
-
- for (svp = patternp; svp < patternp + pat_count; svp++) {
- SV *sv, *msv = *svp;
- SV *rx;
- bool code = 0;
- /* we make the assumption here that each op in the list of
- * op_siblings maps to one SV pushed onto the stack,
- * except for code blocks, with have both an OP_NULL and
- * and OP_CONST.
- * This allows us to match up the list of SVs against the
- * list of OPs to find the next code block.
- *
- * Note that PUSHMARK PADSV PADSV ..
- * is optimised to
- * PADRANGE NULL NULL ..
- * so the alignment still works. */
- if (o) {
- if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
- assert(n < pRExC_state->num_code_blocks);
- pRExC_state->code_blocks[n].start = SvCUR(pat);
- pRExC_state->code_blocks[n].block = o;
- pRExC_state->code_blocks[n].src_regex = NULL;
- n++;
- code = 1;
- o = o->op_sibling; /* skip CONST */
- assert(o);
- }
- o = o->op_sibling;;
- }
+ }
- if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
- (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
- {
- sv_setsv(pat, sv);
- /* overloading involved: all bets are off over literal
- * code. Pretend we haven't seen it */
- pRExC_state->num_code_blocks -= n;
- n = 0;
- rx = NULL;
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Assembling pattern from %d elements%s\n", pat_count,
+ orig_rx_flags & RXf_SPLIT ? " for split" : ""));
- }
- else {
- while (SvAMAGIC(msv)
- && (sv = AMG_CALLunary(msv, string_amg))
- && sv != msv
- && !( SvROK(msv)
- && SvROK(sv)
- && SvRV(msv) == SvRV(sv))
- ) {
- msv = sv;
- SvGETMAGIC(msv);
- }
- if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
- msv = SvRV(msv);
- orig_patlen = SvCUR(pat);
- sv_catsv_nomg(pat, msv);
- rx = msv;
- if (code)
- pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
- }
+ /* set expr to the first arg op */
- /* extract any code blocks within any embedded qr//'s */
- if (rx && SvTYPE(rx) == SVt_REGEXP
- && RX_ENGINE((REGEXP*)rx)->op_comp)
- {
+ if (pRExC_state->num_code_blocks
+ && expr->op_type != OP_CONST)
+ {
+ expr = cLISTOPx(expr)->op_first;
+ assert( expr->op_type == OP_PUSHMARK
+ || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
+ || expr->op_type == OP_PADRANGE);
+ expr = expr->op_sibling;
+ }
- RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
- if (ri->num_code_blocks) {
- int i;
- /* the presence of an embedded qr// with code means
- * we should always recompile: the text of the
- * qr// may not have changed, but it may be a
- * different closure than last time */
- recompile = 1;
- Renew(pRExC_state->code_blocks,
- pRExC_state->num_code_blocks + ri->num_code_blocks,
- struct reg_code_block);
- pRExC_state->num_code_blocks += ri->num_code_blocks;
- for (i=0; i < ri->num_code_blocks; i++) {
- struct reg_code_block *src, *dst;
- STRLEN offset = orig_patlen
- + ReANY((REGEXP *)rx)->pre_prefix;
- assert(n < pRExC_state->num_code_blocks);
- src = &ri->code_blocks[i];
- dst = &pRExC_state->code_blocks[n];
- dst->start = src->start + offset;
- dst->end = src->end + offset;
- dst->block = src->block;
- dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
- src->src_regex
- ? src->src_regex
- : (REGEXP*)rx);
- n++;
- }
- }
- }
- }
- SvSETMAGIC(pat);
- }
- else {
- SV *sv;
- pat = *patternp;
- while (SvAMAGIC(pat)
- && (sv = AMG_CALLunary(pat, string_amg))
- && sv != pat)
- {
- pat = sv;
- SvGETMAGIC(pat);
- }
- }
+ pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
+ expr, &recompile, NULL);
- /* handle bare regex: foo =~ $re */
- {
- SV *re = pat;
- if (SvROK(re))
- re = SvRV(re);
- if (SvTYPE(re) == SVt_REGEXP) {
- if (is_bare_re)
- *is_bare_re = TRUE;
- SvREFCNT_inc(re);
- Safefree(pRExC_state->code_blocks);
- return (REGEXP*)re;
- }
- }
- }
- else {
- /* not a list of SVs, so must be a list of OPs */
- assert(expr);
- if (expr->op_type == OP_LIST) {
- int i = -1;
- bool is_code = 0;
- OP *o;
-
- pat = newSVpvn("", 0);
- SAVEFREESV(pat);
- if (code_is_utf8)
- SvUTF8_on(pat);
-
- /* given a list of CONSTs and DO blocks in expr, append all
- * the CONSTs to pat, and record the start and end of each
- * code block in code_blocks[] (each DO{} op is followed by an
- * OP_CONST containing the corresponding literal '(?{...})
- * text)
- */
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
- if (o->op_type == OP_CONST) {
- sv_catsv(pat, cSVOPo_sv);
- if (is_code) {
- pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
- is_code = 0;
- }
- }
- else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
- assert(i+1 < pRExC_state->num_code_blocks);
- pRExC_state->code_blocks[++i].start = SvCUR(pat);
- pRExC_state->code_blocks[i].block = o;
- pRExC_state->code_blocks[i].src_regex = NULL;
- is_code = 1;
- }
- }
- }
- else {
- assert(expr->op_type == OP_CONST);
- pat = cSVOPx_sv(expr);
- }
+ /* handle bare (possibly after overloading) regex: foo =~ $re */
+ {
+ SV *re = pat;
+ if (SvROK(re))
+ re = SvRV(re);
+ if (SvTYPE(re) == SVt_REGEXP) {
+ if (is_bare_re)
+ *is_bare_re = TRUE;
+ SvREFCNT_inc(re);
+ Safefree(pRExC_state->code_blocks);
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Precompiled pattern%s\n",
+ orig_rx_flags & RXf_SPLIT ? " for split" : ""));
+
+ return (REGEXP*)re;
+ }
}
exp = SvPV_nomg(pat, plen);
RExC_contains_locale = 0;
pRExC_state->runtime_code_qr = NULL;
- /****************** LONG JUMP TARGET HERE***********************/
- /* Longjmp back to here if have to switch in midstream to utf8 */
- if (! RExC_orig_utf8) {
- JMPENV_PUSH(jump_ret);
- used_setjump = TRUE;
- }
-
- if (jump_ret == 0) { /* First time through */
- xend = exp + plen;
-
- DEBUG_COMPILE_r({
+ DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
- RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, exp, plen, 60);
+ RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
- PL_colors[4],PL_colors[5],s);
+ PL_colors[4],PL_colors[5],s);
});
- }
- else { /* longjumped back */
- U8 *src, *dst;
- int n=0;
- STRLEN s = 0, d = 0;
- bool do_end = 0;
- /* If the cause for the longjmp was other than changing to utf8, pop
- * our own setjmp, and longjmp to the correct handler */
- if (jump_ret != UTF8_LONGJMP) {
- JMPENV_POP;
- JMPENV_JUMP(jump_ret);
- }
-
- GET_RE_DEBUG_FLAGS;
-
- /* It's possible to write a regexp in ascii that represents Unicode
- codepoints outside of the byte range, such as via \x{100}. If we
- detect such a sequence we have to convert the entire pattern to utf8
- and then recompile, as our sizing calculation will have been based
- on 1 byte == 1 character, but we will need to use utf8 to encode
- at least some part of the pattern, and therefore must convert the whole
- thing.
- -- dmq */
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
- "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
-
- /* upgrade pattern to UTF8, and if there are code blocks,
- * recalculate the indices.
- * This is essentially an unrolled Perl_bytes_to_utf8() */
+ redo_first_pass:
+ /* we jump here if we upgrade the pattern to utf8 and have to
+ * recompile */
- src = (U8*)SvPV_nomg(pat, plen);
- Newx(dst, plen * 2 + 1, U8);
-
- while (s < plen) {
- const UV uv = NATIVE_TO_ASCII(src[s]);
- if (UNI_IS_INVARIANT(uv))
- dst[d] = (U8)UTF_TO_NATIVE(uv);
- else {
- dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
- dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
- }
- if (n < pRExC_state->num_code_blocks) {
- if (!do_end && pRExC_state->code_blocks[n].start == s) {
- pRExC_state->code_blocks[n].start = d;
- assert(dst[d] == '(');
- do_end = 1;
- }
- else if (do_end && pRExC_state->code_blocks[n].end == s) {
- pRExC_state->code_blocks[n].end = d;
- assert(dst[d] == ')');
- do_end = 0;
- n++;
- }
- }
- s++;
- d++;
- }
- dst[d] = '\0';
- plen = d;
- exp = (char*) dst;
- xend = exp + plen;
- SAVEFREEPV(exp);
- RExC_orig_utf8 = RExC_utf8 = 1;
- }
+ if ((pm_flags & PMf_USE_RE_EVAL)
+ /* this second condition covers the non-regex literal case,
+ * i.e. $foo =~ '(?{})'. */
+ || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
+ )
+ runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
/* return old regex if pattern hasn't changed */
+ /* XXX: note in the below we have to check the flags as well as the pattern.
+ *
+ * Things get a touch tricky as we have to compare the utf8 flag independently
+ * from the compile flags.
+ */
if ( old_re
&& !recompile
- && !!RX_UTF8(old_re) == !!RExC_utf8
+ && !!RX_UTF8(old_re) == !!RExC_utf8
+ && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
&& RX_PRECOMP(old_re)
&& RX_PRELEN(old_re) == plen
- && memEQ(RX_PRECOMP(old_re), exp, plen))
+ && memEQ(RX_PRECOMP(old_re), exp, plen)
+ && !runtime_code /* with runtime code, always recompile */ )
{
- /* with runtime code, always recompile */
- runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
- exp, plen);
- if (!runtime_code) {
- if (used_setjump) {
- JMPENV_POP;
- }
- Safefree(pRExC_state->code_blocks);
- return old_re;
- }
+ Safefree(pRExC_state->code_blocks);
+ return old_re;
}
- else if ((pm_flags & PMf_USE_RE_EVAL)
- /* this second condition covers the non-regex literal case,
- * i.e. $foo =~ '(?{})'. */
- || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
- && (PL_hints & HINT_RE_EVAL))
- )
- runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
- exp, plen);
-
-#ifdef TRIE_STUDY_OPT
- restudied = 0;
-#endif
rx_flags = orig_rx_flags;
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
/* whoops, we have a non-utf8 pattern, whilst run-time code
* got compiled as utf8. Try again with a utf8 pattern */
- JMPENV_JUMP(UTF8_LONGJMP);
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
+ goto redo_first_pass;
}
}
assert(!pRExC_state->runtime_code_qr);
/* First pass: determine size, legality. */
RExC_parse = exp;
RExC_start = exp;
- RExC_end = xend;
+ RExC_end = exp + plen;
RExC_naughty = 0;
RExC_npar = 1;
RExC_nestroot = 0;
pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
need it to survive as long as the regexp (qr/(?{})/).
We must check that code_blocksv is not already set, because we may
- have longjmped back. */
+ have jumped back to restart the sizing pass. */
if (pRExC_state->code_blocks && !code_blocksv) {
code_blocksv = newSV_type(SVt_PV);
SAVEFREESV(code_blocksv);
SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
}
if (reg(pRExC_state, 0, &flags,1) == NULL) {
- RExC_precomp = NULL;
- return(NULL);
+ /* It's possible to write a regexp in ascii that represents Unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ -- dmq */
+ if (flags & RESTART_UTF8) {
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->num_code_blocks);
+ goto redo_first_pass;
+ }
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
}
if (code_blocksv)
SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
- /* Here, finished first pass. Get rid of any added setjmp */
- if (used_setjump) {
- JMPENV_POP;
- }
-
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
"Required size %"IVdf" nodes\n"
RXi_SET( r, ri );
r->engine= eng;
r->extflags = rx_flags;
+ RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
+
if (pm_flags & PMf_IS_QR) {
ri->code_blocks = pRExC_state->code_blocks;
ri->num_code_blocks = pRExC_state->num_code_blocks;
RExC_flags = rx_flags; /* don't let top level (?i) bleed */
RExC_pm_flags = pm_flags;
RExC_parse = exp;
- RExC_end = xend;
+ RExC_end = exp + plen;
RExC_naughty = 0;
RExC_npar = 1;
RExC_emit_start = ri->program;
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
- return(NULL);
+ Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#X", flags);
}
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
if (RExC_seen & REG_SEEN_GPOS)
r->extflags |= RXf_GPOS_SEEN;
if (RExC_seen & REG_SEEN_LOOKBEHIND)
- r->extflags |= RXf_LOOKBEHIND_SEEN;
+ r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */
if (pRExC_state->num_code_blocks)
r->extflags |= RXf_EVAL_SEEN;
if (RExC_seen & REG_SEEN_CANY)
if (RExC_seen & REG_SEEN_VERBARG)
{
r->intflags |= PREGf_VERBARG_SEEN;
- r->extflags |= RXf_MODIFIES_VARS;
+ r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
}
if (RExC_seen & REG_SEEN_CUTGROUP)
r->intflags |= PREGf_CUTGROUP_SEEN;
else
RXp_PAREN_NAMES(r) = NULL;
-#ifdef STUPID_PATTERN_CHECKS
- if (RX_PRELEN(rx) == 0)
- r->extflags |= RXf_NULL;
- if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
- r->extflags |= RXf_WHITE;
- else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
- r->extflags |= RXf_START_ONLY;
-#else
{
regnode *first = ri->program + 1;
U8 fop = OP(first);
+ regnode *next = NEXTOPER(first);
+ U8 nop = OP(next);
- if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
+ if (PL_regkind[fop] == NOTHING && nop == END)
r->extflags |= RXf_NULL;
- else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
+ else if (PL_regkind[fop] == BOL && nop == END)
r->extflags |= RXf_START_ONLY;
- else if (fop == PLUS && PL_regkind[OP(NEXTOPER(first))] == POSIXD && FLAGS(NEXTOPER(first)) == _CC_SPACE
- && OP(regnext(first)) == END)
- r->extflags |= RXf_WHITE;
+ else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE && OP(regnext(first)) == END)
+ r->extflags |= RXf_WHITE;
+ else if ( r->extflags & RXf_SPLIT && fop == EXACT && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END )
+ r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
+
}
-#endif
#ifdef DEBUGGING
if (RExC_paren_names) {
ri->name_list_idx = add_data( pRExC_state, 1, "a" );
* should eventually be made public */
/* The header definitions are in F<inline_invlist.c> */
-#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
-#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
+#define TO_INTERNAL_SIZE(x) (((x) + HEADER_LENGTH) * sizeof(UV))
+#define FROM_INTERNAL_SIZE(x) (((x)/ sizeof(UV)) - HEADER_LENGTH)
#define INVLIST_INITIAL_LEN 10
*get_invlist_previous_index_addr(new_list) = 0;
*get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
#if HEADER_LENGTH != 5
-# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
+# error Need to regenerate INVLIST_VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
#endif
return new_list;
}
else {
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
- cp= array_b[i_b++];
+ cp = array_b[i_b++];
}
/* Here, have chosen which of the two inputs to look at. Only output
/* End of inversion list object */
-/*
- - reg - regular expression, i.e. main body or parenthesized thing
- *
- * Caller must absorb opening parenthesis.
- *
- * Combining parenthesis handling with the base level of regular expression
- * is a trifle forced, but the need to tie the tails of the branches to what
- * follows makes it hard to avoid.
- */
-#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
-#ifdef DEBUGGING
-#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
-#else
-#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
-#endif
-
-STATIC regnode *
-S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
- /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+STATIC void
+S_parse_lparen_question_flags(pTHX_ struct RExC_state_t *pRExC_state)
{
- dVAR;
- regnode *ret; /* Will be the head of the group. */
- regnode *br;
- regnode *lastbr;
- regnode *ender = NULL;
- I32 parno = 0;
- I32 flags;
- U32 oregflags = RExC_flags;
- bool have_branch = 0;
- bool is_open = 0;
- I32 freeze_paren = 0;
- I32 after_freeze = 0;
+ /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
+ * constructs, and updates RExC_flags with them. On input, RExC_parse
+ * should point to the first flag; it is updated on output to point to the
+ * final ')' or ':'. There needs to be at least one flag, or this will
+ * abort */
/* for (?g), (?gc), and (?o) warnings; warning
about (?c) will warn about (?g) -- japhy */
#define WASTED_C 0x04
#define WASTED_GC (0x02|0x04)
I32 wastedflags = 0x00;
+ U32 posflags = 0, negflags = 0;
+ U32 *flagsp = &posflags;
+ char has_charset_modifier = '\0';
+ regex_charset cs;
+ bool has_use_defaults = FALSE;
+ const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
- char * parse_start = RExC_parse; /* MJD */
- char * const oregcomp_parse = RExC_parse;
+ PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
- GET_RE_DEBUG_FLAGS_DECL;
+ /* '^' as an initial flag sets certain defaults */
+ if (UCHARAT(RExC_parse) == '^') {
+ RExC_parse++;
+ has_use_defaults = TRUE;
+ STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+ set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET);
+ }
- PERL_ARGS_ASSERT_REG;
- DEBUG_PARSE("reg ");
+ cs = get_regex_charset(RExC_flags);
+ if (cs == REGEX_DEPENDS_CHARSET
+ && (RExC_utf8 || RExC_uni_semantics))
+ {
+ cs = REGEX_UNICODE_CHARSET;
+ }
- *flagp = 0; /* Tentatively. */
+ while (*RExC_parse) {
+ /* && strchr("iogcmsx", *RExC_parse) */
+ /* (?g), (?gc) and (?o) are useless here
+ and must be globally applied -- japhy */
+ switch (*RExC_parse) {
+ /* Code for the imsx flags */
+ CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
- /* Make an OPEN node, if parenthesized. */
- if (paren) {
- if ( *RExC_parse == '*') { /* (*VERB:ARG) */
- char *start_verb = RExC_parse;
- STRLEN verb_len = 0;
- char *start_arg = NULL;
- unsigned char op = 0;
- int argok = 1;
- int internal_argval = 0; /* internal_argval is only useful if !argok */
- while ( *RExC_parse && *RExC_parse != ')' ) {
- if ( *RExC_parse == ':' ) {
- start_arg = RExC_parse + 1;
- break;
- }
- RExC_parse++;
- }
- ++start_verb;
- verb_len = RExC_parse - start_verb;
+ case LOCALE_PAT_MOD:
+ if (has_charset_modifier) {
+ goto excess_modifier;
+ }
+ else if (flagsp == &negflags) {
+ goto neg_modifier;
+ }
+ cs = REGEX_LOCALE_CHARSET;
+ has_charset_modifier = LOCALE_PAT_MOD;
+ RExC_contains_locale = 1;
+ break;
+ case UNICODE_PAT_MOD:
+ if (has_charset_modifier) {
+ goto excess_modifier;
+ }
+ else if (flagsp == &negflags) {
+ goto neg_modifier;
+ }
+ cs = REGEX_UNICODE_CHARSET;
+ has_charset_modifier = UNICODE_PAT_MOD;
+ break;
+ case ASCII_RESTRICT_PAT_MOD:
+ if (flagsp == &negflags) {
+ goto neg_modifier;
+ }
+ if (has_charset_modifier) {
+ if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
+ goto excess_modifier;
+ }
+ /* Doubled modifier implies more restricted */
+ cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
+ }
+ else {
+ cs = REGEX_ASCII_RESTRICTED_CHARSET;
+ }
+ has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
+ break;
+ case DEPENDS_PAT_MOD:
+ if (has_use_defaults) {
+ goto fail_modifiers;
+ }
+ else if (flagsp == &negflags) {
+ goto neg_modifier;
+ }
+ else if (has_charset_modifier) {
+ goto excess_modifier;
+ }
+
+ /* The dual charset means unicode semantics if the
+ * pattern (or target, not known until runtime) are
+ * utf8, or something in the pattern indicates unicode
+ * semantics */
+ cs = (RExC_utf8 || RExC_uni_semantics)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET;
+ has_charset_modifier = DEPENDS_PAT_MOD;
+ break;
+ excess_modifier:
+ RExC_parse++;
+ if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
+ vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
+ }
+ else if (has_charset_modifier == *(RExC_parse - 1)) {
+ vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
+ }
+ else {
+ vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
+ }
+ /*NOTREACHED*/
+ neg_modifier:
+ RExC_parse++;
+ vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
+ /*NOTREACHED*/
+ case ONCE_PAT_MOD: /* 'o' */
+ case GLOBAL_PAT_MOD: /* 'g' */
+ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
+ if (! (wastedflags & wflagbit) ) {
+ wastedflags |= wflagbit;
+ vWARN5(
+ RExC_parse + 1,
+ "Useless (%s%c) - %suse /%c modifier",
+ flagsp == &negflags ? "?-" : "?",
+ *RExC_parse,
+ flagsp == &negflags ? "don't " : "",
+ *RExC_parse
+ );
+ }
+ }
+ break;
+
+ case CONTINUE_PAT_MOD: /* 'c' */
+ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ if (! (wastedflags & WASTED_C) ) {
+ wastedflags |= WASTED_GC;
+ vWARN3(
+ RExC_parse + 1,
+ "Useless (%sc) - %suse /gc modifier",
+ flagsp == &negflags ? "?-" : "?",
+ flagsp == &negflags ? "don't " : ""
+ );
+ }
+ }
+ break;
+ case KEEPCOPY_PAT_MOD: /* 'p' */
+ if (flagsp == &negflags) {
+ if (SIZE_ONLY)
+ ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
+ } else {
+ *flagsp |= RXf_PMf_KEEPCOPY;
+ }
+ break;
+ case '-':
+ /* A flag is a default iff it is following a minus, so
+ * if there is a minus, it means will be trying to
+ * re-specify a default which is an error */
+ if (has_use_defaults || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ flagsp = &negflags;
+ wastedflags = 0; /* reset so (?g-c) warns twice */
+ break;
+ case ':':
+ case ')':
+ RExC_flags |= posflags;
+ RExC_flags &= ~negflags;
+ set_regex_charset(&RExC_flags, cs);
+ return;
+ /*NOTREACHED*/
+ default:
+ fail_modifiers:
+ RExC_parse++;
+ vFAIL3("Sequence (%.*s...) not recognized",
+ RExC_parse-seqstart, seqstart);
+ /*NOTREACHED*/
+ }
+
+ ++RExC_parse;
+ }
+}
+
+/*
+ - reg - regular expression, i.e. main body or parenthesized thing
+ *
+ * Caller must absorb opening parenthesis.
+ *
+ * Combining parenthesis handling with the base level of regular expression
+ * is a trifle forced, but the need to tie the tails of the branches to what
+ * follows makes it hard to avoid.
+ */
+#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
+#ifdef DEBUGGING
+#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
+#else
+#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
+#endif
+
+/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
+ flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
+ needs to be restarted.
+ Otherwise would only return NULL if regbranch() returns NULL, which
+ cannot happen. */
+STATIC regnode *
+S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
+ /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
+{
+ dVAR;
+ regnode *ret; /* Will be the head of the group. */
+ regnode *br;
+ regnode *lastbr;
+ regnode *ender = NULL;
+ I32 parno = 0;
+ I32 flags;
+ U32 oregflags = RExC_flags;
+ bool have_branch = 0;
+ bool is_open = 0;
+ I32 freeze_paren = 0;
+ I32 after_freeze = 0;
+
+ char * parse_start = RExC_parse; /* MJD */
+ char * const oregcomp_parse = RExC_parse;
+
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_REG;
+ DEBUG_PARSE("reg ");
+
+ *flagp = 0; /* Tentatively. */
+
+
+ /* Make an OPEN node, if parenthesized. */
+ if (paren) {
+ if ( *RExC_parse == '*') { /* (*VERB:ARG) */
+ char *start_verb = RExC_parse;
+ STRLEN verb_len = 0;
+ char *start_arg = NULL;
+ unsigned char op = 0;
+ int argok = 1;
+ int internal_argval = 0; /* internal_argval is only useful if !argok */
+ while ( *RExC_parse && *RExC_parse != ')' ) {
+ if ( *RExC_parse == ':' ) {
+ start_arg = RExC_parse + 1;
+ break;
+ }
+ RExC_parse++;
+ }
+ ++start_verb;
+ verb_len = RExC_parse - start_verb;
if ( start_arg ) {
RExC_parse++;
while ( *RExC_parse && *RExC_parse != ')' )
if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
- bool has_use_defaults = FALSE;
RExC_parse++;
paren = *RExC_parse++;
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#': /* (?#...) */
- while (*RExC_parse && *RExC_parse != ')')
+ case '#': /* (?#...) */
+ /* XXX As soon as we disallow separating the '?' and '*' (by
+ * spaces or (?#...) comment), it is believed that this case
+ * will be unreachable and can be removed. See
+ * [perl #117327] */
+ while (*RExC_parse && *RExC_parse != ')')
RExC_parse++;
if (*RExC_parse != ')')
FAIL("Sequence (?#... not terminated");
|| RExC_parse[1] == '<'
|| RExC_parse[1] == '{') { /* Lookahead or eval. */
I32 flag;
+ regnode *tail;
ret = reg_node(pRExC_state, LOGICAL);
if (!SIZE_ONLY)
ret->flags = 1;
- REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
+
+ tail = reg(pRExC_state, 1, &flag, depth+1);
+ if (flag & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return NULL;
+ }
+ REGTAIL(pRExC_state, ret, tail);
goto insert_if;
}
}
insert_if:
REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
br = regbranch(pRExC_state, &flags, 1,depth+1);
- if (br == NULL)
- br = reganode(pRExC_state, LONGJMP, 0);
- else
+ if (br == NULL) {
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return NULL;
+ }
+ FAIL2("panic: regbranch returned NULL, flags=%#X",
+ flags);
+ } else
REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
c = *nextchar(pRExC_state);
if (flags&HASWIDTH)
if (is_define)
vFAIL("(?(DEFINE)....) does not allow branches");
lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
- regbranch(pRExC_state, &flags, 1,depth+1);
+ if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return NULL;
+ }
+ FAIL2("panic: regbranch returned NULL, flags=%#X",
+ flags);
+ }
REGTAIL(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
}
}
+ case '[': /* (?[ ... ]) */
+ return handle_regex_sets(pRExC_state, NULL, flagp, depth,
+ oregcomp_parse);
case 0:
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
- case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
- that follow */
- has_use_defaults = TRUE;
- STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
- ? REGEX_UNICODE_CHARSET
- : REGEX_DEPENDS_CHARSET);
- goto parse_flags;
- default:
+ default: /* e.g., (?i) */
--RExC_parse;
- parse_flags: /* (?i) */
- {
- U32 posflags = 0, negflags = 0;
- U32 *flagsp = &posflags;
- char has_charset_modifier = '\0';
- regex_charset cs = get_regex_charset(RExC_flags);
- if (cs == REGEX_DEPENDS_CHARSET
- && (RExC_utf8 || RExC_uni_semantics))
- {
- cs = REGEX_UNICODE_CHARSET;
- }
-
- while (*RExC_parse) {
- /* && strchr("iogcmsx", *RExC_parse) */
- /* (?g), (?gc) and (?o) are useless here
- and must be globally applied -- japhy */
- switch (*RExC_parse) {
- CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
- case LOCALE_PAT_MOD:
- if (has_charset_modifier) {
- goto excess_modifier;
- }
- else if (flagsp == &negflags) {
- goto neg_modifier;
- }
- cs = REGEX_LOCALE_CHARSET;
- has_charset_modifier = LOCALE_PAT_MOD;
- RExC_contains_locale = 1;
- break;
- case UNICODE_PAT_MOD:
- if (has_charset_modifier) {
- goto excess_modifier;
- }
- else if (flagsp == &negflags) {
- goto neg_modifier;
- }
- cs = REGEX_UNICODE_CHARSET;
- has_charset_modifier = UNICODE_PAT_MOD;
- break;
- case ASCII_RESTRICT_PAT_MOD:
- if (flagsp == &negflags) {
- goto neg_modifier;
- }
- if (has_charset_modifier) {
- if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
- goto excess_modifier;
- }
- /* Doubled modifier implies more restricted */
- cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
- }
- else {
- cs = REGEX_ASCII_RESTRICTED_CHARSET;
- }
- has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
- break;
- case DEPENDS_PAT_MOD:
- if (has_use_defaults) {
- goto fail_modifiers;
- }
- else if (flagsp == &negflags) {
- goto neg_modifier;
- }
- else if (has_charset_modifier) {
- goto excess_modifier;
- }
-
- /* The dual charset means unicode semantics if the
- * pattern (or target, not known until runtime) are
- * utf8, or something in the pattern indicates unicode
- * semantics */
- cs = (RExC_utf8 || RExC_uni_semantics)
- ? REGEX_UNICODE_CHARSET
- : REGEX_DEPENDS_CHARSET;
- has_charset_modifier = DEPENDS_PAT_MOD;
- break;
- excess_modifier:
- RExC_parse++;
- if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
- vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
- }
- else if (has_charset_modifier == *(RExC_parse - 1)) {
- vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
- }
- else {
- vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
- }
- /*NOTREACHED*/
- neg_modifier:
- RExC_parse++;
- vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
- /*NOTREACHED*/
- case ONCE_PAT_MOD: /* 'o' */
- case GLOBAL_PAT_MOD: /* 'g' */
- if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
- if (! (wastedflags & wflagbit) ) {
- wastedflags |= wflagbit;
- vWARN5(
- RExC_parse + 1,
- "Useless (%s%c) - %suse /%c modifier",
- flagsp == &negflags ? "?-" : "?",
- *RExC_parse,
- flagsp == &negflags ? "don't " : "",
- *RExC_parse
- );
- }
- }
- break;
-
- case CONTINUE_PAT_MOD: /* 'c' */
- if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
- if (! (wastedflags & WASTED_C) ) {
- wastedflags |= WASTED_GC;
- vWARN3(
- RExC_parse + 1,
- "Useless (%sc) - %suse /gc modifier",
- flagsp == &negflags ? "?-" : "?",
- flagsp == &negflags ? "don't " : ""
- );
- }
- }
- break;
- case KEEPCOPY_PAT_MOD: /* 'p' */
- if (flagsp == &negflags) {
- if (SIZE_ONLY)
- ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
- } else {
- *flagsp |= RXf_PMf_KEEPCOPY;
- }
- break;
- case '-':
- /* A flag is a default iff it is following a minus, so
- * if there is a minus, it means will be trying to
- * re-specify a default which is an error */
- if (has_use_defaults || flagsp == &negflags) {
- fail_modifiers:
- RExC_parse++;
- vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
- /*NOTREACHED*/
- }
- flagsp = &negflags;
- wastedflags = 0; /* reset so (?g-c) warns twice */
- break;
- case ':':
- paren = ':';
- /*FALLTHROUGH*/
- case ')':
- RExC_flags |= posflags;
- RExC_flags &= ~negflags;
- set_regex_charset(&RExC_flags, cs);
- if (paren != ':') {
- oregflags |= posflags;
- oregflags &= ~negflags;
- set_regex_charset(&oregflags, cs);
- }
- nextchar(pRExC_state);
- if (paren != ':') {
- *flagp = TRYAGAIN;
- return NULL;
- } else {
- ret = NULL;
- goto parse_rest;
- }
- /*NOTREACHED*/
- default:
- RExC_parse++;
- vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
- /*NOTREACHED*/
- }
- ++RExC_parse;
- }
- }} /* one for the default block, one for the switch */
+ parse_flags:
+ parse_lparen_question_flags(pRExC_state);
+ if (UCHARAT(RExC_parse) != ':') {
+ nextchar(pRExC_state);
+ *flagp = TRYAGAIN;
+ return NULL;
+ }
+ paren = ':';
+ nextchar(pRExC_state);
+ ret = NULL;
+ goto parse_rest;
+ } /* end switch */
}
else { /* (...) */
capturing_parens:
/* branch_len = (paren != 0); */
- if (br == NULL)
- return(NULL);
+ if (br == NULL) {
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return NULL;
+ }
+ FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ }
if (*RExC_parse == '|') {
if (!SIZE_ONLY && RExC_extralen) {
reginsert(pRExC_state, BRANCHJ, br, depth+1);
}
br = regbranch(pRExC_state, &flags, 0, depth+1);
- if (br == NULL)
- return(NULL);
+ if (br == NULL) {
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return NULL;
+ }
+ FAIL2("panic: regbranch returned NULL, flags=%#X", flags);
+ }
REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
- regbranch - one alternative of an | operator
*
* Implements the concatenation operator.
+ *
+ * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
+ * restarted.
*/
STATIC regnode *
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
if (latest == NULL) {
if (flags & TRYAGAIN)
continue;
- return(NULL);
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return NULL;
+ }
+ FAIL2("panic: regpiece returned NULL, flags=%#X", flags);
}
else if (ret == NULL)
ret = latest;
* both the endmarker for their branch list and the body of the last branch.
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
+ *
+ * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
+ * TRYAGAIN.
+ * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
+ * restarted.
*/
STATIC regnode *
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
ret = regatom(pRExC_state, &flags,depth+1);
if (ret == NULL) {
- if (flags & TRYAGAIN)
- *flagp |= TRYAGAIN;
+ if (flags & (TRYAGAIN|RESTART_UTF8))
+ *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
+ else
+ FAIL2("panic: regatom returned NULL, flags=%#X", flags);
return(NULL);
}
op = *RExC_parse;
- if (op == '{' && regcurly(RExC_parse)) {
+ if (op == '{' && regcurly(RExC_parse, FALSE)) {
maxpos = NULL;
#ifdef RE_TRACK_PATTERN_OFFSETS
parse_start = RExC_parse; /* MJD */
}
STATIC bool
-S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
+ const bool strict /* Apply stricter parsing rules? */
+ )
{
/* This is expected to be called by a parser routine that has recognized '\N'
The function raises an error (via vFAIL), and doesn't return for various
syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
- success; it returns FALSE otherwise.
+ success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
+ RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
+ only possible if node_p is non-NULL.
+
If <valuep> is non-null, it means the caller can accept an input sequence
consisting of a just a single code point; <*valuep> is set to that value
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The former is assumed when it can't be the latter. */
- if (*p != '{' || regcurly(p)) {
+ if (*p != '{' || regcurly(p, FALSE)) {
RExC_parse = p;
if (! node_p) {
/* no bare \N in a charclass */
}
else if (in_char_class) {
if (SIZE_ONLY && in_char_class) {
- ckWARNreg(RExC_parse,
- "Ignoring zero length \\N{} in character class"
- );
+ if (strict) {
+ RExC_parse++; /* Position after the "}" */
+ vFAIL("Zero length \\N{}");
+ }
+ else {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class");
+ }
}
ret = FALSE;
}
}
if (in_char_class && has_multiple_chars) {
- ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ if (strict) {
+ RExC_parse = endbrace;
+ vFAIL("\\N{} in character class restricted to one character");
+ }
+ else {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
}
RExC_parse = endbrace + 1;
/* The values are Unicode, and therefore not subject to recoding */
RExC_override_recoding = 1;
- *node_p = reg(pRExC_state, 1, &flags, depth+1);
+ if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return FALSE;
+ }
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#X",
+ flags);
+ }
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
RExC_parse = endbrace;
escape sequences, with the one for handling literal escapes requiring
a dummy entry for all of the special escapes that are actually handled
by the other.
+
+ Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
+ TRYAGAIN.
+ Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
+ restarted.
+ Otherwise does not return NULL.
*/
STATIC regnode *
{
dVAR;
regnode *ret = NULL;
- I32 flags;
+ I32 flags = 0;
char *parse_start = RExC_parse;
U8 op;
int invert = 0;
{
char * const oregcomp_parse = ++RExC_parse;
ret = regclass(pRExC_state, flagp,depth+1,
- FALSE /* means parse the whole char class */ );
+ FALSE, /* means parse the whole char class */
+ TRUE, /* allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings. */
+ NULL);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
}
+ if (ret == NULL) {
+ if (*flagp & RESTART_UTF8)
+ return NULL;
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
+ *flagp);
+ }
nextchar(pRExC_state);
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
goto tryagain;
}
- return(NULL);
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return NULL;
+ }
+ FAIL2("panic: reg returned NULL to regatom, flags=%#X", flags);
}
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
break;
vFAIL("Internal urp");
/* Supposed to be caught earlier. */
break;
+ case '{':
+ if (!regcurly(RExC_parse, FALSE)) {
+ RExC_parse++;
+ goto defchar;
+ }
+ /* FALL THROUGH */
case '?':
case '+':
case '*':
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
+ if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+ ckWARNdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" or \"\\b[{]\" instead");
+ }
goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
+ if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+ ckWARNdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" or \"\\B[{]\" instead");
+ }
goto finish_meta_pat;
case 'D':
RExC_parse--;
ret = regclass(pRExC_state, flagp,depth+1,
- TRUE /* means just parse this element */ );
+ TRUE, /* means just parse this element */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings.
+ It would be a bug if these returned
+ non-portables */
+ NULL);
+ /* regclass() can only return RESTART_UTF8 if multi-char folds
+ are allowed. */
+ if (!ret)
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#X",
+ *flagp);
RExC_parse--;
* special treatment for quantifiers is not needed for such single
* character sequences */
++RExC_parse;
- if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+ if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
+ FALSE /* not strict */ )) {
+ if (*flagp & RESTART_UTF8)
+ return NULL;
RExC_parse--;
goto defchar;
}
* */
RExC_parse = p + 1;
if (! grok_bslash_N(pRExC_state, NULL, &ender,
- flagp, depth, FALSE))
+ flagp, depth, FALSE,
+ FALSE /* not strict */ ))
{
+ if (*flagp & RESTART_UTF8)
+ FAIL("panic: grok_bslash_N set RESTART_UTF8");
RExC_parse = p = oldp;
goto loopdone;
}
&error_msg,
TRUE, /* out warnings */
FALSE, /* not strict */
+ TRUE, /* Output warnings
+ for non-
+ portables */
UTF);
if (! valid) {
RExC_parse = p; /* going to die anyway; point
to exact spot of failure */
vFAIL(error_msg);
}
- else
- {
- ender = result;
- }
+ ender = result;
if (PL_encoding && ender < 0x100) {
goto recode_encoding;
}
}
case 'x':
{
- UV result;
+ UV result = UV_MAX; /* initialize to erroneous
+ value */
const char* error_msg;
bool valid = grok_bslash_x(&p,
&error_msg,
TRUE, /* out warnings */
FALSE, /* not strict */
+ TRUE, /* Output warnings
+ for non-
+ portables */
UTF);
if (! valid) {
RExC_parse = p; /* going to die anyway; point
to exact spot of failure */
vFAIL(error_msg);
}
- else {
- ender = result;
- }
+ ender = result;
+
if (PL_encoding && ender < 0x100) {
goto recode_encoding;
}
REQUIRE_UTF8;
}
p += numlen;
+ if (SIZE_ONLY /* like \08, \178 */
+ && numlen < 3
+ && p < RExC_end
+ && isDIGIT(*p) && ckWARN(WARN_REGEXP))
+ {
+ reg_warn_non_literal_string(
+ p + 1,
+ form_short_octal_warning(p, numlen));
+ }
}
- else {
+ else { /* Not to be treated as an octal constant, go
+ find backref */
--p;
goto loopdone;
}
/* FALL THROUGH */
default:
if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
- ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
+ /* Include any { following the alpha to emphasize
+ * that it could be part of an escape at some point
+ * in the future */
+ int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
+ ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
}
goto normal_default;
- }
+ } /* End of switch on '\' */
break;
- case '{':
- /* Currently we don't warn when the lbrace is at the start
- * of a construct. This catches it in the middle of a
- * literal string, or when its the first thing after
- * something like "\b" */
- if (! SIZE_ONLY
- && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
- {
- ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
- }
- /*FALLTHROUGH*/
- default:
+ default: /* A literal character */
+
+ if (! SIZE_ONLY
+ && RExC_flags & RXf_PMf_EXTENDED
+ && ckWARN(WARN_DEPRECATED)
+ && is_PATWS_non_low(p, UTF))
+ {
+ vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
+ "Escape literal pattern white space under /x");
+ }
+
normal_default:
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
return p;
}
+STATIC char *
+S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+{
+ /* Returns the next non-pattern-white space, non-comment character (the
+ * latter only if 'recognize_comment is true) in the string p, which is
+ * ended by RExC_end. If there is no line break ending a comment,
+ * RExC_seen has added the REG_SEEN_RUN_ON_COMMENT flag; */
+ const char *e = RExC_end;
+
+ PERL_ARGS_ASSERT_REGPATWS;
+
+ while (p < e) {
+ STRLEN len;
+ if ((len = is_PATWS_safe(p, e, UTF))) {
+ p += len;
+ }
+ else if (recognize_comment && *p == '#') {
+ bool ended = 0;
+ do {
+ p++;
+ if (is_LNBREAK_safe(p, e, UTF)) {
+ ended = 1;
+ break;
+ }
+ } while (p < e);
+ if (!ended)
+ RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
+ }
+ else
+ break;
+ }
+ return p;
+}
+
/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
Character classes ([:foo:]) can also be negated ([:^foo:]).
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
PERL_STATIC_INLINE I32
-S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me)
+S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
{
dVAR;
I32 namedclass = OOB_NAMEDCLASS;
if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
- POSIXCC(UCHARAT(RExC_parse))) {
+ POSIXCC(UCHARAT(RExC_parse)))
+ {
const char c = UCHARAT(RExC_parse);
char* const s = RExC_parse++;
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
- if (RExC_parse == RExC_end)
+ if (RExC_parse == RExC_end) {
+ if (strict) {
+
+ /* Try to give a better location for the error (than the end of
+ * the string) by looking for the matching ']' */
+ RExC_parse = s;
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
+ RExC_parse++;
+ }
+ vFAIL2("Unmatched '%c' in POSIX class", c);
+ }
/* Grandfather lone [:, [=, [. */
RExC_parse = s;
+ }
else {
const char* const t = RExC_parse++; /* skip over the c */
assert(*t == c);
/* Initially switch on the length of the name. */
switch (skip) {
case 4:
- if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
+ if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
+ this is the Perl \w
+ */
namedclass = ANYOF_WORDCHAR;
break;
case 5:
the class closes */
while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
RExC_parse++;
- SvREFCNT_dec(free_me);
vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
} else {
/* Maternal grandfather:
* "[:" ending in ":" but not in ":]" */
+ if (strict) {
+ vFAIL("Unmatched '[' in POSIX class");
+ }
+
+ /* Grandfather lone [:, [=, [. */
RExC_parse = s;
}
}
return namedclass;
}
+STATIC bool
+S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
+{
+ /* This applies some heuristics at the current parse position (which should
+ * be at a '[') to see if what follows might be intended to be a [:posix:]
+ * class. It returns true if it really is a posix class, of course, but it
+ * also can return true if it thinks that what was intended was a posix
+ * class that didn't quite make it.
+ *
+ * It will return true for
+ * [:alphanumerics:
+ * [:alphanumerics] (as long as the ] isn't followed immediately by a
+ * ')' indicating the end of the (?[
+ * [:any garbage including %^&$ punctuation:]
+ *
+ * This is designed to be called only from S_handle_regex_sets; it could be
+ * easily adapted to be called from the spot at the beginning of regclass()
+ * that checks to see in a normal bracketed class if the surrounding []
+ * have been omitted ([:word:] instead of [[:word:]]). But doing so would
+ * change long-standing behavior, so I (khw) didn't do that */
+ char* p = RExC_parse + 1;
+ char first_char = *p;
+
+ PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
+
+ assert(*(p - 1) == '[');
+
+ if (! POSIXCC(first_char)) {
+ return FALSE;
+ }
+
+ p++;
+ while (p < RExC_end && isWORDCHAR(*p)) p++;
+
+ if (p >= RExC_end) {
+ return FALSE;
+ }
+
+ if (p - RExC_parse > 2 /* Got at least 1 word character */
+ && (*p == first_char
+ || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
+ {
+ return TRUE;
+ }
+
+ p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
+
+ return (p
+ && p - RExC_parse > 2 /* [:] evaluates to colon;
+ [::] is a bad posix class. */
+ && first_char == *(p - 1));
+}
+
+STATIC regnode *
+S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, I32 *flagp, U32 depth,
+ char * const oregcomp_parse)
+{
+ /* Handle the (?[...]) construct to do set operations */
+
+ U8 curchar;
+ UV start, end; /* End points of code point ranges */
+ SV* result_string;
+ char *save_end, *save_parse;
+ SV* final;
+ STRLEN len;
+ regnode* node;
+ AV* stack;
+ const bool save_fold = FOLD;
+
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
+
+ if (LOC) {
+ vFAIL("(?[...]) not valid in locale");
+ }
+ RExC_uni_semantics = 1;
+
+ /* This will return only an ANYOF regnode, or (unlikely) something smaller
+ * (such as EXACT). Thus we can skip most everything if just sizing. We
+ * call regclass to handle '[]' so as to not have to reinvent its parsing
+ * rules here (throwing away the size it computes each time). And, we exit
+ * upon an unescaped ']' that isn't one ending a regclass. To do both
+ * these things, we need to realize that something preceded by a backslash
+ * is escaped, so we have to keep track of backslashes */
+ if (SIZE_ONLY) {
+
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
+ "The regex_sets feature is experimental" REPORT_LOCATION,
+ (int) (RExC_parse - RExC_precomp) , RExC_precomp, RExC_parse);
+
+ while (RExC_parse < RExC_end) {
+ SV* current = NULL;
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ switch (*RExC_parse) {
+ default:
+ break;
+ case '\\':
+ /* Skip the next byte (which could cause us to end up in
+ * the middle of a UTF-8 character, but since none of those
+ * are confusable with anything we currently handle in this
+ * switch (invariants all), it's safe. We'll just hit the
+ * default: case next time and keep on incrementing until
+ * we find one of the invariants we do handle. */
+ RExC_parse++;
+ break;
+ case '[':
+ {
+ /* If this looks like it is a [:posix:] class, leave the
+ * parse pointer at the '[' to fool regclass() into
+ * thinking it is part of a '[[:posix:]]'. That function
+ * will use strict checking to force a syntax error if it
+ * doesn't work out to a legitimate class */
+ bool is_posix_class
+ = could_it_be_a_POSIX_class(pRExC_state);
+ if (! is_posix_class) {
+ RExC_parse++;
+ }
+
+ /* regclass() can only return RESTART_UTF8 if multi-char
+ folds are allowed. */
+ if (!regclass(pRExC_state, flagp,depth+1,
+ is_posix_class, /* parse the whole char
+ class only if not a
+ posix class */
+ FALSE, /* don't allow multi-char folds */
+ TRUE, /* silence non-portable warnings. */
+ ¤t))
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
+ *flagp);
+
+ /* function call leaves parse pointing to the ']', except
+ * if we faked it */
+ if (is_posix_class) {
+ RExC_parse--;
+ }
+
+ SvREFCNT_dec(current); /* In case it returned something */
+ break;
+ }
+
+ case ']':
+ RExC_parse++;
+ if (RExC_parse < RExC_end
+ && *RExC_parse == ')')
+ {
+ node = reganode(pRExC_state, ANYOF, 0);
+ RExC_size += ANYOF_SKIP;
+ nextchar(pRExC_state);
+ Set_Node_Length(node,
+ RExC_parse - oregcomp_parse + 1); /* MJD */
+ return node;
+ }
+ goto no_close;
+ }
+ RExC_parse++;
+ }
+
+ no_close:
+ FAIL("Syntax error in (?[...])");
+ }
+
+ /* Pass 2 only after this. Everything in this construct is a
+ * metacharacter. Operands begin with either a '\' (for an escape
+ * sequence), or a '[' for a bracketed character class. Any other
+ * character should be an operator, or parenthesis for grouping. Both
+ * types of operands are handled by calling regclass() to parse them. It
+ * is called with a parameter to indicate to return the computed inversion
+ * list. The parsing here is implemented via a stack. Each entry on the
+ * stack is a single character representing one of the operators, or the
+ * '('; or else a pointer to an operand inversion list. */
+
+#define IS_OPERAND(a) (! SvIOK(a))
+
+ /* The stack starts empty. It is a syntax error if the first thing parsed
+ * is a binary operator; everything else is pushed on the stack. When an
+ * operand is parsed, the top of the stack is examined. If it is a binary
+ * operator, the item before it should be an operand, and both are replaced
+ * by the result of doing that operation on the new operand and the one on
+ * the stack. Thus a sequence of binary operands is reduced to a single
+ * one before the next one is parsed.
+ *
+ * A unary operator may immediately follow a binary in the input, for
+ * example
+ * [a] + ! [b]
+ * When an operand is parsed and the top of the stack is a unary operator,
+ * the operation is performed, and then the stack is rechecked to see if
+ * this new operand is part of a binary operation; if so, it is handled as
+ * above.
+ *
+ * A '(' is simply pushed on the stack; it is valid only if the stack is
+ * empty, or the top element of the stack is an operator or another '('
+ * (for which the parenthesized expression will become an operand). By the
+ * time the corresponding ')' is parsed everything in between should have
+ * been parsed and evaluated to a single operand (or else is a syntax
+ * error), and is handled as a regular operand */
+
+ stack = newAV();
+
+ while (RExC_parse < RExC_end) {
+ I32 top_index = av_tindex(stack);
+ SV** top_ptr;
+ SV* current = NULL;
+
+ /* Skip white space */
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ if (RExC_parse >= RExC_end) {
+ Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
+ }
+ if ((curchar = UCHARAT(RExC_parse)) == ']') {
+ break;
+ }
+
+ switch (curchar) {
+
+ case '?':
+ if (av_tindex(stack) >= 0 /* This makes sure that we can
+ safely subtract 1 from
+ RExC_parse in the next clause.
+ If we have something on the
+ stack, we have parsed something
+ */
+ && UCHARAT(RExC_parse - 1) == '('
+ && RExC_parse < RExC_end)
+ {
+ /* If is a '(?', could be an embedded '(?flags:(?[...])'.
+ * This happens when we have some thing like
+ *
+ * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
+ * ...
+ * qr/(?[ \p{Digit} & $thai_or_lao ])/;
+ *
+ * Here we would be handling the interpolated
+ * '$thai_or_lao'. We handle this by a recursive call to
+ * ourselves which returns the inversion list the
+ * interpolated expression evaluates to. We use the flags
+ * from the interpolated pattern. */
+ U32 save_flags = RExC_flags;
+ const char * const save_parse = ++RExC_parse;
+
+ parse_lparen_question_flags(pRExC_state);
+
+ if (RExC_parse == save_parse /* Makes sure there was at
+ least one flag (or this
+ embedding wasn't compiled)
+ */
+ || RExC_parse >= RExC_end - 4
+ || UCHARAT(RExC_parse) != ':'
+ || UCHARAT(++RExC_parse) != '('
+ || UCHARAT(++RExC_parse) != '?'
+ || UCHARAT(++RExC_parse) != '[')
+ {
+
+ /* In combination with the above, this moves the
+ * pointer to the point just after the first erroneous
+ * character (or if there are no flags, to where they
+ * should have been) */
+ if (RExC_parse >= RExC_end - 4) {
+ RExC_parse = RExC_end;
+ }
+ else if (RExC_parse != save_parse) {
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ }
+ vFAIL("Expecting '(?flags:(?[...'");
+ }
+ RExC_parse++;
+ (void) handle_regex_sets(pRExC_state, ¤t, flagp,
+ depth+1, oregcomp_parse);
+
+ /* Here, 'current' contains the embedded expression's
+ * inversion list, and RExC_parse points to the trailing
+ * ']'; the next character should be the ')' which will be
+ * paired with the '(' that has been put on the stack, so
+ * the whole embedded expression reduces to '(operand)' */
+ RExC_parse++;
+
+ RExC_flags = save_flags;
+ goto handle_operand;
+ }
+ /* FALL THROUGH */
+
+ default:
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unexpected character");
+
+ case '\\':
+ /* regclass() can only return RESTART_UTF8 if multi-char
+ folds are allowed. */
+ if (!regclass(pRExC_state, flagp,depth+1,
+ TRUE, /* means parse just the next thing */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings. */
+ ¤t))
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
+ *flagp);
+ /* regclass() will return with parsing just the \ sequence,
+ * leaving the parse pointer at the next thing to parse */
+ RExC_parse--;
+ goto handle_operand;
+
+ case '[': /* Is a bracketed character class */
+ {
+ bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
+
+ if (! is_posix_class) {
+ RExC_parse++;
+ }
+
+ /* regclass() can only return RESTART_UTF8 if multi-char
+ folds are allowed. */
+ if(!regclass(pRExC_state, flagp,depth+1,
+ is_posix_class, /* parse the whole char class
+ only if not a posix class */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings. */
+ ¤t))
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#X",
+ *flagp);
+ /* function call leaves parse pointing to the ']', except if we
+ * faked it */
+ if (is_posix_class) {
+ RExC_parse--;
+ }
+
+ goto handle_operand;
+ }
+
+ case '&':
+ case '|':
+ case '+':
+ case '-':
+ case '^':
+ if (top_index < 0
+ || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
+ || ! IS_OPERAND(*top_ptr))
+ {
+ RExC_parse++;
+ vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
+ }
+ av_push(stack, newSVuv(curchar));
+ break;
+
+ case '!':
+ av_push(stack, newSVuv(curchar));
+ break;
+
+ case '(':
+ if (top_index >= 0) {
+ top_ptr = av_fetch(stack, top_index, FALSE);
+ assert(top_ptr);
+ if (IS_OPERAND(*top_ptr)) {
+ RExC_parse++;
+ vFAIL("Unexpected '(' with no preceding operator");
+ }
+ }
+ av_push(stack, newSVuv(curchar));
+ break;
+
+ case ')':
+ {
+ SV* lparen;
+ if (top_index < 1
+ || ! (current = av_pop(stack))
+ || ! IS_OPERAND(current)
+ || ! (lparen = av_pop(stack))
+ || IS_OPERAND(lparen)
+ || SvUV(lparen) != '(')
+ {
+ RExC_parse++;
+ vFAIL("Unexpected ')'");
+ }
+ top_index -= 2;
+ SvREFCNT_dec_NN(lparen);
+
+ /* FALL THROUGH */
+ }
+
+ handle_operand:
+
+ /* Here, we have an operand to process, in 'current' */
+
+ if (top_index < 0) { /* Just push if stack is empty */
+ av_push(stack, current);
+ }
+ else {
+ SV* top = av_pop(stack);
+ char current_operator;
+
+ if (IS_OPERAND(top)) {
+ vFAIL("Operand with no preceding operator");
+ }
+ current_operator = (char) SvUV(top);
+ switch (current_operator) {
+ case '(': /* Push the '(' back on followed by the new
+ operand */
+ av_push(stack, top);
+ av_push(stack, current);
+ SvREFCNT_inc(top); /* Counters the '_dec' done
+ just after the 'break', so
+ it doesn't get wrongly freed
+ */
+ break;
+
+ case '!':
+ _invlist_invert(current);
+
+ /* Unlike binary operators, the top of the stack,
+ * now that this unary one has been popped off, may
+ * legally be an operator, and we now have operand
+ * for it. */
+ top_index--;
+ SvREFCNT_dec_NN(top);
+ goto handle_operand;
+
+ case '&':
+ _invlist_intersection(av_pop(stack),
+ current,
+ ¤t);
+ av_push(stack, current);
+ break;
+
+ case '|':
+ case '+':
+ _invlist_union(av_pop(stack), current, ¤t);
+ av_push(stack, current);
+ break;
+
+ case '-':
+ _invlist_subtract(av_pop(stack), current, ¤t);
+ av_push(stack, current);
+ break;
+
+ case '^': /* The union minus the intersection */
+ {
+ SV* i = NULL;
+ SV* u = NULL;
+ SV* element;
+
+ element = av_pop(stack);
+ _invlist_union(element, current, &u);
+ _invlist_intersection(element, current, &i);
+ _invlist_subtract(u, i, ¤t);
+ av_push(stack, current);
+ SvREFCNT_dec_NN(i);
+ SvREFCNT_dec_NN(u);
+ SvREFCNT_dec_NN(element);
+ break;
+ }
+
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
+ }
+ SvREFCNT_dec_NN(top);
+ }
+ }
+
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ }
+
+ if (av_tindex(stack) < 0 /* Was empty */
+ || ((final = av_pop(stack)) == NULL)
+ || ! IS_OPERAND(final)
+ || av_tindex(stack) >= 0) /* More left on stack */
+ {
+ vFAIL("Incomplete expression within '(?[ ])'");
+ }
+
+ /* Here, 'final' is the resultant inversion list from evaluating the
+ * expression. Return it if so requested */
+ if (return_invlist) {
+ *return_invlist = final;
+ return END;
+ }
+
+ /* Otherwise generate a resultant node, based on 'final'. regclass() is
+ * expecting a string of ranges and individual code points */
+ invlist_iterinit(final);
+ result_string = newSVpvs("");
+ while (invlist_iternext(final, &start, &end)) {
+ if (start == end) {
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
+ start, end);
+ }
+ }
+
+ save_parse = RExC_parse;
+ RExC_parse = SvPV(result_string, len);
+ save_end = RExC_end;
+ RExC_end = RExC_parse + len;
+
+ /* We turn off folding around the call, as the class we have constructed
+ * already has all folding taken into consideration, and we don't want
+ * regclass() to add to that */
+ RExC_flags &= ~RXf_PMf_FOLD;
+ /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
+ */
+ node = regclass(pRExC_state, flagp,depth+1,
+ FALSE, /* means parse the whole char class */
+ FALSE, /* don't allow multi-char folds */
+ TRUE, /* silence non-portable warnings. The above may very
+ well have generated non-portable code points, but
+ they're valid on this machine */
+ NULL);
+ if (!node)
+ FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
+ PTR2UV(flagp));
+ if (save_fold) {
+ RExC_flags |= RXf_PMf_FOLD;
+ }
+ RExC_parse = save_parse + 1;
+ RExC_end = save_end;
+ SvREFCNT_dec_NN(final);
+ SvREFCNT_dec_NN(result_string);
+ SvREFCNT_dec_NN(stack);
+
+ nextchar(pRExC_state);
+ Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
+ return node;
+}
+#undef IS_OPERAND
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
+ const bool stop_at_1, /* Just parse the next thing, don't
+ look for a full character class */
+ bool allow_multi_folds,
+ const bool silence_non_portable, /* Don't output warnings
+ about too large
+ characters */
+ SV** ret_invlist) /* Return an inversion list, not a node */
{
- /* parse a bracketed class specification. Most of these will produce an ANYOF node;
- * but something like [a] will produce an EXACT node; [aA], an EXACTFish
- * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
- * multi-character folds: it will be rewritten following the paradigm of
- * this example, where the <multi-fold>s are characters which fold to
- * multiple character sequences:
+ /* parse a bracketed class specification. Most of these will produce an
+ * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
+ * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
+ * under /i with multi-character folds: it will be rewritten following the
+ * paradigm of this example, where the <multi-fold>s are characters which
+ * fold to multiple character sequences:
* /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
* gets effectively rewritten as:
* /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
* corresponding bit set if that character is in the list. For characters
* above 255, a range list or swash is used. There are extra bits for \w,
* etc. in locale ANYOFs, as what these match is not determinable at
- * compile time */
+ * compile time
+ *
+ * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
+ * to be restarted. This can only happen if ret_invlist is non-NULL.
+ */
dVAR;
- UV nextvalue;
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
UV value = OOB_UNICODE, save_value = OOB_UNICODE;
character; used under /i */
UV n;
char * stop_ptr = RExC_end; /* where to stop parsing */
+ const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
+ space? */
+ const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
/* Unicode properties are stored in a swash; this holds the current one
* being parsed. If this swash is the only above-latin1 component of the
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state, ANYOF, 0);
- if (!SIZE_ONLY) {
- ANYOF_FLAGS(ret) = 0;
- }
-
- if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
- RExC_parse++;
- invert = TRUE;
- RExC_naughty++;
- }
-
if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
}
else {
+ ANYOF_FLAGS(ret) = 0;
+
RExC_emit += ANYOF_SKIP;
if (LOC) {
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
}
- listsv = newSVpvs("# comment\n");
+ listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
initial_listsv_len = SvCUR(listsv);
+ SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
}
- nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+ if (skip_white) {
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ FALSE /* means don't recognize comments */);
+ }
+
+ if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
+ RExC_parse++;
+ invert = TRUE;
+ allow_multi_folds = FALSE;
+ RExC_naughty++;
+ if (skip_white) {
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ FALSE /* means don't recognize comments */);
+ }
+ }
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
- if (!SIZE_ONLY && POSIXCC(nextvalue)) {
+ if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
const char *s = RExC_parse;
const char c = *s++;
s++;
if (*s && c == *s && s[1] == ']') {
SAVEFREESV(RExC_rx_sv);
- SAVEFREESV(listsv);
ckWARN3reg(s+2,
"POSIX syntax [%c %c] belongs inside character classes",
c, c);
(void)ReREFCNT_inc(RExC_rx_sv);
- SvREFCNT_inc_simple_void_NN(listsv);
}
}
stop_ptr = RExC_parse + 1;
}
- /* allow 1st char to be ] (allowing it to be - is dealt with later) */
+ /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
if (UCHARAT(RExC_parse) == ']')
goto charclassloop;
parseit:
- while (RExC_parse < stop_ptr && UCHARAT(RExC_parse) != ']') {
+ while (1) {
+ if (RExC_parse >= stop_ptr) {
+ break;
+ }
+
+ if (skip_white) {
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ FALSE /* means don't recognize comments */);
+ }
+
+ if (UCHARAT(RExC_parse) == ']') {
+ break;
+ }
charclassloop:
else
value = UCHARAT(RExC_parse++);
- nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
- if (value == '[' && POSIXCC(nextvalue))
- namedclass = regpposixcc(pRExC_state, value, listsv);
- else if (value == '\\') {
+ if (value == '['
+ && RExC_parse < RExC_end
+ && POSIXCC(UCHARAT(RExC_parse)))
+ {
+ namedclass = regpposixcc(pRExC_state, value, strict);
+ }
+ else if (value == '\\') {
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
}
else
value = UCHARAT(RExC_parse++);
+
/* Some compilers cannot handle switching on 64-bit integer
* values, therefore value cannot be an UV. Yes, this will
* be a problem later if we want switch on Unicode.
* A similar issue a little bit later when switching on
* namedclass. --jhi */
- switch ((I32)value) {
+
+ /* If the \ is escaping white space when white space is being
+ * skipped, it means that that white space is wanted literally, and
+ * is already in 'value'. Otherwise, need to translate the escape
+ * into what it signifies. */
+ if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
+
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
case 's': namedclass = ANYOF_SPACE; break;
from earlier versions, OTOH that behaviour was broken
as well. */
if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
- TRUE /* => charclass */))
+ TRUE, /* => charclass */
+ strict))
{
+ if (*flagp & RESTART_UTF8)
+ FAIL("panic: grok_bslash_N set RESTART_UTF8");
goto parseit;
}
}
}
/* Here didn't find it. It could be a user-defined
- * property that will be available at run-time. Add it
- * to the list to look up then */
+ * property that will be available at run-time. If we
+ * accept only compile-time properties, is an error;
+ * otherwise add it to the list for run-time look up */
+ if (ret_invlist) {
+ RExC_parse = e + 1;
+ vFAIL3("Property '%.*s' is unknown", (int) n, name);
+ }
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
(value == 'p' ? '+' : '!'),
name);
Safefree(name);
}
RExC_parse = e + 1;
- namedclass = ANYOF_UNIPROP; /* no official name, but it's named */
+ namedclass = ANYOF_UNIPROP; /* no official name, but it's
+ named */
/* \p means they want Unicode semantics */
RExC_uni_semantics = 1;
bool valid = grok_bslash_o(&RExC_parse,
&value,
&error_msg,
- SIZE_ONLY,
- FALSE, /* Not strict */
+ SIZE_ONLY, /* warnings in pass
+ 1 only */
+ strict,
+ silence_non_portable,
UTF);
if (! valid) {
vFAIL(error_msg);
&value,
&error_msg,
TRUE, /* Output warnings */
- FALSE, /* Not strict */
+ strict,
+ silence_non_portable,
UTF);
- if (! valid) {
+ if (! valid) {
vFAIL(error_msg);
}
}
{
/* Take 1-3 octal digits */
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
- numlen = 3;
- value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
+ numlen = (strict) ? 4 : 3;
+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
RExC_parse += numlen;
+ if (numlen != 3) {
+ if (strict) {
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Need exactly 3 octal digits");
+ }
+ else if (! SIZE_ONLY /* like \08, \178 */
+ && numlen < 3
+ && RExC_parse < RExC_end
+ && isDIGIT(*RExC_parse)
+ && ckWARN(WARN_REGEXP))
+ {
+ SAVEFREESV(RExC_rx_sv);
+ reg_warn_non_literal_string(
+ RExC_parse + 1,
+ form_short_octal_warning(RExC_parse, numlen));
+ (void)ReREFCNT_inc(RExC_rx_sv);
+ }
+ }
if (PL_encoding && value < 0x100)
goto recode_encoding;
break;
if (! RExC_override_recoding) {
SV* enc = PL_encoding;
value = reg_recode((const char)(U8)value, &enc);
- if (!enc && SIZE_ONLY)
- ckWARNreg(RExC_parse,
+ if (!enc) {
+ if (strict) {
+ vFAIL("Invalid escape in the specified encoding");
+ }
+ else if (SIZE_ONLY) {
+ ckWARNreg(RExC_parse,
"Invalid escape in the specified encoding");
+ }
+ }
break;
}
default:
/* Allow \_ to not give an error */
if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
- SAVEFREESV(RExC_rx_sv);
- SAVEFREESV(listsv);
- ckWARN2reg(RExC_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
- (void)ReREFCNT_inc(RExC_rx_sv);
- SvREFCNT_inc_simple_void_NN(listsv);
+ if (strict) {
+ vFAIL2("Unrecognized escape \\%c in character class",
+ (int)value);
+ }
+ else {
+ SAVEFREESV(RExC_rx_sv);
+ ckWARN2reg(RExC_parse,
+ "Unrecognized escape \\%c in character class passed through",
+ (int)value);
+ (void)ReREFCNT_inc(RExC_rx_sv);
+ }
}
break;
- }
+ } /* End of switch on char following backslash */
} /* end of handling backslash escape sequences */
#ifdef EBCDIC
- else
- literal_endpoint++;
+ else
+ literal_endpoint++;
#endif
+ /* Here, we have the current token in 'value' */
+
/* What matches in a locale is not known until runtime. This includes
* what the Posix classes (like \w, [:space:]) match. Room must be
* reserved (one time per class) to store such classes, either if Perl
* the 'a' in the examples */
if (range) {
if (!SIZE_ONLY) {
- const int w =
- RExC_parse >= rangebegin ?
- RExC_parse - rangebegin : 0;
- SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
- SAVEFREESV(listsv);
- ckWARN4reg(RExC_parse,
- "False [] range \"%*.*s\"",
- w, w, rangebegin);
- (void)ReREFCNT_inc(RExC_rx_sv);
- SvREFCNT_inc_simple_void_NN(listsv);
- cp_list = add_cp_to_invlist(cp_list, '-');
- cp_list = add_cp_to_invlist(cp_list, prevvalue);
+ const int w = (RExC_parse >= rangebegin)
+ ? RExC_parse - rangebegin
+ : 0;
+ if (strict) {
+ vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin);
+ }
+ else {
+ SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
+ ckWARN4reg(RExC_parse,
+ "False [] range \"%*.*s\"",
+ w, w, rangebegin);
+ (void)ReREFCNT_inc(RExC_rx_sv);
+ cp_list = add_cp_to_invlist(cp_list, '-');
+ cp_list = add_cp_to_invlist(cp_list, prevvalue);
+ }
}
range = 0; /* this was not a true range */
* class */
const char *Xname = swash_property_names[classnum];
+ /* If returning the inversion list, we can't defer
+ * getting this until runtime */
+ if (ret_invlist && ! PL_utf8_swash_ptrs[classnum]) {
+ PL_utf8_swash_ptrs[classnum] =
+ _core_swash_init("utf8", Xname, &PL_sv_undef,
+ 1, /* binary */
+ 0, /* not tr/// */
+ NULL, /* No inversion list */
+ NULL /* No flags */
+ );
+ assert(PL_utf8_swash_ptrs[classnum]);
+ }
if ( ! PL_utf8_swash_ptrs[classnum]) {
if (namedclass % 2 == 0) { /* A non-complemented
class */
}
} /* end of namedclass \blah */
+ /* Here, we have a single value. If 'range' is set, it is the ending
+ * of a range--check its validity. Later, we will handle each
+ * individual code point in the range. If 'range' isn't set, this
+ * could be the beginning of a range, so check for that by looking
+ * ahead to see if the next real character to be processed is the range
+ * indicator--the minus sign */
+
+ if (skip_white) {
+ RExC_parse = regpatws(pRExC_state, RExC_parse,
+ FALSE /* means don't recognize comments */);
+ }
+
if (range) {
if (prevvalue > value) /* b-a */ {
const int w = RExC_parse - rangebegin;
}
else {
prevvalue = value; /* save the beginning of the potential range */
- if (RExC_parse+1 < RExC_end
- && *RExC_parse == '-'
- && RExC_parse[1] != ']')
- {
- RExC_parse++;
+ if (! stop_at_1 /* Can't be a range if parsing just one thing */
+ && *RExC_parse == '-')
+ {
+ char* next_char_ptr = RExC_parse + 1;
+ if (skip_white) { /* Get the next real char after the '-' */
+ next_char_ptr = regpatws(pRExC_state,
+ RExC_parse + 1,
+ FALSE); /* means don't recognize
+ comments */
+ }
- /* a bad range like \w-, [:word:]- ? */
- if (namedclass > OOB_NAMEDCLASS) {
- if (ckWARN(WARN_REGEXP)) {
- const int w =
- RExC_parse >= rangebegin ?
- RExC_parse - rangebegin : 0;
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- w, w, rangebegin);
- }
- if (!SIZE_ONLY) {
- cp_list = add_cp_to_invlist(cp_list, '-');
- }
- element_count++;
- } else
- range = 1; /* yeah, it's a range! */
- continue; /* but do it the next time */
+ /* If the '-' is at the end of the class (just before the ']',
+ * it is a literal minus; otherwise it is a range */
+ if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
+ RExC_parse = next_char_ptr;
+
+ /* a bad range like \w-, [:word:]- ? */
+ if (namedclass > OOB_NAMEDCLASS) {
+ if (strict || ckWARN(WARN_REGEXP)) {
+ const int w =
+ RExC_parse >= rangebegin ?
+ RExC_parse - rangebegin : 0;
+ if (strict) {
+ vFAIL4("False [] range \"%*.*s\"",
+ w, w, rangebegin);
+ }
+ else {
+ vWARN4(RExC_parse,
+ "False [] range \"%*.*s\"",
+ w, w, rangebegin);
+ }
+ }
+ if (!SIZE_ONLY) {
+ cp_list = add_cp_to_invlist(cp_list, '-');
+ }
+ element_count++;
+ } else
+ range = 1; /* yeah, it's a range! */
+ continue; /* but do it the next time */
+ }
}
}
* "ss" =~ /^[^\xDF]+$/i => N
*
* See [perl #89750] */
- if (FOLD && ! invert && value == prevvalue) {
+ if (FOLD && allow_multi_folds && value == prevvalue) {
if (value == LATIN_SMALL_LETTER_SHARP_S
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
value)))
#ifndef EBCDIC
cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
#else
- UV* this_range = _new_invlist(1);
+ SV* this_range = _new_invlist(1);
_append_range_to_invlist(this_range, prevvalue, value);
/* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
&& (prevvalue >= 'a' && value <= 'z')
|| (prevvalue >= 'A' && value <= 'Z'))
{
- _invlist_intersection(this_range, PL_ASCII, &this_range, );
- _invlist_intersection(this_range, PL_Alpha, &this_range, );
+ _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
+ &this_range);
}
_invlist_union(cp_list, this_range, &cp_list);
literal_endpoint = 0;
ret = reg(pRExC_state, 1, ®_flags, depth+1);
- *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
+ *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
RExC_parse = save_parse;
RExC_end = save_end;
RExC_in_multi_char_class = 0;
SvREFCNT_dec_NN(multi_char_matches);
- SvREFCNT_dec_NN(listsv);
return ret;
}
/* If the character class contains only a single element, it may be
* optimizable into another node type which is smaller and runs faster.
* Check if this is the case for this class */
- if (element_count == 1) {
+ if (element_count == 1 && ! ret_invlist) {
U8 op = END;
U8 arg = 0;
}
/* FALLTHROUGH */
- /* The rest have more possibilities depending on the charset. We
- * take advantage of the enum ordering of the charset modifiers to
- * get the exact node type, */
+ /* The rest have more possibilities depending on the charset.
+ * We take advantage of the enum ordering of the charset
+ * modifiers to get the exact node type, */
default:
op = POSIXD + get_regex_charset(RExC_flags);
if (op > POSIXA) { /* /aa is same as /a */
RExC_parse = (char *) cur_parse;
SvREFCNT_dec(posixes);
- SvREFCNT_dec_NN(listsv);
SvREFCNT_dec(cp_list);
return ret;
}
* indicators, which are weeded out below using the
* IS_IN_SOME_FOLD_L1() macro */
if (invlist_highest(cp_list) < 256) {
- _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list, &fold_intersection);
+ _invlist_intersection(PL_L1Posix_ptrs[_CC_ALPHA], cp_list,
+ &fold_intersection);
}
else {
invert = FALSE;
}
+ if (ret_invlist) {
+ *ret_invlist = cp_list;
+
+ /* Discard the generated node */
+ if (SIZE_ONLY) {
+ RExC_size = orig_size;
+ }
+ else {
+ RExC_emit = orig_emit;
+ }
+ return orig_emit;
+ }
+
/* If we didn't do folding, it's because some information isn't available
* until runtime; set the run-time fold flag for these. (We don't have to
* worry about properties folding, as that is taken care of by the swash
}
SvREFCNT_dec_NN(cp_list);
- SvREFCNT_dec_NN(listsv);
return ret;
}
}
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
- SvREFCNT_dec_NN(listsv);
}
else {
/* av[0] stores the character class description in its textual form:
SV *rv;
av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
- ? listsv
- : (SvREFCNT_dec_NN(listsv), &PL_sv_undef));
+ ? SvREFCNT_inc(listsv) : &PL_sv_undef);
if (swash) {
av_store(av, 1, swash);
SvREFCNT_dec_NN(cp_list);
npar = r->nparens+1;
Newx(ret->offs, npar, regexp_paren_pair);
Copy(r->offs, ret->offs, npar, regexp_paren_pair);
- if(ret->swap) {
- /* no need to copy these */
- Newx(ret->swap, npar, regexp_paren_pair);
- }
if (ret->substrs) {
/* Do it this way to avoid reading from *r after the StructCopy().
EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
ones (binary 1111 1111, hexadecimal FF). It is similar, but not
- identical, to the ASCII delete (DEL) or rubout control character.
- ) So the old condition can be simplified to !isPRINT(c) */
+ identical, to the ASCII delete (DEL) or rubout control character. ...
+ it is typically mapped to hexadecimal code 9F, in order to provide a
+ unique character mapping in both directions)
+
+ So the old condition can be simplified to !isPRINT(c) */
if (!isPRINT(c)) {
if (c < 256) {
Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);