5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
92 #define PERL_IN_REGEXEC_C
97 #define RF_tainted 1 /* tainted information used? */
98 #define RF_warned 2 /* warned about big count? */
99 #define RF_evaled 4 /* Did an EVAL with setting? */
100 #define RF_utf8 8 /* String contains multibyte chars? */
102 #define UTF ((PL_reg_flags & RF_utf8) != 0)
104 #define RS_init 1 /* eval environment created */
105 #define RS_set 2 /* replsv value is set */
108 #define STATIC static
111 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
117 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
118 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
120 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
121 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
122 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
123 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
124 #define HOPc(pos,off) ((char*)HOP(pos,off))
125 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
127 #define HOPBACK(pos, off) ( \
128 (PL_reg_match_utf8) \
129 ? reghopmaybe((U8*)pos, -off) \
130 : (pos - off >= PL_bostr) \
134 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
136 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
137 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
138 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
140 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
141 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
143 #define LOAD_UTF8_CHARCLASS(a,b) STMT_START { if (!CAT2(PL_utf8_,a)) { ENTER; save_re_context(); (void)CAT2(is_utf8_, a)((U8*)b); LEAVE; } } STMT_END
145 /* for use after a quantifier and before an EXACT-like node -- japhy */
146 #define JUMPABLE(rn) ( \
147 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
148 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
149 OP(rn) == PLUS || OP(rn) == MINMOD || \
150 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
153 #define HAS_TEXT(rn) ( \
154 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
158 Search for mandatory following text node; for lookahead, the text must
159 follow but for lookbehind (rn->flags != 0) we skip to the next step.
161 #define FIND_NEXT_IMPT(rn) STMT_START { \
162 while (JUMPABLE(rn)) \
163 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
164 rn = NEXTOPER(NEXTOPER(rn)); \
165 else if (OP(rn) == PLUS) \
167 else if (OP(rn) == IFMATCH) \
168 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
169 else rn += NEXT_OFF(rn); \
172 static void restore_pos(pTHX_ void *arg);
175 S_regcppush(pTHX_ I32 parenfloor)
177 int retval = PL_savestack_ix;
178 #define REGCP_PAREN_ELEMS 4
179 int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
182 if (paren_elems_to_push < 0)
183 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
185 #define REGCP_OTHER_ELEMS 6
186 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
187 for (p = PL_regsize; p > parenfloor; p--) {
188 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
189 SSPUSHINT(PL_regendp[p]);
190 SSPUSHINT(PL_regstartp[p]);
191 SSPUSHPTR(PL_reg_start_tmp[p]);
194 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
195 SSPUSHINT(PL_regsize);
196 SSPUSHINT(*PL_reglastparen);
197 SSPUSHINT(*PL_reglastcloseparen);
198 SSPUSHPTR(PL_reginput);
199 #define REGCP_FRAME_ELEMS 2
200 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
201 * are needed for the regexp context stack bookkeeping. */
202 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
203 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
208 /* These are needed since we do not localize EVAL nodes: */
209 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
210 " Setting an EVAL scope, savestack=%"IVdf"\n", \
211 (IV)PL_savestack_ix)); cp = PL_savestack_ix
213 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
214 PerlIO_printf(Perl_debug_log, \
215 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
216 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
226 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
228 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
229 i = SSPOPINT; /* Parentheses elements to pop. */
230 input = (char *) SSPOPPTR;
231 *PL_reglastcloseparen = SSPOPINT;
232 *PL_reglastparen = SSPOPINT;
233 PL_regsize = SSPOPINT;
235 /* Now restore the parentheses context. */
236 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
237 i > 0; i -= REGCP_PAREN_ELEMS) {
238 paren = (U32)SSPOPINT;
239 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
240 PL_regstartp[paren] = SSPOPINT;
242 if (paren <= *PL_reglastparen)
243 PL_regendp[paren] = tmps;
245 PerlIO_printf(Perl_debug_log,
246 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
247 (UV)paren, (IV)PL_regstartp[paren],
248 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
249 (IV)PL_regendp[paren],
250 (paren > *PL_reglastparen ? "(no)" : ""));
254 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
255 PerlIO_printf(Perl_debug_log,
256 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
257 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
261 /* It would seem that the similar code in regtry()
262 * already takes care of this, and in fact it is in
263 * a better location to since this code can #if 0-ed out
264 * but the code in regtry() is needed or otherwise tests
265 * requiring null fields (pat.t#187 and split.t#{13,14}
266 * (as of patchlevel 7877) will fail. Then again,
267 * this code seems to be necessary or otherwise
268 * building DynaLoader will fail:
269 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
271 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
272 if ((I32)paren > PL_regsize)
273 PL_regstartp[paren] = -1;
274 PL_regendp[paren] = -1;
281 S_regcp_set_to(pTHX_ I32 ss)
283 I32 tmp = PL_savestack_ix;
285 PL_savestack_ix = ss;
287 PL_savestack_ix = tmp;
291 typedef struct re_cc_state
295 struct re_cc_state *prev;
300 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
302 #define TRYPAREN(paren, n, input) { \
305 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
306 PL_regendp[paren] = input - PL_bostr; \
309 PL_regendp[paren] = -1; \
311 if (regmatch(next)) \
314 PL_regendp[paren] = -1; \
319 * pregexec and friends
323 - pregexec - match a regexp against a string
326 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
327 char *strbeg, I32 minend, SV *screamer, U32 nosave)
328 /* strend: pointer to null at end of string */
329 /* strbeg: real beginning of string */
330 /* minend: end of match must be >=minend after stringarg. */
331 /* nosave: For optimizations. */
334 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
335 nosave ? 0 : REXEC_COPY_STR);
339 S_cache_re(pTHX_ regexp *prog)
341 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
343 PL_regprogram = prog->program;
345 PL_regnpar = prog->nparens;
346 PL_regdata = prog->data;
351 * Need to implement the following flags for reg_anch:
353 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
355 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
356 * INTUIT_AUTORITATIVE_ML
357 * INTUIT_ONCE_NOML - Intuit can match in one location only.
360 * Another flag for this function: SECOND_TIME (so that float substrs
361 * with giant delta may be not rechecked).
364 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
366 /* If SCREAM, then SvPVX(sv) should be compatible with strpos and strend.
367 Otherwise, only SvCUR(sv) is used to get strbeg. */
369 /* XXXX We assume that strpos is strbeg unless sv. */
371 /* XXXX Some places assume that there is a fixed substring.
372 An update may be needed if optimizer marks as "INTUITable"
373 RExen without fixed substrings. Similarly, it is assumed that
374 lengths of all the strings are no more than minlen, thus they
375 cannot come from lookahead.
376 (Or minlen should take into account lookahead.) */
378 /* A failure to find a constant substring means that there is no need to make
379 an expensive call to REx engine, thus we celebrate a failure. Similarly,
380 finding a substring too deep into the string means that less calls to
381 regtry() should be needed.
383 REx compiler's optimizer found 4 possible hints:
384 a) Anchored substring;
386 c) Whether we are anchored (beginning-of-line or \G);
387 d) First node (of those at offset 0) which may distingush positions;
388 We use a)b)d) and multiline-part of c), and try to find a position in the
389 string which does not contradict any of them.
392 /* Most of decisions we do here should have been done at compile time.
393 The nodes of the REx which we used for the search should have been
394 deleted from the finite automaton. */
397 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
398 char *strend, U32 flags, re_scream_pos_data *data)
400 register I32 start_shift = 0;
401 /* Should be nonnegative! */
402 register I32 end_shift = 0;
407 int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
409 register char *other_last = Nullch; /* other substr checked before this */
410 char *check_at = Nullch; /* check substr found at this pos */
412 char *i_strpos = strpos;
413 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
415 RX_MATCH_UTF8_set(prog,do_utf8);
417 if (prog->reganch & ROPT_UTF8) {
418 DEBUG_r(PerlIO_printf(Perl_debug_log,
419 "UTF-8 regex...\n"));
420 PL_reg_flags |= RF_utf8;
424 char *s = PL_reg_match_utf8 ?
425 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
427 int len = PL_reg_match_utf8 ?
428 strlen(s) : strend - strpos;
431 if (PL_reg_match_utf8)
432 DEBUG_r(PerlIO_printf(Perl_debug_log,
433 "UTF-8 target...\n"));
434 PerlIO_printf(Perl_debug_log,
435 "%sGuessing start of match, REx%s `%s%.60s%s%s' against `%s%.*s%s%s'...\n",
436 PL_colors[4],PL_colors[5],PL_colors[0],
439 (strlen(prog->precomp) > 60 ? "..." : ""),
441 (int)(len > 60 ? 60 : len),
443 (len > 60 ? "..." : "")
447 /* CHR_DIST() would be more correct here but it makes things slow. */
448 if (prog->minlen > strend - strpos) {
449 DEBUG_r(PerlIO_printf(Perl_debug_log,
450 "String too short... [re_intuit_start]\n"));
453 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
456 if (!prog->check_utf8 && prog->check_substr)
457 to_utf8_substr(prog);
458 check = prog->check_utf8;
460 if (!prog->check_substr && prog->check_utf8)
461 to_byte_substr(prog);
462 check = prog->check_substr;
464 if (check == &PL_sv_undef) {
465 DEBUG_r(PerlIO_printf(Perl_debug_log,
466 "Non-utf string cannot match utf check string\n"));
469 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
470 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
471 || ( (prog->reganch & ROPT_ANCH_BOL)
472 && !PL_multiline ) ); /* Check after \n? */
475 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
476 | ROPT_IMPLICIT)) /* not a real BOL */
477 /* SvCUR is not set on references: SvRV and SvPVX overlap */
479 && (strpos != strbeg)) {
480 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
483 if (prog->check_offset_min == prog->check_offset_max &&
484 !(prog->reganch & ROPT_CANY_SEEN)) {
485 /* Substring at constant offset from beg-of-str... */
488 s = HOP3c(strpos, prog->check_offset_min, strend);
490 slen = SvCUR(check); /* >= 1 */
492 if ( strend - s > slen || strend - s < slen - 1
493 || (strend - s == slen && strend[-1] != '\n')) {
494 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
497 /* Now should match s[0..slen-2] */
499 if (slen && (*SvPVX(check) != *s
501 && memNE(SvPVX(check), s, slen)))) {
503 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
507 else if (*SvPVX(check) != *s
508 || ((slen = SvCUR(check)) > 1
509 && memNE(SvPVX(check), s, slen)))
511 goto success_at_start;
514 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
516 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
517 end_shift = prog->minlen - start_shift -
518 CHR_SVLEN(check) + (SvTAIL(check) != 0);
520 I32 end = prog->check_offset_max + CHR_SVLEN(check)
521 - (SvTAIL(check) != 0);
522 I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
524 if (end_shift < eshift)
528 else { /* Can match at random position */
531 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
532 /* Should be nonnegative! */
533 end_shift = prog->minlen - start_shift -
534 CHR_SVLEN(check) + (SvTAIL(check) != 0);
537 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
539 Perl_croak(aTHX_ "panic: end_shift");
543 /* Find a possible match in the region s..strend by looking for
544 the "check" substring in the region corrected by start/end_shift. */
545 if (flags & REXEC_SCREAM) {
546 I32 p = -1; /* Internal iterator of scream. */
547 I32 *pp = data ? data->scream_pos : &p;
549 if (PL_screamfirst[BmRARE(check)] >= 0
550 || ( BmRARE(check) == '\n'
551 && (BmPREVIOUS(check) == SvCUR(check) - 1)
553 s = screaminstr(sv, check,
554 start_shift + (s - strbeg), end_shift, pp, 0);
557 /* we may be pointing at the wrong string */
558 if (s && RX_MATCH_COPIED(prog))
559 s = strbeg + (s - SvPVX(sv));
561 *data->scream_olds = s;
563 else if (prog->reganch & ROPT_CANY_SEEN)
564 s = fbm_instr((U8*)(s + start_shift),
565 (U8*)(strend - end_shift),
566 check, PL_multiline ? FBMrf_MULTILINE : 0);
568 s = fbm_instr(HOP3(s, start_shift, strend),
569 HOP3(strend, -end_shift, strbeg),
570 check, PL_multiline ? FBMrf_MULTILINE : 0);
572 /* Update the count-of-usability, remove useless subpatterns,
575 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr `%s%.*s%s'%s%s",
576 (s ? "Found" : "Did not find"),
577 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
579 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
581 PL_colors[1], (SvTAIL(check) ? "$" : ""),
582 (s ? " at offset " : "...\n") ) );
589 /* Finish the diagnostic message */
590 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
592 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
593 Start with the other substr.
594 XXXX no SCREAM optimization yet - and a very coarse implementation
595 XXXX /ttx+/ results in anchored=`ttx', floating=`x'. floating will
596 *always* match. Probably should be marked during compile...
597 Probably it is right to do no SCREAM here...
600 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
601 /* Take into account the "other" substring. */
602 /* XXXX May be hopelessly wrong for UTF... */
605 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
608 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
612 t = s - prog->check_offset_max;
613 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
615 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
620 t = HOP3c(t, prog->anchored_offset, strend);
621 if (t < other_last) /* These positions already checked */
623 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
626 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
627 /* On end-of-str: see comment below. */
628 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
629 if (must == &PL_sv_undef) {
631 DEBUG_r(must = prog->anchored_utf8); /* for debug */
636 HOP3(HOP3(last1, prog->anchored_offset, strend)
637 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
639 PL_multiline ? FBMrf_MULTILINE : 0
641 DEBUG_r(PerlIO_printf(Perl_debug_log,
642 "%s anchored substr `%s%.*s%s'%s",
643 (s ? "Found" : "Contradicts"),
646 - (SvTAIL(must)!=0)),
648 PL_colors[1], (SvTAIL(must) ? "$" : "")));
650 if (last1 >= last2) {
651 DEBUG_r(PerlIO_printf(Perl_debug_log,
652 ", giving up...\n"));
655 DEBUG_r(PerlIO_printf(Perl_debug_log,
656 ", trying floating at offset %ld...\n",
657 (long)(HOP3c(s1, 1, strend) - i_strpos)));
658 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
659 s = HOP3c(last, 1, strend);
663 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
664 (long)(s - i_strpos)));
665 t = HOP3c(s, -prog->anchored_offset, strbeg);
666 other_last = HOP3c(s, 1, strend);
674 else { /* Take into account the floating substring. */
679 t = HOP3c(s, -start_shift, strbeg);
681 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
682 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
683 last = HOP3c(t, prog->float_max_offset, strend);
684 s = HOP3c(t, prog->float_min_offset, strend);
687 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
688 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
689 /* fbm_instr() takes into account exact value of end-of-str
690 if the check is SvTAIL(ed). Since false positives are OK,
691 and end-of-str is not later than strend we are OK. */
692 if (must == &PL_sv_undef) {
694 DEBUG_r(must = prog->float_utf8); /* for debug message */
697 s = fbm_instr((unsigned char*)s,
698 (unsigned char*)last + SvCUR(must)
700 must, PL_multiline ? FBMrf_MULTILINE : 0);
701 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr `%s%.*s%s'%s",
702 (s ? "Found" : "Contradicts"),
704 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
706 PL_colors[1], (SvTAIL(must) ? "$" : "")));
709 DEBUG_r(PerlIO_printf(Perl_debug_log,
710 ", giving up...\n"));
713 DEBUG_r(PerlIO_printf(Perl_debug_log,
714 ", trying anchored starting at offset %ld...\n",
715 (long)(s1 + 1 - i_strpos)));
717 s = HOP3c(t, 1, strend);
721 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
722 (long)(s - i_strpos)));
723 other_last = s; /* Fix this later. --Hugo */
732 t = s - prog->check_offset_max;
733 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
735 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
737 /* Fixed substring is found far enough so that the match
738 cannot start at strpos. */
740 if (ml_anch && t[-1] != '\n') {
741 /* Eventually fbm_*() should handle this, but often
742 anchored_offset is not 0, so this check will not be wasted. */
743 /* XXXX In the code below we prefer to look for "^" even in
744 presence of anchored substrings. And we search even
745 beyond the found float position. These pessimizations
746 are historical artefacts only. */
748 while (t < strend - prog->minlen) {
750 if (t < check_at - prog->check_offset_min) {
751 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
752 /* Since we moved from the found position,
753 we definitely contradict the found anchored
754 substr. Due to the above check we do not
755 contradict "check" substr.
756 Thus we can arrive here only if check substr
757 is float. Redo checking for "other"=="fixed".
760 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
761 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
762 goto do_other_anchored;
764 /* We don't contradict the found floating substring. */
765 /* XXXX Why not check for STCLASS? */
767 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
768 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
771 /* Position contradicts check-string */
772 /* XXXX probably better to look for check-string
773 than for "\n", so one should lower the limit for t? */
774 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
775 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
776 other_last = strpos = s = t + 1;
781 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
782 PL_colors[0],PL_colors[1]));
786 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
787 PL_colors[0],PL_colors[1]));
791 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
794 /* The found string does not prohibit matching at strpos,
795 - no optimization of calling REx engine can be performed,
796 unless it was an MBOL and we are not after MBOL,
797 or a future STCLASS check will fail this. */
799 /* Even in this situation we may use MBOL flag if strpos is offset
800 wrt the start of the string. */
801 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
802 && (strpos != strbeg) && strpos[-1] != '\n'
803 /* May be due to an implicit anchor of m{.*foo} */
804 && !(prog->reganch & ROPT_IMPLICIT))
809 DEBUG_r( if (ml_anch)
810 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
811 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
814 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
816 prog->check_utf8 /* Could be deleted already */
817 && --BmUSEFUL(prog->check_utf8) < 0
818 && (prog->check_utf8 == prog->float_utf8)
820 prog->check_substr /* Could be deleted already */
821 && --BmUSEFUL(prog->check_substr) < 0
822 && (prog->check_substr == prog->float_substr)
825 /* If flags & SOMETHING - do not do it many times on the same match */
826 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
827 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
828 if (do_utf8 ? prog->check_substr : prog->check_utf8)
829 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
830 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
831 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
832 check = Nullsv; /* abort */
834 /* XXXX This is a remnant of the old implementation. It
835 looks wasteful, since now INTUIT can use many
837 prog->reganch &= ~RE_USE_INTUIT;
844 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
845 if (prog->regstclass) {
846 /* minlen == 0 is possible if regstclass is \b or \B,
847 and the fixed substr is ''$.
848 Since minlen is already taken into account, s+1 is before strend;
849 accidentally, minlen >= 1 guaranties no false positives at s + 1
850 even for \b or \B. But (minlen? 1 : 0) below assumes that
851 regstclass does not come from lookahead... */
852 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
853 This leaves EXACTF only, which is dealt with in find_byclass(). */
854 U8* str = (U8*)STRING(prog->regstclass);
855 int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
856 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
858 char *endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
859 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
860 : (prog->float_substr || prog->float_utf8
861 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
864 char *startpos = strbeg;
868 s = find_byclass(prog, prog->regstclass, s, endpos, startpos, 1);
873 if (endpos == strend) {
874 DEBUG_r( PerlIO_printf(Perl_debug_log,
875 "Could not match STCLASS...\n") );
878 DEBUG_r( PerlIO_printf(Perl_debug_log,
879 "This position contradicts STCLASS...\n") );
880 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
882 /* Contradict one of substrings */
883 if (prog->anchored_substr || prog->anchored_utf8) {
884 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
885 DEBUG_r( what = "anchored" );
887 s = HOP3c(t, 1, strend);
888 if (s + start_shift + end_shift > strend) {
889 /* XXXX Should be taken into account earlier? */
890 DEBUG_r( PerlIO_printf(Perl_debug_log,
891 "Could not match STCLASS...\n") );
896 DEBUG_r( PerlIO_printf(Perl_debug_log,
897 "Looking for %s substr starting at offset %ld...\n",
898 what, (long)(s + start_shift - i_strpos)) );
901 /* Have both, check_string is floating */
902 if (t + start_shift >= check_at) /* Contradicts floating=check */
903 goto retry_floating_check;
904 /* Recheck anchored substring, but not floating... */
908 DEBUG_r( PerlIO_printf(Perl_debug_log,
909 "Looking for anchored substr starting at offset %ld...\n",
910 (long)(other_last - i_strpos)) );
911 goto do_other_anchored;
913 /* Another way we could have checked stclass at the
914 current position only: */
919 DEBUG_r( PerlIO_printf(Perl_debug_log,
920 "Looking for /%s^%s/m starting at offset %ld...\n",
921 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
924 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
926 /* Check is floating subtring. */
927 retry_floating_check:
928 t = check_at - start_shift;
929 DEBUG_r( what = "floating" );
930 goto hop_and_restart;
933 DEBUG_r(PerlIO_printf(Perl_debug_log,
934 "By STCLASS: moving %ld --> %ld\n",
935 (long)(t - i_strpos), (long)(s - i_strpos))
939 DEBUG_r(PerlIO_printf(Perl_debug_log,
940 "Does not contradict STCLASS...\n");
945 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
946 PL_colors[4], (check ? "Guessed" : "Giving up"),
947 PL_colors[5], (long)(s - i_strpos)) );
950 fail_finish: /* Substring not found */
951 if (prog->check_substr || prog->check_utf8) /* could be removed already */
952 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
954 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
955 PL_colors[4],PL_colors[5]));
959 /* We know what class REx starts with. Try to find this position... */
961 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *startpos, I32 norun)
963 I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
967 register STRLEN uskip;
971 register I32 tmp = 1; /* Scratch variable? */
972 register bool do_utf8 = PL_reg_match_utf8;
974 /* We know what class it must start with. */
978 while (s + (uskip = UTF8SKIP(s)) <= strend) {
979 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
980 !UTF8_IS_INVARIANT((U8)s[0]) ?
981 reginclass(c, (U8*)s, 0, do_utf8) :
982 REGINCLASS(c, (U8*)s)) {
983 if (tmp && (norun || regtry(prog, s)))
997 if (REGINCLASS(c, (U8*)s) ||
998 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
999 /* The assignment of 2 is intentional:
1000 * for the folded sharp s, the skip is 2. */
1001 (skip = SHARP_S_SKIP))) {
1002 if (tmp && (norun || regtry(prog, s)))
1014 while (s < strend) {
1015 if (tmp && (norun || regtry(prog, s)))
1024 ln = STR_LEN(c); /* length to match in octets/bytes */
1025 lnc = (I32) ln; /* length to match in characters */
1027 STRLEN ulen1, ulen2;
1029 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1030 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1032 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1033 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1035 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1036 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1037 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1038 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1040 while (sm < ((U8 *) m + ln)) {
1055 c2 = PL_fold_locale[c1];
1057 e = HOP3c(strend, -((I32)lnc), s);
1060 e = s; /* Due to minlen logic of intuit() */
1062 /* The idea in the EXACTF* cases is to first find the
1063 * first character of the EXACTF* node and then, if
1064 * necessary, case-insensitively compare the full
1065 * text of the node. The c1 and c2 are the first
1066 * characters (though in Unicode it gets a bit
1067 * more complicated because there are more cases
1068 * than just upper and lower: one needs to use
1069 * the so-called folding case for case-insensitive
1070 * matching (called "loose matching" in Unicode).
1071 * ibcmp_utf8() will do just that. */
1075 U8 tmpbuf [UTF8_MAXBYTES+1];
1076 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1077 STRLEN len, foldlen;
1080 /* Upper and lower of 1st char are equal -
1081 * probably not a "letter". */
1083 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1085 0 : UTF8_ALLOW_ANY);
1088 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1089 m, (char **)0, ln, (bool)UTF))
1090 && (norun || regtry(prog, s)) )
1093 uvchr_to_utf8(tmpbuf, c);
1094 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1096 && (f == c1 || f == c2)
1097 && (ln == foldlen ||
1098 !ibcmp_utf8((char *) foldbuf,
1099 (char **)0, foldlen, do_utf8,
1101 (char **)0, ln, (bool)UTF))
1102 && (norun || regtry(prog, s)) )
1110 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1112 0 : UTF8_ALLOW_ANY);
1114 /* Handle some of the three Greek sigmas cases.
1115 * Note that not all the possible combinations
1116 * are handled here: some of them are handled
1117 * by the standard folding rules, and some of
1118 * them (the character class or ANYOF cases)
1119 * are handled during compiletime in
1120 * regexec.c:S_regclass(). */
1121 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1122 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1123 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1125 if ( (c == c1 || c == c2)
1127 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1128 m, (char **)0, ln, (bool)UTF))
1129 && (norun || regtry(prog, s)) )
1132 uvchr_to_utf8(tmpbuf, c);
1133 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1135 && (f == c1 || f == c2)
1136 && (ln == foldlen ||
1137 !ibcmp_utf8((char *) foldbuf,
1138 (char **)0, foldlen, do_utf8,
1140 (char **)0, ln, (bool)UTF))
1141 && (norun || regtry(prog, s)) )
1152 && (ln == 1 || !(OP(c) == EXACTF
1154 : ibcmp_locale(s, m, ln)))
1155 && (norun || regtry(prog, s)) )
1161 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1162 && (ln == 1 || !(OP(c) == EXACTF
1164 : ibcmp_locale(s, m, ln)))
1165 && (norun || regtry(prog, s)) )
1172 PL_reg_flags |= RF_tainted;
1179 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1181 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1183 tmp = ((OP(c) == BOUND ?
1184 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1185 LOAD_UTF8_CHARCLASS(alnum,"a");
1186 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1187 if (tmp == !(OP(c) == BOUND ?
1188 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1189 isALNUM_LC_utf8((U8*)s)))
1192 if ((norun || regtry(prog, s)))
1199 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1200 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1201 while (s < strend) {
1203 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1205 if ((norun || regtry(prog, s)))
1211 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1215 PL_reg_flags |= RF_tainted;
1222 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1224 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1226 tmp = ((OP(c) == NBOUND ?
1227 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1228 LOAD_UTF8_CHARCLASS(alnum,"a");
1229 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1230 if (tmp == !(OP(c) == NBOUND ?
1231 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1232 isALNUM_LC_utf8((U8*)s)))
1234 else if ((norun || regtry(prog, s)))
1240 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1241 tmp = ((OP(c) == NBOUND ?
1242 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1243 while (s < strend) {
1245 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1247 else if ((norun || regtry(prog, s)))
1252 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1257 LOAD_UTF8_CHARCLASS(alnum,"a");
1258 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1259 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1260 if (tmp && (norun || regtry(prog, s)))
1271 while (s < strend) {
1273 if (tmp && (norun || regtry(prog, s)))
1285 PL_reg_flags |= RF_tainted;
1287 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1288 if (isALNUM_LC_utf8((U8*)s)) {
1289 if (tmp && (norun || regtry(prog, s)))
1300 while (s < strend) {
1301 if (isALNUM_LC(*s)) {
1302 if (tmp && (norun || regtry(prog, s)))
1315 LOAD_UTF8_CHARCLASS(alnum,"a");
1316 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1317 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1318 if (tmp && (norun || regtry(prog, s)))
1329 while (s < strend) {
1331 if (tmp && (norun || regtry(prog, s)))
1343 PL_reg_flags |= RF_tainted;
1345 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1346 if (!isALNUM_LC_utf8((U8*)s)) {
1347 if (tmp && (norun || regtry(prog, s)))
1358 while (s < strend) {
1359 if (!isALNUM_LC(*s)) {
1360 if (tmp && (norun || regtry(prog, s)))
1373 LOAD_UTF8_CHARCLASS(space," ");
1374 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1375 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1376 if (tmp && (norun || regtry(prog, s)))
1387 while (s < strend) {
1389 if (tmp && (norun || regtry(prog, s)))
1401 PL_reg_flags |= RF_tainted;
1403 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1404 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1405 if (tmp && (norun || regtry(prog, s)))
1416 while (s < strend) {
1417 if (isSPACE_LC(*s)) {
1418 if (tmp && (norun || regtry(prog, s)))
1431 LOAD_UTF8_CHARCLASS(space," ");
1432 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1433 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1434 if (tmp && (norun || regtry(prog, s)))
1445 while (s < strend) {
1447 if (tmp && (norun || regtry(prog, s)))
1459 PL_reg_flags |= RF_tainted;
1461 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1462 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1463 if (tmp && (norun || regtry(prog, s)))
1474 while (s < strend) {
1475 if (!isSPACE_LC(*s)) {
1476 if (tmp && (norun || regtry(prog, s)))
1489 LOAD_UTF8_CHARCLASS(digit,"0");
1490 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1491 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1492 if (tmp && (norun || regtry(prog, s)))
1503 while (s < strend) {
1505 if (tmp && (norun || regtry(prog, s)))
1517 PL_reg_flags |= RF_tainted;
1519 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1520 if (isDIGIT_LC_utf8((U8*)s)) {
1521 if (tmp && (norun || regtry(prog, s)))
1532 while (s < strend) {
1533 if (isDIGIT_LC(*s)) {
1534 if (tmp && (norun || regtry(prog, s)))
1547 LOAD_UTF8_CHARCLASS(digit,"0");
1548 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1549 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1550 if (tmp && (norun || regtry(prog, s)))
1561 while (s < strend) {
1563 if (tmp && (norun || regtry(prog, s)))
1575 PL_reg_flags |= RF_tainted;
1577 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1578 if (!isDIGIT_LC_utf8((U8*)s)) {
1579 if (tmp && (norun || regtry(prog, s)))
1590 while (s < strend) {
1591 if (!isDIGIT_LC(*s)) {
1592 if (tmp && (norun || regtry(prog, s)))
1604 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1613 - regexec_flags - match a regexp against a string
1616 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1617 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1618 /* strend: pointer to null at end of string */
1619 /* strbeg: real beginning of string */
1620 /* minend: end of match must be >=minend after stringarg. */
1621 /* data: May be used for some additional optimizations. */
1622 /* nosave: For optimizations. */
1625 register regnode *c;
1626 register char *startpos = stringarg;
1627 I32 minlen; /* must match at least this many chars */
1628 I32 dontbother = 0; /* how many characters not to try at end */
1629 /* I32 start_shift = 0; */ /* Offset of the start to find
1630 constant substr. */ /* CC */
1631 I32 end_shift = 0; /* Same for the end. */ /* CC */
1632 I32 scream_pos = -1; /* Internal iterator of scream. */
1634 SV* oreplsv = GvSV(PL_replgv);
1635 bool do_utf8 = DO_UTF8(sv);
1637 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1638 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1640 RX_MATCH_UTF8_set(prog,do_utf8);
1646 PL_regnarrate = DEBUG_r_TEST;
1649 /* Be paranoid... */
1650 if (prog == NULL || startpos == NULL) {
1651 Perl_croak(aTHX_ "NULL regexp parameter");
1655 minlen = prog->minlen;
1656 if (strend - startpos < minlen) {
1657 DEBUG_r(PerlIO_printf(Perl_debug_log,
1658 "String too short [regexec_flags]...\n"));
1662 /* Check validity of program. */
1663 if (UCHARAT(prog->program) != REG_MAGIC) {
1664 Perl_croak(aTHX_ "corrupted regexp program");
1668 PL_reg_eval_set = 0;
1671 if (prog->reganch & ROPT_UTF8)
1672 PL_reg_flags |= RF_utf8;
1674 /* Mark beginning of line for ^ and lookbehind. */
1675 PL_regbol = startpos;
1679 /* Mark end of line for $ (and such) */
1682 /* see how far we have to get to not match where we matched before */
1683 PL_regtill = startpos+minend;
1685 /* We start without call_cc context. */
1688 /* If there is a "must appear" string, look for it. */
1691 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1694 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1695 PL_reg_ganch = startpos;
1696 else if (sv && SvTYPE(sv) >= SVt_PVMG
1698 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1699 && mg->mg_len >= 0) {
1700 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1701 if (prog->reganch & ROPT_ANCH_GPOS) {
1702 if (s > PL_reg_ganch)
1707 else /* pos() not defined */
1708 PL_reg_ganch = strbeg;
1711 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1712 re_scream_pos_data d;
1714 d.scream_olds = &scream_olds;
1715 d.scream_pos = &scream_pos;
1716 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1718 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1719 goto phooey; /* not present */
1725 pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1726 UNI_DISPLAY_REGEX) :
1728 int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1729 char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1730 UNI_DISPLAY_REGEX) : startpos;
1731 int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1734 PerlIO_printf(Perl_debug_log,
1735 "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n",
1736 PL_colors[4],PL_colors[5],PL_colors[0],
1739 len0 > 60 ? "..." : "",
1741 (int)(len1 > 60 ? 60 : len1),
1743 (len1 > 60 ? "..." : "")
1747 /* Simplest case: anchored match need be tried only once. */
1748 /* [unless only anchor is BOL and multiline is set] */
1749 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1750 if (s == startpos && regtry(prog, startpos))
1752 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1753 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1758 dontbother = minlen - 1;
1759 end = HOP3c(strend, -dontbother, strbeg) - 1;
1760 /* for multiline we only have to try after newlines */
1761 if (prog->check_substr || prog->check_utf8) {
1765 if (regtry(prog, s))
1770 if (prog->reganch & RE_USE_INTUIT) {
1771 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1782 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1783 if (regtry(prog, s))
1790 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1791 if (regtry(prog, PL_reg_ganch))
1796 /* Messy cases: unanchored match. */
1797 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1798 /* we have /x+whatever/ */
1799 /* it must be a one character string (XXXX Except UTF?) */
1804 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1805 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1806 ch = SvPVX(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1809 while (s < strend) {
1811 DEBUG_r( did_match = 1 );
1812 if (regtry(prog, s)) goto got_it;
1814 while (s < strend && *s == ch)
1821 while (s < strend) {
1823 DEBUG_r( did_match = 1 );
1824 if (regtry(prog, s)) goto got_it;
1826 while (s < strend && *s == ch)
1832 DEBUG_r(if (!did_match)
1833 PerlIO_printf(Perl_debug_log,
1834 "Did not find anchored character...\n")
1838 else if (prog->anchored_substr != Nullsv
1839 || prog->anchored_utf8 != Nullsv
1840 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1841 && prog->float_max_offset < strend - s)) {
1846 char *last1; /* Last position checked before */
1850 if (prog->anchored_substr || prog->anchored_utf8) {
1851 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1852 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1853 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1854 back_max = back_min = prog->anchored_offset;
1856 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1857 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1858 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1859 back_max = prog->float_max_offset;
1860 back_min = prog->float_min_offset;
1862 if (must == &PL_sv_undef)
1863 /* could not downgrade utf8 check substring, so must fail */
1866 last = HOP3c(strend, /* Cannot start after this */
1867 -(I32)(CHR_SVLEN(must)
1868 - (SvTAIL(must) != 0) + back_min), strbeg);
1871 last1 = HOPc(s, -1);
1873 last1 = s - 1; /* bogus */
1875 /* XXXX check_substr already used to find `s', can optimize if
1876 check_substr==must. */
1878 dontbother = end_shift;
1879 strend = HOPc(strend, -dontbother);
1880 while ( (s <= last) &&
1881 ((flags & REXEC_SCREAM)
1882 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1883 end_shift, &scream_pos, 0))
1884 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1885 (unsigned char*)strend, must,
1886 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1887 /* we may be pointing at the wrong string */
1888 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1889 s = strbeg + (s - SvPVX(sv));
1890 DEBUG_r( did_match = 1 );
1891 if (HOPc(s, -back_max) > last1) {
1892 last1 = HOPc(s, -back_min);
1893 s = HOPc(s, -back_max);
1896 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1898 last1 = HOPc(s, -back_min);
1902 while (s <= last1) {
1903 if (regtry(prog, s))
1909 while (s <= last1) {
1910 if (regtry(prog, s))
1916 DEBUG_r(if (!did_match)
1917 PerlIO_printf(Perl_debug_log,
1918 "Did not find %s substr `%s%.*s%s'%s...\n",
1919 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1920 ? "anchored" : "floating"),
1922 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1924 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1928 else if ((c = prog->regstclass)) {
1930 I32 op = (U8)OP(prog->regstclass);
1931 /* don't bother with what can't match */
1932 if (PL_regkind[op] != EXACT && op != CANY)
1933 strend = HOPc(strend, -(minlen - 1));
1936 SV *prop = sv_newmortal();
1944 pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60,
1945 UNI_DISPLAY_REGEX) :
1947 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1949 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1950 len1 = UTF ? SvCUR(dsv1) : strend - s;
1951 PerlIO_printf(Perl_debug_log,
1952 "Matching stclass `%*.*s' against `%*.*s'\n",
1956 if (find_byclass(prog, c, s, strend, startpos, 0))
1958 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1962 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1967 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1968 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1969 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1971 if (flags & REXEC_SCREAM) {
1972 last = screaminstr(sv, float_real, s - strbeg,
1973 end_shift, &scream_pos, 1); /* last one */
1975 last = scream_olds; /* Only one occurrence. */
1976 /* we may be pointing at the wrong string */
1977 else if (RX_MATCH_COPIED(prog))
1978 s = strbeg + (s - SvPVX(sv));
1982 char *little = SvPV(float_real, len);
1984 if (SvTAIL(float_real)) {
1985 if (memEQ(strend - len + 1, little, len - 1))
1986 last = strend - len + 1;
1987 else if (!PL_multiline)
1988 last = memEQ(strend - len, little, len)
1989 ? strend - len : Nullch;
1995 last = rninstr(s, strend, little, little + len);
1997 last = strend; /* matching `$' */
2001 DEBUG_r(PerlIO_printf(Perl_debug_log,
2002 "%sCan't trim the tail, match fails (should not happen)%s\n",
2003 PL_colors[4],PL_colors[5]));
2004 goto phooey; /* Should not happen! */
2006 dontbother = strend - last + prog->float_min_offset;
2008 if (minlen && (dontbother < minlen))
2009 dontbother = minlen - 1;
2010 strend -= dontbother; /* this one's always in bytes! */
2011 /* We don't know much -- general case. */
2014 if (regtry(prog, s))
2023 if (regtry(prog, s))
2025 } while (s++ < strend);
2033 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2035 if (PL_reg_eval_set) {
2036 /* Preserve the current value of $^R */
2037 if (oreplsv != GvSV(PL_replgv))
2038 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2039 restored, the value remains
2041 restore_pos(aTHX_ 0);
2044 /* make sure $`, $&, $', and $digit will work later */
2045 if ( !(flags & REXEC_NOT_FIRST) ) {
2046 if (RX_MATCH_COPIED(prog)) {
2047 Safefree(prog->subbeg);
2048 RX_MATCH_COPIED_off(prog);
2050 if (flags & REXEC_COPY_STR) {
2051 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2053 s = savepvn(strbeg, i);
2056 RX_MATCH_COPIED_on(prog);
2059 prog->subbeg = strbeg;
2060 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2067 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2068 PL_colors[4],PL_colors[5]));
2069 if (PL_reg_eval_set)
2070 restore_pos(aTHX_ 0);
2075 - regtry - try match at specific point
2077 STATIC I32 /* 0 failure, 1 success */
2078 S_regtry(pTHX_ regexp *prog, char *startpos)
2086 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2088 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2091 PL_reg_eval_set = RS_init;
2093 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2094 (IV)(PL_stack_sp - PL_stack_base));
2096 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2097 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2098 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2100 /* Apparently this is not needed, judging by wantarray. */
2101 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2102 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2105 /* Make $_ available to executed code. */
2106 if (PL_reg_sv != DEFSV) {
2107 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
2112 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2113 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2114 /* prepare for quick setting of pos */
2115 sv_magic(PL_reg_sv, (SV*)0,
2116 PERL_MAGIC_regex_global, Nullch, 0);
2117 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2121 PL_reg_oldpos = mg->mg_len;
2122 SAVEDESTRUCTOR_X(restore_pos, 0);
2124 if (!PL_reg_curpm) {
2125 Newz(22,PL_reg_curpm, 1, PMOP);
2128 SV* repointer = newSViv(0);
2129 /* so we know which PL_regex_padav element is PL_reg_curpm */
2130 SvFLAGS(repointer) |= SVf_BREAK;
2131 av_push(PL_regex_padav,repointer);
2132 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2133 PL_regex_pad = AvARRAY(PL_regex_padav);
2137 PM_SETRE(PL_reg_curpm, prog);
2138 PL_reg_oldcurpm = PL_curpm;
2139 PL_curpm = PL_reg_curpm;
2140 if (RX_MATCH_COPIED(prog)) {
2141 /* Here is a serious problem: we cannot rewrite subbeg,
2142 since it may be needed if this match fails. Thus
2143 $` inside (?{}) could fail... */
2144 PL_reg_oldsaved = prog->subbeg;
2145 PL_reg_oldsavedlen = prog->sublen;
2146 RX_MATCH_COPIED_off(prog);
2149 PL_reg_oldsaved = Nullch;
2150 prog->subbeg = PL_bostr;
2151 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2153 prog->startp[0] = startpos - PL_bostr;
2154 PL_reginput = startpos;
2155 PL_regstartp = prog->startp;
2156 PL_regendp = prog->endp;
2157 PL_reglastparen = &prog->lastparen;
2158 PL_reglastcloseparen = &prog->lastcloseparen;
2159 prog->lastparen = 0;
2160 prog->lastcloseparen = 0;
2162 DEBUG_r(PL_reg_starttry = startpos);
2163 if (PL_reg_start_tmpl <= prog->nparens) {
2164 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2165 if(PL_reg_start_tmp)
2166 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2168 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2171 /* XXXX What this code is doing here?!!! There should be no need
2172 to do this again and again, PL_reglastparen should take care of
2175 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2176 * Actually, the code in regcppop() (which Ilya may be meaning by
2177 * PL_reglastparen), is not needed at all by the test suite
2178 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2179 * enough, for building DynaLoader, or otherwise this
2180 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2181 * will happen. Meanwhile, this code *is* needed for the
2182 * above-mentioned test suite tests to succeed. The common theme
2183 * on those tests seems to be returning null fields from matches.
2188 if (prog->nparens) {
2189 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2196 if (regmatch(prog->program + 1)) {
2197 prog->endp[0] = PL_reginput - PL_bostr;
2200 REGCP_UNWIND(lastcp);
2204 #define RE_UNWIND_BRANCH 1
2205 #define RE_UNWIND_BRANCHJ 2
2209 typedef struct { /* XX: makes sense to enlarge it... */
2213 } re_unwind_generic_t;
2226 } re_unwind_branch_t;
2228 typedef union re_unwind_t {
2230 re_unwind_generic_t generic;
2231 re_unwind_branch_t branch;
2234 #define sayYES goto yes
2235 #define sayNO goto no
2236 #define sayNO_ANYOF goto no_anyof
2237 #define sayYES_FINAL goto yes_final
2238 #define sayYES_LOUD goto yes_loud
2239 #define sayNO_FINAL goto no_final
2240 #define sayNO_SILENT goto do_no
2241 #define saySAME(x) if (x) goto yes; else goto no
2243 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2244 #define POSCACHE_SEEN 1 /* we know what we're caching */
2245 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2246 #define CACHEsayYES STMT_START { \
2247 if (cache_offset | cache_bit) { \
2248 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2249 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2250 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2251 /* cache records failure, but this is success */ \
2253 PerlIO_printf(Perl_debug_log, \
2254 "%*s (remove success from failure cache)\n", \
2255 REPORT_CODE_OFF+PL_regindent*2, "") \
2257 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2262 #define CACHEsayNO STMT_START { \
2263 if (cache_offset | cache_bit) { \
2264 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2265 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2266 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2267 /* cache records success, but this is failure */ \
2269 PerlIO_printf(Perl_debug_log, \
2270 "%*s (remove failure from success cache)\n", \
2271 REPORT_CODE_OFF+PL_regindent*2, "") \
2273 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2279 #define REPORT_CODE_OFF 24
2282 - regmatch - main matching routine
2284 * Conceptually the strategy is simple: check to see whether the current
2285 * node matches, call self recursively to see whether the rest matches,
2286 * and then act accordingly. In practice we make some effort to avoid
2287 * recursion, in particular by going through "ordinary" nodes (that don't
2288 * need to know whether the rest of the match failed) by a loop instead of
2291 /* [lwall] I've hoisted the register declarations to the outer block in order to
2292 * maybe save a little bit of pushing and popping on the stack. It also takes
2293 * advantage of machines that use a register save mask on subroutine entry.
2295 STATIC I32 /* 0 failure, 1 success */
2296 S_regmatch(pTHX_ regnode *prog)
2298 register regnode *scan; /* Current node. */
2299 regnode *next; /* Next node. */
2300 regnode *inner; /* Next node in internal branch. */
2301 register I32 nextchr; /* renamed nextchr - nextchar colides with
2302 function of same name */
2303 register I32 n; /* no or next */
2304 register I32 ln = 0; /* len or last */
2305 register char *s = Nullch; /* operand or save */
2306 register char *locinput = PL_reginput;
2307 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2308 int minmod = 0, sw = 0, logical = 0;
2311 I32 firstcp = PL_savestack_ix;
2313 register bool do_utf8 = PL_reg_match_utf8;
2315 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2316 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2317 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2324 /* Note that nextchr is a byte even in UTF */
2325 nextchr = UCHARAT(locinput);
2327 while (scan != NULL) {
2330 SV *prop = sv_newmortal();
2331 int docolor = *PL_colors[0];
2332 int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2333 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2334 /* The part of the string before starttry has one color
2335 (pref0_len chars), between starttry and current
2336 position another one (pref_len - pref0_len chars),
2337 after the current position the third one.
2338 We assume that pref0_len <= pref_len, otherwise we
2339 decrease pref0_len. */
2340 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2341 ? (5 + taill) - l : locinput - PL_bostr;
2344 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2346 pref0_len = pref_len - (locinput - PL_reg_starttry);
2347 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2348 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2349 ? (5 + taill) - pref_len : PL_regeol - locinput);
2350 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2354 if (pref0_len > pref_len)
2355 pref0_len = pref_len;
2356 regprop(prop, scan);
2359 do_utf8 && OP(scan) != CANY ?
2360 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2361 pref0_len, 60, UNI_DISPLAY_REGEX) :
2362 locinput - pref_len;
2363 int len0 = do_utf8 ? strlen(s0) : pref0_len;
2364 char *s1 = do_utf8 && OP(scan) != CANY ?
2365 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2366 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2367 locinput - pref_len + pref0_len;
2368 int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2369 char *s2 = do_utf8 && OP(scan) != CANY ?
2370 pv_uni_display(dsv2, (U8*)locinput,
2371 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2373 int len2 = do_utf8 ? strlen(s2) : l;
2374 PerlIO_printf(Perl_debug_log,
2375 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2376 (IV)(locinput - PL_bostr),
2383 (docolor ? "" : "> <"),
2387 15 - l - pref_len + 1,
2389 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2394 next = scan + NEXT_OFF(scan);
2400 if (locinput == PL_bostr || (PL_multiline &&
2401 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2403 /* regtill = regbol; */
2408 if (locinput == PL_bostr ||
2409 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2415 if (locinput == PL_bostr)
2419 if (locinput == PL_reg_ganch)
2429 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2434 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2436 if (PL_regeol - locinput > 1)
2440 if (PL_regeol != locinput)
2444 if (!nextchr && locinput >= PL_regeol)
2447 locinput += PL_utf8skip[nextchr];
2448 if (locinput > PL_regeol)
2450 nextchr = UCHARAT(locinput);
2453 nextchr = UCHARAT(++locinput);
2456 if (!nextchr && locinput >= PL_regeol)
2458 nextchr = UCHARAT(++locinput);
2461 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2464 locinput += PL_utf8skip[nextchr];
2465 if (locinput > PL_regeol)
2467 nextchr = UCHARAT(locinput);
2470 nextchr = UCHARAT(++locinput);
2475 if (do_utf8 != UTF) {
2476 /* The target and the pattern have differing utf8ness. */
2482 /* The target is utf8, the pattern is not utf8. */
2486 if (NATIVE_TO_UNI(*(U8*)s) !=
2487 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2489 0 : UTF8_ALLOW_ANY))
2496 /* The target is not utf8, the pattern is utf8. */
2500 if (NATIVE_TO_UNI(*((U8*)l)) !=
2501 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2503 0 : UTF8_ALLOW_ANY))
2510 nextchr = UCHARAT(locinput);
2513 /* The target and the pattern have the same utf8ness. */
2514 /* Inline the first character, for speed. */
2515 if (UCHARAT(s) != nextchr)
2517 if (PL_regeol - locinput < ln)
2519 if (ln > 1 && memNE(s, locinput, ln))
2522 nextchr = UCHARAT(locinput);
2525 PL_reg_flags |= RF_tainted;
2531 if (do_utf8 || UTF) {
2532 /* Either target or the pattern are utf8. */
2534 char *e = PL_regeol;
2536 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2537 l, &e, 0, do_utf8)) {
2538 /* One more case for the sharp s:
2539 * pack("U0U*", 0xDF) =~ /ss/i,
2540 * the 0xC3 0x9F are the UTF-8
2541 * byte sequence for the U+00DF. */
2543 toLOWER(s[0]) == 's' &&
2545 toLOWER(s[1]) == 's' &&
2552 nextchr = UCHARAT(locinput);
2556 /* Neither the target and the pattern are utf8. */
2558 /* Inline the first character, for speed. */
2559 if (UCHARAT(s) != nextchr &&
2560 UCHARAT(s) != ((OP(scan) == EXACTF)
2561 ? PL_fold : PL_fold_locale)[nextchr])
2563 if (PL_regeol - locinput < ln)
2565 if (ln > 1 && (OP(scan) == EXACTF
2566 ? ibcmp(s, locinput, ln)
2567 : ibcmp_locale(s, locinput, ln)))
2570 nextchr = UCHARAT(locinput);
2574 STRLEN inclasslen = PL_regeol - locinput;
2576 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2578 if (locinput >= PL_regeol)
2580 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2581 nextchr = UCHARAT(locinput);
2586 nextchr = UCHARAT(locinput);
2587 if (!REGINCLASS(scan, (U8*)locinput))
2589 if (!nextchr && locinput >= PL_regeol)
2591 nextchr = UCHARAT(++locinput);
2595 /* If we might have the case of the German sharp s
2596 * in a casefolding Unicode character class. */
2598 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2599 locinput += SHARP_S_SKIP;
2600 nextchr = UCHARAT(locinput);
2606 PL_reg_flags |= RF_tainted;
2612 LOAD_UTF8_CHARCLASS(alnum,"a");
2613 if (!(OP(scan) == ALNUM
2614 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2615 : isALNUM_LC_utf8((U8*)locinput)))
2619 locinput += PL_utf8skip[nextchr];
2620 nextchr = UCHARAT(locinput);
2623 if (!(OP(scan) == ALNUM
2624 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2626 nextchr = UCHARAT(++locinput);
2629 PL_reg_flags |= RF_tainted;
2632 if (!nextchr && locinput >= PL_regeol)
2635 LOAD_UTF8_CHARCLASS(alnum,"a");
2636 if (OP(scan) == NALNUM
2637 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2638 : isALNUM_LC_utf8((U8*)locinput))
2642 locinput += PL_utf8skip[nextchr];
2643 nextchr = UCHARAT(locinput);
2646 if (OP(scan) == NALNUM
2647 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2649 nextchr = UCHARAT(++locinput);
2653 PL_reg_flags |= RF_tainted;
2657 /* was last char in word? */
2659 if (locinput == PL_bostr)
2662 U8 *r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2664 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2666 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2667 ln = isALNUM_uni(ln);
2668 LOAD_UTF8_CHARCLASS(alnum,"a");
2669 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2672 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2673 n = isALNUM_LC_utf8((U8*)locinput);
2677 ln = (locinput != PL_bostr) ?
2678 UCHARAT(locinput - 1) : '\n';
2679 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2681 n = isALNUM(nextchr);
2684 ln = isALNUM_LC(ln);
2685 n = isALNUM_LC(nextchr);
2688 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2689 OP(scan) == BOUNDL))
2693 PL_reg_flags |= RF_tainted;
2699 if (UTF8_IS_CONTINUED(nextchr)) {
2700 LOAD_UTF8_CHARCLASS(space," ");
2701 if (!(OP(scan) == SPACE
2702 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2703 : isSPACE_LC_utf8((U8*)locinput)))
2707 locinput += PL_utf8skip[nextchr];
2708 nextchr = UCHARAT(locinput);
2711 if (!(OP(scan) == SPACE
2712 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2714 nextchr = UCHARAT(++locinput);
2717 if (!(OP(scan) == SPACE
2718 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2720 nextchr = UCHARAT(++locinput);
2724 PL_reg_flags |= RF_tainted;
2727 if (!nextchr && locinput >= PL_regeol)
2730 LOAD_UTF8_CHARCLASS(space," ");
2731 if (OP(scan) == NSPACE
2732 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2733 : isSPACE_LC_utf8((U8*)locinput))
2737 locinput += PL_utf8skip[nextchr];
2738 nextchr = UCHARAT(locinput);
2741 if (OP(scan) == NSPACE
2742 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2744 nextchr = UCHARAT(++locinput);
2747 PL_reg_flags |= RF_tainted;
2753 LOAD_UTF8_CHARCLASS(digit,"0");
2754 if (!(OP(scan) == DIGIT
2755 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2756 : isDIGIT_LC_utf8((U8*)locinput)))
2760 locinput += PL_utf8skip[nextchr];
2761 nextchr = UCHARAT(locinput);
2764 if (!(OP(scan) == DIGIT
2765 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2767 nextchr = UCHARAT(++locinput);
2770 PL_reg_flags |= RF_tainted;
2773 if (!nextchr && locinput >= PL_regeol)
2776 LOAD_UTF8_CHARCLASS(digit,"0");
2777 if (OP(scan) == NDIGIT
2778 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2779 : isDIGIT_LC_utf8((U8*)locinput))
2783 locinput += PL_utf8skip[nextchr];
2784 nextchr = UCHARAT(locinput);
2787 if (OP(scan) == NDIGIT
2788 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2790 nextchr = UCHARAT(++locinput);
2793 if (locinput >= PL_regeol)
2796 LOAD_UTF8_CHARCLASS(mark,"~");
2797 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2799 locinput += PL_utf8skip[nextchr];
2800 while (locinput < PL_regeol &&
2801 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2802 locinput += UTF8SKIP(locinput);
2803 if (locinput > PL_regeol)
2808 nextchr = UCHARAT(locinput);
2811 PL_reg_flags |= RF_tainted;
2815 n = ARG(scan); /* which paren pair */
2816 ln = PL_regstartp[n];
2817 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2818 if ((I32)*PL_reglastparen < n || ln == -1)
2819 sayNO; /* Do not match unless seen CLOSEn. */
2820 if (ln == PL_regendp[n])
2824 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2826 char *e = PL_bostr + PL_regendp[n];
2828 * Note that we can't do the "other character" lookup trick as
2829 * in the 8-bit case (no pun intended) because in Unicode we
2830 * have to map both upper and title case to lower case.
2832 if (OP(scan) == REFF) {
2833 STRLEN ulen1, ulen2;
2834 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
2835 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
2839 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2840 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2841 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2848 nextchr = UCHARAT(locinput);
2852 /* Inline the first character, for speed. */
2853 if (UCHARAT(s) != nextchr &&
2855 (UCHARAT(s) != ((OP(scan) == REFF
2856 ? PL_fold : PL_fold_locale)[nextchr]))))
2858 ln = PL_regendp[n] - ln;
2859 if (locinput + ln > PL_regeol)
2861 if (ln > 1 && (OP(scan) == REF
2862 ? memNE(s, locinput, ln)
2864 ? ibcmp(s, locinput, ln)
2865 : ibcmp_locale(s, locinput, ln))))
2868 nextchr = UCHARAT(locinput);
2879 OP_4tree *oop = PL_op;
2880 COP *ocurcop = PL_curcop;
2883 struct regexp *oreg = PL_reg_re;
2886 PL_op = (OP_4tree*)PL_regdata->data[n];
2887 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2888 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2889 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2893 CALLRUNOPS(aTHX); /* Scalar context. */
2896 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2904 PAD_RESTORE_LOCAL(old_comppad);
2905 PL_curcop = ocurcop;
2907 if (logical == 2) { /* Postponed subexpression. */
2909 MAGIC *mg = Null(MAGIC*);
2911 CHECKPOINT cp, lastcp;
2915 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2916 mg = mg_find(sv, PERL_MAGIC_qr);
2917 else if (SvSMAGICAL(ret)) {
2918 if (SvGMAGICAL(ret))
2919 sv_unmagic(ret, PERL_MAGIC_qr);
2921 mg = mg_find(ret, PERL_MAGIC_qr);
2925 re = (regexp *)mg->mg_obj;
2926 (void)ReREFCNT_inc(re);
2930 char *t = SvPV(ret, len);
2932 char *oprecomp = PL_regprecomp;
2933 I32 osize = PL_regsize;
2934 I32 onpar = PL_regnpar;
2937 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2938 re = CALLREGCOMP(aTHX_ t, t + len, &pm);
2940 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2942 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2944 PL_regprecomp = oprecomp;
2949 PerlIO_printf(Perl_debug_log,
2950 "Entering embedded `%s%.60s%s%s'\n",
2954 (strlen(re->precomp) > 60 ? "..." : ""))
2957 state.prev = PL_reg_call_cc;
2958 state.cc = PL_regcc;
2959 state.re = PL_reg_re;
2963 cp = regcppush(0); /* Save *all* the positions. */
2966 state.ss = PL_savestack_ix;
2967 *PL_reglastparen = 0;
2968 *PL_reglastcloseparen = 0;
2969 PL_reg_call_cc = &state;
2970 PL_reginput = locinput;
2971 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2972 ((re->reganch & ROPT_UTF8) != 0);
2973 if (toggleutf) PL_reg_flags ^= RF_utf8;
2975 /* XXXX This is too dramatic a measure... */
2978 if (regmatch(re->program + 1)) {
2979 /* Even though we succeeded, we need to restore
2980 global variables, since we may be wrapped inside
2981 SUSPEND, thus the match may be not finished yet. */
2983 /* XXXX Do this only if SUSPENDed? */
2984 PL_reg_call_cc = state.prev;
2985 PL_regcc = state.cc;
2986 PL_reg_re = state.re;
2987 cache_re(PL_reg_re);
2988 if (toggleutf) PL_reg_flags ^= RF_utf8;
2990 /* XXXX This is too dramatic a measure... */
2993 /* These are needed even if not SUSPEND. */
2999 REGCP_UNWIND(lastcp);
3001 PL_reg_call_cc = state.prev;
3002 PL_regcc = state.cc;
3003 PL_reg_re = state.re;
3004 cache_re(PL_reg_re);
3005 if (toggleutf) PL_reg_flags ^= RF_utf8;
3007 /* XXXX This is too dramatic a measure... */
3017 sv_setsv(save_scalar(PL_replgv), ret);
3023 n = ARG(scan); /* which paren pair */
3024 PL_reg_start_tmp[n] = locinput;
3029 n = ARG(scan); /* which paren pair */
3030 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3031 PL_regendp[n] = locinput - PL_bostr;
3032 if (n > (I32)*PL_reglastparen)
3033 *PL_reglastparen = n;
3034 *PL_reglastcloseparen = n;
3037 n = ARG(scan); /* which paren pair */
3038 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3041 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3043 next = NEXTOPER(NEXTOPER(scan));
3045 next = scan + ARG(scan);
3046 if (OP(next) == IFTHEN) /* Fake one. */
3047 next = NEXTOPER(NEXTOPER(next));
3051 logical = scan->flags;
3053 /*******************************************************************
3054 PL_regcc contains infoblock about the innermost (...)* loop, and
3055 a pointer to the next outer infoblock.
3057 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3059 1) After matching X, regnode for CURLYX is processed;
3061 2) This regnode creates infoblock on the stack, and calls
3062 regmatch() recursively with the starting point at WHILEM node;
3064 3) Each hit of WHILEM node tries to match A and Z (in the order
3065 depending on the current iteration, min/max of {min,max} and
3066 greediness). The information about where are nodes for "A"
3067 and "Z" is read from the infoblock, as is info on how many times "A"
3068 was already matched, and greediness.
3070 4) After A matches, the same WHILEM node is hit again.
3072 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3073 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3074 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3075 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3076 of the external loop.
3078 Currently present infoblocks form a tree with a stem formed by PL_curcc
3079 and whatever it mentions via ->next, and additional attached trees
3080 corresponding to temporarily unset infoblocks as in "5" above.
3082 In the following picture infoblocks for outer loop of
3083 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3084 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3085 infoblocks are drawn below the "reset" infoblock.
3087 In fact in the picture below we do not show failed matches for Z and T
3088 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3089 more obvious *why* one needs to *temporary* unset infoblocks.]
3091 Matched REx position InfoBlocks Comment
3095 Y A)*?Z)*?T x <- O <- I
3096 YA )*?Z)*?T x <- O <- I
3097 YA A)*?Z)*?T x <- O <- I
3098 YAA )*?Z)*?T x <- O <- I
3099 YAA Z)*?T x <- O # Temporary unset I
3102 YAAZ Y(A)*?Z)*?T x <- O
3105 YAAZY (A)*?Z)*?T x <- O
3108 YAAZY A)*?Z)*?T x <- O <- I
3111 YAAZYA )*?Z)*?T x <- O <- I
3114 YAAZYA Z)*?T x <- O # Temporary unset I
3120 YAAZYAZ T x # Temporary unset O
3127 *******************************************************************/
3130 CHECKPOINT cp = PL_savestack_ix;
3131 /* No need to save/restore up to this paren */
3132 I32 parenfloor = scan->flags;
3134 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3136 cc.oldcc = PL_regcc;
3138 /* XXXX Probably it is better to teach regpush to support
3139 parenfloor > PL_regsize... */
3140 if (parenfloor > (I32)*PL_reglastparen)
3141 parenfloor = *PL_reglastparen; /* Pessimization... */
3142 cc.parenfloor = parenfloor;
3144 cc.min = ARG1(scan);
3145 cc.max = ARG2(scan);
3146 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3150 PL_reginput = locinput;
3151 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3153 PL_regcc = cc.oldcc;
3159 * This is really hard to understand, because after we match
3160 * what we're trying to match, we must make sure the rest of
3161 * the REx is going to match for sure, and to do that we have
3162 * to go back UP the parse tree by recursing ever deeper. And
3163 * if it fails, we have to reset our parent's current state
3164 * that we can try again after backing off.
3167 CHECKPOINT cp, lastcp;
3168 CURCUR* cc = PL_regcc;
3169 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3170 I32 cache_offset = 0, cache_bit = 0;
3172 n = cc->cur + 1; /* how many we know we matched */
3173 PL_reginput = locinput;
3176 PerlIO_printf(Perl_debug_log,
3177 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3178 REPORT_CODE_OFF+PL_regindent*2, "",
3179 (long)n, (long)cc->min,
3180 (long)cc->max, PTR2UV(cc))
3183 /* If degenerate scan matches "", assume scan done. */
3185 if (locinput == cc->lastloc && n >= cc->min) {
3186 PL_regcc = cc->oldcc;
3190 PerlIO_printf(Perl_debug_log,
3191 "%*s empty match detected, try continuation...\n",
3192 REPORT_CODE_OFF+PL_regindent*2, "")
3194 if (regmatch(cc->next))
3202 /* First just match a string of min scans. */
3206 cc->lastloc = locinput;
3207 if (regmatch(cc->scan))
3210 cc->lastloc = lastloc;
3215 /* Check whether we already were at this position.
3216 Postpone detection until we know the match is not
3217 *that* much linear. */
3218 if (!PL_reg_maxiter) {
3219 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3220 PL_reg_leftiter = PL_reg_maxiter;
3222 if (PL_reg_leftiter-- == 0) {
3223 I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3224 if (PL_reg_poscache) {
3225 if ((I32)PL_reg_poscache_size < size) {
3226 Renew(PL_reg_poscache, size, char);
3227 PL_reg_poscache_size = size;
3229 Zero(PL_reg_poscache, size, char);
3232 PL_reg_poscache_size = size;
3233 Newz(29, PL_reg_poscache, size, char);
3236 PerlIO_printf(Perl_debug_log,
3237 "%sDetected a super-linear match, switching on caching%s...\n",
3238 PL_colors[4], PL_colors[5])
3241 if (PL_reg_leftiter < 0) {
3242 cache_offset = locinput - PL_bostr;
3244 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3245 + cache_offset * (scan->flags>>4);
3246 cache_bit = cache_offset % 8;
3248 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3250 PerlIO_printf(Perl_debug_log,
3251 "%*s already tried at this position...\n",
3252 REPORT_CODE_OFF+PL_regindent*2, "")
3254 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3255 /* cache records success */
3258 /* cache records failure */
3261 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3265 /* Prefer next over scan for minimal matching. */
3268 PL_regcc = cc->oldcc;
3271 cp = regcppush(cc->parenfloor);
3273 if (regmatch(cc->next)) {
3275 CACHEsayYES; /* All done. */
3277 REGCP_UNWIND(lastcp);
3283 if (n >= cc->max) { /* Maximum greed exceeded? */
3284 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3285 && !(PL_reg_flags & RF_warned)) {
3286 PL_reg_flags |= RF_warned;
3287 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3288 "Complex regular subexpression recursion",
3295 PerlIO_printf(Perl_debug_log,
3296 "%*s trying longer...\n",
3297 REPORT_CODE_OFF+PL_regindent*2, "")
3299 /* Try scanning more and see if it helps. */
3300 PL_reginput = locinput;
3302 cc->lastloc = locinput;
3303 cp = regcppush(cc->parenfloor);
3305 if (regmatch(cc->scan)) {
3309 REGCP_UNWIND(lastcp);
3312 cc->lastloc = lastloc;
3316 /* Prefer scan over next for maximal matching. */
3318 if (n < cc->max) { /* More greed allowed? */
3319 cp = regcppush(cc->parenfloor);
3321 cc->lastloc = locinput;
3323 if (regmatch(cc->scan)) {
3327 REGCP_UNWIND(lastcp);
3328 regcppop(); /* Restore some previous $<digit>s? */
3329 PL_reginput = locinput;
3331 PerlIO_printf(Perl_debug_log,
3332 "%*s failed, try continuation...\n",
3333 REPORT_CODE_OFF+PL_regindent*2, "")
3336 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3337 && !(PL_reg_flags & RF_warned)) {
3338 PL_reg_flags |= RF_warned;
3339 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3340 "Complex regular subexpression recursion",
3344 /* Failed deeper matches of scan, so see if this one works. */
3345 PL_regcc = cc->oldcc;
3348 if (regmatch(cc->next))
3354 cc->lastloc = lastloc;
3359 next = scan + ARG(scan);
3362 inner = NEXTOPER(NEXTOPER(scan));
3365 inner = NEXTOPER(scan);
3369 if (OP(next) != c1) /* No choice. */
3370 next = inner; /* Avoid recursion. */
3372 I32 lastparen = *PL_reglastparen;
3374 re_unwind_branch_t *uw;
3376 /* Put unwinding data on stack */
3377 unwind1 = SSNEWt(1,re_unwind_branch_t);
3378 uw = SSPTRt(unwind1,re_unwind_branch_t);
3381 uw->type = ((c1 == BRANCH)
3383 : RE_UNWIND_BRANCHJ);
3384 uw->lastparen = lastparen;
3386 uw->locinput = locinput;
3387 uw->nextchr = nextchr;
3389 uw->regindent = ++PL_regindent;
3392 REGCP_SET(uw->lastcp);
3394 /* Now go into the first branch */
3407 /* We suppose that the next guy does not need
3408 backtracking: in particular, it is of constant non-zero length,
3409 and has no parenths to influence future backrefs. */
3410 ln = ARG1(scan); /* min to match */
3411 n = ARG2(scan); /* max to match */
3412 paren = scan->flags;
3414 if (paren > PL_regsize)
3416 if (paren > (I32)*PL_reglastparen)
3417 *PL_reglastparen = paren;
3419 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3421 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3422 PL_reginput = locinput;
3425 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3427 locinput = PL_reginput;
3428 if (HAS_TEXT(next) || JUMPABLE(next)) {
3429 regnode *text_node = next;
3431 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3433 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3435 if (PL_regkind[(U8)OP(text_node)] == REF) {
3439 else { c1 = (U8)*STRING(text_node); }
3440 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3442 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3443 c2 = PL_fold_locale[c1];
3452 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3453 /* If it could work, try it. */
3455 UCHARAT(PL_reginput) == c1 ||
3456 UCHARAT(PL_reginput) == c2)
3460 PL_regstartp[paren] =
3461 HOPc(PL_reginput, -l) - PL_bostr;
3462 PL_regendp[paren] = PL_reginput - PL_bostr;
3465 PL_regendp[paren] = -1;
3469 REGCP_UNWIND(lastcp);
3471 /* Couldn't or didn't -- move forward. */
3472 PL_reginput = locinput;
3473 if (regrepeat_hard(scan, 1, &l)) {
3475 locinput = PL_reginput;
3482 n = regrepeat_hard(scan, n, &l);
3483 locinput = PL_reginput;
3485 PerlIO_printf(Perl_debug_log,
3486 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3487 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3491 if (HAS_TEXT(next) || JUMPABLE(next)) {
3492 regnode *text_node = next;
3494 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3496 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3498 if (PL_regkind[(U8)OP(text_node)] == REF) {
3502 else { c1 = (U8)*STRING(text_node); }
3504 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3506 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3507 c2 = PL_fold_locale[c1];
3518 /* If it could work, try it. */
3520 UCHARAT(PL_reginput) == c1 ||
3521 UCHARAT(PL_reginput) == c2)
3524 PerlIO_printf(Perl_debug_log,
3525 "%*s trying tail with n=%"IVdf"...\n",
3526 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3530 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3531 PL_regendp[paren] = PL_reginput - PL_bostr;
3534 PL_regendp[paren] = -1;
3538 REGCP_UNWIND(lastcp);
3540 /* Couldn't or didn't -- back up. */
3542 locinput = HOPc(locinput, -l);
3543 PL_reginput = locinput;
3550 paren = scan->flags; /* Which paren to set */
3551 if (paren > PL_regsize)
3553 if (paren > (I32)*PL_reglastparen)
3554 *PL_reglastparen = paren;
3555 ln = ARG1(scan); /* min to match */
3556 n = ARG2(scan); /* max to match */
3557 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3561 ln = ARG1(scan); /* min to match */
3562 n = ARG2(scan); /* max to match */
3563 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3568 scan = NEXTOPER(scan);
3574 scan = NEXTOPER(scan);
3578 * Lookahead to avoid useless match attempts
3579 * when we know what character comes next.
3583 * Used to only do .*x and .*?x, but now it allows
3584 * for )'s, ('s and (?{ ... })'s to be in the way
3585 * of the quantifier and the EXACT-like node. -- japhy
3588 if (HAS_TEXT(next) || JUMPABLE(next)) {
3590 regnode *text_node = next;
3592 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3594 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3596 if (PL_regkind[(U8)OP(text_node)] == REF) {
3598 goto assume_ok_easy;
3600 else { s = (U8*)STRING(text_node); }
3604 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3606 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3607 c2 = PL_fold_locale[c1];
3610 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3611 STRLEN ulen1, ulen2;
3612 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3613 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3615 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3616 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3618 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3620 0 : UTF8_ALLOW_ANY);
3621 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3623 0 : UTF8_ALLOW_ANY);
3626 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3628 0 : UTF8_ALLOW_ANY);
3636 PL_reginput = locinput;
3640 if (ln && regrepeat(scan, ln) < ln)
3642 locinput = PL_reginput;
3645 char *e; /* Should not check after this */
3646 char *old = locinput;
3649 if (n == REG_INFTY) {
3652 while (UTF8_IS_CONTINUATION(*(U8*)e))
3658 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3662 e = locinput + n - ln;
3667 /* Find place 'next' could work */
3670 while (locinput <= e &&
3671 UCHARAT(locinput) != c1)
3674 while (locinput <= e
3675 && UCHARAT(locinput) != c1
3676 && UCHARAT(locinput) != c2)
3679 count = locinput - old;
3684 /* count initialised to
3685 * utf8_distance(old, locinput) */
3686 while (locinput <= e &&
3687 utf8n_to_uvchr((U8*)locinput,
3688 UTF8_MAXBYTES, &len,
3690 0 : UTF8_ALLOW_ANY) != (UV)c1) {
3695 /* count initialised to
3696 * utf8_distance(old, locinput) */
3697 while (locinput <= e) {
3698 UV c = utf8n_to_uvchr((U8*)locinput,
3699 UTF8_MAXBYTES, &len,
3701 0 : UTF8_ALLOW_ANY);
3702 if (c == (UV)c1 || c == (UV)c2)
3711 /* PL_reginput == old now */
3712 if (locinput != old) {
3713 ln = 1; /* Did some */
3714 if (regrepeat(scan, count) < count)
3717 /* PL_reginput == locinput now */
3718 TRYPAREN(paren, ln, locinput);
3719 PL_reginput = locinput; /* Could be reset... */
3720 REGCP_UNWIND(lastcp);
3721 /* Couldn't or didn't -- move forward. */
3724 locinput += UTF8SKIP(locinput);
3731 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3735 c = utf8n_to_uvchr((U8*)PL_reginput,
3738 0 : UTF8_ALLOW_ANY);
3740 c = UCHARAT(PL_reginput);
3741 /* If it could work, try it. */
3742 if (c == (UV)c1 || c == (UV)c2)
3744 TRYPAREN(paren, ln, PL_reginput);
3745 REGCP_UNWIND(lastcp);
3748 /* If it could work, try it. */
3749 else if (c1 == -1000)
3751 TRYPAREN(paren, ln, PL_reginput);
3752 REGCP_UNWIND(lastcp);
3754 /* Couldn't or didn't -- move forward. */
3755 PL_reginput = locinput;
3756 if (regrepeat(scan, 1)) {
3758 locinput = PL_reginput;
3766 n = regrepeat(scan, n);
3767 locinput = PL_reginput;
3768 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3769 ((!PL_multiline && OP(next) != MEOL) ||
3770 OP(next) == SEOL || OP(next) == EOS))
3772 ln = n; /* why back off? */
3773 /* ...because $ and \Z can match before *and* after
3774 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3775 We should back off by one in this case. */
3776 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3785 c = utf8n_to_uvchr((U8*)PL_reginput,
3788 0 : UTF8_ALLOW_ANY);
3790 c = UCHARAT(PL_reginput);
3792 /* If it could work, try it. */
3793 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3795 TRYPAREN(paren, n, PL_reginput);
3796 REGCP_UNWIND(lastcp);
3798 /* Couldn't or didn't -- back up. */
3800 PL_reginput = locinput = HOPc(locinput, -1);
3808 c = utf8n_to_uvchr((U8*)PL_reginput,
3811 0 : UTF8_ALLOW_ANY);
3813 c = UCHARAT(PL_reginput);
3815 /* If it could work, try it. */
3816 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3818 TRYPAREN(paren, n, PL_reginput);
3819 REGCP_UNWIND(lastcp);
3821 /* Couldn't or didn't -- back up. */
3823 PL_reginput = locinput = HOPc(locinput, -1);
3830 if (PL_reg_call_cc) {
3831 re_cc_state *cur_call_cc = PL_reg_call_cc;
3832 CURCUR *cctmp = PL_regcc;
3833 regexp *re = PL_reg_re;
3834 CHECKPOINT cp, lastcp;
3836 cp = regcppush(0); /* Save *all* the positions. */
3838 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3840 PL_reginput = locinput; /* Make position available to
3842 cache_re(PL_reg_call_cc->re);
3843 PL_regcc = PL_reg_call_cc->cc;
3844 PL_reg_call_cc = PL_reg_call_cc->prev;
3845 if (regmatch(cur_call_cc->node)) {
3846 PL_reg_call_cc = cur_call_cc;
3850 REGCP_UNWIND(lastcp);
3852 PL_reg_call_cc = cur_call_cc;
3858 PerlIO_printf(Perl_debug_log,
3859 "%*s continuation failed...\n",
3860 REPORT_CODE_OFF+PL_regindent*2, "")
3864 if (locinput < PL_regtill) {
3865 DEBUG_r(PerlIO_printf(Perl_debug_log,
3866 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3868 (long)(locinput - PL_reg_starttry),
3869 (long)(PL_regtill - PL_reg_starttry),
3871 sayNO_FINAL; /* Cannot match: too short. */
3873 PL_reginput = locinput; /* put where regtry can find it */
3874 sayYES_FINAL; /* Success! */
3876 PL_reginput = locinput; /* put where regtry can find it */
3877 sayYES_LOUD; /* Success! */
3880 PL_reginput = locinput;
3885 s = HOPBACKc(locinput, scan->flags);
3891 PL_reginput = locinput;
3896 s = HOPBACKc(locinput, scan->flags);
3902 PL_reginput = locinput;
3905 inner = NEXTOPER(NEXTOPER(scan));
3906 if (regmatch(inner) != n) {
3921 if (OP(scan) == SUSPEND) {
3922 locinput = PL_reginput;
3923 nextchr = UCHARAT(locinput);
3928 next = scan + ARG(scan);
3933 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3934 PTR2UV(scan), OP(scan));
3935 Perl_croak(aTHX_ "regexp memory corruption");
3942 * We get here only if there's trouble -- normally "case END" is
3943 * the terminating point.
3945 Perl_croak(aTHX_ "corrupted regexp pointers");
3951 PerlIO_printf(Perl_debug_log,
3952 "%*s %scould match...%s\n",
3953 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3957 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3958 PL_colors[4],PL_colors[5]));
3964 #if 0 /* Breaks $^R */
3972 PerlIO_printf(Perl_debug_log,
3973 "%*s %sfailed...%s\n",
3974 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3980 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3983 case RE_UNWIND_BRANCH:
3984 case RE_UNWIND_BRANCHJ:
3986 re_unwind_branch_t *uwb = &(uw->branch);
3987 I32 lastparen = uwb->lastparen;
3989 REGCP_UNWIND(uwb->lastcp);
3990 for (n = *PL_reglastparen; n > lastparen; n--)
3992 *PL_reglastparen = n;
3993 scan = next = uwb->next;
3995 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3996 ? BRANCH : BRANCHJ) ) { /* Failure */
4003 /* Have more choice yet. Reuse the same uwb. */
4005 if ((n = (uwb->type == RE_UNWIND_BRANCH
4006 ? NEXT_OFF(next) : ARG(next))))
4009 next = NULL; /* XXXX Needn't unwinding in this case... */
4011 next = NEXTOPER(scan);
4012 if (uwb->type == RE_UNWIND_BRANCHJ)
4013 next = NEXTOPER(next);
4014 locinput = uwb->locinput;
4015 nextchr = uwb->nextchr;
4017 PL_regindent = uwb->regindent;
4024 Perl_croak(aTHX_ "regexp unwind memory corruption");
4035 - regrepeat - repeatedly match something simple, report how many
4038 * [This routine now assumes that it will only match on things of length 1.
4039 * That was true before, but now we assume scan - reginput is the count,
4040 * rather than incrementing count on every character. [Er, except utf8.]]
4043 S_regrepeat(pTHX_ regnode *p, I32 max)
4045 register char *scan;
4047 register char *loceol = PL_regeol;
4048 register I32 hardcount = 0;
4049 register bool do_utf8 = PL_reg_match_utf8;
4052 if (max == REG_INFTY)
4054 else if (max < loceol - scan)
4055 loceol = scan + max;
4060 while (scan < loceol && hardcount < max && *scan != '\n') {
4061 scan += UTF8SKIP(scan);
4065 while (scan < loceol && *scan != '\n')
4072 while (scan < loceol && hardcount < max) {
4073 scan += UTF8SKIP(scan);
4083 case EXACT: /* length of string is 1 */
4085 while (scan < loceol && UCHARAT(scan) == c)
4088 case EXACTF: /* length of string is 1 */
4090 while (scan < loceol &&
4091 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4094 case EXACTFL: /* length of string is 1 */
4095 PL_reg_flags |= RF_tainted;
4097 while (scan < loceol &&
4098 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4104 while (hardcount < max && scan < loceol &&
4105 reginclass(p, (U8*)scan, 0, do_utf8)) {
4106 scan += UTF8SKIP(scan);
4110 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4117 LOAD_UTF8_CHARCLASS(alnum,"a");
4118 while (hardcount < max && scan < loceol &&
4119 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4120 scan += UTF8SKIP(scan);
4124 while (scan < loceol && isALNUM(*scan))
4129 PL_reg_flags |= RF_tainted;
4132 while (hardcount < max && scan < loceol &&
4133 isALNUM_LC_utf8((U8*)scan)) {
4134 scan += UTF8SKIP(scan);
4138 while (scan < loceol && isALNUM_LC(*scan))
4145 LOAD_UTF8_CHARCLASS(alnum,"a");
4146 while (hardcount < max && scan < loceol &&
4147 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4148 scan += UTF8SKIP(scan);
4152 while (scan < loceol && !isALNUM(*scan))
4157 PL_reg_flags |= RF_tainted;
4160 while (hardcount < max && scan < loceol &&
4161 !isALNUM_LC_utf8((U8*)scan)) {
4162 scan += UTF8SKIP(scan);
4166 while (scan < loceol && !isALNUM_LC(*scan))
4173 LOAD_UTF8_CHARCLASS(space," ");
4174 while (hardcount < max && scan < loceol &&
4176 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4177 scan += UTF8SKIP(scan);
4181 while (scan < loceol && isSPACE(*scan))
4186 PL_reg_flags |= RF_tainted;
4189 while (hardcount < max && scan < loceol &&
4190 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4191 scan += UTF8SKIP(scan);
4195 while (scan < loceol && isSPACE_LC(*scan))
4202 LOAD_UTF8_CHARCLASS(space," ");
4203 while (hardcount < max && scan < loceol &&
4205 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4206 scan += UTF8SKIP(scan);
4210 while (scan < loceol && !isSPACE(*scan))
4215 PL_reg_flags |= RF_tainted;
4218 while (hardcount < max && scan < loceol &&
4219 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4220 scan += UTF8SKIP(scan);
4224 while (scan < loceol && !isSPACE_LC(*scan))
4231 LOAD_UTF8_CHARCLASS(digit,"0");
4232 while (hardcount < max && scan < loceol &&
4233 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4234 scan += UTF8SKIP(scan);
4238 while (scan < loceol && isDIGIT(*scan))
4245 LOAD_UTF8_CHARCLASS(digit,"0");
4246 while (hardcount < max && scan < loceol &&
4247 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4248 scan += UTF8SKIP(scan);
4252 while (scan < loceol && !isDIGIT(*scan))
4256 default: /* Called on something of 0 width. */
4257 break; /* So match right here or not at all. */
4263 c = scan - PL_reginput;
4268 SV *prop = sv_newmortal();
4271 PerlIO_printf(Perl_debug_log,
4272 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4273 REPORT_CODE_OFF+1, "", SvPVX(prop),(IV)c,(IV)max);
4280 - regrepeat_hard - repeatedly match something, report total lenth and length
4282 * The repeater is supposed to have constant non-zero length.
4286 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4288 register char *scan = Nullch;
4289 register char *start;
4290 register char *loceol = PL_regeol;
4292 I32 count = 0, res = 1;
4297 start = PL_reginput;
4298 if (PL_reg_match_utf8) {
4299 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4302 while (start < PL_reginput) {
4304 start += UTF8SKIP(start);
4315 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4317 *lp = l = PL_reginput - start;
4318 if (max != REG_INFTY && l*max < loceol - scan)
4319 loceol = scan + l*max;
4332 - regclass_swash - prepare the utf8 swash
4336 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4342 if (PL_regdata && PL_regdata->count) {
4345 if (PL_regdata->what[n] == 's') {
4346 SV *rv = (SV*)PL_regdata->data[n];
4347 AV *av = (AV*)SvRV((SV*)rv);
4348 SV **ary = AvARRAY(av);
4351 /* See the end of regcomp.c:S_reglass() for
4352 * documentation of these array elements. */
4355 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4356 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4360 else if (si && doinit) {
4361 sw = swash_init("utf8", "", si, 1, 0);
4362 (void)av_store(av, 1, sw);
4378 - reginclass - determine if a character falls into a character class
4380 The n is the ANYOF regnode, the p is the target string, lenp
4381 is pointer to the maximum length of how far to go in the p
4382 (if the lenp is zero, UTF8SKIP(p) is used),
4383 do_utf8 tells whether the target string is in UTF-8.
4388 S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8)
4390 char flags = ANYOF_FLAGS(n);
4396 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4397 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4398 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
4400 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4401 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4404 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4405 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4408 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4412 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4415 if (swash_fetch(sw, p, do_utf8))
4417 else if (flags & ANYOF_FOLD) {
4418 if (!match && lenp && av) {
4421 for (i = 0; i <= av_len(av); i++) {
4422 SV* sv = *av_fetch(av, i, FALSE);
4424 char *s = SvPV(sv, len);
4426 if (len <= plen && memEQ(s, (char*)p, len)) {
4434 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4437 to_utf8_fold(p, tmpbuf, &tmplen);
4438 if (swash_fetch(sw, tmpbuf, do_utf8))
4444 if (match && lenp && *lenp == 0)
4445 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4447 if (!match && c < 256) {
4448 if (ANYOF_BITMAP_TEST(n, c))
4450 else if (flags & ANYOF_FOLD) {
4453 if (flags & ANYOF_LOCALE) {
4454 PL_reg_flags |= RF_tainted;
4455 f = PL_fold_locale[c];
4459 if (f != c && ANYOF_BITMAP_TEST(n, f))
4463 if (!match && (flags & ANYOF_CLASS)) {
4464 PL_reg_flags |= RF_tainted;
4466 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4490 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4491 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4492 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4493 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4494 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4495 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4496 ) /* How's that for a conditional? */
4503 return (flags & ANYOF_INVERT) ? !match : match;
4507 S_reghop(pTHX_ U8 *s, I32 off)
4509 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4513 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4516 while (off-- && s < lim) {
4517 /* XXX could check well-formedness here */
4525 if (UTF8_IS_CONTINUED(*s)) {
4526 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4529 /* XXX could check well-formedness here */
4537 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4539 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4543 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4546 while (off-- && s < lim) {
4547 /* XXX could check well-formedness here */
4557 if (UTF8_IS_CONTINUED(*s)) {
4558 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4561 /* XXX could check well-formedness here */
4573 restore_pos(pTHX_ void *arg)
4575 if (PL_reg_eval_set) {
4576 if (PL_reg_oldsaved) {
4577 PL_reg_re->subbeg = PL_reg_oldsaved;
4578 PL_reg_re->sublen = PL_reg_oldsavedlen;
4579 RX_MATCH_COPIED_on(PL_reg_re);
4581 PL_reg_magic->mg_len = PL_reg_oldpos;
4582 PL_reg_eval_set = 0;
4583 PL_curpm = PL_reg_oldcurpm;
4588 S_to_utf8_substr(pTHX_ register regexp *prog)
4591 if (prog->float_substr && !prog->float_utf8) {
4592 prog->float_utf8 = sv = newSVsv(prog->float_substr);
4593 sv_utf8_upgrade(sv);
4594 if (SvTAIL(prog->float_substr))
4596 if (prog->float_substr == prog->check_substr)
4597 prog->check_utf8 = sv;
4599 if (prog->anchored_substr && !prog->anchored_utf8) {
4600 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
4601 sv_utf8_upgrade(sv);
4602 if (SvTAIL(prog->anchored_substr))
4604 if (prog->anchored_substr == prog->check_substr)
4605 prog->check_utf8 = sv;
4610 S_to_byte_substr(pTHX_ register regexp *prog)
4613 if (prog->float_utf8 && !prog->float_substr) {
4614 prog->float_substr = sv = newSVsv(prog->float_utf8);
4615 if (sv_utf8_downgrade(sv, TRUE)) {
4616 if (SvTAIL(prog->float_utf8))
4620 prog->float_substr = sv = &PL_sv_undef;
4622 if (prog->float_utf8 == prog->check_utf8)
4623 prog->check_substr = sv;
4625 if (prog->anchored_utf8 && !prog->anchored_substr) {
4626 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
4627 if (sv_utf8_downgrade(sv, TRUE)) {
4628 if (SvTAIL(prog->anchored_utf8))
4632 prog->anchored_substr = sv = &PL_sv_undef;
4634 if (prog->anchored_utf8 == prog->check_utf8)
4635 prog->check_substr = sv;