5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
87 * Beware that some of this code is subtly aware of the way operator
88 * precedence is structured in regular expressions. Serious changes in
89 * regular-expression syntax might require a total rethink.
92 #define PERL_IN_REGEXEC_C
97 #define RF_tainted 1 /* tainted information used? */
98 #define RF_warned 2 /* warned about big count? */
99 #define RF_evaled 4 /* Did an EVAL with setting? */
100 #define RF_utf8 8 /* String contains multibyte chars? */
102 #define UTF ((PL_reg_flags & RF_utf8) != 0)
104 #define RS_init 1 /* eval environment created */
105 #define RS_set 2 /* replsv value is set */
108 #define STATIC static
111 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
117 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
118 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
120 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
121 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
122 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
123 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
124 #define HOPc(pos,off) ((char*)HOP(pos,off))
125 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
127 #define HOPBACK(pos, off) ( \
128 (PL_reg_match_utf8) \
129 ? reghopmaybe((U8*)pos, -off) \
130 : (pos - off >= PL_bostr) \
134 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
136 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
137 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
138 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
140 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
141 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
143 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
144 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
145 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
146 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
147 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
148 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
150 /* for use after a quantifier and before an EXACT-like node -- japhy */
151 #define JUMPABLE(rn) ( \
152 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
153 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
154 OP(rn) == PLUS || OP(rn) == MINMOD || \
155 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
158 #define HAS_TEXT(rn) ( \
159 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
163 Search for mandatory following text node; for lookahead, the text must
164 follow but for lookbehind (rn->flags != 0) we skip to the next step.
166 #define FIND_NEXT_IMPT(rn) STMT_START { \
167 while (JUMPABLE(rn)) \
168 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
169 rn = NEXTOPER(NEXTOPER(rn)); \
170 else if (OP(rn) == PLUS) \
172 else if (OP(rn) == IFMATCH) \
173 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
174 else rn += NEXT_OFF(rn); \
177 static void restore_pos(pTHX_ void *arg);
180 S_regcppush(pTHX_ I32 parenfloor)
182 const int retval = PL_savestack_ix;
183 #define REGCP_PAREN_ELEMS 4
184 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
187 if (paren_elems_to_push < 0)
188 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
190 #define REGCP_OTHER_ELEMS 6
191 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
192 for (p = PL_regsize; p > parenfloor; p--) {
193 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
194 SSPUSHINT(PL_regendp[p]);
195 SSPUSHINT(PL_regstartp[p]);
196 SSPUSHPTR(PL_reg_start_tmp[p]);
199 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
200 SSPUSHINT(PL_regsize);
201 SSPUSHINT(*PL_reglastparen);
202 SSPUSHINT(*PL_reglastcloseparen);
203 SSPUSHPTR(PL_reginput);
204 #define REGCP_FRAME_ELEMS 2
205 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
206 * are needed for the regexp context stack bookkeeping. */
207 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
208 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
213 /* These are needed since we do not localize EVAL nodes: */
214 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
215 " Setting an EVAL scope, savestack=%"IVdf"\n", \
216 (IV)PL_savestack_ix)); cp = PL_savestack_ix
218 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
219 PerlIO_printf(Perl_debug_log, \
220 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
221 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
230 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
232 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
233 i = SSPOPINT; /* Parentheses elements to pop. */
234 input = (char *) SSPOPPTR;
235 *PL_reglastcloseparen = SSPOPINT;
236 *PL_reglastparen = SSPOPINT;
237 PL_regsize = SSPOPINT;
239 /* Now restore the parentheses context. */
240 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
241 i > 0; i -= REGCP_PAREN_ELEMS) {
243 paren = (U32)SSPOPINT;
244 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
245 PL_regstartp[paren] = SSPOPINT;
247 if (paren <= *PL_reglastparen)
248 PL_regendp[paren] = tmps;
250 PerlIO_printf(Perl_debug_log,
251 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
252 (UV)paren, (IV)PL_regstartp[paren],
253 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
254 (IV)PL_regendp[paren],
255 (paren > *PL_reglastparen ? "(no)" : ""));
259 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
260 PerlIO_printf(Perl_debug_log,
261 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
262 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
266 /* It would seem that the similar code in regtry()
267 * already takes care of this, and in fact it is in
268 * a better location to since this code can #if 0-ed out
269 * but the code in regtry() is needed or otherwise tests
270 * requiring null fields (pat.t#187 and split.t#{13,14}
271 * (as of patchlevel 7877) will fail. Then again,
272 * this code seems to be necessary or otherwise
273 * building DynaLoader will fail:
274 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
276 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
277 if ((I32)paren > PL_regsize)
278 PL_regstartp[paren] = -1;
279 PL_regendp[paren] = -1;
286 S_regcp_set_to(pTHX_ I32 ss)
288 const I32 tmp = PL_savestack_ix;
290 PL_savestack_ix = ss;
292 PL_savestack_ix = tmp;
296 typedef struct re_cc_state
300 struct re_cc_state *prev;
305 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
307 #define TRYPAREN(paren, n, input) { \
310 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
311 PL_regendp[paren] = input - PL_bostr; \
314 PL_regendp[paren] = -1; \
316 if (regmatch(next)) \
319 PL_regendp[paren] = -1; \
324 * pregexec and friends
328 - pregexec - match a regexp against a string
331 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
332 char *strbeg, I32 minend, SV *screamer, U32 nosave)
333 /* strend: pointer to null at end of string */
334 /* strbeg: real beginning of string */
335 /* minend: end of match must be >=minend after stringarg. */
336 /* nosave: For optimizations. */
339 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
340 nosave ? 0 : REXEC_COPY_STR);
344 S_cache_re(pTHX_ regexp *prog)
346 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
348 PL_regprogram = prog->program;
350 PL_regnpar = prog->nparens;
351 PL_regdata = prog->data;
356 * Need to implement the following flags for reg_anch:
358 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
360 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
361 * INTUIT_AUTORITATIVE_ML
362 * INTUIT_ONCE_NOML - Intuit can match in one location only.
365 * Another flag for this function: SECOND_TIME (so that float substrs
366 * with giant delta may be not rechecked).
369 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
371 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
372 Otherwise, only SvCUR(sv) is used to get strbeg. */
374 /* XXXX We assume that strpos is strbeg unless sv. */
376 /* XXXX Some places assume that there is a fixed substring.
377 An update may be needed if optimizer marks as "INTUITable"
378 RExen without fixed substrings. Similarly, it is assumed that
379 lengths of all the strings are no more than minlen, thus they
380 cannot come from lookahead.
381 (Or minlen should take into account lookahead.) */
383 /* A failure to find a constant substring means that there is no need to make
384 an expensive call to REx engine, thus we celebrate a failure. Similarly,
385 finding a substring too deep into the string means that less calls to
386 regtry() should be needed.
388 REx compiler's optimizer found 4 possible hints:
389 a) Anchored substring;
391 c) Whether we are anchored (beginning-of-line or \G);
392 d) First node (of those at offset 0) which may distingush positions;
393 We use a)b)d) and multiline-part of c), and try to find a position in the
394 string which does not contradict any of them.
397 /* Most of decisions we do here should have been done at compile time.
398 The nodes of the REx which we used for the search should have been
399 deleted from the finite automaton. */
402 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
403 char *strend, U32 flags, re_scream_pos_data *data)
405 register I32 start_shift = 0;
406 /* Should be nonnegative! */
407 register I32 end_shift = 0;
412 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
414 register char *other_last = Nullch; /* other substr checked before this */
415 char *check_at = Nullch; /* check substr found at this pos */
417 char *i_strpos = strpos;
418 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
420 RX_MATCH_UTF8_set(prog,do_utf8);
422 if (prog->reganch & ROPT_UTF8) {
423 DEBUG_r(PerlIO_printf(Perl_debug_log,
424 "UTF-8 regex...\n"));
425 PL_reg_flags |= RF_utf8;
429 const char *s = PL_reg_match_utf8 ?
430 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
432 const int len = PL_reg_match_utf8 ?
433 strlen(s) : strend - strpos;
436 if (PL_reg_match_utf8)
437 DEBUG_r(PerlIO_printf(Perl_debug_log,
438 "UTF-8 target...\n"));
439 PerlIO_printf(Perl_debug_log,
440 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
441 PL_colors[4],PL_colors[5],PL_colors[0],
444 (strlen(prog->precomp) > 60 ? "..." : ""),
446 (int)(len > 60 ? 60 : len),
448 (len > 60 ? "..." : "")
452 /* CHR_DIST() would be more correct here but it makes things slow. */
453 if (prog->minlen > strend - strpos) {
454 DEBUG_r(PerlIO_printf(Perl_debug_log,
455 "String too short... [re_intuit_start]\n"));
458 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
461 if (!prog->check_utf8 && prog->check_substr)
462 to_utf8_substr(prog);
463 check = prog->check_utf8;
465 if (!prog->check_substr && prog->check_utf8)
466 to_byte_substr(prog);
467 check = prog->check_substr;
469 if (check == &PL_sv_undef) {
470 DEBUG_r(PerlIO_printf(Perl_debug_log,
471 "Non-utf string cannot match utf check string\n"));
474 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
475 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
476 || ( (prog->reganch & ROPT_ANCH_BOL)
477 && !PL_multiline ) ); /* Check after \n? */
480 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
481 | ROPT_IMPLICIT)) /* not a real BOL */
482 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
484 && (strpos != strbeg)) {
485 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
488 if (prog->check_offset_min == prog->check_offset_max &&
489 !(prog->reganch & ROPT_CANY_SEEN)) {
490 /* Substring at constant offset from beg-of-str... */
493 s = HOP3c(strpos, prog->check_offset_min, strend);
495 slen = SvCUR(check); /* >= 1 */
497 if ( strend - s > slen || strend - s < slen - 1
498 || (strend - s == slen && strend[-1] != '\n')) {
499 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
502 /* Now should match s[0..slen-2] */
504 if (slen && (*SvPVX_const(check) != *s
506 && memNE(SvPVX_const(check), s, slen)))) {
508 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
512 else if (*SvPVX_const(check) != *s
513 || ((slen = SvCUR(check)) > 1
514 && memNE(SvPVX_const(check), s, slen)))
516 goto success_at_start;
519 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
521 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
522 end_shift = prog->minlen - start_shift -
523 CHR_SVLEN(check) + (SvTAIL(check) != 0);
525 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
526 - (SvTAIL(check) != 0);
527 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
529 if (end_shift < eshift)
533 else { /* Can match at random position */
536 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
537 /* Should be nonnegative! */
538 end_shift = prog->minlen - start_shift -
539 CHR_SVLEN(check) + (SvTAIL(check) != 0);
542 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
544 Perl_croak(aTHX_ "panic: end_shift");
548 /* Find a possible match in the region s..strend by looking for
549 the "check" substring in the region corrected by start/end_shift. */
550 if (flags & REXEC_SCREAM) {
551 I32 p = -1; /* Internal iterator of scream. */
552 I32 * const pp = data ? data->scream_pos : &p;
554 if (PL_screamfirst[BmRARE(check)] >= 0
555 || ( BmRARE(check) == '\n'
556 && (BmPREVIOUS(check) == SvCUR(check) - 1)
558 s = screaminstr(sv, check,
559 start_shift + (s - strbeg), end_shift, pp, 0);
562 /* we may be pointing at the wrong string */
563 if (s && RX_MATCH_COPIED(prog))
564 s = strbeg + (s - SvPVX_const(sv));
566 *data->scream_olds = s;
568 else if (prog->reganch & ROPT_CANY_SEEN)
569 s = fbm_instr((U8*)(s + start_shift),
570 (U8*)(strend - end_shift),
571 check, PL_multiline ? FBMrf_MULTILINE : 0);
573 s = fbm_instr(HOP3(s, start_shift, strend),
574 HOP3(strend, -end_shift, strbeg),
575 check, PL_multiline ? FBMrf_MULTILINE : 0);
577 /* Update the count-of-usability, remove useless subpatterns,
580 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint. */
581 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
582 (s ? "Found" : "Did not find"),
583 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
585 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
587 PL_colors[1], (SvTAIL(check) ? "$" : ""),
588 (s ? " at offset " : "...\n") ) );
595 /* Finish the diagnostic message */
596 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
598 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
599 Start with the other substr.
600 XXXX no SCREAM optimization yet - and a very coarse implementation
601 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
602 *always* match. Probably should be marked during compile...
603 Probably it is right to do no SCREAM here...
606 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
607 /* Take into account the "other" substring. */
608 /* XXXX May be hopelessly wrong for UTF... */
611 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
614 char *last = HOP3c(s, -start_shift, strbeg), *last1, *last2;
618 t = s - prog->check_offset_max;
619 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
621 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
626 t = HOP3c(t, prog->anchored_offset, strend);
627 if (t < other_last) /* These positions already checked */
629 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
632 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
633 /* On end-of-str: see comment below. */
634 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
635 if (must == &PL_sv_undef) {
637 DEBUG_r(must = prog->anchored_utf8); /* for debug */
642 HOP3(HOP3(last1, prog->anchored_offset, strend)
643 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
645 PL_multiline ? FBMrf_MULTILINE : 0
647 DEBUG_r(PerlIO_printf(Perl_debug_log,
648 "%s anchored substr \"%s%.*s%s\"%s",
649 (s ? "Found" : "Contradicts"),
652 - (SvTAIL(must)!=0)),
654 PL_colors[1], (SvTAIL(must) ? "$" : "")));
656 if (last1 >= last2) {
657 DEBUG_r(PerlIO_printf(Perl_debug_log,
658 ", giving up...\n"));
661 DEBUG_r(PerlIO_printf(Perl_debug_log,
662 ", trying floating at offset %ld...\n",
663 (long)(HOP3c(s1, 1, strend) - i_strpos)));
664 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
665 s = HOP3c(last, 1, strend);
669 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
670 (long)(s - i_strpos)));
671 t = HOP3c(s, -prog->anchored_offset, strbeg);
672 other_last = HOP3c(s, 1, strend);
680 else { /* Take into account the floating substring. */
685 t = HOP3c(s, -start_shift, strbeg);
687 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
688 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
689 last = HOP3c(t, prog->float_max_offset, strend);
690 s = HOP3c(t, prog->float_min_offset, strend);
693 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
694 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
695 /* fbm_instr() takes into account exact value of end-of-str
696 if the check is SvTAIL(ed). Since false positives are OK,
697 and end-of-str is not later than strend we are OK. */
698 if (must == &PL_sv_undef) {
700 DEBUG_r(must = prog->float_utf8); /* for debug message */
703 s = fbm_instr((unsigned char*)s,
704 (unsigned char*)last + SvCUR(must)
706 must, PL_multiline ? FBMrf_MULTILINE : 0);
707 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint */
708 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
709 (s ? "Found" : "Contradicts"),
711 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
713 PL_colors[1], (SvTAIL(must) ? "$" : "")));
716 DEBUG_r(PerlIO_printf(Perl_debug_log,
717 ", giving up...\n"));
720 DEBUG_r(PerlIO_printf(Perl_debug_log,
721 ", trying anchored starting at offset %ld...\n",
722 (long)(s1 + 1 - i_strpos)));
724 s = HOP3c(t, 1, strend);
728 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
729 (long)(s - i_strpos)));
730 other_last = s; /* Fix this later. --Hugo */
739 t = s - prog->check_offset_max;
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
744 /* Fixed substring is found far enough so that the match
745 cannot start at strpos. */
747 if (ml_anch && t[-1] != '\n') {
748 /* Eventually fbm_*() should handle this, but often
749 anchored_offset is not 0, so this check will not be wasted. */
750 /* XXXX In the code below we prefer to look for "^" even in
751 presence of anchored substrings. And we search even
752 beyond the found float position. These pessimizations
753 are historical artefacts only. */
755 while (t < strend - prog->minlen) {
757 if (t < check_at - prog->check_offset_min) {
758 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
759 /* Since we moved from the found position,
760 we definitely contradict the found anchored
761 substr. Due to the above check we do not
762 contradict "check" substr.
763 Thus we can arrive here only if check substr
764 is float. Redo checking for "other"=="fixed".
767 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
768 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
769 goto do_other_anchored;
771 /* We don't contradict the found floating substring. */
772 /* XXXX Why not check for STCLASS? */
774 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
775 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
778 /* Position contradicts check-string */
779 /* XXXX probably better to look for check-string
780 than for "\n", so one should lower the limit for t? */
781 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
782 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
783 other_last = strpos = s = t + 1;
788 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
789 PL_colors[0],PL_colors[1]));
793 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
794 PL_colors[0],PL_colors[1]));
798 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
801 /* The found string does not prohibit matching at strpos,
802 - no optimization of calling REx engine can be performed,
803 unless it was an MBOL and we are not after MBOL,
804 or a future STCLASS check will fail this. */
806 /* Even in this situation we may use MBOL flag if strpos is offset
807 wrt the start of the string. */
808 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
809 && (strpos != strbeg) && strpos[-1] != '\n'
810 /* May be due to an implicit anchor of m{.*foo} */
811 && !(prog->reganch & ROPT_IMPLICIT))
816 DEBUG_r( if (ml_anch)
817 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
818 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
821 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
823 prog->check_utf8 /* Could be deleted already */
824 && --BmUSEFUL(prog->check_utf8) < 0
825 && (prog->check_utf8 == prog->float_utf8)
827 prog->check_substr /* Could be deleted already */
828 && --BmUSEFUL(prog->check_substr) < 0
829 && (prog->check_substr == prog->float_substr)
832 /* If flags & SOMETHING - do not do it many times on the same match */
833 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
834 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
835 if (do_utf8 ? prog->check_substr : prog->check_utf8)
836 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
837 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
838 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
839 check = Nullsv; /* abort */
841 /* XXXX This is a remnant of the old implementation. It
842 looks wasteful, since now INTUIT can use many
844 prog->reganch &= ~RE_USE_INTUIT;
851 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
852 if (prog->regstclass) {
853 /* minlen == 0 is possible if regstclass is \b or \B,
854 and the fixed substr is ''$.
855 Since minlen is already taken into account, s+1 is before strend;
856 accidentally, minlen >= 1 guaranties no false positives at s + 1
857 even for \b or \B. But (minlen? 1 : 0) below assumes that
858 regstclass does not come from lookahead... */
859 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
860 This leaves EXACTF only, which is dealt with in find_byclass(). */
861 const U8* str = (U8*)STRING(prog->regstclass);
862 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
863 ? CHR_DIST(str+STR_LEN(prog->regstclass), str)
865 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
866 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
867 : (prog->float_substr || prog->float_utf8
868 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
874 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
877 const char *what = 0;
879 if (endpos == strend) {
880 DEBUG_r( PerlIO_printf(Perl_debug_log,
881 "Could not match STCLASS...\n") );
884 DEBUG_r( PerlIO_printf(Perl_debug_log,
885 "This position contradicts STCLASS...\n") );
886 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
888 /* Contradict one of substrings */
889 if (prog->anchored_substr || prog->anchored_utf8) {
890 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
891 DEBUG_r( what = "anchored" );
893 s = HOP3c(t, 1, strend);
894 if (s + start_shift + end_shift > strend) {
895 /* XXXX Should be taken into account earlier? */
896 DEBUG_r( PerlIO_printf(Perl_debug_log,
897 "Could not match STCLASS...\n") );
902 DEBUG_r( PerlIO_printf(Perl_debug_log,
903 "Looking for %s substr starting at offset %ld...\n",
904 what, (long)(s + start_shift - i_strpos)) );
907 /* Have both, check_string is floating */
908 if (t + start_shift >= check_at) /* Contradicts floating=check */
909 goto retry_floating_check;
910 /* Recheck anchored substring, but not floating... */
914 DEBUG_r( PerlIO_printf(Perl_debug_log,
915 "Looking for anchored substr starting at offset %ld...\n",
916 (long)(other_last - i_strpos)) );
917 goto do_other_anchored;
919 /* Another way we could have checked stclass at the
920 current position only: */
925 DEBUG_r( PerlIO_printf(Perl_debug_log,
926 "Looking for /%s^%s/m starting at offset %ld...\n",
927 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
930 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
932 /* Check is floating subtring. */
933 retry_floating_check:
934 t = check_at - start_shift;
935 DEBUG_r( what = "floating" );
936 goto hop_and_restart;
939 DEBUG_r(PerlIO_printf(Perl_debug_log,
940 "By STCLASS: moving %ld --> %ld\n",
941 (long)(t - i_strpos), (long)(s - i_strpos))
945 DEBUG_r(PerlIO_printf(Perl_debug_log,
946 "Does not contradict STCLASS...\n");
951 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
952 PL_colors[4], (check ? "Guessed" : "Giving up"),
953 PL_colors[5], (long)(s - i_strpos)) );
956 fail_finish: /* Substring not found */
957 if (prog->check_substr || prog->check_utf8) /* could be removed already */
958 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
960 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
961 PL_colors[4],PL_colors[5]));
965 /* We know what class REx starts with. Try to find this position... */
967 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
969 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
973 register STRLEN uskip;
977 register I32 tmp = 1; /* Scratch variable? */
978 register const bool do_utf8 = PL_reg_match_utf8;
980 /* We know what class it must start with. */
984 while (s + (uskip = UTF8SKIP(s)) <= strend) {
985 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
986 !UTF8_IS_INVARIANT((U8)s[0]) ?
987 reginclass(c, (U8*)s, 0, do_utf8) :
988 REGINCLASS(c, (U8*)s)) {
989 if (tmp && (norun || regtry(prog, s)))
1000 while (s < strend) {
1003 if (REGINCLASS(c, (U8*)s) ||
1004 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1005 /* The assignment of 2 is intentional:
1006 * for the folded sharp s, the skip is 2. */
1007 (skip = SHARP_S_SKIP))) {
1008 if (tmp && (norun || regtry(prog, s)))
1020 while (s < strend) {
1021 if (tmp && (norun || regtry(prog, s)))
1030 ln = STR_LEN(c); /* length to match in octets/bytes */
1031 lnc = (I32) ln; /* length to match in characters */
1033 STRLEN ulen1, ulen2;
1035 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1036 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1038 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1039 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1041 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1042 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1043 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1044 0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1046 while (sm < ((U8 *) m + ln)) {
1061 c2 = PL_fold_locale[c1];
1063 e = HOP3c(strend, -((I32)lnc), s);
1066 e = s; /* Due to minlen logic of intuit() */
1068 /* The idea in the EXACTF* cases is to first find the
1069 * first character of the EXACTF* node and then, if
1070 * necessary, case-insensitively compare the full
1071 * text of the node. The c1 and c2 are the first
1072 * characters (though in Unicode it gets a bit
1073 * more complicated because there are more cases
1074 * than just upper and lower: one needs to use
1075 * the so-called folding case for case-insensitive
1076 * matching (called "loose matching" in Unicode).
1077 * ibcmp_utf8() will do just that. */
1081 U8 tmpbuf [UTF8_MAXBYTES+1];
1082 STRLEN len, foldlen;
1085 /* Upper and lower of 1st char are equal -
1086 * probably not a "letter". */
1088 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1090 0 : UTF8_ALLOW_ANY);
1093 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1094 m, (char **)0, ln, (bool)UTF))
1095 && (norun || regtry(prog, s)) )
1098 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1099 uvchr_to_utf8(tmpbuf, c);
1100 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1102 && (f == c1 || f == c2)
1103 && (ln == foldlen ||
1104 !ibcmp_utf8((char *) foldbuf,
1105 (char **)0, foldlen, do_utf8,
1107 (char **)0, ln, (bool)UTF))
1108 && (norun || regtry(prog, s)) )
1116 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1118 0 : UTF8_ALLOW_ANY);
1120 /* Handle some of the three Greek sigmas cases.
1121 * Note that not all the possible combinations
1122 * are handled here: some of them are handled
1123 * by the standard folding rules, and some of
1124 * them (the character class or ANYOF cases)
1125 * are handled during compiletime in
1126 * regexec.c:S_regclass(). */
1127 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1128 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1129 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1131 if ( (c == c1 || c == c2)
1133 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1134 m, (char **)0, ln, (bool)UTF))
1135 && (norun || regtry(prog, s)) )
1138 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1139 uvchr_to_utf8(tmpbuf, c);
1140 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1142 && (f == c1 || f == c2)
1143 && (ln == foldlen ||
1144 !ibcmp_utf8((char *) foldbuf,
1145 (char **)0, foldlen, do_utf8,
1147 (char **)0, ln, (bool)UTF))
1148 && (norun || regtry(prog, s)) )
1159 && (ln == 1 || !(OP(c) == EXACTF
1161 : ibcmp_locale(s, m, ln)))
1162 && (norun || regtry(prog, s)) )
1168 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1169 && (ln == 1 || !(OP(c) == EXACTF
1171 : ibcmp_locale(s, m, ln)))
1172 && (norun || regtry(prog, s)) )
1179 PL_reg_flags |= RF_tainted;
1186 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1188 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1190 tmp = ((OP(c) == BOUND ?
1191 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1192 LOAD_UTF8_CHARCLASS_ALNUM();
1193 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1194 if (tmp == !(OP(c) == BOUND ?
1195 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1196 isALNUM_LC_utf8((U8*)s)))
1199 if ((norun || regtry(prog, s)))
1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1207 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1208 while (s < strend) {
1210 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1212 if ((norun || regtry(prog, s)))
1218 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1222 PL_reg_flags |= RF_tainted;
1229 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1231 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1233 tmp = ((OP(c) == NBOUND ?
1234 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1235 LOAD_UTF8_CHARCLASS_ALNUM();
1236 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1237 if (tmp == !(OP(c) == NBOUND ?
1238 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1239 isALNUM_LC_utf8((U8*)s)))
1241 else if ((norun || regtry(prog, s)))
1247 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1248 tmp = ((OP(c) == NBOUND ?
1249 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1250 while (s < strend) {
1252 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1254 else if ((norun || regtry(prog, s)))
1259 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1264 LOAD_UTF8_CHARCLASS_ALNUM();
1265 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1266 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1267 if (tmp && (norun || regtry(prog, s)))
1278 while (s < strend) {
1280 if (tmp && (norun || regtry(prog, s)))
1292 PL_reg_flags |= RF_tainted;
1294 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1295 if (isALNUM_LC_utf8((U8*)s)) {
1296 if (tmp && (norun || regtry(prog, s)))
1307 while (s < strend) {
1308 if (isALNUM_LC(*s)) {
1309 if (tmp && (norun || regtry(prog, s)))
1322 LOAD_UTF8_CHARCLASS_ALNUM();
1323 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1324 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1325 if (tmp && (norun || regtry(prog, s)))
1336 while (s < strend) {
1338 if (tmp && (norun || regtry(prog, s)))
1350 PL_reg_flags |= RF_tainted;
1352 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1353 if (!isALNUM_LC_utf8((U8*)s)) {
1354 if (tmp && (norun || regtry(prog, s)))
1365 while (s < strend) {
1366 if (!isALNUM_LC(*s)) {
1367 if (tmp && (norun || regtry(prog, s)))
1380 LOAD_UTF8_CHARCLASS_SPACE();
1381 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1382 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1383 if (tmp && (norun || regtry(prog, s)))
1394 while (s < strend) {
1396 if (tmp && (norun || regtry(prog, s)))
1408 PL_reg_flags |= RF_tainted;
1410 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1411 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1412 if (tmp && (norun || regtry(prog, s)))
1423 while (s < strend) {
1424 if (isSPACE_LC(*s)) {
1425 if (tmp && (norun || regtry(prog, s)))
1438 LOAD_UTF8_CHARCLASS_SPACE();
1439 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1440 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1441 if (tmp && (norun || regtry(prog, s)))
1452 while (s < strend) {
1454 if (tmp && (norun || regtry(prog, s)))
1466 PL_reg_flags |= RF_tainted;
1468 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1469 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1470 if (tmp && (norun || regtry(prog, s)))
1481 while (s < strend) {
1482 if (!isSPACE_LC(*s)) {
1483 if (tmp && (norun || regtry(prog, s)))
1496 LOAD_UTF8_CHARCLASS_DIGIT();
1497 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1498 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1499 if (tmp && (norun || regtry(prog, s)))
1510 while (s < strend) {
1512 if (tmp && (norun || regtry(prog, s)))
1524 PL_reg_flags |= RF_tainted;
1526 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1527 if (isDIGIT_LC_utf8((U8*)s)) {
1528 if (tmp && (norun || regtry(prog, s)))
1539 while (s < strend) {
1540 if (isDIGIT_LC(*s)) {
1541 if (tmp && (norun || regtry(prog, s)))
1554 LOAD_UTF8_CHARCLASS_DIGIT();
1555 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1556 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1557 if (tmp && (norun || regtry(prog, s)))
1568 while (s < strend) {
1570 if (tmp && (norun || regtry(prog, s)))
1582 PL_reg_flags |= RF_tainted;
1584 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1585 if (!isDIGIT_LC_utf8((U8*)s)) {
1586 if (tmp && (norun || regtry(prog, s)))
1597 while (s < strend) {
1598 if (!isDIGIT_LC(*s)) {
1599 if (tmp && (norun || regtry(prog, s)))
1611 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1620 - regexec_flags - match a regexp against a string
1623 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1624 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1625 /* strend: pointer to null at end of string */
1626 /* strbeg: real beginning of string */
1627 /* minend: end of match must be >=minend after stringarg. */
1628 /* data: May be used for some additional optimizations. */
1629 /* nosave: For optimizations. */
1632 register regnode *c;
1633 register char *startpos = stringarg;
1634 I32 minlen; /* must match at least this many chars */
1635 I32 dontbother = 0; /* how many characters not to try at end */
1636 I32 end_shift = 0; /* Same for the end. */ /* CC */
1637 I32 scream_pos = -1; /* Internal iterator of scream. */
1639 SV* oreplsv = GvSV(PL_replgv);
1640 const bool do_utf8 = DO_UTF8(sv);
1642 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1643 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1645 (void)data; /* Currently unused */
1646 RX_MATCH_UTF8_set(prog,do_utf8);
1652 PL_regnarrate = DEBUG_r_TEST;
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1661 minlen = prog->minlen;
1662 if (strend - startpos < minlen) {
1663 DEBUG_r(PerlIO_printf(Perl_debug_log,
1664 "String too short [regexec_flags]...\n"));
1668 /* Check validity of program. */
1669 if (UCHARAT(prog->program) != REG_MAGIC) {
1670 Perl_croak(aTHX_ "corrupted regexp program");
1674 PL_reg_eval_set = 0;
1677 if (prog->reganch & ROPT_UTF8)
1678 PL_reg_flags |= RF_utf8;
1680 /* Mark beginning of line for ^ and lookbehind. */
1681 PL_regbol = startpos;
1685 /* Mark end of line for $ (and such) */
1688 /* see how far we have to get to not match where we matched before */
1689 PL_regtill = startpos+minend;
1691 /* We start without call_cc context. */
1694 /* If there is a "must appear" string, look for it. */
1697 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1700 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1701 PL_reg_ganch = startpos;
1702 else if (sv && SvTYPE(sv) >= SVt_PVMG
1704 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1705 && mg->mg_len >= 0) {
1706 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1707 if (prog->reganch & ROPT_ANCH_GPOS) {
1708 if (s > PL_reg_ganch)
1713 else /* pos() not defined */
1714 PL_reg_ganch = strbeg;
1717 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1718 re_scream_pos_data d;
1720 d.scream_olds = &scream_olds;
1721 d.scream_pos = &scream_pos;
1722 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1724 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1725 goto phooey; /* not present */
1730 const char * const s0 = UTF
1731 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1734 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1735 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1736 UNI_DISPLAY_REGEX) : startpos;
1737 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1740 PerlIO_printf(Perl_debug_log,
1741 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1742 PL_colors[4],PL_colors[5],PL_colors[0],
1745 len0 > 60 ? "..." : "",
1747 (int)(len1 > 60 ? 60 : len1),
1749 (len1 > 60 ? "..." : "")
1753 /* Simplest case: anchored match need be tried only once. */
1754 /* [unless only anchor is BOL and multiline is set] */
1755 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1756 if (s == startpos && regtry(prog, startpos))
1758 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1759 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1764 dontbother = minlen - 1;
1765 end = HOP3c(strend, -dontbother, strbeg) - 1;
1766 /* for multiline we only have to try after newlines */
1767 if (prog->check_substr || prog->check_utf8) {
1771 if (regtry(prog, s))
1776 if (prog->reganch & RE_USE_INTUIT) {
1777 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1788 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1789 if (regtry(prog, s))
1796 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1797 if (regtry(prog, PL_reg_ganch))
1802 /* Messy cases: unanchored match. */
1803 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1804 /* we have /x+whatever/ */
1805 /* it must be a one character string (XXXX Except UTF?) */
1810 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1812 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1815 while (s < strend) {
1817 DEBUG_r( did_match = 1 );
1818 if (regtry(prog, s)) goto got_it;
1820 while (s < strend && *s == ch)
1827 while (s < strend) {
1829 DEBUG_r( did_match = 1 );
1830 if (regtry(prog, s)) goto got_it;
1832 while (s < strend && *s == ch)
1838 DEBUG_r(if (!did_match)
1839 PerlIO_printf(Perl_debug_log,
1840 "Did not find anchored character...\n")
1844 else if (prog->anchored_substr != Nullsv
1845 || prog->anchored_utf8 != Nullsv
1846 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1847 && prog->float_max_offset < strend - s)) {
1852 char *last1; /* Last position checked before */
1856 if (prog->anchored_substr || prog->anchored_utf8) {
1857 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1858 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1859 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1860 back_max = back_min = prog->anchored_offset;
1862 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1865 back_max = prog->float_max_offset;
1866 back_min = prog->float_min_offset;
1868 if (must == &PL_sv_undef)
1869 /* could not downgrade utf8 check substring, so must fail */
1872 last = HOP3c(strend, /* Cannot start after this */
1873 -(I32)(CHR_SVLEN(must)
1874 - (SvTAIL(must) != 0) + back_min), strbeg);
1877 last1 = HOPc(s, -1);
1879 last1 = s - 1; /* bogus */
1881 /* XXXX check_substr already used to find "s", can optimize if
1882 check_substr==must. */
1884 dontbother = end_shift;
1885 strend = HOPc(strend, -dontbother);
1886 while ( (s <= last) &&
1887 ((flags & REXEC_SCREAM)
1888 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1889 end_shift, &scream_pos, 0))
1890 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1891 (unsigned char*)strend, must,
1892 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1893 /* we may be pointing at the wrong string */
1894 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1895 s = strbeg + (s - SvPVX_const(sv));
1896 DEBUG_r( did_match = 1 );
1897 if (HOPc(s, -back_max) > last1) {
1898 last1 = HOPc(s, -back_min);
1899 s = HOPc(s, -back_max);
1902 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1904 last1 = HOPc(s, -back_min);
1908 while (s <= last1) {
1909 if (regtry(prog, s))
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1922 DEBUG_r(if (!did_match)
1923 PerlIO_printf(Perl_debug_log,
1924 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1925 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1926 ? "anchored" : "floating"),
1928 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1930 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1934 else if ((c = prog->regstclass)) {
1936 I32 op = (U8)OP(prog->regstclass);
1937 /* don't bother with what can't match */
1938 if (PL_regkind[op] != EXACT && op != CANY)
1939 strend = HOPc(strend, -(minlen - 1));
1942 SV *prop = sv_newmortal();
1950 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1951 UNI_DISPLAY_REGEX) :
1953 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1955 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1956 len1 = UTF ? SvCUR(dsv1) : strend - s;
1957 PerlIO_printf(Perl_debug_log,
1958 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1962 if (find_byclass(prog, c, s, strend, 0))
1964 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1968 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1973 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1974 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1975 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1977 if (flags & REXEC_SCREAM) {
1978 last = screaminstr(sv, float_real, s - strbeg,
1979 end_shift, &scream_pos, 1); /* last one */
1981 last = scream_olds; /* Only one occurrence. */
1982 /* we may be pointing at the wrong string */
1983 else if (RX_MATCH_COPIED(prog))
1984 s = strbeg + (s - SvPVX_const(sv));
1988 const char * const little = SvPV(float_real, len);
1990 if (SvTAIL(float_real)) {
1991 if (memEQ(strend - len + 1, little, len - 1))
1992 last = strend - len + 1;
1993 else if (!PL_multiline)
1994 last = memEQ(strend - len, little, len)
1995 ? strend - len : Nullch;
2001 last = rninstr(s, strend, little, little + len);
2003 last = strend; /* matching "$" */
2007 DEBUG_r(PerlIO_printf(Perl_debug_log,
2008 "%sCan't trim the tail, match fails (should not happen)%s\n",
2009 PL_colors[4],PL_colors[5]));
2010 goto phooey; /* Should not happen! */
2012 dontbother = strend - last + prog->float_min_offset;
2014 if (minlen && (dontbother < minlen))
2015 dontbother = minlen - 1;
2016 strend -= dontbother; /* this one's always in bytes! */
2017 /* We don't know much -- general case. */
2020 if (regtry(prog, s))
2029 if (regtry(prog, s))
2031 } while (s++ < strend);
2039 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2041 if (PL_reg_eval_set) {
2042 /* Preserve the current value of $^R */
2043 if (oreplsv != GvSV(PL_replgv))
2044 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2045 restored, the value remains
2047 restore_pos(aTHX_ 0);
2050 /* make sure $`, $&, $', and $digit will work later */
2051 if ( !(flags & REXEC_NOT_FIRST) ) {
2052 if (RX_MATCH_COPIED(prog)) {
2053 Safefree(prog->subbeg);
2054 RX_MATCH_COPIED_off(prog);
2056 if (flags & REXEC_COPY_STR) {
2057 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2059 s = savepvn(strbeg, i);
2062 RX_MATCH_COPIED_on(prog);
2065 prog->subbeg = strbeg;
2066 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2073 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2074 PL_colors[4],PL_colors[5]));
2075 if (PL_reg_eval_set)
2076 restore_pos(aTHX_ 0);
2081 - regtry - try match at specific point
2083 STATIC I32 /* 0 failure, 1 success */
2084 S_regtry(pTHX_ regexp *prog, char *startpos)
2092 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2094 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2097 PL_reg_eval_set = RS_init;
2099 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2100 (IV)(PL_stack_sp - PL_stack_base));
2102 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2103 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2104 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2106 /* Apparently this is not needed, judging by wantarray. */
2107 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2108 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2111 /* Make $_ available to executed code. */
2112 if (PL_reg_sv != DEFSV) {
2113 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
2118 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2119 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2120 /* prepare for quick setting of pos */
2121 sv_magic(PL_reg_sv, (SV*)0,
2122 PERL_MAGIC_regex_global, Nullch, 0);
2123 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2127 PL_reg_oldpos = mg->mg_len;
2128 SAVEDESTRUCTOR_X(restore_pos, 0);
2130 if (!PL_reg_curpm) {
2131 Newz(22,PL_reg_curpm, 1, PMOP);
2134 SV* repointer = newSViv(0);
2135 /* so we know which PL_regex_padav element is PL_reg_curpm */
2136 SvFLAGS(repointer) |= SVf_BREAK;
2137 av_push(PL_regex_padav,repointer);
2138 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2139 PL_regex_pad = AvARRAY(PL_regex_padav);
2143 PM_SETRE(PL_reg_curpm, prog);
2144 PL_reg_oldcurpm = PL_curpm;
2145 PL_curpm = PL_reg_curpm;
2146 if (RX_MATCH_COPIED(prog)) {
2147 /* Here is a serious problem: we cannot rewrite subbeg,
2148 since it may be needed if this match fails. Thus
2149 $` inside (?{}) could fail... */
2150 PL_reg_oldsaved = prog->subbeg;
2151 PL_reg_oldsavedlen = prog->sublen;
2152 RX_MATCH_COPIED_off(prog);
2155 PL_reg_oldsaved = Nullch;
2156 prog->subbeg = PL_bostr;
2157 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2159 prog->startp[0] = startpos - PL_bostr;
2160 PL_reginput = startpos;
2161 PL_regstartp = prog->startp;
2162 PL_regendp = prog->endp;
2163 PL_reglastparen = &prog->lastparen;
2164 PL_reglastcloseparen = &prog->lastcloseparen;
2165 prog->lastparen = 0;
2166 prog->lastcloseparen = 0;
2168 DEBUG_r(PL_reg_starttry = startpos);
2169 if (PL_reg_start_tmpl <= prog->nparens) {
2170 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2171 if(PL_reg_start_tmp)
2172 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 New(22,PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2177 /* XXXX What this code is doing here?!!! There should be no need
2178 to do this again and again, PL_reglastparen should take care of
2181 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2182 * Actually, the code in regcppop() (which Ilya may be meaning by
2183 * PL_reglastparen), is not needed at all by the test suite
2184 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2185 * enough, for building DynaLoader, or otherwise this
2186 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2187 * will happen. Meanwhile, this code *is* needed for the
2188 * above-mentioned test suite tests to succeed. The common theme
2189 * on those tests seems to be returning null fields from matches.
2194 if (prog->nparens) {
2195 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2202 if (regmatch(prog->program + 1)) {
2203 prog->endp[0] = PL_reginput - PL_bostr;
2206 REGCP_UNWIND(lastcp);
2210 #define RE_UNWIND_BRANCH 1
2211 #define RE_UNWIND_BRANCHJ 2
2215 typedef struct { /* XX: makes sense to enlarge it... */
2219 } re_unwind_generic_t;
2232 } re_unwind_branch_t;
2234 typedef union re_unwind_t {
2236 re_unwind_generic_t generic;
2237 re_unwind_branch_t branch;
2240 #define sayYES goto yes
2241 #define sayNO goto no
2242 #define sayNO_ANYOF goto no_anyof
2243 #define sayYES_FINAL goto yes_final
2244 #define sayYES_LOUD goto yes_loud
2245 #define sayNO_FINAL goto no_final
2246 #define sayNO_SILENT goto do_no
2247 #define saySAME(x) if (x) goto yes; else goto no
2249 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2250 #define POSCACHE_SEEN 1 /* we know what we're caching */
2251 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2252 #define CACHEsayYES STMT_START { \
2253 if (cache_offset | cache_bit) { \
2254 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2255 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2256 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2257 /* cache records failure, but this is success */ \
2259 PerlIO_printf(Perl_debug_log, \
2260 "%*s (remove success from failure cache)\n", \
2261 REPORT_CODE_OFF+PL_regindent*2, "") \
2263 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2268 #define CACHEsayNO STMT_START { \
2269 if (cache_offset | cache_bit) { \
2270 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2271 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2272 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2273 /* cache records success, but this is failure */ \
2275 PerlIO_printf(Perl_debug_log, \
2276 "%*s (remove failure from success cache)\n", \
2277 REPORT_CODE_OFF+PL_regindent*2, "") \
2279 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2285 #define REPORT_CODE_OFF 24
2288 - regmatch - main matching routine
2290 * Conceptually the strategy is simple: check to see whether the current
2291 * node matches, call self recursively to see whether the rest matches,
2292 * and then act accordingly. In practice we make some effort to avoid
2293 * recursion, in particular by going through "ordinary" nodes (that don't
2294 * need to know whether the rest of the match failed) by a loop instead of
2297 /* [lwall] I've hoisted the register declarations to the outer block in order to
2298 * maybe save a little bit of pushing and popping on the stack. It also takes
2299 * advantage of machines that use a register save mask on subroutine entry.
2301 STATIC I32 /* 0 failure, 1 success */
2302 S_regmatch(pTHX_ regnode *prog)
2304 register regnode *scan; /* Current node. */
2305 regnode *next; /* Next node. */
2306 regnode *inner; /* Next node in internal branch. */
2307 register I32 nextchr; /* renamed nextchr - nextchar colides with
2308 function of same name */
2309 register I32 n; /* no or next */
2310 register I32 ln = 0; /* len or last */
2311 register char *s = Nullch; /* operand or save */
2312 register char *locinput = PL_reginput;
2313 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2314 int minmod = 0, sw = 0, logical = 0;
2317 I32 firstcp = PL_savestack_ix;
2319 const register bool do_utf8 = PL_reg_match_utf8;
2321 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2322 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2323 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2330 /* Note that nextchr is a byte even in UTF */
2331 nextchr = UCHARAT(locinput);
2333 while (scan != NULL) {
2336 SV *prop = sv_newmortal();
2337 const int docolor = *PL_colors[0];
2338 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2339 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2340 /* The part of the string before starttry has one color
2341 (pref0_len chars), between starttry and current
2342 position another one (pref_len - pref0_len chars),
2343 after the current position the third one.
2344 We assume that pref0_len <= pref_len, otherwise we
2345 decrease pref0_len. */
2346 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2347 ? (5 + taill) - l : locinput - PL_bostr;
2350 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2352 pref0_len = pref_len - (locinput - PL_reg_starttry);
2353 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2354 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2355 ? (5 + taill) - pref_len : PL_regeol - locinput);
2356 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2360 if (pref0_len > pref_len)
2361 pref0_len = pref_len;
2362 regprop(prop, scan);
2364 const char * const s0 =
2365 do_utf8 && OP(scan) != CANY ?
2366 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2367 pref0_len, 60, UNI_DISPLAY_REGEX) :
2368 locinput - pref_len;
2369 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2370 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2371 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2372 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2373 locinput - pref_len + pref0_len;
2374 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2375 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2376 pv_uni_display(dsv2, (U8*)locinput,
2377 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2379 const int len2 = do_utf8 ? strlen(s2) : l;
2380 PerlIO_printf(Perl_debug_log,
2381 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2382 (IV)(locinput - PL_bostr),
2389 (docolor ? "" : "> <"),
2393 15 - l - pref_len + 1,
2395 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2400 next = scan + NEXT_OFF(scan);
2406 if (locinput == PL_bostr || (PL_multiline &&
2407 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2409 /* regtill = regbol; */
2414 if (locinput == PL_bostr ||
2415 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2421 if (locinput == PL_bostr)
2425 if (locinput == PL_reg_ganch)
2435 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2440 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2442 if (PL_regeol - locinput > 1)
2446 if (PL_regeol != locinput)
2450 if (!nextchr && locinput >= PL_regeol)
2453 locinput += PL_utf8skip[nextchr];
2454 if (locinput > PL_regeol)
2456 nextchr = UCHARAT(locinput);
2459 nextchr = UCHARAT(++locinput);
2462 if (!nextchr && locinput >= PL_regeol)
2464 nextchr = UCHARAT(++locinput);
2467 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2470 locinput += PL_utf8skip[nextchr];
2471 if (locinput > PL_regeol)
2473 nextchr = UCHARAT(locinput);
2476 nextchr = UCHARAT(++locinput);
2482 if (do_utf8 != UTF) {
2483 /* The target and the pattern have differing utf8ness. */
2485 const char *e = s + ln;
2488 /* The target is utf8, the pattern is not utf8. */
2493 if (NATIVE_TO_UNI(*(U8*)s) !=
2494 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2496 0 : UTF8_ALLOW_ANY))
2503 /* The target is not utf8, the pattern is utf8. */
2508 if (NATIVE_TO_UNI(*((U8*)l)) !=
2509 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2511 0 : UTF8_ALLOW_ANY))
2518 nextchr = UCHARAT(locinput);
2521 /* The target and the pattern have the same utf8ness. */
2522 /* Inline the first character, for speed. */
2523 if (UCHARAT(s) != nextchr)
2525 if (PL_regeol - locinput < ln)
2527 if (ln > 1 && memNE(s, locinput, ln))
2530 nextchr = UCHARAT(locinput);
2533 PL_reg_flags |= RF_tainted;
2539 if (do_utf8 || UTF) {
2540 /* Either target or the pattern are utf8. */
2542 char *e = PL_regeol;
2544 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2545 l, &e, 0, do_utf8)) {
2546 /* One more case for the sharp s:
2547 * pack("U0U*", 0xDF) =~ /ss/i,
2548 * the 0xC3 0x9F are the UTF-8
2549 * byte sequence for the U+00DF. */
2551 toLOWER(s[0]) == 's' &&
2553 toLOWER(s[1]) == 's' &&
2560 nextchr = UCHARAT(locinput);
2564 /* Neither the target and the pattern are utf8. */
2566 /* Inline the first character, for speed. */
2567 if (UCHARAT(s) != nextchr &&
2568 UCHARAT(s) != ((OP(scan) == EXACTF)
2569 ? PL_fold : PL_fold_locale)[nextchr])
2571 if (PL_regeol - locinput < ln)
2573 if (ln > 1 && (OP(scan) == EXACTF
2574 ? ibcmp(s, locinput, ln)
2575 : ibcmp_locale(s, locinput, ln)))
2578 nextchr = UCHARAT(locinput);
2582 STRLEN inclasslen = PL_regeol - locinput;
2584 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2586 if (locinput >= PL_regeol)
2588 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2589 nextchr = UCHARAT(locinput);
2594 nextchr = UCHARAT(locinput);
2595 if (!REGINCLASS(scan, (U8*)locinput))
2597 if (!nextchr && locinput >= PL_regeol)
2599 nextchr = UCHARAT(++locinput);
2603 /* If we might have the case of the German sharp s
2604 * in a casefolding Unicode character class. */
2606 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2607 locinput += SHARP_S_SKIP;
2608 nextchr = UCHARAT(locinput);
2614 PL_reg_flags |= RF_tainted;
2620 LOAD_UTF8_CHARCLASS_ALNUM();
2621 if (!(OP(scan) == ALNUM
2622 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2623 : isALNUM_LC_utf8((U8*)locinput)))
2627 locinput += PL_utf8skip[nextchr];
2628 nextchr = UCHARAT(locinput);
2631 if (!(OP(scan) == ALNUM
2632 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2634 nextchr = UCHARAT(++locinput);
2637 PL_reg_flags |= RF_tainted;
2640 if (!nextchr && locinput >= PL_regeol)
2643 LOAD_UTF8_CHARCLASS_ALNUM();
2644 if (OP(scan) == NALNUM
2645 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2646 : isALNUM_LC_utf8((U8*)locinput))
2650 locinput += PL_utf8skip[nextchr];
2651 nextchr = UCHARAT(locinput);
2654 if (OP(scan) == NALNUM
2655 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2657 nextchr = UCHARAT(++locinput);
2661 PL_reg_flags |= RF_tainted;
2665 /* was last char in word? */
2667 if (locinput == PL_bostr)
2670 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2672 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
2674 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2675 ln = isALNUM_uni(ln);
2676 LOAD_UTF8_CHARCLASS_ALNUM();
2677 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2680 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2681 n = isALNUM_LC_utf8((U8*)locinput);
2685 ln = (locinput != PL_bostr) ?
2686 UCHARAT(locinput - 1) : '\n';
2687 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2689 n = isALNUM(nextchr);
2692 ln = isALNUM_LC(ln);
2693 n = isALNUM_LC(nextchr);
2696 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2697 OP(scan) == BOUNDL))
2701 PL_reg_flags |= RF_tainted;
2707 if (UTF8_IS_CONTINUED(nextchr)) {
2708 LOAD_UTF8_CHARCLASS_SPACE();
2709 if (!(OP(scan) == SPACE
2710 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2711 : isSPACE_LC_utf8((U8*)locinput)))
2715 locinput += PL_utf8skip[nextchr];
2716 nextchr = UCHARAT(locinput);
2719 if (!(OP(scan) == SPACE
2720 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2722 nextchr = UCHARAT(++locinput);
2725 if (!(OP(scan) == SPACE
2726 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2728 nextchr = UCHARAT(++locinput);
2732 PL_reg_flags |= RF_tainted;
2735 if (!nextchr && locinput >= PL_regeol)
2738 LOAD_UTF8_CHARCLASS_SPACE();
2739 if (OP(scan) == NSPACE
2740 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2741 : isSPACE_LC_utf8((U8*)locinput))
2745 locinput += PL_utf8skip[nextchr];
2746 nextchr = UCHARAT(locinput);
2749 if (OP(scan) == NSPACE
2750 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2752 nextchr = UCHARAT(++locinput);
2755 PL_reg_flags |= RF_tainted;
2761 LOAD_UTF8_CHARCLASS_DIGIT();
2762 if (!(OP(scan) == DIGIT
2763 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2764 : isDIGIT_LC_utf8((U8*)locinput)))
2768 locinput += PL_utf8skip[nextchr];
2769 nextchr = UCHARAT(locinput);
2772 if (!(OP(scan) == DIGIT
2773 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2775 nextchr = UCHARAT(++locinput);
2778 PL_reg_flags |= RF_tainted;
2781 if (!nextchr && locinput >= PL_regeol)
2784 LOAD_UTF8_CHARCLASS_DIGIT();
2785 if (OP(scan) == NDIGIT
2786 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2787 : isDIGIT_LC_utf8((U8*)locinput))
2791 locinput += PL_utf8skip[nextchr];
2792 nextchr = UCHARAT(locinput);
2795 if (OP(scan) == NDIGIT
2796 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2798 nextchr = UCHARAT(++locinput);
2801 if (locinput >= PL_regeol)
2804 LOAD_UTF8_CHARCLASS_MARK();
2805 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2807 locinput += PL_utf8skip[nextchr];
2808 while (locinput < PL_regeol &&
2809 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2810 locinput += UTF8SKIP(locinput);
2811 if (locinput > PL_regeol)
2816 nextchr = UCHARAT(locinput);
2819 PL_reg_flags |= RF_tainted;
2823 n = ARG(scan); /* which paren pair */
2824 ln = PL_regstartp[n];
2825 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2826 if ((I32)*PL_reglastparen < n || ln == -1)
2827 sayNO; /* Do not match unless seen CLOSEn. */
2828 if (ln == PL_regendp[n])
2832 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2834 const char *e = PL_bostr + PL_regendp[n];
2836 * Note that we can't do the "other character" lookup trick as
2837 * in the 8-bit case (no pun intended) because in Unicode we
2838 * have to map both upper and title case to lower case.
2840 if (OP(scan) == REFF) {
2842 STRLEN ulen1, ulen2;
2843 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
2844 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
2848 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2849 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2850 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2857 nextchr = UCHARAT(locinput);
2861 /* Inline the first character, for speed. */
2862 if (UCHARAT(s) != nextchr &&
2864 (UCHARAT(s) != ((OP(scan) == REFF
2865 ? PL_fold : PL_fold_locale)[nextchr]))))
2867 ln = PL_regendp[n] - ln;
2868 if (locinput + ln > PL_regeol)
2870 if (ln > 1 && (OP(scan) == REF
2871 ? memNE(s, locinput, ln)
2873 ? ibcmp(s, locinput, ln)
2874 : ibcmp_locale(s, locinput, ln))))
2877 nextchr = UCHARAT(locinput);
2888 OP_4tree *oop = PL_op;
2889 COP *ocurcop = PL_curcop;
2892 struct regexp *oreg = PL_reg_re;
2895 PL_op = (OP_4tree*)PL_regdata->data[n];
2896 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2897 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2898 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2902 CALLRUNOPS(aTHX); /* Scalar context. */
2905 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2913 PAD_RESTORE_LOCAL(old_comppad);
2914 PL_curcop = ocurcop;
2916 if (logical == 2) { /* Postponed subexpression. */
2918 MAGIC *mg = Null(MAGIC*);
2920 CHECKPOINT cp, lastcp;
2924 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2925 mg = mg_find(sv, PERL_MAGIC_qr);
2926 else if (SvSMAGICAL(ret)) {
2927 if (SvGMAGICAL(ret))
2928 sv_unmagic(ret, PERL_MAGIC_qr);
2930 mg = mg_find(ret, PERL_MAGIC_qr);
2934 re = (regexp *)mg->mg_obj;
2935 (void)ReREFCNT_inc(re);
2939 const char *t = SvPV_const(ret, len);
2941 char * const oprecomp = PL_regprecomp;
2942 const I32 osize = PL_regsize;
2943 const I32 onpar = PL_regnpar;
2946 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2947 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
2949 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2951 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2953 PL_regprecomp = oprecomp;
2958 PerlIO_printf(Perl_debug_log,
2959 "Entering embedded \"%s%.60s%s%s\"\n",
2963 (strlen(re->precomp) > 60 ? "..." : ""))
2966 state.prev = PL_reg_call_cc;
2967 state.cc = PL_regcc;
2968 state.re = PL_reg_re;
2972 cp = regcppush(0); /* Save *all* the positions. */
2975 state.ss = PL_savestack_ix;
2976 *PL_reglastparen = 0;
2977 *PL_reglastcloseparen = 0;
2978 PL_reg_call_cc = &state;
2979 PL_reginput = locinput;
2980 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2981 ((re->reganch & ROPT_UTF8) != 0);
2982 if (toggleutf) PL_reg_flags ^= RF_utf8;
2984 /* XXXX This is too dramatic a measure... */
2987 if (regmatch(re->program + 1)) {
2988 /* Even though we succeeded, we need to restore
2989 global variables, since we may be wrapped inside
2990 SUSPEND, thus the match may be not finished yet. */
2992 /* XXXX Do this only if SUSPENDed? */
2993 PL_reg_call_cc = state.prev;
2994 PL_regcc = state.cc;
2995 PL_reg_re = state.re;
2996 cache_re(PL_reg_re);
2997 if (toggleutf) PL_reg_flags ^= RF_utf8;
2999 /* XXXX This is too dramatic a measure... */
3002 /* These are needed even if not SUSPEND. */
3008 REGCP_UNWIND(lastcp);
3010 PL_reg_call_cc = state.prev;
3011 PL_regcc = state.cc;
3012 PL_reg_re = state.re;
3013 cache_re(PL_reg_re);
3014 if (toggleutf) PL_reg_flags ^= RF_utf8;
3016 /* XXXX This is too dramatic a measure... */
3026 sv_setsv(save_scalar(PL_replgv), ret);
3032 n = ARG(scan); /* which paren pair */
3033 PL_reg_start_tmp[n] = locinput;
3038 n = ARG(scan); /* which paren pair */
3039 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3040 PL_regendp[n] = locinput - PL_bostr;
3041 if (n > (I32)*PL_reglastparen)
3042 *PL_reglastparen = n;
3043 *PL_reglastcloseparen = n;
3046 n = ARG(scan); /* which paren pair */
3047 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3050 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3052 next = NEXTOPER(NEXTOPER(scan));
3054 next = scan + ARG(scan);
3055 if (OP(next) == IFTHEN) /* Fake one. */
3056 next = NEXTOPER(NEXTOPER(next));
3060 logical = scan->flags;
3062 /*******************************************************************
3063 PL_regcc contains infoblock about the innermost (...)* loop, and
3064 a pointer to the next outer infoblock.
3066 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3068 1) After matching X, regnode for CURLYX is processed;
3070 2) This regnode creates infoblock on the stack, and calls
3071 regmatch() recursively with the starting point at WHILEM node;
3073 3) Each hit of WHILEM node tries to match A and Z (in the order
3074 depending on the current iteration, min/max of {min,max} and
3075 greediness). The information about where are nodes for "A"
3076 and "Z" is read from the infoblock, as is info on how many times "A"
3077 was already matched, and greediness.
3079 4) After A matches, the same WHILEM node is hit again.
3081 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3082 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3083 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3084 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3085 of the external loop.
3087 Currently present infoblocks form a tree with a stem formed by PL_curcc
3088 and whatever it mentions via ->next, and additional attached trees
3089 corresponding to temporarily unset infoblocks as in "5" above.
3091 In the following picture infoblocks for outer loop of
3092 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3093 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3094 infoblocks are drawn below the "reset" infoblock.
3096 In fact in the picture below we do not show failed matches for Z and T
3097 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3098 more obvious *why* one needs to *temporary* unset infoblocks.]
3100 Matched REx position InfoBlocks Comment
3104 Y A)*?Z)*?T x <- O <- I
3105 YA )*?Z)*?T x <- O <- I
3106 YA A)*?Z)*?T x <- O <- I
3107 YAA )*?Z)*?T x <- O <- I
3108 YAA Z)*?T x <- O # Temporary unset I
3111 YAAZ Y(A)*?Z)*?T x <- O
3114 YAAZY (A)*?Z)*?T x <- O
3117 YAAZY A)*?Z)*?T x <- O <- I
3120 YAAZYA )*?Z)*?T x <- O <- I
3123 YAAZYA Z)*?T x <- O # Temporary unset I
3129 YAAZYAZ T x # Temporary unset O
3136 *******************************************************************/
3139 CHECKPOINT cp = PL_savestack_ix;
3140 /* No need to save/restore up to this paren */
3141 I32 parenfloor = scan->flags;
3143 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3145 cc.oldcc = PL_regcc;
3147 /* XXXX Probably it is better to teach regpush to support
3148 parenfloor > PL_regsize... */
3149 if (parenfloor > (I32)*PL_reglastparen)
3150 parenfloor = *PL_reglastparen; /* Pessimization... */
3151 cc.parenfloor = parenfloor;
3153 cc.min = ARG1(scan);
3154 cc.max = ARG2(scan);
3155 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3159 PL_reginput = locinput;
3160 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3162 PL_regcc = cc.oldcc;
3168 * This is really hard to understand, because after we match
3169 * what we're trying to match, we must make sure the rest of
3170 * the REx is going to match for sure, and to do that we have
3171 * to go back UP the parse tree by recursing ever deeper. And
3172 * if it fails, we have to reset our parent's current state
3173 * that we can try again after backing off.
3176 CHECKPOINT cp, lastcp;
3177 CURCUR* cc = PL_regcc;
3178 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3179 I32 cache_offset = 0, cache_bit = 0;
3181 n = cc->cur + 1; /* how many we know we matched */
3182 PL_reginput = locinput;
3185 PerlIO_printf(Perl_debug_log,
3186 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3187 REPORT_CODE_OFF+PL_regindent*2, "",
3188 (long)n, (long)cc->min,
3189 (long)cc->max, PTR2UV(cc))
3192 /* If degenerate scan matches "", assume scan done. */
3194 if (locinput == cc->lastloc && n >= cc->min) {
3195 PL_regcc = cc->oldcc;
3199 PerlIO_printf(Perl_debug_log,
3200 "%*s empty match detected, try continuation...\n",
3201 REPORT_CODE_OFF+PL_regindent*2, "")
3203 if (regmatch(cc->next))
3211 /* First just match a string of min scans. */
3215 cc->lastloc = locinput;
3216 if (regmatch(cc->scan))
3219 cc->lastloc = lastloc;
3224 /* Check whether we already were at this position.
3225 Postpone detection until we know the match is not
3226 *that* much linear. */
3227 if (!PL_reg_maxiter) {
3228 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3229 PL_reg_leftiter = PL_reg_maxiter;
3231 if (PL_reg_leftiter-- == 0) {
3232 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3233 if (PL_reg_poscache) {
3234 if ((I32)PL_reg_poscache_size < size) {
3235 Renew(PL_reg_poscache, size, char);
3236 PL_reg_poscache_size = size;
3238 Zero(PL_reg_poscache, size, char);
3241 PL_reg_poscache_size = size;
3242 Newz(29, PL_reg_poscache, size, char);
3245 PerlIO_printf(Perl_debug_log,
3246 "%sDetected a super-linear match, switching on caching%s...\n",
3247 PL_colors[4], PL_colors[5])
3250 if (PL_reg_leftiter < 0) {
3251 cache_offset = locinput - PL_bostr;
3253 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3254 + cache_offset * (scan->flags>>4);
3255 cache_bit = cache_offset % 8;
3257 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3259 PerlIO_printf(Perl_debug_log,
3260 "%*s already tried at this position...\n",
3261 REPORT_CODE_OFF+PL_regindent*2, "")
3263 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3264 /* cache records success */
3267 /* cache records failure */
3270 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3274 /* Prefer next over scan for minimal matching. */
3277 PL_regcc = cc->oldcc;
3280 cp = regcppush(cc->parenfloor);
3282 if (regmatch(cc->next)) {
3284 CACHEsayYES; /* All done. */
3286 REGCP_UNWIND(lastcp);
3292 if (n >= cc->max) { /* Maximum greed exceeded? */
3293 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3294 && !(PL_reg_flags & RF_warned)) {
3295 PL_reg_flags |= RF_warned;
3296 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3297 "Complex regular subexpression recursion",
3304 PerlIO_printf(Perl_debug_log,
3305 "%*s trying longer...\n",
3306 REPORT_CODE_OFF+PL_regindent*2, "")
3308 /* Try scanning more and see if it helps. */
3309 PL_reginput = locinput;
3311 cc->lastloc = locinput;
3312 cp = regcppush(cc->parenfloor);
3314 if (regmatch(cc->scan)) {
3318 REGCP_UNWIND(lastcp);
3321 cc->lastloc = lastloc;
3325 /* Prefer scan over next for maximal matching. */
3327 if (n < cc->max) { /* More greed allowed? */
3328 cp = regcppush(cc->parenfloor);
3330 cc->lastloc = locinput;
3332 if (regmatch(cc->scan)) {
3336 REGCP_UNWIND(lastcp);
3337 regcppop(); /* Restore some previous $<digit>s? */
3338 PL_reginput = locinput;
3340 PerlIO_printf(Perl_debug_log,
3341 "%*s failed, try continuation...\n",
3342 REPORT_CODE_OFF+PL_regindent*2, "")
3345 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3346 && !(PL_reg_flags & RF_warned)) {
3347 PL_reg_flags |= RF_warned;
3348 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3349 "Complex regular subexpression recursion",
3353 /* Failed deeper matches of scan, so see if this one works. */
3354 PL_regcc = cc->oldcc;
3357 if (regmatch(cc->next))
3363 cc->lastloc = lastloc;
3368 next = scan + ARG(scan);
3371 inner = NEXTOPER(NEXTOPER(scan));
3374 inner = NEXTOPER(scan);
3378 if (OP(next) != c1) /* No choice. */
3379 next = inner; /* Avoid recursion. */
3381 const I32 lastparen = *PL_reglastparen;
3383 re_unwind_branch_t *uw;
3385 /* Put unwinding data on stack */
3386 unwind1 = SSNEWt(1,re_unwind_branch_t);
3387 uw = SSPTRt(unwind1,re_unwind_branch_t);
3390 uw->type = ((c1 == BRANCH)
3392 : RE_UNWIND_BRANCHJ);
3393 uw->lastparen = lastparen;
3395 uw->locinput = locinput;
3396 uw->nextchr = nextchr;
3398 uw->regindent = ++PL_regindent;
3401 REGCP_SET(uw->lastcp);
3403 /* Now go into the first branch */
3416 /* We suppose that the next guy does not need
3417 backtracking: in particular, it is of constant non-zero length,
3418 and has no parenths to influence future backrefs. */
3419 ln = ARG1(scan); /* min to match */
3420 n = ARG2(scan); /* max to match */
3421 paren = scan->flags;
3423 if (paren > PL_regsize)
3425 if (paren > (I32)*PL_reglastparen)
3426 *PL_reglastparen = paren;
3428 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3430 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3431 PL_reginput = locinput;
3434 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3436 locinput = PL_reginput;
3437 if (HAS_TEXT(next) || JUMPABLE(next)) {
3438 regnode *text_node = next;
3440 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3442 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3444 if (PL_regkind[(U8)OP(text_node)] == REF) {
3448 else { c1 = (U8)*STRING(text_node); }
3449 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3451 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3452 c2 = PL_fold_locale[c1];
3461 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3462 /* If it could work, try it. */
3464 UCHARAT(PL_reginput) == c1 ||
3465 UCHARAT(PL_reginput) == c2)
3469 PL_regstartp[paren] =
3470 HOPc(PL_reginput, -l) - PL_bostr;
3471 PL_regendp[paren] = PL_reginput - PL_bostr;
3474 PL_regendp[paren] = -1;
3478 REGCP_UNWIND(lastcp);
3480 /* Couldn't or didn't -- move forward. */
3481 PL_reginput = locinput;
3482 if (regrepeat_hard(scan, 1, &l)) {
3484 locinput = PL_reginput;
3491 n = regrepeat_hard(scan, n, &l);
3492 locinput = PL_reginput;
3494 PerlIO_printf(Perl_debug_log,
3495 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3496 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3500 if (HAS_TEXT(next) || JUMPABLE(next)) {
3501 regnode *text_node = next;
3503 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3505 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3507 if (PL_regkind[(U8)OP(text_node)] == REF) {
3511 else { c1 = (U8)*STRING(text_node); }
3513 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3515 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3516 c2 = PL_fold_locale[c1];
3527 /* If it could work, try it. */
3529 UCHARAT(PL_reginput) == c1 ||
3530 UCHARAT(PL_reginput) == c2)
3533 PerlIO_printf(Perl_debug_log,
3534 "%*s trying tail with n=%"IVdf"...\n",
3535 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3539 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3540 PL_regendp[paren] = PL_reginput - PL_bostr;
3543 PL_regendp[paren] = -1;
3547 REGCP_UNWIND(lastcp);
3549 /* Couldn't or didn't -- back up. */
3551 locinput = HOPc(locinput, -l);
3552 PL_reginput = locinput;
3559 paren = scan->flags; /* Which paren to set */
3560 if (paren > PL_regsize)
3562 if (paren > (I32)*PL_reglastparen)
3563 *PL_reglastparen = paren;
3564 ln = ARG1(scan); /* min to match */
3565 n = ARG2(scan); /* max to match */
3566 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3570 ln = ARG1(scan); /* min to match */
3571 n = ARG2(scan); /* max to match */
3572 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3577 scan = NEXTOPER(scan);
3583 scan = NEXTOPER(scan);
3587 * Lookahead to avoid useless match attempts
3588 * when we know what character comes next.
3592 * Used to only do .*x and .*?x, but now it allows
3593 * for )'s, ('s and (?{ ... })'s to be in the way
3594 * of the quantifier and the EXACT-like node. -- japhy
3597 if (HAS_TEXT(next) || JUMPABLE(next)) {
3599 regnode *text_node = next;
3601 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3603 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3605 if (PL_regkind[(U8)OP(text_node)] == REF) {
3607 goto assume_ok_easy;
3609 else { s = (U8*)STRING(text_node); }
3613 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3615 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3616 c2 = PL_fold_locale[c1];
3619 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3620 STRLEN ulen1, ulen2;
3621 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3622 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3624 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3625 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3627 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3629 0 : UTF8_ALLOW_ANY);
3630 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3632 0 : UTF8_ALLOW_ANY);
3635 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3637 0 : UTF8_ALLOW_ANY);
3645 PL_reginput = locinput;
3649 if (ln && regrepeat(scan, ln) < ln)
3651 locinput = PL_reginput;
3654 char *e; /* Should not check after this */
3655 char *old = locinput;
3658 if (n == REG_INFTY) {
3661 while (UTF8_IS_CONTINUATION(*(U8*)e))
3667 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3671 e = locinput + n - ln;
3676 /* Find place 'next' could work */
3679 while (locinput <= e &&
3680 UCHARAT(locinput) != c1)
3683 while (locinput <= e
3684 && UCHARAT(locinput) != c1
3685 && UCHARAT(locinput) != c2)
3688 count = locinput - old;
3693 /* count initialised to
3694 * utf8_distance(old, locinput) */
3695 while (locinput <= e &&
3696 utf8n_to_uvchr((U8*)locinput,
3697 UTF8_MAXBYTES, &len,
3699 0 : UTF8_ALLOW_ANY) != (UV)c1) {
3705 /* count initialised to
3706 * utf8_distance(old, locinput) */
3707 while (locinput <= e) {
3708 UV c = utf8n_to_uvchr((U8*)locinput,
3709 UTF8_MAXBYTES, &len,
3711 0 : UTF8_ALLOW_ANY);
3712 if (c == (UV)c1 || c == (UV)c2)
3721 /* PL_reginput == old now */
3722 if (locinput != old) {
3723 ln = 1; /* Did some */
3724 if (regrepeat(scan, count) < count)
3727 /* PL_reginput == locinput now */
3728 TRYPAREN(paren, ln, locinput);
3729 PL_reginput = locinput; /* Could be reset... */
3730 REGCP_UNWIND(lastcp);
3731 /* Couldn't or didn't -- move forward. */
3734 locinput += UTF8SKIP(locinput);
3741 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3745 c = utf8n_to_uvchr((U8*)PL_reginput,
3748 0 : UTF8_ALLOW_ANY);
3750 c = UCHARAT(PL_reginput);
3751 /* If it could work, try it. */
3752 if (c == (UV)c1 || c == (UV)c2)
3754 TRYPAREN(paren, ln, PL_reginput);
3755 REGCP_UNWIND(lastcp);
3758 /* If it could work, try it. */
3759 else if (c1 == -1000)
3761 TRYPAREN(paren, ln, PL_reginput);
3762 REGCP_UNWIND(lastcp);
3764 /* Couldn't or didn't -- move forward. */
3765 PL_reginput = locinput;
3766 if (regrepeat(scan, 1)) {
3768 locinput = PL_reginput;
3776 n = regrepeat(scan, n);
3777 locinput = PL_reginput;
3778 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3779 ((!PL_multiline && OP(next) != MEOL) ||
3780 OP(next) == SEOL || OP(next) == EOS))
3782 ln = n; /* why back off? */
3783 /* ...because $ and \Z can match before *and* after
3784 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3785 We should back off by one in this case. */
3786 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3795 c = utf8n_to_uvchr((U8*)PL_reginput,
3798 0 : UTF8_ALLOW_ANY);
3800 c = UCHARAT(PL_reginput);
3802 /* If it could work, try it. */
3803 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3805 TRYPAREN(paren, n, PL_reginput);
3806 REGCP_UNWIND(lastcp);
3808 /* Couldn't or didn't -- back up. */
3810 PL_reginput = locinput = HOPc(locinput, -1);
3818 c = utf8n_to_uvchr((U8*)PL_reginput,
3821 0 : UTF8_ALLOW_ANY);
3823 c = UCHARAT(PL_reginput);
3825 /* If it could work, try it. */
3826 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3828 TRYPAREN(paren, n, PL_reginput);
3829 REGCP_UNWIND(lastcp);
3831 /* Couldn't or didn't -- back up. */
3833 PL_reginput = locinput = HOPc(locinput, -1);
3840 if (PL_reg_call_cc) {
3841 re_cc_state *cur_call_cc = PL_reg_call_cc;
3842 CURCUR *cctmp = PL_regcc;
3843 regexp *re = PL_reg_re;
3844 CHECKPOINT cp, lastcp;
3846 cp = regcppush(0); /* Save *all* the positions. */
3848 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3850 PL_reginput = locinput; /* Make position available to
3852 cache_re(PL_reg_call_cc->re);
3853 PL_regcc = PL_reg_call_cc->cc;
3854 PL_reg_call_cc = PL_reg_call_cc->prev;
3855 if (regmatch(cur_call_cc->node)) {
3856 PL_reg_call_cc = cur_call_cc;
3860 REGCP_UNWIND(lastcp);
3862 PL_reg_call_cc = cur_call_cc;
3868 PerlIO_printf(Perl_debug_log,
3869 "%*s continuation failed...\n",
3870 REPORT_CODE_OFF+PL_regindent*2, "")
3874 if (locinput < PL_regtill) {
3875 DEBUG_r(PerlIO_printf(Perl_debug_log,
3876 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3878 (long)(locinput - PL_reg_starttry),
3879 (long)(PL_regtill - PL_reg_starttry),
3881 sayNO_FINAL; /* Cannot match: too short. */
3883 PL_reginput = locinput; /* put where regtry can find it */
3884 sayYES_FINAL; /* Success! */
3886 PL_reginput = locinput; /* put where regtry can find it */
3887 sayYES_LOUD; /* Success! */
3890 PL_reginput = locinput;
3895 s = HOPBACKc(locinput, scan->flags);
3901 PL_reginput = locinput;
3906 s = HOPBACKc(locinput, scan->flags);
3912 PL_reginput = locinput;
3915 inner = NEXTOPER(NEXTOPER(scan));
3916 if (regmatch(inner) != n) {
3931 if (OP(scan) == SUSPEND) {
3932 locinput = PL_reginput;
3933 nextchr = UCHARAT(locinput);
3938 next = scan + ARG(scan);
3943 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3944 PTR2UV(scan), OP(scan));
3945 Perl_croak(aTHX_ "regexp memory corruption");
3952 * We get here only if there's trouble -- normally "case END" is
3953 * the terminating point.
3955 Perl_croak(aTHX_ "corrupted regexp pointers");
3961 PerlIO_printf(Perl_debug_log,
3962 "%*s %scould match...%s\n",
3963 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3967 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3968 PL_colors[4],PL_colors[5]));
3974 #if 0 /* Breaks $^R */
3982 PerlIO_printf(Perl_debug_log,
3983 "%*s %sfailed...%s\n",
3984 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3990 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3993 case RE_UNWIND_BRANCH:
3994 case RE_UNWIND_BRANCHJ:
3996 re_unwind_branch_t *uwb = &(uw->branch);
3997 const I32 lastparen = uwb->lastparen;
3999 REGCP_UNWIND(uwb->lastcp);
4000 for (n = *PL_reglastparen; n > lastparen; n--)
4002 *PL_reglastparen = n;
4003 scan = next = uwb->next;
4005 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
4006 ? BRANCH : BRANCHJ) ) { /* Failure */
4013 /* Have more choice yet. Reuse the same uwb. */
4015 if ((n = (uwb->type == RE_UNWIND_BRANCH
4016 ? NEXT_OFF(next) : ARG(next))))
4019 next = NULL; /* XXXX Needn't unwinding in this case... */
4021 next = NEXTOPER(scan);
4022 if (uwb->type == RE_UNWIND_BRANCHJ)
4023 next = NEXTOPER(next);
4024 locinput = uwb->locinput;
4025 nextchr = uwb->nextchr;
4027 PL_regindent = uwb->regindent;
4034 Perl_croak(aTHX_ "regexp unwind memory corruption");
4045 - regrepeat - repeatedly match something simple, report how many
4048 * [This routine now assumes that it will only match on things of length 1.
4049 * That was true before, but now we assume scan - reginput is the count,
4050 * rather than incrementing count on every character. [Er, except utf8.]]
4053 S_regrepeat(pTHX_ const regnode *p, I32 max)
4055 register char *scan;
4057 register char *loceol = PL_regeol;
4058 register I32 hardcount = 0;
4059 register bool do_utf8 = PL_reg_match_utf8;
4062 if (max == REG_INFTY)
4064 else if (max < loceol - scan)
4065 loceol = scan + max;
4070 while (scan < loceol && hardcount < max && *scan != '\n') {
4071 scan += UTF8SKIP(scan);
4075 while (scan < loceol && *scan != '\n')
4082 while (scan < loceol && hardcount < max) {
4083 scan += UTF8SKIP(scan);
4093 case EXACT: /* length of string is 1 */
4095 while (scan < loceol && UCHARAT(scan) == c)
4098 case EXACTF: /* length of string is 1 */
4100 while (scan < loceol &&
4101 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4104 case EXACTFL: /* length of string is 1 */
4105 PL_reg_flags |= RF_tainted;
4107 while (scan < loceol &&
4108 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4114 while (hardcount < max && scan < loceol &&
4115 reginclass(p, (U8*)scan, 0, do_utf8)) {
4116 scan += UTF8SKIP(scan);
4120 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4127 LOAD_UTF8_CHARCLASS_ALNUM();
4128 while (hardcount < max && scan < loceol &&
4129 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4130 scan += UTF8SKIP(scan);
4134 while (scan < loceol && isALNUM(*scan))
4139 PL_reg_flags |= RF_tainted;
4142 while (hardcount < max && scan < loceol &&
4143 isALNUM_LC_utf8((U8*)scan)) {
4144 scan += UTF8SKIP(scan);
4148 while (scan < loceol && isALNUM_LC(*scan))
4155 LOAD_UTF8_CHARCLASS_ALNUM();
4156 while (hardcount < max && scan < loceol &&
4157 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4158 scan += UTF8SKIP(scan);
4162 while (scan < loceol && !isALNUM(*scan))
4167 PL_reg_flags |= RF_tainted;
4170 while (hardcount < max && scan < loceol &&
4171 !isALNUM_LC_utf8((U8*)scan)) {
4172 scan += UTF8SKIP(scan);
4176 while (scan < loceol && !isALNUM_LC(*scan))
4183 LOAD_UTF8_CHARCLASS_SPACE();
4184 while (hardcount < max && scan < loceol &&
4186 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4187 scan += UTF8SKIP(scan);
4191 while (scan < loceol && isSPACE(*scan))
4196 PL_reg_flags |= RF_tainted;
4199 while (hardcount < max && scan < loceol &&
4200 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4201 scan += UTF8SKIP(scan);
4205 while (scan < loceol && isSPACE_LC(*scan))
4212 LOAD_UTF8_CHARCLASS_SPACE();
4213 while (hardcount < max && scan < loceol &&
4215 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4216 scan += UTF8SKIP(scan);
4220 while (scan < loceol && !isSPACE(*scan))
4225 PL_reg_flags |= RF_tainted;
4228 while (hardcount < max && scan < loceol &&
4229 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4230 scan += UTF8SKIP(scan);
4234 while (scan < loceol && !isSPACE_LC(*scan))
4241 LOAD_UTF8_CHARCLASS_DIGIT();
4242 while (hardcount < max && scan < loceol &&
4243 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4244 scan += UTF8SKIP(scan);
4248 while (scan < loceol && isDIGIT(*scan))
4255 LOAD_UTF8_CHARCLASS_DIGIT();
4256 while (hardcount < max && scan < loceol &&
4257 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4258 scan += UTF8SKIP(scan);
4262 while (scan < loceol && !isDIGIT(*scan))
4266 default: /* Called on something of 0 width. */
4267 break; /* So match right here or not at all. */
4273 c = scan - PL_reginput;
4278 SV *prop = sv_newmortal();
4281 PerlIO_printf(Perl_debug_log,
4282 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4283 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4290 - regrepeat_hard - repeatedly match something, report total lenth and length
4292 * The repeater is supposed to have constant non-zero length.
4296 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4298 register char *scan = Nullch;
4299 register char *start;
4300 register char *loceol = PL_regeol;
4302 I32 count = 0, res = 1;
4307 start = PL_reginput;
4308 if (PL_reg_match_utf8) {
4309 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4312 while (start < PL_reginput) {
4314 start += UTF8SKIP(start);
4325 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4327 *lp = l = PL_reginput - start;
4328 if (max != REG_INFTY && l*max < loceol - scan)
4329 loceol = scan + l*max;
4342 - regclass_swash - prepare the utf8 swash
4346 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4352 if (PL_regdata && PL_regdata->count) {
4353 const U32 n = ARG(node);
4355 if (PL_regdata->what[n] == 's') {
4356 SV *rv = (SV*)PL_regdata->data[n];
4357 AV *av = (AV*)SvRV((SV*)rv);
4358 SV **ary = AvARRAY(av);
4361 /* See the end of regcomp.c:S_reglass() for
4362 * documentation of these array elements. */
4365 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4366 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4370 else if (si && doinit) {
4371 sw = swash_init("utf8", "", si, 1, 0);
4372 (void)av_store(av, 1, sw);
4388 - reginclass - determine if a character falls into a character class
4390 The n is the ANYOF regnode, the p is the target string, lenp
4391 is pointer to the maximum length of how far to go in the p
4392 (if the lenp is zero, UTF8SKIP(p) is used),
4393 do_utf8 tells whether the target string is in UTF-8.
4398 S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4400 const char flags = ANYOF_FLAGS(n);
4406 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4407 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
4408 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
4410 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4411 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4414 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4415 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4418 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4422 SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av);
4425 if (swash_fetch(sw, p, do_utf8))
4427 else if (flags & ANYOF_FOLD) {
4428 if (!match && lenp && av) {
4431 for (i = 0; i <= av_len(av); i++) {
4432 SV* sv = *av_fetch(av, i, FALSE);
4434 const char *s = SvPV(sv, len);
4436 if (len <= plen && memEQ(s, (char*)p, len)) {
4444 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4447 to_utf8_fold(p, tmpbuf, &tmplen);
4448 if (swash_fetch(sw, tmpbuf, do_utf8))
4454 if (match && lenp && *lenp == 0)
4455 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4457 if (!match && c < 256) {
4458 if (ANYOF_BITMAP_TEST(n, c))
4460 else if (flags & ANYOF_FOLD) {
4463 if (flags & ANYOF_LOCALE) {
4464 PL_reg_flags |= RF_tainted;
4465 f = PL_fold_locale[c];
4469 if (f != c && ANYOF_BITMAP_TEST(n, f))
4473 if (!match && (flags & ANYOF_CLASS)) {
4474 PL_reg_flags |= RF_tainted;
4476 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4490 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4491 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4492 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4493 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4494 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4495 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4496 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4497 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4498 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4499 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4500 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4501 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4502 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4503 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4504 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4505 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4506 ) /* How's that for a conditional? */
4513 return (flags & ANYOF_INVERT) ? !match : match;
4517 S_reghop(pTHX_ U8 *s, I32 off)
4519 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4523 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4526 while (off-- && s < lim) {
4527 /* XXX could check well-formedness here */
4535 if (UTF8_IS_CONTINUED(*s)) {
4536 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4539 /* XXX could check well-formedness here */
4547 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4549 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4553 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4556 while (off-- && s < lim) {
4557 /* XXX could check well-formedness here */
4567 if (UTF8_IS_CONTINUED(*s)) {
4568 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4571 /* XXX could check well-formedness here */
4583 restore_pos(pTHX_ void *arg)
4585 (void)arg; /* unused */
4586 if (PL_reg_eval_set) {
4587 if (PL_reg_oldsaved) {
4588 PL_reg_re->subbeg = PL_reg_oldsaved;
4589 PL_reg_re->sublen = PL_reg_oldsavedlen;
4590 RX_MATCH_COPIED_on(PL_reg_re);
4592 PL_reg_magic->mg_len = PL_reg_oldpos;
4593 PL_reg_eval_set = 0;
4594 PL_curpm = PL_reg_oldcurpm;
4599 S_to_utf8_substr(pTHX_ register regexp *prog)
4602 if (prog->float_substr && !prog->float_utf8) {
4603 prog->float_utf8 = sv = newSVsv(prog->float_substr);
4604 sv_utf8_upgrade(sv);
4605 if (SvTAIL(prog->float_substr))
4607 if (prog->float_substr == prog->check_substr)
4608 prog->check_utf8 = sv;
4610 if (prog->anchored_substr && !prog->anchored_utf8) {
4611 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
4612 sv_utf8_upgrade(sv);
4613 if (SvTAIL(prog->anchored_substr))
4615 if (prog->anchored_substr == prog->check_substr)
4616 prog->check_utf8 = sv;
4621 S_to_byte_substr(pTHX_ register regexp *prog)
4624 if (prog->float_utf8 && !prog->float_substr) {
4625 prog->float_substr = sv = newSVsv(prog->float_utf8);
4626 if (sv_utf8_downgrade(sv, TRUE)) {
4627 if (SvTAIL(prog->float_utf8))
4631 prog->float_substr = sv = &PL_sv_undef;
4633 if (prog->float_utf8 == prog->check_utf8)
4634 prog->check_substr = sv;
4636 if (prog->anchored_utf8 && !prog->anchored_substr) {
4637 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
4638 if (sv_utf8_downgrade(sv, TRUE)) {
4639 if (SvTAIL(prog->anchored_utf8))
4643 prog->anchored_substr = sv = &PL_sv_undef;
4645 if (prog->anchored_utf8 == prog->check_utf8)
4646 prog->check_substr = sv;
4652 * c-indentation-style: bsd
4654 * indent-tabs-mode: t
4657 * ex: set ts=8 sts=4 sw=4 noet: