5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
78 **** Alterations to Henry's code are...
80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
81 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
91 #define PERL_IN_REGEXEC_C
96 #define RF_tainted 1 /* tainted information used? */
97 #define RF_warned 2 /* warned about big count? */
98 #define RF_evaled 4 /* Did an EVAL with setting? */
99 #define RF_utf8 8 /* String contains multibyte chars? */
101 #define UTF ((PL_reg_flags & RF_utf8) != 0)
103 #define RS_init 1 /* eval environment created */
104 #define RS_set 2 /* replsv value is set */
107 #define STATIC static
110 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
116 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
117 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
119 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
121 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
123 #define HOPc(pos,off) ((char*)HOP(pos,off))
124 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
126 #define HOPBACK(pos, off) ( \
127 (PL_reg_match_utf8) \
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
133 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
135 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
137 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
142 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((U8*)str); assert(ok); LEAVE; } } STMT_END
144 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
149 /* for use after a quantifier and before an EXACT-like node -- japhy */
150 #define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
157 #define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
165 #define FIND_NEXT_IMPT(rn) STMT_START { \
166 while (JUMPABLE(rn)) \
167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
168 rn = NEXTOPER(NEXTOPER(rn)); \
169 else if (OP(rn) == PLUS) \
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
173 else rn += NEXT_OFF(rn); \
176 static void restore_pos(pTHX_ void *arg);
179 S_regcppush(pTHX_ I32 parenfloor)
181 const int retval = PL_savestack_ix;
182 #define REGCP_PAREN_ELEMS 4
183 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
186 if (paren_elems_to_push < 0)
187 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
189 #define REGCP_OTHER_ELEMS 6
190 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
191 for (p = PL_regsize; p > parenfloor; p--) {
192 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
193 SSPUSHINT(PL_regendp[p]);
194 SSPUSHINT(PL_regstartp[p]);
195 SSPUSHPTR(PL_reg_start_tmp[p]);
198 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
199 SSPUSHINT(PL_regsize);
200 SSPUSHINT(*PL_reglastparen);
201 SSPUSHINT(*PL_reglastcloseparen);
202 SSPUSHPTR(PL_reginput);
203 #define REGCP_FRAME_ELEMS 2
204 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
205 * are needed for the regexp context stack bookkeeping. */
206 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
207 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
212 /* These are needed since we do not localize EVAL nodes: */
213 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
214 " Setting an EVAL scope, savestack=%"IVdf"\n", \
215 (IV)PL_savestack_ix)); cp = PL_savestack_ix
217 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
218 PerlIO_printf(Perl_debug_log, \
219 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
220 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
229 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
231 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
232 i = SSPOPINT; /* Parentheses elements to pop. */
233 input = (char *) SSPOPPTR;
234 *PL_reglastcloseparen = SSPOPINT;
235 *PL_reglastparen = SSPOPINT;
236 PL_regsize = SSPOPINT;
238 /* Now restore the parentheses context. */
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
242 paren = (U32)SSPOPINT;
243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
244 PL_regstartp[paren] = SSPOPINT;
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
249 PerlIO_printf(Perl_debug_log,
250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
251 (UV)paren, (IV)PL_regstartp[paren],
252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
253 (IV)PL_regendp[paren],
254 (paren > *PL_reglastparen ? "(no)" : ""));
258 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
259 PerlIO_printf(Perl_debug_log,
260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
261 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
275 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
276 if ((I32)paren > PL_regsize)
277 PL_regstartp[paren] = -1;
278 PL_regendp[paren] = -1;
285 S_regcp_set_to(pTHX_ I32 ss)
287 const I32 tmp = PL_savestack_ix;
289 PL_savestack_ix = ss;
291 PL_savestack_ix = tmp;
295 typedef struct re_cc_state
299 struct re_cc_state *prev;
304 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
306 #define TRYPAREN(paren, n, input) { \
309 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
310 PL_regendp[paren] = input - PL_bostr; \
313 PL_regendp[paren] = -1; \
315 if (regmatch(next)) \
318 PL_regendp[paren] = -1; \
323 * pregexec and friends
327 - pregexec - match a regexp against a string
330 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
331 char *strbeg, I32 minend, SV *screamer, U32 nosave)
332 /* strend: pointer to null at end of string */
333 /* strbeg: real beginning of string */
334 /* minend: end of match must be >=minend after stringarg. */
335 /* nosave: For optimizations. */
338 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
339 nosave ? 0 : REXEC_COPY_STR);
343 S_cache_re(pTHX_ regexp *prog)
345 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
347 PL_regprogram = prog->program;
349 PL_regnpar = prog->nparens;
350 PL_regdata = prog->data;
355 * Need to implement the following flags for reg_anch:
357 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
359 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
360 * INTUIT_AUTORITATIVE_ML
361 * INTUIT_ONCE_NOML - Intuit can match in one location only.
364 * Another flag for this function: SECOND_TIME (so that float substrs
365 * with giant delta may be not rechecked).
368 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
370 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
371 Otherwise, only SvCUR(sv) is used to get strbeg. */
373 /* XXXX We assume that strpos is strbeg unless sv. */
375 /* XXXX Some places assume that there is a fixed substring.
376 An update may be needed if optimizer marks as "INTUITable"
377 RExen without fixed substrings. Similarly, it is assumed that
378 lengths of all the strings are no more than minlen, thus they
379 cannot come from lookahead.
380 (Or minlen should take into account lookahead.) */
382 /* A failure to find a constant substring means that there is no need to make
383 an expensive call to REx engine, thus we celebrate a failure. Similarly,
384 finding a substring too deep into the string means that less calls to
385 regtry() should be needed.
387 REx compiler's optimizer found 4 possible hints:
388 a) Anchored substring;
390 c) Whether we are anchored (beginning-of-line or \G);
391 d) First node (of those at offset 0) which may distingush positions;
392 We use a)b)d) and multiline-part of c), and try to find a position in the
393 string which does not contradict any of them.
396 /* Most of decisions we do here should have been done at compile time.
397 The nodes of the REx which we used for the search should have been
398 deleted from the finite automaton. */
401 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
402 char *strend, U32 flags, re_scream_pos_data *data)
404 register I32 start_shift = 0;
405 /* Should be nonnegative! */
406 register I32 end_shift = 0;
411 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
413 register char *other_last = Nullch; /* other substr checked before this */
414 char *check_at = Nullch; /* check substr found at this pos */
416 const char * const i_strpos = strpos;
417 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
419 RX_MATCH_UTF8_set(prog,do_utf8);
421 if (prog->reganch & ROPT_UTF8) {
422 DEBUG_r(PerlIO_printf(Perl_debug_log,
423 "UTF-8 regex...\n"));
424 PL_reg_flags |= RF_utf8;
428 const char *s = PL_reg_match_utf8 ?
429 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
431 const int len = PL_reg_match_utf8 ?
432 strlen(s) : strend - strpos;
435 if (PL_reg_match_utf8)
436 DEBUG_r(PerlIO_printf(Perl_debug_log,
437 "UTF-8 target...\n"));
438 PerlIO_printf(Perl_debug_log,
439 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
440 PL_colors[4],PL_colors[5],PL_colors[0],
443 (strlen(prog->precomp) > 60 ? "..." : ""),
445 (int)(len > 60 ? 60 : len),
447 (len > 60 ? "..." : "")
451 /* CHR_DIST() would be more correct here but it makes things slow. */
452 if (prog->minlen > strend - strpos) {
453 DEBUG_r(PerlIO_printf(Perl_debug_log,
454 "String too short... [re_intuit_start]\n"));
457 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
460 if (!prog->check_utf8 && prog->check_substr)
461 to_utf8_substr(prog);
462 check = prog->check_utf8;
464 if (!prog->check_substr && prog->check_utf8)
465 to_byte_substr(prog);
466 check = prog->check_substr;
468 if (check == &PL_sv_undef) {
469 DEBUG_r(PerlIO_printf(Perl_debug_log,
470 "Non-utf string cannot match utf check string\n"));
473 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
474 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
475 || ( (prog->reganch & ROPT_ANCH_BOL)
476 && !PL_multiline ) ); /* Check after \n? */
479 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
480 | ROPT_IMPLICIT)) /* not a real BOL */
481 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
483 && (strpos != strbeg)) {
484 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
487 if (prog->check_offset_min == prog->check_offset_max &&
488 !(prog->reganch & ROPT_CANY_SEEN)) {
489 /* Substring at constant offset from beg-of-str... */
492 s = HOP3c(strpos, prog->check_offset_min, strend);
494 slen = SvCUR(check); /* >= 1 */
496 if ( strend - s > slen || strend - s < slen - 1
497 || (strend - s == slen && strend[-1] != '\n')) {
498 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
501 /* Now should match s[0..slen-2] */
503 if (slen && (*SvPVX_const(check) != *s
505 && memNE(SvPVX_const(check), s, slen)))) {
507 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
511 else if (*SvPVX_const(check) != *s
512 || ((slen = SvCUR(check)) > 1
513 && memNE(SvPVX_const(check), s, slen)))
516 goto success_at_start;
519 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
521 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
522 end_shift = prog->minlen - start_shift -
523 CHR_SVLEN(check) + (SvTAIL(check) != 0);
525 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
526 - (SvTAIL(check) != 0);
527 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
529 if (end_shift < eshift)
533 else { /* Can match at random position */
536 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
537 /* Should be nonnegative! */
538 end_shift = prog->minlen - start_shift -
539 CHR_SVLEN(check) + (SvTAIL(check) != 0);
542 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
544 Perl_croak(aTHX_ "panic: end_shift");
548 /* Find a possible match in the region s..strend by looking for
549 the "check" substring in the region corrected by start/end_shift. */
550 if (flags & REXEC_SCREAM) {
551 I32 p = -1; /* Internal iterator of scream. */
552 I32 * const pp = data ? data->scream_pos : &p;
554 if (PL_screamfirst[BmRARE(check)] >= 0
555 || ( BmRARE(check) == '\n'
556 && (BmPREVIOUS(check) == SvCUR(check) - 1)
558 s = screaminstr(sv, check,
559 start_shift + (s - strbeg), end_shift, pp, 0);
562 /* we may be pointing at the wrong string */
563 if (s && RX_MATCH_COPIED(prog))
564 s = strbeg + (s - SvPVX_const(sv));
566 *data->scream_olds = s;
568 else if (prog->reganch & ROPT_CANY_SEEN)
569 s = fbm_instr((U8*)(s + start_shift),
570 (U8*)(strend - end_shift),
571 check, PL_multiline ? FBMrf_MULTILINE : 0);
573 s = fbm_instr(HOP3(s, start_shift, strend),
574 HOP3(strend, -end_shift, strbeg),
575 check, PL_multiline ? FBMrf_MULTILINE : 0);
577 /* Update the count-of-usability, remove useless subpatterns,
580 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint. */
581 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
582 (s ? "Found" : "Did not find"),
583 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
585 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
587 PL_colors[1], (SvTAIL(check) ? "$" : ""),
588 (s ? " at offset " : "...\n") ) );
595 /* Finish the diagnostic message */
596 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
598 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
599 Start with the other substr.
600 XXXX no SCREAM optimization yet - and a very coarse implementation
601 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
602 *always* match. Probably should be marked during compile...
603 Probably it is right to do no SCREAM here...
606 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
607 /* Take into account the "other" substring. */
608 /* XXXX May be hopelessly wrong for UTF... */
611 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
614 char * const last = HOP3c(s, -start_shift, strbeg);
619 t = s - prog->check_offset_max;
620 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
622 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
627 t = HOP3c(t, prog->anchored_offset, strend);
628 if (t < other_last) /* These positions already checked */
630 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
633 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
634 /* On end-of-str: see comment below. */
635 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
636 if (must == &PL_sv_undef) {
638 DEBUG_r(must = prog->anchored_utf8); /* for debug */
643 HOP3(HOP3(last1, prog->anchored_offset, strend)
644 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
646 PL_multiline ? FBMrf_MULTILINE : 0
648 DEBUG_r(PerlIO_printf(Perl_debug_log,
649 "%s anchored substr \"%s%.*s%s\"%s",
650 (s ? "Found" : "Contradicts"),
653 - (SvTAIL(must)!=0)),
655 PL_colors[1], (SvTAIL(must) ? "$" : "")));
657 if (last1 >= last2) {
658 DEBUG_r(PerlIO_printf(Perl_debug_log,
659 ", giving up...\n"));
662 DEBUG_r(PerlIO_printf(Perl_debug_log,
663 ", trying floating at offset %ld...\n",
664 (long)(HOP3c(s1, 1, strend) - i_strpos)));
665 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
666 s = HOP3c(last, 1, strend);
670 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
671 (long)(s - i_strpos)));
672 t = HOP3c(s, -prog->anchored_offset, strbeg);
673 other_last = HOP3c(s, 1, strend);
681 else { /* Take into account the floating substring. */
686 t = HOP3c(s, -start_shift, strbeg);
688 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
689 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
690 last = HOP3c(t, prog->float_max_offset, strend);
691 s = HOP3c(t, prog->float_min_offset, strend);
694 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
695 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
696 /* fbm_instr() takes into account exact value of end-of-str
697 if the check is SvTAIL(ed). Since false positives are OK,
698 and end-of-str is not later than strend we are OK. */
699 if (must == &PL_sv_undef) {
701 DEBUG_r(must = prog->float_utf8); /* for debug message */
704 s = fbm_instr((unsigned char*)s,
705 (unsigned char*)last + SvCUR(must)
707 must, PL_multiline ? FBMrf_MULTILINE : 0);
708 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint */
709 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
710 (s ? "Found" : "Contradicts"),
712 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
714 PL_colors[1], (SvTAIL(must) ? "$" : "")));
717 DEBUG_r(PerlIO_printf(Perl_debug_log,
718 ", giving up...\n"));
721 DEBUG_r(PerlIO_printf(Perl_debug_log,
722 ", trying anchored starting at offset %ld...\n",
723 (long)(s1 + 1 - i_strpos)));
725 s = HOP3c(t, 1, strend);
729 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
730 (long)(s - i_strpos)));
731 other_last = s; /* Fix this later. --Hugo */
740 t = s - prog->check_offset_max;
741 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
743 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
745 /* Fixed substring is found far enough so that the match
746 cannot start at strpos. */
748 if (ml_anch && t[-1] != '\n') {
749 /* Eventually fbm_*() should handle this, but often
750 anchored_offset is not 0, so this check will not be wasted. */
751 /* XXXX In the code below we prefer to look for "^" even in
752 presence of anchored substrings. And we search even
753 beyond the found float position. These pessimizations
754 are historical artefacts only. */
756 while (t < strend - prog->minlen) {
758 if (t < check_at - prog->check_offset_min) {
759 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
760 /* Since we moved from the found position,
761 we definitely contradict the found anchored
762 substr. Due to the above check we do not
763 contradict "check" substr.
764 Thus we can arrive here only if check substr
765 is float. Redo checking for "other"=="fixed".
768 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
769 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
770 goto do_other_anchored;
772 /* We don't contradict the found floating substring. */
773 /* XXXX Why not check for STCLASS? */
775 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
776 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
779 /* Position contradicts check-string */
780 /* XXXX probably better to look for check-string
781 than for "\n", so one should lower the limit for t? */
782 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
783 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
784 other_last = strpos = s = t + 1;
789 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
790 PL_colors[0],PL_colors[1]));
794 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
795 PL_colors[0],PL_colors[1]));
799 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
802 /* The found string does not prohibit matching at strpos,
803 - no optimization of calling REx engine can be performed,
804 unless it was an MBOL and we are not after MBOL,
805 or a future STCLASS check will fail this. */
807 /* Even in this situation we may use MBOL flag if strpos is offset
808 wrt the start of the string. */
809 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
810 && (strpos != strbeg) && strpos[-1] != '\n'
811 /* May be due to an implicit anchor of m{.*foo} */
812 && !(prog->reganch & ROPT_IMPLICIT))
817 DEBUG_r( if (ml_anch)
818 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
819 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
822 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
824 prog->check_utf8 /* Could be deleted already */
825 && --BmUSEFUL(prog->check_utf8) < 0
826 && (prog->check_utf8 == prog->float_utf8)
828 prog->check_substr /* Could be deleted already */
829 && --BmUSEFUL(prog->check_substr) < 0
830 && (prog->check_substr == prog->float_substr)
833 /* If flags & SOMETHING - do not do it many times on the same match */
834 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
835 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
836 if (do_utf8 ? prog->check_substr : prog->check_utf8)
837 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
838 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
839 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
840 check = Nullsv; /* abort */
842 /* XXXX This is a remnant of the old implementation. It
843 looks wasteful, since now INTUIT can use many
845 prog->reganch &= ~RE_USE_INTUIT;
852 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
853 if (prog->regstclass) {
854 /* minlen == 0 is possible if regstclass is \b or \B,
855 and the fixed substr is ''$.
856 Since minlen is already taken into account, s+1 is before strend;
857 accidentally, minlen >= 1 guaranties no false positives at s + 1
858 even for \b or \B. But (minlen? 1 : 0) below assumes that
859 regstclass does not come from lookahead... */
860 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
861 This leaves EXACTF only, which is dealt with in find_byclass(). */
862 const U8* const str = (U8*)STRING(prog->regstclass);
863 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
864 ? CHR_DIST((U8 *)str+STR_LEN(prog->regstclass),
867 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
868 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
869 : (prog->float_substr || prog->float_utf8
870 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
876 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
879 const char *what = 0;
881 if (endpos == strend) {
882 DEBUG_r( PerlIO_printf(Perl_debug_log,
883 "Could not match STCLASS...\n") );
886 DEBUG_r( PerlIO_printf(Perl_debug_log,
887 "This position contradicts STCLASS...\n") );
888 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
890 /* Contradict one of substrings */
891 if (prog->anchored_substr || prog->anchored_utf8) {
892 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
893 DEBUG_r( what = "anchored" );
895 s = HOP3c(t, 1, strend);
896 if (s + start_shift + end_shift > strend) {
897 /* XXXX Should be taken into account earlier? */
898 DEBUG_r( PerlIO_printf(Perl_debug_log,
899 "Could not match STCLASS...\n") );
904 DEBUG_r( PerlIO_printf(Perl_debug_log,
905 "Looking for %s substr starting at offset %ld...\n",
906 what, (long)(s + start_shift - i_strpos)) );
909 /* Have both, check_string is floating */
910 if (t + start_shift >= check_at) /* Contradicts floating=check */
911 goto retry_floating_check;
912 /* Recheck anchored substring, but not floating... */
916 DEBUG_r( PerlIO_printf(Perl_debug_log,
917 "Looking for anchored substr starting at offset %ld...\n",
918 (long)(other_last - i_strpos)) );
919 goto do_other_anchored;
921 /* Another way we could have checked stclass at the
922 current position only: */
927 DEBUG_r( PerlIO_printf(Perl_debug_log,
928 "Looking for /%s^%s/m starting at offset %ld...\n",
929 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
932 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
934 /* Check is floating subtring. */
935 retry_floating_check:
936 t = check_at - start_shift;
937 DEBUG_r( what = "floating" );
938 goto hop_and_restart;
941 DEBUG_r(PerlIO_printf(Perl_debug_log,
942 "By STCLASS: moving %ld --> %ld\n",
943 (long)(t - i_strpos), (long)(s - i_strpos))
947 DEBUG_r(PerlIO_printf(Perl_debug_log,
948 "Does not contradict STCLASS...\n");
953 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
954 PL_colors[4], (check ? "Guessed" : "Giving up"),
955 PL_colors[5], (long)(s - i_strpos)) );
958 fail_finish: /* Substring not found */
959 if (prog->check_substr || prog->check_utf8) /* could be removed already */
960 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
962 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
963 PL_colors[4],PL_colors[5]));
967 /* We know what class REx starts with. Try to find this position... */
969 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
971 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
975 register STRLEN uskip;
979 register I32 tmp = 1; /* Scratch variable? */
980 register const bool do_utf8 = PL_reg_match_utf8;
982 /* We know what class it must start with. */
986 while (s + (uskip = UTF8SKIP(s)) <= strend) {
987 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
988 !UTF8_IS_INVARIANT((U8)s[0]) ?
989 reginclass(c, (U8*)s, 0, do_utf8) :
990 REGINCLASS(c, (U8*)s)) {
991 if (tmp && (norun || regtry(prog, s)))
1002 while (s < strend) {
1005 if (REGINCLASS(c, (U8*)s) ||
1006 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1007 /* The assignment of 2 is intentional:
1008 * for the folded sharp s, the skip is 2. */
1009 (skip = SHARP_S_SKIP))) {
1010 if (tmp && (norun || regtry(prog, s)))
1022 while (s < strend) {
1023 if (tmp && (norun || regtry(prog, s)))
1032 ln = STR_LEN(c); /* length to match in octets/bytes */
1033 lnc = (I32) ln; /* length to match in characters */
1035 STRLEN ulen1, ulen2;
1037 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1038 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1039 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1041 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1042 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1044 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1046 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1049 while (sm < ((U8 *) m + ln)) {
1064 c2 = PL_fold_locale[c1];
1066 e = HOP3c(strend, -((I32)lnc), s);
1069 e = s; /* Due to minlen logic of intuit() */
1071 /* The idea in the EXACTF* cases is to first find the
1072 * first character of the EXACTF* node and then, if
1073 * necessary, case-insensitively compare the full
1074 * text of the node. The c1 and c2 are the first
1075 * characters (though in Unicode it gets a bit
1076 * more complicated because there are more cases
1077 * than just upper and lower: one needs to use
1078 * the so-called folding case for case-insensitive
1079 * matching (called "loose matching" in Unicode).
1080 * ibcmp_utf8() will do just that. */
1084 U8 tmpbuf [UTF8_MAXBYTES+1];
1085 STRLEN len, foldlen;
1086 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1088 /* Upper and lower of 1st char are equal -
1089 * probably not a "letter". */
1091 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1095 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1096 m, (char **)0, ln, (bool)UTF))
1097 && (norun || regtry(prog, s)) )
1100 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1101 uvchr_to_utf8(tmpbuf, c);
1102 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1104 && (f == c1 || f == c2)
1105 && (ln == foldlen ||
1106 !ibcmp_utf8((char *) foldbuf,
1107 (char **)0, foldlen, do_utf8,
1109 (char **)0, ln, (bool)UTF))
1110 && (norun || regtry(prog, s)) )
1118 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1121 /* Handle some of the three Greek sigmas cases.
1122 * Note that not all the possible combinations
1123 * are handled here: some of them are handled
1124 * by the standard folding rules, and some of
1125 * them (the character class or ANYOF cases)
1126 * are handled during compiletime in
1127 * regexec.c:S_regclass(). */
1128 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1129 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1130 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1132 if ( (c == c1 || c == c2)
1134 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1135 m, (char **)0, ln, (bool)UTF))
1136 && (norun || regtry(prog, s)) )
1139 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1140 uvchr_to_utf8(tmpbuf, c);
1141 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1143 && (f == c1 || f == c2)
1144 && (ln == foldlen ||
1145 !ibcmp_utf8((char *) foldbuf,
1146 (char **)0, foldlen, do_utf8,
1148 (char **)0, ln, (bool)UTF))
1149 && (norun || regtry(prog, s)) )
1160 && (ln == 1 || !(OP(c) == EXACTF
1162 : ibcmp_locale(s, m, ln)))
1163 && (norun || regtry(prog, s)) )
1169 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1170 && (ln == 1 || !(OP(c) == EXACTF
1172 : ibcmp_locale(s, m, ln)))
1173 && (norun || regtry(prog, s)) )
1180 PL_reg_flags |= RF_tainted;
1187 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1189 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1191 tmp = ((OP(c) == BOUND ?
1192 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1193 LOAD_UTF8_CHARCLASS_ALNUM();
1194 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1195 if (tmp == !(OP(c) == BOUND ?
1196 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1197 isALNUM_LC_utf8((U8*)s)))
1200 if ((norun || regtry(prog, s)))
1207 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1208 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1209 while (s < strend) {
1211 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1213 if ((norun || regtry(prog, s)))
1219 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1223 PL_reg_flags |= RF_tainted;
1230 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1232 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1234 tmp = ((OP(c) == NBOUND ?
1235 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1236 LOAD_UTF8_CHARCLASS_ALNUM();
1237 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1238 if (tmp == !(OP(c) == NBOUND ?
1239 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1240 isALNUM_LC_utf8((U8*)s)))
1242 else if ((norun || regtry(prog, s)))
1248 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1249 tmp = ((OP(c) == NBOUND ?
1250 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1251 while (s < strend) {
1253 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1255 else if ((norun || regtry(prog, s)))
1260 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1265 LOAD_UTF8_CHARCLASS_ALNUM();
1266 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1267 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1268 if (tmp && (norun || regtry(prog, s)))
1279 while (s < strend) {
1281 if (tmp && (norun || regtry(prog, s)))
1293 PL_reg_flags |= RF_tainted;
1295 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1296 if (isALNUM_LC_utf8((U8*)s)) {
1297 if (tmp && (norun || regtry(prog, s)))
1308 while (s < strend) {
1309 if (isALNUM_LC(*s)) {
1310 if (tmp && (norun || regtry(prog, s)))
1323 LOAD_UTF8_CHARCLASS_ALNUM();
1324 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1325 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1326 if (tmp && (norun || regtry(prog, s)))
1337 while (s < strend) {
1339 if (tmp && (norun || regtry(prog, s)))
1351 PL_reg_flags |= RF_tainted;
1353 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1354 if (!isALNUM_LC_utf8((U8*)s)) {
1355 if (tmp && (norun || regtry(prog, s)))
1366 while (s < strend) {
1367 if (!isALNUM_LC(*s)) {
1368 if (tmp && (norun || regtry(prog, s)))
1381 LOAD_UTF8_CHARCLASS_SPACE();
1382 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1383 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1384 if (tmp && (norun || regtry(prog, s)))
1395 while (s < strend) {
1397 if (tmp && (norun || regtry(prog, s)))
1409 PL_reg_flags |= RF_tainted;
1411 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1412 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1413 if (tmp && (norun || regtry(prog, s)))
1424 while (s < strend) {
1425 if (isSPACE_LC(*s)) {
1426 if (tmp && (norun || regtry(prog, s)))
1439 LOAD_UTF8_CHARCLASS_SPACE();
1440 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1441 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1442 if (tmp && (norun || regtry(prog, s)))
1453 while (s < strend) {
1455 if (tmp && (norun || regtry(prog, s)))
1467 PL_reg_flags |= RF_tainted;
1469 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1470 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1471 if (tmp && (norun || regtry(prog, s)))
1482 while (s < strend) {
1483 if (!isSPACE_LC(*s)) {
1484 if (tmp && (norun || regtry(prog, s)))
1497 LOAD_UTF8_CHARCLASS_DIGIT();
1498 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1499 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1500 if (tmp && (norun || regtry(prog, s)))
1511 while (s < strend) {
1513 if (tmp && (norun || regtry(prog, s)))
1525 PL_reg_flags |= RF_tainted;
1527 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1528 if (isDIGIT_LC_utf8((U8*)s)) {
1529 if (tmp && (norun || regtry(prog, s)))
1540 while (s < strend) {
1541 if (isDIGIT_LC(*s)) {
1542 if (tmp && (norun || regtry(prog, s)))
1555 LOAD_UTF8_CHARCLASS_DIGIT();
1556 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1557 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1558 if (tmp && (norun || regtry(prog, s)))
1569 while (s < strend) {
1571 if (tmp && (norun || regtry(prog, s)))
1583 PL_reg_flags |= RF_tainted;
1585 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1586 if (!isDIGIT_LC_utf8((U8*)s)) {
1587 if (tmp && (norun || regtry(prog, s)))
1598 while (s < strend) {
1599 if (!isDIGIT_LC(*s)) {
1600 if (tmp && (norun || regtry(prog, s)))
1612 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1621 - regexec_flags - match a regexp against a string
1624 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1625 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1626 /* strend: pointer to null at end of string */
1627 /* strbeg: real beginning of string */
1628 /* minend: end of match must be >=minend after stringarg. */
1629 /* data: May be used for some additional optimizations. */
1630 /* nosave: For optimizations. */
1633 register regnode *c;
1634 register char *startpos = stringarg;
1635 I32 minlen; /* must match at least this many chars */
1636 I32 dontbother = 0; /* how many characters not to try at end */
1637 I32 end_shift = 0; /* Same for the end. */ /* CC */
1638 I32 scream_pos = -1; /* Internal iterator of scream. */
1640 SV* oreplsv = GvSV(PL_replgv);
1641 const bool do_utf8 = DO_UTF8(sv);
1643 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1644 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1646 PERL_UNUSED_ARG(data);
1647 RX_MATCH_UTF8_set(prog,do_utf8);
1653 PL_regnarrate = DEBUG_r_TEST;
1656 /* Be paranoid... */
1657 if (prog == NULL || startpos == NULL) {
1658 Perl_croak(aTHX_ "NULL regexp parameter");
1662 minlen = prog->minlen;
1663 if (strend - startpos < minlen) {
1664 DEBUG_r(PerlIO_printf(Perl_debug_log,
1665 "String too short [regexec_flags]...\n"));
1669 /* Check validity of program. */
1670 if (UCHARAT(prog->program) != REG_MAGIC) {
1671 Perl_croak(aTHX_ "corrupted regexp program");
1675 PL_reg_eval_set = 0;
1678 if (prog->reganch & ROPT_UTF8)
1679 PL_reg_flags |= RF_utf8;
1681 /* Mark beginning of line for ^ and lookbehind. */
1682 PL_regbol = startpos;
1686 /* Mark end of line for $ (and such) */
1689 /* see how far we have to get to not match where we matched before */
1690 PL_regtill = startpos+minend;
1692 /* We start without call_cc context. */
1695 /* If there is a "must appear" string, look for it. */
1698 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1701 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1702 PL_reg_ganch = startpos;
1703 else if (sv && SvTYPE(sv) >= SVt_PVMG
1705 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1706 && mg->mg_len >= 0) {
1707 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1708 if (prog->reganch & ROPT_ANCH_GPOS) {
1709 if (s > PL_reg_ganch)
1714 else /* pos() not defined */
1715 PL_reg_ganch = strbeg;
1718 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1719 re_scream_pos_data d;
1721 d.scream_olds = &scream_olds;
1722 d.scream_pos = &scream_pos;
1723 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1725 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1726 goto phooey; /* not present */
1731 const char * const s0 = UTF
1732 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1735 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1736 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1737 UNI_DISPLAY_REGEX) : startpos;
1738 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1741 PerlIO_printf(Perl_debug_log,
1742 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1743 PL_colors[4],PL_colors[5],PL_colors[0],
1746 len0 > 60 ? "..." : "",
1748 (int)(len1 > 60 ? 60 : len1),
1750 (len1 > 60 ? "..." : "")
1754 /* Simplest case: anchored match need be tried only once. */
1755 /* [unless only anchor is BOL and multiline is set] */
1756 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1757 if (s == startpos && regtry(prog, startpos))
1759 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1760 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1765 dontbother = minlen - 1;
1766 end = HOP3c(strend, -dontbother, strbeg) - 1;
1767 /* for multiline we only have to try after newlines */
1768 if (prog->check_substr || prog->check_utf8) {
1772 if (regtry(prog, s))
1777 if (prog->reganch & RE_USE_INTUIT) {
1778 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1789 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1790 if (regtry(prog, s))
1797 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1798 if (regtry(prog, PL_reg_ganch))
1803 /* Messy cases: unanchored match. */
1804 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1805 /* we have /x+whatever/ */
1806 /* it must be a one character string (XXXX Except UTF?) */
1811 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1812 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1813 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1816 while (s < strend) {
1818 DEBUG_r( did_match = 1 );
1819 if (regtry(prog, s)) goto got_it;
1821 while (s < strend && *s == ch)
1828 while (s < strend) {
1830 DEBUG_r( did_match = 1 );
1831 if (regtry(prog, s)) goto got_it;
1833 while (s < strend && *s == ch)
1839 DEBUG_r(if (!did_match)
1840 PerlIO_printf(Perl_debug_log,
1841 "Did not find anchored character...\n")
1844 else if (prog->anchored_substr != Nullsv
1845 || prog->anchored_utf8 != Nullsv
1846 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1847 && prog->float_max_offset < strend - s)) {
1852 char *last1; /* Last position checked before */
1856 if (prog->anchored_substr || prog->anchored_utf8) {
1857 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1858 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1859 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1860 back_max = back_min = prog->anchored_offset;
1862 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1863 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1864 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1865 back_max = prog->float_max_offset;
1866 back_min = prog->float_min_offset;
1868 if (must == &PL_sv_undef)
1869 /* could not downgrade utf8 check substring, so must fail */
1872 last = HOP3c(strend, /* Cannot start after this */
1873 -(I32)(CHR_SVLEN(must)
1874 - (SvTAIL(must) != 0) + back_min), strbeg);
1877 last1 = HOPc(s, -1);
1879 last1 = s - 1; /* bogus */
1881 /* XXXX check_substr already used to find "s", can optimize if
1882 check_substr==must. */
1884 dontbother = end_shift;
1885 strend = HOPc(strend, -dontbother);
1886 while ( (s <= last) &&
1887 ((flags & REXEC_SCREAM)
1888 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1889 end_shift, &scream_pos, 0))
1890 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1891 (unsigned char*)strend, must,
1892 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1893 /* we may be pointing at the wrong string */
1894 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1895 s = strbeg + (s - SvPVX_const(sv));
1896 DEBUG_r( did_match = 1 );
1897 if (HOPc(s, -back_max) > last1) {
1898 last1 = HOPc(s, -back_min);
1899 s = HOPc(s, -back_max);
1902 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1904 last1 = HOPc(s, -back_min);
1908 while (s <= last1) {
1909 if (regtry(prog, s))
1915 while (s <= last1) {
1916 if (regtry(prog, s))
1922 DEBUG_r(if (!did_match)
1923 PerlIO_printf(Perl_debug_log,
1924 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1925 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1926 ? "anchored" : "floating"),
1928 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1930 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1934 else if ((c = prog->regstclass)) {
1936 I32 op = (U8)OP(prog->regstclass);
1937 /* don't bother with what can't match */
1938 if (PL_regkind[op] != EXACT && op != CANY)
1939 strend = HOPc(strend, -(minlen - 1));
1942 SV *prop = sv_newmortal();
1950 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1951 UNI_DISPLAY_REGEX) :
1953 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1955 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1956 len1 = UTF ? SvCUR(dsv1) : strend - s;
1957 PerlIO_printf(Perl_debug_log,
1958 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1962 if (find_byclass(prog, c, s, strend, 0))
1964 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1968 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1973 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1974 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1975 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1977 if (flags & REXEC_SCREAM) {
1978 last = screaminstr(sv, float_real, s - strbeg,
1979 end_shift, &scream_pos, 1); /* last one */
1981 last = scream_olds; /* Only one occurrence. */
1982 /* we may be pointing at the wrong string */
1983 else if (RX_MATCH_COPIED(prog))
1984 s = strbeg + (s - SvPVX_const(sv));
1988 const char * const little = SvPV_const(float_real, len);
1990 if (SvTAIL(float_real)) {
1991 if (memEQ(strend - len + 1, little, len - 1))
1992 last = strend - len + 1;
1993 else if (!PL_multiline)
1994 last = memEQ(strend - len, little, len)
1995 ? strend - len : Nullch;
2001 last = rninstr(s, strend, little, little + len);
2003 last = strend; /* matching "$" */
2007 DEBUG_r(PerlIO_printf(Perl_debug_log,
2008 "%sCan't trim the tail, match fails (should not happen)%s\n",
2009 PL_colors[4],PL_colors[5]));
2010 goto phooey; /* Should not happen! */
2012 dontbother = strend - last + prog->float_min_offset;
2014 if (minlen && (dontbother < minlen))
2015 dontbother = minlen - 1;
2016 strend -= dontbother; /* this one's always in bytes! */
2017 /* We don't know much -- general case. */
2020 if (regtry(prog, s))
2029 if (regtry(prog, s))
2031 } while (s++ < strend);
2039 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2041 if (PL_reg_eval_set) {
2042 /* Preserve the current value of $^R */
2043 if (oreplsv != GvSV(PL_replgv))
2044 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2045 restored, the value remains
2047 restore_pos(aTHX_ 0);
2050 /* make sure $`, $&, $', and $digit will work later */
2051 if ( !(flags & REXEC_NOT_FIRST) ) {
2052 if (RX_MATCH_COPIED(prog)) {
2053 Safefree(prog->subbeg);
2054 RX_MATCH_COPIED_off(prog);
2056 if (flags & REXEC_COPY_STR) {
2057 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2059 s = savepvn(strbeg, i);
2062 RX_MATCH_COPIED_on(prog);
2065 prog->subbeg = strbeg;
2066 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2073 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2074 PL_colors[4],PL_colors[5]));
2075 if (PL_reg_eval_set)
2076 restore_pos(aTHX_ 0);
2081 - regtry - try match at specific point
2083 STATIC I32 /* 0 failure, 1 success */
2084 S_regtry(pTHX_ regexp *prog, char *startpos)
2092 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2094 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2097 PL_reg_eval_set = RS_init;
2099 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2100 (IV)(PL_stack_sp - PL_stack_base));
2102 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2103 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2104 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2106 /* Apparently this is not needed, judging by wantarray. */
2107 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2108 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2111 /* Make $_ available to executed code. */
2112 if (PL_reg_sv != DEFSV) {
2113 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
2118 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2119 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2120 /* prepare for quick setting of pos */
2121 sv_magic(PL_reg_sv, (SV*)0,
2122 PERL_MAGIC_regex_global, Nullch, 0);
2123 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2127 PL_reg_oldpos = mg->mg_len;
2128 SAVEDESTRUCTOR_X(restore_pos, 0);
2130 if (!PL_reg_curpm) {
2131 Newxz(PL_reg_curpm, 1, PMOP);
2134 SV* repointer = newSViv(0);
2135 /* so we know which PL_regex_padav element is PL_reg_curpm */
2136 SvFLAGS(repointer) |= SVf_BREAK;
2137 av_push(PL_regex_padav,repointer);
2138 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2139 PL_regex_pad = AvARRAY(PL_regex_padav);
2143 PM_SETRE(PL_reg_curpm, prog);
2144 PL_reg_oldcurpm = PL_curpm;
2145 PL_curpm = PL_reg_curpm;
2146 if (RX_MATCH_COPIED(prog)) {
2147 /* Here is a serious problem: we cannot rewrite subbeg,
2148 since it may be needed if this match fails. Thus
2149 $` inside (?{}) could fail... */
2150 PL_reg_oldsaved = prog->subbeg;
2151 PL_reg_oldsavedlen = prog->sublen;
2152 RX_MATCH_COPIED_off(prog);
2155 PL_reg_oldsaved = Nullch;
2156 prog->subbeg = PL_bostr;
2157 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2159 prog->startp[0] = startpos - PL_bostr;
2160 PL_reginput = startpos;
2161 PL_regstartp = prog->startp;
2162 PL_regendp = prog->endp;
2163 PL_reglastparen = &prog->lastparen;
2164 PL_reglastcloseparen = &prog->lastcloseparen;
2165 prog->lastparen = 0;
2166 prog->lastcloseparen = 0;
2168 DEBUG_r(PL_reg_starttry = startpos);
2169 if (PL_reg_start_tmpl <= prog->nparens) {
2170 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2171 if(PL_reg_start_tmp)
2172 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2174 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2177 /* XXXX What this code is doing here?!!! There should be no need
2178 to do this again and again, PL_reglastparen should take care of
2181 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2182 * Actually, the code in regcppop() (which Ilya may be meaning by
2183 * PL_reglastparen), is not needed at all by the test suite
2184 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2185 * enough, for building DynaLoader, or otherwise this
2186 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2187 * will happen. Meanwhile, this code *is* needed for the
2188 * above-mentioned test suite tests to succeed. The common theme
2189 * on those tests seems to be returning null fields from matches.
2194 if (prog->nparens) {
2195 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2202 if (regmatch(prog->program + 1)) {
2203 prog->endp[0] = PL_reginput - PL_bostr;
2206 REGCP_UNWIND(lastcp);
2210 #define RE_UNWIND_BRANCH 1
2211 #define RE_UNWIND_BRANCHJ 2
2215 typedef struct { /* XX: makes sense to enlarge it... */
2219 } re_unwind_generic_t;
2232 } re_unwind_branch_t;
2234 typedef union re_unwind_t {
2236 re_unwind_generic_t generic;
2237 re_unwind_branch_t branch;
2240 #define sayYES goto yes
2241 #define sayNO goto no
2242 #define sayNO_ANYOF goto no_anyof
2243 #define sayYES_FINAL goto yes_final
2244 #define sayYES_LOUD goto yes_loud
2245 #define sayNO_FINAL goto no_final
2246 #define sayNO_SILENT goto do_no
2247 #define saySAME(x) if (x) goto yes; else goto no
2249 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2250 #define POSCACHE_SEEN 1 /* we know what we're caching */
2251 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2252 #define CACHEsayYES STMT_START { \
2253 if (cache_offset | cache_bit) { \
2254 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2255 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2256 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2257 /* cache records failure, but this is success */ \
2259 PerlIO_printf(Perl_debug_log, \
2260 "%*s (remove success from failure cache)\n", \
2261 REPORT_CODE_OFF+PL_regindent*2, "") \
2263 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2268 #define CACHEsayNO STMT_START { \
2269 if (cache_offset | cache_bit) { \
2270 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2271 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2272 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2273 /* cache records success, but this is failure */ \
2275 PerlIO_printf(Perl_debug_log, \
2276 "%*s (remove failure from success cache)\n", \
2277 REPORT_CODE_OFF+PL_regindent*2, "") \
2279 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2285 #define REPORT_CODE_OFF 24
2288 - regmatch - main matching routine
2290 * Conceptually the strategy is simple: check to see whether the current
2291 * node matches, call self recursively to see whether the rest matches,
2292 * and then act accordingly. In practice we make some effort to avoid
2293 * recursion, in particular by going through "ordinary" nodes (that don't
2294 * need to know whether the rest of the match failed) by a loop instead of
2297 /* [lwall] I've hoisted the register declarations to the outer block in order to
2298 * maybe save a little bit of pushing and popping on the stack. It also takes
2299 * advantage of machines that use a register save mask on subroutine entry.
2301 STATIC I32 /* 0 failure, 1 success */
2302 S_regmatch(pTHX_ regnode *prog)
2304 register regnode *scan; /* Current node. */
2305 regnode *next; /* Next node. */
2306 regnode *inner; /* Next node in internal branch. */
2307 register I32 nextchr; /* renamed nextchr - nextchar colides with
2308 function of same name */
2309 register I32 n; /* no or next */
2310 register I32 ln = 0; /* len or last */
2311 register char *s = Nullch; /* operand or save */
2312 register char *locinput = PL_reginput;
2313 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2314 int minmod = 0, sw = 0, logical = 0;
2317 I32 firstcp = PL_savestack_ix;
2319 register const bool do_utf8 = PL_reg_match_utf8;
2321 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2322 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2323 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2325 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
2331 /* Note that nextchr is a byte even in UTF */
2332 nextchr = UCHARAT(locinput);
2334 while (scan != NULL) {
2337 SV *prop = sv_newmortal();
2338 const int docolor = *PL_colors[0];
2339 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2340 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2341 /* The part of the string before starttry has one color
2342 (pref0_len chars), between starttry and current
2343 position another one (pref_len - pref0_len chars),
2344 after the current position the third one.
2345 We assume that pref0_len <= pref_len, otherwise we
2346 decrease pref0_len. */
2347 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2348 ? (5 + taill) - l : locinput - PL_bostr;
2351 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2353 pref0_len = pref_len - (locinput - PL_reg_starttry);
2354 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2355 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2356 ? (5 + taill) - pref_len : PL_regeol - locinput);
2357 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2361 if (pref0_len > pref_len)
2362 pref0_len = pref_len;
2363 regprop(prop, scan);
2365 const char * const s0 =
2366 do_utf8 && OP(scan) != CANY ?
2367 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2368 pref0_len, 60, UNI_DISPLAY_REGEX) :
2369 locinput - pref_len;
2370 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2371 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2372 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2373 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2374 locinput - pref_len + pref0_len;
2375 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2376 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2377 pv_uni_display(dsv2, (U8*)locinput,
2378 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2380 const int len2 = do_utf8 ? strlen(s2) : l;
2381 PerlIO_printf(Perl_debug_log,
2382 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2383 (IV)(locinput - PL_bostr),
2390 (docolor ? "" : "> <"),
2394 15 - l - pref_len + 1,
2396 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2401 next = scan + NEXT_OFF(scan);
2407 if (locinput == PL_bostr || (PL_multiline &&
2408 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2410 /* regtill = regbol; */
2415 if (locinput == PL_bostr ||
2416 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2422 if (locinput == PL_bostr)
2426 if (locinput == PL_reg_ganch)
2436 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2441 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2443 if (PL_regeol - locinput > 1)
2447 if (PL_regeol != locinput)
2451 if (!nextchr && locinput >= PL_regeol)
2454 locinput += PL_utf8skip[nextchr];
2455 if (locinput > PL_regeol)
2457 nextchr = UCHARAT(locinput);
2460 nextchr = UCHARAT(++locinput);
2463 if (!nextchr && locinput >= PL_regeol)
2465 nextchr = UCHARAT(++locinput);
2468 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2471 locinput += PL_utf8skip[nextchr];
2472 if (locinput > PL_regeol)
2474 nextchr = UCHARAT(locinput);
2477 nextchr = UCHARAT(++locinput);
2482 if (do_utf8 != UTF) {
2483 /* The target and the pattern have differing utf8ness. */
2485 const char *e = s + ln;
2488 /* The target is utf8, the pattern is not utf8. */
2493 if (NATIVE_TO_UNI(*(U8*)s) !=
2494 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2502 /* The target is not utf8, the pattern is utf8. */
2507 if (NATIVE_TO_UNI(*((U8*)l)) !=
2508 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2516 nextchr = UCHARAT(locinput);
2519 /* The target and the pattern have the same utf8ness. */
2520 /* Inline the first character, for speed. */
2521 if (UCHARAT(s) != nextchr)
2523 if (PL_regeol - locinput < ln)
2525 if (ln > 1 && memNE(s, locinput, ln))
2528 nextchr = UCHARAT(locinput);
2531 PL_reg_flags |= RF_tainted;
2537 if (do_utf8 || UTF) {
2538 /* Either target or the pattern are utf8. */
2540 char *e = PL_regeol;
2542 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2543 l, &e, 0, do_utf8)) {
2544 /* One more case for the sharp s:
2545 * pack("U0U*", 0xDF) =~ /ss/i,
2546 * the 0xC3 0x9F are the UTF-8
2547 * byte sequence for the U+00DF. */
2549 toLOWER(s[0]) == 's' &&
2551 toLOWER(s[1]) == 's' &&
2558 nextchr = UCHARAT(locinput);
2562 /* Neither the target and the pattern are utf8. */
2564 /* Inline the first character, for speed. */
2565 if (UCHARAT(s) != nextchr &&
2566 UCHARAT(s) != ((OP(scan) == EXACTF)
2567 ? PL_fold : PL_fold_locale)[nextchr])
2569 if (PL_regeol - locinput < ln)
2571 if (ln > 1 && (OP(scan) == EXACTF
2572 ? ibcmp(s, locinput, ln)
2573 : ibcmp_locale(s, locinput, ln)))
2576 nextchr = UCHARAT(locinput);
2580 STRLEN inclasslen = PL_regeol - locinput;
2582 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2584 if (locinput >= PL_regeol)
2586 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2587 nextchr = UCHARAT(locinput);
2592 nextchr = UCHARAT(locinput);
2593 if (!REGINCLASS(scan, (U8*)locinput))
2595 if (!nextchr && locinput >= PL_regeol)
2597 nextchr = UCHARAT(++locinput);
2601 /* If we might have the case of the German sharp s
2602 * in a casefolding Unicode character class. */
2604 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2605 locinput += SHARP_S_SKIP;
2606 nextchr = UCHARAT(locinput);
2612 PL_reg_flags |= RF_tainted;
2618 LOAD_UTF8_CHARCLASS_ALNUM();
2619 if (!(OP(scan) == ALNUM
2620 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2621 : isALNUM_LC_utf8((U8*)locinput)))
2625 locinput += PL_utf8skip[nextchr];
2626 nextchr = UCHARAT(locinput);
2629 if (!(OP(scan) == ALNUM
2630 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2632 nextchr = UCHARAT(++locinput);
2635 PL_reg_flags |= RF_tainted;
2638 if (!nextchr && locinput >= PL_regeol)
2641 LOAD_UTF8_CHARCLASS_ALNUM();
2642 if (OP(scan) == NALNUM
2643 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2644 : isALNUM_LC_utf8((U8*)locinput))
2648 locinput += PL_utf8skip[nextchr];
2649 nextchr = UCHARAT(locinput);
2652 if (OP(scan) == NALNUM
2653 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2655 nextchr = UCHARAT(++locinput);
2659 PL_reg_flags |= RF_tainted;
2663 /* was last char in word? */
2665 if (locinput == PL_bostr)
2668 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2670 ln = utf8n_to_uvchr((U8 *)r, UTF8SKIP(r), 0, 0);
2672 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2673 ln = isALNUM_uni(ln);
2674 LOAD_UTF8_CHARCLASS_ALNUM();
2675 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2678 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2679 n = isALNUM_LC_utf8((U8*)locinput);
2683 ln = (locinput != PL_bostr) ?
2684 UCHARAT(locinput - 1) : '\n';
2685 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2687 n = isALNUM(nextchr);
2690 ln = isALNUM_LC(ln);
2691 n = isALNUM_LC(nextchr);
2694 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2695 OP(scan) == BOUNDL))
2699 PL_reg_flags |= RF_tainted;
2705 if (UTF8_IS_CONTINUED(nextchr)) {
2706 LOAD_UTF8_CHARCLASS_SPACE();
2707 if (!(OP(scan) == SPACE
2708 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2709 : isSPACE_LC_utf8((U8*)locinput)))
2713 locinput += PL_utf8skip[nextchr];
2714 nextchr = UCHARAT(locinput);
2717 if (!(OP(scan) == SPACE
2718 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2720 nextchr = UCHARAT(++locinput);
2723 if (!(OP(scan) == SPACE
2724 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2726 nextchr = UCHARAT(++locinput);
2730 PL_reg_flags |= RF_tainted;
2733 if (!nextchr && locinput >= PL_regeol)
2736 LOAD_UTF8_CHARCLASS_SPACE();
2737 if (OP(scan) == NSPACE
2738 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2739 : isSPACE_LC_utf8((U8*)locinput))
2743 locinput += PL_utf8skip[nextchr];
2744 nextchr = UCHARAT(locinput);
2747 if (OP(scan) == NSPACE
2748 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2750 nextchr = UCHARAT(++locinput);
2753 PL_reg_flags |= RF_tainted;
2759 LOAD_UTF8_CHARCLASS_DIGIT();
2760 if (!(OP(scan) == DIGIT
2761 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2762 : isDIGIT_LC_utf8((U8*)locinput)))
2766 locinput += PL_utf8skip[nextchr];
2767 nextchr = UCHARAT(locinput);
2770 if (!(OP(scan) == DIGIT
2771 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2773 nextchr = UCHARAT(++locinput);
2776 PL_reg_flags |= RF_tainted;
2779 if (!nextchr && locinput >= PL_regeol)
2782 LOAD_UTF8_CHARCLASS_DIGIT();
2783 if (OP(scan) == NDIGIT
2784 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2785 : isDIGIT_LC_utf8((U8*)locinput))
2789 locinput += PL_utf8skip[nextchr];
2790 nextchr = UCHARAT(locinput);
2793 if (OP(scan) == NDIGIT
2794 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2796 nextchr = UCHARAT(++locinput);
2799 if (locinput >= PL_regeol)
2802 LOAD_UTF8_CHARCLASS_MARK();
2803 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2805 locinput += PL_utf8skip[nextchr];
2806 while (locinput < PL_regeol &&
2807 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2808 locinput += UTF8SKIP(locinput);
2809 if (locinput > PL_regeol)
2814 nextchr = UCHARAT(locinput);
2817 PL_reg_flags |= RF_tainted;
2821 n = ARG(scan); /* which paren pair */
2822 ln = PL_regstartp[n];
2823 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2824 if ((I32)*PL_reglastparen < n || ln == -1)
2825 sayNO; /* Do not match unless seen CLOSEn. */
2826 if (ln == PL_regendp[n])
2830 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2832 const char *e = PL_bostr + PL_regendp[n];
2834 * Note that we can't do the "other character" lookup trick as
2835 * in the 8-bit case (no pun intended) because in Unicode we
2836 * have to map both upper and title case to lower case.
2838 if (OP(scan) == REFF) {
2840 STRLEN ulen1, ulen2;
2841 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
2842 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
2846 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2847 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2848 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2855 nextchr = UCHARAT(locinput);
2859 /* Inline the first character, for speed. */
2860 if (UCHARAT(s) != nextchr &&
2862 (UCHARAT(s) != ((OP(scan) == REFF
2863 ? PL_fold : PL_fold_locale)[nextchr]))))
2865 ln = PL_regendp[n] - ln;
2866 if (locinput + ln > PL_regeol)
2868 if (ln > 1 && (OP(scan) == REF
2869 ? memNE(s, locinput, ln)
2871 ? ibcmp(s, locinput, ln)
2872 : ibcmp_locale(s, locinput, ln))))
2875 nextchr = UCHARAT(locinput);
2886 OP_4tree *oop = PL_op;
2887 COP *ocurcop = PL_curcop;
2890 struct regexp *oreg = PL_reg_re;
2893 PL_op = (OP_4tree*)PL_regdata->data[n];
2894 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2895 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2896 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2900 CALLRUNOPS(aTHX); /* Scalar context. */
2903 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2911 PAD_RESTORE_LOCAL(old_comppad);
2912 PL_curcop = ocurcop;
2914 if (logical == 2) { /* Postponed subexpression. */
2916 MAGIC *mg = Null(MAGIC*);
2918 CHECKPOINT cp, lastcp;
2922 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2923 mg = mg_find(sv, PERL_MAGIC_qr);
2924 else if (SvSMAGICAL(ret)) {
2925 if (SvGMAGICAL(ret))
2926 sv_unmagic(ret, PERL_MAGIC_qr);
2928 mg = mg_find(ret, PERL_MAGIC_qr);
2932 re = (regexp *)mg->mg_obj;
2933 (void)ReREFCNT_inc(re);
2937 const char *t = SvPV_const(ret, len);
2939 char * const oprecomp = PL_regprecomp;
2940 const I32 osize = PL_regsize;
2941 const I32 onpar = PL_regnpar;
2944 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2945 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
2947 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2949 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2951 PL_regprecomp = oprecomp;
2956 PerlIO_printf(Perl_debug_log,
2957 "Entering embedded \"%s%.60s%s%s\"\n",
2961 (strlen(re->precomp) > 60 ? "..." : ""))
2964 state.prev = PL_reg_call_cc;
2965 state.cc = PL_regcc;
2966 state.re = PL_reg_re;
2970 cp = regcppush(0); /* Save *all* the positions. */
2973 state.ss = PL_savestack_ix;
2974 *PL_reglastparen = 0;
2975 *PL_reglastcloseparen = 0;
2976 PL_reg_call_cc = &state;
2977 PL_reginput = locinput;
2978 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2979 ((re->reganch & ROPT_UTF8) != 0);
2980 if (toggleutf) PL_reg_flags ^= RF_utf8;
2982 /* XXXX This is too dramatic a measure... */
2985 if (regmatch(re->program + 1)) {
2986 /* Even though we succeeded, we need to restore
2987 global variables, since we may be wrapped inside
2988 SUSPEND, thus the match may be not finished yet. */
2990 /* XXXX Do this only if SUSPENDed? */
2991 PL_reg_call_cc = state.prev;
2992 PL_regcc = state.cc;
2993 PL_reg_re = state.re;
2994 cache_re(PL_reg_re);
2995 if (toggleutf) PL_reg_flags ^= RF_utf8;
2997 /* XXXX This is too dramatic a measure... */
3000 /* These are needed even if not SUSPEND. */
3006 REGCP_UNWIND(lastcp);
3008 PL_reg_call_cc = state.prev;
3009 PL_regcc = state.cc;
3010 PL_reg_re = state.re;
3011 cache_re(PL_reg_re);
3012 if (toggleutf) PL_reg_flags ^= RF_utf8;
3014 /* XXXX This is too dramatic a measure... */
3024 sv_setsv(save_scalar(PL_replgv), ret);
3030 n = ARG(scan); /* which paren pair */
3031 PL_reg_start_tmp[n] = locinput;
3036 n = ARG(scan); /* which paren pair */
3037 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3038 PL_regendp[n] = locinput - PL_bostr;
3039 if (n > (I32)*PL_reglastparen)
3040 *PL_reglastparen = n;
3041 *PL_reglastcloseparen = n;
3044 n = ARG(scan); /* which paren pair */
3045 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3048 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3050 next = NEXTOPER(NEXTOPER(scan));
3052 next = scan + ARG(scan);
3053 if (OP(next) == IFTHEN) /* Fake one. */
3054 next = NEXTOPER(NEXTOPER(next));
3058 logical = scan->flags;
3060 /*******************************************************************
3061 PL_regcc contains infoblock about the innermost (...)* loop, and
3062 a pointer to the next outer infoblock.
3064 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3066 1) After matching X, regnode for CURLYX is processed;
3068 2) This regnode creates infoblock on the stack, and calls
3069 regmatch() recursively with the starting point at WHILEM node;
3071 3) Each hit of WHILEM node tries to match A and Z (in the order
3072 depending on the current iteration, min/max of {min,max} and
3073 greediness). The information about where are nodes for "A"
3074 and "Z" is read from the infoblock, as is info on how many times "A"
3075 was already matched, and greediness.
3077 4) After A matches, the same WHILEM node is hit again.
3079 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3080 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3081 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3082 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3083 of the external loop.
3085 Currently present infoblocks form a tree with a stem formed by PL_curcc
3086 and whatever it mentions via ->next, and additional attached trees
3087 corresponding to temporarily unset infoblocks as in "5" above.
3089 In the following picture infoblocks for outer loop of
3090 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3091 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3092 infoblocks are drawn below the "reset" infoblock.
3094 In fact in the picture below we do not show failed matches for Z and T
3095 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3096 more obvious *why* one needs to *temporary* unset infoblocks.]
3098 Matched REx position InfoBlocks Comment
3102 Y A)*?Z)*?T x <- O <- I
3103 YA )*?Z)*?T x <- O <- I
3104 YA A)*?Z)*?T x <- O <- I
3105 YAA )*?Z)*?T x <- O <- I
3106 YAA Z)*?T x <- O # Temporary unset I
3109 YAAZ Y(A)*?Z)*?T x <- O
3112 YAAZY (A)*?Z)*?T x <- O
3115 YAAZY A)*?Z)*?T x <- O <- I
3118 YAAZYA )*?Z)*?T x <- O <- I
3121 YAAZYA Z)*?T x <- O # Temporary unset I
3127 YAAZYAZ T x # Temporary unset O
3134 *******************************************************************/
3137 CHECKPOINT cp = PL_savestack_ix;
3138 /* No need to save/restore up to this paren */
3139 I32 parenfloor = scan->flags;
3141 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3143 cc.oldcc = PL_regcc;
3145 /* XXXX Probably it is better to teach regpush to support
3146 parenfloor > PL_regsize... */
3147 if (parenfloor > (I32)*PL_reglastparen)
3148 parenfloor = *PL_reglastparen; /* Pessimization... */
3149 cc.parenfloor = parenfloor;
3151 cc.min = ARG1(scan);
3152 cc.max = ARG2(scan);
3153 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3157 PL_reginput = locinput;
3158 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3160 PL_regcc = cc.oldcc;
3166 * This is really hard to understand, because after we match
3167 * what we're trying to match, we must make sure the rest of
3168 * the REx is going to match for sure, and to do that we have
3169 * to go back UP the parse tree by recursing ever deeper. And
3170 * if it fails, we have to reset our parent's current state
3171 * that we can try again after backing off.
3174 CHECKPOINT cp, lastcp;
3175 CURCUR* cc = PL_regcc;
3176 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3177 I32 cache_offset = 0, cache_bit = 0;
3179 n = cc->cur + 1; /* how many we know we matched */
3180 PL_reginput = locinput;
3183 PerlIO_printf(Perl_debug_log,
3184 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3185 REPORT_CODE_OFF+PL_regindent*2, "",
3186 (long)n, (long)cc->min,
3187 (long)cc->max, PTR2UV(cc))
3190 /* If degenerate scan matches "", assume scan done. */
3192 if (locinput == cc->lastloc && n >= cc->min) {
3193 PL_regcc = cc->oldcc;
3197 PerlIO_printf(Perl_debug_log,
3198 "%*s empty match detected, try continuation...\n",
3199 REPORT_CODE_OFF+PL_regindent*2, "")
3201 if (regmatch(cc->next))
3209 /* First just match a string of min scans. */
3213 cc->lastloc = locinput;
3214 if (regmatch(cc->scan))
3217 cc->lastloc = lastloc;
3222 /* Check whether we already were at this position.
3223 Postpone detection until we know the match is not
3224 *that* much linear. */
3225 if (!PL_reg_maxiter) {
3226 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3227 PL_reg_leftiter = PL_reg_maxiter;
3229 if (PL_reg_leftiter-- == 0) {
3230 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3231 if (PL_reg_poscache) {
3232 if ((I32)PL_reg_poscache_size < size) {
3233 Renew(PL_reg_poscache, size, char);
3234 PL_reg_poscache_size = size;
3236 Zero(PL_reg_poscache, size, char);
3239 PL_reg_poscache_size = size;
3240 Newxz(PL_reg_poscache, size, char);
3243 PerlIO_printf(Perl_debug_log,
3244 "%sDetected a super-linear match, switching on caching%s...\n",
3245 PL_colors[4], PL_colors[5])
3248 if (PL_reg_leftiter < 0) {
3249 cache_offset = locinput - PL_bostr;
3251 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3252 + cache_offset * (scan->flags>>4);
3253 cache_bit = cache_offset % 8;
3255 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3257 PerlIO_printf(Perl_debug_log,
3258 "%*s already tried at this position...\n",
3259 REPORT_CODE_OFF+PL_regindent*2, "")
3261 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3262 /* cache records success */
3265 /* cache records failure */
3268 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3272 /* Prefer next over scan for minimal matching. */
3275 PL_regcc = cc->oldcc;
3278 cp = regcppush(cc->parenfloor);
3280 if (regmatch(cc->next)) {
3282 CACHEsayYES; /* All done. */
3284 REGCP_UNWIND(lastcp);
3290 if (n >= cc->max) { /* Maximum greed exceeded? */
3291 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3292 && !(PL_reg_flags & RF_warned)) {
3293 PL_reg_flags |= RF_warned;
3294 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3295 "Complex regular subexpression recursion",
3302 PerlIO_printf(Perl_debug_log,
3303 "%*s trying longer...\n",
3304 REPORT_CODE_OFF+PL_regindent*2, "")
3306 /* Try scanning more and see if it helps. */
3307 PL_reginput = locinput;
3309 cc->lastloc = locinput;
3310 cp = regcppush(cc->parenfloor);
3312 if (regmatch(cc->scan)) {
3316 REGCP_UNWIND(lastcp);
3319 cc->lastloc = lastloc;
3323 /* Prefer scan over next for maximal matching. */
3325 if (n < cc->max) { /* More greed allowed? */
3326 cp = regcppush(cc->parenfloor);
3328 cc->lastloc = locinput;
3330 if (regmatch(cc->scan)) {
3334 REGCP_UNWIND(lastcp);
3335 regcppop(); /* Restore some previous $<digit>s? */
3336 PL_reginput = locinput;
3338 PerlIO_printf(Perl_debug_log,
3339 "%*s failed, try continuation...\n",
3340 REPORT_CODE_OFF+PL_regindent*2, "")
3343 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3344 && !(PL_reg_flags & RF_warned)) {
3345 PL_reg_flags |= RF_warned;
3346 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3347 "Complex regular subexpression recursion",
3351 /* Failed deeper matches of scan, so see if this one works. */
3352 PL_regcc = cc->oldcc;
3355 if (regmatch(cc->next))
3361 cc->lastloc = lastloc;
3366 next = scan + ARG(scan);
3369 inner = NEXTOPER(NEXTOPER(scan));
3372 inner = NEXTOPER(scan);
3376 if (OP(next) != c1) /* No choice. */
3377 next = inner; /* Avoid recursion. */
3379 const I32 lastparen = *PL_reglastparen;
3381 re_unwind_branch_t *uw;
3383 /* Put unwinding data on stack */
3384 unwind1 = SSNEWt(1,re_unwind_branch_t);
3385 uw = SSPTRt(unwind1,re_unwind_branch_t);
3388 uw->type = ((c1 == BRANCH)
3390 : RE_UNWIND_BRANCHJ);
3391 uw->lastparen = lastparen;
3393 uw->locinput = locinput;
3394 uw->nextchr = nextchr;
3396 uw->regindent = ++PL_regindent;
3399 REGCP_SET(uw->lastcp);
3401 /* Now go into the first branch */
3414 /* We suppose that the next guy does not need
3415 backtracking: in particular, it is of constant non-zero length,
3416 and has no parenths to influence future backrefs. */
3417 ln = ARG1(scan); /* min to match */
3418 n = ARG2(scan); /* max to match */
3419 paren = scan->flags;
3421 if (paren > PL_regsize)
3423 if (paren > (I32)*PL_reglastparen)
3424 *PL_reglastparen = paren;
3426 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3428 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3429 PL_reginput = locinput;
3432 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3434 locinput = PL_reginput;
3435 if (HAS_TEXT(next) || JUMPABLE(next)) {
3436 regnode *text_node = next;
3438 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3440 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3442 if (PL_regkind[(U8)OP(text_node)] == REF) {
3446 else { c1 = (U8)*STRING(text_node); }
3447 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3449 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3450 c2 = PL_fold_locale[c1];
3459 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3460 /* If it could work, try it. */
3462 UCHARAT(PL_reginput) == c1 ||
3463 UCHARAT(PL_reginput) == c2)
3467 PL_regstartp[paren] =
3468 HOPc(PL_reginput, -l) - PL_bostr;
3469 PL_regendp[paren] = PL_reginput - PL_bostr;
3472 PL_regendp[paren] = -1;
3476 REGCP_UNWIND(lastcp);
3478 /* Couldn't or didn't -- move forward. */
3479 PL_reginput = locinput;
3480 if (regrepeat_hard(scan, 1, &l)) {
3482 locinput = PL_reginput;
3489 n = regrepeat_hard(scan, n, &l);
3490 locinput = PL_reginput;
3492 PerlIO_printf(Perl_debug_log,
3493 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3494 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3498 if (HAS_TEXT(next) || JUMPABLE(next)) {
3499 regnode *text_node = next;
3501 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3503 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3505 if (PL_regkind[(U8)OP(text_node)] == REF) {
3509 else { c1 = (U8)*STRING(text_node); }
3511 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3513 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3514 c2 = PL_fold_locale[c1];
3525 /* If it could work, try it. */
3527 UCHARAT(PL_reginput) == c1 ||
3528 UCHARAT(PL_reginput) == c2)
3531 PerlIO_printf(Perl_debug_log,
3532 "%*s trying tail with n=%"IVdf"...\n",
3533 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3537 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3538 PL_regendp[paren] = PL_reginput - PL_bostr;
3541 PL_regendp[paren] = -1;
3545 REGCP_UNWIND(lastcp);
3547 /* Couldn't or didn't -- back up. */
3549 locinput = HOPc(locinput, -l);
3550 PL_reginput = locinput;
3557 paren = scan->flags; /* Which paren to set */
3558 if (paren > PL_regsize)
3560 if (paren > (I32)*PL_reglastparen)
3561 *PL_reglastparen = paren;
3562 ln = ARG1(scan); /* min to match */
3563 n = ARG2(scan); /* max to match */
3564 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3568 ln = ARG1(scan); /* min to match */
3569 n = ARG2(scan); /* max to match */
3570 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3575 scan = NEXTOPER(scan);
3581 scan = NEXTOPER(scan);
3585 * Lookahead to avoid useless match attempts
3586 * when we know what character comes next.
3590 * Used to only do .*x and .*?x, but now it allows
3591 * for )'s, ('s and (?{ ... })'s to be in the way
3592 * of the quantifier and the EXACT-like node. -- japhy
3595 if (HAS_TEXT(next) || JUMPABLE(next)) {
3597 regnode *text_node = next;
3599 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3601 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3603 if (PL_regkind[(U8)OP(text_node)] == REF) {
3605 goto assume_ok_easy;
3607 else { s = (U8*)STRING(text_node); }
3611 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3613 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3614 c2 = PL_fold_locale[c1];
3617 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3618 STRLEN ulen1, ulen2;
3619 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3620 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3622 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3623 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3625 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3627 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3631 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3640 PL_reginput = locinput;
3644 if (ln && regrepeat(scan, ln) < ln)
3646 locinput = PL_reginput;
3649 char *e; /* Should not check after this */
3650 char *old = locinput;
3653 if (n == REG_INFTY) {
3656 while (UTF8_IS_CONTINUATION(*(U8*)e))
3662 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3666 e = locinput + n - ln;
3671 /* Find place 'next' could work */
3674 while (locinput <= e &&
3675 UCHARAT(locinput) != c1)
3678 while (locinput <= e
3679 && UCHARAT(locinput) != c1
3680 && UCHARAT(locinput) != c2)
3683 count = locinput - old;
3688 /* count initialised to
3689 * utf8_distance(old, locinput) */
3690 while (locinput <= e &&
3691 utf8n_to_uvchr((U8*)locinput,
3692 UTF8_MAXBYTES, &len,
3693 uniflags) != (UV)c1) {
3699 /* count initialised to
3700 * utf8_distance(old, locinput) */
3701 while (locinput <= e) {
3702 UV c = utf8n_to_uvchr((U8*)locinput,
3703 UTF8_MAXBYTES, &len,
3705 if (c == (UV)c1 || c == (UV)c2)
3714 /* PL_reginput == old now */
3715 if (locinput != old) {
3716 ln = 1; /* Did some */
3717 if (regrepeat(scan, count) < count)
3720 /* PL_reginput == locinput now */
3721 TRYPAREN(paren, ln, locinput);
3722 PL_reginput = locinput; /* Could be reset... */
3723 REGCP_UNWIND(lastcp);
3724 /* Couldn't or didn't -- move forward. */
3727 locinput += UTF8SKIP(locinput);
3734 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3738 c = utf8n_to_uvchr((U8*)PL_reginput,
3742 c = UCHARAT(PL_reginput);
3743 /* If it could work, try it. */
3744 if (c == (UV)c1 || c == (UV)c2)
3746 TRYPAREN(paren, ln, PL_reginput);
3747 REGCP_UNWIND(lastcp);
3750 /* If it could work, try it. */
3751 else if (c1 == -1000)
3753 TRYPAREN(paren, ln, PL_reginput);
3754 REGCP_UNWIND(lastcp);
3756 /* Couldn't or didn't -- move forward. */
3757 PL_reginput = locinput;
3758 if (regrepeat(scan, 1)) {
3760 locinput = PL_reginput;
3768 n = regrepeat(scan, n);
3769 locinput = PL_reginput;
3770 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3771 ((!PL_multiline && OP(next) != MEOL) ||
3772 OP(next) == SEOL || OP(next) == EOS))
3774 ln = n; /* why back off? */
3775 /* ...because $ and \Z can match before *and* after
3776 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3777 We should back off by one in this case. */
3778 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3787 c = utf8n_to_uvchr((U8*)PL_reginput,
3791 c = UCHARAT(PL_reginput);
3793 /* If it could work, try it. */
3794 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3796 TRYPAREN(paren, n, PL_reginput);
3797 REGCP_UNWIND(lastcp);
3799 /* Couldn't or didn't -- back up. */
3801 PL_reginput = locinput = HOPc(locinput, -1);
3809 c = utf8n_to_uvchr((U8*)PL_reginput,
3813 c = UCHARAT(PL_reginput);
3815 /* If it could work, try it. */
3816 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3818 TRYPAREN(paren, n, PL_reginput);
3819 REGCP_UNWIND(lastcp);
3821 /* Couldn't or didn't -- back up. */
3823 PL_reginput = locinput = HOPc(locinput, -1);
3830 if (PL_reg_call_cc) {
3831 re_cc_state *cur_call_cc = PL_reg_call_cc;
3832 CURCUR *cctmp = PL_regcc;
3833 regexp *re = PL_reg_re;
3834 CHECKPOINT cp, lastcp;
3836 cp = regcppush(0); /* Save *all* the positions. */
3838 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3840 PL_reginput = locinput; /* Make position available to
3842 cache_re(PL_reg_call_cc->re);
3843 PL_regcc = PL_reg_call_cc->cc;
3844 PL_reg_call_cc = PL_reg_call_cc->prev;
3845 if (regmatch(cur_call_cc->node)) {
3846 PL_reg_call_cc = cur_call_cc;
3850 REGCP_UNWIND(lastcp);
3852 PL_reg_call_cc = cur_call_cc;
3858 PerlIO_printf(Perl_debug_log,
3859 "%*s continuation failed...\n",
3860 REPORT_CODE_OFF+PL_regindent*2, "")
3864 if (locinput < PL_regtill) {
3865 DEBUG_r(PerlIO_printf(Perl_debug_log,
3866 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3868 (long)(locinput - PL_reg_starttry),
3869 (long)(PL_regtill - PL_reg_starttry),
3871 sayNO_FINAL; /* Cannot match: too short. */
3873 PL_reginput = locinput; /* put where regtry can find it */
3874 sayYES_FINAL; /* Success! */
3876 PL_reginput = locinput; /* put where regtry can find it */
3877 sayYES_LOUD; /* Success! */
3880 PL_reginput = locinput;
3885 s = HOPBACKc(locinput, scan->flags);
3891 PL_reginput = locinput;
3896 s = HOPBACKc(locinput, scan->flags);
3902 PL_reginput = locinput;
3905 inner = NEXTOPER(NEXTOPER(scan));
3906 if (regmatch(inner) != n) {
3921 if (OP(scan) == SUSPEND) {
3922 locinput = PL_reginput;
3923 nextchr = UCHARAT(locinput);
3928 next = scan + ARG(scan);
3933 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3934 PTR2UV(scan), OP(scan));
3935 Perl_croak(aTHX_ "regexp memory corruption");
3942 * We get here only if there's trouble -- normally "case END" is
3943 * the terminating point.
3945 Perl_croak(aTHX_ "corrupted regexp pointers");
3951 PerlIO_printf(Perl_debug_log,
3952 "%*s %scould match...%s\n",
3953 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3957 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3958 PL_colors[4],PL_colors[5]));
3964 #if 0 /* Breaks $^R */
3972 PerlIO_printf(Perl_debug_log,
3973 "%*s %sfailed...%s\n",
3974 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3980 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3983 case RE_UNWIND_BRANCH:
3984 case RE_UNWIND_BRANCHJ:
3986 re_unwind_branch_t *uwb = &(uw->branch);
3987 const I32 lastparen = uwb->lastparen;
3989 REGCP_UNWIND(uwb->lastcp);
3990 for (n = *PL_reglastparen; n > lastparen; n--)
3992 *PL_reglastparen = n;
3993 scan = next = uwb->next;
3995 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3996 ? BRANCH : BRANCHJ) ) { /* Failure */
4003 /* Have more choice yet. Reuse the same uwb. */
4004 if ((n = (uwb->type == RE_UNWIND_BRANCH
4005 ? NEXT_OFF(next) : ARG(next))))
4008 next = NULL; /* XXXX Needn't unwinding in this case... */
4010 next = NEXTOPER(scan);
4011 if (uwb->type == RE_UNWIND_BRANCHJ)
4012 next = NEXTOPER(next);
4013 locinput = uwb->locinput;
4014 nextchr = uwb->nextchr;
4016 PL_regindent = uwb->regindent;
4023 Perl_croak(aTHX_ "regexp unwind memory corruption");
4034 - regrepeat - repeatedly match something simple, report how many
4037 * [This routine now assumes that it will only match on things of length 1.
4038 * That was true before, but now we assume scan - reginput is the count,
4039 * rather than incrementing count on every character. [Er, except utf8.]]
4042 S_regrepeat(pTHX_ const regnode *p, I32 max)
4044 register char *scan;
4046 register char *loceol = PL_regeol;
4047 register I32 hardcount = 0;
4048 register bool do_utf8 = PL_reg_match_utf8;
4051 if (max == REG_INFTY)
4053 else if (max < loceol - scan)
4054 loceol = scan + max;
4059 while (scan < loceol && hardcount < max && *scan != '\n') {
4060 scan += UTF8SKIP(scan);
4064 while (scan < loceol && *scan != '\n')
4071 while (scan < loceol && hardcount < max) {
4072 scan += UTF8SKIP(scan);
4082 case EXACT: /* length of string is 1 */
4084 while (scan < loceol && UCHARAT(scan) == c)
4087 case EXACTF: /* length of string is 1 */
4089 while (scan < loceol &&
4090 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4093 case EXACTFL: /* length of string is 1 */
4094 PL_reg_flags |= RF_tainted;
4096 while (scan < loceol &&
4097 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4103 while (hardcount < max && scan < loceol &&
4104 reginclass(p, (U8*)scan, 0, do_utf8)) {
4105 scan += UTF8SKIP(scan);
4109 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4116 LOAD_UTF8_CHARCLASS_ALNUM();
4117 while (hardcount < max && scan < loceol &&
4118 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4119 scan += UTF8SKIP(scan);
4123 while (scan < loceol && isALNUM(*scan))
4128 PL_reg_flags |= RF_tainted;
4131 while (hardcount < max && scan < loceol &&
4132 isALNUM_LC_utf8((U8*)scan)) {
4133 scan += UTF8SKIP(scan);
4137 while (scan < loceol && isALNUM_LC(*scan))
4144 LOAD_UTF8_CHARCLASS_ALNUM();
4145 while (hardcount < max && scan < loceol &&
4146 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4147 scan += UTF8SKIP(scan);
4151 while (scan < loceol && !isALNUM(*scan))
4156 PL_reg_flags |= RF_tainted;
4159 while (hardcount < max && scan < loceol &&
4160 !isALNUM_LC_utf8((U8*)scan)) {
4161 scan += UTF8SKIP(scan);
4165 while (scan < loceol && !isALNUM_LC(*scan))
4172 LOAD_UTF8_CHARCLASS_SPACE();
4173 while (hardcount < max && scan < loceol &&
4175 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4176 scan += UTF8SKIP(scan);
4180 while (scan < loceol && isSPACE(*scan))
4185 PL_reg_flags |= RF_tainted;
4188 while (hardcount < max && scan < loceol &&
4189 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4190 scan += UTF8SKIP(scan);
4194 while (scan < loceol && isSPACE_LC(*scan))
4201 LOAD_UTF8_CHARCLASS_SPACE();
4202 while (hardcount < max && scan < loceol &&
4204 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4205 scan += UTF8SKIP(scan);
4209 while (scan < loceol && !isSPACE(*scan))
4214 PL_reg_flags |= RF_tainted;
4217 while (hardcount < max && scan < loceol &&
4218 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4219 scan += UTF8SKIP(scan);
4223 while (scan < loceol && !isSPACE_LC(*scan))
4230 LOAD_UTF8_CHARCLASS_DIGIT();
4231 while (hardcount < max && scan < loceol &&
4232 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4233 scan += UTF8SKIP(scan);
4237 while (scan < loceol && isDIGIT(*scan))
4244 LOAD_UTF8_CHARCLASS_DIGIT();
4245 while (hardcount < max && scan < loceol &&
4246 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4247 scan += UTF8SKIP(scan);
4251 while (scan < loceol && !isDIGIT(*scan))
4255 default: /* Called on something of 0 width. */
4256 break; /* So match right here or not at all. */
4262 c = scan - PL_reginput;
4267 SV *prop = sv_newmortal();
4269 regprop(prop, (regnode *)p);
4270 PerlIO_printf(Perl_debug_log,
4271 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4272 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4279 - regrepeat_hard - repeatedly match something, report total lenth and length
4281 * The repeater is supposed to have constant non-zero length.
4285 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4287 register char *scan = Nullch;
4288 register char *start;
4289 register char *loceol = PL_regeol;
4291 I32 count = 0, res = 1;
4296 start = PL_reginput;
4297 if (PL_reg_match_utf8) {
4298 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4301 while (start < PL_reginput) {
4303 start += UTF8SKIP(start);
4314 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4316 *lp = l = PL_reginput - start;
4317 if (max != REG_INFTY && l*max < loceol - scan)
4318 loceol = scan + l*max;
4331 - regclass_swash - prepare the utf8 swash
4335 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4341 if (PL_regdata && PL_regdata->count) {
4342 const U32 n = ARG(node);
4344 if (PL_regdata->what[n] == 's') {
4345 SV * const rv = (SV*)PL_regdata->data[n];
4346 AV * const av = (AV*)SvRV((SV*)rv);
4347 SV **const ary = AvARRAY(av);
4350 /* See the end of regcomp.c:S_reglass() for
4351 * documentation of these array elements. */
4354 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4355 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4359 else if (si && doinit) {
4360 sw = swash_init("utf8", "", si, 1, 0);
4361 (void)av_store(av, 1, sw);
4377 - reginclass - determine if a character falls into a character class
4379 The n is the ANYOF regnode, the p is the target string, lenp
4380 is pointer to the maximum length of how far to go in the p
4381 (if the lenp is zero, UTF8SKIP(p) is used),
4382 do_utf8 tells whether the target string is in UTF-8.
4387 S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4389 const char flags = ANYOF_FLAGS(n);
4395 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4396 c = utf8n_to_uvchr((U8 *)p, UTF8_MAXBYTES, &len,
4397 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
4399 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4400 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4403 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4404 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4407 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4411 SV * const sw = regclass_swash((regnode *)n, TRUE, 0, (SV**)&av);
4414 if (swash_fetch(sw, (U8 *)p, do_utf8))
4416 else if (flags & ANYOF_FOLD) {
4417 if (!match && lenp && av) {
4419 for (i = 0; i <= av_len(av); i++) {
4420 SV* const sv = *av_fetch(av, i, FALSE);
4422 const char * const s = SvPV_const(sv, len);
4424 if (len <= plen && memEQ(s, (char*)p, len)) {
4432 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4435 to_utf8_fold((U8 *)p, tmpbuf, &tmplen);
4436 if (swash_fetch(sw, tmpbuf, do_utf8))
4442 if (match && lenp && *lenp == 0)
4443 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4445 if (!match && c < 256) {
4446 if (ANYOF_BITMAP_TEST(n, c))
4448 else if (flags & ANYOF_FOLD) {
4451 if (flags & ANYOF_LOCALE) {
4452 PL_reg_flags |= RF_tainted;
4453 f = PL_fold_locale[c];
4457 if (f != c && ANYOF_BITMAP_TEST(n, f))
4461 if (!match && (flags & ANYOF_CLASS)) {
4462 PL_reg_flags |= RF_tainted;
4464 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4490 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4491 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4492 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4493 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4494 ) /* How's that for a conditional? */
4501 return (flags & ANYOF_INVERT) ? !match : match;
4505 S_reghop(pTHX_ U8 *s, I32 off)
4507 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4511 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4514 while (off-- && s < lim) {
4515 /* XXX could check well-formedness here */
4523 if (UTF8_IS_CONTINUED(*s)) {
4524 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4527 /* XXX could check well-formedness here */
4535 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4537 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4541 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4544 while (off-- && s < lim) {
4545 /* XXX could check well-formedness here */
4555 if (UTF8_IS_CONTINUED(*s)) {
4556 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4559 /* XXX could check well-formedness here */
4571 restore_pos(pTHX_ void *arg)
4573 PERL_UNUSED_ARG(arg);
4574 if (PL_reg_eval_set) {
4575 if (PL_reg_oldsaved) {
4576 PL_reg_re->subbeg = PL_reg_oldsaved;
4577 PL_reg_re->sublen = PL_reg_oldsavedlen;
4578 RX_MATCH_COPIED_on(PL_reg_re);
4580 PL_reg_magic->mg_len = PL_reg_oldpos;
4581 PL_reg_eval_set = 0;
4582 PL_curpm = PL_reg_oldcurpm;
4587 S_to_utf8_substr(pTHX_ register regexp *prog)
4589 if (prog->float_substr && !prog->float_utf8) {
4591 prog->float_utf8 = sv = newSVsv(prog->float_substr);
4592 sv_utf8_upgrade(sv);
4593 if (SvTAIL(prog->float_substr))
4595 if (prog->float_substr == prog->check_substr)
4596 prog->check_utf8 = sv;
4598 if (prog->anchored_substr && !prog->anchored_utf8) {
4600 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
4601 sv_utf8_upgrade(sv);
4602 if (SvTAIL(prog->anchored_substr))
4604 if (prog->anchored_substr == prog->check_substr)
4605 prog->check_utf8 = sv;
4610 S_to_byte_substr(pTHX_ register regexp *prog)
4612 if (prog->float_utf8 && !prog->float_substr) {
4614 prog->float_substr = sv = newSVsv(prog->float_utf8);
4615 if (sv_utf8_downgrade(sv, TRUE)) {
4616 if (SvTAIL(prog->float_utf8))
4620 prog->float_substr = sv = &PL_sv_undef;
4622 if (prog->float_utf8 == prog->check_utf8)
4623 prog->check_substr = sv;
4625 if (prog->anchored_utf8 && !prog->anchored_substr) {
4627 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
4628 if (sv_utf8_downgrade(sv, TRUE)) {
4629 if (SvTAIL(prog->anchored_utf8))
4633 prog->anchored_substr = sv = &PL_sv_undef;
4635 if (prog->anchored_utf8 == prog->check_utf8)
4636 prog->check_substr = sv;
4642 * c-indentation-style: bsd
4644 * indent-tabs-mode: t
4647 * ex: set ts=8 sts=4 sw=4 noet: