5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
19 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
20 * confused with the original package (see point 3 below). Thanks, Henry!
23 /* Additional note: this code is very heavily munged from Henry's version
24 * in places. In some spots I've traded clarity for efficiency, so don't
25 * blame Henry for some of the lack of readability.
28 /* The names of the functions have been changed from regcomp and
29 * regexec to pregcomp and pregexec in order to avoid conflicts
30 * with the POSIX routines of the same names.
33 #ifdef PERL_EXT_RE_BUILD
34 /* need to replace pregcomp et al, so enable that */
35 # ifndef PERL_IN_XSUB_RE
36 # define PERL_IN_XSUB_RE
38 /* need access to debugger hooks */
39 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
44 #ifdef PERL_IN_XSUB_RE
45 /* We *really* need to overwrite these symbols: */
46 # define Perl_regexec_flags my_regexec
47 # define Perl_regdump my_regdump
48 # define Perl_regprop my_regprop
49 # define Perl_re_intuit_start my_re_intuit_start
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_pregexec my_pregexec
52 # define Perl_reginitcolors my_reginitcolors
53 # define Perl_regclass_swash my_regclass_swash
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
78 **** Alterations to Henry's code are...
80 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
81 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
83 **** You may distribute under the terms of either the GNU General Public
84 **** License or the Artistic License, as specified in the README file.
86 * Beware that some of this code is subtly aware of the way operator
87 * precedence is structured in regular expressions. Serious changes in
88 * regular-expression syntax might require a total rethink.
91 #define PERL_IN_REGEXEC_C
96 #define RF_tainted 1 /* tainted information used? */
97 #define RF_warned 2 /* warned about big count? */
98 #define RF_evaled 4 /* Did an EVAL with setting? */
99 #define RF_utf8 8 /* String contains multibyte chars? */
101 #define UTF ((PL_reg_flags & RF_utf8) != 0)
103 #define RS_init 1 /* eval environment created */
104 #define RS_set 2 /* replsv value is set */
107 #define STATIC static
110 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
116 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
117 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
119 #define reghop_c(pos,off) ((char*)reghop((U8*)pos, off))
120 #define reghopmaybe_c(pos,off) ((char*)reghopmaybe((U8*)pos, off))
121 #define HOP(pos,off) (PL_reg_match_utf8 ? reghop((U8*)pos, off) : (U8*)(pos + off))
122 #define HOPMAYBE(pos,off) (PL_reg_match_utf8 ? reghopmaybe((U8*)pos, off) : (U8*)(pos + off))
123 #define HOPc(pos,off) ((char*)HOP(pos,off))
124 #define HOPMAYBEc(pos,off) ((char*)HOPMAYBE(pos,off))
126 #define HOPBACK(pos, off) ( \
127 (PL_reg_match_utf8) \
128 ? reghopmaybe((U8*)pos, -off) \
129 : (pos - off >= PL_bostr) \
133 #define HOPBACKc(pos, off) (char*)HOPBACK(pos, off)
135 #define reghop3_c(pos,off,lim) ((char*)reghop3((U8*)pos, off, (U8*)lim))
136 #define reghopmaybe3_c(pos,off,lim) ((char*)reghopmaybe3((U8*)pos, off, (U8*)lim))
137 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
138 #define HOPMAYBE3(pos,off,lim) (PL_reg_match_utf8 ? reghopmaybe3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
139 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140 #define HOPMAYBE3c(pos,off,lim) ((char*)HOPMAYBE3(pos,off,lim))
142 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
143 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((U8*)str); assert(ok); LEAVE; } } STMT_END
144 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
145 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
146 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
147 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
149 /* for use after a quantifier and before an EXACT-like node -- japhy */
150 #define JUMPABLE(rn) ( \
151 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
152 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
153 OP(rn) == PLUS || OP(rn) == MINMOD || \
154 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
157 #define HAS_TEXT(rn) ( \
158 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
162 Search for mandatory following text node; for lookahead, the text must
163 follow but for lookbehind (rn->flags != 0) we skip to the next step.
165 #define FIND_NEXT_IMPT(rn) STMT_START { \
166 while (JUMPABLE(rn)) \
167 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
168 rn = NEXTOPER(NEXTOPER(rn)); \
169 else if (OP(rn) == PLUS) \
171 else if (OP(rn) == IFMATCH) \
172 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
173 else rn += NEXT_OFF(rn); \
176 static void restore_pos(pTHX_ void *arg);
179 S_regcppush(pTHX_ I32 parenfloor)
181 const int retval = PL_savestack_ix;
182 #define REGCP_PAREN_ELEMS 4
183 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
186 if (paren_elems_to_push < 0)
187 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
189 #define REGCP_OTHER_ELEMS 6
190 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
191 for (p = PL_regsize; p > parenfloor; p--) {
192 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
193 SSPUSHINT(PL_regendp[p]);
194 SSPUSHINT(PL_regstartp[p]);
195 SSPUSHPTR(PL_reg_start_tmp[p]);
198 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
199 SSPUSHINT(PL_regsize);
200 SSPUSHINT(*PL_reglastparen);
201 SSPUSHINT(*PL_reglastcloseparen);
202 SSPUSHPTR(PL_reginput);
203 #define REGCP_FRAME_ELEMS 2
204 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
205 * are needed for the regexp context stack bookkeeping. */
206 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
207 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
212 /* These are needed since we do not localize EVAL nodes: */
213 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
214 " Setting an EVAL scope, savestack=%"IVdf"\n", \
215 (IV)PL_savestack_ix)); cp = PL_savestack_ix
217 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
218 PerlIO_printf(Perl_debug_log, \
219 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
220 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
229 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
231 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
232 i = SSPOPINT; /* Parentheses elements to pop. */
233 input = (char *) SSPOPPTR;
234 *PL_reglastcloseparen = SSPOPINT;
235 *PL_reglastparen = SSPOPINT;
236 PL_regsize = SSPOPINT;
238 /* Now restore the parentheses context. */
239 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
240 i > 0; i -= REGCP_PAREN_ELEMS) {
242 paren = (U32)SSPOPINT;
243 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
244 PL_regstartp[paren] = SSPOPINT;
246 if (paren <= *PL_reglastparen)
247 PL_regendp[paren] = tmps;
249 PerlIO_printf(Perl_debug_log,
250 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
251 (UV)paren, (IV)PL_regstartp[paren],
252 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
253 (IV)PL_regendp[paren],
254 (paren > *PL_reglastparen ? "(no)" : ""));
258 if ((I32)(*PL_reglastparen + 1) <= PL_regnpar) {
259 PerlIO_printf(Perl_debug_log,
260 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
261 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
265 /* It would seem that the similar code in regtry()
266 * already takes care of this, and in fact it is in
267 * a better location to since this code can #if 0-ed out
268 * but the code in regtry() is needed or otherwise tests
269 * requiring null fields (pat.t#187 and split.t#{13,14}
270 * (as of patchlevel 7877) will fail. Then again,
271 * this code seems to be necessary or otherwise
272 * building DynaLoader will fail:
273 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
275 for (paren = *PL_reglastparen + 1; (I32)paren <= PL_regnpar; paren++) {
276 if ((I32)paren > PL_regsize)
277 PL_regstartp[paren] = -1;
278 PL_regendp[paren] = -1;
285 S_regcp_set_to(pTHX_ I32 ss)
287 const I32 tmp = PL_savestack_ix;
289 PL_savestack_ix = ss;
291 PL_savestack_ix = tmp;
295 typedef struct re_cc_state
299 struct re_cc_state *prev;
304 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
306 #define TRYPAREN(paren, n, input) { \
309 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
310 PL_regendp[paren] = input - PL_bostr; \
313 PL_regendp[paren] = -1; \
315 if (regmatch(next)) \
318 PL_regendp[paren] = -1; \
323 * pregexec and friends
327 - pregexec - match a regexp against a string
330 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
331 char *strbeg, I32 minend, SV *screamer, U32 nosave)
332 /* strend: pointer to null at end of string */
333 /* strbeg: real beginning of string */
334 /* minend: end of match must be >=minend after stringarg. */
335 /* nosave: For optimizations. */
338 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
339 nosave ? 0 : REXEC_COPY_STR);
343 S_cache_re(pTHX_ regexp *prog)
345 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
347 PL_regprogram = prog->program;
349 PL_regnpar = prog->nparens;
350 PL_regdata = prog->data;
355 * Need to implement the following flags for reg_anch:
357 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
359 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
360 * INTUIT_AUTORITATIVE_ML
361 * INTUIT_ONCE_NOML - Intuit can match in one location only.
364 * Another flag for this function: SECOND_TIME (so that float substrs
365 * with giant delta may be not rechecked).
368 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
370 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
371 Otherwise, only SvCUR(sv) is used to get strbeg. */
373 /* XXXX We assume that strpos is strbeg unless sv. */
375 /* XXXX Some places assume that there is a fixed substring.
376 An update may be needed if optimizer marks as "INTUITable"
377 RExen without fixed substrings. Similarly, it is assumed that
378 lengths of all the strings are no more than minlen, thus they
379 cannot come from lookahead.
380 (Or minlen should take into account lookahead.) */
382 /* A failure to find a constant substring means that there is no need to make
383 an expensive call to REx engine, thus we celebrate a failure. Similarly,
384 finding a substring too deep into the string means that less calls to
385 regtry() should be needed.
387 REx compiler's optimizer found 4 possible hints:
388 a) Anchored substring;
390 c) Whether we are anchored (beginning-of-line or \G);
391 d) First node (of those at offset 0) which may distingush positions;
392 We use a)b)d) and multiline-part of c), and try to find a position in the
393 string which does not contradict any of them.
396 /* Most of decisions we do here should have been done at compile time.
397 The nodes of the REx which we used for the search should have been
398 deleted from the finite automaton. */
401 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
402 char *strend, U32 flags, re_scream_pos_data *data)
404 register I32 start_shift = 0;
405 /* Should be nonnegative! */
406 register I32 end_shift = 0;
411 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
413 register char *other_last = Nullch; /* other substr checked before this */
414 char *check_at = Nullch; /* check substr found at this pos */
416 const char * const i_strpos = strpos;
417 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
419 RX_MATCH_UTF8_set(prog,do_utf8);
421 if (prog->reganch & ROPT_UTF8) {
422 DEBUG_r(PerlIO_printf(Perl_debug_log,
423 "UTF-8 regex...\n"));
424 PL_reg_flags |= RF_utf8;
428 const char *s = PL_reg_match_utf8 ?
429 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
431 const int len = PL_reg_match_utf8 ?
432 strlen(s) : strend - strpos;
435 if (PL_reg_match_utf8)
436 DEBUG_r(PerlIO_printf(Perl_debug_log,
437 "UTF-8 target...\n"));
438 PerlIO_printf(Perl_debug_log,
439 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
440 PL_colors[4],PL_colors[5],PL_colors[0],
443 (strlen(prog->precomp) > 60 ? "..." : ""),
445 (int)(len > 60 ? 60 : len),
447 (len > 60 ? "..." : "")
451 /* CHR_DIST() would be more correct here but it makes things slow. */
452 if (prog->minlen > strend - strpos) {
453 DEBUG_r(PerlIO_printf(Perl_debug_log,
454 "String too short... [re_intuit_start]\n"));
457 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
460 if (!prog->check_utf8 && prog->check_substr)
461 to_utf8_substr(prog);
462 check = prog->check_utf8;
464 if (!prog->check_substr && prog->check_utf8)
465 to_byte_substr(prog);
466 check = prog->check_substr;
468 if (check == &PL_sv_undef) {
469 DEBUG_r(PerlIO_printf(Perl_debug_log,
470 "Non-utf string cannot match utf check string\n"));
473 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
474 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
475 || ( (prog->reganch & ROPT_ANCH_BOL)
476 && !PL_multiline ) ); /* Check after \n? */
479 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
480 | ROPT_IMPLICIT)) /* not a real BOL */
481 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
483 && (strpos != strbeg)) {
484 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
487 if (prog->check_offset_min == prog->check_offset_max &&
488 !(prog->reganch & ROPT_CANY_SEEN)) {
489 /* Substring at constant offset from beg-of-str... */
492 s = HOP3c(strpos, prog->check_offset_min, strend);
494 slen = SvCUR(check); /* >= 1 */
496 if ( strend - s > slen || strend - s < slen - 1
497 || (strend - s == slen && strend[-1] != '\n')) {
498 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
501 /* Now should match s[0..slen-2] */
503 if (slen && (*SvPVX_const(check) != *s
505 && memNE(SvPVX_const(check), s, slen)))) {
507 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
511 else if (*SvPVX_const(check) != *s
512 || ((slen = SvCUR(check)) > 1
513 && memNE(SvPVX_const(check), s, slen)))
515 goto success_at_start;
518 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
520 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
521 end_shift = prog->minlen - start_shift -
522 CHR_SVLEN(check) + (SvTAIL(check) != 0);
524 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
525 - (SvTAIL(check) != 0);
526 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
528 if (end_shift < eshift)
532 else { /* Can match at random position */
535 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
536 /* Should be nonnegative! */
537 end_shift = prog->minlen - start_shift -
538 CHR_SVLEN(check) + (SvTAIL(check) != 0);
541 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
543 Perl_croak(aTHX_ "panic: end_shift");
547 /* Find a possible match in the region s..strend by looking for
548 the "check" substring in the region corrected by start/end_shift. */
549 if (flags & REXEC_SCREAM) {
550 I32 p = -1; /* Internal iterator of scream. */
551 I32 * const pp = data ? data->scream_pos : &p;
553 if (PL_screamfirst[BmRARE(check)] >= 0
554 || ( BmRARE(check) == '\n'
555 && (BmPREVIOUS(check) == SvCUR(check) - 1)
557 s = screaminstr(sv, check,
558 start_shift + (s - strbeg), end_shift, pp, 0);
561 /* we may be pointing at the wrong string */
562 if (s && RX_MATCH_COPIED(prog))
563 s = strbeg + (s - SvPVX_const(sv));
565 *data->scream_olds = s;
567 else if (prog->reganch & ROPT_CANY_SEEN)
568 s = fbm_instr((U8*)(s + start_shift),
569 (U8*)(strend - end_shift),
570 check, PL_multiline ? FBMrf_MULTILINE : 0);
572 s = fbm_instr(HOP3(s, start_shift, strend),
573 HOP3(strend, -end_shift, strbeg),
574 check, PL_multiline ? FBMrf_MULTILINE : 0);
576 /* Update the count-of-usability, remove useless subpatterns,
579 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint. */
580 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
581 (s ? "Found" : "Did not find"),
582 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
584 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
586 PL_colors[1], (SvTAIL(check) ? "$" : ""),
587 (s ? " at offset " : "...\n") ) );
594 /* Finish the diagnostic message */
595 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
597 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
598 Start with the other substr.
599 XXXX no SCREAM optimization yet - and a very coarse implementation
600 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
601 *always* match. Probably should be marked during compile...
602 Probably it is right to do no SCREAM here...
605 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
606 /* Take into account the "other" substring. */
607 /* XXXX May be hopelessly wrong for UTF... */
610 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
613 char * const last = HOP3c(s, -start_shift, strbeg);
618 t = s - prog->check_offset_max;
619 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
621 || ((t = reghopmaybe3_c(s, -(prog->check_offset_max), strpos))
626 t = HOP3c(t, prog->anchored_offset, strend);
627 if (t < other_last) /* These positions already checked */
629 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
632 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
633 /* On end-of-str: see comment below. */
634 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
635 if (must == &PL_sv_undef) {
637 DEBUG_r(must = prog->anchored_utf8); /* for debug */
642 HOP3(HOP3(last1, prog->anchored_offset, strend)
643 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
645 PL_multiline ? FBMrf_MULTILINE : 0
647 DEBUG_r(PerlIO_printf(Perl_debug_log,
648 "%s anchored substr \"%s%.*s%s\"%s",
649 (s ? "Found" : "Contradicts"),
652 - (SvTAIL(must)!=0)),
654 PL_colors[1], (SvTAIL(must) ? "$" : "")));
656 if (last1 >= last2) {
657 DEBUG_r(PerlIO_printf(Perl_debug_log,
658 ", giving up...\n"));
661 DEBUG_r(PerlIO_printf(Perl_debug_log,
662 ", trying floating at offset %ld...\n",
663 (long)(HOP3c(s1, 1, strend) - i_strpos)));
664 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
665 s = HOP3c(last, 1, strend);
669 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
670 (long)(s - i_strpos)));
671 t = HOP3c(s, -prog->anchored_offset, strbeg);
672 other_last = HOP3c(s, 1, strend);
680 else { /* Take into account the floating substring. */
685 t = HOP3c(s, -start_shift, strbeg);
687 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
688 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
689 last = HOP3c(t, prog->float_max_offset, strend);
690 s = HOP3c(t, prog->float_min_offset, strend);
693 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
694 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
695 /* fbm_instr() takes into account exact value of end-of-str
696 if the check is SvTAIL(ed). Since false positives are OK,
697 and end-of-str is not later than strend we are OK. */
698 if (must == &PL_sv_undef) {
700 DEBUG_r(must = prog->float_utf8); /* for debug message */
703 s = fbm_instr((unsigned char*)s,
704 (unsigned char*)last + SvCUR(must)
706 must, PL_multiline ? FBMrf_MULTILINE : 0);
707 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint */
708 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
709 (s ? "Found" : "Contradicts"),
711 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
713 PL_colors[1], (SvTAIL(must) ? "$" : "")));
716 DEBUG_r(PerlIO_printf(Perl_debug_log,
717 ", giving up...\n"));
720 DEBUG_r(PerlIO_printf(Perl_debug_log,
721 ", trying anchored starting at offset %ld...\n",
722 (long)(s1 + 1 - i_strpos)));
724 s = HOP3c(t, 1, strend);
728 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
729 (long)(s - i_strpos)));
730 other_last = s; /* Fix this later. --Hugo */
739 t = s - prog->check_offset_max;
740 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
742 || ((t = reghopmaybe3_c(s, -prog->check_offset_max, strpos))
744 /* Fixed substring is found far enough so that the match
745 cannot start at strpos. */
747 if (ml_anch && t[-1] != '\n') {
748 /* Eventually fbm_*() should handle this, but often
749 anchored_offset is not 0, so this check will not be wasted. */
750 /* XXXX In the code below we prefer to look for "^" even in
751 presence of anchored substrings. And we search even
752 beyond the found float position. These pessimizations
753 are historical artefacts only. */
755 while (t < strend - prog->minlen) {
757 if (t < check_at - prog->check_offset_min) {
758 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
759 /* Since we moved from the found position,
760 we definitely contradict the found anchored
761 substr. Due to the above check we do not
762 contradict "check" substr.
763 Thus we can arrive here only if check substr
764 is float. Redo checking for "other"=="fixed".
767 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
768 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
769 goto do_other_anchored;
771 /* We don't contradict the found floating substring. */
772 /* XXXX Why not check for STCLASS? */
774 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
775 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
778 /* Position contradicts check-string */
779 /* XXXX probably better to look for check-string
780 than for "\n", so one should lower the limit for t? */
781 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
782 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
783 other_last = strpos = s = t + 1;
788 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
789 PL_colors[0],PL_colors[1]));
793 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
794 PL_colors[0],PL_colors[1]));
798 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
801 /* The found string does not prohibit matching at strpos,
802 - no optimization of calling REx engine can be performed,
803 unless it was an MBOL and we are not after MBOL,
804 or a future STCLASS check will fail this. */
806 /* Even in this situation we may use MBOL flag if strpos is offset
807 wrt the start of the string. */
808 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
809 && (strpos != strbeg) && strpos[-1] != '\n'
810 /* May be due to an implicit anchor of m{.*foo} */
811 && !(prog->reganch & ROPT_IMPLICIT))
816 DEBUG_r( if (ml_anch)
817 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
818 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
821 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
823 prog->check_utf8 /* Could be deleted already */
824 && --BmUSEFUL(prog->check_utf8) < 0
825 && (prog->check_utf8 == prog->float_utf8)
827 prog->check_substr /* Could be deleted already */
828 && --BmUSEFUL(prog->check_substr) < 0
829 && (prog->check_substr == prog->float_substr)
832 /* If flags & SOMETHING - do not do it many times on the same match */
833 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
834 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
835 if (do_utf8 ? prog->check_substr : prog->check_utf8)
836 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
837 prog->check_substr = prog->check_utf8 = Nullsv; /* disable */
838 prog->float_substr = prog->float_utf8 = Nullsv; /* clear */
839 check = Nullsv; /* abort */
841 /* XXXX This is a remnant of the old implementation. It
842 looks wasteful, since now INTUIT can use many
844 prog->reganch &= ~RE_USE_INTUIT;
851 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
852 if (prog->regstclass) {
853 /* minlen == 0 is possible if regstclass is \b or \B,
854 and the fixed substr is ''$.
855 Since minlen is already taken into account, s+1 is before strend;
856 accidentally, minlen >= 1 guaranties no false positives at s + 1
857 even for \b or \B. But (minlen? 1 : 0) below assumes that
858 regstclass does not come from lookahead... */
859 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
860 This leaves EXACTF only, which is dealt with in find_byclass(). */
861 const U8* const str = (U8*)STRING(prog->regstclass);
862 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
863 ? CHR_DIST((U8 *)str+STR_LEN(prog->regstclass),
866 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
867 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
868 : (prog->float_substr || prog->float_utf8
869 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
875 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
878 const char *what = 0;
880 if (endpos == strend) {
881 DEBUG_r( PerlIO_printf(Perl_debug_log,
882 "Could not match STCLASS...\n") );
885 DEBUG_r( PerlIO_printf(Perl_debug_log,
886 "This position contradicts STCLASS...\n") );
887 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
889 /* Contradict one of substrings */
890 if (prog->anchored_substr || prog->anchored_utf8) {
891 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
892 DEBUG_r( what = "anchored" );
894 s = HOP3c(t, 1, strend);
895 if (s + start_shift + end_shift > strend) {
896 /* XXXX Should be taken into account earlier? */
897 DEBUG_r( PerlIO_printf(Perl_debug_log,
898 "Could not match STCLASS...\n") );
903 DEBUG_r( PerlIO_printf(Perl_debug_log,
904 "Looking for %s substr starting at offset %ld...\n",
905 what, (long)(s + start_shift - i_strpos)) );
908 /* Have both, check_string is floating */
909 if (t + start_shift >= check_at) /* Contradicts floating=check */
910 goto retry_floating_check;
911 /* Recheck anchored substring, but not floating... */
915 DEBUG_r( PerlIO_printf(Perl_debug_log,
916 "Looking for anchored substr starting at offset %ld...\n",
917 (long)(other_last - i_strpos)) );
918 goto do_other_anchored;
920 /* Another way we could have checked stclass at the
921 current position only: */
926 DEBUG_r( PerlIO_printf(Perl_debug_log,
927 "Looking for /%s^%s/m starting at offset %ld...\n",
928 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
931 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
933 /* Check is floating subtring. */
934 retry_floating_check:
935 t = check_at - start_shift;
936 DEBUG_r( what = "floating" );
937 goto hop_and_restart;
940 DEBUG_r(PerlIO_printf(Perl_debug_log,
941 "By STCLASS: moving %ld --> %ld\n",
942 (long)(t - i_strpos), (long)(s - i_strpos))
946 DEBUG_r(PerlIO_printf(Perl_debug_log,
947 "Does not contradict STCLASS...\n");
952 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
953 PL_colors[4], (check ? "Guessed" : "Giving up"),
954 PL_colors[5], (long)(s - i_strpos)) );
957 fail_finish: /* Substring not found */
958 if (prog->check_substr || prog->check_utf8) /* could be removed already */
959 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
961 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
962 PL_colors[4],PL_colors[5]));
966 /* We know what class REx starts with. Try to find this position... */
968 S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, const char *strend, I32 norun)
970 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
974 register STRLEN uskip;
978 register I32 tmp = 1; /* Scratch variable? */
979 register const bool do_utf8 = PL_reg_match_utf8;
981 /* We know what class it must start with. */
985 while (s + (uskip = UTF8SKIP(s)) <= strend) {
986 if ((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
987 !UTF8_IS_INVARIANT((U8)s[0]) ?
988 reginclass(c, (U8*)s, 0, do_utf8) :
989 REGINCLASS(c, (U8*)s)) {
990 if (tmp && (norun || regtry(prog, s)))
1001 while (s < strend) {
1004 if (REGINCLASS(c, (U8*)s) ||
1005 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1006 /* The assignment of 2 is intentional:
1007 * for the folded sharp s, the skip is 2. */
1008 (skip = SHARP_S_SKIP))) {
1009 if (tmp && (norun || regtry(prog, s)))
1021 while (s < strend) {
1022 if (tmp && (norun || regtry(prog, s)))
1031 ln = STR_LEN(c); /* length to match in octets/bytes */
1032 lnc = (I32) ln; /* length to match in characters */
1034 STRLEN ulen1, ulen2;
1036 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1037 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1038 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1040 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1041 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1043 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1045 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1048 while (sm < ((U8 *) m + ln)) {
1063 c2 = PL_fold_locale[c1];
1065 e = HOP3c(strend, -((I32)lnc), s);
1068 e = s; /* Due to minlen logic of intuit() */
1070 /* The idea in the EXACTF* cases is to first find the
1071 * first character of the EXACTF* node and then, if
1072 * necessary, case-insensitively compare the full
1073 * text of the node. The c1 and c2 are the first
1074 * characters (though in Unicode it gets a bit
1075 * more complicated because there are more cases
1076 * than just upper and lower: one needs to use
1077 * the so-called folding case for case-insensitive
1078 * matching (called "loose matching" in Unicode).
1079 * ibcmp_utf8() will do just that. */
1083 U8 tmpbuf [UTF8_MAXBYTES+1];
1084 STRLEN len, foldlen;
1085 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1087 /* Upper and lower of 1st char are equal -
1088 * probably not a "letter". */
1090 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1094 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1095 m, (char **)0, ln, (bool)UTF))
1096 && (norun || regtry(prog, s)) )
1099 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1100 uvchr_to_utf8(tmpbuf, c);
1101 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1103 && (f == c1 || f == c2)
1104 && (ln == foldlen ||
1105 !ibcmp_utf8((char *) foldbuf,
1106 (char **)0, foldlen, do_utf8,
1108 (char **)0, ln, (bool)UTF))
1109 && (norun || regtry(prog, s)) )
1117 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1120 /* Handle some of the three Greek sigmas cases.
1121 * Note that not all the possible combinations
1122 * are handled here: some of them are handled
1123 * by the standard folding rules, and some of
1124 * them (the character class or ANYOF cases)
1125 * are handled during compiletime in
1126 * regexec.c:S_regclass(). */
1127 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1128 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1129 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1131 if ( (c == c1 || c == c2)
1133 ibcmp_utf8(s, (char **)0, 0, do_utf8,
1134 m, (char **)0, ln, (bool)UTF))
1135 && (norun || regtry(prog, s)) )
1138 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
1139 uvchr_to_utf8(tmpbuf, c);
1140 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen);
1142 && (f == c1 || f == c2)
1143 && (ln == foldlen ||
1144 !ibcmp_utf8((char *) foldbuf,
1145 (char **)0, foldlen, do_utf8,
1147 (char **)0, ln, (bool)UTF))
1148 && (norun || regtry(prog, s)) )
1159 && (ln == 1 || !(OP(c) == EXACTF
1161 : ibcmp_locale(s, m, ln)))
1162 && (norun || regtry(prog, s)) )
1168 if ( (*(U8*)s == c1 || *(U8*)s == c2)
1169 && (ln == 1 || !(OP(c) == EXACTF
1171 : ibcmp_locale(s, m, ln)))
1172 && (norun || regtry(prog, s)) )
1179 PL_reg_flags |= RF_tainted;
1186 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1188 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1190 tmp = ((OP(c) == BOUND ?
1191 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1192 LOAD_UTF8_CHARCLASS_ALNUM();
1193 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1194 if (tmp == !(OP(c) == BOUND ?
1195 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1196 isALNUM_LC_utf8((U8*)s)))
1199 if ((norun || regtry(prog, s)))
1206 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1207 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1208 while (s < strend) {
1210 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1212 if ((norun || regtry(prog, s)))
1218 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1222 PL_reg_flags |= RF_tainted;
1229 U8 *r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1231 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1233 tmp = ((OP(c) == NBOUND ?
1234 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1235 LOAD_UTF8_CHARCLASS_ALNUM();
1236 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1237 if (tmp == !(OP(c) == NBOUND ?
1238 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1239 isALNUM_LC_utf8((U8*)s)))
1241 else if ((norun || regtry(prog, s)))
1247 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1248 tmp = ((OP(c) == NBOUND ?
1249 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1250 while (s < strend) {
1252 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1254 else if ((norun || regtry(prog, s)))
1259 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1264 LOAD_UTF8_CHARCLASS_ALNUM();
1265 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1266 if (swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1267 if (tmp && (norun || regtry(prog, s)))
1278 while (s < strend) {
1280 if (tmp && (norun || regtry(prog, s)))
1292 PL_reg_flags |= RF_tainted;
1294 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1295 if (isALNUM_LC_utf8((U8*)s)) {
1296 if (tmp && (norun || regtry(prog, s)))
1307 while (s < strend) {
1308 if (isALNUM_LC(*s)) {
1309 if (tmp && (norun || regtry(prog, s)))
1322 LOAD_UTF8_CHARCLASS_ALNUM();
1323 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1324 if (!swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) {
1325 if (tmp && (norun || regtry(prog, s)))
1336 while (s < strend) {
1338 if (tmp && (norun || regtry(prog, s)))
1350 PL_reg_flags |= RF_tainted;
1352 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1353 if (!isALNUM_LC_utf8((U8*)s)) {
1354 if (tmp && (norun || regtry(prog, s)))
1365 while (s < strend) {
1366 if (!isALNUM_LC(*s)) {
1367 if (tmp && (norun || regtry(prog, s)))
1380 LOAD_UTF8_CHARCLASS_SPACE();
1381 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1382 if (*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)) {
1383 if (tmp && (norun || regtry(prog, s)))
1394 while (s < strend) {
1396 if (tmp && (norun || regtry(prog, s)))
1408 PL_reg_flags |= RF_tainted;
1410 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1411 if (*s == ' ' || isSPACE_LC_utf8((U8*)s)) {
1412 if (tmp && (norun || regtry(prog, s)))
1423 while (s < strend) {
1424 if (isSPACE_LC(*s)) {
1425 if (tmp && (norun || regtry(prog, s)))
1438 LOAD_UTF8_CHARCLASS_SPACE();
1439 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1440 if (!(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8))) {
1441 if (tmp && (norun || regtry(prog, s)))
1452 while (s < strend) {
1454 if (tmp && (norun || regtry(prog, s)))
1466 PL_reg_flags |= RF_tainted;
1468 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1469 if (!(*s == ' ' || isSPACE_LC_utf8((U8*)s))) {
1470 if (tmp && (norun || regtry(prog, s)))
1481 while (s < strend) {
1482 if (!isSPACE_LC(*s)) {
1483 if (tmp && (norun || regtry(prog, s)))
1496 LOAD_UTF8_CHARCLASS_DIGIT();
1497 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1498 if (swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1499 if (tmp && (norun || regtry(prog, s)))
1510 while (s < strend) {
1512 if (tmp && (norun || regtry(prog, s)))
1524 PL_reg_flags |= RF_tainted;
1526 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1527 if (isDIGIT_LC_utf8((U8*)s)) {
1528 if (tmp && (norun || regtry(prog, s)))
1539 while (s < strend) {
1540 if (isDIGIT_LC(*s)) {
1541 if (tmp && (norun || regtry(prog, s)))
1554 LOAD_UTF8_CHARCLASS_DIGIT();
1555 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1556 if (!swash_fetch(PL_utf8_digit,(U8*)s, do_utf8)) {
1557 if (tmp && (norun || regtry(prog, s)))
1568 while (s < strend) {
1570 if (tmp && (norun || regtry(prog, s)))
1582 PL_reg_flags |= RF_tainted;
1584 while (s + (uskip = UTF8SKIP(s)) <= strend) {
1585 if (!isDIGIT_LC_utf8((U8*)s)) {
1586 if (tmp && (norun || regtry(prog, s)))
1597 while (s < strend) {
1598 if (!isDIGIT_LC(*s)) {
1599 if (tmp && (norun || regtry(prog, s)))
1611 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1620 - regexec_flags - match a regexp against a string
1623 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1624 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1625 /* strend: pointer to null at end of string */
1626 /* strbeg: real beginning of string */
1627 /* minend: end of match must be >=minend after stringarg. */
1628 /* data: May be used for some additional optimizations. */
1629 /* nosave: For optimizations. */
1632 register regnode *c;
1633 register char *startpos = stringarg;
1634 I32 minlen; /* must match at least this many chars */
1635 I32 dontbother = 0; /* how many characters not to try at end */
1636 I32 end_shift = 0; /* Same for the end. */ /* CC */
1637 I32 scream_pos = -1; /* Internal iterator of scream. */
1639 SV* oreplsv = GvSV(PL_replgv);
1640 const bool do_utf8 = DO_UTF8(sv);
1642 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
1643 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
1645 PERL_UNUSED_ARG(data);
1646 RX_MATCH_UTF8_set(prog,do_utf8);
1652 PL_regnarrate = DEBUG_r_TEST;
1655 /* Be paranoid... */
1656 if (prog == NULL || startpos == NULL) {
1657 Perl_croak(aTHX_ "NULL regexp parameter");
1661 minlen = prog->minlen;
1662 if (strend - startpos < minlen) {
1663 DEBUG_r(PerlIO_printf(Perl_debug_log,
1664 "String too short [regexec_flags]...\n"));
1668 /* Check validity of program. */
1669 if (UCHARAT(prog->program) != REG_MAGIC) {
1670 Perl_croak(aTHX_ "corrupted regexp program");
1674 PL_reg_eval_set = 0;
1677 if (prog->reganch & ROPT_UTF8)
1678 PL_reg_flags |= RF_utf8;
1680 /* Mark beginning of line for ^ and lookbehind. */
1681 PL_regbol = startpos;
1685 /* Mark end of line for $ (and such) */
1688 /* see how far we have to get to not match where we matched before */
1689 PL_regtill = startpos+minend;
1691 /* We start without call_cc context. */
1694 /* If there is a "must appear" string, look for it. */
1697 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1700 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1701 PL_reg_ganch = startpos;
1702 else if (sv && SvTYPE(sv) >= SVt_PVMG
1704 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1705 && mg->mg_len >= 0) {
1706 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1707 if (prog->reganch & ROPT_ANCH_GPOS) {
1708 if (s > PL_reg_ganch)
1713 else /* pos() not defined */
1714 PL_reg_ganch = strbeg;
1717 if (!(flags & REXEC_CHECKED) && (prog->check_substr != Nullsv || prog->check_utf8 != Nullsv)) {
1718 re_scream_pos_data d;
1720 d.scream_olds = &scream_olds;
1721 d.scream_pos = &scream_pos;
1722 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1724 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1725 goto phooey; /* not present */
1730 const char * const s0 = UTF
1731 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1734 const int len0 = UTF ? SvCUR(dsv0) : prog->prelen;
1735 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1736 UNI_DISPLAY_REGEX) : startpos;
1737 const int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos;
1740 PerlIO_printf(Perl_debug_log,
1741 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1742 PL_colors[4],PL_colors[5],PL_colors[0],
1745 len0 > 60 ? "..." : "",
1747 (int)(len1 > 60 ? 60 : len1),
1749 (len1 > 60 ? "..." : "")
1753 /* Simplest case: anchored match need be tried only once. */
1754 /* [unless only anchor is BOL and multiline is set] */
1755 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1756 if (s == startpos && regtry(prog, startpos))
1758 else if (PL_multiline || (prog->reganch & ROPT_IMPLICIT)
1759 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1764 dontbother = minlen - 1;
1765 end = HOP3c(strend, -dontbother, strbeg) - 1;
1766 /* for multiline we only have to try after newlines */
1767 if (prog->check_substr || prog->check_utf8) {
1771 if (regtry(prog, s))
1776 if (prog->reganch & RE_USE_INTUIT) {
1777 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1788 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1789 if (regtry(prog, s))
1796 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1797 if (regtry(prog, PL_reg_ganch))
1802 /* Messy cases: unanchored match. */
1803 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1804 /* we have /x+whatever/ */
1805 /* it must be a one character string (XXXX Except UTF?) */
1810 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1811 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1812 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1815 while (s < strend) {
1817 DEBUG_r( did_match = 1 );
1818 if (regtry(prog, s)) goto got_it;
1820 while (s < strend && *s == ch)
1827 while (s < strend) {
1829 DEBUG_r( did_match = 1 );
1830 if (regtry(prog, s)) goto got_it;
1832 while (s < strend && *s == ch)
1838 DEBUG_r(if (!did_match)
1839 PerlIO_printf(Perl_debug_log,
1840 "Did not find anchored character...\n")
1843 else if (prog->anchored_substr != Nullsv
1844 || prog->anchored_utf8 != Nullsv
1845 || ((prog->float_substr != Nullsv || prog->float_utf8 != Nullsv)
1846 && prog->float_max_offset < strend - s)) {
1851 char *last1; /* Last position checked before */
1855 if (prog->anchored_substr || prog->anchored_utf8) {
1856 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1857 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1858 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1859 back_max = back_min = prog->anchored_offset;
1861 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1862 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1863 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1864 back_max = prog->float_max_offset;
1865 back_min = prog->float_min_offset;
1867 if (must == &PL_sv_undef)
1868 /* could not downgrade utf8 check substring, so must fail */
1871 last = HOP3c(strend, /* Cannot start after this */
1872 -(I32)(CHR_SVLEN(must)
1873 - (SvTAIL(must) != 0) + back_min), strbeg);
1876 last1 = HOPc(s, -1);
1878 last1 = s - 1; /* bogus */
1880 /* XXXX check_substr already used to find "s", can optimize if
1881 check_substr==must. */
1883 dontbother = end_shift;
1884 strend = HOPc(strend, -dontbother);
1885 while ( (s <= last) &&
1886 ((flags & REXEC_SCREAM)
1887 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1888 end_shift, &scream_pos, 0))
1889 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1890 (unsigned char*)strend, must,
1891 PL_multiline ? FBMrf_MULTILINE : 0))) ) {
1892 /* we may be pointing at the wrong string */
1893 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1894 s = strbeg + (s - SvPVX_const(sv));
1895 DEBUG_r( did_match = 1 );
1896 if (HOPc(s, -back_max) > last1) {
1897 last1 = HOPc(s, -back_min);
1898 s = HOPc(s, -back_max);
1901 char *t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1903 last1 = HOPc(s, -back_min);
1907 while (s <= last1) {
1908 if (regtry(prog, s))
1914 while (s <= last1) {
1915 if (regtry(prog, s))
1921 DEBUG_r(if (!did_match)
1922 PerlIO_printf(Perl_debug_log,
1923 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1924 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1925 ? "anchored" : "floating"),
1927 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1929 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1933 else if ((c = prog->regstclass)) {
1935 I32 op = (U8)OP(prog->regstclass);
1936 /* don't bother with what can't match */
1937 if (PL_regkind[op] != EXACT && op != CANY)
1938 strend = HOPc(strend, -(minlen - 1));
1941 SV *prop = sv_newmortal();
1949 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1950 UNI_DISPLAY_REGEX) :
1952 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1954 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1955 len1 = UTF ? SvCUR(dsv1) : strend - s;
1956 PerlIO_printf(Perl_debug_log,
1957 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1961 if (find_byclass(prog, c, s, strend, 0))
1963 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1967 if (prog->float_substr != Nullsv || prog->float_utf8 != Nullsv) {
1972 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1973 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1974 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1976 if (flags & REXEC_SCREAM) {
1977 last = screaminstr(sv, float_real, s - strbeg,
1978 end_shift, &scream_pos, 1); /* last one */
1980 last = scream_olds; /* Only one occurrence. */
1981 /* we may be pointing at the wrong string */
1982 else if (RX_MATCH_COPIED(prog))
1983 s = strbeg + (s - SvPVX_const(sv));
1987 const char * const little = SvPV_const(float_real, len);
1989 if (SvTAIL(float_real)) {
1990 if (memEQ(strend - len + 1, little, len - 1))
1991 last = strend - len + 1;
1992 else if (!PL_multiline)
1993 last = memEQ(strend - len, little, len)
1994 ? strend - len : Nullch;
2000 last = rninstr(s, strend, little, little + len);
2002 last = strend; /* matching "$" */
2006 DEBUG_r(PerlIO_printf(Perl_debug_log,
2007 "%sCan't trim the tail, match fails (should not happen)%s\n",
2008 PL_colors[4],PL_colors[5]));
2009 goto phooey; /* Should not happen! */
2011 dontbother = strend - last + prog->float_min_offset;
2013 if (minlen && (dontbother < minlen))
2014 dontbother = minlen - 1;
2015 strend -= dontbother; /* this one's always in bytes! */
2016 /* We don't know much -- general case. */
2019 if (regtry(prog, s))
2028 if (regtry(prog, s))
2030 } while (s++ < strend);
2038 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
2040 if (PL_reg_eval_set) {
2041 /* Preserve the current value of $^R */
2042 if (oreplsv != GvSV(PL_replgv))
2043 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
2044 restored, the value remains
2046 restore_pos(aTHX_ 0);
2049 /* make sure $`, $&, $', and $digit will work later */
2050 if ( !(flags & REXEC_NOT_FIRST) ) {
2051 if (RX_MATCH_COPIED(prog)) {
2052 Safefree(prog->subbeg);
2053 RX_MATCH_COPIED_off(prog);
2055 if (flags & REXEC_COPY_STR) {
2056 I32 i = PL_regeol - startpos + (stringarg - strbeg);
2058 s = savepvn(strbeg, i);
2061 RX_MATCH_COPIED_on(prog);
2064 prog->subbeg = strbeg;
2065 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2072 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2073 PL_colors[4],PL_colors[5]));
2074 if (PL_reg_eval_set)
2075 restore_pos(aTHX_ 0);
2080 - regtry - try match at specific point
2082 STATIC I32 /* 0 failure, 1 success */
2083 S_regtry(pTHX_ regexp *prog, char *startpos)
2091 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
2093 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
2096 PL_reg_eval_set = RS_init;
2098 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2099 (IV)(PL_stack_sp - PL_stack_base));
2101 SAVEI32(cxstack[cxstack_ix].blk_oldsp);
2102 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2103 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2105 /* Apparently this is not needed, judging by wantarray. */
2106 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2107 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2110 /* Make $_ available to executed code. */
2111 if (PL_reg_sv != DEFSV) {
2112 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
2117 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
2118 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
2119 /* prepare for quick setting of pos */
2120 sv_magic(PL_reg_sv, (SV*)0,
2121 PERL_MAGIC_regex_global, Nullch, 0);
2122 mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global);
2126 PL_reg_oldpos = mg->mg_len;
2127 SAVEDESTRUCTOR_X(restore_pos, 0);
2129 if (!PL_reg_curpm) {
2130 Newxz(PL_reg_curpm, 1, PMOP);
2133 SV* repointer = newSViv(0);
2134 /* so we know which PL_regex_padav element is PL_reg_curpm */
2135 SvFLAGS(repointer) |= SVf_BREAK;
2136 av_push(PL_regex_padav,repointer);
2137 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2138 PL_regex_pad = AvARRAY(PL_regex_padav);
2142 PM_SETRE(PL_reg_curpm, prog);
2143 PL_reg_oldcurpm = PL_curpm;
2144 PL_curpm = PL_reg_curpm;
2145 if (RX_MATCH_COPIED(prog)) {
2146 /* Here is a serious problem: we cannot rewrite subbeg,
2147 since it may be needed if this match fails. Thus
2148 $` inside (?{}) could fail... */
2149 PL_reg_oldsaved = prog->subbeg;
2150 PL_reg_oldsavedlen = prog->sublen;
2151 RX_MATCH_COPIED_off(prog);
2154 PL_reg_oldsaved = Nullch;
2155 prog->subbeg = PL_bostr;
2156 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2158 prog->startp[0] = startpos - PL_bostr;
2159 PL_reginput = startpos;
2160 PL_regstartp = prog->startp;
2161 PL_regendp = prog->endp;
2162 PL_reglastparen = &prog->lastparen;
2163 PL_reglastcloseparen = &prog->lastcloseparen;
2164 prog->lastparen = 0;
2165 prog->lastcloseparen = 0;
2167 DEBUG_r(PL_reg_starttry = startpos);
2168 if (PL_reg_start_tmpl <= prog->nparens) {
2169 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2170 if(PL_reg_start_tmp)
2171 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2173 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2176 /* XXXX What this code is doing here?!!! There should be no need
2177 to do this again and again, PL_reglastparen should take care of
2180 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2181 * Actually, the code in regcppop() (which Ilya may be meaning by
2182 * PL_reglastparen), is not needed at all by the test suite
2183 * (op/regexp, op/pat, op/split), but that code is needed, oddly
2184 * enough, for building DynaLoader, or otherwise this
2185 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
2186 * will happen. Meanwhile, this code *is* needed for the
2187 * above-mentioned test suite tests to succeed. The common theme
2188 * on those tests seems to be returning null fields from matches.
2193 if (prog->nparens) {
2194 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2201 if (regmatch(prog->program + 1)) {
2202 prog->endp[0] = PL_reginput - PL_bostr;
2205 REGCP_UNWIND(lastcp);
2209 #define RE_UNWIND_BRANCH 1
2210 #define RE_UNWIND_BRANCHJ 2
2214 typedef struct { /* XX: makes sense to enlarge it... */
2218 } re_unwind_generic_t;
2231 } re_unwind_branch_t;
2233 typedef union re_unwind_t {
2235 re_unwind_generic_t generic;
2236 re_unwind_branch_t branch;
2239 #define sayYES goto yes
2240 #define sayNO goto no
2241 #define sayNO_ANYOF goto no_anyof
2242 #define sayYES_FINAL goto yes_final
2243 #define sayYES_LOUD goto yes_loud
2244 #define sayNO_FINAL goto no_final
2245 #define sayNO_SILENT goto do_no
2246 #define saySAME(x) if (x) goto yes; else goto no
2248 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
2249 #define POSCACHE_SEEN 1 /* we know what we're caching */
2250 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
2251 #define CACHEsayYES STMT_START { \
2252 if (cache_offset | cache_bit) { \
2253 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2254 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
2255 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2256 /* cache records failure, but this is success */ \
2258 PerlIO_printf(Perl_debug_log, \
2259 "%*s (remove success from failure cache)\n", \
2260 REPORT_CODE_OFF+PL_regindent*2, "") \
2262 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2267 #define CACHEsayNO STMT_START { \
2268 if (cache_offset | cache_bit) { \
2269 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2270 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2271 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2272 /* cache records success, but this is failure */ \
2274 PerlIO_printf(Perl_debug_log, \
2275 "%*s (remove failure from success cache)\n", \
2276 REPORT_CODE_OFF+PL_regindent*2, "") \
2278 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2284 #define REPORT_CODE_OFF 24
2287 - regmatch - main matching routine
2289 * Conceptually the strategy is simple: check to see whether the current
2290 * node matches, call self recursively to see whether the rest matches,
2291 * and then act accordingly. In practice we make some effort to avoid
2292 * recursion, in particular by going through "ordinary" nodes (that don't
2293 * need to know whether the rest of the match failed) by a loop instead of
2296 /* [lwall] I've hoisted the register declarations to the outer block in order to
2297 * maybe save a little bit of pushing and popping on the stack. It also takes
2298 * advantage of machines that use a register save mask on subroutine entry.
2300 STATIC I32 /* 0 failure, 1 success */
2301 S_regmatch(pTHX_ regnode *prog)
2303 register regnode *scan; /* Current node. */
2304 regnode *next; /* Next node. */
2305 regnode *inner; /* Next node in internal branch. */
2306 register I32 nextchr; /* renamed nextchr - nextchar colides with
2307 function of same name */
2308 register I32 n; /* no or next */
2309 register I32 ln = 0; /* len or last */
2310 register char *s = Nullch; /* operand or save */
2311 register char *locinput = PL_reginput;
2312 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2313 int minmod = 0, sw = 0, logical = 0;
2316 I32 firstcp = PL_savestack_ix;
2318 const register bool do_utf8 = PL_reg_match_utf8;
2320 SV *dsv0 = PERL_DEBUG_PAD_ZERO(0);
2321 SV *dsv1 = PERL_DEBUG_PAD_ZERO(1);
2322 SV *dsv2 = PERL_DEBUG_PAD_ZERO(2);
2324 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
2330 /* Note that nextchr is a byte even in UTF */
2331 nextchr = UCHARAT(locinput);
2333 while (scan != NULL) {
2336 SV *prop = sv_newmortal();
2337 const int docolor = *PL_colors[0];
2338 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2339 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2340 /* The part of the string before starttry has one color
2341 (pref0_len chars), between starttry and current
2342 position another one (pref_len - pref0_len chars),
2343 after the current position the third one.
2344 We assume that pref0_len <= pref_len, otherwise we
2345 decrease pref0_len. */
2346 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2347 ? (5 + taill) - l : locinput - PL_bostr;
2350 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2352 pref0_len = pref_len - (locinput - PL_reg_starttry);
2353 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2354 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2355 ? (5 + taill) - pref_len : PL_regeol - locinput);
2356 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2360 if (pref0_len > pref_len)
2361 pref0_len = pref_len;
2362 regprop(prop, scan);
2364 const char * const s0 =
2365 do_utf8 && OP(scan) != CANY ?
2366 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2367 pref0_len, 60, UNI_DISPLAY_REGEX) :
2368 locinput - pref_len;
2369 const int len0 = do_utf8 ? strlen(s0) : pref0_len;
2370 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2371 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2372 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2373 locinput - pref_len + pref0_len;
2374 const int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
2375 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2376 pv_uni_display(dsv2, (U8*)locinput,
2377 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2379 const int len2 = do_utf8 ? strlen(s2) : l;
2380 PerlIO_printf(Perl_debug_log,
2381 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2382 (IV)(locinput - PL_bostr),
2389 (docolor ? "" : "> <"),
2393 15 - l - pref_len + 1,
2395 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2400 next = scan + NEXT_OFF(scan);
2406 if (locinput == PL_bostr || (PL_multiline &&
2407 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2409 /* regtill = regbol; */
2414 if (locinput == PL_bostr ||
2415 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2421 if (locinput == PL_bostr)
2425 if (locinput == PL_reg_ganch)
2435 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2440 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2442 if (PL_regeol - locinput > 1)
2446 if (PL_regeol != locinput)
2450 if (!nextchr && locinput >= PL_regeol)
2453 locinput += PL_utf8skip[nextchr];
2454 if (locinput > PL_regeol)
2456 nextchr = UCHARAT(locinput);
2459 nextchr = UCHARAT(++locinput);
2462 if (!nextchr && locinput >= PL_regeol)
2464 nextchr = UCHARAT(++locinput);
2467 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2470 locinput += PL_utf8skip[nextchr];
2471 if (locinput > PL_regeol)
2473 nextchr = UCHARAT(locinput);
2476 nextchr = UCHARAT(++locinput);
2481 if (do_utf8 != UTF) {
2482 /* The target and the pattern have differing utf8ness. */
2484 const char *e = s + ln;
2487 /* The target is utf8, the pattern is not utf8. */
2492 if (NATIVE_TO_UNI(*(U8*)s) !=
2493 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2501 /* The target is not utf8, the pattern is utf8. */
2506 if (NATIVE_TO_UNI(*((U8*)l)) !=
2507 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2515 nextchr = UCHARAT(locinput);
2518 /* The target and the pattern have the same utf8ness. */
2519 /* Inline the first character, for speed. */
2520 if (UCHARAT(s) != nextchr)
2522 if (PL_regeol - locinput < ln)
2524 if (ln > 1 && memNE(s, locinput, ln))
2527 nextchr = UCHARAT(locinput);
2530 PL_reg_flags |= RF_tainted;
2536 if (do_utf8 || UTF) {
2537 /* Either target or the pattern are utf8. */
2539 char *e = PL_regeol;
2541 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2542 l, &e, 0, do_utf8)) {
2543 /* One more case for the sharp s:
2544 * pack("U0U*", 0xDF) =~ /ss/i,
2545 * the 0xC3 0x9F are the UTF-8
2546 * byte sequence for the U+00DF. */
2548 toLOWER(s[0]) == 's' &&
2550 toLOWER(s[1]) == 's' &&
2557 nextchr = UCHARAT(locinput);
2561 /* Neither the target and the pattern are utf8. */
2563 /* Inline the first character, for speed. */
2564 if (UCHARAT(s) != nextchr &&
2565 UCHARAT(s) != ((OP(scan) == EXACTF)
2566 ? PL_fold : PL_fold_locale)[nextchr])
2568 if (PL_regeol - locinput < ln)
2570 if (ln > 1 && (OP(scan) == EXACTF
2571 ? ibcmp(s, locinput, ln)
2572 : ibcmp_locale(s, locinput, ln)))
2575 nextchr = UCHARAT(locinput);
2579 STRLEN inclasslen = PL_regeol - locinput;
2581 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2583 if (locinput >= PL_regeol)
2585 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2586 nextchr = UCHARAT(locinput);
2591 nextchr = UCHARAT(locinput);
2592 if (!REGINCLASS(scan, (U8*)locinput))
2594 if (!nextchr && locinput >= PL_regeol)
2596 nextchr = UCHARAT(++locinput);
2600 /* If we might have the case of the German sharp s
2601 * in a casefolding Unicode character class. */
2603 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2604 locinput += SHARP_S_SKIP;
2605 nextchr = UCHARAT(locinput);
2611 PL_reg_flags |= RF_tainted;
2617 LOAD_UTF8_CHARCLASS_ALNUM();
2618 if (!(OP(scan) == ALNUM
2619 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2620 : isALNUM_LC_utf8((U8*)locinput)))
2624 locinput += PL_utf8skip[nextchr];
2625 nextchr = UCHARAT(locinput);
2628 if (!(OP(scan) == ALNUM
2629 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2631 nextchr = UCHARAT(++locinput);
2634 PL_reg_flags |= RF_tainted;
2637 if (!nextchr && locinput >= PL_regeol)
2640 LOAD_UTF8_CHARCLASS_ALNUM();
2641 if (OP(scan) == NALNUM
2642 ? swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2643 : isALNUM_LC_utf8((U8*)locinput))
2647 locinput += PL_utf8skip[nextchr];
2648 nextchr = UCHARAT(locinput);
2651 if (OP(scan) == NALNUM
2652 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2654 nextchr = UCHARAT(++locinput);
2658 PL_reg_flags |= RF_tainted;
2662 /* was last char in word? */
2664 if (locinput == PL_bostr)
2667 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2669 ln = utf8n_to_uvchr((U8 *)r, UTF8SKIP(r), 0, 0);
2671 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2672 ln = isALNUM_uni(ln);
2673 LOAD_UTF8_CHARCLASS_ALNUM();
2674 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2677 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2678 n = isALNUM_LC_utf8((U8*)locinput);
2682 ln = (locinput != PL_bostr) ?
2683 UCHARAT(locinput - 1) : '\n';
2684 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2686 n = isALNUM(nextchr);
2689 ln = isALNUM_LC(ln);
2690 n = isALNUM_LC(nextchr);
2693 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2694 OP(scan) == BOUNDL))
2698 PL_reg_flags |= RF_tainted;
2704 if (UTF8_IS_CONTINUED(nextchr)) {
2705 LOAD_UTF8_CHARCLASS_SPACE();
2706 if (!(OP(scan) == SPACE
2707 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2708 : isSPACE_LC_utf8((U8*)locinput)))
2712 locinput += PL_utf8skip[nextchr];
2713 nextchr = UCHARAT(locinput);
2716 if (!(OP(scan) == SPACE
2717 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2719 nextchr = UCHARAT(++locinput);
2722 if (!(OP(scan) == SPACE
2723 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2725 nextchr = UCHARAT(++locinput);
2729 PL_reg_flags |= RF_tainted;
2732 if (!nextchr && locinput >= PL_regeol)
2735 LOAD_UTF8_CHARCLASS_SPACE();
2736 if (OP(scan) == NSPACE
2737 ? swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2738 : isSPACE_LC_utf8((U8*)locinput))
2742 locinput += PL_utf8skip[nextchr];
2743 nextchr = UCHARAT(locinput);
2746 if (OP(scan) == NSPACE
2747 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2749 nextchr = UCHARAT(++locinput);
2752 PL_reg_flags |= RF_tainted;
2758 LOAD_UTF8_CHARCLASS_DIGIT();
2759 if (!(OP(scan) == DIGIT
2760 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2761 : isDIGIT_LC_utf8((U8*)locinput)))
2765 locinput += PL_utf8skip[nextchr];
2766 nextchr = UCHARAT(locinput);
2769 if (!(OP(scan) == DIGIT
2770 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2772 nextchr = UCHARAT(++locinput);
2775 PL_reg_flags |= RF_tainted;
2778 if (!nextchr && locinput >= PL_regeol)
2781 LOAD_UTF8_CHARCLASS_DIGIT();
2782 if (OP(scan) == NDIGIT
2783 ? swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2784 : isDIGIT_LC_utf8((U8*)locinput))
2788 locinput += PL_utf8skip[nextchr];
2789 nextchr = UCHARAT(locinput);
2792 if (OP(scan) == NDIGIT
2793 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2795 nextchr = UCHARAT(++locinput);
2798 if (locinput >= PL_regeol)
2801 LOAD_UTF8_CHARCLASS_MARK();
2802 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2804 locinput += PL_utf8skip[nextchr];
2805 while (locinput < PL_regeol &&
2806 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2807 locinput += UTF8SKIP(locinput);
2808 if (locinput > PL_regeol)
2813 nextchr = UCHARAT(locinput);
2816 PL_reg_flags |= RF_tainted;
2820 n = ARG(scan); /* which paren pair */
2821 ln = PL_regstartp[n];
2822 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2823 if ((I32)*PL_reglastparen < n || ln == -1)
2824 sayNO; /* Do not match unless seen CLOSEn. */
2825 if (ln == PL_regendp[n])
2829 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2831 const char *e = PL_bostr + PL_regendp[n];
2833 * Note that we can't do the "other character" lookup trick as
2834 * in the 8-bit case (no pun intended) because in Unicode we
2835 * have to map both upper and title case to lower case.
2837 if (OP(scan) == REFF) {
2839 STRLEN ulen1, ulen2;
2840 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
2841 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
2845 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2846 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2847 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2854 nextchr = UCHARAT(locinput);
2858 /* Inline the first character, for speed. */
2859 if (UCHARAT(s) != nextchr &&
2861 (UCHARAT(s) != ((OP(scan) == REFF
2862 ? PL_fold : PL_fold_locale)[nextchr]))))
2864 ln = PL_regendp[n] - ln;
2865 if (locinput + ln > PL_regeol)
2867 if (ln > 1 && (OP(scan) == REF
2868 ? memNE(s, locinput, ln)
2870 ? ibcmp(s, locinput, ln)
2871 : ibcmp_locale(s, locinput, ln))))
2874 nextchr = UCHARAT(locinput);
2885 OP_4tree *oop = PL_op;
2886 COP *ocurcop = PL_curcop;
2889 struct regexp *oreg = PL_reg_re;
2892 PL_op = (OP_4tree*)PL_regdata->data[n];
2893 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2894 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2895 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2899 CALLRUNOPS(aTHX); /* Scalar context. */
2902 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2910 PAD_RESTORE_LOCAL(old_comppad);
2911 PL_curcop = ocurcop;
2913 if (logical == 2) { /* Postponed subexpression. */
2915 MAGIC *mg = Null(MAGIC*);
2917 CHECKPOINT cp, lastcp;
2921 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2922 mg = mg_find(sv, PERL_MAGIC_qr);
2923 else if (SvSMAGICAL(ret)) {
2924 if (SvGMAGICAL(ret))
2925 sv_unmagic(ret, PERL_MAGIC_qr);
2927 mg = mg_find(ret, PERL_MAGIC_qr);
2931 re = (regexp *)mg->mg_obj;
2932 (void)ReREFCNT_inc(re);
2936 const char *t = SvPV_const(ret, len);
2938 char * const oprecomp = PL_regprecomp;
2939 const I32 osize = PL_regsize;
2940 const I32 onpar = PL_regnpar;
2943 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2944 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
2946 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2948 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2950 PL_regprecomp = oprecomp;
2955 PerlIO_printf(Perl_debug_log,
2956 "Entering embedded \"%s%.60s%s%s\"\n",
2960 (strlen(re->precomp) > 60 ? "..." : ""))
2963 state.prev = PL_reg_call_cc;
2964 state.cc = PL_regcc;
2965 state.re = PL_reg_re;
2969 cp = regcppush(0); /* Save *all* the positions. */
2972 state.ss = PL_savestack_ix;
2973 *PL_reglastparen = 0;
2974 *PL_reglastcloseparen = 0;
2975 PL_reg_call_cc = &state;
2976 PL_reginput = locinput;
2977 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2978 ((re->reganch & ROPT_UTF8) != 0);
2979 if (toggleutf) PL_reg_flags ^= RF_utf8;
2981 /* XXXX This is too dramatic a measure... */
2984 if (regmatch(re->program + 1)) {
2985 /* Even though we succeeded, we need to restore
2986 global variables, since we may be wrapped inside
2987 SUSPEND, thus the match may be not finished yet. */
2989 /* XXXX Do this only if SUSPENDed? */
2990 PL_reg_call_cc = state.prev;
2991 PL_regcc = state.cc;
2992 PL_reg_re = state.re;
2993 cache_re(PL_reg_re);
2994 if (toggleutf) PL_reg_flags ^= RF_utf8;
2996 /* XXXX This is too dramatic a measure... */
2999 /* These are needed even if not SUSPEND. */
3005 REGCP_UNWIND(lastcp);
3007 PL_reg_call_cc = state.prev;
3008 PL_regcc = state.cc;
3009 PL_reg_re = state.re;
3010 cache_re(PL_reg_re);
3011 if (toggleutf) PL_reg_flags ^= RF_utf8;
3013 /* XXXX This is too dramatic a measure... */
3023 sv_setsv(save_scalar(PL_replgv), ret);
3029 n = ARG(scan); /* which paren pair */
3030 PL_reg_start_tmp[n] = locinput;
3035 n = ARG(scan); /* which paren pair */
3036 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
3037 PL_regendp[n] = locinput - PL_bostr;
3038 if (n > (I32)*PL_reglastparen)
3039 *PL_reglastparen = n;
3040 *PL_reglastcloseparen = n;
3043 n = ARG(scan); /* which paren pair */
3044 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
3047 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3049 next = NEXTOPER(NEXTOPER(scan));
3051 next = scan + ARG(scan);
3052 if (OP(next) == IFTHEN) /* Fake one. */
3053 next = NEXTOPER(NEXTOPER(next));
3057 logical = scan->flags;
3059 /*******************************************************************
3060 PL_regcc contains infoblock about the innermost (...)* loop, and
3061 a pointer to the next outer infoblock.
3063 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
3065 1) After matching X, regnode for CURLYX is processed;
3067 2) This regnode creates infoblock on the stack, and calls
3068 regmatch() recursively with the starting point at WHILEM node;
3070 3) Each hit of WHILEM node tries to match A and Z (in the order
3071 depending on the current iteration, min/max of {min,max} and
3072 greediness). The information about where are nodes for "A"
3073 and "Z" is read from the infoblock, as is info on how many times "A"
3074 was already matched, and greediness.
3076 4) After A matches, the same WHILEM node is hit again.
3078 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
3079 of the same pair. Thus when WHILEM tries to match Z, it temporarily
3080 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
3081 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
3082 of the external loop.
3084 Currently present infoblocks form a tree with a stem formed by PL_curcc
3085 and whatever it mentions via ->next, and additional attached trees
3086 corresponding to temporarily unset infoblocks as in "5" above.
3088 In the following picture infoblocks for outer loop of
3089 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
3090 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
3091 infoblocks are drawn below the "reset" infoblock.
3093 In fact in the picture below we do not show failed matches for Z and T
3094 by WHILEM blocks. [We illustrate minimal matches, since for them it is
3095 more obvious *why* one needs to *temporary* unset infoblocks.]
3097 Matched REx position InfoBlocks Comment
3101 Y A)*?Z)*?T x <- O <- I
3102 YA )*?Z)*?T x <- O <- I
3103 YA A)*?Z)*?T x <- O <- I
3104 YAA )*?Z)*?T x <- O <- I
3105 YAA Z)*?T x <- O # Temporary unset I
3108 YAAZ Y(A)*?Z)*?T x <- O
3111 YAAZY (A)*?Z)*?T x <- O
3114 YAAZY A)*?Z)*?T x <- O <- I
3117 YAAZYA )*?Z)*?T x <- O <- I
3120 YAAZYA Z)*?T x <- O # Temporary unset I
3126 YAAZYAZ T x # Temporary unset O
3133 *******************************************************************/
3136 CHECKPOINT cp = PL_savestack_ix;
3137 /* No need to save/restore up to this paren */
3138 I32 parenfloor = scan->flags;
3140 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
3142 cc.oldcc = PL_regcc;
3144 /* XXXX Probably it is better to teach regpush to support
3145 parenfloor > PL_regsize... */
3146 if (parenfloor > (I32)*PL_reglastparen)
3147 parenfloor = *PL_reglastparen; /* Pessimization... */
3148 cc.parenfloor = parenfloor;
3150 cc.min = ARG1(scan);
3151 cc.max = ARG2(scan);
3152 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3156 PL_reginput = locinput;
3157 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
3159 PL_regcc = cc.oldcc;
3165 * This is really hard to understand, because after we match
3166 * what we're trying to match, we must make sure the rest of
3167 * the REx is going to match for sure, and to do that we have
3168 * to go back UP the parse tree by recursing ever deeper. And
3169 * if it fails, we have to reset our parent's current state
3170 * that we can try again after backing off.
3173 CHECKPOINT cp, lastcp;
3174 CURCUR* cc = PL_regcc;
3175 char *lastloc = cc->lastloc; /* Detection of 0-len. */
3176 I32 cache_offset = 0, cache_bit = 0;
3178 n = cc->cur + 1; /* how many we know we matched */
3179 PL_reginput = locinput;
3182 PerlIO_printf(Perl_debug_log,
3183 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
3184 REPORT_CODE_OFF+PL_regindent*2, "",
3185 (long)n, (long)cc->min,
3186 (long)cc->max, PTR2UV(cc))
3189 /* If degenerate scan matches "", assume scan done. */
3191 if (locinput == cc->lastloc && n >= cc->min) {
3192 PL_regcc = cc->oldcc;
3196 PerlIO_printf(Perl_debug_log,
3197 "%*s empty match detected, try continuation...\n",
3198 REPORT_CODE_OFF+PL_regindent*2, "")
3200 if (regmatch(cc->next))
3208 /* First just match a string of min scans. */
3212 cc->lastloc = locinput;
3213 if (regmatch(cc->scan))
3216 cc->lastloc = lastloc;
3221 /* Check whether we already were at this position.
3222 Postpone detection until we know the match is not
3223 *that* much linear. */
3224 if (!PL_reg_maxiter) {
3225 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
3226 PL_reg_leftiter = PL_reg_maxiter;
3228 if (PL_reg_leftiter-- == 0) {
3229 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3230 if (PL_reg_poscache) {
3231 if ((I32)PL_reg_poscache_size < size) {
3232 Renew(PL_reg_poscache, size, char);
3233 PL_reg_poscache_size = size;
3235 Zero(PL_reg_poscache, size, char);
3238 PL_reg_poscache_size = size;
3239 Newxz(PL_reg_poscache, size, char);
3242 PerlIO_printf(Perl_debug_log,
3243 "%sDetected a super-linear match, switching on caching%s...\n",
3244 PL_colors[4], PL_colors[5])
3247 if (PL_reg_leftiter < 0) {
3248 cache_offset = locinput - PL_bostr;
3250 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3251 + cache_offset * (scan->flags>>4);
3252 cache_bit = cache_offset % 8;
3254 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3256 PerlIO_printf(Perl_debug_log,
3257 "%*s already tried at this position...\n",
3258 REPORT_CODE_OFF+PL_regindent*2, "")
3260 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3261 /* cache records success */
3264 /* cache records failure */
3267 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3271 /* Prefer next over scan for minimal matching. */
3274 PL_regcc = cc->oldcc;
3277 cp = regcppush(cc->parenfloor);
3279 if (regmatch(cc->next)) {
3281 CACHEsayYES; /* All done. */
3283 REGCP_UNWIND(lastcp);
3289 if (n >= cc->max) { /* Maximum greed exceeded? */
3290 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3291 && !(PL_reg_flags & RF_warned)) {
3292 PL_reg_flags |= RF_warned;
3293 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3294 "Complex regular subexpression recursion",
3301 PerlIO_printf(Perl_debug_log,
3302 "%*s trying longer...\n",
3303 REPORT_CODE_OFF+PL_regindent*2, "")
3305 /* Try scanning more and see if it helps. */
3306 PL_reginput = locinput;
3308 cc->lastloc = locinput;
3309 cp = regcppush(cc->parenfloor);
3311 if (regmatch(cc->scan)) {
3315 REGCP_UNWIND(lastcp);
3318 cc->lastloc = lastloc;
3322 /* Prefer scan over next for maximal matching. */
3324 if (n < cc->max) { /* More greed allowed? */
3325 cp = regcppush(cc->parenfloor);
3327 cc->lastloc = locinput;
3329 if (regmatch(cc->scan)) {
3333 REGCP_UNWIND(lastcp);
3334 regcppop(); /* Restore some previous $<digit>s? */
3335 PL_reginput = locinput;
3337 PerlIO_printf(Perl_debug_log,
3338 "%*s failed, try continuation...\n",
3339 REPORT_CODE_OFF+PL_regindent*2, "")
3342 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3343 && !(PL_reg_flags & RF_warned)) {
3344 PL_reg_flags |= RF_warned;
3345 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3346 "Complex regular subexpression recursion",
3350 /* Failed deeper matches of scan, so see if this one works. */
3351 PL_regcc = cc->oldcc;
3354 if (regmatch(cc->next))
3360 cc->lastloc = lastloc;
3365 next = scan + ARG(scan);
3368 inner = NEXTOPER(NEXTOPER(scan));
3371 inner = NEXTOPER(scan);
3375 if (OP(next) != c1) /* No choice. */
3376 next = inner; /* Avoid recursion. */
3378 const I32 lastparen = *PL_reglastparen;
3380 re_unwind_branch_t *uw;
3382 /* Put unwinding data on stack */
3383 unwind1 = SSNEWt(1,re_unwind_branch_t);
3384 uw = SSPTRt(unwind1,re_unwind_branch_t);
3387 uw->type = ((c1 == BRANCH)
3389 : RE_UNWIND_BRANCHJ);
3390 uw->lastparen = lastparen;
3392 uw->locinput = locinput;
3393 uw->nextchr = nextchr;
3395 uw->regindent = ++PL_regindent;
3398 REGCP_SET(uw->lastcp);
3400 /* Now go into the first branch */
3413 /* We suppose that the next guy does not need
3414 backtracking: in particular, it is of constant non-zero length,
3415 and has no parenths to influence future backrefs. */
3416 ln = ARG1(scan); /* min to match */
3417 n = ARG2(scan); /* max to match */
3418 paren = scan->flags;
3420 if (paren > PL_regsize)
3422 if (paren > (I32)*PL_reglastparen)
3423 *PL_reglastparen = paren;
3425 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3427 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3428 PL_reginput = locinput;
3431 if (ln && regrepeat_hard(scan, ln, &l) < ln)
3433 locinput = PL_reginput;
3434 if (HAS_TEXT(next) || JUMPABLE(next)) {
3435 regnode *text_node = next;
3437 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3439 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3441 if (PL_regkind[(U8)OP(text_node)] == REF) {
3445 else { c1 = (U8)*STRING(text_node); }
3446 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3448 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3449 c2 = PL_fold_locale[c1];
3458 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3459 /* If it could work, try it. */
3461 UCHARAT(PL_reginput) == c1 ||
3462 UCHARAT(PL_reginput) == c2)
3466 PL_regstartp[paren] =
3467 HOPc(PL_reginput, -l) - PL_bostr;
3468 PL_regendp[paren] = PL_reginput - PL_bostr;
3471 PL_regendp[paren] = -1;
3475 REGCP_UNWIND(lastcp);
3477 /* Couldn't or didn't -- move forward. */
3478 PL_reginput = locinput;
3479 if (regrepeat_hard(scan, 1, &l)) {
3481 locinput = PL_reginput;
3488 n = regrepeat_hard(scan, n, &l);
3489 locinput = PL_reginput;
3491 PerlIO_printf(Perl_debug_log,
3492 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3493 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3497 if (HAS_TEXT(next) || JUMPABLE(next)) {
3498 regnode *text_node = next;
3500 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3502 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3504 if (PL_regkind[(U8)OP(text_node)] == REF) {
3508 else { c1 = (U8)*STRING(text_node); }
3510 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3512 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3513 c2 = PL_fold_locale[c1];
3524 /* If it could work, try it. */
3526 UCHARAT(PL_reginput) == c1 ||
3527 UCHARAT(PL_reginput) == c2)
3530 PerlIO_printf(Perl_debug_log,
3531 "%*s trying tail with n=%"IVdf"...\n",
3532 (int)(REPORT_CODE_OFF+PL_regindent*2), "", (IV)n)
3536 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3537 PL_regendp[paren] = PL_reginput - PL_bostr;
3540 PL_regendp[paren] = -1;
3544 REGCP_UNWIND(lastcp);
3546 /* Couldn't or didn't -- back up. */
3548 locinput = HOPc(locinput, -l);
3549 PL_reginput = locinput;
3556 paren = scan->flags; /* Which paren to set */
3557 if (paren > PL_regsize)
3559 if (paren > (I32)*PL_reglastparen)
3560 *PL_reglastparen = paren;
3561 ln = ARG1(scan); /* min to match */
3562 n = ARG2(scan); /* max to match */
3563 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3567 ln = ARG1(scan); /* min to match */
3568 n = ARG2(scan); /* max to match */
3569 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3574 scan = NEXTOPER(scan);
3580 scan = NEXTOPER(scan);
3584 * Lookahead to avoid useless match attempts
3585 * when we know what character comes next.
3589 * Used to only do .*x and .*?x, but now it allows
3590 * for )'s, ('s and (?{ ... })'s to be in the way
3591 * of the quantifier and the EXACT-like node. -- japhy
3594 if (HAS_TEXT(next) || JUMPABLE(next)) {
3596 regnode *text_node = next;
3598 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3600 if (! HAS_TEXT(text_node)) c1 = c2 = -1000;
3602 if (PL_regkind[(U8)OP(text_node)] == REF) {
3604 goto assume_ok_easy;
3606 else { s = (U8*)STRING(text_node); }
3610 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3612 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3613 c2 = PL_fold_locale[c1];
3616 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3617 STRLEN ulen1, ulen2;
3618 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3619 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3621 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3622 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3624 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3626 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3630 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3639 PL_reginput = locinput;
3643 if (ln && regrepeat(scan, ln) < ln)
3645 locinput = PL_reginput;
3648 char *e; /* Should not check after this */
3649 char *old = locinput;
3652 if (n == REG_INFTY) {
3655 while (UTF8_IS_CONTINUATION(*(U8*)e))
3661 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3665 e = locinput + n - ln;
3670 /* Find place 'next' could work */
3673 while (locinput <= e &&
3674 UCHARAT(locinput) != c1)
3677 while (locinput <= e
3678 && UCHARAT(locinput) != c1
3679 && UCHARAT(locinput) != c2)
3682 count = locinput - old;
3687 /* count initialised to
3688 * utf8_distance(old, locinput) */
3689 while (locinput <= e &&
3690 utf8n_to_uvchr((U8*)locinput,
3691 UTF8_MAXBYTES, &len,
3692 uniflags) != (UV)c1) {
3698 /* count initialised to
3699 * utf8_distance(old, locinput) */
3700 while (locinput <= e) {
3701 UV c = utf8n_to_uvchr((U8*)locinput,
3702 UTF8_MAXBYTES, &len,
3704 if (c == (UV)c1 || c == (UV)c2)
3713 /* PL_reginput == old now */
3714 if (locinput != old) {
3715 ln = 1; /* Did some */
3716 if (regrepeat(scan, count) < count)
3719 /* PL_reginput == locinput now */
3720 TRYPAREN(paren, ln, locinput);
3721 PL_reginput = locinput; /* Could be reset... */
3722 REGCP_UNWIND(lastcp);
3723 /* Couldn't or didn't -- move forward. */
3726 locinput += UTF8SKIP(locinput);
3733 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3737 c = utf8n_to_uvchr((U8*)PL_reginput,
3741 c = UCHARAT(PL_reginput);
3742 /* If it could work, try it. */
3743 if (c == (UV)c1 || c == (UV)c2)
3745 TRYPAREN(paren, ln, PL_reginput);
3746 REGCP_UNWIND(lastcp);
3749 /* If it could work, try it. */
3750 else if (c1 == -1000)
3752 TRYPAREN(paren, ln, PL_reginput);
3753 REGCP_UNWIND(lastcp);
3755 /* Couldn't or didn't -- move forward. */
3756 PL_reginput = locinput;
3757 if (regrepeat(scan, 1)) {
3759 locinput = PL_reginput;
3767 n = regrepeat(scan, n);
3768 locinput = PL_reginput;
3769 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3770 ((!PL_multiline && OP(next) != MEOL) ||
3771 OP(next) == SEOL || OP(next) == EOS))
3773 ln = n; /* why back off? */
3774 /* ...because $ and \Z can match before *and* after
3775 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3776 We should back off by one in this case. */
3777 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3786 c = utf8n_to_uvchr((U8*)PL_reginput,
3790 c = UCHARAT(PL_reginput);
3792 /* If it could work, try it. */
3793 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3795 TRYPAREN(paren, n, PL_reginput);
3796 REGCP_UNWIND(lastcp);
3798 /* Couldn't or didn't -- back up. */
3800 PL_reginput = locinput = HOPc(locinput, -1);
3808 c = utf8n_to_uvchr((U8*)PL_reginput,
3812 c = UCHARAT(PL_reginput);
3814 /* If it could work, try it. */
3815 if (c1 == -1000 || c == (UV)c1 || c == (UV)c2)
3817 TRYPAREN(paren, n, PL_reginput);
3818 REGCP_UNWIND(lastcp);
3820 /* Couldn't or didn't -- back up. */
3822 PL_reginput = locinput = HOPc(locinput, -1);
3829 if (PL_reg_call_cc) {
3830 re_cc_state *cur_call_cc = PL_reg_call_cc;
3831 CURCUR *cctmp = PL_regcc;
3832 regexp *re = PL_reg_re;
3833 CHECKPOINT cp, lastcp;
3835 cp = regcppush(0); /* Save *all* the positions. */
3837 regcp_set_to(PL_reg_call_cc->ss); /* Restore parens of
3839 PL_reginput = locinput; /* Make position available to
3841 cache_re(PL_reg_call_cc->re);
3842 PL_regcc = PL_reg_call_cc->cc;
3843 PL_reg_call_cc = PL_reg_call_cc->prev;
3844 if (regmatch(cur_call_cc->node)) {
3845 PL_reg_call_cc = cur_call_cc;
3849 REGCP_UNWIND(lastcp);
3851 PL_reg_call_cc = cur_call_cc;
3857 PerlIO_printf(Perl_debug_log,
3858 "%*s continuation failed...\n",
3859 REPORT_CODE_OFF+PL_regindent*2, "")
3863 if (locinput < PL_regtill) {
3864 DEBUG_r(PerlIO_printf(Perl_debug_log,
3865 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3867 (long)(locinput - PL_reg_starttry),
3868 (long)(PL_regtill - PL_reg_starttry),
3870 sayNO_FINAL; /* Cannot match: too short. */
3872 PL_reginput = locinput; /* put where regtry can find it */
3873 sayYES_FINAL; /* Success! */
3875 PL_reginput = locinput; /* put where regtry can find it */
3876 sayYES_LOUD; /* Success! */
3879 PL_reginput = locinput;
3884 s = HOPBACKc(locinput, scan->flags);
3890 PL_reginput = locinput;
3895 s = HOPBACKc(locinput, scan->flags);
3901 PL_reginput = locinput;
3904 inner = NEXTOPER(NEXTOPER(scan));
3905 if (regmatch(inner) != n) {
3920 if (OP(scan) == SUSPEND) {
3921 locinput = PL_reginput;
3922 nextchr = UCHARAT(locinput);
3927 next = scan + ARG(scan);
3932 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3933 PTR2UV(scan), OP(scan));
3934 Perl_croak(aTHX_ "regexp memory corruption");
3941 * We get here only if there's trouble -- normally "case END" is
3942 * the terminating point.
3944 Perl_croak(aTHX_ "corrupted regexp pointers");
3950 PerlIO_printf(Perl_debug_log,
3951 "%*s %scould match...%s\n",
3952 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3956 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3957 PL_colors[4],PL_colors[5]));
3963 #if 0 /* Breaks $^R */
3971 PerlIO_printf(Perl_debug_log,
3972 "%*s %sfailed...%s\n",
3973 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3979 re_unwind_t *uw = SSPTRt(unwind,re_unwind_t);
3982 case RE_UNWIND_BRANCH:
3983 case RE_UNWIND_BRANCHJ:
3985 re_unwind_branch_t *uwb = &(uw->branch);
3986 const I32 lastparen = uwb->lastparen;
3988 REGCP_UNWIND(uwb->lastcp);
3989 for (n = *PL_reglastparen; n > lastparen; n--)
3991 *PL_reglastparen = n;
3992 scan = next = uwb->next;
3994 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3995 ? BRANCH : BRANCHJ) ) { /* Failure */
4002 /* Have more choice yet. Reuse the same uwb. */
4003 if ((n = (uwb->type == RE_UNWIND_BRANCH
4004 ? NEXT_OFF(next) : ARG(next))))
4007 next = NULL; /* XXXX Needn't unwinding in this case... */
4009 next = NEXTOPER(scan);
4010 if (uwb->type == RE_UNWIND_BRANCHJ)
4011 next = NEXTOPER(next);
4012 locinput = uwb->locinput;
4013 nextchr = uwb->nextchr;
4015 PL_regindent = uwb->regindent;
4022 Perl_croak(aTHX_ "regexp unwind memory corruption");
4033 - regrepeat - repeatedly match something simple, report how many
4036 * [This routine now assumes that it will only match on things of length 1.
4037 * That was true before, but now we assume scan - reginput is the count,
4038 * rather than incrementing count on every character. [Er, except utf8.]]
4041 S_regrepeat(pTHX_ const regnode *p, I32 max)
4043 register char *scan;
4045 register char *loceol = PL_regeol;
4046 register I32 hardcount = 0;
4047 register bool do_utf8 = PL_reg_match_utf8;
4050 if (max == REG_INFTY)
4052 else if (max < loceol - scan)
4053 loceol = scan + max;
4058 while (scan < loceol && hardcount < max && *scan != '\n') {
4059 scan += UTF8SKIP(scan);
4063 while (scan < loceol && *scan != '\n')
4070 while (scan < loceol && hardcount < max) {
4071 scan += UTF8SKIP(scan);
4081 case EXACT: /* length of string is 1 */
4083 while (scan < loceol && UCHARAT(scan) == c)
4086 case EXACTF: /* length of string is 1 */
4088 while (scan < loceol &&
4089 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
4092 case EXACTFL: /* length of string is 1 */
4093 PL_reg_flags |= RF_tainted;
4095 while (scan < loceol &&
4096 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
4102 while (hardcount < max && scan < loceol &&
4103 reginclass(p, (U8*)scan, 0, do_utf8)) {
4104 scan += UTF8SKIP(scan);
4108 while (scan < loceol && REGINCLASS(p, (U8*)scan))
4115 LOAD_UTF8_CHARCLASS_ALNUM();
4116 while (hardcount < max && scan < loceol &&
4117 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4118 scan += UTF8SKIP(scan);
4122 while (scan < loceol && isALNUM(*scan))
4127 PL_reg_flags |= RF_tainted;
4130 while (hardcount < max && scan < loceol &&
4131 isALNUM_LC_utf8((U8*)scan)) {
4132 scan += UTF8SKIP(scan);
4136 while (scan < loceol && isALNUM_LC(*scan))
4143 LOAD_UTF8_CHARCLASS_ALNUM();
4144 while (hardcount < max && scan < loceol &&
4145 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
4146 scan += UTF8SKIP(scan);
4150 while (scan < loceol && !isALNUM(*scan))
4155 PL_reg_flags |= RF_tainted;
4158 while (hardcount < max && scan < loceol &&
4159 !isALNUM_LC_utf8((U8*)scan)) {
4160 scan += UTF8SKIP(scan);
4164 while (scan < loceol && !isALNUM_LC(*scan))
4171 LOAD_UTF8_CHARCLASS_SPACE();
4172 while (hardcount < max && scan < loceol &&
4174 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4175 scan += UTF8SKIP(scan);
4179 while (scan < loceol && isSPACE(*scan))
4184 PL_reg_flags |= RF_tainted;
4187 while (hardcount < max && scan < loceol &&
4188 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4189 scan += UTF8SKIP(scan);
4193 while (scan < loceol && isSPACE_LC(*scan))
4200 LOAD_UTF8_CHARCLASS_SPACE();
4201 while (hardcount < max && scan < loceol &&
4203 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
4204 scan += UTF8SKIP(scan);
4208 while (scan < loceol && !isSPACE(*scan))
4213 PL_reg_flags |= RF_tainted;
4216 while (hardcount < max && scan < loceol &&
4217 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4218 scan += UTF8SKIP(scan);
4222 while (scan < loceol && !isSPACE_LC(*scan))
4229 LOAD_UTF8_CHARCLASS_DIGIT();
4230 while (hardcount < max && scan < loceol &&
4231 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4232 scan += UTF8SKIP(scan);
4236 while (scan < loceol && isDIGIT(*scan))
4243 LOAD_UTF8_CHARCLASS_DIGIT();
4244 while (hardcount < max && scan < loceol &&
4245 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4246 scan += UTF8SKIP(scan);
4250 while (scan < loceol && !isDIGIT(*scan))
4254 default: /* Called on something of 0 width. */
4255 break; /* So match right here or not at all. */
4261 c = scan - PL_reginput;
4266 SV *prop = sv_newmortal();
4268 regprop(prop, (regnode *)p);
4269 PerlIO_printf(Perl_debug_log,
4270 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4271 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4278 - regrepeat_hard - repeatedly match something, report total lenth and length
4280 * The repeater is supposed to have constant non-zero length.
4284 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp)
4286 register char *scan = Nullch;
4287 register char *start;
4288 register char *loceol = PL_regeol;
4290 I32 count = 0, res = 1;
4295 start = PL_reginput;
4296 if (PL_reg_match_utf8) {
4297 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4300 while (start < PL_reginput) {
4302 start += UTF8SKIP(start);
4313 while (PL_reginput < loceol && (scan = PL_reginput, res = regmatch(p))) {
4315 *lp = l = PL_reginput - start;
4316 if (max != REG_INFTY && l*max < loceol - scan)
4317 loceol = scan + l*max;
4330 - regclass_swash - prepare the utf8 swash
4334 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4340 if (PL_regdata && PL_regdata->count) {
4341 const U32 n = ARG(node);
4343 if (PL_regdata->what[n] == 's') {
4344 SV * const rv = (SV*)PL_regdata->data[n];
4345 AV * const av = (AV*)SvRV((SV*)rv);
4346 SV **const ary = AvARRAY(av);
4349 /* See the end of regcomp.c:S_reglass() for
4350 * documentation of these array elements. */
4353 a = SvTYPE(ary[1]) == SVt_RV ? &ary[1] : 0;
4354 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4358 else if (si && doinit) {
4359 sw = swash_init("utf8", "", si, 1, 0);
4360 (void)av_store(av, 1, sw);
4376 - reginclass - determine if a character falls into a character class
4378 The n is the ANYOF regnode, the p is the target string, lenp
4379 is pointer to the maximum length of how far to go in the p
4380 (if the lenp is zero, UTF8SKIP(p) is used),
4381 do_utf8 tells whether the target string is in UTF-8.
4386 S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4388 const char flags = ANYOF_FLAGS(n);
4394 if (do_utf8 && !UTF8_IS_INVARIANT(c))
4395 c = utf8n_to_uvchr((U8 *)p, UTF8_MAXBYTES, &len,
4396 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
4398 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4399 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4402 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4403 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4406 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4410 SV * const sw = regclass_swash((regnode *)n, TRUE, 0, (SV**)&av);
4413 if (swash_fetch(sw, (U8 *)p, do_utf8))
4415 else if (flags & ANYOF_FOLD) {
4416 if (!match && lenp && av) {
4418 for (i = 0; i <= av_len(av); i++) {
4419 SV* const sv = *av_fetch(av, i, FALSE);
4421 const char * const s = SvPV_const(sv, len);
4423 if (len <= plen && memEQ(s, (char*)p, len)) {
4431 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4434 to_utf8_fold((U8 *)p, tmpbuf, &tmplen);
4435 if (swash_fetch(sw, tmpbuf, do_utf8))
4441 if (match && lenp && *lenp == 0)
4442 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4444 if (!match && c < 256) {
4445 if (ANYOF_BITMAP_TEST(n, c))
4447 else if (flags & ANYOF_FOLD) {
4450 if (flags & ANYOF_LOCALE) {
4451 PL_reg_flags |= RF_tainted;
4452 f = PL_fold_locale[c];
4456 if (f != c && ANYOF_BITMAP_TEST(n, f))
4460 if (!match && (flags & ANYOF_CLASS)) {
4461 PL_reg_flags |= RF_tainted;
4463 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4464 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4465 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4466 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4467 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4468 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4469 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4470 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4471 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4472 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4473 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4474 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4475 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4476 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4477 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4478 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4479 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4480 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4481 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4482 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4483 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4484 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4485 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4486 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4487 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4488 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4489 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4490 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4491 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4492 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4493 ) /* How's that for a conditional? */
4500 return (flags & ANYOF_INVERT) ? !match : match;
4504 S_reghop(pTHX_ U8 *s, I32 off)
4506 return S_reghop3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4510 S_reghop3(pTHX_ U8 *s, I32 off, U8* lim)
4513 while (off-- && s < lim) {
4514 /* XXX could check well-formedness here */
4522 if (UTF8_IS_CONTINUED(*s)) {
4523 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4526 /* XXX could check well-formedness here */
4534 S_reghopmaybe(pTHX_ U8 *s, I32 off)
4536 return S_reghopmaybe3(aTHX_ s, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr));
4540 S_reghopmaybe3(pTHX_ U8* s, I32 off, U8* lim)
4543 while (off-- && s < lim) {
4544 /* XXX could check well-formedness here */
4554 if (UTF8_IS_CONTINUED(*s)) {
4555 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4558 /* XXX could check well-formedness here */
4570 restore_pos(pTHX_ void *arg)
4572 PERL_UNUSED_ARG(arg);
4573 if (PL_reg_eval_set) {
4574 if (PL_reg_oldsaved) {
4575 PL_reg_re->subbeg = PL_reg_oldsaved;
4576 PL_reg_re->sublen = PL_reg_oldsavedlen;
4577 RX_MATCH_COPIED_on(PL_reg_re);
4579 PL_reg_magic->mg_len = PL_reg_oldpos;
4580 PL_reg_eval_set = 0;
4581 PL_curpm = PL_reg_oldcurpm;
4586 S_to_utf8_substr(pTHX_ register regexp *prog)
4588 if (prog->float_substr && !prog->float_utf8) {
4590 prog->float_utf8 = sv = newSVsv(prog->float_substr);
4591 sv_utf8_upgrade(sv);
4592 if (SvTAIL(prog->float_substr))
4594 if (prog->float_substr == prog->check_substr)
4595 prog->check_utf8 = sv;
4597 if (prog->anchored_substr && !prog->anchored_utf8) {
4599 prog->anchored_utf8 = sv = newSVsv(prog->anchored_substr);
4600 sv_utf8_upgrade(sv);
4601 if (SvTAIL(prog->anchored_substr))
4603 if (prog->anchored_substr == prog->check_substr)
4604 prog->check_utf8 = sv;
4609 S_to_byte_substr(pTHX_ register regexp *prog)
4611 if (prog->float_utf8 && !prog->float_substr) {
4613 prog->float_substr = sv = newSVsv(prog->float_utf8);
4614 if (sv_utf8_downgrade(sv, TRUE)) {
4615 if (SvTAIL(prog->float_utf8))
4619 prog->float_substr = sv = &PL_sv_undef;
4621 if (prog->float_utf8 == prog->check_utf8)
4622 prog->check_substr = sv;
4624 if (prog->anchored_utf8 && !prog->anchored_substr) {
4626 prog->anchored_substr = sv = newSVsv(prog->anchored_utf8);
4627 if (sv_utf8_downgrade(sv, TRUE)) {
4628 if (SvTAIL(prog->anchored_utf8))
4632 prog->anchored_substr = sv = &PL_sv_undef;
4634 if (prog->anchored_utf8 == prog->check_utf8)
4635 prog->check_substr = sv;
4641 * c-indentation-style: bsd
4643 * indent-tabs-mode: t
4646 * ex: set ts=8 sts=4 sw=4 noet: