5 * "One Ring to rule them all, One Ring to find them..."
8 /* This file contains functions for executing a regular expression. See
9 * also regcomp.c which funnily enough, contains functions for compiling
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_exec.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_regexec_flags my_regexec
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_re_intuit_start my_re_intuit_start
49 /* *These* symbols are masked to allow static link. */
50 # define Perl_pregexec my_pregexec
51 # define Perl_reginitcolors my_reginitcolors
52 # define Perl_regclass_swash my_regclass_swash
54 # define PERL_NO_GET_CONTEXT
58 * pregcomp and pregexec -- regsub and regerror are not used in perl
60 * Copyright (c) 1986 by University of Toronto.
61 * Written by Henry Spencer. Not derived from licensed software.
63 * Permission is granted to anyone to use this software for any
64 * purpose on any computer system, and to redistribute it freely,
65 * subject to the following restrictions:
67 * 1. The author is not responsible for the consequences of use of
68 * this software, no matter how awful, even if they arise
71 * 2. The origin of this software must not be misrepresented, either
72 * by explicit claim or by omission.
74 * 3. Altered versions must be plainly marked as such, and must not
75 * be misrepresented as being the original software.
77 **** Alterations to Henry's code are...
79 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
80 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
82 **** You may distribute under the terms of either the GNU General Public
83 **** License or the Artistic License, as specified in the README file.
85 * Beware that some of this code is subtly aware of the way operator
86 * precedence is structured in regular expressions. Serious changes in
87 * regular-expression syntax might require a total rethink.
90 #define PERL_IN_REGEXEC_C
95 #define RF_tainted 1 /* tainted information used? */
96 #define RF_warned 2 /* warned about big count? */
97 #define RF_evaled 4 /* Did an EVAL with setting? */
98 #define RF_utf8 8 /* String contains multibyte chars? */
100 #define UTF ((PL_reg_flags & RF_utf8) != 0)
102 #define RS_init 1 /* eval environment created */
103 #define RS_set 2 /* replsv value is set */
106 #define STATIC static
109 #define REGINCLASS(p,c) (ANYOF_FLAGS(p) ? reginclass(p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
115 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
116 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
118 #define HOPc(pos,off) ((char *)(PL_reg_match_utf8 \
119 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
121 #define HOPBACKc(pos, off) ((char*) \
122 ((PL_reg_match_utf8) \
123 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
124 : (pos - off >= PL_bostr) \
129 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)pos, off, (U8*)lim) : (U8*)(pos + off))
130 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
132 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
133 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((U8*)str); assert(ok); LEAVE; } } STMT_END
134 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
135 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
136 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
137 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
139 /* for use after a quantifier and before an EXACT-like node -- japhy */
140 #define JUMPABLE(rn) ( \
141 OP(rn) == OPEN || OP(rn) == CLOSE || OP(rn) == EVAL || \
142 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
143 OP(rn) == PLUS || OP(rn) == MINMOD || \
144 (PL_regkind[(U8)OP(rn)] == CURLY && ARG1(rn) > 0) \
147 #define HAS_TEXT(rn) ( \
148 PL_regkind[(U8)OP(rn)] == EXACT || PL_regkind[(U8)OP(rn)] == REF \
152 Search for mandatory following text node; for lookahead, the text must
153 follow but for lookbehind (rn->flags != 0) we skip to the next step.
155 #define FIND_NEXT_IMPT(rn) STMT_START { \
156 while (JUMPABLE(rn)) \
157 if (OP(rn) == SUSPEND || PL_regkind[(U8)OP(rn)] == CURLY) \
158 rn = NEXTOPER(NEXTOPER(rn)); \
159 else if (OP(rn) == PLUS) \
161 else if (OP(rn) == IFMATCH) \
162 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
163 else rn += NEXT_OFF(rn); \
166 static void restore_pos(pTHX_ void *arg);
169 S_regcppush(pTHX_ I32 parenfloor)
171 const int retval = PL_savestack_ix;
172 #define REGCP_PAREN_ELEMS 4
173 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
176 if (paren_elems_to_push < 0)
177 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
179 #define REGCP_OTHER_ELEMS 6
180 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
181 for (p = PL_regsize; p > parenfloor; p--) {
182 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
183 SSPUSHINT(PL_regendp[p]);
184 SSPUSHINT(PL_regstartp[p]);
185 SSPUSHPTR(PL_reg_start_tmp[p]);
188 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
189 SSPUSHINT(PL_regsize);
190 SSPUSHINT(*PL_reglastparen);
191 SSPUSHINT(*PL_reglastcloseparen);
192 SSPUSHPTR(PL_reginput);
193 #define REGCP_FRAME_ELEMS 2
194 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
195 * are needed for the regexp context stack bookkeeping. */
196 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
197 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
202 /* These are needed since we do not localize EVAL nodes: */
203 # define REGCP_SET(cp) DEBUG_r(PerlIO_printf(Perl_debug_log, \
204 " Setting an EVAL scope, savestack=%"IVdf"\n", \
205 (IV)PL_savestack_ix)); cp = PL_savestack_ix
207 # define REGCP_UNWIND(cp) DEBUG_r(cp != PL_savestack_ix ? \
208 PerlIO_printf(Perl_debug_log, \
209 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
210 (IV)(cp), (IV)PL_savestack_ix) : 0); regcpblow(cp)
218 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
220 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
221 i = SSPOPINT; /* Parentheses elements to pop. */
222 input = (char *) SSPOPPTR;
223 *PL_reglastcloseparen = SSPOPINT;
224 *PL_reglastparen = SSPOPINT;
225 PL_regsize = SSPOPINT;
227 /* Now restore the parentheses context. */
228 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
229 i > 0; i -= REGCP_PAREN_ELEMS) {
231 U32 paren = (U32)SSPOPINT;
232 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
233 PL_regstartp[paren] = SSPOPINT;
235 if (paren <= *PL_reglastparen)
236 PL_regendp[paren] = tmps;
238 PerlIO_printf(Perl_debug_log,
239 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
240 (UV)paren, (IV)PL_regstartp[paren],
241 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
242 (IV)PL_regendp[paren],
243 (paren > *PL_reglastparen ? "(no)" : ""));
247 if (*PL_reglastparen + 1 <= PL_regnpar) {
248 PerlIO_printf(Perl_debug_log,
249 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
250 (IV)(*PL_reglastparen + 1), (IV)PL_regnpar);
254 /* It would seem that the similar code in regtry()
255 * already takes care of this, and in fact it is in
256 * a better location to since this code can #if 0-ed out
257 * but the code in regtry() is needed or otherwise tests
258 * requiring null fields (pat.t#187 and split.t#{13,14}
259 * (as of patchlevel 7877) will fail. Then again,
260 * this code seems to be necessary or otherwise
261 * building DynaLoader will fail:
262 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
264 for (i = *PL_reglastparen + 1; (U32)i <= PL_regnpar; i++) {
266 PL_regstartp[i] = -1;
273 typedef struct re_cc_state
277 struct re_cc_state *prev;
282 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
284 #define TRYPAREN(paren, n, input) { \
287 PL_regstartp[paren] = HOPc(input, -1) - PL_bostr; \
288 PL_regendp[paren] = input - PL_bostr; \
291 PL_regendp[paren] = -1; \
293 if (regmatch(next)) \
296 PL_regendp[paren] = -1; \
301 * pregexec and friends
305 - pregexec - match a regexp against a string
308 Perl_pregexec(pTHX_ register regexp *prog, char *stringarg, register char *strend,
309 char *strbeg, I32 minend, SV *screamer, U32 nosave)
310 /* strend: pointer to null at end of string */
311 /* strbeg: real beginning of string */
312 /* minend: end of match must be >=minend after stringarg. */
313 /* nosave: For optimizations. */
316 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
317 nosave ? 0 : REXEC_COPY_STR);
321 S_cache_re(pTHX_ regexp *prog)
323 PL_regprecomp = prog->precomp; /* Needed for FAIL. */
325 PL_regprogram = prog->program;
327 PL_regnpar = prog->nparens;
328 PL_regdata = prog->data;
333 * Need to implement the following flags for reg_anch:
335 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
337 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
338 * INTUIT_AUTORITATIVE_ML
339 * INTUIT_ONCE_NOML - Intuit can match in one location only.
342 * Another flag for this function: SECOND_TIME (so that float substrs
343 * with giant delta may be not rechecked).
346 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
348 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
349 Otherwise, only SvCUR(sv) is used to get strbeg. */
351 /* XXXX We assume that strpos is strbeg unless sv. */
353 /* XXXX Some places assume that there is a fixed substring.
354 An update may be needed if optimizer marks as "INTUITable"
355 RExen without fixed substrings. Similarly, it is assumed that
356 lengths of all the strings are no more than minlen, thus they
357 cannot come from lookahead.
358 (Or minlen should take into account lookahead.) */
360 /* A failure to find a constant substring means that there is no need to make
361 an expensive call to REx engine, thus we celebrate a failure. Similarly,
362 finding a substring too deep into the string means that less calls to
363 regtry() should be needed.
365 REx compiler's optimizer found 4 possible hints:
366 a) Anchored substring;
368 c) Whether we are anchored (beginning-of-line or \G);
369 d) First node (of those at offset 0) which may distingush positions;
370 We use a)b)d) and multiline-part of c), and try to find a position in the
371 string which does not contradict any of them.
374 /* Most of decisions we do here should have been done at compile time.
375 The nodes of the REx which we used for the search should have been
376 deleted from the finite automaton. */
379 Perl_re_intuit_start(pTHX_ regexp *prog, SV *sv, char *strpos,
380 char *strend, U32 flags, re_scream_pos_data *data)
382 register I32 start_shift = 0;
383 /* Should be nonnegative! */
384 register I32 end_shift = 0;
389 const int do_utf8 = sv ? SvUTF8(sv) : 0; /* if no sv we have to assume bytes */
391 register char *other_last = NULL; /* other substr checked before this */
392 char *check_at = NULL; /* check substr found at this pos */
393 const I32 multiline = PL_multiline | (prog->reganch & PMf_MULTILINE);
395 const char * const i_strpos = strpos;
396 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
398 RX_MATCH_UTF8_set(prog,do_utf8);
400 if (prog->reganch & ROPT_UTF8) {
401 DEBUG_r(PerlIO_printf(Perl_debug_log,
402 "UTF-8 regex...\n"));
403 PL_reg_flags |= RF_utf8;
407 const char *s = PL_reg_match_utf8 ?
408 sv_uni_display(dsv, sv, 60, UNI_DISPLAY_REGEX) :
410 const int len = PL_reg_match_utf8 ?
411 (int)strlen(s) : strend - strpos;
414 if (PL_reg_match_utf8)
415 DEBUG_r(PerlIO_printf(Perl_debug_log,
416 "UTF-8 target...\n"));
417 PerlIO_printf(Perl_debug_log,
418 "%sGuessing start of match, REx%s \"%s%.60s%s%s\" against \"%s%.*s%s%s\"...\n",
419 PL_colors[4],PL_colors[5],PL_colors[0],
422 (strlen(prog->precomp) > 60 ? "..." : ""),
424 (int)(len > 60 ? 60 : len),
426 (len > 60 ? "..." : "")
430 /* CHR_DIST() would be more correct here but it makes things slow. */
431 if (prog->minlen > strend - strpos) {
432 DEBUG_r(PerlIO_printf(Perl_debug_log,
433 "String too short... [re_intuit_start]\n"));
436 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
439 if (!prog->check_utf8 && prog->check_substr)
440 to_utf8_substr(prog);
441 check = prog->check_utf8;
443 if (!prog->check_substr && prog->check_utf8)
444 to_byte_substr(prog);
445 check = prog->check_substr;
447 if (check == &PL_sv_undef) {
448 DEBUG_r(PerlIO_printf(Perl_debug_log,
449 "Non-utf string cannot match utf check string\n"));
452 if (prog->reganch & ROPT_ANCH) { /* Match at beg-of-str or after \n */
453 ml_anch = !( (prog->reganch & ROPT_ANCH_SINGLE)
454 || ( (prog->reganch & ROPT_ANCH_BOL)
455 && !multiline ) ); /* Check after \n? */
458 if ( !(prog->reganch & (ROPT_ANCH_GPOS /* Checked by the caller */
459 | ROPT_IMPLICIT)) /* not a real BOL */
460 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
462 && (strpos != strbeg)) {
463 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
466 if (prog->check_offset_min == prog->check_offset_max &&
467 !(prog->reganch & ROPT_CANY_SEEN)) {
468 /* Substring at constant offset from beg-of-str... */
471 s = HOP3c(strpos, prog->check_offset_min, strend);
473 slen = SvCUR(check); /* >= 1 */
475 if ( strend - s > slen || strend - s < slen - 1
476 || (strend - s == slen && strend[-1] != '\n')) {
477 DEBUG_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
480 /* Now should match s[0..slen-2] */
482 if (slen && (*SvPVX_const(check) != *s
484 && memNE(SvPVX_const(check), s, slen)))) {
486 DEBUG_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
490 else if (*SvPVX_const(check) != *s
491 || ((slen = SvCUR(check)) > 1
492 && memNE(SvPVX_const(check), s, slen)))
495 goto success_at_start;
498 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
500 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
501 end_shift = prog->minlen - start_shift -
502 CHR_SVLEN(check) + (SvTAIL(check) != 0);
504 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
505 - (SvTAIL(check) != 0);
506 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
508 if (end_shift < eshift)
512 else { /* Can match at random position */
515 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
516 /* Should be nonnegative! */
517 end_shift = prog->minlen - start_shift -
518 CHR_SVLEN(check) + (SvTAIL(check) != 0);
521 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
523 Perl_croak(aTHX_ "panic: end_shift");
527 /* Find a possible match in the region s..strend by looking for
528 the "check" substring in the region corrected by start/end_shift. */
529 if (flags & REXEC_SCREAM) {
530 I32 p = -1; /* Internal iterator of scream. */
531 I32 * const pp = data ? data->scream_pos : &p;
533 if (PL_screamfirst[BmRARE(check)] >= 0
534 || ( BmRARE(check) == '\n'
535 && (BmPREVIOUS(check) == SvCUR(check) - 1)
537 s = screaminstr(sv, check,
538 start_shift + (s - strbeg), end_shift, pp, 0);
541 /* we may be pointing at the wrong string */
542 if (s && RX_MATCH_COPIED(prog))
543 s = strbeg + (s - SvPVX_const(sv));
545 *data->scream_olds = s;
547 else if (prog->reganch & ROPT_CANY_SEEN)
548 s = fbm_instr((U8*)(s + start_shift),
549 (U8*)(strend - end_shift),
550 check, multiline ? FBMrf_MULTILINE : 0);
552 s = fbm_instr(HOP3(s, start_shift, strend),
553 HOP3(strend, -end_shift, strbeg),
554 check, multiline ? FBMrf_MULTILINE : 0);
556 /* Update the count-of-usability, remove useless subpatterns,
559 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint. */
560 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s %s substr \"%s%.*s%s\"%s%s",
561 (s ? "Found" : "Did not find"),
562 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"),
564 (int)(SvCUR(check) - (SvTAIL(check)!=0)),
566 PL_colors[1], (SvTAIL(check) ? "$" : ""),
567 (s ? " at offset " : "...\n") ) );
574 /* Finish the diagnostic message */
575 DEBUG_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
577 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
578 Start with the other substr.
579 XXXX no SCREAM optimization yet - and a very coarse implementation
580 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
581 *always* match. Probably should be marked during compile...
582 Probably it is right to do no SCREAM here...
585 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) {
586 /* Take into account the "other" substring. */
587 /* XXXX May be hopelessly wrong for UTF... */
590 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
593 char * const last = HOP3c(s, -start_shift, strbeg);
598 t = s - prog->check_offset_max;
599 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
601 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
606 t = HOP3c(t, prog->anchored_offset, strend);
607 if (t < other_last) /* These positions already checked */
609 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
612 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
613 /* On end-of-str: see comment below. */
614 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
615 if (must == &PL_sv_undef) {
617 DEBUG_r(must = prog->anchored_utf8); /* for debug */
622 HOP3(HOP3(last1, prog->anchored_offset, strend)
623 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
625 multiline ? FBMrf_MULTILINE : 0
627 DEBUG_r(PerlIO_printf(Perl_debug_log,
628 "%s anchored substr \"%s%.*s%s\"%s",
629 (s ? "Found" : "Contradicts"),
632 - (SvTAIL(must)!=0)),
634 PL_colors[1], (SvTAIL(must) ? "$" : "")));
636 if (last1 >= last2) {
637 DEBUG_r(PerlIO_printf(Perl_debug_log,
638 ", giving up...\n"));
641 DEBUG_r(PerlIO_printf(Perl_debug_log,
642 ", trying floating at offset %ld...\n",
643 (long)(HOP3c(s1, 1, strend) - i_strpos)));
644 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
645 s = HOP3c(last, 1, strend);
649 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
650 (long)(s - i_strpos)));
651 t = HOP3c(s, -prog->anchored_offset, strbeg);
652 other_last = HOP3c(s, 1, strend);
660 else { /* Take into account the floating substring. */
665 t = HOP3c(s, -start_shift, strbeg);
667 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
668 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
669 last = HOP3c(t, prog->float_max_offset, strend);
670 s = HOP3c(t, prog->float_min_offset, strend);
673 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
674 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
675 /* fbm_instr() takes into account exact value of end-of-str
676 if the check is SvTAIL(ed). Since false positives are OK,
677 and end-of-str is not later than strend we are OK. */
678 if (must == &PL_sv_undef) {
680 DEBUG_r(must = prog->float_utf8); /* for debug message */
683 s = fbm_instr((unsigned char*)s,
684 (unsigned char*)last + SvCUR(must)
686 must, multiline ? FBMrf_MULTILINE : 0);
687 /* FIXME - DEBUG_EXECUTE_r if that is merged to maint */
688 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s floating substr \"%s%.*s%s\"%s",
689 (s ? "Found" : "Contradicts"),
691 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
693 PL_colors[1], (SvTAIL(must) ? "$" : "")));
696 DEBUG_r(PerlIO_printf(Perl_debug_log,
697 ", giving up...\n"));
700 DEBUG_r(PerlIO_printf(Perl_debug_log,
701 ", trying anchored starting at offset %ld...\n",
702 (long)(s1 + 1 - i_strpos)));
704 s = HOP3c(t, 1, strend);
708 DEBUG_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
709 (long)(s - i_strpos)));
710 other_last = s; /* Fix this later. --Hugo */
719 t = s - prog->check_offset_max;
720 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
722 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*)strpos))
724 /* Fixed substring is found far enough so that the match
725 cannot start at strpos. */
727 if (ml_anch && t[-1] != '\n') {
728 /* Eventually fbm_*() should handle this, but often
729 anchored_offset is not 0, so this check will not be wasted. */
730 /* XXXX In the code below we prefer to look for "^" even in
731 presence of anchored substrings. And we search even
732 beyond the found float position. These pessimizations
733 are historical artefacts only. */
735 while (t < strend - prog->minlen) {
737 if (t < check_at - prog->check_offset_min) {
738 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
739 /* Since we moved from the found position,
740 we definitely contradict the found anchored
741 substr. Due to the above check we do not
742 contradict "check" substr.
743 Thus we can arrive here only if check substr
744 is float. Redo checking for "other"=="fixed".
747 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
748 PL_colors[0],PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
749 goto do_other_anchored;
751 /* We don't contradict the found floating substring. */
752 /* XXXX Why not check for STCLASS? */
754 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
755 PL_colors[0],PL_colors[1], (long)(s - i_strpos)));
758 /* Position contradicts check-string */
759 /* XXXX probably better to look for check-string
760 than for "\n", so one should lower the limit for t? */
761 DEBUG_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
762 PL_colors[0],PL_colors[1], (long)(t + 1 - i_strpos)));
763 other_last = strpos = s = t + 1;
768 DEBUG_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
769 PL_colors[0],PL_colors[1]));
773 DEBUG_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
774 PL_colors[0],PL_colors[1]));
778 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
781 /* The found string does not prohibit matching at strpos,
782 - no optimization of calling REx engine can be performed,
783 unless it was an MBOL and we are not after MBOL,
784 or a future STCLASS check will fail this. */
786 /* Even in this situation we may use MBOL flag if strpos is offset
787 wrt the start of the string. */
788 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
789 && (strpos != strbeg) && strpos[-1] != '\n'
790 /* May be due to an implicit anchor of m{.*foo} */
791 && !(prog->reganch & ROPT_IMPLICIT))
796 DEBUG_r( if (ml_anch)
797 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
798 (long)(strpos - i_strpos), PL_colors[0],PL_colors[1]);
801 if (!(prog->reganch & ROPT_NAUGHTY) /* XXXX If strpos moved? */
803 prog->check_utf8 /* Could be deleted already */
804 && --BmUSEFUL(prog->check_utf8) < 0
805 && (prog->check_utf8 == prog->float_utf8)
807 prog->check_substr /* Could be deleted already */
808 && --BmUSEFUL(prog->check_substr) < 0
809 && (prog->check_substr == prog->float_substr)
812 /* If flags & SOMETHING - do not do it many times on the same match */
813 DEBUG_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
814 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
815 if (do_utf8 ? prog->check_substr : prog->check_utf8)
816 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
817 prog->check_substr = prog->check_utf8 = NULL; /* disable */
818 prog->float_substr = prog->float_utf8 = NULL; /* clear */
819 check = NULL; /* abort */
821 /* XXXX This is a remnant of the old implementation. It
822 looks wasteful, since now INTUIT can use many
824 prog->reganch &= ~RE_USE_INTUIT;
831 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
832 if (prog->regstclass) {
833 /* minlen == 0 is possible if regstclass is \b or \B,
834 and the fixed substr is ''$.
835 Since minlen is already taken into account, s+1 is before strend;
836 accidentally, minlen >= 1 guaranties no false positives at s + 1
837 even for \b or \B. But (minlen? 1 : 0) below assumes that
838 regstclass does not come from lookahead... */
839 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
840 This leaves EXACTF only, which is dealt with in find_byclass(). */
841 const U8* const str = (U8*)STRING(prog->regstclass);
842 const int cl_l = (PL_regkind[(U8)OP(prog->regstclass)] == EXACT
843 ? CHR_DIST((U8 *)str+STR_LEN(prog->regstclass),
846 const char * const endpos = (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
847 ? HOP3c(s, (prog->minlen ? cl_l : 0), strend)
848 : (prog->float_substr || prog->float_utf8
849 ? HOP3c(HOP3c(check_at, -start_shift, strbeg),
855 s = find_byclass(prog, prog->regstclass, s, endpos, 1);
858 const char *what = NULL;
860 if (endpos == strend) {
861 DEBUG_r( PerlIO_printf(Perl_debug_log,
862 "Could not match STCLASS...\n") );
865 DEBUG_r( PerlIO_printf(Perl_debug_log,
866 "This position contradicts STCLASS...\n") );
867 if ((prog->reganch & ROPT_ANCH) && !ml_anch)
869 /* Contradict one of substrings */
870 if (prog->anchored_substr || prog->anchored_utf8) {
871 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
872 DEBUG_r( what = "anchored" );
874 s = HOP3c(t, 1, strend);
875 if (s + start_shift + end_shift > strend) {
876 /* XXXX Should be taken into account earlier? */
877 DEBUG_r( PerlIO_printf(Perl_debug_log,
878 "Could not match STCLASS...\n") );
883 DEBUG_r( PerlIO_printf(Perl_debug_log,
884 "Looking for %s substr starting at offset %ld...\n",
885 what, (long)(s + start_shift - i_strpos)) );
888 /* Have both, check_string is floating */
889 if (t + start_shift >= check_at) /* Contradicts floating=check */
890 goto retry_floating_check;
891 /* Recheck anchored substring, but not floating... */
895 DEBUG_r( PerlIO_printf(Perl_debug_log,
896 "Looking for anchored substr starting at offset %ld...\n",
897 (long)(other_last - i_strpos)) );
898 goto do_other_anchored;
900 /* Another way we could have checked stclass at the
901 current position only: */
906 DEBUG_r( PerlIO_printf(Perl_debug_log,
907 "Looking for /%s^%s/m starting at offset %ld...\n",
908 PL_colors[0],PL_colors[1], (long)(t - i_strpos)) );
911 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
913 /* Check is floating subtring. */
914 retry_floating_check:
915 t = check_at - start_shift;
916 DEBUG_r( what = "floating" );
917 goto hop_and_restart;
920 DEBUG_r(PerlIO_printf(Perl_debug_log,
921 "By STCLASS: moving %ld --> %ld\n",
922 (long)(t - i_strpos), (long)(s - i_strpos))
926 DEBUG_r(PerlIO_printf(Perl_debug_log,
927 "Does not contradict STCLASS...\n");
932 DEBUG_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
933 PL_colors[4], (check ? "Guessed" : "Giving up"),
934 PL_colors[5], (long)(s - i_strpos)) );
937 fail_finish: /* Substring not found */
938 if (prog->check_substr || prog->check_utf8) /* could be removed already */
939 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
941 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
942 PL_colors[4],PL_colors[5]));
946 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
949 ibcmp_utf8(s, NULL, 0, do_utf8, \
950 m, NULL, ln, (bool)UTF)) \
951 && (norun || regtry(prog, s)) ) \
954 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
955 uvchr_to_utf8(tmpbuf, c); \
956 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
958 && (f == c1 || f == c2) \
959 && (ln == foldlen || \
960 !ibcmp_utf8((char *) foldbuf, \
961 NULL, foldlen, do_utf8, \
963 NULL, ln, (bool)UTF)) \
964 && (norun || regtry(prog, s)) ) \
969 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
973 && (ln == 1 || !(OP(c) == EXACTF \
975 : ibcmp_locale(s, m, ln))) \
976 && (norun || regtry(prog, s)) ) \
982 #define REXEC_FBC_UTF8_SCAN(CoDe) \
984 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
990 #define REXEC_FBC_SCAN(CoDe) \
992 while (s < strend) { \
998 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
999 REXEC_FBC_UTF8_SCAN( \
1001 if (tmp && (norun || regtry(prog, s))) \
1010 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1013 if (tmp && (norun || regtry(prog, s))) \
1022 #define REXEC_FBC_TRYIT \
1023 if ((norun || regtry(prog, s))) \
1026 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1029 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1032 REXEC_FBC_CLASS_SCAN(CoNd); \
1036 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1037 PL_reg_flags |= RF_tainted; \
1039 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1042 REXEC_FBC_CLASS_SCAN(CoNd); \
1047 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, I32 norun)
1049 const I32 doevery = (prog->reganch & ROPT_SKIP) == 0;
1053 register STRLEN uskip;
1057 register I32 tmp = 1; /* Scratch variable? */
1058 register const bool do_utf8 = PL_reg_match_utf8;
1060 /* We know what class it must start with. */
1064 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1065 !UTF8_IS_INVARIANT((U8)s[0]) ?
1066 reginclass(c, (U8*)s, 0, do_utf8) :
1067 REGINCLASS(c, (U8*)s));
1070 while (s < strend) {
1073 if (REGINCLASS(c, (U8*)s) ||
1074 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1075 /* The assignment of 2 is intentional:
1076 * for the folded sharp s, the skip is 2. */
1077 (skip = SHARP_S_SKIP))) {
1078 if (tmp && (norun || regtry(prog, s)))
1091 if (tmp && (norun || regtry(prog, s)))
1099 ln = STR_LEN(c); /* length to match in octets/bytes */
1100 lnc = (I32) ln; /* length to match in characters */
1102 STRLEN ulen1, ulen2;
1104 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1105 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1106 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1108 to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1109 to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1111 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1113 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1116 while (sm < ((U8 *) m + ln)) {
1131 c2 = PL_fold_locale[c1];
1133 e = HOP3c(strend, -((I32)lnc), s);
1136 e = s; /* Due to minlen logic of intuit() */
1138 /* The idea in the EXACTF* cases is to first find the
1139 * first character of the EXACTF* node and then, if
1140 * necessary, case-insensitively compare the full
1141 * text of the node. The c1 and c2 are the first
1142 * characters (though in Unicode it gets a bit
1143 * more complicated because there are more cases
1144 * than just upper and lower: one needs to use
1145 * the so-called folding case for case-insensitive
1146 * matching (called "loose matching" in Unicode).
1147 * ibcmp_utf8() will do just that. */
1151 U8 tmpbuf [UTF8_MAXBYTES+1];
1152 STRLEN len, foldlen;
1153 const U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
1155 /* Upper and lower of 1st char are equal -
1156 * probably not a "letter". */
1158 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1160 REXEC_FBC_EXACTISH_CHECK(c == c1);
1165 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1168 /* Handle some of the three Greek sigmas cases.
1169 * Note that not all the possible combinations
1170 * are handled here: some of them are handled
1171 * by the standard folding rules, and some of
1172 * them (the character class or ANYOF cases)
1173 * are handled during compiletime in
1174 * regexec.c:S_regclass(). */
1175 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1176 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1177 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1179 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1185 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1187 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1191 PL_reg_flags |= RF_tainted;
1198 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1199 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1201 tmp = ((OP(c) == BOUND ?
1202 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1203 LOAD_UTF8_CHARCLASS_ALNUM();
1204 REXEC_FBC_UTF8_SCAN(
1205 if (tmp == !(OP(c) == BOUND ?
1206 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1207 isALNUM_LC_utf8((U8*)s)))
1215 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1216 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1219 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1225 if ((!prog->minlen && tmp) && (norun || regtry(prog, s)))
1229 PL_reg_flags |= RF_tainted;
1236 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1237 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, 0);
1239 tmp = ((OP(c) == NBOUND ?
1240 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1241 LOAD_UTF8_CHARCLASS_ALNUM();
1242 REXEC_FBC_UTF8_SCAN(
1243 if (tmp == !(OP(c) == NBOUND ?
1244 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1245 isALNUM_LC_utf8((U8*)s)))
1247 else REXEC_FBC_TRYIT;
1251 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1252 tmp = ((OP(c) == NBOUND ?
1253 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1256 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1258 else REXEC_FBC_TRYIT;
1261 if ((!prog->minlen && !tmp) && (norun || regtry(prog, s)))
1265 REXEC_FBC_CSCAN_PRELOAD(
1266 LOAD_UTF8_CHARCLASS_ALNUM(),
1267 swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1271 REXEC_FBC_CSCAN_TAINT(
1272 isALNUM_LC_utf8((U8*)s),
1276 REXEC_FBC_CSCAN_PRELOAD(
1277 LOAD_UTF8_CHARCLASS_ALNUM(),
1278 !swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8),
1282 REXEC_FBC_CSCAN_TAINT(
1283 !isALNUM_LC_utf8((U8*)s),
1287 REXEC_FBC_CSCAN_PRELOAD(
1288 LOAD_UTF8_CHARCLASS_SPACE(),
1289 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8),
1293 REXEC_FBC_CSCAN_TAINT(
1294 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1298 REXEC_FBC_CSCAN_PRELOAD(
1299 LOAD_UTF8_CHARCLASS_SPACE(),
1300 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8)),
1304 REXEC_FBC_CSCAN_TAINT(
1305 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1309 REXEC_FBC_CSCAN_PRELOAD(
1310 LOAD_UTF8_CHARCLASS_DIGIT(),
1311 swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1315 REXEC_FBC_CSCAN_TAINT(
1316 isDIGIT_LC_utf8((U8*)s),
1320 REXEC_FBC_CSCAN_PRELOAD(
1321 LOAD_UTF8_CHARCLASS_DIGIT(),
1322 !swash_fetch(PL_utf8_digit,(U8*)s, do_utf8),
1326 REXEC_FBC_CSCAN_TAINT(
1327 !isDIGIT_LC_utf8((U8*)s),
1331 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1340 - regexec_flags - match a regexp against a string
1343 Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char *strend,
1344 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1345 /* strend: pointer to null at end of string */
1346 /* strbeg: real beginning of string */
1347 /* minend: end of match must be >=minend after stringarg. */
1348 /* data: May be used for some additional optimizations. */
1349 /* nosave: For optimizations. */
1352 register regnode *c;
1353 register char *startpos = stringarg;
1354 I32 minlen; /* must match at least this many chars */
1355 I32 dontbother = 0; /* how many characters not to try at end */
1356 I32 end_shift = 0; /* Same for the end. */ /* CC */
1357 I32 scream_pos = -1; /* Internal iterator of scream. */
1358 char *scream_olds = NULL;
1359 SV* oreplsv = GvSV(PL_replgv);
1360 const bool do_utf8 = DO_UTF8(sv);
1366 PERL_UNUSED_ARG(data);
1368 /* Be paranoid... */
1369 if (prog == NULL || startpos == NULL) {
1370 Perl_croak(aTHX_ "NULL regexp parameter");
1378 PL_regnarrate = DEBUG_r_TEST;
1381 RX_MATCH_UTF8_set(prog, do_utf8);
1383 multiline = PL_multiline | (prog->reganch & PMf_MULTILINE);
1386 dsv0 = PERL_DEBUG_PAD_ZERO(0);
1387 dsv1 = PERL_DEBUG_PAD_ZERO(1);
1390 minlen = prog->minlen;
1391 if (strend - startpos < minlen) {
1392 DEBUG_r(PerlIO_printf(Perl_debug_log,
1393 "String too short [regexec_flags]...\n"));
1397 /* Check validity of program. */
1398 if (UCHARAT(prog->program) != REG_MAGIC) {
1399 Perl_croak(aTHX_ "corrupted regexp program");
1403 PL_reg_eval_set = 0;
1406 if (prog->reganch & ROPT_UTF8)
1407 PL_reg_flags |= RF_utf8;
1409 /* Mark beginning of line for ^ and lookbehind. */
1410 PL_regbol = startpos;
1414 /* Mark end of line for $ (and such) */
1417 /* see how far we have to get to not match where we matched before */
1418 PL_regtill = startpos+minend;
1420 /* We start without call_cc context. */
1423 /* If there is a "must appear" string, look for it. */
1426 if (prog->reganch & ROPT_GPOS_SEEN) { /* Need to have PL_reg_ganch */
1429 if (flags & REXEC_IGNOREPOS) /* Means: check only at start */
1430 PL_reg_ganch = startpos;
1431 else if (sv && SvTYPE(sv) >= SVt_PVMG
1433 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1434 && mg->mg_len >= 0) {
1435 PL_reg_ganch = strbeg + mg->mg_len; /* Defined pos() */
1436 if (prog->reganch & ROPT_ANCH_GPOS) {
1437 if (s > PL_reg_ganch)
1442 else /* pos() not defined */
1443 PL_reg_ganch = strbeg;
1446 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1447 re_scream_pos_data d;
1449 d.scream_olds = &scream_olds;
1450 d.scream_pos = &scream_pos;
1451 s = re_intuit_start(prog, sv, s, strend, flags, &d);
1453 DEBUG_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1454 goto phooey; /* not present */
1459 const char * const s0 = UTF
1460 ? pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60,
1463 const int len0 = UTF ? (int)SvCUR(dsv0) : prog->prelen;
1464 const char * const s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60,
1465 UNI_DISPLAY_REGEX) : startpos;
1466 const int len1 = do_utf8 ? (int)SvCUR(dsv1) : strend - startpos;
1469 PerlIO_printf(Perl_debug_log,
1470 "%sMatching REx%s \"%s%*.*s%s%s\" against \"%s%.*s%s%s\"\n",
1471 PL_colors[4],PL_colors[5],PL_colors[0],
1474 len0 > 60 ? "..." : "",
1476 (int)(len1 > 60 ? 60 : len1),
1478 (len1 > 60 ? "..." : "")
1482 /* Simplest case: anchored match need be tried only once. */
1483 /* [unless only anchor is BOL and multiline is set] */
1484 if (prog->reganch & (ROPT_ANCH & ~ROPT_ANCH_GPOS)) {
1485 if (s == startpos && regtry(prog, startpos))
1487 else if (multiline || (prog->reganch & ROPT_IMPLICIT)
1488 || (prog->reganch & ROPT_ANCH_MBOL)) /* XXXX SBOL? */
1493 dontbother = minlen - 1;
1494 end = HOP3c(strend, -dontbother, strbeg) - 1;
1495 /* for multiline we only have to try after newlines */
1496 if (prog->check_substr || prog->check_utf8) {
1500 if (regtry(prog, s))
1505 if (prog->reganch & RE_USE_INTUIT) {
1506 s = re_intuit_start(prog, sv, s + 1, strend, flags, NULL);
1517 if (*s++ == '\n') { /* don't need PL_utf8skip here */
1518 if (regtry(prog, s))
1525 } else if (prog->reganch & ROPT_ANCH_GPOS) {
1526 if (regtry(prog, PL_reg_ganch))
1531 /* Messy cases: unanchored match. */
1532 if ((prog->anchored_substr || prog->anchored_utf8) && prog->reganch & ROPT_SKIP) {
1533 /* we have /x+whatever/ */
1534 /* it must be a one character string (XXXX Except UTF?) */
1539 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1540 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1541 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
1546 DEBUG_r( did_match = 1 );
1547 if (regtry(prog, s)) goto got_it;
1549 while (s < strend && *s == ch)
1557 DEBUG_r( did_match = 1 );
1558 if (regtry(prog, s)) goto got_it;
1560 while (s < strend && *s == ch)
1565 DEBUG_r(if (!did_match)
1566 PerlIO_printf(Perl_debug_log,
1567 "Did not find anchored character...\n")
1570 else if (prog->anchored_substr != NULL
1571 || prog->anchored_utf8 != NULL
1572 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
1573 && prog->float_max_offset < strend - s)) {
1578 char *last1; /* Last position checked before */
1582 if (prog->anchored_substr || prog->anchored_utf8) {
1583 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
1584 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1585 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
1586 back_max = back_min = prog->anchored_offset;
1588 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1589 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1590 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
1591 back_max = prog->float_max_offset;
1592 back_min = prog->float_min_offset;
1594 if (must == &PL_sv_undef)
1595 /* could not downgrade utf8 check substring, so must fail */
1598 last = HOP3c(strend, /* Cannot start after this */
1599 -(I32)(CHR_SVLEN(must)
1600 - (SvTAIL(must) != 0) + back_min), strbeg);
1603 last1 = HOPc(s, -1);
1605 last1 = s - 1; /* bogus */
1607 /* XXXX check_substr already used to find "s", can optimize if
1608 check_substr==must. */
1610 dontbother = end_shift;
1611 strend = HOPc(strend, -dontbother);
1612 while ( (s <= last) &&
1613 ((flags & REXEC_SCREAM)
1614 ? (s = screaminstr(sv, must, HOP3c(s, back_min, strend) - strbeg,
1615 end_shift, &scream_pos, 0))
1616 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, strend),
1617 (unsigned char*)strend, must,
1618 multiline ? FBMrf_MULTILINE : 0))) ) {
1619 /* we may be pointing at the wrong string */
1620 if ((flags & REXEC_SCREAM) && RX_MATCH_COPIED(prog))
1621 s = strbeg + (s - SvPVX_const(sv));
1622 DEBUG_r( did_match = 1 );
1623 if (HOPc(s, -back_max) > last1) {
1624 last1 = HOPc(s, -back_min);
1625 s = HOPc(s, -back_max);
1628 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
1630 last1 = HOPc(s, -back_min);
1634 while (s <= last1) {
1635 if (regtry(prog, s))
1641 while (s <= last1) {
1642 if (regtry(prog, s))
1648 DEBUG_r(if (!did_match)
1649 PerlIO_printf(Perl_debug_log,
1650 "Did not find %s substr \"%s%.*s%s\"%s...\n",
1651 ((must == prog->anchored_substr || must == prog->anchored_utf8)
1652 ? "anchored" : "floating"),
1654 (int)(SvCUR(must) - (SvTAIL(must)!=0)),
1656 PL_colors[1], (SvTAIL(must) ? "$" : ""))
1660 else if ((c = prog->regstclass)) {
1662 I32 op = (U8)OP(prog->regstclass);
1663 /* don't bother with what can't match */
1664 if (PL_regkind[op] != EXACT && op != CANY)
1665 strend = HOPc(strend, -(minlen - 1));
1668 SV *prop = sv_newmortal();
1676 pv_uni_display(dsv0, (U8*)SvPVX_const(prop), SvCUR(prop), 60,
1677 UNI_DISPLAY_REGEX) :
1679 len0 = UTF ? SvCUR(dsv0) : SvCUR(prop);
1681 sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_REGEX) : s;
1682 len1 = UTF ? (int)SvCUR(dsv1) : strend - s;
1683 PerlIO_printf(Perl_debug_log,
1684 "Matching stclass \"%*.*s\" against \"%*.*s\"\n",
1688 if (find_byclass(prog, c, s, strend, 0))
1690 DEBUG_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass...\n"));
1694 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
1699 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
1700 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
1701 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
1703 if (flags & REXEC_SCREAM) {
1704 last = screaminstr(sv, float_real, s - strbeg,
1705 end_shift, &scream_pos, 1); /* last one */
1707 last = scream_olds; /* Only one occurrence. */
1708 /* we may be pointing at the wrong string */
1709 else if (RX_MATCH_COPIED(prog))
1710 s = strbeg + (s - SvPVX_const(sv));
1714 const char * const little = SvPV_const(float_real, len);
1716 if (SvTAIL(float_real)) {
1717 if (memEQ(strend - len + 1, little, len - 1))
1718 last = strend - len + 1;
1719 else if (!multiline)
1720 last = memEQ(strend - len, little, len)
1721 ? strend - len : NULL;
1727 last = rninstr(s, strend, little, little + len);
1729 last = strend; /* matching "$" */
1733 DEBUG_r(PerlIO_printf(Perl_debug_log,
1734 "%sCan't trim the tail, match fails (should not happen)%s\n",
1735 PL_colors[4],PL_colors[5]));
1736 goto phooey; /* Should not happen! */
1738 dontbother = strend - last + prog->float_min_offset;
1740 if (minlen && (dontbother < minlen))
1741 dontbother = minlen - 1;
1742 strend -= dontbother; /* this one's always in bytes! */
1743 /* We don't know much -- general case. */
1746 if (regtry(prog, s))
1755 if (regtry(prog, s))
1757 } while (s++ < strend);
1765 RX_MATCH_TAINTED_set(prog, PL_reg_flags & RF_tainted);
1767 if (PL_reg_eval_set) {
1768 /* Preserve the current value of $^R */
1769 if (oreplsv != GvSV(PL_replgv))
1770 sv_setsv(oreplsv, GvSV(PL_replgv));/* So that when GvSV(replgv) is
1771 restored, the value remains
1773 restore_pos(aTHX_ 0);
1776 /* make sure $`, $&, $', and $digit will work later */
1777 if ( !(flags & REXEC_NOT_FIRST) ) {
1778 if (RX_MATCH_COPIED(prog)) {
1779 Safefree(prog->subbeg);
1780 RX_MATCH_COPIED_off(prog);
1782 if (flags & REXEC_COPY_STR) {
1783 I32 i = PL_regeol - startpos + (stringarg - strbeg);
1785 s = savepvn(strbeg, i);
1788 RX_MATCH_COPIED_on(prog);
1791 prog->subbeg = strbeg;
1792 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
1799 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
1800 PL_colors[4],PL_colors[5]));
1801 if (PL_reg_eval_set)
1802 restore_pos(aTHX_ 0);
1807 - regtry - try match at specific point
1809 STATIC I32 /* 0 failure, 1 success */
1810 S_regtry(pTHX_ regexp *prog, char *startpos)
1817 PL_regindent = 0; /* XXXX Not good when matches are reenterable... */
1819 if ((prog->reganch & ROPT_EVAL_SEEN) && !PL_reg_eval_set) {
1822 PL_reg_eval_set = RS_init;
1824 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
1825 (IV)(PL_stack_sp - PL_stack_base));
1828 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
1829 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
1831 /* Apparently this is not needed, judging by wantarray. */
1832 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
1833 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
1836 /* Make $_ available to executed code. */
1837 if (PL_reg_sv != DEFSV) {
1838 /* SAVE_DEFSV does *not* suffice here for USE_5005THREADS */
1843 if (!(SvTYPE(PL_reg_sv) >= SVt_PVMG && SvMAGIC(PL_reg_sv)
1844 && (mg = mg_find(PL_reg_sv, PERL_MAGIC_regex_global)))) {
1845 /* prepare for quick setting of pos */
1846 #ifdef PERL_OLD_COPY_ON_WRITE
1848 sv_force_normal_flags(sv, 0);
1850 mg = sv_magicext(PL_reg_sv, (SV*)0, PERL_MAGIC_regex_global,
1851 &PL_vtbl_mglob, NULL, 0);
1855 PL_reg_oldpos = mg->mg_len;
1856 SAVEDESTRUCTOR_X(restore_pos, 0);
1858 if (!PL_reg_curpm) {
1859 Newxz(PL_reg_curpm, 1, PMOP);
1862 SV* repointer = newSViv(0);
1863 /* so we know which PL_regex_padav element is PL_reg_curpm */
1864 SvFLAGS(repointer) |= SVf_BREAK;
1865 av_push(PL_regex_padav,repointer);
1866 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
1867 PL_regex_pad = AvARRAY(PL_regex_padav);
1871 PM_SETRE(PL_reg_curpm, prog);
1872 PL_reg_oldcurpm = PL_curpm;
1873 PL_curpm = PL_reg_curpm;
1874 if (RX_MATCH_COPIED(prog)) {
1875 /* Here is a serious problem: we cannot rewrite subbeg,
1876 since it may be needed if this match fails. Thus
1877 $` inside (?{}) could fail... */
1878 PL_reg_oldsaved = prog->subbeg;
1879 PL_reg_oldsavedlen = prog->sublen;
1880 RX_MATCH_COPIED_off(prog);
1883 PL_reg_oldsaved = NULL;
1884 prog->subbeg = PL_bostr;
1885 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
1887 prog->startp[0] = startpos - PL_bostr;
1888 PL_reginput = startpos;
1889 PL_regstartp = prog->startp;
1890 PL_regendp = prog->endp;
1891 PL_reglastparen = &prog->lastparen;
1892 PL_reglastcloseparen = &prog->lastcloseparen;
1893 prog->lastparen = 0;
1894 prog->lastcloseparen = 0;
1896 DEBUG_r(PL_reg_starttry = startpos);
1897 if (PL_reg_start_tmpl <= prog->nparens) {
1898 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
1899 if(PL_reg_start_tmp)
1900 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1902 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
1905 /* XXXX What this code is doing here?!!! There should be no need
1906 to do this again and again, PL_reglastparen should take care of
1909 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
1910 * Actually, the code in regcppop() (which Ilya may be meaning by
1911 * PL_reglastparen), is not needed at all by the test suite
1912 * (op/regexp, op/pat, op/split), but that code is needed, oddly
1913 * enough, for building DynaLoader, or otherwise this
1914 * "Error: '*' not in typemap in DynaLoader.xs, line 164"
1915 * will happen. Meanwhile, this code *is* needed for the
1916 * above-mentioned test suite tests to succeed. The common theme
1917 * on those tests seems to be returning null fields from matches.
1922 if (prog->nparens) {
1924 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
1931 if (regmatch(prog->program + 1)) {
1932 prog->endp[0] = PL_reginput - PL_bostr;
1935 REGCP_UNWIND(lastcp);
1939 #define RE_UNWIND_BRANCH 1
1940 #define RE_UNWIND_BRANCHJ 2
1944 typedef struct { /* XX: makes sense to enlarge it... */
1948 } re_unwind_generic_t;
1961 } re_unwind_branch_t;
1963 typedef union re_unwind_t {
1965 re_unwind_generic_t generic;
1966 re_unwind_branch_t branch;
1969 #define sayYES goto yes
1970 #define sayNO goto no
1971 #define sayNO_ANYOF goto no_anyof
1972 #define sayYES_FINAL goto yes_final
1973 #define sayYES_LOUD goto yes_loud
1974 #define sayNO_FINAL goto no_final
1975 #define sayNO_SILENT goto do_no
1976 #define saySAME(x) if (x) goto yes; else goto no
1978 #define POSCACHE_SUCCESS 0 /* caching success rather than failure */
1979 #define POSCACHE_SEEN 1 /* we know what we're caching */
1980 #define POSCACHE_START 2 /* the real cache: this bit maps to pos 0 */
1981 #define CACHEsayYES STMT_START { \
1982 if (cache_offset | cache_bit) { \
1983 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
1984 PL_reg_poscache[0] |= (1<<POSCACHE_SUCCESS) || (1<<POSCACHE_SEEN); \
1985 else if (!(PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
1986 /* cache records failure, but this is success */ \
1988 PerlIO_printf(Perl_debug_log, \
1989 "%*s (remove success from failure cache)\n", \
1990 REPORT_CODE_OFF+PL_regindent*2, "") \
1992 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
1997 #define CACHEsayNO STMT_START { \
1998 if (cache_offset | cache_bit) { \
1999 if (!(PL_reg_poscache[0] & (1<<POSCACHE_SEEN))) \
2000 PL_reg_poscache[0] |= (1<<POSCACHE_SEEN); \
2001 else if ((PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))) { \
2002 /* cache records success, but this is failure */ \
2004 PerlIO_printf(Perl_debug_log, \
2005 "%*s (remove failure from success cache)\n", \
2006 REPORT_CODE_OFF+PL_regindent*2, "") \
2008 PL_reg_poscache[cache_offset] &= ~(1<<cache_bit); \
2015 /* this value indiciates that the c1/c2 "next char" test should be skipped */
2016 #define CHRTEST_VOID -1000
2018 #define REPORT_CODE_OFF 24
2021 - regmatch - main matching routine
2023 * Conceptually the strategy is simple: check to see whether the current
2024 * node matches, call self recursively to see whether the rest matches,
2025 * and then act accordingly. In practice we make some effort to avoid
2026 * recursion, in particular by going through "ordinary" nodes (that don't
2027 * need to know whether the rest of the match failed) by a loop instead of
2030 /* [lwall] I've hoisted the register declarations to the outer block in order to
2031 * maybe save a little bit of pushing and popping on the stack. It also takes
2032 * advantage of machines that use a register save mask on subroutine entry.
2034 STATIC I32 /* 0 failure, 1 success */
2035 S_regmatch(pTHX_ regnode *prog)
2037 register regnode *scan; /* Current node. */
2038 regnode *next; /* Next node. */
2039 regnode *inner; /* Next node in internal branch. */
2040 register I32 nextchr; /* renamed nextchr - nextchar colides with
2041 function of same name */
2042 register I32 n; /* no or next */
2043 register I32 ln = 0; /* len or last */
2044 register char *s = NULL; /* operand or save */
2045 register char *locinput = PL_reginput;
2046 register I32 c1 = 0, c2 = 0, paren; /* case fold search, parenth */
2047 int minmod = 0, sw = 0, logical = 0;
2050 I32 firstcp = PL_savestack_ix;
2052 register const bool do_utf8 = PL_reg_match_utf8;
2054 SV * const dsv0 = PERL_DEBUG_PAD_ZERO(0);
2055 SV * const dsv1 = PERL_DEBUG_PAD_ZERO(1);
2056 SV * const dsv2 = PERL_DEBUG_PAD_ZERO(2);
2058 U32 uniflags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY;
2066 /* Note that nextchr is a byte even in UTF */
2067 nextchr = UCHARAT(locinput);
2069 while (scan != NULL) {
2072 SV * const prop = sv_newmortal();
2073 const int docolor = *PL_colors[0];
2074 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2075 int l = (PL_regeol - locinput) > taill ? taill : (PL_regeol - locinput);
2076 /* The part of the string before starttry has one color
2077 (pref0_len chars), between starttry and current
2078 position another one (pref_len - pref0_len chars),
2079 after the current position the third one.
2080 We assume that pref0_len <= pref_len, otherwise we
2081 decrease pref0_len. */
2082 int pref_len = (locinput - PL_bostr) > (5 + taill) - l
2083 ? (5 + taill) - l : locinput - PL_bostr;
2086 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2088 pref0_len = pref_len - (locinput - PL_reg_starttry);
2089 if (l + pref_len < (5 + taill) && l < PL_regeol - locinput)
2090 l = ( PL_regeol - locinput > (5 + taill) - pref_len
2091 ? (5 + taill) - pref_len : PL_regeol - locinput);
2092 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2096 if (pref0_len > pref_len)
2097 pref0_len = pref_len;
2098 regprop(prop, scan);
2100 const char * const s0 =
2101 do_utf8 && OP(scan) != CANY ?
2102 pv_uni_display(dsv0, (U8*)(locinput - pref_len),
2103 pref0_len, 60, UNI_DISPLAY_REGEX) :
2104 locinput - pref_len;
2105 const int len0 = do_utf8 ? (int)strlen(s0) : pref0_len;
2106 const char * const s1 = do_utf8 && OP(scan) != CANY ?
2107 pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
2108 pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
2109 locinput - pref_len + pref0_len;
2110 const int len1 = do_utf8 ? (int)strlen(s1) : pref_len - pref0_len;
2111 const char * const s2 = do_utf8 && OP(scan) != CANY ?
2112 pv_uni_display(dsv2, (U8*)locinput,
2113 PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
2115 const int len2 = do_utf8 ? (int)strlen(s2) : l;
2116 PerlIO_printf(Perl_debug_log,
2117 "%4"IVdf" <%s%.*s%s%s%.*s%s%s%s%.*s%s>%*s|%3"IVdf":%*s%s\n",
2118 (IV)(locinput - PL_bostr),
2125 (docolor ? "" : "> <"),
2129 15 - l - pref_len + 1,
2131 (IV)(scan - PL_regprogram), PL_regindent*2, "",
2136 next = scan + NEXT_OFF(scan);
2142 if (locinput == PL_bostr || (PL_multiline &&
2143 (nextchr || locinput < PL_regeol) && locinput[-1] == '\n') )
2145 /* regtill = regbol; */
2150 if (locinput == PL_bostr ||
2151 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2157 if (locinput == PL_bostr)
2161 if (locinput == PL_reg_ganch)
2171 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2176 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2178 if (PL_regeol - locinput > 1)
2182 if (PL_regeol != locinput)
2186 if (!nextchr && locinput >= PL_regeol)
2189 locinput += PL_utf8skip[nextchr];
2190 if (locinput > PL_regeol)
2192 nextchr = UCHARAT(locinput);
2195 nextchr = UCHARAT(++locinput);
2198 if (!nextchr && locinput >= PL_regeol)
2200 nextchr = UCHARAT(++locinput);
2203 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
2206 locinput += PL_utf8skip[nextchr];
2207 if (locinput > PL_regeol)
2209 nextchr = UCHARAT(locinput);
2212 nextchr = UCHARAT(++locinput);
2217 if (do_utf8 != UTF) {
2218 /* The target and the pattern have differing utf8ness. */
2220 const char *e = s + ln;
2223 /* The target is utf8, the pattern is not utf8. */
2228 if (NATIVE_TO_UNI(*(U8*)s) !=
2229 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
2237 /* The target is not utf8, the pattern is utf8. */
2242 if (NATIVE_TO_UNI(*((U8*)l)) !=
2243 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
2251 nextchr = UCHARAT(locinput);
2254 /* The target and the pattern have the same utf8ness. */
2255 /* Inline the first character, for speed. */
2256 if (UCHARAT(s) != nextchr)
2258 if (PL_regeol - locinput < ln)
2260 if (ln > 1 && memNE(s, locinput, ln))
2263 nextchr = UCHARAT(locinput);
2266 PL_reg_flags |= RF_tainted;
2272 if (do_utf8 || UTF) {
2273 /* Either target or the pattern are utf8. */
2275 char *e = PL_regeol;
2277 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
2278 l, &e, 0, do_utf8)) {
2279 /* One more case for the sharp s:
2280 * pack("U0U*", 0xDF) =~ /ss/i,
2281 * the 0xC3 0x9F are the UTF-8
2282 * byte sequence for the U+00DF. */
2284 toLOWER(s[0]) == 's' &&
2286 toLOWER(s[1]) == 's' &&
2293 nextchr = UCHARAT(locinput);
2297 /* Neither the target and the pattern are utf8. */
2299 /* Inline the first character, for speed. */
2300 if (UCHARAT(s) != nextchr &&
2301 UCHARAT(s) != ((OP(scan) == EXACTF)
2302 ? PL_fold : PL_fold_locale)[nextchr])
2304 if (PL_regeol - locinput < ln)
2306 if (ln > 1 && (OP(scan) == EXACTF
2307 ? ibcmp(s, locinput, ln)
2308 : ibcmp_locale(s, locinput, ln)))
2311 nextchr = UCHARAT(locinput);
2315 STRLEN inclasslen = PL_regeol - locinput;
2317 if (!reginclass(scan, (U8*)locinput, &inclasslen, do_utf8))
2319 if (locinput >= PL_regeol)
2321 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
2322 nextchr = UCHARAT(locinput);
2327 nextchr = UCHARAT(locinput);
2328 if (!REGINCLASS(scan, (U8*)locinput))
2330 if (!nextchr && locinput >= PL_regeol)
2332 nextchr = UCHARAT(++locinput);
2336 /* If we might have the case of the German sharp s
2337 * in a casefolding Unicode character class. */
2339 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
2340 locinput += SHARP_S_SKIP;
2341 nextchr = UCHARAT(locinput);
2347 PL_reg_flags |= RF_tainted;
2353 LOAD_UTF8_CHARCLASS_ALNUM();
2354 if (!(OP(scan) == ALNUM
2355 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2356 : isALNUM_LC_utf8((U8*)locinput)))
2360 locinput += PL_utf8skip[nextchr];
2361 nextchr = UCHARAT(locinput);
2364 if (!(OP(scan) == ALNUM
2365 ? isALNUM(nextchr) : isALNUM_LC(nextchr)))
2367 nextchr = UCHARAT(++locinput);
2370 PL_reg_flags |= RF_tainted;
2373 if (!nextchr && locinput >= PL_regeol)
2376 LOAD_UTF8_CHARCLASS_ALNUM();
2377 if (OP(scan) == NALNUM
2378 ? (bool)swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8)
2379 : isALNUM_LC_utf8((U8*)locinput))
2383 locinput += PL_utf8skip[nextchr];
2384 nextchr = UCHARAT(locinput);
2387 if (OP(scan) == NALNUM
2388 ? isALNUM(nextchr) : isALNUM_LC(nextchr))
2390 nextchr = UCHARAT(++locinput);
2394 PL_reg_flags |= RF_tainted;
2398 /* was last char in word? */
2400 if (locinput == PL_bostr)
2403 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
2405 ln = utf8n_to_uvchr((U8 *)r, UTF8SKIP(r), 0, 0);
2407 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2408 ln = isALNUM_uni(ln);
2409 LOAD_UTF8_CHARCLASS_ALNUM();
2410 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
2413 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
2414 n = isALNUM_LC_utf8((U8*)locinput);
2418 ln = (locinput != PL_bostr) ?
2419 UCHARAT(locinput - 1) : '\n';
2420 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
2422 n = isALNUM(nextchr);
2425 ln = isALNUM_LC(ln);
2426 n = isALNUM_LC(nextchr);
2429 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
2430 OP(scan) == BOUNDL))
2434 PL_reg_flags |= RF_tainted;
2440 if (UTF8_IS_CONTINUED(nextchr)) {
2441 LOAD_UTF8_CHARCLASS_SPACE();
2442 if (!(OP(scan) == SPACE
2443 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2444 : isSPACE_LC_utf8((U8*)locinput)))
2448 locinput += PL_utf8skip[nextchr];
2449 nextchr = UCHARAT(locinput);
2452 if (!(OP(scan) == SPACE
2453 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2455 nextchr = UCHARAT(++locinput);
2458 if (!(OP(scan) == SPACE
2459 ? isSPACE(nextchr) : isSPACE_LC(nextchr)))
2461 nextchr = UCHARAT(++locinput);
2465 PL_reg_flags |= RF_tainted;
2468 if (!nextchr && locinput >= PL_regeol)
2471 LOAD_UTF8_CHARCLASS_SPACE();
2472 if (OP(scan) == NSPACE
2473 ? (bool)swash_fetch(PL_utf8_space, (U8*)locinput, do_utf8)
2474 : isSPACE_LC_utf8((U8*)locinput))
2478 locinput += PL_utf8skip[nextchr];
2479 nextchr = UCHARAT(locinput);
2482 if (OP(scan) == NSPACE
2483 ? isSPACE(nextchr) : isSPACE_LC(nextchr))
2485 nextchr = UCHARAT(++locinput);
2488 PL_reg_flags |= RF_tainted;
2494 LOAD_UTF8_CHARCLASS_DIGIT();
2495 if (!(OP(scan) == DIGIT
2496 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2497 : isDIGIT_LC_utf8((U8*)locinput)))
2501 locinput += PL_utf8skip[nextchr];
2502 nextchr = UCHARAT(locinput);
2505 if (!(OP(scan) == DIGIT
2506 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr)))
2508 nextchr = UCHARAT(++locinput);
2511 PL_reg_flags |= RF_tainted;
2514 if (!nextchr && locinput >= PL_regeol)
2517 LOAD_UTF8_CHARCLASS_DIGIT();
2518 if (OP(scan) == NDIGIT
2519 ? (bool)swash_fetch(PL_utf8_digit, (U8*)locinput, do_utf8)
2520 : isDIGIT_LC_utf8((U8*)locinput))
2524 locinput += PL_utf8skip[nextchr];
2525 nextchr = UCHARAT(locinput);
2528 if (OP(scan) == NDIGIT
2529 ? isDIGIT(nextchr) : isDIGIT_LC(nextchr))
2531 nextchr = UCHARAT(++locinput);
2534 if (locinput >= PL_regeol)
2537 LOAD_UTF8_CHARCLASS_MARK();
2538 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2540 locinput += PL_utf8skip[nextchr];
2541 while (locinput < PL_regeol &&
2542 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
2543 locinput += UTF8SKIP(locinput);
2544 if (locinput > PL_regeol)
2549 nextchr = UCHARAT(locinput);
2552 PL_reg_flags |= RF_tainted;
2556 n = ARG(scan); /* which paren pair */
2557 ln = PL_regstartp[n];
2558 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2559 if ((I32)*PL_reglastparen < n || ln == -1)
2560 sayNO; /* Do not match unless seen CLOSEn. */
2561 if (ln == PL_regendp[n])
2565 if (do_utf8 && OP(scan) != REF) { /* REF can do byte comparison */
2567 const char *e = PL_bostr + PL_regendp[n];
2569 * Note that we can't do the "other character" lookup trick as
2570 * in the 8-bit case (no pun intended) because in Unicode we
2571 * have to map both upper and title case to lower case.
2573 if (OP(scan) == REFF) {
2575 STRLEN ulen1, ulen2;
2576 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
2577 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
2581 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
2582 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
2583 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
2590 nextchr = UCHARAT(locinput);
2594 /* Inline the first character, for speed. */
2595 if (UCHARAT(s) != nextchr &&
2597 (UCHARAT(s) != ((OP(scan) == REFF
2598 ? PL_fold : PL_fold_locale)[nextchr]))))
2600 ln = PL_regendp[n] - ln;
2601 if (locinput + ln > PL_regeol)
2603 if (ln > 1 && (OP(scan) == REF
2604 ? memNE(s, locinput, ln)
2606 ? ibcmp(s, locinput, ln)
2607 : ibcmp_locale(s, locinput, ln))))
2610 nextchr = UCHARAT(locinput);
2621 OP_4tree * const oop = PL_op;
2622 COP * const ocurcop = PL_curcop;
2625 struct regexp * const oreg = PL_reg_re;
2628 PL_op = (OP_4tree*)PL_regdata->data[n];
2629 DEBUG_r( PerlIO_printf(Perl_debug_log, " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
2630 PAD_SAVE_LOCAL(old_comppad, (PAD*)PL_regdata->data[n + 2]);
2631 PL_regendp[0] = PL_reg_magic->mg_len = locinput - PL_bostr;
2634 SV ** const before = SP;
2635 CALLRUNOPS(aTHX); /* Scalar context. */
2638 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
2646 PAD_RESTORE_LOCAL(old_comppad);
2647 PL_curcop = ocurcop;
2649 if (logical == 2) { /* Postponed subexpression. */
2653 CHECKPOINT cp, lastcp;
2657 if(SvROK(ret) && SvSMAGICAL(sv = SvRV(ret)))
2658 mg = mg_find(sv, PERL_MAGIC_qr);
2659 else if (SvSMAGICAL(ret)) {
2660 if (SvGMAGICAL(ret))
2661 sv_unmagic(ret, PERL_MAGIC_qr);
2663 mg = mg_find(ret, PERL_MAGIC_qr);
2667 re = (regexp *)mg->mg_obj;
2668 (void)ReREFCNT_inc(re);
2672 const char * const t = SvPV_const(ret, len);
2674 char * const oprecomp = PL_regprecomp;
2675 const I32 osize = PL_regsize;
2676 const I32 onpar = PL_regnpar;
2679 if (DO_UTF8(ret)) pm.op_pmdynflags |= PMdf_DYN_UTF8;
2680 re = CALLREGCOMP(aTHX_ (char*)t, (char*)t + len, &pm);
2682 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
2684 sv_magic(ret,(SV*)ReREFCNT_inc(re),
2686 PL_regprecomp = oprecomp;
2691 PerlIO_printf(Perl_debug_log,
2692 "Entering embedded \"%s%.60s%s%s\"\n",
2696 (strlen(re->precomp) > 60 ? "..." : ""))
2699 state.prev = PL_reg_call_cc;
2700 state.cc = PL_regcc;
2701 state.re = PL_reg_re;
2705 cp = regcppush(0); /* Save *all* the positions. */
2708 state.ss = PL_savestack_ix;
2709 *PL_reglastparen = 0;
2710 *PL_reglastcloseparen = 0;
2711 PL_reg_call_cc = &state;
2712 PL_reginput = locinput;
2713 toggleutf = ((PL_reg_flags & RF_utf8) != 0) ^
2714 ((re->reganch & ROPT_UTF8) != 0);
2715 if (toggleutf) PL_reg_flags ^= RF_utf8;
2717 /* XXXX This is too dramatic a measure... */
2720 if (regmatch(re->program + 1)) {
2721 /* Even though we succeeded, we need to restore
2722 global variables, since we may be wrapped inside
2723 SUSPEND, thus the match may be not finished yet. */
2725 /* XXXX Do this only if SUSPENDed? */
2726 PL_reg_call_cc = state.prev;
2727 PL_regcc = state.cc;
2728 PL_reg_re = state.re;
2729 cache_re(PL_reg_re);
2730 if (toggleutf) PL_reg_flags ^= RF_utf8;
2732 /* XXXX This is too dramatic a measure... */
2735 /* These are needed even if not SUSPEND. */
2741 REGCP_UNWIND(lastcp);
2743 PL_reg_call_cc = state.prev;
2744 PL_regcc = state.cc;
2745 PL_reg_re = state.re;
2746 cache_re(PL_reg_re);
2747 if (toggleutf) PL_reg_flags ^= RF_utf8;
2749 /* XXXX This is too dramatic a measure... */
2759 sv_setsv(save_scalar(PL_replgv), ret);
2765 n = ARG(scan); /* which paren pair */
2766 PL_reg_start_tmp[n] = locinput;
2771 n = ARG(scan); /* which paren pair */
2772 PL_regstartp[n] = PL_reg_start_tmp[n] - PL_bostr;
2773 PL_regendp[n] = locinput - PL_bostr;
2774 if (n > (I32)*PL_reglastparen)
2775 *PL_reglastparen = n;
2776 *PL_reglastcloseparen = n;
2779 n = ARG(scan); /* which paren pair */
2780 sw = ((I32)*PL_reglastparen >= n && PL_regendp[n] != -1);
2783 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
2785 next = NEXTOPER(NEXTOPER(scan));
2787 next = scan + ARG(scan);
2788 if (OP(next) == IFTHEN) /* Fake one. */
2789 next = NEXTOPER(NEXTOPER(next));
2793 logical = scan->flags;
2795 /*******************************************************************
2796 PL_regcc contains infoblock about the innermost (...)* loop, and
2797 a pointer to the next outer infoblock.
2799 Here is how Y(A)*Z is processed (if it is compiled into CURLYX/WHILEM):
2801 1) After matching X, regnode for CURLYX is processed;
2803 2) This regnode creates infoblock on the stack, and calls
2804 regmatch() recursively with the starting point at WHILEM node;
2806 3) Each hit of WHILEM node tries to match A and Z (in the order
2807 depending on the current iteration, min/max of {min,max} and
2808 greediness). The information about where are nodes for "A"
2809 and "Z" is read from the infoblock, as is info on how many times "A"
2810 was already matched, and greediness.
2812 4) After A matches, the same WHILEM node is hit again.
2814 5) Each time WHILEM is hit, PL_regcc is the infoblock created by CURLYX
2815 of the same pair. Thus when WHILEM tries to match Z, it temporarily
2816 resets PL_regcc, since this Y(A)*Z can be a part of some other loop:
2817 as in (Y(A)*Z)*. If Z matches, the automaton will hit the WHILEM node
2818 of the external loop.
2820 Currently present infoblocks form a tree with a stem formed by PL_curcc
2821 and whatever it mentions via ->next, and additional attached trees
2822 corresponding to temporarily unset infoblocks as in "5" above.
2824 In the following picture infoblocks for outer loop of
2825 (Y(A)*?Z)*?T are denoted O, for inner I. NULL starting block
2826 is denoted by x. The matched string is YAAZYAZT. Temporarily postponed
2827 infoblocks are drawn below the "reset" infoblock.
2829 In fact in the picture below we do not show failed matches for Z and T
2830 by WHILEM blocks. [We illustrate minimal matches, since for them it is
2831 more obvious *why* one needs to *temporary* unset infoblocks.]
2833 Matched REx position InfoBlocks Comment
2837 Y A)*?Z)*?T x <- O <- I
2838 YA )*?Z)*?T x <- O <- I
2839 YA A)*?Z)*?T x <- O <- I
2840 YAA )*?Z)*?T x <- O <- I
2841 YAA Z)*?T x <- O # Temporary unset I
2844 YAAZ Y(A)*?Z)*?T x <- O
2847 YAAZY (A)*?Z)*?T x <- O
2850 YAAZY A)*?Z)*?T x <- O <- I
2853 YAAZYA )*?Z)*?T x <- O <- I
2856 YAAZYA Z)*?T x <- O # Temporary unset I
2862 YAAZYAZ T x # Temporary unset O
2869 *******************************************************************/
2872 CHECKPOINT cp = PL_savestack_ix;
2873 /* No need to save/restore up to this paren */
2874 parenfloor = scan->flags;
2878 CURLYX and WHILEM are always paired: they're the moral
2879 equivalent of pp_enteriter anbd pp_iter.
2881 The only time next could be null is if the node tree is
2882 corrupt. This was mentioned on p5p a few days ago.
2884 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
2885 So we'll assert that this is true:
2888 if (next && (OP(PREVOPER(next)) == NOTHING)) /* LONGJMP */
2890 cc.oldcc = PL_regcc;
2892 /* XXXX Probably it is better to teach regpush to support
2893 parenfloor > PL_regsize... */
2894 if (parenfloor > (I32)*PL_reglastparen)
2895 parenfloor = *PL_reglastparen; /* Pessimization... */
2896 cc.parenfloor = parenfloor;
2898 cc.min = ARG1(scan);
2899 cc.max = ARG2(scan);
2900 cc.scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
2904 PL_reginput = locinput;
2905 n = regmatch(PREVOPER(next)); /* start on the WHILEM */
2906 if (PL_reg_eval_set){
2907 SV *pres= GvSV(PL_replgv);
2910 sv_setsv(GvSV(PL_replgv), pres);
2915 PL_regcc = cc.oldcc;
2922 st->cc gets initialised by CURLYX ready for use by WHILEM.
2923 So again, unless somethings been corrupted, st->cc cannot
2924 be null at that point in WHILEM.
2926 See http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2006-04/msg00556.html
2927 So we'll assert that this is true:
2932 * This is really hard to understand, because after we match
2933 * what we're trying to match, we must make sure the rest of
2934 * the REx is going to match for sure, and to do that we have
2935 * to go back UP the parse tree by recursing ever deeper. And
2936 * if it fails, we have to reset our parent's current state
2937 * that we can try again after backing off.
2940 CHECKPOINT cp, lastcp;
2941 CURCUR* cc = PL_regcc;
2942 char * const lastloc = cc->lastloc; /* Detection of 0-len. */
2943 I32 cache_offset = 0, cache_bit = 0;
2945 n = cc->cur + 1; /* how many we know we matched */
2946 PL_reginput = locinput;
2949 PerlIO_printf(Perl_debug_log,
2950 "%*s %ld out of %ld..%ld cc=%"UVxf"\n",
2951 REPORT_CODE_OFF+PL_regindent*2, "",
2952 (long)n, (long)cc->min,
2953 (long)cc->max, PTR2UV(cc))
2956 /* If degenerate scan matches "", assume scan done. */
2958 if (locinput == cc->lastloc && n >= cc->min) {
2959 PL_regcc = cc->oldcc;
2963 PerlIO_printf(Perl_debug_log,
2964 "%*s empty match detected, try continuation...\n",
2965 REPORT_CODE_OFF+PL_regindent*2, "")
2967 if (regmatch(cc->next))
2975 /* First just match a string of min scans. */
2979 cc->lastloc = locinput;
2980 if (regmatch(cc->scan))
2983 cc->lastloc = lastloc;
2988 /* Check whether we already were at this position.
2989 Postpone detection until we know the match is not
2990 *that* much linear. */
2991 if (!PL_reg_maxiter) {
2992 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
2993 /* possible overflow for long strings and many CURLYX's */
2994 if (PL_reg_maxiter < 0)
2995 PL_reg_maxiter = I32_MAX;
2996 PL_reg_leftiter = PL_reg_maxiter;
2998 if (PL_reg_leftiter-- == 0) {
2999 const I32 size = (PL_reg_maxiter + 7 + POSCACHE_START)/8;
3000 if (PL_reg_poscache) {
3001 if ((I32)PL_reg_poscache_size < size) {
3002 Renew(PL_reg_poscache, size, char);
3003 PL_reg_poscache_size = size;
3005 Zero(PL_reg_poscache, size, char);
3008 PL_reg_poscache_size = size;
3009 Newxz(PL_reg_poscache, size, char);
3012 PerlIO_printf(Perl_debug_log,
3013 "%sDetected a super-linear match, switching on caching%s...\n",
3014 PL_colors[4], PL_colors[5])
3017 if (PL_reg_leftiter < 0) {
3018 cache_offset = locinput - PL_bostr;
3020 cache_offset = (scan->flags & 0xf) - 1 + POSCACHE_START
3021 + cache_offset * (scan->flags>>4);
3022 cache_bit = cache_offset % 8;
3024 if (PL_reg_poscache[cache_offset] & (1<<cache_bit)) {
3026 PerlIO_printf(Perl_debug_log,
3027 "%*s already tried at this position...\n",
3028 REPORT_CODE_OFF+PL_regindent*2, "")
3030 if (PL_reg_poscache[0] & (1<<POSCACHE_SUCCESS))
3031 /* cache records success */
3034 /* cache records failure */
3037 PL_reg_poscache[cache_offset] |= (1<<cache_bit);
3041 /* Prefer next over scan for minimal matching. */
3044 PL_regcc = cc->oldcc;
3047 cp = regcppush(cc->parenfloor);
3049 if (regmatch(cc->next)) {
3051 CACHEsayYES; /* All done. */
3053 REGCP_UNWIND(lastcp);
3059 if (n >= cc->max) { /* Maximum greed exceeded? */
3060 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3061 && !(PL_reg_flags & RF_warned)) {
3062 PL_reg_flags |= RF_warned;
3063 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3064 "Complex regular subexpression recursion",
3071 PerlIO_printf(Perl_debug_log,
3072 "%*s trying longer...\n",
3073 REPORT_CODE_OFF+PL_regindent*2, "")
3075 /* Try scanning more and see if it helps. */
3076 PL_reginput = locinput;
3078 cc->lastloc = locinput;
3079 cp = regcppush(cc->parenfloor);
3081 if (regmatch(cc->scan)) {
3085 REGCP_UNWIND(lastcp);
3088 cc->lastloc = lastloc;
3092 /* Prefer scan over next for maximal matching. */
3094 if (n < cc->max) { /* More greed allowed? */
3095 cp = regcppush(cc->parenfloor);
3097 cc->lastloc = locinput;
3099 if (regmatch(cc->scan)) {
3103 REGCP_UNWIND(lastcp);
3104 regcppop(); /* Restore some previous $<digit>s? */
3105 PL_reginput = locinput;
3107 PerlIO_printf(Perl_debug_log,
3108 "%*s failed, try continuation...\n",
3109 REPORT_CODE_OFF+PL_regindent*2, "")
3112 if (ckWARN(WARN_REGEXP) && n >= REG_INFTY
3113 && !(PL_reg_flags & RF_warned)) {
3114 PL_reg_flags |= RF_warned;
3115 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
3116 "Complex regular subexpression recursion",
3120 /* Failed deeper matches of scan, so see if this one works. */
3121 PL_regcc = cc->oldcc;
3124 if (regmatch(cc->next))
3130 cc->lastloc = lastloc;
3135 next = scan + ARG(scan);
3138 inner = NEXTOPER(NEXTOPER(scan));
3141 inner = NEXTOPER(scan);
3145 if (!next || OP(next) != c1) /* No choice. */
3146 next = inner; /* Avoid recursion. */
3148 const I32 lastparen = *PL_reglastparen;
3149 /* Put unwinding data on stack */
3150 const I32 unwind1 = SSNEWt(1,re_unwind_branch_t);
3151 re_unwind_branch_t * const uw = SSPTRt(unwind1,re_unwind_branch_t);
3155 uw->type = ((c1 == BRANCH)
3157 : RE_UNWIND_BRANCHJ);
3158 uw->lastparen = lastparen;
3160 uw->locinput = locinput;
3161 uw->nextchr = nextchr;
3163 uw->regindent = ++PL_regindent;
3166 REGCP_SET(uw->lastcp);
3168 /* Now go into the first branch */
3183 /* We suppose that the next guy does not need
3184 backtracking: in particular, it is of constant non-zero length,
3185 and has no parenths to influence future backrefs. */
3186 ln = ARG1(scan); /* min to match */
3187 n = ARG2(scan); /* max to match */
3188 paren = scan->flags;
3190 if (paren > PL_regsize)
3192 if (paren > (I32)*PL_reglastparen)
3193 *PL_reglastparen = paren;
3195 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3197 scan += NEXT_OFF(scan); /* Skip former OPEN. */
3198 PL_reginput = locinput;
3199 maxwanted = minmod ? ln : n;
3201 while (PL_reginput < PL_regeol && matches < maxwanted) {
3202 if (!regmatch(scan))
3204 /* on first match, determine length, l */
3206 if (PL_reg_match_utf8) {
3208 while (s < PL_reginput) {
3214 l = PL_reginput - locinput;
3217 matches = maxwanted;
3221 locinput = PL_reginput;
3225 PL_reginput = locinput;
3229 if (ln && matches < ln)
3231 if (HAS_TEXT(next) || JUMPABLE(next)) {
3232 regnode *text_node = next;
3234 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3236 if (! HAS_TEXT(text_node)) c1 = c2 = CHRTEST_VOID;
3238 if (PL_regkind[(U8)OP(text_node)] == REF) {
3239 c1 = c2 = CHRTEST_VOID;
3242 else { c1 = (U8)*STRING(text_node); }
3243 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3245 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3246 c2 = PL_fold_locale[c1];
3252 c1 = c2 = CHRTEST_VOID;
3255 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3256 /* If it could work, try it. */
3257 if (c1 == CHRTEST_VOID ||
3258 UCHARAT(PL_reginput) == c1 ||
3259 UCHARAT(PL_reginput) == c2)
3263 PL_regstartp[paren] =
3264 HOPc(PL_reginput, -l) - PL_bostr;
3265 PL_regendp[paren] = PL_reginput - PL_bostr;
3268 PL_regendp[paren] = -1;
3272 REGCP_UNWIND(lastcp);
3274 /* Couldn't or didn't -- move forward. */
3275 PL_reginput = locinput;
3276 if (regmatch(scan)) {
3278 locinput = PL_reginput;
3286 PerlIO_printf(Perl_debug_log,
3287 "%*s matched %"IVdf" times, len=%"IVdf"...\n",
3288 (int)(REPORT_CODE_OFF+PL_regindent*2), "",
3289 (IV) matches, (IV)l)
3291 if (matches >= ln) {
3292 if (HAS_TEXT(next) || JUMPABLE(next)) {
3293 regnode *text_node = next;
3295 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3297 if (! HAS_TEXT(text_node)) c1 = c2 = CHRTEST_VOID;
3299 if (PL_regkind[(U8)OP(text_node)] == REF) {
3300 c1 = c2 = CHRTEST_VOID;
3303 else { c1 = (U8)*STRING(text_node); }
3305 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3307 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3308 c2 = PL_fold_locale[c1];
3314 c1 = c2 = CHRTEST_VOID;
3318 while (matches >= ln) {
3319 /* If it could work, try it. */
3320 if (c1 == CHRTEST_VOID ||
3321 UCHARAT(PL_reginput) == c1 ||
3322 UCHARAT(PL_reginput) == c2)
3325 PerlIO_printf(Perl_debug_log,
3326 "%*s trying tail with matches=%"IVdf"...\n",
3327 (int)(REPORT_CODE_OFF+PL_regindent*2),
3332 PL_regstartp[paren] = HOPc(PL_reginput, -l) - PL_bostr;
3333 PL_regendp[paren] = PL_reginput - PL_bostr;
3336 PL_regendp[paren] = -1;
3340 REGCP_UNWIND(lastcp);
3342 /* Couldn't or didn't -- back up. */
3344 locinput = HOPc(locinput, -l);
3345 PL_reginput = locinput;
3353 paren = scan->flags; /* Which paren to set */
3354 if (paren > PL_regsize)
3356 if (paren > (I32)*PL_reglastparen)
3357 *PL_reglastparen = paren;
3358 ln = ARG1(scan); /* min to match */
3359 n = ARG2(scan); /* max to match */
3360 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
3364 ln = ARG1(scan); /* min to match */
3365 n = ARG2(scan); /* max to match */
3366 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
3371 scan = NEXTOPER(scan);
3377 scan = NEXTOPER(scan);
3381 * Lookahead to avoid useless match attempts
3382 * when we know what character comes next.
3386 * Used to only do .*x and .*?x, but now it allows
3387 * for )'s, ('s and (?{ ... })'s to be in the way
3388 * of the quantifier and the EXACT-like node. -- japhy
3391 if (HAS_TEXT(next) || JUMPABLE(next)) {
3393 regnode *text_node = next;
3395 if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node);
3397 if (! HAS_TEXT(text_node)) c1 = c2 = CHRTEST_VOID;
3399 if (PL_regkind[(U8)OP(text_node)] == REF) {
3400 c1 = c2 = CHRTEST_VOID;
3401 goto assume_ok_easy;
3403 else { s = (U8*)STRING(text_node); }
3407 if (OP(text_node) == EXACTF || OP(text_node) == REFF)
3409 else if (OP(text_node) == EXACTFL || OP(text_node) == REFFL)
3410 c2 = PL_fold_locale[c1];
3413 if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
3414 STRLEN ulen1, ulen2;
3415 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3416 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3418 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
3419 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
3421 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
3423 0 : UTF8_ALLOW_ANY);
3424 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
3426 0 : UTF8_ALLOW_ANY);
3428 c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
3430 c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
3435 c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
3442 c1 = c2 = CHRTEST_VOID;
3444 PL_reginput = locinput;
3448 if (ln && regrepeat(scan, ln) < ln)
3450 locinput = PL_reginput;
3452 if (c1 != CHRTEST_VOID) {
3453 char *e; /* Should not check after this */
3454 char *old = locinput;
3457 if (n == REG_INFTY) {
3460 while (UTF8_IS_CONTINUATION(*(U8*)e))
3466 m >0 && e + UTF8SKIP(e) <= PL_regeol; m--)
3470 e = locinput + n - ln;
3475 /* Find place 'next' could work */
3478 while (locinput <= e &&
3479 UCHARAT(locinput) != c1)
3482 while (locinput <= e
3483 && UCHARAT(locinput) != c1
3484 && UCHARAT(locinput) != c2)
3487 count = locinput - old;
3492 /* count initialised to
3493 * utf8_distance(old, locinput) */
3494 while (locinput <= e &&
3495 utf8n_to_uvchr((U8*)locinput,
3496 UTF8_MAXBYTES, &len,
3497 uniflags) != (UV)c1) {
3502 /* count initialised to
3503 * utf8_distance(old, locinput) */
3504 while (locinput <= e) {
3506 const UV c = utf8n_to_uvchr((U8*)locinput,
3507 UTF8_MAXBYTES, &len,
3509 if (c == (UV)c1 || c == (UV)c2)
3518 /* PL_reginput == old now */
3519 if (locinput != old) {
3520 ln = 1; /* Did some */
3521 if (regrepeat(scan, count) < count)
3524 /* PL_reginput == locinput now */
3525 TRYPAREN(paren, ln, locinput);
3526 PL_reginput = locinput; /* Could be reset... */
3527 REGCP_UNWIND(lastcp);
3528 /* Couldn't or didn't -- move forward. */
3531 locinput += UTF8SKIP(locinput);
3538 while (n >= ln || (n == REG_INFTY && ln > 0)) { /* ln overflow ? */
3540 if (c1 != CHRTEST_VOID) {
3542 c = utf8n_to_uvchr((U8*)PL_reginput,
3546 c = UCHARAT(PL_reginput);
3547 /* If it could work, try it. */
3548 if (c == (UV)c1 || c == (UV)c2)
3550 TRYPAREN(paren, ln, PL_reginput);
3551 REGCP_UNWIND(lastcp);
3554 /* If it could work, try it. */
3555 else if (c1 == CHRTEST_VOID)
3557 TRYPAREN(paren, ln, PL_reginput);
3558 REGCP_UNWIND(lastcp);
3560 /* Couldn't or didn't -- move forward. */
3561 PL_reginput = locinput;
3562 if (regrepeat(scan, 1)) {
3564 locinput = PL_reginput;
3572 n = regrepeat(scan, n);
3573 locinput = PL_reginput;
3574 if (ln < n && PL_regkind[(U8)OP(next)] == EOL &&
3575 ((!PL_multiline && OP(next) != MEOL) ||
3576 OP(next) == SEOL || OP(next) == EOS))
3578 ln = n; /* why back off? */
3579 /* ...because $ and \Z can match before *and* after
3580 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
3581 We should back off by one in this case. */
3582 if (UCHARAT(PL_reginput - 1) == '\n' && OP(next) != EOS)
3589 if (c1 != CHRTEST_VOID) {
3591 c = utf8n_to_uvchr((U8*)PL_reginput,
3595 c = UCHARAT(PL_reginput);
3597 /* If it could work, try it. */
3598 if (c1 == CHRTEST_VOID || c == (UV)c1 || c == (UV)c2)
3600 TRYPAREN(paren, n, PL_reginput);
3601 REGCP_UNWIND(lastcp);
3603 /* Couldn't or didn't -- back up. */
3605 PL_reginput = locinput = HOPc(locinput, -1);
3612 if (PL_reg_call_cc) {
3613 re_cc_state *cur_call_cc = PL_reg_call_cc;
3614 CURCUR *cctmp = PL_regcc;
3615 regexp *re = PL_reg_re;
3619 /* Save *all* the positions. */
3620 const CHECKPOINT cp = regcppush(0);
3623 /* Restore parens of the caller. */
3624 tmp = PL_savestack_ix;
3625 PL_savestack_ix = PL_reg_call_cc->ss;
3627 PL_savestack_ix = tmp;
3629 /* Make position available to the callcc. */
3630 PL_reginput = locinput;
3632 cache_re(PL_reg_call_cc->re);
3633 PL_regcc = PL_reg_call_cc->cc;
3634 PL_reg_call_cc = PL_reg_call_cc->prev;
3635 if (regmatch(cur_call_cc->node)) {
3636 PL_reg_call_cc = cur_call_cc;
3640 REGCP_UNWIND(lastcp);
3642 PL_reg_call_cc = cur_call_cc;
3648 PerlIO_printf(Perl_debug_log,
3649 "%*s continuation failed...\n",
3650 REPORT_CODE_OFF+PL_regindent*2, "")
3654 if (locinput < PL_regtill) {
3655 DEBUG_r(PerlIO_printf(Perl_debug_log,
3656 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
3658 (long)(locinput - PL_reg_starttry),
3659 (long)(PL_regtill - PL_reg_starttry),
3661 sayNO_FINAL; /* Cannot match: too short. */
3663 PL_reginput = locinput; /* put where regtry can find it */
3664 sayYES_FINAL; /* Success! */
3666 PL_reginput = locinput; /* put where regtry can find it */
3667 sayYES_LOUD; /* Success! */
3670 PL_reginput = locinput;
3675 s = HOPBACKc(locinput, scan->flags);
3681 PL_reginput = locinput;
3686 s = HOPBACKc(locinput, scan->flags);
3692 PL_reginput = locinput;
3695 inner = NEXTOPER(NEXTOPER(scan));
3696 if (regmatch(inner) != n) {
3711 if (OP(scan) == SUSPEND) {
3712 locinput = PL_reginput;
3713 nextchr = UCHARAT(locinput);
3718 next = scan + ARG(scan);
3723 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
3724 PTR2UV(scan), OP(scan));
3725 Perl_croak(aTHX_ "regexp memory corruption");
3732 * We get here only if there's trouble -- normally "case END" is
3733 * the terminating point.
3735 Perl_croak(aTHX_ "corrupted regexp pointers");
3741 PerlIO_printf(Perl_debug_log,
3742 "%*s %scould match...%s\n",
3743 REPORT_CODE_OFF+PL_regindent*2, "", PL_colors[4],PL_colors[5])
3747 DEBUG_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
3748 PL_colors[4],PL_colors[5]));
3754 #if 0 /* Breaks $^R */
3762 PerlIO_printf(Perl_debug_log,
3763 "%*s %sfailed...%s\n",
3764 REPORT_CODE_OFF+PL_regindent*2, "",PL_colors[4],PL_colors[5])
3770 re_unwind_t * const uw = SSPTRt(unwind,re_unwind_t);
3773 case RE_UNWIND_BRANCH:
3774 case RE_UNWIND_BRANCHJ:
3776 re_unwind_branch_t * const uwb = &(uw->branch);
3777 const I32 lastparen = uwb->lastparen;
3779 REGCP_UNWIND(uwb->lastcp);
3780 for (n = *PL_reglastparen; n > lastparen; n--)
3782 *PL_reglastparen = n;
3783 scan = next = uwb->next;
3785 OP(scan) != (uwb->type == RE_UNWIND_BRANCH
3786 ? BRANCH : BRANCHJ) ) { /* Failure */
3793 /* Have more choice yet. Reuse the same uwb. */
3794 if ((n = (uwb->type == RE_UNWIND_BRANCH
3795 ? NEXT_OFF(next) : ARG(next))))
3798 next = NULL; /* XXXX Needn't unwinding in this case... */
3800 next = NEXTOPER(scan);
3801 if (uwb->type == RE_UNWIND_BRANCHJ)
3802 next = NEXTOPER(next);
3803 locinput = uwb->locinput;
3804 nextchr = uwb->nextchr;
3806 PL_regindent = uwb->regindent;
3813 Perl_croak(aTHX_ "regexp unwind memory corruption");
3824 - regrepeat - repeatedly match something simple, report how many
3827 * [This routine now assumes that it will only match on things of length 1.
3828 * That was true before, but now we assume scan - reginput is the count,
3829 * rather than incrementing count on every character. [Er, except utf8.]]
3832 S_regrepeat(pTHX_ const regnode *p, I32 max)
3834 register char *scan;
3836 register char *loceol = PL_regeol;
3837 register I32 hardcount = 0;
3838 register bool do_utf8 = PL_reg_match_utf8;
3841 if (max == REG_INFTY)
3843 else if (max < loceol - scan)
3844 loceol = scan + max;
3849 while (scan < loceol && hardcount < max && *scan != '\n') {
3850 scan += UTF8SKIP(scan);
3854 while (scan < loceol && *scan != '\n')
3861 while (scan < loceol && hardcount < max) {
3862 scan += UTF8SKIP(scan);
3872 case EXACT: /* length of string is 1 */
3874 while (scan < loceol && UCHARAT(scan) == c)
3877 case EXACTF: /* length of string is 1 */
3879 while (scan < loceol &&
3880 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
3883 case EXACTFL: /* length of string is 1 */
3884 PL_reg_flags |= RF_tainted;
3886 while (scan < loceol &&
3887 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
3893 while (hardcount < max && scan < loceol &&
3894 reginclass(p, (U8*)scan, 0, do_utf8)) {
3895 scan += UTF8SKIP(scan);
3899 while (scan < loceol && REGINCLASS(p, (U8*)scan))
3906 LOAD_UTF8_CHARCLASS_ALNUM();
3907 while (hardcount < max && scan < loceol &&
3908 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3909 scan += UTF8SKIP(scan);
3913 while (scan < loceol && isALNUM(*scan))
3918 PL_reg_flags |= RF_tainted;
3921 while (hardcount < max && scan < loceol &&
3922 isALNUM_LC_utf8((U8*)scan)) {
3923 scan += UTF8SKIP(scan);
3927 while (scan < loceol && isALNUM_LC(*scan))
3934 LOAD_UTF8_CHARCLASS_ALNUM();
3935 while (hardcount < max && scan < loceol &&
3936 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
3937 scan += UTF8SKIP(scan);
3941 while (scan < loceol && !isALNUM(*scan))
3946 PL_reg_flags |= RF_tainted;
3949 while (hardcount < max && scan < loceol &&
3950 !isALNUM_LC_utf8((U8*)scan)) {
3951 scan += UTF8SKIP(scan);
3955 while (scan < loceol && !isALNUM_LC(*scan))
3962 LOAD_UTF8_CHARCLASS_SPACE();
3963 while (hardcount < max && scan < loceol &&
3965 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3966 scan += UTF8SKIP(scan);
3970 while (scan < loceol && isSPACE(*scan))
3975 PL_reg_flags |= RF_tainted;
3978 while (hardcount < max && scan < loceol &&
3979 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
3980 scan += UTF8SKIP(scan);
3984 while (scan < loceol && isSPACE_LC(*scan))
3991 LOAD_UTF8_CHARCLASS_SPACE();
3992 while (hardcount < max && scan < loceol &&
3994 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
3995 scan += UTF8SKIP(scan);
3999 while (scan < loceol && !isSPACE(*scan))
4004 PL_reg_flags |= RF_tainted;
4007 while (hardcount < max && scan < loceol &&
4008 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
4009 scan += UTF8SKIP(scan);
4013 while (scan < loceol && !isSPACE_LC(*scan))
4020 LOAD_UTF8_CHARCLASS_DIGIT();
4021 while (hardcount < max && scan < loceol &&
4022 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4023 scan += UTF8SKIP(scan);
4027 while (scan < loceol && isDIGIT(*scan))
4034 LOAD_UTF8_CHARCLASS_DIGIT();
4035 while (hardcount < max && scan < loceol &&
4036 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
4037 scan += UTF8SKIP(scan);
4041 while (scan < loceol && !isDIGIT(*scan))
4045 default: /* Called on something of 0 width. */
4046 break; /* So match right here or not at all. */
4052 c = scan - PL_reginput;
4057 SV * const prop = sv_newmortal();
4059 regprop(prop, (regnode *)p);
4060 PerlIO_printf(Perl_debug_log,
4061 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
4062 REPORT_CODE_OFF+1, "", SvPVX_const(prop),(IV)c,(IV)max);
4070 - regclass_swash - prepare the utf8 swash
4074 Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp)
4080 if (PL_regdata && PL_regdata->count) {
4081 const U32 n = ARG(node);
4083 if (PL_regdata->what[n] == 's') {
4084 SV * const rv = (SV*)PL_regdata->data[n];
4085 AV * const av = (AV*)SvRV((SV*)rv);
4086 SV **const ary = AvARRAY(av);
4089 /* See the end of regcomp.c:S_regclass() for
4090 * documentation of these array elements. */
4093 a = SvROK(ary[1]) ? &ary[1] : 0;
4094 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : 0;
4098 else if (si && doinit) {
4099 sw = swash_init("utf8", "", si, 1, 0);
4100 (void)av_store(av, 1, sw);
4116 - reginclass - determine if a character falls into a character class
4118 The n is the ANYOF regnode, the p is the target string, lenp
4119 is pointer to the maximum length of how far to go in the p
4120 (if the lenp is zero, UTF8SKIP(p) is used),
4121 do_utf8 tells whether the target string is in UTF-8.
4126 S_reginclass(pTHX_ register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
4128 const char flags = ANYOF_FLAGS(n);
4134 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
4135 c = utf8n_to_uvchr((U8 *)p, UTF8_MAXBYTES, &len,
4136 ckWARN(WARN_UTF8) ? UTF8_CHECK_ONLY :
4137 UTF8_ALLOW_ANYUV|UTF8_CHECK_ONLY);
4138 if (len == (STRLEN)-1)
4139 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
4142 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
4143 if (do_utf8 || (flags & ANYOF_UNICODE)) {
4146 if (do_utf8 && !ANYOF_RUNTIME(n)) {
4147 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
4150 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
4154 SV * const sw = regclass_swash((regnode *)n, TRUE, 0, (SV**)&av);
4157 if (swash_fetch(sw, (U8 *)p, do_utf8))
4159 else if (flags & ANYOF_FOLD) {
4160 if (!match && lenp && av) {
4162 for (i = 0; i <= av_len(av); i++) {
4163 SV* const sv = *av_fetch(av, i, FALSE);
4165 const char * const s = SvPV_const(sv, len);
4167 if (len <= plen && memEQ(s, (char*)p, len)) {
4175 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
4178 to_utf8_fold((U8 *)p, tmpbuf, &tmplen);
4179 if (swash_fetch(sw, tmpbuf, do_utf8))
4185 if (match && lenp && *lenp == 0)
4186 *lenp = UNISKIP(NATIVE_TO_UNI(c));
4188 if (!match && c < 256) {
4189 if (ANYOF_BITMAP_TEST(n, c))
4191 else if (flags & ANYOF_FOLD) {
4194 if (flags & ANYOF_LOCALE) {
4195 PL_reg_flags |= RF_tainted;
4196 f = PL_fold_locale[c];
4200 if (f != c && ANYOF_BITMAP_TEST(n, f))
4204 if (!match && (flags & ANYOF_CLASS)) {
4205 PL_reg_flags |= RF_tainted;
4207 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
4208 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
4209 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
4210 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
4211 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
4212 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
4213 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
4214 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
4215 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
4216 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
4217 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
4218 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
4219 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
4220 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
4221 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
4222 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
4223 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
4224 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
4225 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
4226 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
4227 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
4228 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
4229 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
4230 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
4231 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
4232 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
4233 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
4234 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
4235 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
4236 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
4237 ) /* How's that for a conditional? */
4244 return (flags & ANYOF_INVERT) ? !match : match;
4248 S_reghop3(U8 *s, I32 off, const U8* lim)
4251 while (off-- && s < lim) {
4252 /* XXX could check well-formedness here */
4260 if (UTF8_IS_CONTINUED(*s)) {
4261 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4264 /* XXX could check well-formedness here */
4272 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
4275 while (off-- && s < lim) {
4276 /* XXX could check well-formedness here */
4286 if (UTF8_IS_CONTINUED(*s)) {
4287 while (s > (U8*)lim && UTF8_IS_CONTINUATION(*s))
4290 /* XXX could check well-formedness here */
4302 restore_pos(pTHX_ void *arg)
4304 PERL_UNUSED_ARG(arg);
4305 if (PL_reg_eval_set) {
4306 if (PL_reg_oldsaved) {
4307 PL_reg_re->subbeg = PL_reg_oldsaved;
4308 PL_reg_re->sublen = PL_reg_oldsavedlen;
4309 RX_MATCH_COPIED_on(PL_reg_re);
4311 PL_reg_magic->mg_len = PL_reg_oldpos;
4312 PL_reg_eval_set = 0;
4313 PL_curpm = PL_reg_oldcurpm;
4318 S_to_utf8_substr(pTHX_ register regexp *prog)
4322 if (prog->substrs->data[i].substr
4323 && !prog->substrs->data[i].utf8_substr) {
4324 SV* const sv = newSVsv(prog->substrs->data[i].substr);
4325 prog->substrs->data[i].utf8_substr = sv;
4326 sv_utf8_upgrade(sv);
4327 if (SvVALID(prog->substrs->data[i].substr)) {
4328 const U8 flags = SvTAIL(prog->substrs->data[i].substr)
4331 /* Trim the trailing \n that fbm_compile added last
4333 SvCUR_set(sv, SvCUR(sv) - 1);
4334 /* Whilst this makes the SV technically "invalid" (as its
4335 buffer is no longer followed by "\0") when fbm_compile()
4336 adds the "\n" back, a "\0" is restored. */
4338 fbm_compile(sv, flags);
4340 if (prog->substrs->data[i].substr == prog->check_substr)
4341 prog->check_utf8 = sv;
4347 S_to_byte_substr(pTHX_ register regexp *prog)
4351 if (prog->substrs->data[i].utf8_substr
4352 && !prog->substrs->data[i].substr) {
4353 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
4354 if (sv_utf8_downgrade(sv, TRUE)) {
4355 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
4356 const U8 flags = SvTAIL(prog->substrs->data[i].utf8_substr)
4359 /* Trim the trailing \n that fbm_compile added last
4361 SvCUR_set(sv, SvCUR(sv) - 1);
4363 fbm_compile(sv, flags);
4369 prog->substrs->data[i].substr = sv;
4370 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
4371 prog->check_substr = sv;
4378 * c-indentation-style: bsd
4380 * indent-tabs-mode: t
4383 * ex: set ts=8 sts=4 sw=4 noet: