5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #define RF_tainted 1 /* tainted information used? e.g. locale */
84 #define RF_warned 2 /* warned about big count? */
86 #define RF_utf8 8 /* Pattern contains multibyte chars? */
88 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
90 #define RS_init 1 /* eval environment created */
91 #define RS_set 2 /* replsv value is set */
97 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
98 * call if there are no complications: i.e., if everything matchable is
99 * straight forward in the bitmap */
100 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
101 : ANYOF_BITMAP_TEST(p,*(c)))
107 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
110 #define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
114 #define HOPBACKc(pos, off) \
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 /* these are unrolled below in the CCC_TRY_XXX defined */
126 /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just
127 * skip the check on EBCDIC platforms */
128 # define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class)
130 # define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
131 if (!CAT2(PL_utf8_,class)) { \
133 ENTER; save_re_context(); \
134 ok=CAT2(is_utf8_,class)((const U8*)str); \
135 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
138 /* Doesn't do an assert to verify that is correct */
139 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
140 if (!CAT2(PL_utf8_,class)) { \
141 bool throw_away PERL_UNUSED_DECL; \
142 ENTER; save_re_context(); \
143 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
146 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
147 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
148 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
150 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
151 LOAD_UTF8_CHARCLASS(X_begin, " "); \
152 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
153 /* These are utf8 constants, and not utf-ebcdic constants, so the \
154 * assert should likely and hopefully fail on an EBCDIC machine */ \
155 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
157 /* No asserts are done for these, in case called on an early \
158 * Unicode version in which they map to nothing */ \
159 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
160 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
161 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
162 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
163 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
164 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
165 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
167 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
169 /* The actual code for CCC_TRY, which uses several variables from the routine
170 * it's callable from. It is designed to be the bulk of a case statement.
171 * FUNC is the macro or function to call on non-utf8 targets that indicate if
172 * nextchr matches the class.
173 * UTF8_TEST is the whole test string to use for utf8 targets
174 * LOAD is what to use to test, and if not present to load in the swash for the
176 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
178 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
179 * utf8 and a variant, load the swash if necessary and test using the utf8
180 * test. Advance to the next character if test is ok, otherwise fail; If not
181 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
182 * fails, or advance to the next character */
184 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
185 if (locinput >= PL_regeol) { \
188 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
189 LOAD_UTF8_CHARCLASS(CLASS, STR); \
190 if (POS_OR_NEG (UTF8_TEST)) { \
193 locinput += PL_utf8skip[nextchr]; \
194 nextchr = UCHARAT(locinput); \
197 if (POS_OR_NEG (FUNC(nextchr))) { \
200 nextchr = UCHARAT(++locinput); \
203 /* Handle the non-locale cases for a character class and its complement. It
204 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
205 * This is because that code fails when the test succeeds, so we want to have
206 * the test fail so that the code succeeds. The swash is stored in a
207 * predictable PL_ place */
208 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
211 _CCC_TRY_CODE( !, FUNC, \
212 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
213 (U8*)locinput, TRUE)), \
216 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
217 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
218 (U8*)locinput, TRUE)), \
221 /* Generate the case statements for both locale and non-locale character
222 * classes in regmatch for classes that don't have special unicode semantics.
223 * Locales don't use an immediate swash, but an intermediary special locale
224 * function that is called on the pointer to the current place in the input
225 * string. That function will resolve to needing the same swash. One might
226 * think that because we don't know what the locale will match, we shouldn't
227 * check with the swash loading function that it loaded properly; ie, that we
228 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
229 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
231 #define CCC_TRY(NAME, NNAME, FUNC, \
232 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
233 NAMEA, NNAMEA, FUNCA, \
236 PL_reg_flags |= RF_tainted; \
237 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
243 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
246 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
247 nextchr = UCHARAT(++locinput); \
250 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
254 locinput += PL_utf8skip[nextchr]; \
255 nextchr = UCHARAT(locinput); \
258 nextchr = UCHARAT(++locinput); \
261 /* Generate the non-locale cases */ \
262 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
264 /* This is like CCC_TRY, but has an extra set of parameters for generating case
265 * statements to handle separate Unicode semantics nodes */
266 #define CCC_TRY_U(NAME, NNAME, FUNC, \
267 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
268 NAMEU, NNAMEU, FUNCU, \
269 NAMEA, NNAMEA, FUNCA, \
271 CCC_TRY(NAME, NNAME, FUNC, \
272 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
273 NAMEA, NNAMEA, FUNCA, \
275 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
277 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
279 /* for use after a quantifier and before an EXACT-like node -- japhy */
280 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
282 * NOTE that *nothing* that affects backtracking should be in here, specifically
283 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
284 * node that is in between two EXACT like nodes when ascertaining what the required
285 * "follow" character is. This should probably be moved to regex compile time
286 * although it may be done at run time beause of the REF possibility - more
287 * investigation required. -- demerphq
289 #define JUMPABLE(rn) ( \
291 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
293 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
294 OP(rn) == PLUS || OP(rn) == MINMOD || \
296 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
298 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
300 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
303 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
304 we don't need this definition. */
305 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
306 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
307 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
310 /* ... so we use this as its faster. */
311 #define IS_TEXT(rn) ( OP(rn)==EXACT )
312 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
313 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
314 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
319 Search for mandatory following text node; for lookahead, the text must
320 follow but for lookbehind (rn->flags != 0) we skip to the next step.
322 #define FIND_NEXT_IMPT(rn) STMT_START { \
323 while (JUMPABLE(rn)) { \
324 const OPCODE type = OP(rn); \
325 if (type == SUSPEND || PL_regkind[type] == CURLY) \
326 rn = NEXTOPER(NEXTOPER(rn)); \
327 else if (type == PLUS) \
329 else if (type == IFMATCH) \
330 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
331 else rn += NEXT_OFF(rn); \
336 static void restore_pos(pTHX_ void *arg);
338 #define REGCP_PAREN_ELEMS 4
339 #define REGCP_OTHER_ELEMS 5
340 #define REGCP_FRAME_ELEMS 1
341 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
342 * are needed for the regexp context stack bookkeeping. */
345 S_regcppush(pTHX_ I32 parenfloor)
348 const int retval = PL_savestack_ix;
349 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
350 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
351 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
353 GET_RE_DEBUG_FLAGS_DECL;
355 if (paren_elems_to_push < 0)
356 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
357 paren_elems_to_push);
359 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
360 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
361 " out of range (%lu-%ld)",
362 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
364 SSGROW(total_elems + REGCP_FRAME_ELEMS);
366 for (p = PL_regsize; p > parenfloor; p--) {
367 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
368 SSPUSHINT(PL_regoffs[p].end);
369 SSPUSHINT(PL_regoffs[p].start);
370 SSPUSHPTR(PL_reg_start_tmp[p]);
372 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
373 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
374 (UV)p, (IV)PL_regoffs[p].start,
375 (IV)(PL_reg_start_tmp[p] - PL_bostr),
376 (IV)PL_regoffs[p].end
379 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
380 SSPUSHPTR(PL_regoffs);
381 SSPUSHINT(PL_regsize);
382 SSPUSHINT(*PL_reglastparen);
383 SSPUSHINT(*PL_reglastcloseparen);
384 SSPUSHPTR(PL_reginput);
385 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
390 /* These are needed since we do not localize EVAL nodes: */
391 #define REGCP_SET(cp) \
393 PerlIO_printf(Perl_debug_log, \
394 " Setting an EVAL scope, savestack=%"IVdf"\n", \
395 (IV)PL_savestack_ix)); \
398 #define REGCP_UNWIND(cp) \
400 if (cp != PL_savestack_ix) \
401 PerlIO_printf(Perl_debug_log, \
402 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
403 (IV)(cp), (IV)PL_savestack_ix)); \
407 S_regcppop(pTHX_ const regexp *rex)
412 GET_RE_DEBUG_FLAGS_DECL;
414 PERL_ARGS_ASSERT_REGCPPOP;
416 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
418 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
419 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
420 input = (char *) SSPOPPTR;
421 *PL_reglastcloseparen = SSPOPINT;
422 *PL_reglastparen = SSPOPINT;
423 PL_regsize = SSPOPINT;
424 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
426 i -= REGCP_OTHER_ELEMS;
427 /* Now restore the parentheses context. */
428 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
430 U32 paren = (U32)SSPOPINT;
431 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
432 PL_regoffs[paren].start = SSPOPINT;
434 if (paren <= *PL_reglastparen)
435 PL_regoffs[paren].end = tmps;
437 PerlIO_printf(Perl_debug_log,
438 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
439 (UV)paren, (IV)PL_regoffs[paren].start,
440 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
441 (IV)PL_regoffs[paren].end,
442 (paren > *PL_reglastparen ? "(no)" : ""));
446 if (*PL_reglastparen + 1 <= rex->nparens) {
447 PerlIO_printf(Perl_debug_log,
448 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
449 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
453 /* It would seem that the similar code in regtry()
454 * already takes care of this, and in fact it is in
455 * a better location to since this code can #if 0-ed out
456 * but the code in regtry() is needed or otherwise tests
457 * requiring null fields (pat.t#187 and split.t#{13,14}
458 * (as of patchlevel 7877) will fail. Then again,
459 * this code seems to be necessary or otherwise
460 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
461 * --jhi updated by dapm */
462 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
464 PL_regoffs[i].start = -1;
465 PL_regoffs[i].end = -1;
471 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
474 * pregexec and friends
477 #ifndef PERL_IN_XSUB_RE
479 - pregexec - match a regexp against a string
482 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
483 char *strbeg, I32 minend, SV *screamer, U32 nosave)
484 /* strend: pointer to null at end of string */
485 /* strbeg: real beginning of string */
486 /* minend: end of match must be >=minend after stringarg. */
487 /* nosave: For optimizations. */
489 PERL_ARGS_ASSERT_PREGEXEC;
492 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
493 nosave ? 0 : REXEC_COPY_STR);
498 * Need to implement the following flags for reg_anch:
500 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
502 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
503 * INTUIT_AUTORITATIVE_ML
504 * INTUIT_ONCE_NOML - Intuit can match in one location only.
507 * Another flag for this function: SECOND_TIME (so that float substrs
508 * with giant delta may be not rechecked).
511 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
513 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
514 Otherwise, only SvCUR(sv) is used to get strbeg. */
516 /* XXXX We assume that strpos is strbeg unless sv. */
518 /* XXXX Some places assume that there is a fixed substring.
519 An update may be needed if optimizer marks as "INTUITable"
520 RExen without fixed substrings. Similarly, it is assumed that
521 lengths of all the strings are no more than minlen, thus they
522 cannot come from lookahead.
523 (Or minlen should take into account lookahead.)
524 NOTE: Some of this comment is not correct. minlen does now take account
525 of lookahead/behind. Further research is required. -- demerphq
529 /* A failure to find a constant substring means that there is no need to make
530 an expensive call to REx engine, thus we celebrate a failure. Similarly,
531 finding a substring too deep into the string means that less calls to
532 regtry() should be needed.
534 REx compiler's optimizer found 4 possible hints:
535 a) Anchored substring;
537 c) Whether we are anchored (beginning-of-line or \G);
538 d) First node (of those at offset 0) which may distinguish positions;
539 We use a)b)d) and multiline-part of c), and try to find a position in the
540 string which does not contradict any of them.
543 /* Most of decisions we do here should have been done at compile time.
544 The nodes of the REx which we used for the search should have been
545 deleted from the finite automaton. */
548 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
549 char *strend, const U32 flags, re_scream_pos_data *data)
552 struct regexp *const prog = (struct regexp *)SvANY(rx);
553 register I32 start_shift = 0;
554 /* Should be nonnegative! */
555 register I32 end_shift = 0;
560 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
562 register char *other_last = NULL; /* other substr checked before this */
563 char *check_at = NULL; /* check substr found at this pos */
564 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
565 RXi_GET_DECL(prog,progi);
567 const char * const i_strpos = strpos;
569 GET_RE_DEBUG_FLAGS_DECL;
571 PERL_ARGS_ASSERT_RE_INTUIT_START;
573 RX_MATCH_UTF8_set(rx,utf8_target);
576 PL_reg_flags |= RF_utf8;
579 debug_start_match(rx, utf8_target, strpos, strend,
580 sv ? "Guessing start of match in sv for"
581 : "Guessing start of match in string for");
584 /* CHR_DIST() would be more correct here but it makes things slow. */
585 if (prog->minlen > strend - strpos) {
586 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
587 "String too short... [re_intuit_start]\n"));
591 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
594 if (!prog->check_utf8 && prog->check_substr)
595 to_utf8_substr(prog);
596 check = prog->check_utf8;
598 if (!prog->check_substr && prog->check_utf8)
599 to_byte_substr(prog);
600 check = prog->check_substr;
602 if (check == &PL_sv_undef) {
603 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
604 "Non-utf8 string cannot match utf8 check string\n"));
607 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
608 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
609 || ( (prog->extflags & RXf_ANCH_BOL)
610 && !multiline ) ); /* Check after \n? */
613 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
614 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
615 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
617 && (strpos != strbeg)) {
618 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
621 if (prog->check_offset_min == prog->check_offset_max &&
622 !(prog->extflags & RXf_CANY_SEEN)) {
623 /* Substring at constant offset from beg-of-str... */
626 s = HOP3c(strpos, prog->check_offset_min, strend);
629 slen = SvCUR(check); /* >= 1 */
631 if ( strend - s > slen || strend - s < slen - 1
632 || (strend - s == slen && strend[-1] != '\n')) {
633 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
636 /* Now should match s[0..slen-2] */
638 if (slen && (*SvPVX_const(check) != *s
640 && memNE(SvPVX_const(check), s, slen)))) {
642 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
646 else if (*SvPVX_const(check) != *s
647 || ((slen = SvCUR(check)) > 1
648 && memNE(SvPVX_const(check), s, slen)))
651 goto success_at_start;
654 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
656 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
657 end_shift = prog->check_end_shift;
660 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
661 - (SvTAIL(check) != 0);
662 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
664 if (end_shift < eshift)
668 else { /* Can match at random position */
671 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
672 end_shift = prog->check_end_shift;
674 /* end shift should be non negative here */
677 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
679 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
680 (IV)end_shift, RX_PRECOMP(prog));
684 /* Find a possible match in the region s..strend by looking for
685 the "check" substring in the region corrected by start/end_shift. */
688 I32 srch_start_shift = start_shift;
689 I32 srch_end_shift = end_shift;
690 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
691 srch_end_shift -= ((strbeg - s) - srch_start_shift);
692 srch_start_shift = strbeg - s;
694 DEBUG_OPTIMISE_MORE_r({
695 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
696 (IV)prog->check_offset_min,
697 (IV)srch_start_shift,
699 (IV)prog->check_end_shift);
702 if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
703 I32 p = -1; /* Internal iterator of scream. */
704 I32 * const pp = data ? data->scream_pos : &p;
708 assert(SvMAGICAL(sv));
709 mg = mg_find(sv, PERL_MAGIC_study);
712 if (mg->mg_private == 1) {
713 found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0;
714 } else if (mg->mg_private == 2) {
715 found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0;
717 assert (mg->mg_private == 4);
718 found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0;
722 || ( BmRARE(check) == '\n'
723 && (BmPREVIOUS(check) == SvCUR(check) - 1)
725 s = screaminstr(sv, check,
726 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
729 /* we may be pointing at the wrong string */
730 if (s && RXp_MATCH_COPIED(prog))
731 s = strbeg + (s - SvPVX_const(sv));
733 *data->scream_olds = s;
738 if (prog->extflags & RXf_CANY_SEEN) {
739 start_point= (U8*)(s + srch_start_shift);
740 end_point= (U8*)(strend - srch_end_shift);
742 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
743 end_point= HOP3(strend, -srch_end_shift, strbeg);
745 DEBUG_OPTIMISE_MORE_r({
746 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
747 (int)(end_point - start_point),
748 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
752 s = fbm_instr( start_point, end_point,
753 check, multiline ? FBMrf_MULTILINE : 0);
756 /* Update the count-of-usability, remove useless subpatterns,
760 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
761 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
762 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
763 (s ? "Found" : "Did not find"),
764 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
765 ? "anchored" : "floating"),
768 (s ? " at offset " : "...\n") );
773 /* Finish the diagnostic message */
774 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
776 /* XXX dmq: first branch is for positive lookbehind...
777 Our check string is offset from the beginning of the pattern.
778 So we need to do any stclass tests offset forward from that
787 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
788 Start with the other substr.
789 XXXX no SCREAM optimization yet - and a very coarse implementation
790 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
791 *always* match. Probably should be marked during compile...
792 Probably it is right to do no SCREAM here...
795 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
796 : (prog->float_substr && prog->anchored_substr))
798 /* Take into account the "other" substring. */
799 /* XXXX May be hopelessly wrong for UTF... */
802 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
805 char * const last = HOP3c(s, -start_shift, strbeg);
807 char * const saved_s = s;
810 t = s - prog->check_offset_max;
811 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
813 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
818 t = HOP3c(t, prog->anchored_offset, strend);
819 if (t < other_last) /* These positions already checked */
821 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
824 /* XXXX It is not documented what units *_offsets are in.
825 We assume bytes, but this is clearly wrong.
826 Meaning this code needs to be carefully reviewed for errors.
830 /* On end-of-str: see comment below. */
831 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
832 if (must == &PL_sv_undef) {
834 DEBUG_r(must = prog->anchored_utf8); /* for debug */
839 HOP3(HOP3(last1, prog->anchored_offset, strend)
840 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
842 multiline ? FBMrf_MULTILINE : 0
845 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
846 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
847 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
848 (s ? "Found" : "Contradicts"),
849 quoted, RE_SV_TAIL(must));
854 if (last1 >= last2) {
855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
856 ", giving up...\n"));
859 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
860 ", trying floating at offset %ld...\n",
861 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
862 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
863 s = HOP3c(last, 1, strend);
867 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
868 (long)(s - i_strpos)));
869 t = HOP3c(s, -prog->anchored_offset, strbeg);
870 other_last = HOP3c(s, 1, strend);
878 else { /* Take into account the floating substring. */
880 char * const saved_s = s;
883 t = HOP3c(s, -start_shift, strbeg);
885 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
886 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
887 last = HOP3c(t, prog->float_max_offset, strend);
888 s = HOP3c(t, prog->float_min_offset, strend);
891 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
892 must = utf8_target ? prog->float_utf8 : prog->float_substr;
893 /* fbm_instr() takes into account exact value of end-of-str
894 if the check is SvTAIL(ed). Since false positives are OK,
895 and end-of-str is not later than strend we are OK. */
896 if (must == &PL_sv_undef) {
898 DEBUG_r(must = prog->float_utf8); /* for debug message */
901 s = fbm_instr((unsigned char*)s,
902 (unsigned char*)last + SvCUR(must)
904 must, multiline ? FBMrf_MULTILINE : 0);
906 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
907 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
908 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
909 (s ? "Found" : "Contradicts"),
910 quoted, RE_SV_TAIL(must));
914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
915 ", giving up...\n"));
918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
919 ", trying anchored starting at offset %ld...\n",
920 (long)(saved_s + 1 - i_strpos)));
922 s = HOP3c(t, 1, strend);
926 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
927 (long)(s - i_strpos)));
928 other_last = s; /* Fix this later. --Hugo */
938 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
940 DEBUG_OPTIMISE_MORE_r(
941 PerlIO_printf(Perl_debug_log,
942 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
943 (IV)prog->check_offset_min,
944 (IV)prog->check_offset_max,
952 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
954 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
957 /* Fixed substring is found far enough so that the match
958 cannot start at strpos. */
960 if (ml_anch && t[-1] != '\n') {
961 /* Eventually fbm_*() should handle this, but often
962 anchored_offset is not 0, so this check will not be wasted. */
963 /* XXXX In the code below we prefer to look for "^" even in
964 presence of anchored substrings. And we search even
965 beyond the found float position. These pessimizations
966 are historical artefacts only. */
968 while (t < strend - prog->minlen) {
970 if (t < check_at - prog->check_offset_min) {
971 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
972 /* Since we moved from the found position,
973 we definitely contradict the found anchored
974 substr. Due to the above check we do not
975 contradict "check" substr.
976 Thus we can arrive here only if check substr
977 is float. Redo checking for "other"=="fixed".
980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
981 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
982 goto do_other_anchored;
984 /* We don't contradict the found floating substring. */
985 /* XXXX Why not check for STCLASS? */
987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
988 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
991 /* Position contradicts check-string */
992 /* XXXX probably better to look for check-string
993 than for "\n", so one should lower the limit for t? */
994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
995 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
996 other_last = strpos = s = t + 1;
1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1002 PL_colors[0], PL_colors[1]));
1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1007 PL_colors[0], PL_colors[1]));
1011 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1014 /* The found string does not prohibit matching at strpos,
1015 - no optimization of calling REx engine can be performed,
1016 unless it was an MBOL and we are not after MBOL,
1017 or a future STCLASS check will fail this. */
1019 /* Even in this situation we may use MBOL flag if strpos is offset
1020 wrt the start of the string. */
1021 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1022 && (strpos != strbeg) && strpos[-1] != '\n'
1023 /* May be due to an implicit anchor of m{.*foo} */
1024 && !(prog->intflags & PREGf_IMPLICIT))
1029 DEBUG_EXECUTE_r( if (ml_anch)
1030 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1031 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1034 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1036 prog->check_utf8 /* Could be deleted already */
1037 && --BmUSEFUL(prog->check_utf8) < 0
1038 && (prog->check_utf8 == prog->float_utf8)
1040 prog->check_substr /* Could be deleted already */
1041 && --BmUSEFUL(prog->check_substr) < 0
1042 && (prog->check_substr == prog->float_substr)
1045 /* If flags & SOMETHING - do not do it many times on the same match */
1046 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1047 /* XXX Does the destruction order has to change with utf8_target? */
1048 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1049 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1050 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1051 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1052 check = NULL; /* abort */
1054 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1055 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1056 if (prog->intflags & PREGf_IMPLICIT)
1057 prog->extflags &= ~RXf_ANCH_MBOL;
1058 /* XXXX This is a remnant of the old implementation. It
1059 looks wasteful, since now INTUIT can use many
1060 other heuristics. */
1061 prog->extflags &= ~RXf_USE_INTUIT;
1062 /* XXXX What other flags might need to be cleared in this branch? */
1068 /* Last resort... */
1069 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1070 /* trie stclasses are too expensive to use here, we are better off to
1071 leave it to regmatch itself */
1072 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1073 /* minlen == 0 is possible if regstclass is \b or \B,
1074 and the fixed substr is ''$.
1075 Since minlen is already taken into account, s+1 is before strend;
1076 accidentally, minlen >= 1 guaranties no false positives at s + 1
1077 even for \b or \B. But (minlen? 1 : 0) below assumes that
1078 regstclass does not come from lookahead... */
1079 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1080 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1081 const U8* const str = (U8*)STRING(progi->regstclass);
1082 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1083 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1086 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1087 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1088 else if (prog->float_substr || prog->float_utf8)
1089 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1093 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1094 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1097 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1100 const char *what = NULL;
1102 if (endpos == strend) {
1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104 "Could not match STCLASS...\n") );
1107 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1108 "This position contradicts STCLASS...\n") );
1109 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1111 /* Contradict one of substrings */
1112 if (prog->anchored_substr || prog->anchored_utf8) {
1113 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1114 DEBUG_EXECUTE_r( what = "anchored" );
1116 s = HOP3c(t, 1, strend);
1117 if (s + start_shift + end_shift > strend) {
1118 /* XXXX Should be taken into account earlier? */
1119 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120 "Could not match STCLASS...\n") );
1125 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126 "Looking for %s substr starting at offset %ld...\n",
1127 what, (long)(s + start_shift - i_strpos)) );
1130 /* Have both, check_string is floating */
1131 if (t + start_shift >= check_at) /* Contradicts floating=check */
1132 goto retry_floating_check;
1133 /* Recheck anchored substring, but not floating... */
1137 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1138 "Looking for anchored substr starting at offset %ld...\n",
1139 (long)(other_last - i_strpos)) );
1140 goto do_other_anchored;
1142 /* Another way we could have checked stclass at the
1143 current position only: */
1148 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1149 "Looking for /%s^%s/m starting at offset %ld...\n",
1150 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1153 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1155 /* Check is floating substring. */
1156 retry_floating_check:
1157 t = check_at - start_shift;
1158 DEBUG_EXECUTE_r( what = "floating" );
1159 goto hop_and_restart;
1162 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1163 "By STCLASS: moving %ld --> %ld\n",
1164 (long)(t - i_strpos), (long)(s - i_strpos))
1168 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1169 "Does not contradict STCLASS...\n");
1174 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1175 PL_colors[4], (check ? "Guessed" : "Giving up"),
1176 PL_colors[5], (long)(s - i_strpos)) );
1179 fail_finish: /* Substring not found */
1180 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1181 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1184 PL_colors[4], PL_colors[5]));
1188 #define DECL_TRIE_TYPE(scan) \
1189 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1190 trie_type = ((scan->flags == EXACT) \
1191 ? (utf8_target ? trie_utf8 : trie_plain) \
1192 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1194 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1195 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1197 switch (trie_type) { \
1198 case trie_utf8_fold: \
1199 if ( foldlen>0 ) { \
1200 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1205 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1206 len = UTF8SKIP(uc); \
1207 skiplen = UNISKIP( uvc ); \
1208 foldlen -= skiplen; \
1209 uscan = foldbuf + skiplen; \
1212 case trie_latin_utf8_fold: \
1213 if ( foldlen>0 ) { \
1214 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1220 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1221 skiplen = UNISKIP( uvc ); \
1222 foldlen -= skiplen; \
1223 uscan = foldbuf + skiplen; \
1227 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1234 charid = trie->charmap[ uvc ]; \
1238 if (widecharmap) { \
1239 SV** const svpp = hv_fetch(widecharmap, \
1240 (char*)&uvc, sizeof(UV), 0); \
1242 charid = (U16)SvIV(*svpp); \
1247 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1251 && (ln == 1 || folder(s, pat_string, ln)) \
1252 && (!reginfo || regtry(reginfo, &s)) ) \
1258 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1260 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1266 #define REXEC_FBC_SCAN(CoDe) \
1268 while (s < strend) { \
1274 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1275 REXEC_FBC_UTF8_SCAN( \
1277 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1286 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1289 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1298 #define REXEC_FBC_TRYIT \
1299 if ((!reginfo || regtry(reginfo, &s))) \
1302 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1303 if (utf8_target) { \
1304 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1307 REXEC_FBC_CLASS_SCAN(CoNd); \
1310 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1311 if (utf8_target) { \
1313 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1316 REXEC_FBC_CLASS_SCAN(CoNd); \
1319 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1320 PL_reg_flags |= RF_tainted; \
1321 if (utf8_target) { \
1322 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1325 REXEC_FBC_CLASS_SCAN(CoNd); \
1328 #define DUMP_EXEC_POS(li,s,doutf8) \
1329 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1332 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1333 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1334 tmp = TEST_NON_UTF8(tmp); \
1335 REXEC_FBC_UTF8_SCAN( \
1336 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1345 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1346 if (s == PL_bostr) { \
1350 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1351 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1354 LOAD_UTF8_CHARCLASS_ALNUM(); \
1355 REXEC_FBC_UTF8_SCAN( \
1356 if (tmp == ! (TeSt2_UtF8)) { \
1365 /* The only difference between the BOUND and NBOUND cases is that
1366 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1367 * NBOUND. This is accomplished by passing it in either the if or else clause,
1368 * with the other one being empty */
1369 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1370 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1372 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1373 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1375 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1376 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1378 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1379 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1382 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1383 * be passed in completely with the variable name being tested, which isn't
1384 * such a clean interface, but this is easier to read than it was before. We
1385 * are looking for the boundary (or non-boundary between a word and non-word
1386 * character. The utf8 and non-utf8 cases have the same logic, but the details
1387 * must be different. Find the "wordness" of the character just prior to this
1388 * one, and compare it with the wordness of this one. If they differ, we have
1389 * a boundary. At the beginning of the string, pretend that the previous
1390 * character was a new-line */
1391 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1392 if (utf8_target) { \
1395 else { /* Not utf8 */ \
1396 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1397 tmp = TEST_NON_UTF8(tmp); \
1399 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1408 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1411 /* We know what class REx starts with. Try to find this position... */
1412 /* if reginfo is NULL, its a dryrun */
1413 /* annoyingly all the vars in this routine have different names from their counterparts
1414 in regmatch. /grrr */
1417 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1418 const char *strend, regmatch_info *reginfo)
1421 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1422 char *pat_string; /* The pattern's exactish string */
1423 char *pat_end; /* ptr to end char of pat_string */
1424 re_fold_t folder; /* Function for computing non-utf8 folds */
1425 const U8 *fold_array; /* array for folding ords < 256 */
1428 register STRLEN uskip;
1432 register I32 tmp = 1; /* Scratch variable? */
1433 register const bool utf8_target = PL_reg_match_utf8;
1434 UV utf8_fold_flags = 0;
1435 RXi_GET_DECL(prog,progi);
1437 PERL_ARGS_ASSERT_FIND_BYCLASS;
1439 /* We know what class it must start with. */
1443 if (utf8_target || OP(c) == ANYOFV) {
1444 STRLEN inclasslen = strend - s;
1445 REXEC_FBC_UTF8_CLASS_SCAN(
1446 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1449 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1454 if (tmp && (!reginfo || regtry(reginfo, &s)))
1462 if (UTF_PATTERN || utf8_target) {
1463 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1464 goto do_exactf_utf8;
1466 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1467 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1468 goto do_exactf_non_utf8; /* isn't dealt with by these */
1473 /* regcomp.c already folded this if pattern is in UTF-8 */
1474 utf8_fold_flags = 0;
1475 goto do_exactf_utf8;
1477 fold_array = PL_fold;
1479 goto do_exactf_non_utf8;
1482 if (UTF_PATTERN || utf8_target) {
1483 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1484 goto do_exactf_utf8;
1486 fold_array = PL_fold_locale;
1487 folder = foldEQ_locale;
1488 goto do_exactf_non_utf8;
1492 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1494 goto do_exactf_utf8;
1496 case EXACTFU_TRICKYFOLD:
1498 if (UTF_PATTERN || utf8_target) {
1499 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1500 goto do_exactf_utf8;
1503 /* Any 'ss' in the pattern should have been replaced by regcomp,
1504 * so we don't have to worry here about this single special case
1505 * in the Latin1 range */
1506 fold_array = PL_fold_latin1;
1507 folder = foldEQ_latin1;
1511 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1512 are no glitches with fold-length differences
1513 between the target string and pattern */
1515 /* The idea in the non-utf8 EXACTF* cases is to first find the
1516 * first character of the EXACTF* node and then, if necessary,
1517 * case-insensitively compare the full text of the node. c1 is the
1518 * first character. c2 is its fold. This logic will not work for
1519 * Unicode semantics and the german sharp ss, which hence should
1520 * not be compiled into a node that gets here. */
1521 pat_string = STRING(c);
1522 ln = STR_LEN(c); /* length to match in octets/bytes */
1524 /* We know that we have to match at least 'ln' bytes (which is the
1525 * same as characters, since not utf8). If we have to match 3
1526 * characters, and there are only 2 availabe, we know without
1527 * trying that it will fail; so don't start a match past the
1528 * required minimum number from the far end */
1529 e = HOP3c(strend, -((I32)ln), s);
1531 if (!reginfo && e < s) {
1532 e = s; /* Due to minlen logic of intuit() */
1536 c2 = fold_array[c1];
1537 if (c1 == c2) { /* If char and fold are the same */
1538 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1541 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1550 /* If one of the operands is in utf8, we can't use the simpler
1551 * folding above, due to the fact that many different characters
1552 * can have the same fold, or portion of a fold, or different-
1554 pat_string = STRING(c);
1555 ln = STR_LEN(c); /* length to match in octets/bytes */
1556 pat_end = pat_string + ln;
1557 lnc = (UTF_PATTERN) /* length to match in characters */
1558 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1561 /* We have 'lnc' characters to match in the pattern, but because of
1562 * multi-character folding, each character in the target can match
1563 * up to 3 characters (Unicode guarantees it will never exceed
1564 * this) if it is utf8-encoded; and up to 2 if not (based on the
1565 * fact that the Latin 1 folds are already determined, and the
1566 * only multi-char fold in that range is the sharp-s folding to
1567 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1568 * string character. Adjust lnc accordingly, rounding up, so that
1569 * if we need to match at least 4+1/3 chars, that really is 5. */
1570 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1571 lnc = (lnc + expansion - 1) / expansion;
1573 /* As in the non-UTF8 case, if we have to match 3 characters, and
1574 * only 2 are left, it's guaranteed to fail, so don't start a
1575 * match that would require us to go beyond the end of the string
1577 e = HOP3c(strend, -((I32)lnc), s);
1579 if (!reginfo && e < s) {
1580 e = s; /* Due to minlen logic of intuit() */
1583 /* XXX Note that we could recalculate e to stop the loop earlier,
1584 * as the worst case expansion above will rarely be met, and as we
1585 * go along we would usually find that e moves further to the left.
1586 * This would happen only after we reached the point in the loop
1587 * where if there were no expansion we should fail. Unclear if
1588 * worth the expense */
1591 char *my_strend= (char *)strend;
1592 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1593 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1594 && (!reginfo || regtry(reginfo, &s)) )
1598 s += (utf8_target) ? UTF8SKIP(s) : 1;
1603 PL_reg_flags |= RF_tainted;
1604 FBC_BOUND(isALNUM_LC,
1605 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1606 isALNUM_LC_utf8((U8*)s));
1609 PL_reg_flags |= RF_tainted;
1610 FBC_NBOUND(isALNUM_LC,
1611 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1612 isALNUM_LC_utf8((U8*)s));
1615 FBC_BOUND(isWORDCHAR,
1617 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1620 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1622 isWORDCHAR_A((U8*)s));
1625 FBC_NBOUND(isWORDCHAR,
1627 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1630 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1632 isWORDCHAR_A((U8*)s));
1635 FBC_BOUND(isWORDCHAR_L1,
1637 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1640 FBC_NBOUND(isWORDCHAR_L1,
1642 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1645 REXEC_FBC_CSCAN_TAINT(
1646 isALNUM_LC_utf8((U8*)s),
1651 REXEC_FBC_CSCAN_PRELOAD(
1652 LOAD_UTF8_CHARCLASS_ALNUM(),
1653 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1654 isWORDCHAR_L1((U8) *s)
1658 REXEC_FBC_CSCAN_PRELOAD(
1659 LOAD_UTF8_CHARCLASS_ALNUM(),
1660 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1665 /* Don't need to worry about utf8, as it can match only a single
1666 * byte invariant character */
1667 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1670 REXEC_FBC_CSCAN_PRELOAD(
1671 LOAD_UTF8_CHARCLASS_ALNUM(),
1672 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1673 ! isWORDCHAR_L1((U8) *s)
1677 REXEC_FBC_CSCAN_PRELOAD(
1678 LOAD_UTF8_CHARCLASS_ALNUM(),
1679 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1690 REXEC_FBC_CSCAN_TAINT(
1691 !isALNUM_LC_utf8((U8*)s),
1696 REXEC_FBC_CSCAN_PRELOAD(
1697 LOAD_UTF8_CHARCLASS_SPACE(),
1698 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1703 REXEC_FBC_CSCAN_PRELOAD(
1704 LOAD_UTF8_CHARCLASS_SPACE(),
1705 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1710 /* Don't need to worry about utf8, as it can match only a single
1711 * byte invariant character */
1712 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1715 REXEC_FBC_CSCAN_TAINT(
1716 isSPACE_LC_utf8((U8*)s),
1721 REXEC_FBC_CSCAN_PRELOAD(
1722 LOAD_UTF8_CHARCLASS_SPACE(),
1723 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1724 ! isSPACE_L1((U8) *s)
1728 REXEC_FBC_CSCAN_PRELOAD(
1729 LOAD_UTF8_CHARCLASS_SPACE(),
1730 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1741 REXEC_FBC_CSCAN_TAINT(
1742 !isSPACE_LC_utf8((U8*)s),
1747 REXEC_FBC_CSCAN_PRELOAD(
1748 LOAD_UTF8_CHARCLASS_DIGIT(),
1749 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1754 /* Don't need to worry about utf8, as it can match only a single
1755 * byte invariant character */
1756 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1759 REXEC_FBC_CSCAN_TAINT(
1760 isDIGIT_LC_utf8((U8*)s),
1765 REXEC_FBC_CSCAN_PRELOAD(
1766 LOAD_UTF8_CHARCLASS_DIGIT(),
1767 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1778 REXEC_FBC_CSCAN_TAINT(
1779 !isDIGIT_LC_utf8((U8*)s),
1786 is_LNBREAK_latin1(s)
1798 !is_VERTWS_latin1(s)
1804 is_HORIZWS_latin1(s)
1809 !is_HORIZWS_utf8(s),
1810 !is_HORIZWS_latin1(s)
1817 /* what trie are we using right now */
1819 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1821 = (reg_trie_data*)progi->data->data[ aho->trie ];
1822 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1824 const char *last_start = strend - trie->minlen;
1826 const char *real_start = s;
1828 STRLEN maxlen = trie->maxlen;
1830 U8 **points; /* map of where we were in the input string
1831 when reading a given char. For ASCII this
1832 is unnecessary overhead as the relationship
1833 is always 1:1, but for Unicode, especially
1834 case folded Unicode this is not true. */
1835 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1839 GET_RE_DEBUG_FLAGS_DECL;
1841 /* We can't just allocate points here. We need to wrap it in
1842 * an SV so it gets freed properly if there is a croak while
1843 * running the match */
1846 sv_points=newSV(maxlen * sizeof(U8 *));
1847 SvCUR_set(sv_points,
1848 maxlen * sizeof(U8 *));
1849 SvPOK_on(sv_points);
1850 sv_2mortal(sv_points);
1851 points=(U8**)SvPV_nolen(sv_points );
1852 if ( trie_type != trie_utf8_fold
1853 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1856 bitmap=(U8*)trie->bitmap;
1858 bitmap=(U8*)ANYOF_BITMAP(c);
1860 /* this is the Aho-Corasick algorithm modified a touch
1861 to include special handling for long "unknown char"
1862 sequences. The basic idea being that we use AC as long
1863 as we are dealing with a possible matching char, when
1864 we encounter an unknown char (and we have not encountered
1865 an accepting state) we scan forward until we find a legal
1867 AC matching is basically that of trie matching, except
1868 that when we encounter a failing transition, we fall back
1869 to the current states "fail state", and try the current char
1870 again, a process we repeat until we reach the root state,
1871 state 1, or a legal transition. If we fail on the root state
1872 then we can either terminate if we have reached an accepting
1873 state previously, or restart the entire process from the beginning
1877 while (s <= last_start) {
1878 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1886 U8 *uscan = (U8*)NULL;
1887 U8 *leftmost = NULL;
1889 U32 accepted_word= 0;
1893 while ( state && uc <= (U8*)strend ) {
1895 U32 word = aho->states[ state ].wordnum;
1899 DEBUG_TRIE_EXECUTE_r(
1900 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1901 dump_exec_pos( (char *)uc, c, strend, real_start,
1902 (char *)uc, utf8_target );
1903 PerlIO_printf( Perl_debug_log,
1904 " Scanning for legal start char...\n");
1908 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1912 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1918 if (uc >(U8*)last_start) break;
1922 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1923 if (!leftmost || lpos < leftmost) {
1924 DEBUG_r(accepted_word=word);
1930 points[pointpos++ % maxlen]= uc;
1931 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1932 uscan, len, uvc, charid, foldlen,
1934 DEBUG_TRIE_EXECUTE_r({
1935 dump_exec_pos( (char *)uc, c, strend, real_start,
1937 PerlIO_printf(Perl_debug_log,
1938 " Charid:%3u CP:%4"UVxf" ",
1944 word = aho->states[ state ].wordnum;
1946 base = aho->states[ state ].trans.base;
1948 DEBUG_TRIE_EXECUTE_r({
1950 dump_exec_pos( (char *)uc, c, strend, real_start,
1952 PerlIO_printf( Perl_debug_log,
1953 "%sState: %4"UVxf", word=%"UVxf,
1954 failed ? " Fail transition to " : "",
1955 (UV)state, (UV)word);
1961 ( ((offset = base + charid
1962 - 1 - trie->uniquecharcount)) >= 0)
1963 && ((U32)offset < trie->lasttrans)
1964 && trie->trans[offset].check == state
1965 && (tmp=trie->trans[offset].next))
1967 DEBUG_TRIE_EXECUTE_r(
1968 PerlIO_printf( Perl_debug_log," - legal\n"));
1973 DEBUG_TRIE_EXECUTE_r(
1974 PerlIO_printf( Perl_debug_log," - fail\n"));
1976 state = aho->fail[state];
1980 /* we must be accepting here */
1981 DEBUG_TRIE_EXECUTE_r(
1982 PerlIO_printf( Perl_debug_log," - accepting\n"));
1991 if (!state) state = 1;
1994 if ( aho->states[ state ].wordnum ) {
1995 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1996 if (!leftmost || lpos < leftmost) {
1997 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2002 s = (char*)leftmost;
2003 DEBUG_TRIE_EXECUTE_r({
2005 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2006 (UV)accepted_word, (IV)(s - real_start)
2009 if (!reginfo || regtry(reginfo, &s)) {
2015 DEBUG_TRIE_EXECUTE_r({
2016 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2019 DEBUG_TRIE_EXECUTE_r(
2020 PerlIO_printf( Perl_debug_log,"No match.\n"));
2029 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2039 - regexec_flags - match a regexp against a string
2042 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2043 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2044 /* strend: pointer to null at end of string */
2045 /* strbeg: real beginning of string */
2046 /* minend: end of match must be >=minend after stringarg. */
2047 /* data: May be used for some additional optimizations.
2048 Currently its only used, with a U32 cast, for transmitting
2049 the ganch offset when doing a /g match. This will change */
2050 /* nosave: For optimizations. */
2053 struct regexp *const prog = (struct regexp *)SvANY(rx);
2054 /*register*/ char *s;
2055 register regnode *c;
2056 /*register*/ char *startpos = stringarg;
2057 I32 minlen; /* must match at least this many chars */
2058 I32 dontbother = 0; /* how many characters not to try at end */
2059 I32 end_shift = 0; /* Same for the end. */ /* CC */
2060 I32 scream_pos = -1; /* Internal iterator of scream. */
2061 char *scream_olds = NULL;
2062 const bool utf8_target = cBOOL(DO_UTF8(sv));
2064 RXi_GET_DECL(prog,progi);
2065 regmatch_info reginfo; /* create some info to pass to regtry etc */
2066 regexp_paren_pair *swap = NULL;
2067 GET_RE_DEBUG_FLAGS_DECL;
2069 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2070 PERL_UNUSED_ARG(data);
2072 /* Be paranoid... */
2073 if (prog == NULL || startpos == NULL) {
2074 Perl_croak(aTHX_ "NULL regexp parameter");
2078 multiline = prog->extflags & RXf_PMf_MULTILINE;
2079 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2081 RX_MATCH_UTF8_set(rx, utf8_target);
2083 debug_start_match(rx, utf8_target, startpos, strend,
2087 minlen = prog->minlen;
2089 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2090 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2091 "String too short [regexec_flags]...\n"));
2096 /* Check validity of program. */
2097 if (UCHARAT(progi->program) != REG_MAGIC) {
2098 Perl_croak(aTHX_ "corrupted regexp program");
2102 PL_reg_eval_set = 0;
2106 PL_reg_flags |= RF_utf8;
2108 /* Mark beginning of line for ^ and lookbehind. */
2109 reginfo.bol = startpos; /* XXX not used ??? */
2113 /* Mark end of line for $ (and such) */
2116 /* see how far we have to get to not match where we matched before */
2117 reginfo.till = startpos+minend;
2119 /* If there is a "must appear" string, look for it. */
2122 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2124 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2125 reginfo.ganch = startpos + prog->gofs;
2126 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2127 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2128 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2130 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2131 && mg->mg_len >= 0) {
2132 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2133 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2134 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2136 if (prog->extflags & RXf_ANCH_GPOS) {
2137 if (s > reginfo.ganch)
2139 s = reginfo.ganch - prog->gofs;
2140 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2141 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2147 reginfo.ganch = strbeg + PTR2UV(data);
2148 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2149 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2151 } else { /* pos() not defined */
2152 reginfo.ganch = strbeg;
2153 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2154 "GPOS: reginfo.ganch = strbeg\n"));
2157 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2158 /* We have to be careful. If the previous successful match
2159 was from this regex we don't want a subsequent partially
2160 successful match to clobber the old results.
2161 So when we detect this possibility we add a swap buffer
2162 to the re, and switch the buffer each match. If we fail
2163 we switch it back, otherwise we leave it swapped.
2166 /* do we need a save destructor here for eval dies? */
2167 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2169 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2170 re_scream_pos_data d;
2172 d.scream_olds = &scream_olds;
2173 d.scream_pos = &scream_pos;
2174 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2177 goto phooey; /* not present */
2183 /* Simplest case: anchored match need be tried only once. */
2184 /* [unless only anchor is BOL and multiline is set] */
2185 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2186 if (s == startpos && regtry(®info, &startpos))
2188 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2189 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2194 dontbother = minlen - 1;
2195 end = HOP3c(strend, -dontbother, strbeg) - 1;
2196 /* for multiline we only have to try after newlines */
2197 if (prog->check_substr || prog->check_utf8) {
2198 /* because of the goto we can not easily reuse the macros for bifurcating the
2199 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2202 goto after_try_utf8;
2204 if (regtry(®info, &s)) {
2211 if (prog->extflags & RXf_USE_INTUIT) {
2212 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2221 } /* end search for check string in unicode */
2223 if (s == startpos) {
2224 goto after_try_latin;
2227 if (regtry(®info, &s)) {
2234 if (prog->extflags & RXf_USE_INTUIT) {
2235 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2244 } /* end search for check string in latin*/
2245 } /* end search for check string */
2246 else { /* search for newline */
2248 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2251 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2252 while (s <= end) { /* note it could be possible to match at the end of the string */
2253 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2254 if (regtry(®info, &s))
2258 } /* end search for newline */
2259 } /* end anchored/multiline check string search */
2261 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2263 /* the warning about reginfo.ganch being used without initialization
2264 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2265 and we only enter this block when the same bit is set. */
2266 char *tmp_s = reginfo.ganch - prog->gofs;
2268 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2273 /* Messy cases: unanchored match. */
2274 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2275 /* we have /x+whatever/ */
2276 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2281 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2282 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2283 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2288 DEBUG_EXECUTE_r( did_match = 1 );
2289 if (regtry(®info, &s)) goto got_it;
2291 while (s < strend && *s == ch)
2299 DEBUG_EXECUTE_r( did_match = 1 );
2300 if (regtry(®info, &s)) goto got_it;
2302 while (s < strend && *s == ch)
2307 DEBUG_EXECUTE_r(if (!did_match)
2308 PerlIO_printf(Perl_debug_log,
2309 "Did not find anchored character...\n")
2312 else if (prog->anchored_substr != NULL
2313 || prog->anchored_utf8 != NULL
2314 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2315 && prog->float_max_offset < strend - s)) {
2320 char *last1; /* Last position checked before */
2324 if (prog->anchored_substr || prog->anchored_utf8) {
2325 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2326 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2327 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2328 back_max = back_min = prog->anchored_offset;
2330 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2331 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2332 must = utf8_target ? prog->float_utf8 : prog->float_substr;
2333 back_max = prog->float_max_offset;
2334 back_min = prog->float_min_offset;
2338 if (must == &PL_sv_undef)
2339 /* could not downgrade utf8 check substring, so must fail */
2345 last = HOP3c(strend, /* Cannot start after this */
2346 -(I32)(CHR_SVLEN(must)
2347 - (SvTAIL(must) != 0) + back_min), strbeg);
2350 last1 = HOPc(s, -1);
2352 last1 = s - 1; /* bogus */
2354 /* XXXX check_substr already used to find "s", can optimize if
2355 check_substr==must. */
2357 dontbother = end_shift;
2358 strend = HOPc(strend, -dontbother);
2359 while ( (s <= last) &&
2360 ((flags & REXEC_SCREAM) && SvSCREAM(sv)
2361 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2362 end_shift, &scream_pos, 0))
2363 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2364 (unsigned char*)strend, must,
2365 multiline ? FBMrf_MULTILINE : 0))) ) {
2366 /* we may be pointing at the wrong string */
2367 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2368 s = strbeg + (s - SvPVX_const(sv));
2369 DEBUG_EXECUTE_r( did_match = 1 );
2370 if (HOPc(s, -back_max) > last1) {
2371 last1 = HOPc(s, -back_min);
2372 s = HOPc(s, -back_max);
2375 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2377 last1 = HOPc(s, -back_min);
2381 while (s <= last1) {
2382 if (regtry(®info, &s))
2388 while (s <= last1) {
2389 if (regtry(®info, &s))
2395 DEBUG_EXECUTE_r(if (!did_match) {
2396 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2397 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2398 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2399 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2400 ? "anchored" : "floating"),
2401 quoted, RE_SV_TAIL(must));
2405 else if ( (c = progi->regstclass) ) {
2407 const OPCODE op = OP(progi->regstclass);
2408 /* don't bother with what can't match */
2409 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2410 strend = HOPc(strend, -(minlen - 1));
2413 SV * const prop = sv_newmortal();
2414 regprop(prog, prop, c);
2416 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2418 PerlIO_printf(Perl_debug_log,
2419 "Matching stclass %.*s against %s (%d bytes)\n",
2420 (int)SvCUR(prop), SvPVX_const(prop),
2421 quoted, (int)(strend - s));
2424 if (find_byclass(prog, c, s, strend, ®info))
2426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2430 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2435 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2436 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2437 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2439 if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) {
2440 last = screaminstr(sv, float_real, s - strbeg,
2441 end_shift, &scream_pos, 1); /* last one */
2443 last = scream_olds; /* Only one occurrence. */
2444 /* we may be pointing at the wrong string */
2445 else if (RXp_MATCH_COPIED(prog))
2446 s = strbeg + (s - SvPVX_const(sv));
2450 const char * const little = SvPV_const(float_real, len);
2452 if (SvTAIL(float_real)) {
2453 if (memEQ(strend - len + 1, little, len - 1))
2454 last = strend - len + 1;
2455 else if (!multiline)
2456 last = memEQ(strend - len, little, len)
2457 ? strend - len : NULL;
2463 last = rninstr(s, strend, little, little + len);
2465 last = strend; /* matching "$" */
2470 PerlIO_printf(Perl_debug_log,
2471 "%sCan't trim the tail, match fails (should not happen)%s\n",
2472 PL_colors[4], PL_colors[5]));
2473 goto phooey; /* Should not happen! */
2475 dontbother = strend - last + prog->float_min_offset;
2477 if (minlen && (dontbother < minlen))
2478 dontbother = minlen - 1;
2479 strend -= dontbother; /* this one's always in bytes! */
2480 /* We don't know much -- general case. */
2483 if (regtry(®info, &s))
2492 if (regtry(®info, &s))
2494 } while (s++ < strend);
2503 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2505 if (PL_reg_eval_set)
2506 restore_pos(aTHX_ prog);
2507 if (RXp_PAREN_NAMES(prog))
2508 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2510 /* make sure $`, $&, $', and $digit will work later */
2511 if ( !(flags & REXEC_NOT_FIRST) ) {
2512 RX_MATCH_COPY_FREE(rx);
2513 if (flags & REXEC_COPY_STR) {
2514 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2515 #ifdef PERL_OLD_COPY_ON_WRITE
2517 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2519 PerlIO_printf(Perl_debug_log,
2520 "Copy on write: regexp capture, type %d\n",
2523 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2524 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2525 assert (SvPOKp(prog->saved_copy));
2529 RX_MATCH_COPIED_on(rx);
2530 s = savepvn(strbeg, i);
2536 prog->subbeg = strbeg;
2537 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2545 PL_colors[4], PL_colors[5]));
2546 if (PL_reg_eval_set)
2547 restore_pos(aTHX_ prog);
2549 /* we failed :-( roll it back */
2550 Safefree(prog->offs);
2559 - regtry - try match at specific point
2561 STATIC I32 /* 0 failure, 1 success */
2562 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2566 REGEXP *const rx = reginfo->prog;
2567 regexp *const prog = (struct regexp *)SvANY(rx);
2568 RXi_GET_DECL(prog,progi);
2569 GET_RE_DEBUG_FLAGS_DECL;
2571 PERL_ARGS_ASSERT_REGTRY;
2573 reginfo->cutpoint=NULL;
2575 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2578 PL_reg_eval_set = RS_init;
2579 DEBUG_EXECUTE_r(DEBUG_s(
2580 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2581 (IV)(PL_stack_sp - PL_stack_base));
2584 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2585 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2587 /* Apparently this is not needed, judging by wantarray. */
2588 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2589 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2592 /* Make $_ available to executed code. */
2593 if (reginfo->sv != DEFSV) {
2595 DEFSV_set(reginfo->sv);
2598 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2599 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2600 /* prepare for quick setting of pos */
2601 #ifdef PERL_OLD_COPY_ON_WRITE
2602 if (SvIsCOW(reginfo->sv))
2603 sv_force_normal_flags(reginfo->sv, 0);
2605 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2606 &PL_vtbl_mglob, NULL, 0);
2610 PL_reg_oldpos = mg->mg_len;
2611 SAVEDESTRUCTOR_X(restore_pos, prog);
2613 if (!PL_reg_curpm) {
2614 Newxz(PL_reg_curpm, 1, PMOP);
2617 SV* const repointer = &PL_sv_undef;
2618 /* this regexp is also owned by the new PL_reg_curpm, which
2619 will try to free it. */
2620 av_push(PL_regex_padav, repointer);
2621 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2622 PL_regex_pad = AvARRAY(PL_regex_padav);
2627 /* It seems that non-ithreads works both with and without this code.
2628 So for efficiency reasons it seems best not to have the code
2629 compiled when it is not needed. */
2630 /* This is safe against NULLs: */
2631 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2632 /* PM_reg_curpm owns a reference to this regexp. */
2633 (void)ReREFCNT_inc(rx);
2635 PM_SETRE(PL_reg_curpm, rx);
2636 PL_reg_oldcurpm = PL_curpm;
2637 PL_curpm = PL_reg_curpm;
2638 if (RXp_MATCH_COPIED(prog)) {
2639 /* Here is a serious problem: we cannot rewrite subbeg,
2640 since it may be needed if this match fails. Thus
2641 $` inside (?{}) could fail... */
2642 PL_reg_oldsaved = prog->subbeg;
2643 PL_reg_oldsavedlen = prog->sublen;
2644 #ifdef PERL_OLD_COPY_ON_WRITE
2645 PL_nrs = prog->saved_copy;
2647 RXp_MATCH_COPIED_off(prog);
2650 PL_reg_oldsaved = NULL;
2651 prog->subbeg = PL_bostr;
2652 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2654 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2655 prog->offs[0].start = *startpos - PL_bostr;
2656 PL_reginput = *startpos;
2657 PL_reglastparen = &prog->lastparen;
2658 PL_reglastcloseparen = &prog->lastcloseparen;
2659 prog->lastparen = 0;
2660 prog->lastcloseparen = 0;
2662 PL_regoffs = prog->offs;
2663 if (PL_reg_start_tmpl <= prog->nparens) {
2664 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2665 if(PL_reg_start_tmp)
2666 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2668 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2671 /* XXXX What this code is doing here?!!! There should be no need
2672 to do this again and again, PL_reglastparen should take care of
2675 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2676 * Actually, the code in regcppop() (which Ilya may be meaning by
2677 * PL_reglastparen), is not needed at all by the test suite
2678 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2679 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2680 * Meanwhile, this code *is* needed for the
2681 * above-mentioned test suite tests to succeed. The common theme
2682 * on those tests seems to be returning null fields from matches.
2683 * --jhi updated by dapm */
2685 if (prog->nparens) {
2686 regexp_paren_pair *pp = PL_regoffs;
2688 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2696 if (regmatch(reginfo, progi->program + 1)) {
2697 PL_regoffs[0].end = PL_reginput - PL_bostr;
2700 if (reginfo->cutpoint)
2701 *startpos= reginfo->cutpoint;
2702 REGCP_UNWIND(lastcp);
2707 #define sayYES goto yes
2708 #define sayNO goto no
2709 #define sayNO_SILENT goto no_silent
2711 /* we dont use STMT_START/END here because it leads to
2712 "unreachable code" warnings, which are bogus, but distracting. */
2713 #define CACHEsayNO \
2714 if (ST.cache_mask) \
2715 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2718 /* this is used to determine how far from the left messages like
2719 'failed...' are printed. It should be set such that messages
2720 are inline with the regop output that created them.
2722 #define REPORT_CODE_OFF 32
2725 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2726 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2728 #define SLAB_FIRST(s) (&(s)->states[0])
2729 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2731 /* grab a new slab and return the first slot in it */
2733 STATIC regmatch_state *
2736 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2739 regmatch_slab *s = PL_regmatch_slab->next;
2741 Newx(s, 1, regmatch_slab);
2742 s->prev = PL_regmatch_slab;
2744 PL_regmatch_slab->next = s;
2746 PL_regmatch_slab = s;
2747 return SLAB_FIRST(s);
2751 /* push a new state then goto it */
2753 #define PUSH_STATE_GOTO(state, node) \
2755 st->resume_state = state; \
2758 /* push a new state with success backtracking, then goto it */
2760 #define PUSH_YES_STATE_GOTO(state, node) \
2762 st->resume_state = state; \
2763 goto push_yes_state;
2769 regmatch() - main matching routine
2771 This is basically one big switch statement in a loop. We execute an op,
2772 set 'next' to point the next op, and continue. If we come to a point which
2773 we may need to backtrack to on failure such as (A|B|C), we push a
2774 backtrack state onto the backtrack stack. On failure, we pop the top
2775 state, and re-enter the loop at the state indicated. If there are no more
2776 states to pop, we return failure.
2778 Sometimes we also need to backtrack on success; for example /A+/, where
2779 after successfully matching one A, we need to go back and try to
2780 match another one; similarly for lookahead assertions: if the assertion
2781 completes successfully, we backtrack to the state just before the assertion
2782 and then carry on. In these cases, the pushed state is marked as
2783 'backtrack on success too'. This marking is in fact done by a chain of
2784 pointers, each pointing to the previous 'yes' state. On success, we pop to
2785 the nearest yes state, discarding any intermediate failure-only states.
2786 Sometimes a yes state is pushed just to force some cleanup code to be
2787 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2788 it to free the inner regex.
2790 Note that failure backtracking rewinds the cursor position, while
2791 success backtracking leaves it alone.
2793 A pattern is complete when the END op is executed, while a subpattern
2794 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2795 ops trigger the "pop to last yes state if any, otherwise return true"
2798 A common convention in this function is to use A and B to refer to the two
2799 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2800 the subpattern to be matched possibly multiple times, while B is the entire
2801 rest of the pattern. Variable and state names reflect this convention.
2803 The states in the main switch are the union of ops and failure/success of
2804 substates associated with with that op. For example, IFMATCH is the op
2805 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2806 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2807 successfully matched A and IFMATCH_A_fail is a state saying that we have
2808 just failed to match A. Resume states always come in pairs. The backtrack
2809 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2810 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2811 on success or failure.
2813 The struct that holds a backtracking state is actually a big union, with
2814 one variant for each major type of op. The variable st points to the
2815 top-most backtrack struct. To make the code clearer, within each
2816 block of code we #define ST to alias the relevant union.
2818 Here's a concrete example of a (vastly oversimplified) IFMATCH
2824 #define ST st->u.ifmatch
2826 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2827 ST.foo = ...; // some state we wish to save
2829 // push a yes backtrack state with a resume value of
2830 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2832 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2835 case IFMATCH_A: // we have successfully executed A; now continue with B
2837 bar = ST.foo; // do something with the preserved value
2840 case IFMATCH_A_fail: // A failed, so the assertion failed
2841 ...; // do some housekeeping, then ...
2842 sayNO; // propagate the failure
2849 For any old-timers reading this who are familiar with the old recursive
2850 approach, the code above is equivalent to:
2852 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2861 ...; // do some housekeeping, then ...
2862 sayNO; // propagate the failure
2865 The topmost backtrack state, pointed to by st, is usually free. If you
2866 want to claim it, populate any ST.foo fields in it with values you wish to
2867 save, then do one of
2869 PUSH_STATE_GOTO(resume_state, node);
2870 PUSH_YES_STATE_GOTO(resume_state, node);
2872 which sets that backtrack state's resume value to 'resume_state', pushes a
2873 new free entry to the top of the backtrack stack, then goes to 'node'.
2874 On backtracking, the free slot is popped, and the saved state becomes the
2875 new free state. An ST.foo field in this new top state can be temporarily
2876 accessed to retrieve values, but once the main loop is re-entered, it
2877 becomes available for reuse.
2879 Note that the depth of the backtrack stack constantly increases during the
2880 left-to-right execution of the pattern, rather than going up and down with
2881 the pattern nesting. For example the stack is at its maximum at Z at the
2882 end of the pattern, rather than at X in the following:
2884 /(((X)+)+)+....(Y)+....Z/
2886 The only exceptions to this are lookahead/behind assertions and the cut,
2887 (?>A), which pop all the backtrack states associated with A before
2890 Backtrack state structs are allocated in slabs of about 4K in size.
2891 PL_regmatch_state and st always point to the currently active state,
2892 and PL_regmatch_slab points to the slab currently containing
2893 PL_regmatch_state. The first time regmatch() is called, the first slab is
2894 allocated, and is never freed until interpreter destruction. When the slab
2895 is full, a new one is allocated and chained to the end. At exit from
2896 regmatch(), slabs allocated since entry are freed.
2901 #define DEBUG_STATE_pp(pp) \
2903 DUMP_EXEC_POS(locinput, scan, utf8_target); \
2904 PerlIO_printf(Perl_debug_log, \
2905 " %*s"pp" %s%s%s%s%s\n", \
2907 PL_reg_name[st->resume_state], \
2908 ((st==yes_state||st==mark_state) ? "[" : ""), \
2909 ((st==yes_state) ? "Y" : ""), \
2910 ((st==mark_state) ? "M" : ""), \
2911 ((st==yes_state||st==mark_state) ? "]" : "") \
2916 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2921 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2922 const char *start, const char *end, const char *blurb)
2924 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2926 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2931 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2932 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2934 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2935 start, end - start, 60);
2937 PerlIO_printf(Perl_debug_log,
2938 "%s%s REx%s %s against %s\n",
2939 PL_colors[4], blurb, PL_colors[5], s0, s1);
2941 if (utf8_target||utf8_pat)
2942 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2943 utf8_pat ? "pattern" : "",
2944 utf8_pat && utf8_target ? " and " : "",
2945 utf8_target ? "string" : ""
2951 S_dump_exec_pos(pTHX_ const char *locinput,
2952 const regnode *scan,
2953 const char *loc_regeol,
2954 const char *loc_bostr,
2955 const char *loc_reg_starttry,
2956 const bool utf8_target)
2958 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2959 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2960 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2961 /* The part of the string before starttry has one color
2962 (pref0_len chars), between starttry and current
2963 position another one (pref_len - pref0_len chars),
2964 after the current position the third one.
2965 We assume that pref0_len <= pref_len, otherwise we
2966 decrease pref0_len. */
2967 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2968 ? (5 + taill) - l : locinput - loc_bostr;
2971 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2973 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2975 pref0_len = pref_len - (locinput - loc_reg_starttry);
2976 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2977 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2978 ? (5 + taill) - pref_len : loc_regeol - locinput);
2979 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2983 if (pref0_len > pref_len)
2984 pref0_len = pref_len;
2986 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2988 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2989 (locinput - pref_len),pref0_len, 60, 4, 5);
2991 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2992 (locinput - pref_len + pref0_len),
2993 pref_len - pref0_len, 60, 2, 3);
2995 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2996 locinput, loc_regeol - locinput, 10, 0, 1);
2998 const STRLEN tlen=len0+len1+len2;
2999 PerlIO_printf(Perl_debug_log,
3000 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3001 (IV)(locinput - loc_bostr),
3004 (docolor ? "" : "> <"),
3006 (int)(tlen > 19 ? 0 : 19 - tlen),
3013 /* reg_check_named_buff_matched()
3014 * Checks to see if a named buffer has matched. The data array of
3015 * buffer numbers corresponding to the buffer is expected to reside
3016 * in the regexp->data->data array in the slot stored in the ARG() of
3017 * node involved. Note that this routine doesn't actually care about the
3018 * name, that information is not preserved from compilation to execution.
3019 * Returns the index of the leftmost defined buffer with the given name
3020 * or 0 if non of the buffers matched.
3023 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3026 RXi_GET_DECL(rex,rexi);
3027 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3028 I32 *nums=(I32*)SvPVX(sv_dat);
3030 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3032 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3033 if ((I32)*PL_reglastparen >= nums[n] &&
3034 PL_regoffs[nums[n]].end != -1)
3043 /* free all slabs above current one - called during LEAVE_SCOPE */
3046 S_clear_backtrack_stack(pTHX_ void *p)
3048 regmatch_slab *s = PL_regmatch_slab->next;
3053 PL_regmatch_slab->next = NULL;
3055 regmatch_slab * const osl = s;
3062 #define SETREX(Re1,Re2) \
3063 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
3066 STATIC I32 /* 0 failure, 1 success */
3067 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
3069 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3073 register const bool utf8_target = PL_reg_match_utf8;
3074 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3075 REGEXP *rex_sv = reginfo->prog;
3076 regexp *rex = (struct regexp *)SvANY(rex_sv);
3077 RXi_GET_DECL(rex,rexi);
3079 /* the current state. This is a cached copy of PL_regmatch_state */
3080 register regmatch_state *st;
3081 /* cache heavy used fields of st in registers */
3082 register regnode *scan;
3083 register regnode *next;
3084 register U32 n = 0; /* general value; init to avoid compiler warning */
3085 register I32 ln = 0; /* len or last; init to avoid compiler warning */
3086 register char *locinput = PL_reginput;
3087 register I32 nextchr; /* is always set to UCHARAT(locinput) */
3089 bool result = 0; /* return value of S_regmatch */
3090 int depth = 0; /* depth of backtrack stack */
3091 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3092 const U32 max_nochange_depth =
3093 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3094 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3095 regmatch_state *yes_state = NULL; /* state to pop to on success of
3097 /* mark_state piggy backs on the yes_state logic so that when we unwind
3098 the stack on success we can update the mark_state as we go */
3099 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3100 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3101 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3103 bool no_final = 0; /* prevent failure from backtracking? */
3104 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3105 char *startpoint = PL_reginput;
3106 SV *popmark = NULL; /* are we looking for a mark? */
3107 SV *sv_commit = NULL; /* last mark name seen in failure */
3108 SV *sv_yes_mark = NULL; /* last mark name we have seen
3109 during a successful match */
3110 U32 lastopen = 0; /* last open we saw */
3111 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3112 SV* const oreplsv = GvSV(PL_replgv);
3113 /* these three flags are set by various ops to signal information to
3114 * the very next op. They have a useful lifetime of exactly one loop
3115 * iteration, and are not preserved or restored by state pushes/pops
3117 bool sw = 0; /* the condition value in (?(cond)a|b) */
3118 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3119 int logical = 0; /* the following EVAL is:
3123 or the following IFMATCH/UNLESSM is:
3124 false: plain (?=foo)
3125 true: used as a condition: (?(?=foo))
3128 GET_RE_DEBUG_FLAGS_DECL;
3131 PERL_ARGS_ASSERT_REGMATCH;
3133 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3134 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3136 /* on first ever call to regmatch, allocate first slab */
3137 if (!PL_regmatch_slab) {
3138 Newx(PL_regmatch_slab, 1, regmatch_slab);
3139 PL_regmatch_slab->prev = NULL;
3140 PL_regmatch_slab->next = NULL;
3141 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3144 oldsave = PL_savestack_ix;
3145 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3146 SAVEVPTR(PL_regmatch_slab);
3147 SAVEVPTR(PL_regmatch_state);
3149 /* grab next free state slot */
3150 st = ++PL_regmatch_state;
3151 if (st > SLAB_LAST(PL_regmatch_slab))
3152 st = PL_regmatch_state = S_push_slab(aTHX);
3154 /* Note that nextchr is a byte even in UTF */
3155 nextchr = UCHARAT(locinput);
3157 while (scan != NULL) {
3160 SV * const prop = sv_newmortal();
3161 regnode *rnext=regnext(scan);
3162 DUMP_EXEC_POS( locinput, scan, utf8_target );
3163 regprop(rex, prop, scan);
3165 PerlIO_printf(Perl_debug_log,
3166 "%3"IVdf":%*s%s(%"IVdf")\n",
3167 (IV)(scan - rexi->program), depth*2, "",
3169 (PL_regkind[OP(scan)] == END || !rnext) ?
3170 0 : (IV)(rnext - rexi->program));
3173 next = scan + NEXT_OFF(scan);
3176 state_num = OP(scan);
3180 assert(PL_reglastparen == &rex->lastparen);
3181 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3182 assert(PL_regoffs == rex->offs);
3184 switch (state_num) {
3186 if (locinput == PL_bostr)
3188 /* reginfo->till = reginfo->bol; */
3193 if (locinput == PL_bostr ||
3194 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3200 if (locinput == PL_bostr)
3204 if (locinput == reginfo->ganch)
3209 /* update the startpoint */
3210 st->u.keeper.val = PL_regoffs[0].start;
3211 PL_reginput = locinput;
3212 PL_regoffs[0].start = locinput - PL_bostr;
3213 PUSH_STATE_GOTO(KEEPS_next, next);
3215 case KEEPS_next_fail:
3216 /* rollback the start point change */
3217 PL_regoffs[0].start = st->u.keeper.val;
3223 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3228 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3230 if (PL_regeol - locinput > 1)
3234 if (PL_regeol != locinput)
3238 if (!nextchr && locinput >= PL_regeol)
3241 locinput += PL_utf8skip[nextchr];
3242 if (locinput > PL_regeol)
3244 nextchr = UCHARAT(locinput);
3247 nextchr = UCHARAT(++locinput);
3250 if (!nextchr && locinput >= PL_regeol)
3252 nextchr = UCHARAT(++locinput);
3255 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3258 locinput += PL_utf8skip[nextchr];
3259 if (locinput > PL_regeol)
3261 nextchr = UCHARAT(locinput);
3264 nextchr = UCHARAT(++locinput);
3268 #define ST st->u.trie
3270 /* In this case the charclass data is available inline so
3271 we can fail fast without a lot of extra overhead.
3273 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3275 PerlIO_printf(Perl_debug_log,
3276 "%*s %sfailed to match trie start class...%s\n",
3277 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3284 /* the basic plan of execution of the trie is:
3285 * At the beginning, run though all the states, and
3286 * find the longest-matching word. Also remember the position
3287 * of the shortest matching word. For example, this pattern:
3290 * when matched against the string "abcde", will generate
3291 * accept states for all words except 3, with the longest
3292 * matching word being 4, and the shortest being 1 (with
3293 * the position being after char 1 of the string).
3295 * Then for each matching word, in word order (i.e. 1,2,4,5),
3296 * we run the remainder of the pattern; on each try setting
3297 * the current position to the character following the word,
3298 * returning to try the next word on failure.
3300 * We avoid having to build a list of words at runtime by
3301 * using a compile-time structure, wordinfo[].prev, which
3302 * gives, for each word, the previous accepting word (if any).
3303 * In the case above it would contain the mappings 1->2, 2->0,
3304 * 3->0, 4->5, 5->1. We can use this table to generate, from
3305 * the longest word (4 above), a list of all words, by
3306 * following the list of prev pointers; this gives us the
3307 * unordered list 4,5,1,2. Then given the current word we have
3308 * just tried, we can go through the list and find the
3309 * next-biggest word to try (so if we just failed on word 2,
3310 * the next in the list is 4).
3312 * Since at runtime we don't record the matching position in
3313 * the string for each word, we have to work that out for
3314 * each word we're about to process. The wordinfo table holds
3315 * the character length of each word; given that we recorded
3316 * at the start: the position of the shortest word and its
3317 * length in chars, we just need to move the pointer the
3318 * difference between the two char lengths. Depending on
3319 * Unicode status and folding, that's cheap or expensive.
3321 * This algorithm is optimised for the case where are only a
3322 * small number of accept states, i.e. 0,1, or maybe 2.
3323 * With lots of accepts states, and having to try all of them,
3324 * it becomes quadratic on number of accept states to find all
3329 /* what type of TRIE am I? (utf8 makes this contextual) */
3330 DECL_TRIE_TYPE(scan);
3332 /* what trie are we using right now */
3333 reg_trie_data * const trie
3334 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3335 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3336 U32 state = trie->startstate;
3338 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3339 if (trie->states[ state ].wordnum) {
3341 PerlIO_printf(Perl_debug_log,
3342 "%*s %smatched empty string...%s\n",
3343 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3349 PerlIO_printf(Perl_debug_log,
3350 "%*s %sfailed to match trie start class...%s\n",
3351 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3358 U8 *uc = ( U8* )locinput;
3362 U8 *uscan = (U8*)NULL;
3363 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3364 U32 charcount = 0; /* how many input chars we have matched */
3365 U32 accepted = 0; /* have we seen any accepting states? */
3368 ST.jump = trie->jump;
3371 ST.longfold = FALSE; /* char longer if folded => it's harder */
3374 /* fully traverse the TRIE; note the position of the
3375 shortest accept state and the wordnum of the longest
3378 while ( state && uc <= (U8*)PL_regeol ) {
3379 U32 base = trie->states[ state ].trans.base;
3383 wordnum = trie->states[ state ].wordnum;
3385 if (wordnum) { /* it's an accept state */
3388 /* record first match position */
3390 ST.firstpos = (U8*)locinput;
3395 ST.firstchars = charcount;
3398 if (!ST.nextword || wordnum < ST.nextword)
3399 ST.nextword = wordnum;
3400 ST.topword = wordnum;
3403 DEBUG_TRIE_EXECUTE_r({
3404 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3405 PerlIO_printf( Perl_debug_log,
3406 "%*s %sState: %4"UVxf" Accepted: %c ",
3407 2+depth * 2, "", PL_colors[4],
3408 (UV)state, (accepted ? 'Y' : 'N'));
3411 /* read a char and goto next state */
3414 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3415 uscan, len, uvc, charid, foldlen,
3422 base + charid - 1 - trie->uniquecharcount)) >= 0)
3424 && ((U32)offset < trie->lasttrans)
3425 && trie->trans[offset].check == state)
3427 state = trie->trans[offset].next;
3438 DEBUG_TRIE_EXECUTE_r(
3439 PerlIO_printf( Perl_debug_log,
3440 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3441 charid, uvc, (UV)state, PL_colors[5] );
3447 /* calculate total number of accept states */
3452 w = trie->wordinfo[w].prev;
3455 ST.accepted = accepted;
3459 PerlIO_printf( Perl_debug_log,
3460 "%*s %sgot %"IVdf" possible matches%s\n",
3461 REPORT_CODE_OFF + depth * 2, "",
3462 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3464 goto trie_first_try; /* jump into the fail handler */
3468 case TRIE_next_fail: /* we failed - try next alternative */
3470 REGCP_UNWIND(ST.cp);
3471 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3472 PL_regoffs[n].end = -1;
3473 *PL_reglastparen = n;
3475 if (!--ST.accepted) {
3477 PerlIO_printf( Perl_debug_log,
3478 "%*s %sTRIE failed...%s\n",
3479 REPORT_CODE_OFF+depth*2, "",
3486 /* Find next-highest word to process. Note that this code
3487 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3488 register U16 min = 0;
3490 register U16 const nextword = ST.nextword;
3491 register reg_trie_wordinfo * const wordinfo
3492 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3493 for (word=ST.topword; word; word=wordinfo[word].prev) {
3494 if (word > nextword && (!min || word < min))
3507 ST.lastparen = *PL_reglastparen;
3511 /* find start char of end of current word */
3513 U32 chars; /* how many chars to skip */
3514 U8 *uc = ST.firstpos;
3515 reg_trie_data * const trie
3516 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3518 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3520 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3524 /* the hard option - fold each char in turn and find
3525 * its folded length (which may be different */
3526 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3534 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3542 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3547 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3561 PL_reginput = (char *)uc;
3564 scan = (ST.jump && ST.jump[ST.nextword])
3565 ? ST.me + ST.jump[ST.nextword]
3569 PerlIO_printf( Perl_debug_log,
3570 "%*s %sTRIE matched word #%d, continuing%s\n",
3571 REPORT_CODE_OFF+depth*2, "",
3578 if (ST.accepted > 1 || has_cutgroup) {
3579 PUSH_STATE_GOTO(TRIE_next, scan);
3582 /* only one choice left - just continue */
3584 AV *const trie_words
3585 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3586 SV ** const tmp = av_fetch( trie_words,
3588 SV *sv= tmp ? sv_newmortal() : NULL;
3590 PerlIO_printf( Perl_debug_log,
3591 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3592 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3594 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3595 PL_colors[0], PL_colors[1],
3596 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3598 : "not compiled under -Dr",
3602 locinput = PL_reginput;
3603 nextchr = UCHARAT(locinput);
3604 continue; /* execute rest of RE */
3609 char *s = STRING(scan);
3611 if (utf8_target != UTF_PATTERN) {
3612 /* The target and the pattern have differing utf8ness. */
3614 const char * const e = s + ln;
3617 /* The target is utf8, the pattern is not utf8. */
3622 if (NATIVE_TO_UNI(*(U8*)s) !=
3623 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3631 /* The target is not utf8, the pattern is utf8. */
3636 if (NATIVE_TO_UNI(*((U8*)l)) !=
3637 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3645 nextchr = UCHARAT(locinput);
3648 /* The target and the pattern have the same utf8ness. */
3649 /* Inline the first character, for speed. */
3650 if (UCHARAT(s) != nextchr)
3652 if (PL_regeol - locinput < ln)
3654 if (ln > 1 && memNE(s, locinput, ln))
3657 nextchr = UCHARAT(locinput);
3662 const U8 * fold_array;
3664 U32 fold_utf8_flags;
3666 PL_reg_flags |= RF_tainted;
3667 folder = foldEQ_locale;
3668 fold_array = PL_fold_locale;
3669 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3673 case EXACTFU_TRICKYFOLD:
3675 folder = foldEQ_latin1;
3676 fold_array = PL_fold_latin1;
3677 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3681 folder = foldEQ_latin1;
3682 fold_array = PL_fold_latin1;
3683 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3688 fold_array = PL_fold;
3689 fold_utf8_flags = 0;
3695 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3696 /* Either target or the pattern are utf8, or has the issue where
3697 * the fold lengths may differ. */
3698 const char * const l = locinput;
3699 char *e = PL_regeol;
3701 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
3702 l, &e, 0, utf8_target, fold_utf8_flags))
3707 nextchr = UCHARAT(locinput);
3711 /* Neither the target nor the pattern are utf8 */
3712 if (UCHARAT(s) != nextchr &&
3713 UCHARAT(s) != fold_array[nextchr])
3717 if (PL_regeol - locinput < ln)
3719 if (ln > 1 && ! folder(s, locinput, ln))
3722 nextchr = UCHARAT(locinput);
3726 /* XXX Could improve efficiency by separating these all out using a
3727 * macro or in-line function. At that point regcomp.c would no longer
3728 * have to set the FLAGS fields of these */
3731 PL_reg_flags |= RF_tainted;
3739 /* was last char in word? */
3741 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3742 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3744 if (locinput == PL_bostr)
3747 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3749 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3751 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3752 ln = isALNUM_uni(ln);
3753 LOAD_UTF8_CHARCLASS_ALNUM();
3754 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3757 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3758 n = isALNUM_LC_utf8((U8*)locinput);
3763 /* Here the string isn't utf8, or is utf8 and only ascii
3764 * characters are to match \w. In the latter case looking at
3765 * the byte just prior to the current one may be just the final
3766 * byte of a multi-byte character. This is ok. There are two
3768 * 1) it is a single byte character, and then the test is doing
3769 * just what it's supposed to.
3770 * 2) it is a multi-byte character, in which case the final
3771 * byte is never mistakable for ASCII, and so the test
3772 * will say it is not a word character, which is the
3773 * correct answer. */
3774 ln = (locinput != PL_bostr) ?
3775 UCHARAT(locinput - 1) : '\n';
3776 switch (FLAGS(scan)) {
3777 case REGEX_UNICODE_CHARSET:
3778 ln = isWORDCHAR_L1(ln);
3779 n = isWORDCHAR_L1(nextchr);
3781 case REGEX_LOCALE_CHARSET:
3782 ln = isALNUM_LC(ln);
3783 n = isALNUM_LC(nextchr);
3785 case REGEX_DEPENDS_CHARSET:
3787 n = isALNUM(nextchr);
3789 case REGEX_ASCII_RESTRICTED_CHARSET:
3790 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3791 ln = isWORDCHAR_A(ln);
3792 n = isWORDCHAR_A(nextchr);
3795 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3799 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3801 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3806 if (utf8_target || state_num == ANYOFV) {
3807 STRLEN inclasslen = PL_regeol - locinput;
3808 if (locinput >= PL_regeol)
3811 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3813 locinput += inclasslen;
3814 nextchr = UCHARAT(locinput);
3819 nextchr = UCHARAT(locinput);
3820 if (!nextchr && locinput >= PL_regeol)
3822 if (!REGINCLASS(rex, scan, (U8*)locinput))
3824 nextchr = UCHARAT(++locinput);
3828 /* Special char classes - The defines start on line 129 or so */
3829 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3830 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
3831 ALNUMU, NALNUMU, isWORDCHAR_L1,
3832 ALNUMA, NALNUMA, isWORDCHAR_A,
3835 CCC_TRY_U(SPACE, NSPACE, isSPACE,
3836 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
3837 SPACEU, NSPACEU, isSPACE_L1,
3838 SPACEA, NSPACEA, isSPACE_A,
3841 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
3842 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
3843 DIGITA, NDIGITA, isDIGIT_A,
3846 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3847 a Unicode extended Grapheme Cluster */
3848 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3849 extended Grapheme Cluster is: