5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
86 #define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
89 static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
93 /* At least one required character in the target string is expressible only in
95 static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
99 /* Returns a boolean as to whether the input unsigned number is a power of 2
100 * (2**0, 2**1, etc). In other words if it has just a single bit set.
101 * If not, subtracting 1 would leave the uppermost bit set, so the & would
103 #define isPOWER_OF_2(n) ((n & (n-1)) == 0)
105 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
106 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
110 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
113 #define STATIC static
116 /* Valid only if 'c', the character being looke-up, is an invariant under
117 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
118 * everything matchable is straight forward in the bitmap */
119 #define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
120 ? reginclass(prog,p,c,c+1,u) \
121 : ANYOF_BITMAP_TEST(p,*(c)))
127 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
129 #define HOPc(pos,off) \
130 (char *)(reginfo->is_utf8_target \
131 ? reghop3((U8*)pos, off, \
132 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
135 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
136 #define HOPBACK3(pos, off, lim) \
137 (reginfo->is_utf8_target \
138 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
139 : (pos - off >= lim) \
143 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
145 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
146 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
148 /* lim must be +ve. Returns NULL on overshoot */
149 #define HOPMAYBE3(pos,off,lim) \
150 (reginfo->is_utf8_target \
151 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
152 : ((U8*)pos + off <= lim) \
156 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
157 * off must be >=0; args should be vars rather than expressions */
158 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
159 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
160 : (U8*)((pos + off) > lim ? lim : (pos + off)))
161 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
163 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
164 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
166 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
168 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
169 #define NEXTCHR_IS_EOS (nextchr < 0)
171 #define SET_nextchr \
172 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
174 #define SET_locinput(p) \
178 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
179 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
181 /* for use after a quantifier and before an EXACT-like node -- japhy */
182 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
184 * NOTE that *nothing* that affects backtracking should be in here, specifically
185 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
186 * node that is in between two EXACT like nodes when ascertaining what the required
187 * "follow" character is. This should probably be moved to regex compile time
188 * although it may be done at run time beause of the REF possibility - more
189 * investigation required. -- demerphq
191 #define JUMPABLE(rn) ( \
193 (OP(rn) == CLOSE && \
194 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
196 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
197 OP(rn) == PLUS || OP(rn) == MINMOD || \
199 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
201 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
203 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
206 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
207 we don't need this definition. XXX These are now out-of-sync*/
208 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
209 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFAA || OP(rn)==EXACTFAA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
210 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
213 /* ... so we use this as its faster. */
214 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
215 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFAA || OP(rn) == EXACTFAA_NO_TRIE)
216 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
217 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
222 Search for mandatory following text node; for lookahead, the text must
223 follow but for lookbehind (rn->flags != 0) we skip to the next step.
225 #define FIND_NEXT_IMPT(rn) STMT_START { \
226 while (JUMPABLE(rn)) { \
227 const OPCODE type = OP(rn); \
228 if (type == SUSPEND || PL_regkind[type] == CURLY) \
229 rn = NEXTOPER(NEXTOPER(rn)); \
230 else if (type == PLUS) \
232 else if (type == IFMATCH) \
233 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
234 else rn += NEXT_OFF(rn); \
238 #define SLAB_FIRST(s) (&(s)->states[0])
239 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
241 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
242 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
243 static regmatch_state * S_push_slab(pTHX);
245 #define REGCP_PAREN_ELEMS 3
246 #define REGCP_OTHER_ELEMS 3
247 #define REGCP_FRAME_ELEMS 1
248 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
249 * are needed for the regexp context stack bookkeeping. */
252 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
254 const int retval = PL_savestack_ix;
255 const int paren_elems_to_push =
256 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
257 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
258 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
260 GET_RE_DEBUG_FLAGS_DECL;
262 PERL_ARGS_ASSERT_REGCPPUSH;
264 if (paren_elems_to_push < 0)
265 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
266 (int)paren_elems_to_push, (int)maxopenparen,
267 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
269 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
270 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
271 " out of range (%lu-%ld)",
273 (unsigned long)maxopenparen,
276 SSGROW(total_elems + REGCP_FRAME_ELEMS);
279 if ((int)maxopenparen > (int)parenfloor)
280 Perl_re_exec_indentf( aTHX_
281 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
287 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
288 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
289 SSPUSHIV(rex->offs[p].end);
290 SSPUSHIV(rex->offs[p].start);
291 SSPUSHINT(rex->offs[p].start_tmp);
292 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
293 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
296 (IV)rex->offs[p].start,
297 (IV)rex->offs[p].start_tmp,
301 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
302 SSPUSHINT(maxopenparen);
303 SSPUSHINT(rex->lastparen);
304 SSPUSHINT(rex->lastcloseparen);
305 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
310 /* These are needed since we do not localize EVAL nodes: */
311 #define REGCP_SET(cp) \
313 Perl_re_exec_indentf( aTHX_ \
314 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
315 depth, (IV)PL_savestack_ix \
320 #define REGCP_UNWIND(cp) \
322 if (cp != PL_savestack_ix) \
323 Perl_re_exec_indentf( aTHX_ \
324 "Clearing an EVAL scope, savestack=%" \
325 IVdf "..%" IVdf "\n", \
326 depth, (IV)(cp), (IV)PL_savestack_ix \
331 /* set the start and end positions of capture ix */
332 #define CLOSE_CAPTURE(ix, s, e) \
333 rex->offs[ix].start = s; \
334 rex->offs[ix].end = e; \
335 if (ix > rex->lastparen) \
336 rex->lastparen = ix; \
337 rex->lastcloseparen = ix; \
338 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
339 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
344 (IV)rex->offs[ix].start, \
345 (IV)rex->offs[ix].end, \
349 #define UNWIND_PAREN(lp, lcp) \
350 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
351 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
356 (UV)(rex->lastparen), \
359 for (n = rex->lastparen; n > lp; n--) \
360 rex->offs[n].end = -1; \
361 rex->lastparen = n; \
362 rex->lastcloseparen = lcp;
366 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
370 GET_RE_DEBUG_FLAGS_DECL;
372 PERL_ARGS_ASSERT_REGCPPOP;
374 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
376 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
377 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
378 rex->lastcloseparen = SSPOPINT;
379 rex->lastparen = SSPOPINT;
380 *maxopenparen_p = SSPOPINT;
382 i -= REGCP_OTHER_ELEMS;
383 /* Now restore the parentheses context. */
385 if (i || rex->lastparen + 1 <= rex->nparens)
386 Perl_re_exec_indentf( aTHX_
387 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
393 paren = *maxopenparen_p;
394 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
396 rex->offs[paren].start_tmp = SSPOPINT;
397 rex->offs[paren].start = SSPOPIV;
399 if (paren <= rex->lastparen)
400 rex->offs[paren].end = tmps;
401 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
402 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
405 (IV)rex->offs[paren].start,
406 (IV)rex->offs[paren].start_tmp,
407 (IV)rex->offs[paren].end,
408 (paren > rex->lastparen ? "(skipped)" : ""));
413 /* It would seem that the similar code in regtry()
414 * already takes care of this, and in fact it is in
415 * a better location to since this code can #if 0-ed out
416 * but the code in regtry() is needed or otherwise tests
417 * requiring null fields (pat.t#187 and split.t#{13,14}
418 * (as of patchlevel 7877) will fail. Then again,
419 * this code seems to be necessary or otherwise
420 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
421 * --jhi updated by dapm */
422 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
423 if (i > *maxopenparen_p)
424 rex->offs[i].start = -1;
425 rex->offs[i].end = -1;
426 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
427 " \\%" UVuf ": %s ..-1 undeffing\n",
430 (i > *maxopenparen_p) ? "-1" : " "
436 /* restore the parens and associated vars at savestack position ix,
437 * but without popping the stack */
440 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
442 I32 tmpix = PL_savestack_ix;
443 PERL_ARGS_ASSERT_REGCP_RESTORE;
445 PL_savestack_ix = ix;
446 regcppop(rex, maxopenparen_p);
447 PL_savestack_ix = tmpix;
450 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
452 #ifndef PERL_IN_XSUB_RE
455 Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
457 /* Returns a boolean as to whether or not 'character' is a member of the
458 * Posix character class given by 'classnum' that should be equivalent to a
459 * value in the typedef '_char_class_number'.
461 * Ideally this could be replaced by a just an array of function pointers
462 * to the C library functions that implement the macros this calls.
463 * However, to compile, the precise function signatures are required, and
464 * these may vary from platform to to platform. To avoid having to figure
465 * out what those all are on each platform, I (khw) am using this method,
466 * which adds an extra layer of function call overhead (unless the C
467 * optimizer strips it away). But we don't particularly care about
468 * performance with locales anyway. */
470 switch ((_char_class_number) classnum) {
471 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
472 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
473 case _CC_ENUM_ASCII: return isASCII_LC(character);
474 case _CC_ENUM_BLANK: return isBLANK_LC(character);
475 case _CC_ENUM_CASED: return isLOWER_LC(character)
476 || isUPPER_LC(character);
477 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
478 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
479 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
480 case _CC_ENUM_LOWER: return isLOWER_LC(character);
481 case _CC_ENUM_PRINT: return isPRINT_LC(character);
482 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
483 case _CC_ENUM_SPACE: return isSPACE_LC(character);
484 case _CC_ENUM_UPPER: return isUPPER_LC(character);
485 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
486 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
487 default: /* VERTSPACE should never occur in locales */
488 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
491 NOT_REACHED; /* NOTREACHED */
498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501 * 'character' is a member of the Posix character class given by 'classnum'
502 * that should be equivalent to a value in the typedef
503 * '_char_class_number'.
505 * This just calls isFOO_lc on the code point for the character if it is in
506 * the range 0-255. Outside that range, all characters use Unicode
507 * rules, ignoring any locale. So use the Unicode function if this class
508 * requires a swash, and use the Unicode macro otherwise. */
510 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
512 if (UTF8_IS_INVARIANT(*character)) {
513 return isFOO_lc(classnum, *character);
515 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
516 return isFOO_lc(classnum,
517 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
520 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
522 switch ((_char_class_number) classnum) {
523 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
524 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
525 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
526 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
528 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
529 utf8_to_uvchr_buf(character, e, NULL));
532 return FALSE; /* Things like CNTRL are always below 256 */
536 S_find_next_ascii(char * s, const char * send, const bool utf8_target)
538 /* Returns the position of the first ASCII byte in the sequence between 's'
539 * and 'send-1' inclusive; returns 'send' if none found */
541 PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
545 if ((STRLEN) (send - s) >= PERL_WORDSIZE
547 /* This term is wordsize if subword; 0 if not */
548 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
551 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
554 /* Process per-byte until reach word boundary. XXX This loop could be
555 * eliminated if we knew that this platform had fast unaligned reads */
556 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
560 s++; /* khw didn't bother creating a separate loop for
564 /* Here, we know we have at least one full word to process. Process
565 * per-word as long as we have at least a full word left */
567 PERL_UINTMAX_T complemented = ~ * (PERL_UINTMAX_T *) s;
568 if (complemented & PERL_VARIANTS_WORD_MASK) {
570 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
571 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
573 s += _variant_byte_number(complemented);
576 # else /* If weird byte order, drop into next loop to do byte-at-a-time
585 } while (s + PERL_WORDSIZE <= send);
590 /* Process per-character */
612 S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
614 /* Returns the position of the first non-ASCII byte in the sequence between
615 * 's' and 'send-1' inclusive; returns 'send' if none found */
619 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
623 if ( ! isASCII(*s)) {
631 if ( ! isASCII(*s)) {
642 const U8 * next_non_ascii = NULL;
644 PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
645 PERL_UNUSED_ARG(utf8_target);
647 /* On ASCII platforms invariants and ASCII are identical, so if the string
648 * is entirely invariants, there is no non-ASCII character */
649 return (is_utf8_invariant_string_loc((U8 *) s,
653 : (char *) next_non_ascii;
660 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
662 /* Returns the position of the first byte in the sequence between 's' and
663 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
666 PERL_ARGS_ASSERT_FIND_SPAN_END;
670 if ((STRLEN) (send - s) >= PERL_WORDSIZE
671 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
672 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
674 PERL_UINTMAX_T span_word;
676 /* Process per-byte until reach word boundary. XXX This loop could be
677 * eliminated if we knew that this platform had fast unaligned reads */
678 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
679 if (*s != span_byte) {
685 /* Create a word filled with the bytes we are spanning */
686 span_word = PERL_COUNT_MULTIPLIER * span_byte;
688 /* Process per-word as long as we have at least a full word left */
691 /* Keep going if the whole word is composed of 'span_byte's */
692 if ((* (PERL_UINTMAX_T *) s) == span_word) {
697 /* Here, at least one byte in the word isn't 'span_byte'. */
705 /* This xor leaves 1 bits only in those non-matching bytes */
706 span_word ^= * (PERL_UINTMAX_T *) s;
708 /* Make sure the upper bit of each non-matching byte is set. This
709 * makes each such byte look like an ASCII platform variant byte */
710 span_word |= span_word << 1;
711 span_word |= span_word << 2;
712 span_word |= span_word << 4;
714 /* That reduces the problem to what this function solves */
715 return s + _variant_byte_number(span_word);
719 } while (s + PERL_WORDSIZE <= send);
722 /* Process the straggler bytes beyond the final word boundary */
724 if (*s != span_byte) {
734 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
736 /* Returns the position of the first byte in the sequence between 's'
737 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
738 * returns 'send' if none found. It uses word-level operations instead of
739 * byte to speed up the process */
741 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
744 assert((byte & mask) == byte);
748 if ((STRLEN) (send - s) >= PERL_WORDSIZE
749 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
750 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
752 PERL_UINTMAX_T word_complemented, mask_word;
754 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
755 if (((*s) & mask) == byte) {
761 word_complemented = ~ (PERL_COUNT_MULTIPLIER * byte);
762 mask_word = PERL_COUNT_MULTIPLIER * mask;
765 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
767 /* If 'masked' contains 'byte' within it, anding with the
768 * complement will leave those 8 bits 0 */
769 masked &= word_complemented;
771 /* This causes the most significant bit to be set to 1 for any
772 * bytes in the word that aren't completely 0 */
773 masked |= masked << 1;
774 masked |= masked << 2;
775 masked |= masked << 4;
777 /* The msbits are the same as what marks a byte as variant, so we
778 * can use this mask. If all msbits are 1, the word doesn't
780 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
785 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
786 * and any that are, are 0. Complement and re-AND to swap that */
788 masked &= PERL_VARIANTS_WORD_MASK;
790 /* This reduces the problem to that solved by this function */
791 s += _variant_byte_number(masked);
794 } while (s + PERL_WORDSIZE <= send);
800 if (((*s) & mask) == byte) {
810 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
812 /* Returns the position of the first byte in the sequence between 's' and
813 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
814 * 'span_byte' should have been ANDed with 'mask' in the call of this
815 * function. Returns 'send' if none found. Works like find_span_end(),
816 * except for the AND */
818 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
821 assert((span_byte & mask) == span_byte);
823 if ((STRLEN) (send - s) >= PERL_WORDSIZE
824 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
825 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
827 PERL_UINTMAX_T span_word, mask_word;
829 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
830 if (((*s) & mask) != span_byte) {
836 span_word = PERL_COUNT_MULTIPLIER * span_byte;
837 mask_word = PERL_COUNT_MULTIPLIER * mask;
840 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
842 if (masked == span_word) {
854 masked |= masked << 1;
855 masked |= masked << 2;
856 masked |= masked << 4;
857 return s + _variant_byte_number(masked);
861 } while (s + PERL_WORDSIZE <= send);
865 if (((*s) & mask) != span_byte) {
875 * pregexec and friends
878 #ifndef PERL_IN_XSUB_RE
880 - pregexec - match a regexp against a string
883 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
884 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
885 /* stringarg: the point in the string at which to begin matching */
886 /* strend: pointer to null at end of string */
887 /* strbeg: real beginning of string */
888 /* minend: end of match must be >= minend bytes after stringarg. */
889 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
890 * itself is accessed via the pointers above */
891 /* nosave: For optimizations. */
893 PERL_ARGS_ASSERT_PREGEXEC;
896 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
897 nosave ? 0 : REXEC_COPY_STR);
903 /* re_intuit_start():
905 * Based on some optimiser hints, try to find the earliest position in the
906 * string where the regex could match.
908 * rx: the regex to match against
909 * sv: the SV being matched: only used for utf8 flag; the string
910 * itself is accessed via the pointers below. Note that on
911 * something like an overloaded SV, SvPOK(sv) may be false
912 * and the string pointers may point to something unrelated to
914 * strbeg: real beginning of string
915 * strpos: the point in the string at which to begin matching
916 * strend: pointer to the byte following the last char of the string
917 * flags currently unused; set to 0
918 * data: currently unused; set to NULL
920 * The basic idea of re_intuit_start() is to use some known information
921 * about the pattern, namely:
923 * a) the longest known anchored substring (i.e. one that's at a
924 * constant offset from the beginning of the pattern; but not
925 * necessarily at a fixed offset from the beginning of the
927 * b) the longest floating substring (i.e. one that's not at a constant
928 * offset from the beginning of the pattern);
929 * c) Whether the pattern is anchored to the string; either
930 * an absolute anchor: /^../, or anchored to \n: /^.../m,
931 * or anchored to pos(): /\G/;
932 * d) A start class: a real or synthetic character class which
933 * represents which characters are legal at the start of the pattern;
935 * to either quickly reject the match, or to find the earliest position
936 * within the string at which the pattern might match, thus avoiding
937 * running the full NFA engine at those earlier locations, only to
938 * eventually fail and retry further along.
940 * Returns NULL if the pattern can't match, or returns the address within
941 * the string which is the earliest place the match could occur.
943 * The longest of the anchored and floating substrings is called 'check'
944 * and is checked first. The other is called 'other' and is checked
945 * second. The 'other' substring may not be present. For example,
947 * /(abc|xyz)ABC\d{0,3}DEFG/
951 * check substr (float) = "DEFG", offset 6..9 chars
952 * other substr (anchored) = "ABC", offset 3..3 chars
955 * Be aware that during the course of this function, sometimes 'anchored'
956 * refers to a substring being anchored relative to the start of the
957 * pattern, and sometimes to the pattern itself being anchored relative to
958 * the string. For example:
960 * /\dabc/: "abc" is anchored to the pattern;
961 * /^\dabc/: "abc" is anchored to the pattern and the string;
962 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
963 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
964 * but the pattern is anchored to the string.
968 Perl_re_intuit_start(pTHX_
971 const char * const strbeg,
975 re_scream_pos_data *data)
977 struct regexp *const prog = ReANY(rx);
978 SSize_t start_shift = prog->check_offset_min;
979 /* Should be nonnegative! */
980 SSize_t end_shift = 0;
981 /* current lowest pos in string where the regex can start matching */
982 char *rx_origin = strpos;
984 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
985 U8 other_ix = 1 - prog->substrs->check_ix;
987 char *other_last = strpos;/* latest pos 'other' substr already checked to */
988 char *check_at = NULL; /* check substr found at this pos */
989 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
990 RXi_GET_DECL(prog,progi);
991 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
992 regmatch_info *const reginfo = ®info_buf;
993 GET_RE_DEBUG_FLAGS_DECL;
995 PERL_ARGS_ASSERT_RE_INTUIT_START;
996 PERL_UNUSED_ARG(flags);
997 PERL_UNUSED_ARG(data);
999 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1000 "Intuit: trying to determine minimum start position...\n"));
1002 /* for now, assume that all substr offsets are positive. If at some point
1003 * in the future someone wants to do clever things with lookbehind and
1004 * -ve offsets, they'll need to fix up any code in this function
1005 * which uses these offsets. See the thread beginning
1006 * <20140113145929.GF27210@iabyn.com>
1008 assert(prog->substrs->data[0].min_offset >= 0);
1009 assert(prog->substrs->data[0].max_offset >= 0);
1010 assert(prog->substrs->data[1].min_offset >= 0);
1011 assert(prog->substrs->data[1].max_offset >= 0);
1012 assert(prog->substrs->data[2].min_offset >= 0);
1013 assert(prog->substrs->data[2].max_offset >= 0);
1015 /* for now, assume that if both present, that the floating substring
1016 * doesn't start before the anchored substring.
1017 * If you break this assumption (e.g. doing better optimisations
1018 * with lookahead/behind), then you'll need to audit the code in this
1019 * function carefully first
1022 ! ( (prog->anchored_utf8 || prog->anchored_substr)
1023 && (prog->float_utf8 || prog->float_substr))
1024 || (prog->float_min_offset >= prog->anchored_offset));
1026 /* byte rather than char calculation for efficiency. It fails
1027 * to quickly reject some cases that can't match, but will reject
1028 * them later after doing full char arithmetic */
1029 if (prog->minlen > strend - strpos) {
1030 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1031 " String too short...\n"));
1035 RXp_MATCH_UTF8_set(prog, utf8_target);
1036 reginfo->is_utf8_target = cBOOL(utf8_target);
1037 reginfo->info_aux = NULL;
1038 reginfo->strbeg = strbeg;
1039 reginfo->strend = strend;
1040 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
1041 reginfo->intuit = 1;
1042 /* not actually used within intuit, but zero for safety anyway */
1043 reginfo->poscache_maxiter = 0;
1046 if ((!prog->anchored_utf8 && prog->anchored_substr)
1047 || (!prog->float_utf8 && prog->float_substr))
1048 to_utf8_substr(prog);
1049 check = prog->check_utf8;
1051 if (!prog->check_substr && prog->check_utf8) {
1052 if (! to_byte_substr(prog)) {
1053 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1056 check = prog->check_substr;
1059 /* dump the various substring data */
1060 DEBUG_OPTIMISE_MORE_r({
1062 for (i=0; i<=2; i++) {
1063 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
1064 : prog->substrs->data[i].substr);
1068 Perl_re_printf( aTHX_
1069 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
1070 " useful=%" IVdf " utf8=%d [%s]\n",
1072 (IV)prog->substrs->data[i].min_offset,
1073 (IV)prog->substrs->data[i].max_offset,
1074 (IV)prog->substrs->data[i].end_shift,
1076 utf8_target ? 1 : 0,
1081 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
1083 /* ml_anch: check after \n?
1085 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
1086 * with /.*.../, these flags will have been added by the
1088 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
1089 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
1091 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
1092 && !(prog->intflags & PREGf_IMPLICIT);
1094 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
1095 /* we are only allowed to match at BOS or \G */
1097 /* trivially reject if there's a BOS anchor and we're not at BOS.
1099 * Note that we don't try to do a similar quick reject for
1100 * \G, since generally the caller will have calculated strpos
1101 * based on pos() and gofs, so the string is already correctly
1102 * anchored by definition; and handling the exceptions would
1103 * be too fiddly (e.g. REXEC_IGNOREPOS).
1105 if ( strpos != strbeg
1106 && (prog->intflags & PREGf_ANCH_SBOL))
1108 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1109 " Not at start...\n"));
1113 /* in the presence of an anchor, the anchored (relative to the
1114 * start of the regex) substr must also be anchored relative
1115 * to strpos. So quickly reject if substr isn't found there.
1116 * This works for \G too, because the caller will already have
1117 * subtracted gofs from pos, and gofs is the offset from the
1118 * \G to the start of the regex. For example, in /.abc\Gdef/,
1119 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1120 * caller will have set strpos=pos()-4; we look for the substr
1121 * at position pos()-4+1, which lines up with the "a" */
1123 if (prog->check_offset_min == prog->check_offset_max) {
1124 /* Substring at constant offset from beg-of-str... */
1125 SSize_t slen = SvCUR(check);
1126 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1128 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1129 " Looking for check substr at fixed offset %" IVdf "...\n",
1130 (IV)prog->check_offset_min));
1132 if (SvTAIL(check)) {
1133 /* In this case, the regex is anchored at the end too.
1134 * Unless it's a multiline match, the lengths must match
1135 * exactly, give or take a \n. NB: slen >= 1 since
1136 * the last char of check is \n */
1138 && ( strend - s > slen
1139 || strend - s < slen - 1
1140 || (strend - s == slen && strend[-1] != '\n')))
1142 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1143 " String too long...\n"));
1146 /* Now should match s[0..slen-2] */
1149 if (slen && (strend - s < slen
1150 || *SvPVX_const(check) != *s
1151 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1153 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1154 " String not equal...\n"));
1159 goto success_at_start;
1164 end_shift = prog->check_end_shift;
1166 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1168 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1169 (IV)end_shift, RX_PRECOMP(rx));
1174 /* This is the (re)entry point of the main loop in this function.
1175 * The goal of this loop is to:
1176 * 1) find the "check" substring in the region rx_origin..strend
1177 * (adjusted by start_shift / end_shift). If not found, reject
1179 * 2) If it exists, look for the "other" substr too if defined; for
1180 * example, if the check substr maps to the anchored substr, then
1181 * check the floating substr, and vice-versa. If not found, go
1182 * back to (1) with rx_origin suitably incremented.
1183 * 3) If we find an rx_origin position that doesn't contradict
1184 * either of the substrings, then check the possible additional
1185 * constraints on rx_origin of /^.../m or a known start class.
1186 * If these fail, then depending on which constraints fail, jump
1187 * back to here, or to various other re-entry points further along
1188 * that skip some of the first steps.
1189 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1190 * substring. If the start position was determined to be at the
1191 * beginning of the string - so, not rejected, but not optimised,
1192 * since we have to run regmatch from position 0 - decrement the
1193 * BmUSEFUL() count. Otherwise increment it.
1197 /* first, look for the 'check' substring */
1203 DEBUG_OPTIMISE_MORE_r({
1204 Perl_re_printf( aTHX_
1205 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1206 " Start shift: %" IVdf " End shift %" IVdf
1207 " Real end Shift: %" IVdf "\n",
1208 (IV)(rx_origin - strbeg),
1209 (IV)prog->check_offset_min,
1212 (IV)prog->check_end_shift);
1215 end_point = HOPBACK3(strend, end_shift, rx_origin);
1218 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1223 /* If the regex is absolutely anchored to either the start of the
1224 * string (SBOL) or to pos() (ANCH_GPOS), then
1225 * check_offset_max represents an upper bound on the string where
1226 * the substr could start. For the ANCH_GPOS case, we assume that
1227 * the caller of intuit will have already set strpos to
1228 * pos()-gofs, so in this case strpos + offset_max will still be
1229 * an upper bound on the substr.
1232 && prog->intflags & PREGf_ANCH
1233 && prog->check_offset_max != SSize_t_MAX)
1235 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1236 const char * const anchor =
1237 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1238 SSize_t targ_len = (char*)end_point - anchor;
1240 if (check_len > targ_len) {
1241 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1242 "Target string too short to match required substring...\n"));
1246 /* do a bytes rather than chars comparison. It's conservative;
1247 * so it skips doing the HOP if the result can't possibly end
1248 * up earlier than the old value of end_point.
1250 assert(anchor + check_len <= (char *)end_point);
1251 if (prog->check_offset_max + check_len < targ_len) {
1252 end_point = HOP3lim((U8*)anchor,
1253 prog->check_offset_max,
1254 end_point - check_len
1257 if (end_point < start_point)
1262 check_at = fbm_instr( start_point, end_point,
1263 check, multiline ? FBMrf_MULTILINE : 0);
1265 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1266 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1267 (IV)((char*)start_point - strbeg),
1268 (IV)((char*)end_point - strbeg),
1269 (IV)(check_at ? check_at - strbeg : -1)
1272 /* Update the count-of-usability, remove useless subpatterns,
1276 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1277 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1278 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1279 (check_at ? "Found" : "Did not find"),
1280 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1281 ? "anchored" : "floating"),
1284 (check_at ? " at offset " : "...\n") );
1289 /* set rx_origin to the minimum position where the regex could start
1290 * matching, given the constraint of the just-matched check substring.
1291 * But don't set it lower than previously.
1294 if (check_at - rx_origin > prog->check_offset_max)
1295 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1296 /* Finish the diagnostic message */
1297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298 "%ld (rx_origin now %" IVdf ")...\n",
1299 (long)(check_at - strbeg),
1300 (IV)(rx_origin - strbeg)
1305 /* now look for the 'other' substring if defined */
1307 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
1308 : prog->substrs->data[other_ix].substr)
1310 /* Take into account the "other" substring. */
1314 struct reg_substr_datum *other;
1317 other = &prog->substrs->data[other_ix];
1319 /* if "other" is anchored:
1320 * we've previously found a floating substr starting at check_at.
1321 * This means that the regex origin must lie somewhere
1322 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1323 * and max: HOP3(check_at, -check_offset_min)
1324 * (except that min will be >= strpos)
1325 * So the fixed substr must lie somewhere between
1326 * HOP3(min, anchored_offset)
1327 * HOP3(max, anchored_offset) + SvCUR(substr)
1330 /* if "other" is floating
1331 * Calculate last1, the absolute latest point where the
1332 * floating substr could start in the string, ignoring any
1333 * constraints from the earlier fixed match. It is calculated
1336 * strend - prog->minlen (in chars) is the absolute latest
1337 * position within the string where the origin of the regex
1338 * could appear. The latest start point for the floating
1339 * substr is float_min_offset(*) on from the start of the
1340 * regex. last1 simply combines thee two offsets.
1342 * (*) You might think the latest start point should be
1343 * float_max_offset from the regex origin, and technically
1344 * you'd be correct. However, consider
1346 * Here, float min, max are 3,5 and minlen is 7.
1347 * This can match either
1351 * In the first case, the regex matches minlen chars; in the
1352 * second, minlen+1, in the third, minlen+2.
1353 * In the first case, the floating offset is 3 (which equals
1354 * float_min), in the second, 4, and in the third, 5 (which
1355 * equals float_max). In all cases, the floating string bcd
1356 * can never start more than 4 chars from the end of the
1357 * string, which equals minlen - float_min. As the substring
1358 * starts to match more than float_min from the start of the
1359 * regex, it makes the regex match more than minlen chars,
1360 * and the two cancel each other out. So we can always use
1361 * float_min - minlen, rather than float_max - minlen for the
1362 * latest position in the string.
1364 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1365 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1368 assert(prog->minlen >= other->min_offset);
1369 last1 = HOP3c(strend,
1370 other->min_offset - prog->minlen, strbeg);
1372 if (other_ix) {/* i.e. if (other-is-float) */
1373 /* last is the latest point where the floating substr could
1374 * start, *given* any constraints from the earlier fixed
1375 * match. This constraint is that the floating string starts
1376 * <= float_max_offset chars from the regex origin (rx_origin).
1377 * If this value is less than last1, use it instead.
1379 assert(rx_origin <= last1);
1381 /* this condition handles the offset==infinity case, and
1382 * is a short-cut otherwise. Although it's comparing a
1383 * byte offset to a char length, it does so in a safe way,
1384 * since 1 char always occupies 1 or more bytes,
1385 * so if a string range is (last1 - rx_origin) bytes,
1386 * it will be less than or equal to (last1 - rx_origin)
1387 * chars; meaning it errs towards doing the accurate HOP3
1388 * rather than just using last1 as a short-cut */
1389 (last1 - rx_origin) < other->max_offset
1391 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1394 assert(strpos + start_shift <= check_at);
1395 last = HOP4c(check_at, other->min_offset - start_shift,
1399 s = HOP3c(rx_origin, other->min_offset, strend);
1400 if (s < other_last) /* These positions already checked */
1403 must = utf8_target ? other->utf8_substr : other->substr;
1404 assert(SvPOK(must));
1407 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1413 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1414 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1415 (IV)(from - strbeg),
1421 (unsigned char*)from,
1424 multiline ? FBMrf_MULTILINE : 0
1426 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1427 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1428 (IV)(from - strbeg),
1430 (IV)(s ? s - strbeg : -1)
1436 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1437 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1438 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1439 s ? "Found" : "Contradicts",
1440 other_ix ? "floating" : "anchored",
1441 quoted, RE_SV_TAIL(must));
1446 /* last1 is latest possible substr location. If we didn't
1447 * find it before there, we never will */
1448 if (last >= last1) {
1449 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1450 "; giving up...\n"));
1454 /* try to find the check substr again at a later
1455 * position. Maybe next time we'll find the "other" substr
1457 other_last = HOP3c(last, 1, strend) /* highest failure */;
1459 other_ix /* i.e. if other-is-float */
1460 ? HOP3c(rx_origin, 1, strend)
1461 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1462 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1463 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1464 (other_ix ? "floating" : "anchored"),
1465 (long)(HOP3c(check_at, 1, strend) - strbeg),
1466 (IV)(rx_origin - strbeg)
1471 if (other_ix) { /* if (other-is-float) */
1472 /* other_last is set to s, not s+1, since its possible for
1473 * a floating substr to fail first time, then succeed
1474 * second time at the same floating position; e.g.:
1475 * "-AB--AABZ" =~ /\wAB\d*Z/
1476 * The first time round, anchored and float match at
1477 * "-(AB)--AAB(Z)" then fail on the initial \w character
1478 * class. Second time round, they match at "-AB--A(AB)(Z)".
1483 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1484 other_last = HOP3c(s, 1, strend);
1486 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1487 " at offset %ld (rx_origin now %" IVdf ")...\n",
1489 (IV)(rx_origin - strbeg)
1495 DEBUG_OPTIMISE_MORE_r(
1496 Perl_re_printf( aTHX_
1497 " Check-only match: offset min:%" IVdf " max:%" IVdf
1498 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1499 " strend:%" IVdf "\n",
1500 (IV)prog->check_offset_min,
1501 (IV)prog->check_offset_max,
1502 (IV)(check_at-strbeg),
1503 (IV)(rx_origin-strbeg),
1504 (IV)(rx_origin-check_at),
1510 postprocess_substr_matches:
1512 /* handle the extra constraint of /^.../m if present */
1514 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1517 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1518 " looking for /^/m anchor"));
1520 /* we have failed the constraint of a \n before rx_origin.
1521 * Find the next \n, if any, even if it's beyond the current
1522 * anchored and/or floating substrings. Whether we should be
1523 * scanning ahead for the next \n or the next substr is debatable.
1524 * On the one hand you'd expect rare substrings to appear less
1525 * often than \n's. On the other hand, searching for \n means
1526 * we're effectively flipping between check_substr and "\n" on each
1527 * iteration as the current "rarest" string candidate, which
1528 * means for example that we'll quickly reject the whole string if
1529 * hasn't got a \n, rather than trying every substr position
1533 s = HOP3c(strend, - prog->minlen, strpos);
1534 if (s <= rx_origin ||
1535 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1537 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1538 " Did not find /%s^%s/m...\n",
1539 PL_colors[0], PL_colors[1]));
1543 /* earliest possible origin is 1 char after the \n.
1544 * (since *rx_origin == '\n', it's safe to ++ here rather than
1545 * HOP(rx_origin, 1)) */
1548 if (prog->substrs->check_ix == 0 /* check is anchored */
1549 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1551 /* Position contradicts check-string; either because
1552 * check was anchored (and thus has no wiggle room),
1553 * or check was float and rx_origin is above the float range */
1554 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1555 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1556 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1560 /* if we get here, the check substr must have been float,
1561 * is in range, and we may or may not have had an anchored
1562 * "other" substr which still contradicts */
1563 assert(prog->substrs->check_ix); /* check is float */
1565 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1566 /* whoops, the anchored "other" substr exists, so we still
1567 * contradict. On the other hand, the float "check" substr
1568 * didn't contradict, so just retry the anchored "other"
1570 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1571 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1572 PL_colors[0], PL_colors[1],
1573 (IV)(rx_origin - strbeg + prog->anchored_offset),
1574 (IV)(rx_origin - strbeg)
1576 goto do_other_substr;
1579 /* success: we don't contradict the found floating substring
1580 * (and there's no anchored substr). */
1581 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1582 " Found /%s^%s/m with rx_origin %ld...\n",
1583 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1586 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1587 " (multiline anchor test skipped)\n"));
1593 /* if we have a starting character class, then test that extra constraint.
1594 * (trie stclasses are too expensive to use here, we are better off to
1595 * leave it to regmatch itself) */
1597 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1598 const U8* const str = (U8*)STRING(progi->regstclass);
1600 /* XXX this value could be pre-computed */
1601 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1602 ? (reginfo->is_utf8_pat
1603 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1604 : STR_LEN(progi->regstclass))
1608 /* latest pos that a matching float substr constrains rx start to */
1609 char *rx_max_float = NULL;
1611 /* if the current rx_origin is anchored, either by satisfying an
1612 * anchored substring constraint, or a /^.../m constraint, then we
1613 * can reject the current origin if the start class isn't found
1614 * at the current position. If we have a float-only match, then
1615 * rx_origin is constrained to a range; so look for the start class
1616 * in that range. if neither, then look for the start class in the
1617 * whole rest of the string */
1619 /* XXX DAPM it's not clear what the minlen test is for, and why
1620 * it's not used in the floating case. Nothing in the test suite
1621 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1622 * Here are some old comments, which may or may not be correct:
1624 * minlen == 0 is possible if regstclass is \b or \B,
1625 * and the fixed substr is ''$.
1626 * Since minlen is already taken into account, rx_origin+1 is
1627 * before strend; accidentally, minlen >= 1 guaranties no false
1628 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1629 * 0) below assumes that regstclass does not come from lookahead...
1630 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1631 * This leaves EXACTF-ish only, which are dealt with in
1635 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1636 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1637 else if (prog->float_substr || prog->float_utf8) {
1638 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1639 endpos = HOP3clim(rx_max_float, cl_l, strend);
1644 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1645 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1646 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1647 (IV)start_shift, (IV)(check_at - strbeg),
1648 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1650 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1653 if (endpos == strend) {
1654 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1655 " Could not match STCLASS...\n") );
1658 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1659 " This position contradicts STCLASS...\n") );
1660 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1661 && !(prog->intflags & PREGf_IMPLICIT))
1664 /* Contradict one of substrings */
1665 if (prog->anchored_substr || prog->anchored_utf8) {
1666 if (prog->substrs->check_ix == 1) { /* check is float */
1667 /* Have both, check_string is floating */
1668 assert(rx_origin + start_shift <= check_at);
1669 if (rx_origin + start_shift != check_at) {
1670 /* not at latest position float substr could match:
1671 * Recheck anchored substring, but not floating.
1672 * The condition above is in bytes rather than
1673 * chars for efficiency. It's conservative, in
1674 * that it errs on the side of doing 'goto
1675 * do_other_substr'. In this case, at worst,
1676 * an extra anchored search may get done, but in
1677 * practice the extra fbm_instr() is likely to
1678 * get skipped anyway. */
1679 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1680 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1681 (long)(other_last - strbeg),
1682 (IV)(rx_origin - strbeg)
1684 goto do_other_substr;
1692 /* In the presence of ml_anch, we might be able to
1693 * find another \n without breaking the current float
1696 /* strictly speaking this should be HOP3c(..., 1, ...),
1697 * but since we goto a block of code that's going to
1698 * search for the next \n if any, its safe here */
1700 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1701 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1702 PL_colors[0], PL_colors[1],
1703 (long)(rx_origin - strbeg)) );
1704 goto postprocess_substr_matches;
1707 /* strictly speaking this can never be true; but might
1708 * be if we ever allow intuit without substrings */
1709 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1712 rx_origin = rx_max_float;
1715 /* at this point, any matching substrings have been
1716 * contradicted. Start again... */
1718 rx_origin = HOP3c(rx_origin, 1, strend);
1720 /* uses bytes rather than char calculations for efficiency.
1721 * It's conservative: it errs on the side of doing 'goto restart',
1722 * where there is code that does a proper char-based test */
1723 if (rx_origin + start_shift + end_shift > strend) {
1724 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1725 " Could not match STCLASS...\n") );
1728 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1729 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1730 (prog->substrs->check_ix ? "floating" : "anchored"),
1731 (long)(rx_origin + start_shift - strbeg),
1732 (IV)(rx_origin - strbeg)
1739 if (rx_origin != s) {
1740 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1741 " By STCLASS: moving %ld --> %ld\n",
1742 (long)(rx_origin - strbeg), (long)(s - strbeg))
1746 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1747 " Does not contradict STCLASS...\n");
1752 /* Decide whether using the substrings helped */
1754 if (rx_origin != strpos) {
1755 /* Fixed substring is found far enough so that the match
1756 cannot start at strpos. */
1758 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1759 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1762 /* The found rx_origin position does not prohibit matching at
1763 * strpos, so calling intuit didn't gain us anything. Decrement
1764 * the BmUSEFUL() count on the check substring, and if we reach
1766 if (!(prog->intflags & PREGf_NAUGHTY)
1768 prog->check_utf8 /* Could be deleted already */
1769 && --BmUSEFUL(prog->check_utf8) < 0
1770 && (prog->check_utf8 == prog->float_utf8)
1772 prog->check_substr /* Could be deleted already */
1773 && --BmUSEFUL(prog->check_substr) < 0
1774 && (prog->check_substr == prog->float_substr)
1777 /* If flags & SOMETHING - do not do it many times on the same match */
1778 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1779 /* XXX Does the destruction order has to change with utf8_target? */
1780 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1781 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1782 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1783 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1784 check = NULL; /* abort */
1785 /* XXXX This is a remnant of the old implementation. It
1786 looks wasteful, since now INTUIT can use many
1787 other heuristics. */
1788 prog->extflags &= ~RXf_USE_INTUIT;
1792 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1793 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1794 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1798 fail_finish: /* Substring not found */
1799 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1800 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1802 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1803 PL_colors[4], PL_colors[5]));
1808 #define DECL_TRIE_TYPE(scan) \
1809 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1810 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1811 trie_utf8l, trie_flu8, trie_flu8_latin } \
1812 trie_type = ((scan->flags == EXACT) \
1813 ? (utf8_target ? trie_utf8 : trie_plain) \
1814 : (scan->flags == EXACTL) \
1815 ? (utf8_target ? trie_utf8l : trie_plain) \
1816 : (scan->flags == EXACTFAA) \
1818 ? trie_utf8_exactfa_fold \
1819 : trie_latin_utf8_exactfa_fold) \
1820 : (scan->flags == EXACTFLU8 \
1823 : trie_flu8_latin) \
1826 : trie_latin_utf8_fold)))
1828 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1829 * 'foldbuf+sizeof(foldbuf)' */
1830 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1833 U8 flags = FOLD_FLAGS_FULL; \
1834 switch (trie_type) { \
1836 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1837 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1838 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1840 goto do_trie_utf8_fold; \
1841 case trie_utf8_exactfa_fold: \
1842 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1844 case trie_utf8_fold: \
1845 do_trie_utf8_fold: \
1846 if ( foldlen>0 ) { \
1847 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1852 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1854 len = UTF8SKIP(uc); \
1855 skiplen = UVCHR_SKIP( uvc ); \
1856 foldlen -= skiplen; \
1857 uscan = foldbuf + skiplen; \
1860 case trie_flu8_latin: \
1861 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1862 goto do_trie_latin_utf8_fold; \
1863 case trie_latin_utf8_exactfa_fold: \
1864 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1866 case trie_latin_utf8_fold: \
1867 do_trie_latin_utf8_fold: \
1868 if ( foldlen>0 ) { \
1869 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1875 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1876 skiplen = UVCHR_SKIP( uvc ); \
1877 foldlen -= skiplen; \
1878 uscan = foldbuf + skiplen; \
1882 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1883 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1884 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1888 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1895 charid = trie->charmap[ uvc ]; \
1899 if (widecharmap) { \
1900 SV** const svpp = hv_fetch(widecharmap, \
1901 (char*)&uvc, sizeof(UV), 0); \
1903 charid = (U16)SvIV(*svpp); \
1908 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1909 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1910 startpos, doutf8, depth)
1912 #define REXEC_FBC_SCAN(UTF8, CODE) \
1914 while (s < strend) { \
1916 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1920 #define REXEC_FBC_CLASS_SCAN(UTF8, COND) \
1922 while (s < strend) { \
1923 REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
1927 #define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \
1930 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1931 previous_occurrence_end = s; \
1934 s += ((UTF8) ? UTF8SKIP(s) : 1); \
1937 #define REXEC_FBC_CSCAN(CONDUTF8,COND) \
1938 if (utf8_target) { \
1939 REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \
1942 REXEC_FBC_CLASS_SCAN(0, COND); \
1945 /* We keep track of where the next character should start after an occurrence
1946 * of the one we're looking for. Knowing that, we can see right away if the
1947 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1948 * don't accept the 2nd and succeeding adjacent occurrences */
1949 #define FBC_CHECK_AND_TRY \
1951 || s != previous_occurrence_end) \
1952 && (reginfo->intuit || regtry(reginfo, &s))) \
1958 /* This differs from the above macros in that it calls a function which returns
1959 * the next occurrence of the thing being looked for in 's'; and 'strend' if
1960 * there is no such occurrence. */
1961 #define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \
1962 while (s < strend) { \
1964 if (s >= strend) { \
1969 s += (UTF8) ? UTF8SKIP(s) : 1; \
1970 previous_occurrence_end = s; \
1973 /* The three macros below are slightly different versions of the same logic.
1975 * The first is for /a and /aa when the target string is UTF-8. This can only
1976 * match ascii, but it must advance based on UTF-8. The other two handle the
1977 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1978 * for the boundary (or non-boundary) between a word and non-word character.
1979 * The utf8 and non-utf8 cases have the same logic, but the details must be
1980 * different. Find the "wordness" of the character just prior to this one, and
1981 * compare it with the wordness of this one. If they differ, we have a
1982 * boundary. At the beginning of the string, pretend that the previous
1983 * character was a new-line.
1985 * All these macros uncleanly have side-effects with each other and outside
1986 * variables. So far it's been too much trouble to clean-up
1988 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1989 * a word character or not.
1990 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1992 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1994 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1995 * are looking for a boundary or for a non-boundary. If we are looking for a
1996 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1997 * see if this tentative match actually works, and if so, to quit the loop
1998 * here. And vice-versa if we are looking for a non-boundary.
2000 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
2001 * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of
2002 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
2003 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
2004 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
2005 * complement. But in that branch we complement tmp, meaning that at the
2006 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
2007 * which means at the top of the loop in the next iteration, it is
2008 * TEST_NON_UTF8(s-1) */
2009 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2010 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2011 tmp = TEST_NON_UTF8(tmp); \
2012 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
2013 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
2015 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
2022 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
2023 * TEST_UTF8 is a macro that for the same input code points returns identically
2024 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
2025 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
2026 if (s == reginfo->strbeg) { \
2029 else { /* Back-up to the start of the previous character */ \
2030 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
2031 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
2032 0, UTF8_ALLOW_DEFAULT); \
2034 tmp = TEST_UV(tmp); \
2035 REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \
2036 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
2045 /* Like the above two macros. UTF8_CODE is the complete code for handling
2046 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
2048 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2049 if (utf8_target) { \
2052 else { /* Not utf8 */ \
2053 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2054 tmp = TEST_NON_UTF8(tmp); \
2055 REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \
2056 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
2065 /* Here, things have been set up by the previous code so that tmp is the \
2066 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
2067 * utf8ness of the target). We also have to check if this matches against \
2068 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
2069 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
2071 if (tmp == ! TEST_NON_UTF8('\n')) { \
2078 /* This is the macro to use when we want to see if something that looks like it
2079 * could match, actually does, and if so exits the loop */
2080 #define REXEC_FBC_TRYIT \
2081 if ((reginfo->intuit || regtry(reginfo, &s))) \
2084 /* The only difference between the BOUND and NBOUND cases is that
2085 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2086 * NBOUND. This is accomplished by passing it as either the if or else clause,
2087 * with the other one being empty (PLACEHOLDER is defined as empty).
2089 * The TEST_FOO parameters are for operating on different forms of input, but
2090 * all should be ones that return identically for the same underlying code
2092 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2094 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2095 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2097 #define FBC_BOUND_A(TEST_NON_UTF8) \
2099 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2100 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2102 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2104 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2105 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2107 #define FBC_NBOUND_A(TEST_NON_UTF8) \
2109 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2110 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2114 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2115 IV cp_out = _invlist_search(invlist, cp_in);
2116 assert(cp_out >= 0);
2119 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2120 invmap[S_get_break_val_cp_checked(invlist, cp)]
2122 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2123 invmap[_invlist_search(invlist, cp)]
2126 /* Takes a pointer to an inversion list, a pointer to its corresponding
2127 * inversion map, and a code point, and returns the code point's value
2128 * according to the two arrays. It assumes that all code points have a value.
2129 * This is used as the base macro for macros for particular properties */
2130 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2131 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2133 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2134 * of a code point, returning the value for the first code point in the string.
2135 * And it takes the particular macro name that finds the desired value given a
2136 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2137 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2138 (__ASSERT_(pos < strend) \
2139 /* Note assumes is valid UTF-8 */ \
2140 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2142 /* Returns the GCB value for the input code point */
2143 #define getGCB_VAL_CP(cp) \
2144 _generic_GET_BREAK_VAL_CP( \
2149 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2150 * bounded by pos and strend */
2151 #define getGCB_VAL_UTF8(pos, strend) \
2152 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2154 /* Returns the LB value for the input code point */
2155 #define getLB_VAL_CP(cp) \
2156 _generic_GET_BREAK_VAL_CP( \
2161 /* Returns the LB value for the first code point in the UTF-8 encoded string
2162 * bounded by pos and strend */
2163 #define getLB_VAL_UTF8(pos, strend) \
2164 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2167 /* Returns the SB value for the input code point */
2168 #define getSB_VAL_CP(cp) \
2169 _generic_GET_BREAK_VAL_CP( \
2174 /* Returns the SB value for the first code point in the UTF-8 encoded string
2175 * bounded by pos and strend */
2176 #define getSB_VAL_UTF8(pos, strend) \
2177 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2179 /* Returns the WB value for the input code point */
2180 #define getWB_VAL_CP(cp) \
2181 _generic_GET_BREAK_VAL_CP( \
2186 /* Returns the WB value for the first code point in the UTF-8 encoded string
2187 * bounded by pos and strend */
2188 #define getWB_VAL_UTF8(pos, strend) \
2189 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2191 /* We know what class REx starts with. Try to find this position... */
2192 /* if reginfo->intuit, its a dryrun */
2193 /* annoyingly all the vars in this routine have different names from their counterparts
2194 in regmatch. /grrr */
2196 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2197 const char *strend, regmatch_info *reginfo)
2201 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2202 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2204 char *pat_string; /* The pattern's exactish string */
2205 char *pat_end; /* ptr to end char of pat_string */
2206 re_fold_t folder; /* Function for computing non-utf8 folds */
2207 const U8 *fold_array; /* array for folding ords < 256 */
2214 /* In some cases we accept only the first occurence of 'x' in a sequence of
2215 * them. This variable points to just beyond the end of the previous
2216 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2217 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2219 char * previous_occurrence_end = 0;
2221 I32 tmp; /* Scratch variable */
2222 const bool utf8_target = reginfo->is_utf8_target;
2223 UV utf8_fold_flags = 0;
2224 const bool is_utf8_pat = reginfo->is_utf8_pat;
2225 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2226 with a result inverts that result, as 0^1 =
2228 _char_class_number classnum;
2230 RXi_GET_DECL(prog,progi);
2232 PERL_ARGS_ASSERT_FIND_BYCLASS;
2234 /* We know what class it must start with. */
2238 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2240 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
2241 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
2248 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2249 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
2251 else if (ANYOF_FLAGS(c)) {
2252 REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
2255 REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s)));
2259 case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */
2260 /* UTF-8ness doesn't matter, so use 0 */
2261 REXEC_FBC_FIND_NEXT_SCAN(0,
2262 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2263 (U8) ARG(c), FLAGS(c)));
2266 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
2267 assert(! is_utf8_pat);
2270 if (is_utf8_pat || utf8_target) {
2271 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2272 goto do_exactf_utf8;
2274 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
2275 folder = foldEQ_latin1; /* /a, except the sharp s one which */
2276 goto do_exactf_non_utf8; /* isn't dealt with by these */
2278 case EXACTF: /* This node only generated for non-utf8 patterns */
2279 assert(! is_utf8_pat);
2281 utf8_fold_flags = 0;
2282 goto do_exactf_utf8;
2284 fold_array = PL_fold;
2286 goto do_exactf_non_utf8;
2289 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2290 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
2291 utf8_fold_flags = FOLDEQ_LOCALE;
2292 goto do_exactf_utf8;
2294 fold_array = PL_fold_locale;
2295 folder = foldEQ_locale;
2296 goto do_exactf_non_utf8;
2300 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2302 goto do_exactf_utf8;
2305 if (! utf8_target) { /* All code points in this node require
2306 UTF-8 to express. */
2309 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2310 | FOLDEQ_S2_FOLDS_SANE;
2311 goto do_exactf_utf8;
2314 if (is_utf8_pat || utf8_target) {
2315 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
2316 goto do_exactf_utf8;
2319 /* Any 'ss' in the pattern should have been replaced by regcomp,
2320 * so we don't have to worry here about this single special case
2321 * in the Latin1 range */
2322 fold_array = PL_fold_latin1;
2323 folder = foldEQ_latin1;
2327 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2328 are no glitches with fold-length differences
2329 between the target string and pattern */
2331 /* The idea in the non-utf8 EXACTF* cases is to first find the
2332 * first character of the EXACTF* node and then, if necessary,
2333 * case-insensitively compare the full text of the node. c1 is the
2334 * first character. c2 is its fold. This logic will not work for
2335 * Unicode semantics and the german sharp ss, which hence should
2336 * not be compiled into a node that gets here. */
2337 pat_string = STRING(c);
2338 ln = STR_LEN(c); /* length to match in octets/bytes */
2340 /* We know that we have to match at least 'ln' bytes (which is the
2341 * same as characters, since not utf8). If we have to match 3
2342 * characters, and there are only 2 availabe, we know without
2343 * trying that it will fail; so don't start a match past the
2344 * required minimum number from the far end */
2345 e = HOP3c(strend, -((SSize_t)ln), s);
2350 c2 = fold_array[c1];
2351 if (c1 == c2) { /* If char and fold are the same */
2353 s = (char *) memchr(s, c1, e + 1 - s);
2358 /* Check that the rest of the node matches */
2359 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2360 && (reginfo->intuit || regtry(reginfo, &s)) )
2368 U8 bits_differing = c1 ^ c2;
2370 /* If the folds differ in one bit position only, we can mask to
2371 * match either of them, and can use this faster find method. Both
2372 * ASCII and EBCDIC tend to have their case folds differ in only
2373 * one position, so this is very likely */
2374 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2375 bits_differing = ~ bits_differing;
2377 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2378 (c1 & bits_differing), bits_differing);
2383 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2384 && (reginfo->intuit || regtry(reginfo, &s)) )
2391 else { /* Otherwise, stuck with looking byte-at-a-time. This
2392 should actually happen only in EXACTFL nodes */
2394 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2395 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2396 && (reginfo->intuit || regtry(reginfo, &s)) )
2410 /* If one of the operands is in utf8, we can't use the simpler folding
2411 * above, due to the fact that many different characters can have the
2412 * same fold, or portion of a fold, or different- length fold */
2413 pat_string = STRING(c);
2414 ln = STR_LEN(c); /* length to match in octets/bytes */
2415 pat_end = pat_string + ln;
2416 lnc = is_utf8_pat /* length to match in characters */
2417 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2420 /* We have 'lnc' characters to match in the pattern, but because of
2421 * multi-character folding, each character in the target can match
2422 * up to 3 characters (Unicode guarantees it will never exceed
2423 * this) if it is utf8-encoded; and up to 2 if not (based on the
2424 * fact that the Latin 1 folds are already determined, and the
2425 * only multi-char fold in that range is the sharp-s folding to
2426 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2427 * string character. Adjust lnc accordingly, rounding up, so that
2428 * if we need to match at least 4+1/3 chars, that really is 5. */
2429 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2430 lnc = (lnc + expansion - 1) / expansion;
2432 /* As in the non-UTF8 case, if we have to match 3 characters, and
2433 * only 2 are left, it's guaranteed to fail, so don't start a
2434 * match that would require us to go beyond the end of the string
2436 e = HOP3c(strend, -((SSize_t)lnc), s);
2438 /* XXX Note that we could recalculate e to stop the loop earlier,
2439 * as the worst case expansion above will rarely be met, and as we
2440 * go along we would usually find that e moves further to the left.
2441 * This would happen only after we reached the point in the loop
2442 * where if there were no expansion we should fail. Unclear if
2443 * worth the expense */
2446 char *my_strend= (char *)strend;
2447 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2448 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
2449 && (reginfo->intuit || regtry(reginfo, &s)) )
2453 s += (utf8_target) ? UTF8SKIP(s) : 1;
2459 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2460 if (FLAGS(c) != TRADITIONAL_BOUND) {
2461 if (! IN_UTF8_CTYPE_LOCALE) {
2462 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2463 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2468 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2472 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2473 if (FLAGS(c) != TRADITIONAL_BOUND) {
2474 if (! IN_UTF8_CTYPE_LOCALE) {
2475 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2476 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2481 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
2484 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2486 assert(FLAGS(c) == TRADITIONAL_BOUND);
2488 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2491 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2493 assert(FLAGS(c) == TRADITIONAL_BOUND);
2495 FBC_BOUND_A(isWORDCHAR_A);
2498 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2500 assert(FLAGS(c) == TRADITIONAL_BOUND);
2502 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2505 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2507 assert(FLAGS(c) == TRADITIONAL_BOUND);
2509 FBC_NBOUND_A(isWORDCHAR_A);
2513 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2514 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2525 switch((bound_type) FLAGS(c)) {
2526 case TRADITIONAL_BOUND:
2527 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2530 if (s == reginfo->strbeg) {
2531 if (reginfo->intuit || regtry(reginfo, &s))
2536 /* Didn't match. Try at the next position (if there is one) */
2537 s += (utf8_target) ? UTF8SKIP(s) : 1;
2538 if (UNLIKELY(s >= reginfo->strend)) {
2544 GCB_enum before = getGCB_VAL_UTF8(
2546 (U8*)(reginfo->strbeg)),
2547 (U8*) reginfo->strend);
2548 while (s < strend) {
2549 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2550 (U8*) reginfo->strend);
2551 if ( (to_complement ^ isGCB(before,
2553 (U8*) reginfo->strbeg,
2556 && (reginfo->intuit || regtry(reginfo, &s)))
2564 else { /* Not utf8. Everything is a GCB except between CR and
2566 while (s < strend) {
2567 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2568 || UCHARAT(s) != '\n'))
2569 && (reginfo->intuit || regtry(reginfo, &s)))
2577 /* And, since this is a bound, it can match after the final
2578 * character in the string */
2579 if ((reginfo->intuit || regtry(reginfo, &s))) {
2585 if (s == reginfo->strbeg) {
2586 if (reginfo->intuit || regtry(reginfo, &s)) {
2589 s += (utf8_target) ? UTF8SKIP(s) : 1;
2590 if (UNLIKELY(s >= reginfo->strend)) {
2596 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2598 (U8*)(reginfo->strbeg)),
2599 (U8*) reginfo->strend);
2600 while (s < strend) {
2601 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2602 if (to_complement ^ isLB(before,
2604 (U8*) reginfo->strbeg,
2606 (U8*) reginfo->strend,
2608 && (reginfo->intuit || regtry(reginfo, &s)))
2616 else { /* Not utf8. */
2617 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2618 while (s < strend) {
2619 LB_enum after = getLB_VAL_CP((U8) *s);
2620 if (to_complement ^ isLB(before,
2622 (U8*) reginfo->strbeg,
2624 (U8*) reginfo->strend,
2626 && (reginfo->intuit || regtry(reginfo, &s)))
2635 if (reginfo->intuit || regtry(reginfo, &s)) {
2642 if (s == reginfo->strbeg) {
2643 if (reginfo->intuit || regtry(reginfo, &s)) {
2646 s += (utf8_target) ? UTF8SKIP(s) : 1;
2647 if (UNLIKELY(s >= reginfo->strend)) {
2653 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2655 (U8*)(reginfo->strbeg)),
2656 (U8*) reginfo->strend);
2657 while (s < strend) {
2658 SB_enum after = getSB_VAL_UTF8((U8*) s,
2659 (U8*) reginfo->strend);
2660 if ((to_complement ^ isSB(before,
2662 (U8*) reginfo->strbeg,
2664 (U8*) reginfo->strend,
2666 && (reginfo->intuit || regtry(reginfo, &s)))
2674 else { /* Not utf8. */
2675 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2676 while (s < strend) {
2677 SB_enum after = getSB_VAL_CP((U8) *s);
2678 if ((to_complement ^ isSB(before,
2680 (U8*) reginfo->strbeg,
2682 (U8*) reginfo->strend,
2684 && (reginfo->intuit || regtry(reginfo, &s)))
2693 /* Here are at the final position in the target string. The SB
2694 * value is always true here, so matches, depending on other
2696 if (reginfo->intuit || regtry(reginfo, &s)) {
2703 if (s == reginfo->strbeg) {
2704 if (reginfo->intuit || regtry(reginfo, &s)) {
2707 s += (utf8_target) ? UTF8SKIP(s) : 1;
2708 if (UNLIKELY(s >= reginfo->strend)) {
2714 /* We are at a boundary between char_sub_0 and char_sub_1.
2715 * We also keep track of the value for char_sub_-1 as we
2716 * loop through the line. Context may be needed to make a
2717 * determination, and if so, this can save having to
2719 WB_enum previous = WB_UNKNOWN;
2720 WB_enum before = getWB_VAL_UTF8(
2723 (U8*)(reginfo->strbeg)),
2724 (U8*) reginfo->strend);
2725 while (s < strend) {
2726 WB_enum after = getWB_VAL_UTF8((U8*) s,
2727 (U8*) reginfo->strend);
2728 if ((to_complement ^ isWB(previous,
2731 (U8*) reginfo->strbeg,
2733 (U8*) reginfo->strend,
2735 && (reginfo->intuit || regtry(reginfo, &s)))
2744 else { /* Not utf8. */
2745 WB_enum previous = WB_UNKNOWN;
2746 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2747 while (s < strend) {
2748 WB_enum after = getWB_VAL_CP((U8) *s);
2749 if ((to_complement ^ isWB(previous,
2752 (U8*) reginfo->strbeg,
2754 (U8*) reginfo->strend,
2756 && (reginfo->intuit || regtry(reginfo, &s)))
2766 if (reginfo->intuit || regtry(reginfo, &s)) {
2773 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2774 is_LNBREAK_latin1_safe(s, strend)
2779 REXEC_FBC_FIND_NEXT_SCAN(0, find_next_ascii(s, strend, utf8_target));
2784 REXEC_FBC_FIND_NEXT_SCAN(1, find_next_non_ascii(s, strend,
2788 REXEC_FBC_FIND_NEXT_SCAN(0, find_next_non_ascii(s, strend,
2794 /* The argument to all the POSIX node types is the class number to pass to
2795 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2802 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2803 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)),
2804 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2819 /* The complement of something that matches only ASCII matches all
2820 * non-ASCII, plus everything in ASCII that isn't in the class. */
2821 REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend)
2822 || ! _generic_isCC_A(*s, FLAGS(c)));
2830 /* Don't need to worry about utf8, as it can match only a single
2831 * byte invariant character. But we do anyway for performance reasons,
2832 * as otherwise we would have to examine all the continuation
2835 REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c)));
2840 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2841 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2849 if (! utf8_target) {
2850 REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */
2851 to_complement ^ cBOOL(_generic_isCC(*s,
2857 classnum = (_char_class_number) FLAGS(c);
2860 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2861 to_complement ^ cBOOL(_invlist_contains_cp(
2862 PL_XPosix_ptrs[classnum],
2863 utf8_to_uvchr_buf((U8 *) s,
2867 case _CC_ENUM_SPACE:
2868 REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */
2869 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
2872 case _CC_ENUM_BLANK:
2873 REXEC_FBC_CLASS_SCAN(1,
2874 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
2877 case _CC_ENUM_XDIGIT:
2878 REXEC_FBC_CLASS_SCAN(1,
2879 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
2882 case _CC_ENUM_VERTSPACE:
2883 REXEC_FBC_CLASS_SCAN(1,
2884 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
2887 case _CC_ENUM_CNTRL:
2888 REXEC_FBC_CLASS_SCAN(1,
2889 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
2899 /* what trie are we using right now */
2900 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2901 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2902 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2904 const char *last_start = strend - trie->minlen;
2906 const char *real_start = s;
2908 STRLEN maxlen = trie->maxlen;
2910 U8 **points; /* map of where we were in the input string
2911 when reading a given char. For ASCII this
2912 is unnecessary overhead as the relationship
2913 is always 1:1, but for Unicode, especially
2914 case folded Unicode this is not true. */
2915 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2919 GET_RE_DEBUG_FLAGS_DECL;
2921 /* We can't just allocate points here. We need to wrap it in
2922 * an SV so it gets freed properly if there is a croak while
2923 * running the match */
2926 sv_points=newSV(maxlen * sizeof(U8 *));
2927 SvCUR_set(sv_points,
2928 maxlen * sizeof(U8 *));
2929 SvPOK_on(sv_points);
2930 sv_2mortal(sv_points);
2931 points=(U8**)SvPV_nolen(sv_points );
2932 if ( trie_type != trie_utf8_fold
2933 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2936 bitmap=(U8*)trie->bitmap;
2938 bitmap=(U8*)ANYOF_BITMAP(c);
2940 /* this is the Aho-Corasick algorithm modified a touch
2941 to include special handling for long "unknown char" sequences.
2942 The basic idea being that we use AC as long as we are dealing
2943 with a possible matching char, when we encounter an unknown char
2944 (and we have not encountered an accepting state) we scan forward
2945 until we find a legal starting char.
2946 AC matching is basically that of trie matching, except that when
2947 we encounter a failing transition, we fall back to the current
2948 states "fail state", and try the current char again, a process
2949 we repeat until we reach the root state, state 1, or a legal
2950 transition. If we fail on the root state then we can either
2951 terminate if we have reached an accepting state previously, or
2952 restart the entire process from the beginning if we have not.
2955 while (s <= last_start) {
2956 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2964 U8 *uscan = (U8*)NULL;
2965 U8 *leftmost = NULL;
2967 U32 accepted_word= 0;
2971 while ( state && uc <= (U8*)strend ) {
2973 U32 word = aho->states[ state ].wordnum;
2977 DEBUG_TRIE_EXECUTE_r(
2978 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2979 dump_exec_pos( (char *)uc, c, strend, real_start,
2980 (char *)uc, utf8_target, 0 );
2981 Perl_re_printf( aTHX_
2982 " Scanning for legal start char...\n");
2986 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2990 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2996 if (uc >(U8*)last_start) break;
3000 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
3001 if (!leftmost || lpos < leftmost) {
3002 DEBUG_r(accepted_word=word);
3008 points[pointpos++ % maxlen]= uc;
3009 if (foldlen || uc < (U8*)strend) {
3010 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3011 (U8 *) strend, uscan, len, uvc,
3012 charid, foldlen, foldbuf,
3014 DEBUG_TRIE_EXECUTE_r({
3015 dump_exec_pos( (char *)uc, c, strend,
3016 real_start, s, utf8_target, 0);
3017 Perl_re_printf( aTHX_
3018 " Charid:%3u CP:%4" UVxf " ",
3030 word = aho->states[ state ].wordnum;
3032 base = aho->states[ state ].trans.base;
3034 DEBUG_TRIE_EXECUTE_r({
3036 dump_exec_pos( (char *)uc, c, strend, real_start,
3037 s, utf8_target, 0 );
3038 Perl_re_printf( aTHX_
3039 "%sState: %4" UVxf ", word=%" UVxf,
3040 failed ? " Fail transition to " : "",
3041 (UV)state, (UV)word);
3047 ( ((offset = base + charid
3048 - 1 - trie->uniquecharcount)) >= 0)
3049 && ((U32)offset < trie->lasttrans)
3050 && trie->trans[offset].check == state
3051 && (tmp=trie->trans[offset].next))
3053 DEBUG_TRIE_EXECUTE_r(
3054 Perl_re_printf( aTHX_ " - legal\n"));
3059 DEBUG_TRIE_EXECUTE_r(
3060 Perl_re_printf( aTHX_ " - fail\n"));
3062 state = aho->fail[state];
3066 /* we must be accepting here */
3067 DEBUG_TRIE_EXECUTE_r(
3068 Perl_re_printf( aTHX_ " - accepting\n"));
3077 if (!state) state = 1;
3080 if ( aho->states[ state ].wordnum ) {
3081 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
3082 if (!leftmost || lpos < leftmost) {
3083 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3088 s = (char*)leftmost;
3089 DEBUG_TRIE_EXECUTE_r({
3090 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
3091 (UV)accepted_word, (IV)(s - real_start)
3094 if (reginfo->intuit || regtry(reginfo, &s)) {
3100 DEBUG_TRIE_EXECUTE_r({
3101 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
3104 DEBUG_TRIE_EXECUTE_r(
3105 Perl_re_printf( aTHX_ "No match.\n"));
3114 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3121 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3122 * flags have same meanings as with regexec_flags() */
3125 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3132 struct regexp *const prog = ReANY(rx);
3134 if (flags & REXEC_COPY_STR) {
3137 DEBUG_C(Perl_re_printf( aTHX_
3138 "Copy on write: regexp capture, type %d\n",
3140 /* Create a new COW SV to share the match string and store
3141 * in saved_copy, unless the current COW SV in saved_copy
3142 * is valid and suitable for our purpose */
3143 if (( prog->saved_copy
3144 && SvIsCOW(prog->saved_copy)
3145 && SvPOKp(prog->saved_copy)
3148 && SvPVX(sv) == SvPVX(prog->saved_copy)))
3150 /* just reuse saved_copy SV */
3151 if (RXp_MATCH_COPIED(prog)) {
3152 Safefree(prog->subbeg);
3153 RXp_MATCH_COPIED_off(prog);
3157 /* create new COW SV to share string */
3158 RXp_MATCH_COPY_FREE(prog);
3159 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3161 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3162 assert (SvPOKp(prog->saved_copy));
3163 prog->sublen = strend - strbeg;
3164 prog->suboffset = 0;
3165 prog->subcoffset = 0;
3170 SSize_t max = strend - strbeg;
3173 if ( (flags & REXEC_COPY_SKIP_POST)
3174 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3175 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3176 ) { /* don't copy $' part of string */
3179 /* calculate the right-most part of the string covered
3180 * by a capture. Due to lookahead, this may be to
3181 * the right of $&, so we have to scan all captures */
3182 while (n <= prog->lastparen) {
3183 if (prog->offs[n].end > max)
3184 max = prog->offs[n].end;
3188 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3189 ? prog->offs[0].start
3191 assert(max >= 0 && max <= strend - strbeg);
3194 if ( (flags & REXEC_COPY_SKIP_PRE)
3195 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3196 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3197 ) { /* don't copy $` part of string */
3200 /* calculate the left-most part of the string covered
3201 * by a capture. Due to lookbehind, this may be to
3202 * the left of $&, so we have to scan all captures */
3203 while (min && n <= prog->lastparen) {
3204 if ( prog->offs[n].start != -1
3205 && prog->offs[n].start < min)
3207 min = prog->offs[n].start;
3211 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3212 && min > prog->offs[0].end
3214 min = prog->offs[0].end;
3218 assert(min >= 0 && min <= max && min <= strend - strbeg);
3221 if (RXp_MATCH_COPIED(prog)) {
3222 if (sublen > prog->sublen)
3224 (char*)saferealloc(prog->subbeg, sublen+1);
3227 prog->subbeg = (char*)safemalloc(sublen+1);
3228 Copy(strbeg + min, prog->subbeg, sublen, char);
3229 prog->subbeg[sublen] = '\0';
3230 prog->suboffset = min;
3231 prog->sublen = sublen;
3232 RXp_MATCH_COPIED_on(prog);
3234 prog->subcoffset = prog->suboffset;
3235 if (prog->suboffset && utf8_target) {
3236 /* Convert byte offset to chars.
3237 * XXX ideally should only compute this if @-/@+
3238 * has been seen, a la PL_sawampersand ??? */
3240 /* If there's a direct correspondence between the
3241 * string which we're matching and the original SV,
3242 * then we can use the utf8 len cache associated with
3243 * the SV. In particular, it means that under //g,
3244 * sv_pos_b2u() will use the previously cached
3245 * position to speed up working out the new length of
3246 * subcoffset, rather than counting from the start of
3247 * the string each time. This stops
3248 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3249 * from going quadratic */
3250 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3251 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3252 SV_GMAGIC|SV_CONST_RETURN);
3254 prog->subcoffset = utf8_length((U8*)strbeg,
3255 (U8*)(strbeg+prog->suboffset));
3259 RXp_MATCH_COPY_FREE(prog);
3260 prog->subbeg = strbeg;
3261 prog->suboffset = 0;
3262 prog->subcoffset = 0;
3263 prog->sublen = strend - strbeg;
3271 - regexec_flags - match a regexp against a string
3274 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3275 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3276 /* stringarg: the point in the string at which to begin matching */
3277 /* strend: pointer to null at end of string */
3278 /* strbeg: real beginning of string */
3279 /* minend: end of match must be >= minend bytes after stringarg. */
3280 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3281 * itself is accessed via the pointers above */
3282 /* data: May be used for some additional optimizations.
3283 Currently unused. */
3284 /* flags: For optimizations. See REXEC_* in regexp.h */
3287 struct regexp *const prog = ReANY(rx);
3291 SSize_t minlen; /* must match at least this many chars */
3292 SSize_t dontbother = 0; /* how many characters not to try at end */
3293 const bool utf8_target = cBOOL(DO_UTF8(sv));
3295 RXi_GET_DECL(prog,progi);
3296 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3297 regmatch_info *const reginfo = ®info_buf;
3298 regexp_paren_pair *swap = NULL;
3300 GET_RE_DEBUG_FLAGS_DECL;
3302 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3303 PERL_UNUSED_ARG(data);
3305 /* Be paranoid... */
3307 Perl_croak(aTHX_ "NULL regexp parameter");
3311 debug_start_match(rx, utf8_target, stringarg, strend,
3315 startpos = stringarg;
3317 /* set these early as they may be used by the HOP macros below */
3318 reginfo->strbeg = strbeg;
3319 reginfo->strend = strend;
3320 reginfo->is_utf8_target = cBOOL(utf8_target);
3322 if (prog->intflags & PREGf_GPOS_SEEN) {
3325 /* set reginfo->ganch, the position where \G can match */
3328 (flags & REXEC_IGNOREPOS)
3329 ? stringarg /* use start pos rather than pos() */
3330 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3331 /* Defined pos(): */
3332 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3333 : strbeg; /* pos() not defined; use start of string */
3335 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3336 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3338 /* in the presence of \G, we may need to start looking earlier in
3339 * the string than the suggested start point of stringarg:
3340 * if prog->gofs is set, then that's a known, fixed minimum
3343 * /ab|c\G/: gofs = 1
3344 * or if the minimum offset isn't known, then we have to go back
3345 * to the start of the string, e.g. /w+\G/
3348 if (prog->intflags & PREGf_ANCH_GPOS) {
3350 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3352 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3354 DEBUG_r(Perl_re_printf( aTHX_
3355 "fail: ganch-gofs before earliest possible start\n"));
3360 startpos = reginfo->ganch;
3362 else if (prog->gofs) {
3363 startpos = HOPBACKc(startpos, prog->gofs);
3367 else if (prog->intflags & PREGf_GPOS_FLOAT)
3371 minlen = prog->minlen;
3372 if ((startpos + minlen) > strend || startpos < strbeg) {
3373 DEBUG_r(Perl_re_printf( aTHX_
3374 "Regex match can't succeed, so not even tried\n"));
3378 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3379 * which will call destuctors to reset PL_regmatch_state, free higher
3380 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3381 * regmatch_info_aux_eval */
3383 oldsave = PL_savestack_ix;
3387 if ((prog->extflags & RXf_USE_INTUIT)
3388 && !(flags & REXEC_CHECKED))
3390 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3395 if (prog->extflags & RXf_CHECK_ALL) {
3396 /* we can match based purely on the result of INTUIT.
3397 * Set up captures etc just for $& and $-[0]
3398 * (an intuit-only match wont have $1,$2,..) */
3399 assert(!prog->nparens);
3401 /* s/// doesn't like it if $& is earlier than where we asked it to
3402 * start searching (which can happen on something like /.\G/) */
3403 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3406 /* this should only be possible under \G */
3407 assert(prog->intflags & PREGf_GPOS_SEEN);
3408 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3409 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3413 /* match via INTUIT shouldn't have any captures.
3414 * Let @-, @+, $^N know */
3415 prog->lastparen = prog->lastcloseparen = 0;
3416 RXp_MATCH_UTF8_set(prog, utf8_target);
3417 prog->offs[0].start = s - strbeg;
3418 prog->offs[0].end = utf8_target
3419 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3420 : s - strbeg + prog->minlenret;
3421 if ( !(flags & REXEC_NOT_FIRST) )
3422 S_reg_set_capture_string(aTHX_ rx,
3424 sv, flags, utf8_target);
3430 multiline = prog->extflags & RXf_PMf_MULTILINE;
3432 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3433 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3434 "String too short [regexec_flags]...\n"));
3438 /* Check validity of program. */
3439 if (UCHARAT(progi->program) != REG_MAGIC) {
3440 Perl_croak(aTHX_ "corrupted regexp program");
3443 RXp_MATCH_TAINTED_off(prog);
3444 RXp_MATCH_UTF8_set(prog, utf8_target);
3446 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3447 reginfo->intuit = 0;
3448 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3449 reginfo->warned = FALSE;
3451 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3452 /* see how far we have to get to not match where we matched before */
3453 reginfo->till = stringarg + minend;
3455 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3456 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3457 S_cleanup_regmatch_info_aux has executed (registered by
3458 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3459 magic belonging to this SV.
3460 Not newSVsv, either, as it does not COW.
3462 reginfo->sv = newSV(0);
3463 SvSetSV_nosteal(reginfo->sv, sv);
3464 SAVEFREESV(reginfo->sv);
3467 /* reserve next 2 or 3 slots in PL_regmatch_state:
3468 * slot N+0: may currently be in use: skip it
3469 * slot N+1: use for regmatch_info_aux struct
3470 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3471 * slot N+3: ready for use by regmatch()
3475 regmatch_state *old_regmatch_state;
3476 regmatch_slab *old_regmatch_slab;
3477 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3479 /* on first ever match, allocate first slab */
3480 if (!PL_regmatch_slab) {
3481 Newx(PL_regmatch_slab, 1, regmatch_slab);
3482 PL_regmatch_slab->prev = NULL;
3483 PL_regmatch_slab->next = NULL;
3484 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3487 old_regmatch_state = PL_regmatch_state;
3488 old_regmatch_slab = PL_regmatch_slab;
3490 for (i=0; i <= max; i++) {
3492 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3494 reginfo->info_aux_eval =
3495 reginfo->info_aux->info_aux_eval =
3496 &(PL_regmatch_state->u.info_aux_eval);
3498 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3499 PL_regmatch_state = S_push_slab(aTHX);
3502 /* note initial PL_regmatch_state position; at end of match we'll
3503 * pop back to there and free any higher slabs */
3505 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3506 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3507 reginfo->info_aux->poscache = NULL;
3509 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3511 if ((prog->extflags & RXf_EVAL_SEEN))
3512 S_setup_eval_state(aTHX_ reginfo);
3514 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3517 /* If there is a "must appear" string, look for it. */
3519 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3520 /* We have to be careful. If the previous successful match
3521 was from this regex we don't want a subsequent partially
3522 successful match to clobber the old results.
3523 So when we detect this possibility we add a swap buffer
3524 to the re, and switch the buffer each match. If we fail,
3525 we switch it back; otherwise we leave it swapped.
3528 /* do we need a save destructor here for eval dies? */
3529 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3530 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3531 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3539 if (prog->recurse_locinput)
3540 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3542 /* Simplest case: anchored match need be tried only once, or with
3543 * MBOL, only at the beginning of each line.
3545 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3546 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3547 * match at the start of the string then it won't match anywhere else
3548 * either; while with /.*.../, if it doesn't match at the beginning,
3549 * the earliest it could match is at the start of the next line */
3551 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3554 if (regtry(reginfo, &s))
3557 if (!(prog->intflags & PREGf_ANCH_MBOL))
3560 /* didn't match at start, try at other newline positions */
3563 dontbother = minlen - 1;
3564 end = HOP3c(strend, -dontbother, strbeg) - 1;
3566 /* skip to next newline */
3568 while (s <= end) { /* note it could be possible to match at the end of the string */
3569 /* NB: newlines are the same in unicode as they are in latin */
3572 if (prog->check_substr || prog->check_utf8) {
3573 /* note that with PREGf_IMPLICIT, intuit can only fail
3574 * or return the start position, so it's of limited utility.
3575 * Nevertheless, I made the decision that the potential for
3576 * quick fail was still worth it - DAPM */
3577 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3581 if (regtry(reginfo, &s))
3585 } /* end anchored search */
3587 if (prog->intflags & PREGf_ANCH_GPOS)
3589 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3590 assert(prog->intflags & PREGf_GPOS_SEEN);
3591 /* For anchored \G, the only position it can match from is
3592 * (ganch-gofs); we already set startpos to this above; if intuit
3593 * moved us on from there, we can't possibly succeed */
3594 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3595 if (s == startpos && regtry(reginfo, &s))
3600 /* Messy cases: unanchored match. */
3601 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3602 /* we have /x+whatever/ */
3603 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3609 if (! prog->anchored_utf8) {
3610 to_utf8_substr(prog);
3612 ch = SvPVX_const(prog->anchored_utf8)[0];
3613 REXEC_FBC_SCAN(0, /* 0=>not-utf8 */
3615 DEBUG_EXECUTE_r( did_match = 1 );
3616 if (regtry(reginfo, &s)) goto got_it;
3618 while (s < strend && *s == ch)
3625 if (! prog->anchored_substr) {
3626 if (! to_byte_substr(prog)) {
3627 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3630 ch = SvPVX_const(prog->anchored_substr)[0];
3631 REXEC_FBC_SCAN(0, /* 0=>not-utf8 */
3633 DEBUG_EXECUTE_r( did_match = 1 );
3634 if (regtry(reginfo, &s)) goto got_it;
3636 while (s < strend && *s == ch)
3641 DEBUG_EXECUTE_r(if (!did_match)
3642 Perl_re_printf( aTHX_
3643 "Did not find anchored character...\n")
3646 else if (prog->anchored_substr != NULL
3647 || prog->anchored_utf8 != NULL
3648 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3649 && prog->float_max_offset < strend - s)) {
3654 char *last1; /* Last position checked before */
3658 if (prog->anchored_substr || prog->anchored_utf8) {
3660 if (! prog->anchored_utf8) {
3661 to_utf8_substr(prog);
3663 must = prog->anchored_utf8;
3666 if (! prog->anchored_substr) {
3667 if (! to_byte_substr(prog)) {
3668 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3671 must = prog->anchored_substr;
3673 back_max = back_min = prog->anchored_offset;
3676 if (! prog->float_utf8) {
3677 to_utf8_substr(prog);
3679 must = prog->float_utf8;
3682 if (! prog->float_substr) {
3683 if (! to_byte_substr(prog)) {
3684 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3687 must = prog->float_substr;
3689 back_max = prog->float_max_offset;
3690 back_min = prog->float_min_offset;
3696 last = HOP3c(strend, /* Cannot start after this */
3697 -(SSize_t)(CHR_SVLEN(must)
3698 - (SvTAIL(must) != 0) + back_min), strbeg);
3700 if (s > reginfo->strbeg)
3701 last1 = HOPc(s, -1);
3703 last1 = s - 1; /* bogus */
3705 /* XXXX check_substr already used to find "s", can optimize if
3706 check_substr==must. */
3708 strend = HOPc(strend, -dontbother);
3709 while ( (s <= last) &&
3710 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3711 (unsigned char*)strend, must,
3712 multiline ? FBMrf_MULTILINE : 0)) ) {
3713 DEBUG_EXECUTE_r( did_match = 1 );
3714 if (HOPc(s, -back_max) > last1) {
3715 last1 = HOPc(s, -back_min);
3716 s = HOPc(s, -back_max);
3719 char * const t = (last1 >= reginfo->strbeg)
3720 ? HOPc(last1, 1) : last1 + 1;
3722 last1 = HOPc(s, -back_min);
3726 while (s <= last1) {
3727 if (regtry(reginfo, &s))
3730 s++; /* to break out of outer loop */
3737 while (s <= last1) {
3738 if (regtry(reginfo, &s))
3744 DEBUG_EXECUTE_r(if (!did_match) {
3745 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3746 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3747 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
3748 ((must == prog->anchored_substr || must == prog->anchored_utf8)
3749 ? "anchored" : "floating"),
3750 quoted, RE_SV_TAIL(must));
3754 else if ( (c = progi->regstclass) ) {
3756 const OPCODE op = OP(progi->regstclass);
3757 /* don't bother with what can't match */
3758 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3759 strend = HOPc(strend, -(minlen - 1));
3762 SV * const prop = sv_newmortal();
3763 regprop(prog, prop, c, reginfo, NULL);
3765 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3766 s,strend-s,PL_dump_re_max_len);
3767 Perl_re_printf( aTHX_
3768 "Matching stclass %.*s against %s (%d bytes)\n",
3769 (int)SvCUR(prop), SvPVX_const(prop),
3770 quoted, (int)(strend - s));
3773 if (find_byclass(prog, c, s, strend, reginfo))
3775 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
3779 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3787 if (! prog->float_utf8) {
3788 to_utf8_substr(prog);
3790 float_real = prog->float_utf8;
3793 if (! prog->float_substr) {
3794 if (! to_byte_substr(prog)) {
3795 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3798 float_real = prog->float_substr;
3801 little = SvPV_const(float_real, len);
3802 if (SvTAIL(float_real)) {
3803 /* This means that float_real contains an artificial \n on
3804 * the end due to the presence of something like this:
3805 * /foo$/ where we can match both "foo" and "foo\n" at the
3806 * end of the string. So we have to compare the end of the
3807 * string first against the float_real without the \n and
3808 * then against the full float_real with the string. We
3809 * have to watch out for cases where the string might be
3810 * smaller than the float_real or the float_real without
3812 char *checkpos= strend - len;
3814 Perl_re_printf( aTHX_
3815 "%sChecking for float_real.%s\n",
3816 PL_colors[4], PL_colors[5]));
3817 if (checkpos + 1 < strbeg) {
3818 /* can't match, even if we remove the trailing \n
3819 * string is too short to match */
3821 Perl_re_printf( aTHX_
3822 "%sString shorter than required trailing substring, cannot match.%s\n",
3823 PL_colors[4], PL_colors[5]));
3825 } else if (memEQ(checkpos + 1, little, len - 1)) {
3826 /* can match, the end of the string matches without the
3828 last = checkpos + 1;
3829 } else if (checkpos < strbeg) {
3830 /* cant match, string is too short when the "\n" is
3833 Perl_re_printf( aTHX_
3834 "%sString does not contain required trailing substring, cannot match.%s\n",
3835 PL_colors[4], PL_colors[5]));
3837 } else if (!multiline) {
3838 /* non multiline match, so compare with the "\n" at the
3839 * end of the string */
3840 if (memEQ(checkpos, little, len)) {
3844 Perl_re_printf( aTHX_
3845 "%sString does not contain required trailing substring, cannot match.%s\n",
3846 PL_colors[4], PL_colors[5]));
3850 /* multiline match, so we have to search for a place
3851 * where the full string is located */
3857 last = rninstr(s, strend, little, little + len);
3859 last = strend; /* matching "$" */
3862 /* at one point this block contained a comment which was
3863 * probably incorrect, which said that this was a "should not
3864 * happen" case. Even if it was true when it was written I am
3865 * pretty sure it is not anymore, so I have removed the comment
3866 * and replaced it with this one. Yves */
3868 Perl_re_printf( aTHX_
3869 "%sString does not contain required substring, cannot match.%s\n",
3870 PL_colors[4], PL_colors[5]
3874 dontbother = strend - last + prog->float_min_offset;
3876 if (minlen && (dontbother < minlen))
3877 dontbother = minlen - 1;
3878 strend -= dontbother; /* this one's always in bytes! */
3879 /* We don't know much -- general case. */
3882 if (regtry(reginfo, &s))
3891 if (regtry(reginfo, &s))
3893 } while (s++ < strend);
3901 /* s/// doesn't like it if $& is earlier than where we asked it to
3902 * start searching (which can happen on something like /.\G/) */
3903 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3904 && (prog->offs[0].start < stringarg - strbeg))
3906 /* this should only be possible under \G */
3907 assert(prog->intflags & PREGf_GPOS_SEEN);
3908 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3909 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3915 Perl_re_exec_indentf( aTHX_
3916 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
3924 /* clean up; this will trigger destructors that will free all slabs
3925 * above the current one, and cleanup the regmatch_info_aux
3926 * and regmatch_info_aux_eval sructs */
3928 LEAVE_SCOPE(oldsave);
3930 if (RXp_PAREN_NAMES(prog))
3931 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3933 /* make sure $`, $&, $', and $digit will work later */
3934 if ( !(flags & REXEC_NOT_FIRST) )
3935 S_reg_set_capture_string(aTHX_ rx,
3936 strbeg, reginfo->strend,
3937 sv, flags, utf8_target);
3942 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
3943 PL_colors[4], PL_colors[5]));
3945 /* clean up; this will trigger destructors that will free all slabs
3946 * above the current one, and cleanup the regmatch_info_aux
3947 * and regmatch_info_aux_eval sructs */
3949 LEAVE_SCOPE(oldsave);
3952 /* we failed :-( roll it back */
3953 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3954 "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
3960 Safefree(prog->offs);
3967 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3968 * Do inc before dec, in case old and new rex are the same */
3969 #define SET_reg_curpm(Re2) \
3970 if (reginfo->info_aux_eval) { \
3971 (void)ReREFCNT_inc(Re2); \
3972 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3973 PM_SETRE((PL_reg_curpm), (Re2)); \
3978 - regtry - try match at specific point
3980 STATIC bool /* 0 failure, 1 success */
3981 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3984 REGEXP *const rx = reginfo->prog;
3985 regexp *const prog = ReANY(rx);
3988 U32 depth = 0; /* used by REGCP_SET */
3990 RXi_GET_DECL(prog,progi);
3991 GET_RE_DEBUG_FLAGS_DECL;
3993 PERL_ARGS_ASSERT_REGTRY;
3995 reginfo->cutpoint=NULL;
3997 prog->offs[0].start = *startposp - reginfo->strbeg;
3998 prog->lastparen = 0;
3999 prog->lastcloseparen = 0;
4001 /* XXXX What this code is doing here?!!! There should be no need
4002 to do this again and again, prog->lastparen should take care of
4005 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4006 * Actually, the code in regcppop() (which Ilya may be meaning by
4007 * prog->lastparen), is not needed at all by the test suite
4008 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4009 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4010 * Meanwhile, this code *is* needed for the
4011 * above-mentioned test suite tests to succeed. The common theme
4012 * on those tests seems to be returning null fields from matches.
4013 * --jhi updated by dapm */
4015 /* After encountering a variant of the issue mentioned above I think
4016 * the point Ilya was making is that if we properly unwind whenever
4017 * we set lastparen to a smaller value then we should not need to do
4018 * this every time, only when needed. So if we have tests that fail if
4019 * we remove this, then it suggests somewhere else we are improperly
4020 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4021 * places it is called, and related regcp() routines. - Yves */
4023 if (prog->nparens) {
4024 regexp_paren_pair *pp = prog->offs;
4026 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4034 result = regmatch(reginfo, *startposp, progi->program + 1);
4036 prog->offs[0].end = result;
4039 if (reginfo->cutpoint)
4040 *startposp= reginfo->cutpoint;
4041 REGCP_UNWIND(lastcp);
4046 #define sayYES goto yes
4047 #define sayNO goto no
4048 #define sayNO_SILENT goto no_silent
4050 /* we dont use STMT_START/END here because it leads to
4051 "unreachable code" warnings, which are bogus, but distracting. */
4052 #define CACHEsayNO \
4053 if (ST.cache_mask) \
4054 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
4057 /* this is used to determine how far from the left messages like
4058 'failed...' are printed in regexec.c. It should be set such that
4059 messages are inline with the regop output that created them.
4061 #define REPORT_CODE_OFF 29
4062 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4065 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4069 PerlIO *f= Perl_debug_log;
4070 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4071 va_start(ap, depth);
4072 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4073 result = PerlIO_vprintf(f, fmt, ap);
4077 #endif /* DEBUGGING */
4080 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
4081 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
4082 #define CHRTEST_NOT_A_CP_1 -999
4083 #define CHRTEST_NOT_A_CP_2 -998
4085 /* grab a new slab and return the first slot in it */
4087 STATIC regmatch_state *
4090 regmatch_slab *s = PL_regmatch_slab->next;
4092 Newx(s, 1, regmatch_slab);
4093 s->prev = PL_regmatch_slab;
4095 PL_regmatch_slab->next = s;
4097 PL_regmatch_slab = s;
4098 return SLAB_FIRST(s);
4102 /* push a new state then goto it */
4104 #define PUSH_STATE_GOTO(state, node, input) \
4105 pushinput = input; \
4107 st->resume_state = state; \
4110 /* push a new state with success backtracking, then goto it */
4112 #define PUSH_YES_STATE_GOTO(state, node, input) \
4113 pushinput = input; \
4115 st->resume_state = state; \
4116 goto push_yes_state;
4123 regmatch() - main matching routine
4125 This is basically one big switch statement in a loop. We execute an op,
4126 set 'next' to point the next op, and continue. If we come to a point which
4127 we may need to backtrack to on failure such as (A|B|C), we push a
4128 backtrack state onto the backtrack stack. On failure, we pop the top
4129 state, and re-enter the loop at the state indicated. If there are no more
4130 states to pop, we return failure.
4132 Sometimes we also need to backtrack on success; for example /A+/, where
4133 after successfully matching one A, we need to go back and try to
4134 match another one; similarly for lookahead assertions: if the assertion
4135 completes successfully, we backtrack to the state just before the assertion
4136 and then carry on. In these cases, the pushed state is marked as
4137 'backtrack on success too'. This marking is in fact done by a chain of
4138 pointers, each pointing to the previous 'yes' state. On success, we pop to
4139 the nearest yes state, discarding any intermediate failure-only states.
4140 Sometimes a yes state is pushed just to force some cleanup code to be
4141 called at the end of a successful match or submatch; e.g. (??{$re}) uses
4142 it to free the inner regex.
4144 Note that failure backtracking rewinds the cursor position, while
4145 success backtracking leaves it alone.
4147 A pattern is complete when the END op is executed, while a subpattern
4148 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
4149 ops trigger the "pop to last yes state if any, otherwise return true"
4152 A common convention in this function is to use A and B to refer to the two
4153 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
4154 the subpattern to be matched possibly multiple times, while B is the entire
4155 rest of the pattern. Variable and state names reflect this convention.
4157 The states in the main switch are the union of ops and failure/success of
4158 substates associated with with that op. For example, IFMATCH is the op
4159 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
4160 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
4161 successfully matched A and IFMATCH_A_fail is a state saying that we have
4162 just failed to match A. Resume states always come in pairs. The backtrack
4163 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
4164 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
4165 on success or failure.
4167 The struct that holds a backtracking state is actually a big union, with
4168 one variant for each major type of op. The variable st points to the
4169 top-most backtrack struct. To make the code clearer, within each
4170 block of code we #define ST to alias the relevant union.
4172 Here's a concrete example of a (vastly oversimplified) IFMATCH
4178 #define ST st->u.ifmatch
4180 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4181 ST.foo = ...; // some state we wish to save
4183 // push a yes backtrack state with a resume value of
4184 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
4186 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
4189 case IFMATCH_A: // we have successfully executed A; now continue with B
4191 bar = ST.foo; // do something with the preserved value
4194 case IFMATCH_A_fail: // A failed, so the assertion failed
4195 ...; // do some housekeeping, then ...
4196 sayNO; // propagate the failure
4203 For any old-timers reading this who are familiar with the old recursive
4204 approach, the code above is equivalent to:
4206 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
4215 ...; // do some housekeeping, then ...
4216 sayNO; // propagate the failure
4219 The topmost backtrack state, pointed to by st, is usually free. If you
4220 want to claim it, populate any ST.foo fields in it with values you wish to
4221 save, then do one of
4223 PUSH_STATE_GOTO(resume_state, node, newinput);
4224 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
4226 which sets that backtrack state's resume value to 'resume_state', pushes a
4227 new free entry to the top of the backtrack stack, then goes to 'node'.
4228 On backtracking, the free slot is popped, and the saved state becomes the
4229 new free state. An ST.foo field in this new top state can be temporarily
4230 accessed to retrieve values, but once the main loop is re-entered, it
4231 becomes available for reuse.
4233 Note that the depth of the backtrack stack constantly increases during the
4234 left-to-right execution of the pattern, rather than going up and down with
4235 the pattern nesting. For example the stack is at its maximum at Z at the
4236 end of the pattern, rather than at X in the following:
4238 /(((X)+)+)+....(Y)+....Z/
4240 The only exceptions to this are lookahead/behind assertions and the cut,
4241 (?>A), which pop all the backtrack states associated with A before
4244 Backtrack state structs are allocated in slabs of about 4K in size.
4245 PL_regmatch_state and st always point to the currently active state,
4246 and PL_regmatch_slab points to the slab currently containing
4247 PL_regmatch_state. The first time regmatch() is called, the first slab is
4248 allocated, and is never freed until interpreter destruction. When the slab
4249 is full, a new one is allocated and chained to the end. At exit from
4250 regmatch(), slabs allocated since entry are freed.
4255 #define DEBUG_STATE_pp(pp) \
4257 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
4258 Perl_re_printf( aTHX_ \
4259 "%*s" pp " %s%s%s%s%s\n", \
4260 INDENT_CHARS(depth), "", \
4261 PL_reg_name[st->resume_state], \
4262 ((st==yes_state||st==mark_state) ? "[" : ""), \
4263 ((st==yes_state) ? "Y" : ""), \
4264 ((st==mark_state) ? "M" : ""), \
4265 ((st==yes_state||st==mark_state) ? "]" : "") \
4270 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
4275 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4276 const char *start, const char *end, const char *blurb)
4278 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4280 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4285 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4286 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4288 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4289 start, end - start, PL_dump_re_max_len);
4291 Perl_re_printf( aTHX_
4292 "%s%s REx%s %s against %s\n",
4293 PL_colors[4], blurb, PL_colors[5], s0, s1);
4295 if (utf8_target||utf8_pat)
4296 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4297 utf8_pat ? "pattern" : "",
4298 utf8_pat && utf8_target ? " and " : "",
4299 utf8_target ? "string" : ""
4305 S_dump_exec_pos(pTHX_ const char *locinput,
4306 const regnode *scan,
4307 const char *loc_regeol,
4308 const char *loc_bostr,
4309 const char *loc_reg_starttry,
4310 const bool utf8_target,
4314 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4315 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4316 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4317 /* The part of the string before starttry has one color
4318 (pref0_len chars), between starttry and current
4319 position another one (pref_len - pref0_len chars),
4320 after the current position the third one.
4321 We assume that pref0_len <= pref_len, otherwise we
4322 decrease pref0_len. */
4323 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4324 ? (5 + taill) - l : locinput - loc_bostr;
4327 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4329 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4331 pref0_len = pref_len - (locinput - loc_reg_starttry);
4332 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4333 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4334 ? (5 + taill) - pref_len : loc_regeol - locinput);
4335 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4339 if (pref0_len > pref_len)
4340 pref0_len = pref_len;
4342 const int is_uni = utf8_target ? 1 : 0;
4344 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4345 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4347 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4348 (locinput - pref_len + pref0_len),
4349 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4351 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4352 locinput, loc_regeol - locinput, 10, 0, 1);
4354 const STRLEN tlen=len0+len1+len2;
4355 Perl_re_printf( aTHX_
4356 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4357 (IV)(locinput - loc_bostr),
4360 (docolor ? "" : "> <"),
4362 (int)(tlen > 19 ? 0 : 19 - tlen),
4370 /* reg_check_named_buff_matched()
4371 * Checks to see if a named buffer has matched. The data array of
4372 * buffer numbers corresponding to the buffer is expected to reside
4373 * in the regexp->data->data array in the slot stored in the ARG() of
4374 * node involved. Note that this routine doesn't actually care about the
4375 * name, that information is not preserved from compilation to execution.
4376 * Returns the index of the leftmost defined buffer with the given name
4377 * or 0 if non of the buffers matched.
4380 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4383 RXi_GET_DECL(rex,rexi);
4384 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4385 I32 *nums=(I32*)SvPVX(sv_dat);
4387 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4389 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4390 if ((I32)rex->lastparen >= nums[n] &&
4391 rex->offs[nums[n]].end != -1)
4401 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
4402 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
4404 /* This function determines if there are one or two characters that match
4405 * the first character of the passed-in EXACTish node <text_node>, and if
4406 * so, returns them in the passed-in pointers.
4408 * If it determines that no possible character in the target string can
4409 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
4410 * the first character in <text_node> requires UTF-8 to represent, and the
4411 * target string isn't in UTF-8.)
4413 * If there are more than two characters that could match the beginning of
4414 * <text_node>, or if more context is required to determine a match or not,
4415 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4417 * The motiviation behind this function is to allow the caller to set up
4418 * tight loops for matching. If <text_node> is of type EXACT, there is
4419 * only one possible character that can match its first character, and so
4420 * the situation is quite simple. But things get much more complicated if
4421 * folding is involved. It may be that the first character of an EXACTFish
4422 * node doesn't participate in any possible fold, e.g., punctuation, so it
4423 * can be matched only by itself. The vast majority of characters that are
4424 * in folds match just two things, their lower and upper-case equivalents.
4425 * But not all are like that; some have multiple possible matches, or match
4426 * sequences of more than one character. This function sorts all that out.
4428 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
4429 * loop of trying to match A*, we know we can't exit where the thing
4430 * following it isn't a B. And something can't be a B unless it is the
4431 * beginning of B. By putting a quick test for that beginning in a tight
4432 * loop, we can rule out things that can't possibly be B without having to
4433 * break out of the loop, thus avoiding work. Similarly, if A is a single
4434 * character, we can make a tight loop matching A*, using the outputs of
4437 * If the target string to match isn't in UTF-8, and there aren't
4438 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4439 * the one or two possible octets (which are characters in this situation)
4440 * that can match. In all cases, if there is only one character that can
4441 * match, *<c1p> and *<c2p> will be identical.
4443 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4444 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4445 * can match the beginning of <text_node>. They should be declared with at
4446 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
4447 * undefined what these contain.) If one or both of the buffers are
4448 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4449 * corresponding invariant. If variant, the corresponding *<c1p> and/or
4450 * *<c2p> will be set to a negative number(s) that shouldn't match any code
4451 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
4452 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
4454 const bool utf8_target = reginfo->is_utf8_target;
4456 UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4457 UV c2 = (UV)CHRTEST_NOT_A_CP_2;
4458 bool use_chrtest_void = FALSE;
4459 const bool is_utf8_pat = reginfo->is_utf8_pat;
4461 /* Used when we have both utf8 input and utf8 output, to avoid converting
4462 * to/from code points */
4463 bool utf8_has_been_setup = FALSE;
4467 U8 *pat = (U8*)STRING(text_node);
4468 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4470 if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
4472 /* In an exact node, only one thing can be matched, that first
4473 * character. If both the pat and the target are UTF-8, we can just
4474 * copy the input to the output, avoiding finding the code point of
4479 else if (utf8_target) {
4480 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4481 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4482 utf8_has_been_setup = TRUE;
4485 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
4488 else { /* an EXACTFish node */
4489 U8 *pat_end = pat + STR_LEN(text_node);
4491 /* An EXACTFL node has at least some characters unfolded, because what
4492 * they match is not known until now. So, now is the time to fold
4493 * the first few of them, as many as are needed to determine 'c1' and
4494 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
4495 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4496 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
4497 * need to fold as many characters as a single character can fold to,
4498 * so that later we can check if the first ones are such a multi-char
4499 * fold. But, in such a pattern only locale-problematic characters
4500 * aren't folded, so we can skip this completely if the first character
4501 * in the node isn't one of the tricky ones */
4502 if (OP(text_node) == EXACTFL) {
4504 if (! is_utf8_pat) {
4505 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4507 folded[0] = folded[1] = 's';
4509 pat_end = folded + 2;
4512 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4517 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4519 *(d++) = (U8) toFOLD_LC(*s);
4524 _toFOLD_utf8_flags(s,
4528 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4539 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4540 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4542 /* Multi-character folds require more context to sort out. Also
4543 * PL_utf8_foldclosures used below doesn't handle them, so have to
4544 * be handled outside this routine */
4545 use_chrtest_void = TRUE;
4547 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4548 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4550 const unsigned int * remaining_folds_to_list;
4551 unsigned int first_folds_to;
4553 /* Look up what code points (besides c1) fold to c1; e.g.,
4554 * [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4555 Size_t folds_to_count = _inverse_folds(c1,
4557 &remaining_folds_to_list);
4558 if (folds_to_count == 0) {
4559 c2 = c1; /* there is only a single character that could
4562 else if (folds_to_count != 1) {
4563 /* If there aren't exactly two folds to this (itself and
4564 * another), it is outside the scope of this function */
4565 use_chrtest_void = TRUE;
4567 else { /* There are two. We already have one, get the other */
4568 c2 = first_folds_to;
4570 /* Folds that cross the 255/256 boundary are forbidden if
4571 * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is
4572 * ASCIII. The only other match to c1 is c2, and since c1
4573 * is above 255, c2 better be as well under these
4574 * circumstances. If it isn't, it means the only legal
4575 * match of c1 is itself. */
4577 && ( ( OP(text_node) == EXACTFL
4578 && ! IN_UTF8_CTYPE_LOCALE)
4579 || (( OP(text_node) == EXACTFAA
4580 || OP(text_node) == EXACTFAA_NO_TRIE)
4581 && (isASCII(c1) || isASCII(c2)))))
4587 else /* Here, c1 is <= 255 */
4589 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4590 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4591 && ((OP(text_node) != EXACTFAA
4592 && OP(text_node) != EXACTFAA_NO_TRIE)
4595 /* Here, there could be something above Latin1 in the target
4596 * which folds to this character in the pattern. All such
4597 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4598 * than two characters involved in their folds, so are outside
4599 * the scope of this function */
4600 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4601 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4604 use_chrtest_void = TRUE;
4607 else { /* Here nothing above Latin1 can fold to the pattern
4609 switch (OP(text_node)) {
4611 case EXACTFL: /* /l rules */
4612 c2 = PL_fold_locale[c1];
4615 case EXACTF: /* This node only generated for non-utf8
4617 assert(! is_utf8_pat);
4618 if (! utf8_target) { /* /d rules */
4623 /* /u rules for all these. This happens to work for
4624 * EXACTFAA as nothing in Latin1 folds to ASCII */
4625 case EXACTFAA_NO_TRIE: /* This node only generated for
4626 non-utf8 patterns */
4627 assert(! is_utf8_pat);
4632 c2 = PL_fold_latin1[c1];
4636 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4637 NOT_REACHED; /* NOTREACHED */
4643 /* Here have figured things out. Set up the returns */
4644 if (use_chrtest_void) {
4645 *c2p = *c1p = CHRTEST_VOID;
4647 else if (utf8_target) {
4648 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
4649 uvchr_to_utf8(c1_utf8, c1);
4650 uvchr_to_utf8(c2_utf8, c2);
4653 /* Invariants are stored in both the utf8 and byte outputs; Use
4654 * negative numbers otherwise for the byte ones. Make sure that the
4655 * byte ones are the same iff the utf8 ones are the same */
4656 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4657 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4660 ? CHRTEST_NOT_A_CP_1
4661 : CHRTEST_NOT_A_CP_2;
4663 else if (c1 > 255) {
4664 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
4669 *c1p = *c2p = c2; /* c2 is the only representable value */
4671 else { /* c1 is representable; see about c2 */
4673 *c2p = (c2 < 256) ? c2 : c1;
4680 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
4682 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4683 * between the inputs. See http://www.unicode.org/reports/tr29/. */
4685 PERL_ARGS_ASSERT_ISGCB;
4687 switch (GCB_table[before][after]) {
4694 case GCB_RI_then_RI:
4697 U8 * temp_pos = (U8 *) curpos;
4699 /* Do not break within emoji flag sequences. That is, do not
4700 * break between regional indicator (RI) symbols if there is an
4701 * odd number of RI characters before the break point.
4702 * GB12 sot (RI RI)* RI × RI
4703 * GB13 [^RI] (RI RI)* RI × RI */
4705 while (backup_one_GCB(strbeg,
4707 utf8_target) == GCB_Regional_Indicator)
4712 return RI_count % 2 != 1;
4715 case GCB_EX_then_EM:
4717 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
4719 U8 * temp_pos = (U8 *) curpos;
4723 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4725 while (prev == GCB_Extend);
4727 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4730 case GCB_Maybe_Emoji_NonBreak:
4734 /* Do not break within emoji modifier sequences or emoji zwj sequences.
4735 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
4737 U8 * temp_pos = (U8 *) curpos;
4741 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4743 while (prev == GCB_Extend);
4745 return prev != GCB_XPG_XX;
4753 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4754 before, after, GCB_table[before][after]);
4761 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4765 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4767 if (*curpos < strbeg) {
4772 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4773 U8 * prev_prev_char_pos;
4775 if (! prev_char_pos) {
4779 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4780 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4781 *curpos = prev_char_pos;
4782 prev_char_pos = prev_prev_char_pos;
4785 *curpos = (U8 *) strbeg;
4790 if (*curpos - 2 < strbeg) {
4791 *curpos = (U8 *) strbeg;
4795 gcb = getGCB_VAL_CP(*(*curpos - 1));
4801 /* Combining marks attach to most classes that precede them, but this defines
4802 * the exceptions (from TR14) */
4803 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
4804 || prev == LB_Mandatory_Break \
4805 || prev == LB_Carriage_Return \
4806 || prev == LB_Line_Feed \
4807 || prev == LB_Next_Line \
4808 || prev == LB_Space \
4809 || prev == LB_ZWSpace))
4812 S_isLB(pTHX_ LB_enum before,
4814 const U8 * const strbeg,
4815 const U8 * const curpos,
4816 const U8 * const strend,
4817 const bool utf8_target)
4819 U8 * temp_pos = (U8 *) curpos;
4820 LB_enum prev = before;
4822 /* Is the boundary between 'before' and 'after' line-breakable?
4823 * Most of this is just a table lookup of a generated table from Unicode
4824 * rules. But some rules require context to decide, and so have to be
4825 * implemented in code */
4827 PERL_ARGS_ASSERT_ISLB;
4829 /* Rule numbers in the comments below are as of Unicode 9.0 */
4833 switch (LB_table[before][after]) {
4838 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4841 case LB_SP_foo + LB_BREAKABLE:
4842 case LB_SP_foo + LB_NOBREAK:
4843 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4845 /* When we have something following a SP, we have to look at the
4846 * context in order to know what to do.
4848 * SP SP should not reach here because LB7: Do not break before
4849 * spaces. (For two spaces in a row there is nothing that
4850 * overrides that) */
4851 assert(after != LB_Space);
4853 /* Here we have a space followed by a non-space. Mostly this is a
4854 * case of LB18: "Break after spaces". But there are complications
4855 * as the handling of spaces is somewhat tricky. They are in a
4856 * number of rules, which have to be applied in priority order, but
4857 * something earlier in the string can cause a rule to be skipped
4858 * and a lower priority rule invoked. A prime example is LB7 which
4859 * says don't break before a space. But rule LB8 (lower priority)
4860 * says that the first break opportunity after a ZW is after any
4861 * span of spaces immediately after it. If a ZW comes before a SP
4862 * in the input, rule LB8 applies, and not LB7. Other such rules
4863 * involve combining marks which are rules 9 and 10, but they may
4864 * override higher priority rules if they come earlier in the
4865 * string. Since we're doing random access into the middle of the
4866 * string, we have to look for rules that should get applied based
4867 * on both string position and priority. Combining marks do not
4868 * attach to either ZW nor SP, so we don't have to consider them
4871 * To check for LB8, we have to find the first non-space character
4872 * before this span of spaces */
4874 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4876 while (prev == LB_Space);
4878 /* LB8 Break before any character following a zero-width space,
4879 * even if one or more spaces intervene.
4881 * So if we have a ZW just before this span, and to get here this
4882 * is the final space in the span. */
4883 if (prev == LB_ZWSpace) {
4887 /* Here, not ZW SP+. There are several rules that have higher
4888 * priority than LB18 and can be resolved now, as they don't depend
4889 * on anything earlier in the string (except ZW, which we have
4890 * already handled). One of these rules is LB11 Do not break
4891 * before Word joiner, but we have specially encoded that in the
4892 * lookup table so it is caught by the single test below which
4893 * catches the other ones. */
4894 if (LB_table[LB_Space][after] - LB_SP_foo
4895 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4900 /* If we get here, we have to XXX consider combining marks. */
4901 if (prev == LB_Combining_Mark) {
4903 /* What happens with these depends on the character they
4906 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4908 while (prev == LB_Combining_Mark);
4910 /* Most times these attach to and inherit the characteristics
4911 * of that character, but not always, and when not, they are to
4912 * be treated as AL by rule LB10. */
4913 if (! LB_CM_ATTACHES_TO(prev)) {
4914 prev = LB_Alphabetic;
4918 /* Here, we have the character preceding the span of spaces all set
4919 * up. We follow LB18: "Break after spaces" unless the table shows
4920 * that is overriden */
4921 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4925 /* We don't know how to treat the CM except by looking at the first
4926 * non-CM character preceding it. ZWJ is treated as CM */
4928 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4930 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
4932 /* Here, 'prev' is that first earlier non-CM character. If the CM
4933 * attatches to it, then it inherits the behavior of 'prev'. If it
4934 * doesn't attach, it is to be treated as an AL */
4935 if (! LB_CM_ATTACHES_TO(prev)) {
4936 prev = LB_Alphabetic;
4941 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4942 case LB_HY_or_BA_then_foo + LB_NOBREAK:
4944 /* LB21a Don't break after Hebrew + Hyphen.
4945 * HL (HY | BA) × */
4947 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4948 == LB_Hebrew_Letter)
4953 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4955 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4956 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4958 /* LB25a (PR | PO) × ( OP | HY )? NU */
4959 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4963 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4966 case LB_SY_or_IS_then_various + LB_BREAKABLE:
4967 case LB_SY_or_IS_then_various + LB_NOBREAK:
4969 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4971 LB_enum temp = prev;
4973 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4975 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4976 if (temp == LB_Numeric) {
4980 return LB_table[prev][after] - LB_SY_or_IS_then_various
4984 case LB_various_then_PO_or_PR + LB_BREAKABLE:
4985 case LB_various_then_PO_or_PR + LB_NOBREAK:
4987 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4989 LB_enum temp = prev;
4990 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4992 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4994 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4995 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4997 if (temp == LB_Numeric) {
5000 return LB_various_then_PO_or_PR;
5003 case LB_RI_then_RI + LB_NOBREAK:
5004 case LB_RI_then_RI + LB_BREAKABLE:
5008 /* LB30a Break between two regional indicator symbols if and
5009 * only if there are an even number of regional indicators
5010 * preceding the position of the break.
5012 * sot (RI RI)* RI × RI
5013 * [^RI] (RI RI)* RI × RI */
5015 while (backup_one_LB(strbeg,
5017 utf8_target) == LB_Regional_Indicator)
5022 return RI_count % 2 == 0;
5030 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5031 before, after, LB_table[before][after]);
5038 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5042 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5044 if (*curpos >= strend) {
5049 *curpos += UTF8SKIP(*curpos);
5050 if (*curpos >= strend) {
5053 lb = getLB_VAL_UTF8(*curpos, strend);
5057 if (*curpos >= strend) {
5060 lb = getLB_VAL_CP(**curpos);
5067 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5071 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5073 if (*curpos < strbeg) {
5078 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5079 U8 * prev_prev_char_pos;
5081 if (! prev_char_pos) {
5085 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5086 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5087 *curpos = prev_char_pos;
5088 prev_char_pos = prev_prev_char_pos;
5091 *curpos = (U8 *) strbeg;
5096 if (*curpos - 2 < strbeg) {
5097 *curpos = (U8 *) strbeg;
5101 lb = getLB_VAL_CP(*(*curpos - 1));
5108 S_isSB(pTHX_ SB_enum before,
5110 const U8 * const strbeg,
5111 const U8 * const curpos,
5112 const U8 * const strend,
5113 const bool utf8_target)
5115 /* returns a boolean indicating if there is a Sentence Boundary Break
5116 * between the inputs. See http://www.unicode.org/reports/tr29/ */
5118 U8 * lpos = (U8 *) curpos;
5119 bool has_para_sep = FALSE;
5120 bool has_sp = FALSE;
5122 PERL_ARGS_ASSERT_ISSB;
5124 /* Break at the start and end of text.
5127 But unstated in Unicode is don't break if the text is empty */
5128 if (before == SB_EDGE || after == SB_EDGE) {
5129 return before != after;
5132 /* SB 3: Do not break within CRLF. */
5133 if (before == SB_CR && after == SB_LF) {
5137 /* Break after paragraph separators. CR and LF are considered
5138 * so because Unicode views text as like word processing text where there
5139 * are no newlines except between paragraphs, and the word processor takes
5140 * care of wrapping without there being hard line-breaks in the text *./
5141 SB4. Sep | CR | LF ÷ */
5142 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5146 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5147 * (See Section 6.2, Replacing Ignore Rules.)
5148 SB5. X (Extend | Format)* → X */
5149 if (after == SB_Extend || after == SB_Format) {
5151 /* Implied is that the these characters attach to everything
5152 * immediately prior to them except for those separator-type
5153 * characters. And the rules earlier have already handled the case
5154 * when one of those immediately precedes the extend char */
5158 if (before == SB_Extend || before == SB_Format) {
5159 U8 * temp_pos = lpos;
5160 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5161 if ( backup != SB_EDGE
5170 /* Here, both 'before' and 'backup' are these types; implied is that we
5171 * don't break between them */
5172 if (backup == SB_Extend || backup == SB_Format) {
5177 /* Do not break after ambiguous terminators like period, if they are
5178 * immediately followed by a number or lowercase letter, if they are
5179 * between uppercase letters, if the first following letter (optionally
5180 * after certain punctuation) is lowercase, or if they are followed by
5181 * "continuation" punctuation such as comma, colon, or semicolon. For
5182 * example, a period may be an abbreviation or numeric period, and thus may
5183 * not mark the end of a sentence.
5185 * SB6. ATerm × Numeric */
5186 if (before == SB_ATerm && after == SB_Numeric) {
5190 /* SB7. (Upper | Lower) ATerm × Upper */
5191 if (before == SB_ATerm && after == SB_Upper) {
5192 U8 * temp_pos = lpos;
5193 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5194 if (backup == SB_Upper || backup == SB_Lower) {
5199 /* The remaining rules that aren't the final one, all require an STerm or
5200 * an ATerm after having backed up over some Close* Sp*, and in one case an
5201 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5202 * So do that backup now, setting flags if either Sp or a paragraph
5203 * separator are found */
5205 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5206 has_para_sep = TRUE;
5207 before = backup_one_SB(strbeg, &lpos, utf8_target);
5210 if (before == SB_Sp) {
5213 before = backup_one_SB(strbeg, &lpos, utf8_target);
5215 while (before == SB_Sp);
5218 while (before == SB_Close) {
5219 before = backup_one_SB(strbeg, &lpos, utf8_target);
5222 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5223 * most cases an STerm */
5224 if (before == SB_STerm || before == SB_ATerm) {
5226 /* So, here the lhs matches
5227 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5228 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5229 * The rules that apply here are:
5231 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5232 | LF | STerm | ATerm) )* Lower
5233 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5234 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5235 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5236 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5239 /* And all but SB11 forbid having seen a paragraph separator */
5240 if (! has_para_sep) {
5241 if (before == SB_ATerm) { /* SB8 */
5242 U8 * rpos = (U8 *) curpos;
5243 SB_enum later = after;
5245 while ( later != SB_OLetter
5246 && later != SB_Upper
5247 && later != SB_Lower
5251 && later != SB_STerm
5252 && later != SB_ATerm
5253 && later != SB_EDGE)
5255 later = advance_one_SB(&rpos, strend, utf8_target);
5257 if (later == SB_Lower) {
5262 if ( after == SB_SContinue /* SB8a */
5263 || after == SB_STerm
5264 || after == SB_ATerm)
5269 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5270 if ( after == SB_Close
5280 /* SB10. This and SB9 could probably be combined some way, but khw
5281 * has decided to follow the Unicode rule book precisely for
5282 * simplified maintenance */
5296 /* Otherwise, do not break.
5303 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5307 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5309 if (*curpos >= strend) {
5315 *curpos += UTF8SKIP(*curpos);
5316 if (*curpos >= strend) {
5319 sb = getSB_VAL_UTF8(*curpos, strend);
5320 } while (sb == SB_Extend || sb == SB_Format);
5325 if (*curpos >= strend) {
5328 sb = getSB_VAL_CP(**curpos);
5329 } while (sb == SB_Extend || sb == SB_Format);
5336 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5340 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5342 if (*curpos < strbeg) {
5347 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5348 if (! prev_char_pos) {
5352 /* Back up over Extend and Format. curpos is always just to the right
5353 * of the characater whose value we are getting */
5355 U8 * prev_prev_char_pos;
5356 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5359 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5360 *curpos = prev_char_pos;
5361 prev_char_pos = prev_prev_char_pos;
5364 *curpos = (U8 *) strbeg;
5367 } while (sb == SB_Extend || sb == SB_Format);
5371 if (*curpos - 2 < strbeg) {
5372 *curpos = (U8 *) strbeg;
5376 sb = getSB_VAL_CP(*(*curpos - 1));
5377 } while (sb == SB_Extend || sb == SB_Format);
5384 S_isWB(pTHX_ WB_enum previous,
5387 const U8 * const strbeg,
5388 const U8 * const curpos,
5389 const U8 * const strend,
5390 const bool utf8_target)
5392 /* Return a boolean as to if the boundary between 'before' and 'after' is
5393 * a Unicode word break, using their published algorithm, but tailored for
5394 * Perl by treating spans of white space as one unit. Context may be
5395 * needed to make this determination. If the value for the character
5396 * before 'before' is known, it is passed as 'previous'; otherwise that
5397 * should be set to WB_UNKNOWN. The other input parameters give the
5398 * boundaries and current position in the matching of the string. That
5399 * is, 'curpos' marks the position where the character whose wb value is
5400 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5402 U8 * before_pos = (U8 *) curpos;
5403 U8 * after_pos = (U8 *) curpos;
5404 WB_enum prev = before;
5407 PERL_ARGS_ASSERT_ISWB;
5409 /* Rule numbers in the comments below are as of Unicode 9.0 */
5413 switch (WB_table[before][after]) {
5420 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5421 next = advance_one_WB(&after_pos, strend, utf8_target,
5422 FALSE /* Don't skip Extend nor Format */ );
5423 /* A space immediately preceeding an Extend or Format is attached
5424 * to by them, and hence gets separated from previous spaces.
5425 * Otherwise don't break between horizontal white space */
5426 return next == WB_Extend || next == WB_Format;
5428 /* WB4 Ignore Format and Extend characters, except when they appear at
5429 * the beginning of a region of text. This code currently isn't
5430 * general purpose, but it works as the rules are currently and likely
5431 * to be laid out. The reason it works is that when 'they appear at
5432 * the beginning of a region of text', the rule is to break before
5433 * them, just like any other character. Therefore, the default rule
5434 * applies and we don't have to look in more depth. Should this ever
5435 * change, we would have to have 2 'case' statements, like in the rules
5436 * below, and backup a single character (not spacing over the extend
5437 * ones) and then see if that is one of the region-end characters and
5439 case WB_Ex_or_FO_or_ZWJ_then_foo:
5440 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5443 case WB_DQ_then_HL + WB_BREAKABLE:
5444 case WB_DQ_then_HL + WB_NOBREAK:
5446 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5448 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5449 == WB_Hebrew_Letter)
5454 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5456 case WB_HL_then_DQ + WB_BREAKABLE:
5457 case WB_HL_then_DQ + WB_NOBREAK:
5459 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5461 if (advance_one_WB(&after_pos, strend, utf8_target,
5462 TRUE /* Do skip Extend and Format */ )
5463 == WB_Hebrew_Letter)
5468 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5470 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5471 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5473 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5474 * | Single_Quote) (ALetter | Hebrew_Letter) */
5476 next = advance_one_WB(&after_pos, strend, utf8_target,
5477 TRUE /* Do skip Extend and Format */ );
5479 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5484 return WB_table[before][after]
5485 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5487 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5488 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5490 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5491 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5493 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5494 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5499 return WB_table[before][after]
5500 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5502 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5503 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5505 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5508 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5514 return WB_table[before][after]
5515 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5517 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5518 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5520 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5522 if (advance_one_WB(&after_pos, strend, utf8_target,
5523 TRUE /* Do skip Extend and Format */ )
5529 return WB_table[before][after]
5530 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5532 case WB_RI_then_RI + WB_NOBREAK:
5533 case WB_RI_then_RI + WB_BREAKABLE:
5537 /* Do not break within emoji flag sequences. That is, do not
5538 * break between regional indicator (RI) symbols if there is an
5539 * odd number of RI characters before the potential break
5542 * WB15 sot (RI RI)* RI × RI
5543 * WB16 [^RI] (RI RI)* RI × RI */
5545 while (backup_one_WB(&previous,
5548 utf8_target) == WB_Regional_Indicator)
5553 return RI_count % 2 != 1;
5561 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5562 before, after, WB_table[before][after]);
5569 S_advance_one_WB(pTHX_ U8 ** curpos,
5570 const U8 * const strend,
5571 const bool utf8_target,
5572 const bool skip_Extend_Format)
5576 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5578 if (*curpos >= strend) {
5584 /* Advance over Extend and Format */
5586 *curpos += UTF8SKIP(*curpos);
5587 if (*curpos >= strend) {
5590 wb = getWB_VAL_UTF8(*curpos, strend);
5591 } while ( skip_Extend_Format
5592 && (wb == WB_Extend || wb == WB_Format));
5597 if (*curpos >= strend) {
5600 wb = getWB_VAL_CP(**curpos);
5601 } while ( skip_Extend_Format
5602 && (wb == WB_Extend || wb == WB_Format));
5609 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5613 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5615 /* If we know what the previous character's break value is, don't have
5617 if (*previous != WB_UNKNOWN) {
5620 /* But we need to move backwards by one */
5622 *curpos = reghopmaybe3(*curpos, -1, strbeg);
5624 *previous = WB_EDGE;
5625 *curpos = (U8 *) strbeg;
5628 *previous = WB_UNKNOWN;
5633 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5636 /* And we always back up over these three types */
5637 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
5642 if (*curpos < strbeg) {
5647 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5648 if (! prev_char_pos) {
5652 /* Back up over Extend and Format. curpos is always just to the right
5653 * of the characater whose value we are getting */
5655 U8 * prev_prev_char_pos;
5656 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5660 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5661 *curpos = prev_char_pos;
5662 prev_char_pos = prev_prev_char_pos;
5665 *curpos = (U8 *) strbeg;
5668 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
5672 if (*curpos - 2 < strbeg) {
5673 *curpos = (U8 *) strbeg;
5677 wb = getWB_VAL_CP(*(*curpos - 1));
5678 } while (wb == WB_Extend || wb == WB_Format);
5684 #define EVAL_CLOSE_PAREN_IS(st,expr) \
5687 ( ( st )->u.eval.close_paren ) && \
5688 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5691 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
5694 ( ( st )->u.eval.close_paren ) && \
5696 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5700 #define EVAL_CLOSE_PAREN_SET(st,expr) \
5701 (st)->u.eval.close_paren = ( (expr) + 1 )
5703 #define EVAL_CLOSE_PAREN_CLEAR(st) \
5704 (st)->u.eval.close_paren = 0
5706 /* returns -1 on failure, $+[0] on success */
5708 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
5711 const bool utf8_target = reginfo->is_utf8_target;
5712 const U32 uniflags = UTF8_ALLOW_DEFAULT;
5713 REGEXP *rex_sv = reginfo->prog;
5714 regexp *rex = ReANY(rex_sv);
5715 RXi_GET_DECL(rex,rexi);
5716 /* the current state. This is a cached copy of PL_regmatch_state */
5718 /* cache heavy used fields of st in registers */
5721 U32 n = 0; /* general value; init to avoid compiler warning */
5722 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
5723 SSize_t endref = 0; /* offset of end of backref when ln is start */
5724 char *locinput = startpos;
5725 char *pushinput; /* where to continue after a PUSH */
5726 I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */
5728 bool result = 0; /* return value of S_regmatch */
5729 U32 depth = 0; /* depth of backtrack stack */
5730 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5731 const U32 max_nochange_depth =
5732 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5733 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
5734 regmatch_state *yes_state = NULL; /* state to pop to on success of
5736 /* mark_state piggy backs on the yes_state logic so that when we unwind
5737 the stack on success we can update the mark_state as we go */
5738 regmatch_state *mark_state = NULL; /* last mark state we have seen */
5739 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
5740 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
5742 bool no_final = 0; /* prevent failure from backtracking? */
5743 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
5744 char *startpoint = locinput;
5745 SV *popmark = NULL; /* are we looking for a mark? */
5746 SV *sv_commit = NULL; /* last mark name seen in failure */
5747 SV *sv_yes_mark = NULL; /* last mark name we have seen
5748 during a successful match */
5749 U32 lastopen = 0; /* last open we saw */
5750 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
5751 SV* const oreplsv = GvSVn(PL_replgv);
5752 /* these three flags are set by various ops to signal information to
5753 * the very next op. They have a useful lifetime of exactly one loop
5754 * iteration, and are not preserved or restored by state pushes/pops
5756 bool sw = 0; /* the condition value in (?(cond)a|b) */
5757 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
5758 int logical = 0; /* the following EVAL is:
5762 or the following IFMATCH/UNLESSM is:
5763 false: plain (?=foo)
5764 true: used as a condition: (?(?=foo))
5766 PAD* last_pad = NULL;
5768 U8 gimme = G_SCALAR;
5769 CV *caller_cv = NULL; /* who called us */
5770 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
5771 U32 maxopenparen = 0; /* max '(' index seen so far */
5772 int to_complement; /* Invert the result? */
5773 _char_class_number classnum;
5774 bool is_utf8_pat = reginfo->is_utf8_pat;
5776 I32 orig_savestack_ix = PL_savestack_ix;
5777 U8 * script_run_begin = NULL;
5779 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5780 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5781 # define SOLARIS_BAD_OPTIMIZER
5782 const U32 *pl_charclass_dup = PL_charclass;
5783 # define PL_charclass pl_charclass_dup
5787 GET_RE_DEBUG_FLAGS_DECL;
5790 /* protect against undef(*^R) */
5791 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5793 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5794 multicall_oldcatch = 0;
5795 PERL_UNUSED_VAR(multicall_cop);
5797 PERL_ARGS_ASSERT_REGMATCH;
5799 st = PL_regmatch_state;
5801 /* Note that nextchr is a byte even in UTF */
5805 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5806 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5807 Perl_re_printf( aTHX_ "regmatch start\n" );
5810 while (scan != NULL) {
5811 next = scan + NEXT_OFF(scan);
5814 state_num = OP(scan);
5818 if (state_num <= REGNODE_MAX) {
5819 SV * const prop = sv_newmortal();
5820 regnode *rnext = regnext(scan);
5822 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5823 regprop(rex, prop, scan, reginfo, NULL);
5824 Perl_re_printf( aTHX_
5825 "%*s%" IVdf ":%s(%" IVdf ")\n",
5826 INDENT_CHARS(depth), "",
5827 (IV)(scan - rexi->program),
5829 (PL_regkind[OP(scan)] == END || !rnext) ?
5830 0 : (IV)(rnext - rexi->program));
5837 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
5839 switch (state_num) {
5840 case SBOL: /* /^../ and /\A../ */
5841 if (locinput == reginfo->strbeg)
5845 case MBOL: /* /^../m */
5846 if (locinput == reginfo->strbeg ||
5847 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
5854 if (locinput == reginfo->ganch)
5858 case KEEPS: /* \K */
5859 /* update the startpoint */
5860 st->u.keeper.val = rex->offs[0].start;
5861 rex->offs[0].start = locinput - reginfo->strbeg;
5862 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
5863 NOT_REACHED; /* NOTREACHED */
5865 case KEEPS_next_fail:
5866 /* rollback the start point change */
5867 rex->offs[0].start = st->u.keeper.val;
5869 NOT_REACHED; /* NOTREACHED */
5871 case MEOL: /* /..$/m */
5872 if (!NEXTCHR_IS_EOS && nextchr != '\n')
5876 case SEOL: /* /..$/ */
5877 if (!NEXTCHR_IS_EOS && nextchr != '\n')
5879 if (reginfo->strend - locinput > 1)
5884 if (!NEXTCHR_IS_EOS)
5888 case SANY: /* /./s */
5891 goto increment_locinput;
5893 case REG_ANY: /* /./ */
5894 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5896 goto increment_locinput;
5900 #define ST st->u.trie
5901 case TRIEC: /* (ab|cd) with known charclass */
5902 /* In this case the charclass data is available inline so
5903 we can fail fast without a lot of extra overhead.
5905 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5907 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
5908 depth, PL_colors[4], PL_colors[5])
5911 NOT_REACHED; /* NOTREACHED */
5914 case TRIE: /* (ab|cd) */
5915 /* the basic plan of execution of the trie is:
5916 * At the beginning, run though all the states, and
5917 * find the longest-matching word. Also remember the position
5918 * of the shortest matching word. For example, this pattern:
5921 * when matched against the string "abcde", will generate
5922 * accept states for all words except 3, with the longest
5923 * matching word being 4, and the shortest being 2 (with
5924 * the position being after char 1 of the string).
5926 * Then for each matching word, in word order (i.e. 1,2,4,5),
5927 * we run the remainder of the pattern; on each try setting
5928 * the current position to the character following the word,
5929 * returning to try the next word on failure.
5931 * We avoid having to build a list of words at runtime by
5932 * using a compile-time structure, wordinfo[].prev, which
5933 * gives, for each word, the previous accepting word (if any).
5934 * In the case above it would contain the mappings 1->2, 2->0,
5935 * 3->0, 4->5, 5->1. We can use this table to generate, from
5936 * the longest word (4 above), a list of all words, by
5937 * following the list of prev pointers; this gives us the
5938 * unordered list 4,5,1,2. Then given the current word we have
5939 * just tried, we can go through the list and find the
5940 * next-biggest word to try (so if we just failed on word 2,
5941 * the next in the list is 4).
5943 * Since at runtime we don't record the matching position in
5944 * the string for each word, we have to work that out for
5945 * each word we're about to process. The wordinfo table holds
5946 * the character length of each word; given that we recorded
5947 * at the start: the position of the shortest word and its
5948 * length in chars, we just need to move the pointer the
5949 * difference between the two char lengths. Depending on
5950 * Unicode status and folding, that's cheap or expensive.
5952 * This algorithm is optimised for the case where are only a
5953 * small number of accept states, i.e. 0,1, or maybe 2.
5954 * With lots of accepts states, and having to try all of them,
5955 * it becomes quadratic on number of accept states to find all
5960 /* what type of TRIE am I? (utf8 makes this contextual) */
5961 DECL_TRIE_TYPE(scan);
5963 /* what trie are we using right now */
5964 reg_trie_data * const trie
5965 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5966 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5967 U32 state = trie->startstate;
5969 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5970 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5973 && UTF8_IS_ABOVE_LATIN1(nextchr)
5974 && scan->flags == EXACTL)
5976 /* We only output for EXACTL, as we let the folder
5977 * output this message for EXACTFLU8 to avoid
5979 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5984 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5986 if (trie->states[ state ].wordnum) {
5988 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
5989 depth, PL_colors[4], PL_colors[5])
5995 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
5996 depth, PL_colors[4], PL_colors[5])
6003 U8 *uc = ( U8* )locinput;
6007 U8 *uscan = (U8*)NULL;
6008 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6009 U32 charcount = 0; /* how many input chars we have matched */
6010 U32 accepted = 0; /* have we seen any accepting states? */
6012 ST.jump = trie->jump;
6015 ST.longfold = FALSE; /* char longer if folded => it's harder */
6018 /* fully traverse the TRIE; note the position of the
6019 shortest accept state and the wordnum of the longest
6022 while ( state && uc <= (U8*)(reginfo->strend) ) {
6023 U32 base = trie->states[ state ].trans.base;
6027 wordnum = trie->states[ state ].wordnum;
6029 if (wordnum) { /* it's an accept state */
6032 /* record first match position */
6034 ST.firstpos = (U8*)locinput;
6039 ST.firstchars = charcount;
6042 if (!ST.nextword || wordnum < ST.nextword)
6043 ST.nextword = wordnum;
6044 ST.topword = wordnum;
6047 DEBUG_TRIE_EXECUTE_r({
6048 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6050 PerlIO_printf( Perl_debug_log,
6051 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6052 INDENT_CHARS(depth), "", PL_colors[4],
6053 (UV)state, (accepted ? 'Y' : 'N'));
6056 /* read a char and goto next state */
6057 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6059 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6060 (U8 *) reginfo->strend, uscan,
6061 len, uvc, charid, foldlen,
6068 base + charid - 1 - trie->uniquecharcount)) >= 0)
6070 && ((U32)offset < trie->lasttrans)
6071 && trie->trans[offset].check == state)
6073 state = trie->trans[offset].next;
6084 DEBUG_TRIE_EXECUTE_r(
6085 Perl_re_printf( aTHX_
6086 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6087 charid, uvc, (UV)state, PL_colors[5] );
6093 /* calculate total number of accept states */
6098 w = trie->wordinfo[w].prev;
6101 ST.accepted = accepted;
6105 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6107 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6109 goto trie_first_try; /* jump into the fail handler */
6111 NOT_REACHED; /* NOTREACHED */
6113 case TRIE_next_fail: /* we failed - try next alternative */
6117 /* undo any captures done in the tail part of a branch,
6119 * /(?:X(.)(.)|Y(.)).../
6120 * where the trie just matches X then calls out to do the
6121 * rest of the branch */
6122 REGCP_UNWIND(ST.cp);
6123 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6125 if (!--ST.accepted) {
6127 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6135 /* Find next-highest word to process. Note that this code
6136 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6139 U16 const nextword = ST.nextword;
6140 reg_trie_wordinfo * const wordinfo
6141 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6142 for (word=ST.topword; word; word=wordinfo[word].prev) {
6143 if (word > nextword && (!min || word < min))
6156 ST.lastparen = rex->lastparen;
6157 ST.lastcloseparen = rex->lastcloseparen;
6161 /* find start char of end of current word */
6163 U32 chars; /* how many chars to skip */
6164 reg_trie_data * const trie
6165 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6167 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6169 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6174 /* the hard option - fold each char in turn and find
6175 * its folded length (which may be different */
6176 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6184 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6192 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6197 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6213 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6214 ? ST.jump[ST.nextword]
6218 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6226 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6227 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
6228 NOT_REACHED; /* NOTREACHED */
6230 /* only one choice left - just continue */
6232 AV *const trie_words
6233 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6234 SV ** const tmp = trie_words
6235 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6236 SV *sv= tmp ? sv_newmortal() : NULL;
6238 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6239 depth, PL_colors[4],
6241 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6242 PL_colors[0], PL_colors[1],
6243 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6245 : "not compiled under -Dr",
6249 locinput = (char*)uc;
6250 continue; /* execute rest of RE */
6255 case EXACTL: /* /abc/l */
6256 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6258 /* Complete checking would involve going through every character
6259 * matched by the string to see if any is above latin1. But the
6260 * comparision otherwise might very well be a fast assembly
6261 * language routine, and I (khw) don't think slowing things down
6262 * just to check for this warning is worth it. So this just checks
6263 * the first character */
6264 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6265 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6268 case EXACT: { /* /abc/ */
6269 char *s = STRING(scan);
6271 if (utf8_target != is_utf8_pat) {
6272 /* The target and the pattern have differing utf8ness. */
6274 const char * const e = s + ln;
6277 /* The target is utf8, the pattern is not utf8.
6278 * Above-Latin1 code points can't match the pattern;
6279 * invariants match exactly, and the other Latin1 ones need
6280 * to be downgraded to a single byte in order to do the
6281 * comparison. (If we could be confident that the target
6282 * is not malformed, this could be refactored to have fewer
6283 * tests by just assuming that if the first bytes match, it
6284 * is an invariant, but there are tests in the test suite
6285 * dealing with (??{...}) which violate this) */
6287 if (l >= reginfo->strend
6288 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6292 if (UTF8_IS_INVARIANT(*(U8*)l)) {
6299 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6309 /* The target is not utf8, the pattern is utf8. */
6311 if (l >= reginfo->strend
6312 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6316 if (UTF8_IS_INVARIANT(*(U8*)s)) {
6323 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6335 /* The target and the pattern have the same utf8ness. */
6336 /* Inline the first character, for speed. */
6337 if (reginfo->strend - locinput < ln
6338 || UCHARAT(s) != nextchr
6339 || (ln > 1 && memNE(s, locinput, ln)))
6348 case EXACTFL: { /* /abc/il */
6350 const U8 * fold_array;
6352 U32 fold_utf8_flags;
6354 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6355 folder = foldEQ_locale;
6356 fold_array = PL_fold_locale;
6357 fold_utf8_flags = FOLDEQ_LOCALE;
6360 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
6361 is effectively /u; hence to match, target
6363 if (! utf8_target) {
6366 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
6367 | FOLDEQ_S1_FOLDS_SANE;
6368 folder = foldEQ_latin1;
6369 fold_array = PL_fold_latin1;
6372 case EXACTFU_SS: /* /\x{df}/iu */
6373 case EXACTFU: /* /abc/iu */
6374 folder = foldEQ_latin1;
6375 fold_array = PL_fold_latin1;
6376 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
6379 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
6381 assert(! is_utf8_pat);
6383 case EXACTFAA: /* /abc/iaa */
6384 folder = foldEQ_latin1;
6385 fold_array = PL_fold_latin1;
6386 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6389 case EXACTF: /* /abc/i This node only generated for
6390 non-utf8 patterns */
6391 assert(! is_utf8_pat);
6393 fold_array = PL_fold;
6394 fold_utf8_flags = 0;
6402 || state_num == EXACTFU_SS
6403 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6405 /* Either target or the pattern are utf8, or has the issue where
6406 * the fold lengths may differ. */
6407 const char * const l = locinput;
6408 char *e = reginfo->strend;
6410 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
6411 l, &e, 0, utf8_target, fold_utf8_flags))
6419 /* Neither the target nor the pattern are utf8 */
6420 if (UCHARAT(s) != nextchr
6422 && UCHARAT(s) != fold_array[nextchr])
6426 if (reginfo->strend - locinput < ln)
6428 if (ln > 1 && ! folder(s, locinput, ln))
6434 case NBOUNDL: /* /\B/l */
6438 case BOUNDL: /* /\b/l */
6441 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6443 if (FLAGS(scan) != TRADITIONAL_BOUND) {
6444 if (! IN_UTF8_CTYPE_LOCALE) {
6445 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6446 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6452 if (locinput == reginfo->strbeg)
6453 b1 = isWORDCHAR_LC('\n');
6455 b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6456 (U8*)(reginfo->strbeg)),
6457 (U8*)(reginfo->strend));
6459 b2 = (NEXTCHR_IS_EOS)
6460 ? isWORDCHAR_LC('\n')
6461 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6462 (U8*) reginfo->strend);
6464 else { /* Here the string isn't utf8 */
6465 b1 = (locinput == reginfo->strbeg)
6466 ? isWORDCHAR_LC('\n')
6467 : isWORDCHAR_LC(UCHARAT(locinput - 1));
6468 b2 = (NEXTCHR_IS_EOS)
6469 ? isWORDCHAR_LC('\n')
6470 : isWORDCHAR_LC(nextchr);
6472 if (to_complement ^ (b1 == b2)) {
6478 case NBOUND: /* /\B/ */
6482 case BOUND: /* /\b/ */
6486 goto bound_ascii_match_only;
6488 case NBOUNDA: /* /\B/a */
6492 case BOUNDA: /* /\b/a */
6496 bound_ascii_match_only:
6497 /* Here the string isn't utf8, or is utf8 and only ascii characters
6498 * are to match \w. In the latter case looking at the byte just
6499 * prior to the current one may be just the final byte of a
6500 * multi-byte character. This is ok. There are two cases:
6501 * 1) it is a single byte character, and then the test is doing
6502 * just what it's supposed to.
6503 * 2) it is a multi-byte character, in which case the final byte is
6504 * never mistakable for ASCII, and so the test will say it is
6505 * not a word character, which is the correct answer. */
6506 b1 = (locinput == reginfo->strbeg)
6507 ? isWORDCHAR_A('\n')
6508 : isWORDCHAR_A(UCHARAT(locinput - 1));
6509 b2 = (NEXTCHR_IS_EOS)
6510 ? isWORDCHAR_A('\n')
6511 : isWORDCHAR_A(nextchr);
6512 if (to_complement ^ (b1 == b2)) {
6518 case NBOUNDU: /* /\B/u */
6522 case BOUNDU: /* /\b/u */
6525 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6528 else if (utf8_target) {
6530 switch((bound_type) FLAGS(scan)) {
6531 case TRADITIONAL_BOUND:
6534 b1 = (locinput == reginfo->strbeg)
6535 ? 0 /* isWORDCHAR_L1('\n') */
6536 : isWORDCHAR_utf8_safe(
6537 reghop3((U8*)locinput,
6539 (U8*)(reginfo->strbeg)),
6540 (U8*) reginfo->strend);
6541 b2 = (NEXTCHR_IS_EOS)
6542 ? 0 /* isWORDCHAR_L1('\n') */
6543 : isWORDCHAR_utf8_safe((U8*)locinput,
6544 (U8*) reginfo->strend);
6545 match = cBOOL(b1 != b2);
6549 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6550 match = TRUE; /* GCB always matches at begin and
6554 /* Find the gcb values of previous and current
6555 * chars, then see if is a break point */
6556 match = isGCB(getGCB_VAL_UTF8(
6557 reghop3((U8*)locinput,
6559 (U8*)(reginfo->strbeg)),
6560 (U8*) reginfo->strend),
6561 getGCB_VAL_UTF8((U8*) locinput,
6562 (U8*) reginfo->strend),
6563 (U8*) reginfo->strbeg,
6570 if (locinput == reginfo->strbeg) {
6573 else if (NEXTCHR_IS_EOS) {
6577 match = isLB(getLB_VAL_UTF8(
6578 reghop3((U8*)locinput,
6580 (U8*)(reginfo->strbeg)),
6581 (U8*) reginfo->strend),
6582 getLB_VAL_UTF8((U8*) locinput,
6583 (U8*) reginfo->strend),
6584 (U8*) reginfo->strbeg,
6586 (U8*) reginfo->strend,
6591 case SB_BOUND: /* Always matches at begin and end */
6592 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6596 match = isSB(getSB_VAL_UTF8(
6597 reghop3((U8*)locinput,
6599 (U8*)(reginfo->strbeg)),
6600 (U8*) reginfo->strend),
6601 getSB_VAL_UTF8((U8*) locinput,
6602 (U8*) reginfo->strend),
6603 (U8*) reginfo->strbeg,
6605 (U8*) reginfo->strend,
6611 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6615 match = isWB(WB_UNKNOWN,
6617 reghop3((U8*)locinput,
6619 (U8*)(reginfo->strbeg)),
6620 (U8*) reginfo->strend),
6621 getWB_VAL_UTF8((U8*) locinput,
6622 (U8*) reginfo->strend),
6623 (U8*) reginfo->strbeg,
6625 (U8*) reginfo->strend,
6631 else { /* Not utf8 target */
6632 switch((bound_type) FLAGS(scan)) {
6633 case TRADITIONAL_BOUND:
6636 b1 = (locinput == reginfo->strbeg)
6637 ? 0 /* isWORDCHAR_L1('\n') */
6638 : isWORDCHAR_L1(UCHARAT(locinput - 1));
6639 b2 = (NEXTCHR_IS_EOS)
6640 ? 0 /* isWORDCHAR_L1('\n') */
6641 : isWORDCHAR_L1(nextchr);
6642 match = cBOOL(b1 != b2);
6647 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6648 match = TRUE; /* GCB always matches at begin and
6651 else { /* Only CR-LF combo isn't a GCB in 0-255
6653 match = UCHARAT(locinput - 1) != '\r'
6654 || UCHARAT(locinput) != '\n';
6659 if (locinput == reginfo->strbeg) {
6662 else if (NEXTCHR_IS_EOS) {
6666 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6667 getLB_VAL_CP(UCHARAT(locinput)),
6668 (U8*) reginfo->strbeg,
6670 (U8*) reginfo->strend,
6675 case SB_BOUND: /* Always matches at begin and end */
6676 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6680 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6681 getSB_VAL_CP(UCHARAT(locinput)),
6682 (U8*) reginfo->strbeg,
6684 (U8*) reginfo->strend,
6690 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6694 match = isWB(WB_UNKNOWN,
6695 getWB_VAL_CP(UCHARAT(locinput -1)),
6696 getWB_VAL_CP(UCHARAT(locinput)),
6697 (U8*) reginfo->strbeg,
6699 (U8*) reginfo->strend,
6706 if (to_complement ^ ! match) {
6712 case ANYOFL: /* /[abc]/l */
6713 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6715 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
6717 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6720 case ANYOFD: /* /[abc]/d */
6721 case ANYOF: /* /[abc]/ */
6724 if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
6725 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6728 locinput += UTF8SKIP(locinput);
6731 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
6738 if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) {
6745 if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
6749 locinput++; /* ASCII is always single byte */
6753 if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
6757 goto increment_locinput;
6760 /* The argument (FLAGS) to all the POSIX node types is the class number
6763 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
6767 case POSIXL: /* \w or [:punct:] etc. under /l */
6768 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6772 /* Use isFOO_lc() for characters within Latin1. (Note that
6773 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6774 * wouldn't be invariant) */
6775 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6776 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
6784 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6785 /* An above Latin-1 code point, or malformed */
6786 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6788 goto utf8_posix_above_latin1;
6791 /* Here is a UTF-8 variant code point below 256 and the target is
6793 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6794 EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6795 *(locinput + 1))))))
6800 goto increment_locinput;
6802 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
6806 case POSIXD: /* \w or [:punct:] etc. under /d */
6812 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
6814 if (NEXTCHR_IS_EOS) {
6818 /* All UTF-8 variants match */
6819 if (! UTF8_IS_INVARIANT(nextchr)) {
6820 goto increment_locinput;
6826 case POSIXA: /* \w or [:punct:] etc. under /a */
6829 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6830 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6831 * character is a single byte */
6833 if (NEXTCHR_IS_EOS) {
6839 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6845 /* Here we are either not in utf8, or we matched a utf8-invariant,
6846 * so the next char is the next byte */
6850 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
6854 case POSIXU: /* \w or [:punct:] etc. under /u */
6856 if (NEXTCHR_IS_EOS) {
6860 /* Use _generic_isCC() for characters within Latin1. (Note that
6861 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6862 * wouldn't be invariant) */
6863 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6864 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6871 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6872 if (! (to_complement
6873 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6881 else { /* Handle above Latin-1 code points */
6882 utf8_posix_above_latin1:
6883 classnum = (_char_class_number) FLAGS(scan);
6886 if (! (to_complement
6887 ^ cBOOL(_invlist_contains_cp(
6888 PL_XPosix_ptrs[classnum],
6889 utf8_to_uvchr_buf((U8 *) locinput,
6890 (U8 *) reginfo->strend,
6896 case _CC_ENUM_SPACE:
6897 if (! (to_complement
6898 ^ cBOOL(is_XPERLSPACE_high(locinput))))
6903 case _CC_ENUM_BLANK:
6904 if (! (to_complement
6905 ^ cBOOL(is_HORIZWS_high(locinput))))
6910 case _CC_ENUM_XDIGIT:
6911 if (! (to_complement
6912 ^ cBOOL(is_XDIGIT_high(locinput))))
6917 case _CC_ENUM_VERTSPACE:
6918 if (! (to_complement
6919 ^ cBOOL(is_VERTWS_high(locinput))))
6924 case _CC_ENUM_CNTRL: /* These can't match above Latin1 */
6925 case _CC_ENUM_ASCII:
6926 if (! to_complement) {
6931 locinput += UTF8SKIP(locinput);
6935 case CLUMP: /* Match \X: logical Unicode character. This is defined as
6936 a Unicode extended Grapheme Cluster */
6939 if (! utf8_target) {
6941 /* Match either CR LF or '.', as all the other possibilities
6943 locinput++; /* Match the . or CR */
6944 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6946 && locinput < reginfo->strend
6947 && UCHARAT(locinput) == '\n')
6954 /* Get the gcb type for the current character */
6955 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
6956 (U8*) reginfo->strend);
6958 /* Then scan through the input until we get to the first
6959 * character whose type is supposed to be a gcb with the
6960 * current character. (There is always a break at the
6962 locinput += UTF8SKIP(locinput);
6963 while (locinput < reginfo->strend) {
6964 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
6965 (U8*) reginfo->strend);
6966 if (isGCB(prev_gcb, cur_gcb,
6967 (U8*) reginfo->strbeg, (U8*) locinput,
6974 locinput += UTF8SKIP(locinput);
6981 case NREFFL: /* /\g{name}/il */
6982 { /* The capture buffer cases. The ones beginning with N for the
6983 named buffers just convert to the equivalent numbered and
6984 pretend they were called as the corresponding numbered buffer
6986 /* don't initialize these in the declaration, it makes C++
6991 const U8 *fold_array;
6994 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6995 folder = foldEQ_locale;
6996 fold_array = PL_fold_locale;
6998 utf8_fold_flags = FOLDEQ_LOCALE;
7001 case NREFFA: /* /\g{name}/iaa */
7002 folder = foldEQ_latin1;
7003 fold_array = PL_fold_latin1;
7005 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7008 case NREFFU: /* /\g{name}/iu */
7009 folder = foldEQ_latin1;
7010 fold_array = PL_fold_latin1;
7012 utf8_fold_flags = 0;
7015 case NREFF: /* /\g{name}/i */
7017 fold_array = PL_fold;
7019 utf8_fold_flags = 0;
7022 case NREF: /* /\g{name}/ */
7026 utf8_fold_flags = 0;
7029 /* For the named back references, find the corresponding buffer
7031 n = reg_check_named_buff_matched(rex,scan);
7036 goto do_nref_ref_common;
7038 case REFFL: /* /\1/il */
7039 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7040 folder = foldEQ_locale;
7041 fold_array = PL_fold_locale;
7042 utf8_fold_flags = FOLDEQ_LOCALE;
7045 case REFFA: /* /\1/iaa */
7046 folder = foldEQ_latin1;
7047 fold_array = PL_fold_latin1;
7048 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7051 case REFFU: /* /\1/iu */
7052 folder = foldEQ_latin1;
7053 fold_array = PL_fold_latin1;
7054 utf8_fold_flags = 0;
7057 case REFF: /* /\1/i */
7059 fold_array = PL_fold;
7060 utf8_fold_flags = 0;
7063 case REF: /* /\1/ */
7066 utf8_fold_flags = 0;
7070 n = ARG(scan); /* which paren pair */
7073 ln = rex->offs[n].start;
7074 endref = rex->offs[n].end;
7075 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7076 if (rex->lastparen < n || ln == -1 || endref == -1)
7077 sayNO; /* Do not match unless seen CLOSEn. */
7081 s = reginfo->strbeg + ln;
7082 if (type != REF /* REF can do byte comparison */
7083 && (utf8_target || type == REFFU || type == REFFL))
7085 char * limit = reginfo->strend;
7087 /* This call case insensitively compares the entire buffer
7088 * at s, with the current input starting at locinput, but
7089 * not going off the end given by reginfo->strend, and
7090 * returns in <limit> upon success, how much of the
7091 * current input was matched */
7092 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7093 locinput, &limit, 0, utf8_target, utf8_fold_flags))
7101 /* Not utf8: Inline the first character, for speed. */
7102 if (!NEXTCHR_IS_EOS &&
7103 UCHARAT(s) != nextchr &&
7105 UCHARAT(s) != fold_array[nextchr]))
7108 if (locinput + ln > reginfo->strend)
7110 if (ln > 1 && (type == REF
7111 ? memNE(s, locinput, ln)
7112 : ! folder(s, locinput, ln)))
7118 case NOTHING: /* null op; e.g. the 'nothing' following
7119 * the '*' in m{(a+|b)*}' */
7121 case TAIL: /* placeholder while compiling (A|B|C) */
7125 #define ST st->u.eval
7126 #define CUR_EVAL cur_eval->u.eval
7132 regexp_internal *rei;
7133 regnode *startpoint;
7136 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
7137 arg= (U32)ARG(scan);
7138 if (cur_eval && cur_eval->locinput == locinput) {
7139 if ( ++nochange_depth > max_nochange_depth )
7141 "Pattern subroutine nesting without pos change"
7142 " exceeded limit in regex");
7149 startpoint = scan + ARG2L(scan);
7150 EVAL_CLOSE_PAREN_SET( st, arg );
7151 /* Detect infinite recursion
7153 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7154 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7155 * So we track the position in the string we are at each time
7156 * we recurse and if we try to enter the same routine twice from
7157 * the same position we throw an error.
7159 if ( rex->recurse_locinput[arg] == locinput ) {
7160 /* FIXME: we should show the regop that is failing as part
7161 * of the error message. */
7162 Perl_croak(aTHX_ "Infinite recursion in regex");
7164 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7165 rex->recurse_locinput[arg]= locinput;
7168 GET_RE_DEBUG_FLAGS_DECL;
7170 Perl_re_exec_indentf( aTHX_
7171 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7172 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7178 /* Save all the positions seen so far. */
7179 ST.cp = regcppush(rex, 0, maxopenparen);
7180 REGCP_SET(ST.lastcp);
7182 /* and then jump to the code we share with EVAL */
7183 goto eval_recurse_doit;
7186 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
7187 if (cur_eval && cur_eval->locinput==locinput) {
7188 if ( ++nochange_depth > max_nochange_depth )
7189 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7194 /* execute the code in the {...} */
7198 OP * const oop = PL_op;
7199 COP * const ocurcop = PL_curcop;
7203 /* save *all* paren positions */
7204 regcppush(rex, 0, maxopenparen);
7205 REGCP_SET(ST.lastcp);
7208 caller_cv = find_runcv(NULL);
7212 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7214 (REGEXP*)(rexi->data->data[n])
7216 nop = (OP*)rexi->data->data[n+1];
7218 else if (rexi->data->what[n] == 'l') { /* literal code */
7220 nop = (OP*)rexi->data->data[n];
7221 assert(CvDEPTH(newcv));
7224 /* literal with own CV */
7225 assert(rexi->data->what[n] == 'L');
7226 newcv = rex->qr_anoncv;
7227 nop = (OP*)rexi->data->data[n];
7230 /* Some notes about MULTICALL and the context and save stacks.
7233 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7234 * since codeblocks don't introduce a new scope (so that
7235 * local() etc accumulate), at the end of a successful
7236 * match there will be a SAVEt_CLEARSV on the savestack
7237 * for each of $x, $y, $z. If the three code blocks above
7238 * happen to have come from different CVs (e.g. via
7239 * embedded qr//s), then we must ensure that during any
7240 * savestack unwinding, PL_comppad always points to the
7241 * right pad at each moment. We achieve this by
7242 * interleaving SAVEt_COMPPAD's on the savestack whenever
7243 * there is a change of pad.
7244 * In theory whenever we call a code block, we should
7245 * push a CXt_SUB context, then pop it on return from
7246 * that code block. This causes a bit of an issue in that
7247 * normally popping a context also clears the savestack
7248 * back to cx->blk_oldsaveix, but here we specifically
7249 * don't want to clear the save stack on exit from the
7251 * Also for efficiency we don't want to keep pushing and
7252 * popping the single SUB context as we backtrack etc.
7253 * So instead, we push a single context the first time
7254 * we need, it, then hang onto it until the end of this
7255 * function. Whenever we encounter a new code block, we
7256 * update the CV etc if that's changed. During the times
7257 * in this function where we're not executing a code
7258 * block, having the SUB context still there is a bit
7259 * naughty - but we hope that no-one notices.
7260 * When the SUB context is initially pushed, we fake up
7261 * cx->blk_oldsaveix to be as if we'd pushed this context
7262 * on first entry to S_regmatch rather than at some random
7263 * point during the regexe execution. That way if we
7264 * croak, popping the context stack will ensure that
7265 * *everything* SAVEd by this function is undone and then
7266 * the context popped, rather than e.g., popping the
7267 * context (and restoring the original PL_comppad) then
7268 * popping more of the savestack and restoring a bad
7272 /* If this is the first EVAL, push a MULTICALL. On
7273 * subsequent calls, if we're executing a different CV, or
7274 * if PL_comppad has got messed up from backtracking
7275 * through SAVECOMPPADs, then refresh the context.
7277 if (newcv != last_pushed_cv || PL_comppad != last_pad)
7279 U8 flags = (CXp_SUB_RE |
7280 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
7282 if (last_pushed_cv) {
7283 CHANGE_MULTICALL_FLAGS(newcv, flags);
7286 PUSH_MULTICALL_FLAGS(newcv, flags);
7288 /* see notes above */
7289 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
7291 last_pushed_cv = newcv;
7294 /* these assignments are just to silence compiler
7296 multicall_cop = NULL;
7298 last_pad = PL_comppad;
7300 /* the initial nextstate you would normally execute
7301 * at the start of an eval (which would cause error
7302 * messages to come from the eval), may be optimised
7303 * away from the execution path in the regex code blocks;
7304 * so manually set PL_curcop to it initially */
7306 OP *o = cUNOPx(nop)->op_first;
7307 assert(o->op_type == OP_NULL);
7308 if (o->op_targ == OP_SCOPE) {
7309 o = cUNOPo->op_first;
7312 assert(o->op_targ == OP_LEAVE);
7313 o = cUNOPo->op_first;
7314 assert(o->op_type == OP_ENTER);
7318 if (o->op_type != OP_STUB) {
7319 assert( o->op_type == OP_NEXTSTATE
7320 || o->op_type == OP_DBSTATE
7321 || (o->op_type == OP_NULL
7322 && ( o->op_targ == OP_NEXTSTATE
7323 || o->op_targ == OP_DBSTATE
7327 PL_curcop = (COP*)o;
7332 DEBUG_STATE_r( Perl_re_printf( aTHX_
7333 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
7335 rex->offs[0].end = locinput - reginfo->strbeg;
7336 if (reginfo->info_aux_eval->pos_magic)
7337 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
7338 reginfo->sv, reginfo->strbeg,
7339 locinput - reginfo->strbeg);
7342 SV *sv_mrk = get_sv("REGMARK", 1);
7343 sv_setsv(sv_mrk, sv_yes_mark);
7346 /* we don't use MULTICALL here as we want to call the
7347 * first op of the block of interest, rather than the
7348 * first op of the sub. Also, we don't want to free
7349 * the savestack frame */
7350 before = (IV)(SP-PL_stack_base);
7352 CALLRUNOPS(aTHX); /* Scalar context. */
7354 if ((IV)(SP-PL_stack_base) == before)
7355 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
7361 /* before restoring everything, evaluate the returned
7362 * value, so that 'uninit' warnings don't use the wrong
7363 * PL_op or pad. Also need to process any magic vars
7364 * (e.g. $1) *before* parentheses are restored */
7369 if (logical == 0) /* (?{})/ */
7370 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
7371 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
7372 sw = cBOOL(SvTRUE_NN(ret));
7375 else { /* /(??{}) */
7376 /* if its overloaded, let the regex compiler handle
7377 * it; otherwise extract regex, or stringify */
7378 if (SvGMAGICAL(ret))
7379 ret = sv_mortalcopy(ret);
7380 if (!SvAMAGIC(ret)) {
7384 if (SvTYPE(sv) == SVt_REGEXP)
7385 re_sv = (REGEXP*) sv;
7386 else if (SvSMAGICAL(ret)) {
7387 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
7389 re_sv = (REGEXP *) mg->mg_obj;
7392 /* force any undef warnings here */
7393 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7394 ret = sv_mortalcopy(ret);
7395 (void) SvPV_force_nolen(ret);
7401 /* *** Note that at this point we don't restore
7402 * PL_comppad, (or pop the CxSUB) on the assumption it may
7403 * be used again soon. This is safe as long as nothing
7404 * in the regexp code uses the pad ! */
7406 PL_curcop = ocurcop;
7407 regcp_restore(rex, ST.lastcp, &maxopenparen);
7408 PL_curpm_under = PL_curpm;
7409 PL_curpm = PL_reg_curpm;
7412 PUSH_STATE_GOTO(EVAL_B, next, locinput);
7417 /* only /(??{})/ from now on */
7420 /* extract RE object from returned value; compiling if
7424 re_sv = reg_temp_copy(NULL, re_sv);
7429 if (SvUTF8(ret) && IN_BYTES) {
7430 /* In use 'bytes': make a copy of the octet
7431 * sequence, but without the flag on */
7433 const char *const p = SvPV(ret, len);
7434 ret = newSVpvn_flags(p, len, SVs_TEMP);
7436 if (rex->intflags & PREGf_USE_RE_EVAL)
7437 pm_flags |= PMf_USE_RE_EVAL;
7439 /* if we got here, it should be an engine which
7440 * supports compiling code blocks and stuff */
7441 assert(rex->engine && rex->engine->op_comp);
7442 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
7443 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
7444 rex->engine, NULL, NULL,
7445 /* copy /msixn etc to inner pattern */
7450 & (SVs_TEMP | SVs_GMG | SVf_ROK))
7451 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
7452 /* This isn't a first class regexp. Instead, it's
7453 caching a regexp onto an existing, Perl visible
7455 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
7461 RXp_MATCH_COPIED_off(re);
7462 re->subbeg = rex->subbeg;
7463 re->sublen = rex->sublen;
7464 re->suboffset = rex->suboffset;
7465 re->subcoffset = rex->subcoffset;
7467 re->lastcloseparen = 0;
7470 debug_start_match(re_sv, utf8_target, locinput,
7471 reginfo->strend, "EVAL/GOSUB: Matching embedded");
7473 startpoint = rei->program + 1;
7474 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7475 * close_paren only for GOSUB */
7476 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
7477 /* Save all the seen positions so far. */
7478 ST.cp = regcppush(rex, 0, maxopenparen);
7479 REGCP_SET(ST.lastcp);
7480 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7482 /* run the pattern returned from (??{...}) */
7484 eval_recurse_doit: /* Share code with GOSUB below this line
7485 * At this point we expect the stack context to be
7486 * set up correctly */
7488 /* invalidate the S-L poscache. We're now executing a
7489 * different set of WHILEM ops (and their associated
7490 * indexes) against the same string, so the bits in the
7491 * cache are meaningless. Setting maxiter to zero forces
7492 * the cache to be invalidated and zeroed before reuse.
7493 * XXX This is too dramatic a measure. Ideally we should
7494 * save the old cache and restore when running the outer
7496 reginfo->poscache_maxiter = 0;
7498 /* the new regexp might have a different is_utf8_pat than we do */
7499 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
7501 ST.prev_rex = rex_sv;
7502 ST.prev_curlyx = cur_curlyx;
7504 SET_reg_curpm(rex_sv);
7509 ST.prev_eval = cur_eval;
7511 /* now continue from first node in postoned RE */
7512 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
7513 NOT_REACHED; /* NOTREACHED */
7516 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
7517 /* note: this is called twice; first after popping B, then A */
7519 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
7520 depth, cur_eval, ST.prev_eval);
7523 #define SET_RECURSE_LOCINPUT(STR,VAL)\
7524 if ( cur_eval && CUR_EVAL.close_paren ) {\
7526 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
7528 CUR_EVAL.close_paren - 1,\
7532 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7535 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
7537 rex_sv = ST.prev_rex;
7538 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7539 SET_reg_curpm(rex_sv);
7540 rex = ReANY(rex_sv);
7541 rexi = RXi_GET(rex);
7543 /* preserve $^R across LEAVE's. See Bug 121070. */
7544 SV *save_sv= GvSV(PL_replgv);
7545 SvREFCNT_inc(save_sv);
7546 regcpblow(ST.cp); /* LEAVE in disguise */
7547 sv_setsv(GvSV(PL_replgv), save_sv);
7548 SvREFCNT_dec(save_sv);
7550 cur_eval = ST.prev_eval;
7551 cur_curlyx = ST.prev_curlyx;
7553 /* Invalidate cache. See "invalidate" comment above. */
7554 reginfo->poscache_maxiter = 0;
7555 if ( nochange_depth )
7558 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
7562 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7563 REGCP_UNWIND(ST.lastcp);
7566 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
7567 /* note: this is called twice; first after popping B, then A */
7569 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
7570 depth, cur_eval, ST.prev_eval);
7573 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
7575 rex_sv = ST.prev_rex;
7576 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7577 SET_reg_curpm(rex_sv);
7578 rex = ReANY(rex_sv);
7579 rexi = RXi_GET(rex);
7581 REGCP_UNWIND(ST.lastcp);
7582 regcppop(rex, &maxopenparen);
7583 cur_eval = ST.prev_eval;
7584 cur_curlyx = ST.prev_curlyx;
7586 /* Invalidate cache. See "invalidate" comment above. */
7587 reginfo->poscache_maxiter = 0;
7588 if ( nochange_depth )
7591 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7596 n = ARG(scan); /* which paren pair */
7597 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
7598 if (n > maxopenparen)
7600 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
7601 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
7606 (IV)rex->offs[n].start_tmp,
7612 case SROPEN: /* (*SCRIPT_RUN: */
7613 script_run_begin = (U8 *) locinput;
7618 n = ARG(scan); /* which paren pair */
7619 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7620 locinput - reginfo->strbeg);
7621 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
7626 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
7628 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
7636 case ACCEPT: /* (*ACCEPT) */
7638 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7642 cursor && OP(cursor)!=END;
7643 cursor=regnext(cursor))
7645 if ( OP(cursor)==CLOSE ){
7647 if ( n <= lastopen ) {
7648 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
7649 locinput - reginfo->strbeg);
7650 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
7659 case GROUPP: /* (?(1)) */
7660 n = ARG(scan); /* which paren pair */
7661 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
7664 case NGROUPP: /* (?(<name>)) */
7665 /* reg_check_named_buff_matched returns 0 for no match */
7666 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
7669 case INSUBP: /* (?(R)) */
7671 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7672 * of SCAN is already set up as matches a eval.close_paren */
7673 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
7676 case DEFINEP: /* (?(DEFINE)) */
7680 case IFTHEN: /* (?(cond)A|B) */
7681 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7683 next = NEXTOPER(NEXTOPER(scan));
7685 next = scan + ARG(scan);
7686 if (OP(next) == IFTHEN) /* Fake one. */
7687 next = NEXTOPER(NEXTOPER(next));
7691 case LOGICAL: /* modifier for EVAL and IFMATCH */
7692 logical = scan->flags;
7695 /*******************************************************************
7697 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7698 pattern, where A and B are subpatterns. (For simple A, CURLYM or
7699 STAR/PLUS/CURLY/CURLYN are used instead.)
7701 A*B is compiled as <CURLYX><A><WHILEM><B>
7703 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7704 state, which contains the current count, initialised to -1. It also sets
7705 cur_curlyx to point to this state, with any previous value saved in the
7708 CURLYX then jumps straight to the WHILEM op, rather than executing A,
7709 since the pattern may possibly match zero times (i.e. it's a while {} loop
7710 rather than a do {} while loop).
7712 Each entry to WHILEM represents a successful match of A. The count in the
7713 CURLYX block is incremented, another WHILEM state is pushed, and execution
7714 passes to A or B depending on greediness and the current count.
7716 For example, if matching against the string a1a2a3b (where the aN are
7717 substrings that match /A/), then the match progresses as follows: (the
7718 pushed states are interspersed with the bits of strings matched so far):
7721 <CURLYX cnt=0><WHILEM>
7722 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7723 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7724 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7725 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
7727 (Contrast this with something like CURLYM, which maintains only a single
7731 a1 <CURLYM cnt=1> a2
7732 a1 a2 <CURLYM cnt=2> a3
7733 a1 a2 a3 <CURLYM cnt=3> b
7736 Each WHILEM state block marks a point to backtrack to upon partial failure
7737 of A or B, and also contains some minor state data related to that
7738 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
7739 overall state, such as the count, and pointers to the A and B ops.
7741 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7742 must always point to the *current* CURLYX block, the rules are:
7744 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7745 and set cur_curlyx to point the new block.
7747 When popping the CURLYX block after a successful or unsuccessful match,
7748 restore the previous cur_curlyx.
7750 When WHILEM is about to execute B, save the current cur_curlyx, and set it
7751 to the outer one saved in the CURLYX block.
7753 When popping the WHILEM block after a successful or unsuccessful B match,
7754 restore the previous cur_curlyx.
7756 Here's an example for the pattern (AI* BI)*BO
7757 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
7760 curlyx backtrack stack
7761 ------ ---------------
7763 CO <CO prev=NULL> <WO>
7764 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7765 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7766 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
7768 At this point the pattern succeeds, and we work back down the stack to
7769 clean up, restoring as we go:
7771 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7772 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7773 CO <CO prev=NULL> <WO>
7776 *******************************************************************/
7778 #define ST st->u.curlyx
7780 case CURLYX: /* start of /A*B/ (for complex A) */
7782 /* No need to save/restore up to this paren */
7783 I32 parenfloor = scan->flags;
7785 assert(next); /* keep Coverity happy */
7786 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7789 /* XXXX Probably it is better to teach regpush to support
7790 parenfloor > maxopenparen ... */
7791 if (parenfloor > (I32)rex->lastparen)
7792 parenfloor = rex->lastparen; /* Pessimization... */
7794 ST.prev_curlyx= cur_curlyx;
7796 ST.cp = PL_savestack_ix;
7798 /* these fields contain the state of the current curly.
7799 * they are accessed by subsequent WHILEMs */
7800 ST.parenfloor = parenfloor;
7805 ST.count = -1; /* this will be updated by WHILEM */
7806 ST.lastloc = NULL; /* this will be updated by WHILEM */
7808 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
7809 NOT_REACHED; /* NOTREACHED */
7812 case CURLYX_end: /* just finished matching all of A*B */
7813 cur_curlyx = ST.prev_curlyx;
7815 NOT_REACHED; /* NOTREACHED */
7817 case CURLYX_end_fail: /* just failed to match all of A*B */
7819 cur_curlyx = ST.prev_curlyx;
7821 NOT_REACHED; /* NOTREACHED */
7825 #define ST st->u.whilem
7827 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
7829 /* see the discussion above about CURLYX/WHILEM */
7834 assert(cur_curlyx); /* keep Coverity happy */
7836 min = ARG1(cur_curlyx->u.curlyx.me);
7837 max = ARG2(cur_curlyx->u.curlyx.me);
7838 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
7839 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7840 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7841 ST.cache_offset = 0;
7845 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
7846 depth, (long)n, min, max)
7849 /* First just match a string of min A's. */
7852 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
7853 cur_curlyx->u.curlyx.lastloc = locinput;
7854 REGCP_SET(ST.lastcp);
7856 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
7857 NOT_REACHED; /* NOTREACHED */
7860 /* If degenerate A matches "", assume A done. */
7862 if (locinput == cur_curlyx->u.curlyx.lastloc) {
7863 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
7866 goto do_whilem_B_max;
7869 /* super-linear cache processing.
7871 * The idea here is that for certain types of CURLYX/WHILEM -
7872 * principally those whose upper bound is infinity (and
7873 * excluding regexes that have things like \1 and other very
7874 * non-regular expresssiony things), then if a pattern like
7875 * /....A*.../ fails and we backtrack to the WHILEM, then we
7876 * make a note that this particular WHILEM op was at string
7877 * position 47 (say) when the rest of pattern failed. Then, if
7878 * we ever find ourselves back at that WHILEM, and at string
7879 * position 47 again, we can just fail immediately rather than
7880 * running the rest of the pattern again.
7882 * This is very handy when patterns start to go
7883 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7884 * with a combinatorial explosion of backtracking.
7886 * The cache is implemented as a bit array, with one bit per
7887 * string byte position per WHILEM op (up to 16) - so its
7888 * between 0.25 and 2x the string size.
7890 * To avoid allocating a poscache buffer every time, we do an
7891 * initially countdown; only after we have executed a WHILEM
7892 * op (string-length x #WHILEMs) times do we allocate the
7895 * The top 4 bits of scan->flags byte say how many different
7896 * relevant CURLLYX/WHILEM op pairs there are, while the
7897 * bottom 4-bits is the identifying index number of this
7903 if (!reginfo->poscache_maxiter) {
7904 /* start the countdown: Postpone detection until we
7905 * know the match is not *that* much linear. */
7906 reginfo->poscache_maxiter
7907 = (reginfo->strend - reginfo->strbeg + 1)
7909 /* possible overflow for long strings and many CURLYX's */
7910 if (reginfo->poscache_maxiter < 0)
7911 reginfo->poscache_maxiter = I32_MAX;
7912 reginfo->poscache_iter = reginfo->poscache_maxiter;
7915 if (reginfo->poscache_iter-- == 0) {
7916 /* initialise cache */
7917 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
7918 regmatch_info_aux *const aux = reginfo->info_aux;
7919 if (aux->poscache) {
7920 if ((SSize_t)reginfo->poscache_size < size) {
7921 Renew(aux->poscache, size, char);
7922 reginfo->poscache_size = size;
7924 Zero(aux->poscache, size, char);
7927 reginfo->poscache_size = size;
7928 Newxz(aux->poscache, size, char);
7930 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
7931 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
7932 PL_colors[4], PL_colors[5])
7936 if (reginfo->poscache_iter < 0) {
7937 /* have we already failed at this position? */
7938 SSize_t offset, mask;
7940 reginfo->poscache_iter = -1; /* stop eventual underflow */
7941 offset = (scan->flags & 0xf) - 1
7942 + (locinput - reginfo->strbeg)
7944 mask = 1 << (offset % 8);
7946 if (reginfo->info_aux->poscache[offset] & mask) {
7947 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
7950 cur_curlyx->u.curlyx.count--;
7951 sayNO; /* cache records failure */
7953 ST.cache_offset = offset;
7954 ST.cache_mask = mask;
7958 /* Prefer B over A for minimal matching. */
7960 if (cur_curlyx->u.curlyx.minmod) {
7961 ST.save_curlyx = cur_curlyx;
7962 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
7963 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7965 NOT_REACHED; /* NOTREACHED */
7968 /* Prefer A over B for maximal matching. */
7970 if (n < max) { /* More greed allowed? */
7971 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7973 cur_curlyx->u.curlyx.lastloc = locinput;
7974 REGCP_SET(ST.lastcp);
7975 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
7976 NOT_REACHED; /* NOTREACHED */
7978 goto do_whilem_B_max;
7980 NOT_REACHED; /* NOTREACHED */
7982 case WHILEM_B_min: /* just matched B in a minimal match */
7983 case WHILEM_B_max: /* just matched B in a maximal match */
7984 cur_curlyx = ST.save_curlyx;
7986 NOT_REACHED; /* NOTREACHED */
7988 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7989 cur_curlyx = ST.save_curlyx;
7990 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7991 cur_curlyx->u.curlyx.count--;
7993 NOT_REACHED; /* NOTREACHED */
7995 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
7997 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
7998 REGCP_UNWIND(ST.lastcp);
7999 regcppop(rex, &maxopenparen);
8000 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8001 cur_curlyx->u.curlyx.count--;
8003 NOT_REACHED; /* NOTREACHED */
8005 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8006 REGCP_UNWIND(ST.lastcp);
8007 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8008 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
8012 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8013 && ckWARN(WARN_REGEXP)
8014 && !reginfo->warned)
8016 reginfo->warned = TRUE;
8017 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8018 "Complex regular subexpression recursion limit (%d) "
8024 ST.save_curlyx = cur_curlyx;
8025 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8026 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8028 NOT_REACHED; /* NOTREACHED */
8030 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8031 cur_curlyx = ST.save_curlyx;
8033 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8034 /* Maximum greed exceeded */
8035 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8036 && ckWARN(WARN_REGEXP)
8037 && !reginfo->warned)
8039 reginfo->warned = TRUE;
8040 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8041 "Complex regular subexpression recursion "
8042 "limit (%d) exceeded",
8045 cur_curlyx->u.curlyx.count--;
8049 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
8051 /* Try grabbing another A and see if it helps. */
8052 cur_curlyx->u.curlyx.lastloc = locinput;
8053 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8055 REGCP_SET(ST.lastcp);
8056 PUSH_STATE_GOTO(WHILEM_A_min,
8057 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8059 NOT_REACHED; /* NOTREACHED */
8062 #define ST st->u.branch
8064 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
8065 next = scan + ARG(scan);
8068 scan = NEXTOPER(scan);
8071 case BRANCH: /* /(...|A|...)/ */
8072 scan = NEXTOPER(scan); /* scan now points to inner node */
8073 ST.lastparen = rex->lastparen;
8074 ST.lastcloseparen = rex->lastcloseparen;
8075 ST.next_branch = next;
8078 /* Now go into the branch */
8080 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
8082 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
8084 NOT_REACHED; /* NOTREACHED */
8086 case CUTGROUP: /* /(*THEN)/ */
8087 sv_yes_mark = st->u.mark.mark_name = scan->flags
8088 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8090 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
8091 NOT_REACHED; /* NOTREACHED */
8093 case CUTGROUP_next_fail:
8096 if (st->u.mark.mark_name)
8097 sv_commit = st->u.mark.mark_name;
8099 NOT_REACHED; /* NOTREACHED */
8103 NOT_REACHED; /* NOTREACHED */
8105 case BRANCH_next_fail: /* that branch failed; try the next, if any */
8110 REGCP_UNWIND(ST.cp);
8111 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8112 scan = ST.next_branch;
8113 /* no more branches? */
8114 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8116 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
8123 continue; /* execute next BRANCH[J] op */
8126 case MINMOD: /* next op will be non-greedy, e.g. A*? */
8131 #define ST st->u.curlym
8133 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
8135 /* This is an optimisation of CURLYX that enables us to push
8136 * only a single backtracking state, no matter how many matches
8137 * there are in {m,n}. It relies on the pattern being constant
8138 * length, with no parens to influence future backrefs
8142 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8144 ST.lastparen = rex->lastparen;
8145 ST.lastcloseparen = rex->lastcloseparen;
8147 /* if paren positive, emulate an OPEN/CLOSE around A */
8149 U32 paren = ST.me->flags;
8150 if (paren > maxopenparen)
8151 maxopenparen = paren;
8152 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8160 ST.c1 = CHRTEST_UNINIT;
8163 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8166 curlym_do_A: /* execute the A in /A{m,n}B/ */
8167 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
8168 NOT_REACHED; /* NOTREACHED */
8170 case CURLYM_A: /* we've just matched an A */
8172 /* after first match, determine A's length: u.curlym.alen */
8173 if (ST.count == 1) {
8174 if (reginfo->is_utf8_target) {
8175 char *s = st->locinput;
8176 while (s < locinput) {
8182 ST.alen = locinput - st->locinput;
8185 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8188 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8189 depth, (IV) ST.count, (IV)ST.alen)
8192 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8196 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8197 if ( max == REG_INFTY || ST.count < max )
8198 goto curlym_do_A; /* try to match another A */
8200 goto curlym_do_B; /* try to match B */
8202 case CURLYM_A_fail: /* just failed to match an A */
8203 REGCP_UNWIND(ST.cp);
8206 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8207 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8210 curlym_do_B: /* execute the B in /A{m,n}B/ */
8211 if (ST.c1 == CHRTEST_UNINIT) {
8212 /* calculate c1 and c2 for possible match of 1st char
8213 * following curly */
8214 ST.c1 = ST.c2 = CHRTEST_VOID;
8216 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8217 regnode *text_node = ST.B;
8218 if (! HAS_TEXT(text_node))
8219 FIND_NEXT_IMPT(text_node);
8222 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
8224 But the former is redundant in light of the latter.
8226 if this changes back then the macro for
8227 IS_TEXT and friends need to change.
8229 if (PL_regkind[OP(text_node)] == EXACT) {
8230 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8231 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8241 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
8242 depth, (IV)ST.count)
8244 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
8245 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
8246 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8247 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8249 /* simulate B failing */
8251 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
8253 valid_utf8_to_uvchr((U8 *) locinput, NULL),
8254 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
8255 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
8257 state_num = CURLYM_B_fail;
8258 goto reenter_switch;
8261 else if (nextchr != ST.c1 && nextchr != ST.c2) {
8262 /* simulate B failing */
8264 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
8266 (int) nextchr, ST.c1, ST.c2)
8268 state_num = CURLYM_B_fail;
8269 goto reenter_switch;
8274 /* emulate CLOSE: mark current A as captured */
8275 U32 paren = (U32)ST.me->flags;
8277 CLOSE_CAPTURE(paren,
8278 HOPc(locinput, -ST.alen) - reginfo->strbeg,
8279 locinput - reginfo->strbeg);
8282 rex->offs[paren].end = -1;
8284 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8293 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
8294 NOT_REACHED; /* NOTREACHED */
8296 case CURLYM_B_fail: /* just failed to match a B */
8297 REGCP_UNWIND(ST.cp);
8298 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8300 I32 max = ARG2(ST.me);
8301 if (max != REG_INFTY && ST.count == max)
8303 goto curlym_do_A; /* try to match a further A */
8305 /* backtrack one A */
8306 if (ST.count == ARG1(ST.me) /* min */)
8309 SET_locinput(HOPc(locinput, -ST.alen));
8310 goto curlym_do_B; /* try to match B */
8313 #define ST st->u.curly
8315 #define CURLY_SETPAREN(paren, success) \
8318 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
8319 locinput - reginfo->strbeg); \
8322 rex->offs[paren].end = -1; \
8323 rex->lastparen = ST.lastparen; \
8324 rex->lastcloseparen = ST.lastcloseparen; \
8328 case STAR: /* /A*B/ where A is width 1 char */
8332 scan = NEXTOPER(scan);
8335 case PLUS: /* /A+B/ where A is width 1 char */
8339 scan = NEXTOPER(scan);
8342 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
8343 ST.paren = scan->flags; /* Which paren to set */
8344 ST.lastparen = rex->lastparen;
8345 ST.lastcloseparen = rex->lastcloseparen;
8346 if (ST.paren > maxopenparen)
8347 maxopenparen = ST.paren;
8348 ST.min = ARG1(scan); /* min to match */
8349 ST.max = ARG2(scan); /* max to match */
8350 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
8352 /* handle the single-char capture called as a GOSUB etc */
8353 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
8355 char *li = locinput;
8356 if (!regrepeat(rex, &li, scan, reginfo, 1))
8364 case CURLY: /* /A{m,n}B/ where A is width 1 char */
8366 ST.min = ARG1(scan); /* min to match */
8367 ST.max = ARG2(scan); /* max to match */
8368 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8371 * Lookahead to avoid useless match attempts
8372 * when we know what character comes next.
8374 * Used to only do .*x and .*?x, but now it allows
8375 * for )'s, ('s and (?{ ... })'s to be in the way
8376 * of the quantifier and the EXACT-like node. -- japhy
8379 assert(ST.min <= ST.max);
8380 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
8381 ST.c1 = ST.c2 = CHRTEST_VOID;
8384 regnode *text_node = next;
8386 if (! HAS_TEXT(text_node))
8387 FIND_NEXT_IMPT(text_node);
8389 if (! HAS_TEXT(text_node))
8390 ST.c1 = ST.c2 = CHRTEST_VOID;
8392 if ( PL_regkind[OP(text_node)] != EXACT ) {
8393 ST.c1 = ST.c2 = CHRTEST_VOID;
8397 /* Currently we only get here when
8399 PL_rekind[OP(text_node)] == EXACT
8401 if this changes back then the macro for IS_TEXT and
8402 friends need to change. */
8403 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
8404 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
8416 char *li = locinput;
8419 regrepeat(rex, &li, ST.A, reginfo, ST.min)
8425 if (ST.c1 == CHRTEST_VOID)
8426 goto curly_try_B_min;
8428 ST.oldloc = locinput;
8430 /* set ST.maxpos to the furthest point along the
8431 * string that could possibly match */
8432 if (ST.max == REG_INFTY) {
8433 ST.maxpos = reginfo->strend - 1;
8435 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8438 else if (utf8_target) {
8439 int m = ST.max - ST.min;
8440 for (ST.maxpos = locinput;
8441 m >0 && ST.maxpos < reginfo->strend; m--)
8442 ST.maxpos += UTF8SKIP(ST.maxpos);
8445 ST.maxpos = locinput + ST.max - ST.min;
8446 if (ST.maxpos >= reginfo->strend)
8447 ST.maxpos = reginfo->strend - 1;
8449 goto curly_try_B_min_known;
8453 /* avoid taking address of locinput, so it can remain
8455 char *li = locinput;
8456 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
8457 if (ST.count < ST.min)
8460 if ((ST.count > ST.min)
8461 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8463 /* A{m,n} must come at the end of the string, there's
8464 * no point in backing off ... */
8466 /* ...except that $ and \Z can match before *and* after
8467 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
8468 We may back off by one in this case. */
8469 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
8473 goto curly_try_B_max;
8475 NOT_REACHED; /* NOTREACHED */
8477 case CURLY_B_min_fail:
8478 /* failed to find B in a non-greedy match.
8479 * Handles both cases where c1,c2 valid or not */
8481 REGCP_UNWIND(ST.cp);
8483 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8486 if (ST.c1 == CHRTEST_VOID) {
8487 /* failed -- move forward one */
8488 char *li = locinput;
8489 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
8494 if (!( ST.count <= ST.max
8495 /* count overflow ? */
8496 || (ST.max == REG_INFTY && ST.count > 0))
8502 /* Couldn't or didn't -- move forward. */
8503 ST.oldloc = locinput;
8505 locinput += UTF8SKIP(locinput);
8510 curly_try_B_min_known:
8511 /* find the next place where 'B' could work, then call B */
8513 n = (ST.oldloc == locinput) ? 0 : 1;
8514 if (ST.c1 == ST.c2) {
8515 /* set n to utf8_distance(oldloc, locinput) */
8516 while (locinput <= ST.maxpos
8517 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8519 locinput += UTF8SKIP(locinput);
8524 /* set n to utf8_distance(oldloc, locinput) */
8525 while (locinput <= ST.maxpos
8526 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8527 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8529 locinput += UTF8SKIP(locinput);
8534 else { /* Not utf8_target */
8535 if (ST.c1 == ST.c2) {
8536 locinput = (char *) memchr(locinput,
8538 ST.maxpos + 1 - locinput);
8540 locinput = ST.maxpos + 1;
8544 U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
8546 if (! isPOWER_OF_2(c1_c2_bits_differing)) {
8547 while ( locinput <= ST.maxpos
8548 && UCHARAT(locinput) != ST.c1
8549 && UCHARAT(locinput) != ST.c2)
8555 /* If c1 and c2 only differ by a single bit, we can
8556 * avoid a conditional each time through the loop,
8557 * at the expense of a little preliminary setup and
8558 * an extra mask each iteration. By masking out
8559 * that bit, we match exactly two characters, c1
8560 * and c2, and so we don't have to test for both.
8561 * On both ASCII and EBCDIC platforms, most of the
8562 * ASCII-range and Latin1-range folded equivalents
8563 * differ only in a single bit, so this is actually
8564 * the most common case. (e.g. 'A' 0x41 vs 'a'
8566 U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
8567 U8 c1_c2_mask = ~ c1_c2_bits_differing;
8568 while ( locinput <= ST.maxpos
8569 && (UCHARAT(locinput) & c1_c2_mask)
8576 n = locinput - ST.oldloc;
8578 if (locinput > ST.maxpos)
8581 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8582 * at b; check that everything between oldloc and
8583 * locinput matches */
8584 char *li = ST.oldloc;
8586 if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
8588 assert(n == REG_INFTY || locinput == li);
8593 CURLY_SETPAREN(ST.paren, ST.count);
8594 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
8595 NOT_REACHED; /* NOTREACHED */
8599 /* a successful greedy match: now try to match B */
8601 bool could_match = locinput < reginfo->strend;
8603 /* If it could work, try it. */
8604 if (ST.c1 != CHRTEST_VOID && could_match) {
8605 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8607 could_match = memEQ(locinput,
8612 UTF8SKIP(locinput));
8615 could_match = UCHARAT(locinput) == ST.c1
8616 || UCHARAT(locinput) == ST.c2;
8619 if (ST.c1 == CHRTEST_VOID || could_match) {
8620 CURLY_SETPAREN(ST.paren, ST.count);
8621 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
8622 NOT_REACHED; /* NOTREACHED */
8627 case CURLY_B_max_fail:
8628 /* failed to find B in a greedy match */
8630 REGCP_UNWIND(ST.cp);
8632 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8635 if (--ST.count < ST.min)
8637 locinput = HOPc(locinput, -1);
8638 goto curly_try_B_max;
8642 case END: /* last op of main pattern */
8645 /* we've just finished A in /(??{A})B/; now continue with B */
8646 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
8647 st->u.eval.prev_rex = rex_sv; /* inner */
8649 /* Save *all* the positions. */
8650 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
8651 rex_sv = CUR_EVAL.prev_rex;
8652 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8653 SET_reg_curpm(rex_sv);
8654 rex = ReANY(rex_sv);
8655 rexi = RXi_GET(rex);
8657 st->u.eval.prev_curlyx = cur_curlyx;
8658 cur_curlyx = CUR_EVAL.prev_curlyx;
8660 REGCP_SET(st->u.eval.lastcp);
8662 /* Restore parens of the outer rex without popping the
8664 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
8666 st->u.eval.prev_eval = cur_eval;
8667 cur_eval = CUR_EVAL.prev_eval;
8669 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
8671 if ( nochange_depth )
8674 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8676 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
8677 locinput); /* match B */
8680 if (locinput < reginfo->till) {
8681 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
8682 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
8684 (long)(locinput - startpos),
8685 (long)(reginfo->till - startpos),
8688 sayNO_SILENT; /* Cannot match: too short. */
8690 sayYES; /* Success! */
8692 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8694 Perl_re_exec_indentf( aTHX_ "%sSUCCEED: subpattern success...%s\n",
8695 depth, PL_colors[4], PL_colors[5]));
8696 sayYES; /* Success! */
8699 #define ST st->u.ifmatch
8704 case SUSPEND: /* (?>A) */
8706 newstart = locinput;
8709 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
8711 goto ifmatch_trivial_fail_test;
8713 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
8715 ifmatch_trivial_fail_test:
8717 char * const s = HOPBACKc(locinput, scan->flags);
8722 sw = 1 - cBOOL(ST.wanted);
8726 next = scan + ARG(scan);
8734 newstart = locinput;
8738 ST.logical = logical;
8739 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8741 /* execute body of (?...A) */
8742 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
8743 NOT_REACHED; /* NOTREACHED */
8746 case IFMATCH_A_fail: /* body of (?...A) failed */
8747 ST.wanted = !ST.wanted;
8750 case IFMATCH_A: /* body of (?...A) succeeded */
8752 sw = cBOOL(ST.wanted);
8754 else if (!ST.wanted)
8757 if (OP(ST.me) != SUSPEND) {
8758 /* restore old position except for (?>...) */
8759 locinput = st->locinput;
8761 scan = ST.me + ARG(ST.me);
8764 continue; /* execute B */
8768 case LONGJMP: /* alternative with many branches compiles to
8769 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
8770 next = scan + ARG(scan);
8775 case COMMIT: /* (*COMMIT) */
8776 reginfo->cutpoint = reginfo->strend;
8779 case PRUNE: /* (*PRUNE) */
8781 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8782 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
8783 NOT_REACHED; /* NOTREACHED */
8785 case COMMIT_next_fail:
8789 NOT_REACHED; /* NOTREACHED */
8791 case OPFAIL: /* (*FAIL) */
8793 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8795 /* deal with (?(?!)X|Y) properly,
8796 * make sure we trigger the no branch
8797 * of the trailing IFTHEN structure*/
8803 NOT_REACHED; /* NOTREACHED */
8805 #define ST st->u.mark
8806 case MARKPOINT: /* (*MARK:foo) */
8807 ST.prev_mark = mark_state;
8808 ST.mark_name = sv_commit = sv_yes_mark
8809 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8811 ST.mark_loc = locinput;
8812 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
8813 NOT_REACHED; /* NOTREACHED */
8815 case MARKPOINT_next:
8816 mark_state = ST.prev_mark;
8818 NOT_REACHED; /* NOTREACHED */
8820 case MARKPOINT_next_fail:
8821 if (popmark && sv_eq(ST.mark_name,popmark))
8823 if (ST.mark_loc > startpoint)
8824 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8825 popmark = NULL; /* we found our mark */
8826 sv_commit = ST.mark_name;
8829 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
8831 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
8834 mark_state = ST.prev_mark;
8835 sv_yes_mark = mark_state ?
8836 mark_state->u.mark.mark_name : NULL;
8838 NOT_REACHED; /* NOTREACHED */
8840 case SKIP: /* (*SKIP) */
8842 /* (*SKIP) : if we fail we cut here*/
8843 ST.mark_name = NULL;
8844 ST.mark_loc = locinput;
8845 PUSH_STATE_GOTO(SKIP_next,next, locinput);
8847 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
8848 otherwise do nothing. Meaning we need to scan
8850 regmatch_state *cur = mark_state;
8851 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8854 if ( sv_eq( cur->u.mark.mark_name,
8857 ST.mark_name = find;
8858 PUSH_STATE_GOTO( SKIP_next, next, locinput);
8860 cur = cur->u.mark.prev_mark;
8863 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
8866 case SKIP_next_fail:
8868 /* (*CUT:NAME) - Set up to search for the name as we
8869 collapse the stack*/
8870 popmark = ST.mark_name;
8872 /* (*CUT) - No name, we cut here.*/
8873 if (ST.mark_loc > startpoint)
8874 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8875 /* but we set sv_commit to latest mark_name if there
8876 is one so they can test to see how things lead to this
8879 sv_commit=mark_state->u.mark.mark_name;
8883 NOT_REACHED; /* NOTREACHED */
8886 case LNBREAK: /* \R */
8887 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
8894 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
8895 PTR2UV(scan), OP(scan));
8896 Perl_croak(aTHX_ "regexp memory corruption");
8898 /* this is a point to jump to in order to increment
8899 * locinput by one character */
8901 assert(!NEXTCHR_IS_EOS);
8903 locinput += PL_utf8skip[nextchr];
8904 /* locinput is allowed to go 1 char off the end (signifying
8905 * EOS), but not 2+ */
8906 if (locinput > reginfo->strend)
8915 /* switch break jumps here */
8916 scan = next; /* prepare to execute the next op and ... */
8917 continue; /* ... jump back to the top, reusing st */
8921 /* push a state that backtracks on success */
8922 st->u.yes.prev_yes_state = yes_state;
8926 /* push a new regex state, then continue at scan */
8928 regmatch_state *newst;
8931 regmatch_state *cur = st;
8932 regmatch_state *curyes = yes_state;
8934 regmatch_slab *slab = PL_regmatch_slab;
8935 for (i = 0; i < 3 && i <= depth; cur--,i++) {
8936 if (cur < SLAB_FIRST(slab)) {
8938 cur = SLAB_LAST(slab);
8940 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
8943 depth - i, PL_reg_name[cur->resume_state],
8944 (curyes == cur) ? "yes" : ""
8947 curyes = cur->u.yes.prev_yes_state;
8950 DEBUG_STATE_pp("push")
8953 st->locinput = locinput;
8955 if (newst > SLAB_LAST(PL_regmatch_slab))
8956 newst = S_push_slab(aTHX);
8957 PL_regmatch_state = newst;
8959 locinput = pushinput;
8965 #ifdef SOLARIS_BAD_OPTIMIZER
8966 # undef PL_charclass
8970 * We get here only if there's trouble -- normally "case END" is
8971 * the terminating point.
8973 Perl_croak(aTHX_ "corrupted regexp pointers");
8974 NOT_REACHED; /* NOTREACHED */
8978 /* we have successfully completed a subexpression, but we must now
8979 * pop to the state marked by yes_state and continue from there */
8980 assert(st != yes_state);
8982 while (st != yes_state) {
8984 if (st < SLAB_FIRST(PL_regmatch_slab)) {
8985 PL_regmatch_slab = PL_regmatch_slab->prev;
8986 st = SLAB_LAST(PL_regmatch_slab);
8990 DEBUG_STATE_pp("pop (no final)");
8992 DEBUG_STATE_pp("pop (yes)");
8998 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8999 || yes_state > SLAB_LAST(PL_regmatch_slab))
9001 /* not in this slab, pop slab */
9002 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9003 PL_regmatch_slab = PL_regmatch_slab->prev;
9004 st = SLAB_LAST(PL_regmatch_slab);
9006 depth -= (st - yes_state);
9009 yes_state = st->u.yes.prev_yes_state;
9010 PL_regmatch_state = st;
9013 locinput= st->locinput;
9014 state_num = st->resume_state + no_final;
9015 goto reenter_switch;
9018 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
9019 PL_colors[4], PL_colors[5]));
9021 if (reginfo->info_aux_eval) {
9022 /* each successfully executed (?{...}) block does the equivalent of
9023 * local $^R = do {...}
9024 * When popping the save stack, all these locals would be undone;
9025 * bypass this by setting the outermost saved $^R to the latest
9027 /* I dont know if this is needed or works properly now.
9028 * see code related to PL_replgv elsewhere in this file.
9031 if (oreplsv != GvSV(PL_replgv))
9032 sv_setsv(oreplsv, GvSV(PL_replgv));
9039 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
9041 PL_colors[4], PL_colors[5])
9053 /* there's a previous state to backtrack to */
9055 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9056 PL_regmatch_slab = PL_regmatch_slab->prev;
9057 st = SLAB_LAST(PL_regmatch_slab);
9059 PL_regmatch_state = st;
9060 locinput= st->locinput;
9062 DEBUG_STATE_pp("pop");
9064 if (yes_state == st)
9065 yes_state = st->u.yes.prev_yes_state;
9067 state_num = st->resume_state + 1; /* failure = success + 1 */
9069 goto reenter_switch;
9074 if (rex->intflags & PREGf_VERBARG_SEEN) {
9075 SV *sv_err = get_sv("REGERROR", 1);
9076 SV *sv_mrk = get_sv("REGMARK", 1);
9078 sv_commit = &PL_sv_no;
9080 sv_yes_mark = &PL_sv_yes;
9083 sv_commit = &PL_sv_yes;
9084 sv_yes_mark = &PL_sv_no;
9088 sv_setsv(sv_err, sv_commit);
9089 sv_setsv(sv_mrk, sv_yes_mark);
9093 if (last_pushed_cv) {
9095 /* see "Some notes about MULTICALL" above */
9097 PERL_UNUSED_VAR(SP);
9100 LEAVE_SCOPE(orig_savestack_ix);
9102 assert(!result || locinput - reginfo->strbeg >= 0);
9103 return result ? locinput - reginfo->strbeg : -1;
9107 - regrepeat - repeatedly match something simple, report how many
9109 * What 'simple' means is a node which can be the operand of a quantifier like
9112 * startposp - pointer a pointer to the start position. This is updated
9113 * to point to the byte following the highest successful
9115 * p - the regnode to be repeatedly matched against.
9116 * reginfo - struct holding match state, such as strend
9117 * max - maximum number of things to match.
9118 * depth - (for debugging) backtracking depth.
9121 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9122 regmatch_info *const reginfo, I32 max _pDEPTH)
9124 char *scan; /* Pointer to current position in target string */
9126 char *loceol = reginfo->strend; /* local version */
9127 I32 hardcount = 0; /* How many matches so far */
9128 bool utf8_target = reginfo->is_utf8_target;
9129 unsigned int to_complement = 0; /* Invert the result? */
9131 _char_class_number classnum;
9133 PERL_ARGS_ASSERT_REGREPEAT;
9136 if (max == REG_INFTY)
9138 else if (! utf8_target && loceol - scan > max)
9139 loceol = scan + max;
9141 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
9142 * to the maximum of how far we should go in it (leaving it set to the real
9143 * end, if the maximum permissible would take us beyond that). This allows
9144 * us to make the loop exit condition that we haven't gone past <loceol> to
9145 * also mean that we haven't exceeded the max permissible count, saving a
9146 * test each time through the loop. But it assumes that the OP matches a
9147 * single byte, which is true for most of the OPs below when applied to a
9148 * non-UTF-8 target. Those relatively few OPs that don't have this
9149 * characteristic will have to compensate.
9151 * There is no adjustment for UTF-8 targets, as the number of bytes per
9152 * character varies. OPs will have to test both that the count is less
9153 * than the max permissible (using <hardcount> to keep track), and that we
9154 * are still within the bounds of the string (using <loceol>. A few OPs
9155 * match a single byte no matter what the encoding. They can omit the max
9156 * test if, for the UTF-8 case, they do the adjustment that was skipped
9159 * Thus, the code above sets things up for the common case; and exceptional
9160 * cases need extra work; the common case is to make sure <scan> doesn't
9161 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
9162 * count doesn't exceed the maximum permissible */
9167 while (scan < loceol && hardcount < max && *scan != '\n') {
9168 scan += UTF8SKIP(scan);
9172 scan = (char *) memchr(scan, '\n', loceol - scan);
9180 while (scan < loceol && hardcount < max) {
9181 scan += UTF8SKIP(scan);
9189 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9190 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9191 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9195 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9199 /* Can use a simple find if the pattern char to match on is invariant
9200 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
9201 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
9202 * true iff it doesn't matter if the argument is in UTF-8 or not */
9203 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
9204 if (utf8_target && loceol - scan > max) {
9205 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
9206 * since here, to match at all, 1 char == 1 byte */
9207 loceol = scan + max;
9209 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9211 else if (reginfo->is_utf8_pat) {
9213 STRLEN scan_char_len;
9215 /* When both target and pattern are UTF-8, we have to do
9217 while (hardcount < max
9219 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
9220 && memEQ(scan, STRING(p), scan_char_len))
9222 scan += scan_char_len;
9226 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
9228 /* Target isn't utf8; convert the character in the UTF-8
9229 * pattern to non-UTF8, and do a simple find */
9230 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
9231 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c);
9232 } /* else pattern char is above Latin1, can't possibly match the
9237 /* Here, the string must be utf8; pattern isn't, and <c> is
9238 * different in utf8 than not, so can't compare them directly.
9239 * Outside the loop, find the two utf8 bytes that represent c, and
9240 * then look for those in sequence in the utf8 string */
9241 U8 high = UTF8_TWO_BYTE_HI(c);
9242 U8 low = UTF8_TWO_BYTE_LO(c);
9244 while (hardcount < max
9245 && scan + 1 < loceol
9246 && UCHARAT(scan) == high
9247 && UCHARAT(scan + 1) == low)
9255 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */
9256 assert(! reginfo->is_utf8_pat);
9259 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
9263 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9264 utf8_flags = FOLDEQ_LOCALE;
9267 case EXACTF: /* This node only generated for non-utf8 patterns */
9268 assert(! reginfo->is_utf8_pat);
9273 if (! utf8_target) {
9276 utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
9277 | FOLDEQ_S2_FOLDS_SANE;
9282 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
9286 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
9288 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9290 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
9293 if (c1 == CHRTEST_VOID) {
9294 /* Use full Unicode fold matching */
9295 char *tmpeol = reginfo->strend;
9296 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
9297 while (hardcount < max
9298 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
9299 STRING(p), NULL, pat_len,
9300 reginfo->is_utf8_pat, utf8_flags))
9303 tmpeol = reginfo->strend;
9307 else if (utf8_target) {
9309 while (scan < loceol
9311 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
9313 scan += UTF8SKIP(scan);
9318 while (scan < loceol
9320 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
9321 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
9323 scan += UTF8SKIP(scan);
9328 else if (c1 == c2) {
9329 scan = (char *) find_span_end((U8 *) scan, (U8 *) loceol, (U8) c1);
9332 /* See comments in regmatch() CURLY_B_min_known_fail. We avoid
9333 * a conditional each time through the loop if the characters
9334 * differ only in a single bit, as is the usual situation */
9335 U8 c1_c2_bits_differing = c1 ^ c2;
9337 if (isPOWER_OF_2(c1_c2_bits_differing)) {
9338 U8 c1_c2_mask = ~ c1_c2_bits_differing;
9340 scan = (char *) find_span_end_mask((U8 *) scan,
9346 while ( scan < loceol
9347 && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
9358 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9360 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
9361 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
9367 while (hardcount < max
9369 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
9371 scan += UTF8SKIP(scan);
9375 else if (ANYOF_FLAGS(p)) {
9376 while (scan < loceol
9377 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
9381 while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
9387 if (utf8_target && loceol - scan > max) {
9389 /* We didn't adjust <loceol> at the beginning of this routine
9390 * because is UTF-8, but it is actually ok to do so, since here, to
9391 * match, 1 char == 1 byte. */
9392 loceol = scan + max;
9395 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p));
9399 if (utf8_target && loceol - scan > max) {
9400 loceol = scan + max;
9403 scan = find_next_non_ascii(scan, loceol, utf8_target);
9408 while ( hardcount < max
9410 && ! isASCII_utf8_safe(scan, loceol))
9412 scan += UTF8SKIP(scan);
9417 scan = find_next_ascii(scan, loceol, utf8_target);
9421 /* The argument (FLAGS) to all the POSIX node types is the class number */
9428 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9429 if (! utf8_target) {
9430 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
9436 while (hardcount < max && scan < loceol
9437 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
9441 scan += UTF8SKIP(scan);
9454 if (utf8_target && loceol - scan > max) {
9456 /* We didn't adjust <loceol> at the beginning of this routine
9457 * because is UTF-8, but it is actually ok to do so, since here, to
9458 * match, 1 char == 1 byte. */
9459 loceol = scan + max;
9461 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
9474 if (! utf8_target) {
9475 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
9481 /* The complement of something that matches only ASCII matches all
9482 * non-ASCII, plus everything in ASCII that isn't in the class. */
9483 while (hardcount < max && scan < loceol
9484 && ( ! isASCII_utf8_safe(scan, reginfo->strend)
9485 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
9487 scan += UTF8SKIP(scan);
9498 if (! utf8_target) {
9499 while (scan < loceol && to_complement
9500 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
9507 classnum = (_char_class_number) FLAGS(p);
9510 while ( hardcount < max && scan < loceol
9511 && to_complement ^ cBOOL(_invlist_contains_cp(
9512 PL_XPosix_ptrs[classnum],
9513 utf8_to_uvchr_buf((U8 *) scan,
9517 scan += UTF8SKIP(scan);
9522 /* For the classes below, the knowledge of how to handle
9523 * every code point is compiled in to Perl via a macro.
9524 * This code is written for making the loops as tight as
9525 * possible. It could be refactored to save space instead.
9528 case _CC_ENUM_SPACE:
9529 while (hardcount < max
9532 ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
9534 scan += UTF8SKIP(scan);
9538 case _CC_ENUM_BLANK:
9539 while (hardcount < max
9542 ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
9544 scan += UTF8SKIP(scan);
9548 case _CC_ENUM_XDIGIT:
9549 while (hardcount < max
9552 ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
9554 scan += UTF8SKIP(scan);
9558 case _CC_ENUM_VERTSPACE:
9559 while (hardcount < max
9562 ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
9564 scan += UTF8SKIP(scan);
9568 case _CC_ENUM_CNTRL:
9569 while (hardcount < max
9572 ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
9574 scan += UTF8SKIP(scan);
9584 while (hardcount < max && scan < loceol &&
9585 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9590 /* LNBREAK can match one or two latin chars, which is ok, but we
9591 * have to use hardcount in this situation, and throw away the
9592 * adjustment to <loceol> done before the switch statement */
9593 loceol = reginfo->strend;
9594 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9603 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9617 /* These are all 0 width, so match right here or not at all. */
9621 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
9622 NOT_REACHED; /* NOTREACHED */
9629 c = scan - *startposp;
9633 GET_RE_DEBUG_FLAGS_DECL;
9635 SV * const prop = sv_newmortal();
9636 regprop(prog, prop, p, reginfo, NULL);
9637 Perl_re_exec_indentf( aTHX_ "%s can match %" IVdf " times out of %" IVdf "...\n",
9638 depth, SvPVX_const(prop),(IV)c,(IV)max);
9646 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
9648 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
9649 create a copy so that changes the caller makes won't change the shared one.
9650 If <altsvp> is non-null, will return NULL in it, for back-compat.
9653 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
9655 PERL_ARGS_ASSERT_REGCLASS_SWASH;
9661 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
9664 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
9667 - reginclass - determine if a character falls into a character class
9669 n is the ANYOF-type regnode
9670 p is the target string
9671 p_end points to one byte beyond the end of the target string
9672 utf8_target tells whether p is in UTF-8.
9674 Returns true if matched; false otherwise.
9676 Note that this can be a synthetic start class, a combination of various
9677 nodes, so things you think might be mutually exclusive, such as locale,
9678 aren't. It can match both locale and non-locale
9683 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
9686 const char flags = ANYOF_FLAGS(n);
9690 PERL_ARGS_ASSERT_REGINCLASS;
9692 /* If c is not already the code point, get it. Note that
9693 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9694 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
9696 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9697 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9698 if (c_len == (STRLEN)-1) {
9699 _force_out_malformed_utf8_message(p, p_end,
9701 1 /* 1 means die */ );
9702 NOT_REACHED; /* NOTREACHED */
9705 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
9706 && ! ANYOFL_UTF8_LOCALE_REQD(flags))
9708 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9712 /* If this character is potentially in the bitmap, check it */
9713 if (c < NUM_ANYOF_CODE_POINTS) {
9714 if (ANYOF_BITMAP_TEST(n, c))
9717 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9724 else if (flags & ANYOF_LOCALE_FLAGS) {
9725 if ((flags & ANYOFL_FOLD)
9727 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9731 else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9735 /* The data structure is arranged so bits 0, 2, 4, ... are set
9736 * if the class includes the Posix character class given by
9737 * bit/2; and 1, 3, 5, ... are set if the class includes the
9738 * complemented Posix class given by int(bit/2). So we loop
9739 * through the bits, each time changing whether we complement
9740 * the result or not. Suppose for the sake of illustration
9741 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
9742 * is set, it means there is a match for this ANYOF node if the
9743 * character is in the class given by the expression (0 / 2 = 0
9744 * = \w). If it is in that class, isFOO_lc() will return 1,
9745 * and since 'to_complement' is 0, the result will stay TRUE,
9746 * and we exit the loop. Suppose instead that bit 0 is 0, but
9747 * bit 1 is 1. That means there is a match if the character
9748 * matches \W. We won't bother to call isFOO_lc() on bit 0,
9749 * but will on bit 1. On the second iteration 'to_complement'
9750 * will be 1, so the exclusive or will reverse things, so we
9751 * are testing for \W. On the third iteration, 'to_complement'
9752 * will be 0, and we would be testing for \s; the fourth
9753 * iteration would test for \S, etc.
9755 * Note that this code assumes that all the classes are closed
9756 * under folding. For example, if a character matches \w, then
9757 * its fold does too; and vice versa. This should be true for
9758 * any well-behaved locale for all the currently defined Posix
9759 * classes, except for :lower: and :upper:, which are handled
9760 * by the pseudo-class :cased: which matches if either of the
9761 * other two does. To get rid of this assumption, an outer
9762 * loop could be used below to iterate over both the source
9763 * character, and its fold (if different) */
9766 int to_complement = 0;
9768 while (count < ANYOF_MAX) {
9769 if (ANYOF_POSIXL_TEST(n, count)
9770 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9783 /* If the bitmap didn't (or couldn't) match, and something outside the
9784 * bitmap could match, try that. */
9786 if (c >= NUM_ANYOF_CODE_POINTS
9787 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9789 match = TRUE; /* Everything above the bitmap matches */
9791 /* Here doesn't match everything above the bitmap. If there is
9792 * some information available beyond the bitmap, we may find a
9793 * match in it. If so, this is most likely because the code point
9794 * is outside the bitmap range. But rarely, it could be because of
9795 * some other reason. If so, various flags are set to indicate
9796 * this possibility. On ANYOFD nodes, there may be matches that
9797 * happen only when the target string is UTF-8; or for other node
9798 * types, because runtime lookup is needed, regardless of the
9799 * UTF-8ness of the target string. Finally, under /il, there may
9800 * be some matches only possible if the locale is a UTF-8 one. */
9801 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
9802 && ( c >= NUM_ANYOF_CODE_POINTS
9803 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9804 && ( UNLIKELY(OP(n) != ANYOFD)
9805 || (utf8_target && ! isASCII_uni(c)
9806 # if NUM_ANYOF_CODE_POINTS > 256
9810 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9811 && IN_UTF8_CTYPE_LOCALE)))
9813 SV* only_utf8_locale = NULL;
9814 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
9815 &only_utf8_locale, NULL);
9821 } else { /* Convert to utf8 */
9822 utf8_p = utf8_buffer;
9823 append_utf8_from_native_byte(*p, &utf8_p);
9824 utf8_p = utf8_buffer;
9827 if (swash_fetch(sw, utf8_p, TRUE)) {
9831 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9832 match = _invlist_contains_cp(only_utf8_locale, c);
9836 if (UNICODE_IS_SUPER(c)
9838 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9840 && ckWARN_d(WARN_NON_UNICODE))
9842 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
9843 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
9847 #if ANYOF_INVERT != 1
9848 /* Depending on compiler optimization cBOOL takes time, so if don't have to
9850 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9853 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
9854 return (flags & ANYOF_INVERT) ^ match;
9858 S_reghop3(U8 *s, SSize_t off, const U8* lim)
9860 /* return the position 'off' UTF-8 characters away from 's', forward if
9861 * 'off' >= 0, backwards if negative. But don't go outside of position
9862 * 'lim', which better be < s if off < 0 */
9864 PERL_ARGS_ASSERT_REGHOP3;
9867 while (off-- && s < lim) {
9868 /* XXX could check well-formedness here */
9869 U8 *new_s = s + UTF8SKIP(s);
9870 if (new_s > lim) /* lim may be in the middle of a long character */
9876 while (off++ && s > lim) {
9878 if (UTF8_IS_CONTINUED(*s)) {
9879 while (s > lim && UTF8_IS_CONTINUATION(*s))
9881 if (! UTF8_IS_START(*s)) {
9882 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9885 /* XXX could check well-formedness here */
9892 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
9894 PERL_ARGS_ASSERT_REGHOP4;
9897 while (off-- && s < rlim) {
9898 /* XXX could check well-formedness here */
9903 while (off++ && s > llim) {
9905 if (UTF8_IS_CONTINUED(*s)) {
9906 while (s > llim && UTF8_IS_CONTINUATION(*s))
9908 if (! UTF8_IS_START(*s)) {
9909 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9912 /* XXX could check well-formedness here */
9918 /* like reghop3, but returns NULL on overrun, rather than returning last
9922 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
9924 PERL_ARGS_ASSERT_REGHOPMAYBE3;
9927 while (off-- && s < lim) {
9928 /* XXX could check well-formedness here */
9935 while (off++ && s > lim) {
9937 if (UTF8_IS_CONTINUED(*s)) {
9938 while (s > lim && UTF8_IS_CONTINUATION(*s))
9940 if (! UTF8_IS_START(*s)) {
9941 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
9944 /* XXX could check well-formedness here */
9953 /* when executing a regex that may have (?{}), extra stuff needs setting
9954 up that will be visible to the called code, even before the current
9955 match has finished. In particular:
9957 * $_ is localised to the SV currently being matched;
9958 * pos($_) is created if necessary, ready to be updated on each call-out
9960 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9961 isn't set until the current pattern is successfully finished), so that
9962 $1 etc of the match-so-far can be seen;
9963 * save the old values of subbeg etc of the current regex, and set then
9964 to the current string (again, this is normally only done at the end
9969 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9972 regexp *const rex = ReANY(reginfo->prog);
9973 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
9975 eval_state->rex = rex;
9978 /* Make $_ available to executed code. */
9979 if (reginfo->sv != DEFSV) {
9981 DEFSV_set(reginfo->sv);
9984 if (!(mg = mg_find_mglob(reginfo->sv))) {
9985 /* prepare for quick setting of pos */
9986 mg = sv_magicext_mglob(reginfo->sv);
9989 eval_state->pos_magic = mg;
9990 eval_state->pos = mg->mg_len;
9991 eval_state->pos_flags = mg->mg_flags;
9994 eval_state->pos_magic = NULL;
9996 if (!PL_reg_curpm) {
9997 /* PL_reg_curpm is a fake PMOP that we can attach the current
9998 * regex to and point PL_curpm at, so that $1 et al are visible
9999 * within a /(?{})/. It's just allocated once per interpreter the
10000 * first time its needed */
10001 Newxz(PL_reg_curpm, 1, PMOP);
10002 #ifdef USE_ITHREADS
10004 SV* const repointer = &PL_sv_undef;
10005 /* this regexp is also owned by the new PL_reg_curpm, which
10006 will try to free it. */
10007 av_push(PL_regex_padav, repointer);
10008 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
10009 PL_regex_pad = AvARRAY(PL_regex_padav);
10013 SET_reg_curpm(reginfo->prog);
10014 eval_state->curpm = PL_curpm;
10015 PL_curpm_under = PL_curpm;
10016 PL_curpm = PL_reg_curpm;
10017 if (RXp_MATCH_COPIED(rex)) {
10018 /* Here is a serious problem: we cannot rewrite subbeg,
10019 since it may be needed if this match fails. Thus
10020 $` inside (?{}) could fail... */
10021 eval_state->subbeg = rex->subbeg;
10022 eval_state->sublen = rex->sublen;
10023 eval_state->suboffset = rex->suboffset;
10024 eval_state->subcoffset = rex->subcoffset;
10025 #ifdef PERL_ANY_COW
10026 eval_state->saved_copy = rex->saved_copy;
10028 RXp_MATCH_COPIED_off(rex);
10031 eval_state->subbeg = NULL;
10032 rex->subbeg = (char *)reginfo->strbeg;
10033 rex->suboffset = 0;
10034 rex->subcoffset = 0;
10035 rex->sublen = reginfo->strend - reginfo->strbeg;
10039 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10042 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10044 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10045 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
10048 Safefree(aux->poscache);
10052 /* undo the effects of S_setup_eval_state() */
10054 if (eval_state->subbeg) {
10055 regexp * const rex = eval_state->rex;
10056 rex->subbeg = eval_state->subbeg;
10057 rex->sublen = eval_state->sublen;
10058 rex->suboffset = eval_state->suboffset;
10059 rex->subcoffset = eval_state->subcoffset;
10060 #ifdef PERL_ANY_COW
10061 rex->saved_copy = eval_state->saved_copy;
10063 RXp_MATCH_COPIED_on(rex);
10065 if (eval_state->pos_magic)
10067 eval_state->pos_magic->mg_len = eval_state->pos;
10068 eval_state->pos_magic->mg_flags =
10069 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10070 | (eval_state->pos_flags & MGf_BYTES);
10073 PL_curpm = eval_state->curpm;
10076 PL_regmatch_state = aux->old_regmatch_state;
10077 PL_regmatch_slab = aux->old_regmatch_slab;
10079 /* free all slabs above current one - this must be the last action
10080 * of this function, as aux and eval_state are allocated within
10081 * slabs and may be freed here */
10083 s = PL_regmatch_slab->next;
10085 PL_regmatch_slab->next = NULL;
10087 regmatch_slab * const osl = s;
10096 S_to_utf8_substr(pTHX_ regexp *prog)
10098 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10099 * on the converted value */
10103 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10106 if (prog->substrs->data[i].substr
10107 && !prog->substrs->data[i].utf8_substr) {
10108 SV* const sv = newSVsv(prog->substrs->data[i].substr);
10109 prog->substrs->data[i].utf8_substr = sv;
10110 sv_utf8_upgrade(sv);
10111 if (SvVALID(prog->substrs->data[i].substr)) {
10112 if (SvTAIL(prog->substrs->data[i].substr)) {
10113 /* Trim the trailing \n that fbm_compile added last
10115 SvCUR_set(sv, SvCUR(sv) - 1);
10116 /* Whilst this makes the SV technically "invalid" (as its
10117 buffer is no longer followed by "\0") when fbm_compile()
10118 adds the "\n" back, a "\0" is restored. */
10119 fbm_compile(sv, FBMcf_TAIL);
10121 fbm_compile(sv, 0);
10123 if (prog->substrs->data[i].substr == prog->check_substr)
10124 prog->check_utf8 = sv;
10130 S_to_byte_substr(pTHX_ regexp *prog)
10132 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
10133 * on the converted value; returns FALSE if can't be converted. */
10137 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
10140 if (prog->substrs->data[i].utf8_substr
10141 && !prog->substrs->data[i].substr) {
10142 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
10143 if (! sv_utf8_downgrade(sv, TRUE)) {
10146 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
10147 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
10148 /* Trim the trailing \n that fbm_compile added last
10150 SvCUR_set(sv, SvCUR(sv) - 1);
10151 fbm_compile(sv, FBMcf_TAIL);
10153 fbm_compile(sv, 0);
10155 prog->substrs->data[i].substr = sv;
10156 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
10157 prog->check_substr = sv;
10164 #ifndef PERL_IN_XSUB_RE
10167 Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
10169 /* Temporary helper function for toke.c. Verify that the code point 'cp'
10170 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
10171 * the larger string bounded by 'strbeg' and 'strend'.
10173 * 'cp' needs to be assigned (if not a future version of the Unicode
10174 * Standard could make it something that combines with adjacent characters,
10175 * so code using it would then break), and there has to be a GCB break
10176 * before and after the character. */
10178 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
10179 const U8 * prev_cp_start;
10181 PERL_ARGS_ASSERT__IS_GRAPHEME;
10183 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
10184 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
10186 /* These are considered graphemes */
10190 /* Otherwise, unassigned code points are forbidden */
10191 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
10192 _invlist_search(PL_Assigned_invlist, cp))))
10197 cp_gcb_val = getGCB_VAL_CP(cp);
10199 /* Find the GCB value of the previous code point in the input */
10200 prev_cp_start = utf8_hop_back(s, -1, strbeg);
10201 if (UNLIKELY(prev_cp_start == s)) {
10202 prev_cp_gcb_val = GCB_EDGE;
10205 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
10208 /* And check that is a grapheme boundary */
10209 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
10210 TRUE /* is UTF-8 encoded */ ))
10215 /* Similarly verify there is a break between the current character and the
10219 next_cp_gcb_val = GCB_EDGE;
10222 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
10225 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
10229 =head1 Unicode Support
10231 =for apidoc isSCRIPT_RUN
10233 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
10234 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
10235 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
10236 two degenerate cases given below, this function returns TRUE iff all code
10237 points in it come from any combination of three "scripts" given by the Unicode
10238 "Script Extensions" property: Common, Inherited, and possibly one other.
10239 Additionally all decimal digits must come from the same consecutive sequence of
10242 For example, if all the characters in the sequence are Greek, or Common, or
10243 Inherited, this function will return TRUE, provided any decimal digits in it
10244 are the ASCII digits "0".."9". For scripts (unlike Greek) that have their own
10245 digits defined this will accept either digits from that set or from 0..9, but
10246 not a combination of the two. Some scripts, such as Arabic, have more than one
10247 set of digits. All digits must come from the same set for this function to
10250 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
10251 contain the script found, using the C<SCX_enum> typedef. Its value will be
10252 C<SCX_INVALID> if the function returns FALSE.
10254 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
10255 will be C<SCX_INVALID>.
10257 If the sequence contains a single code point which is unassigned to a character
10258 in the version of Unicode being used, the function will return TRUE, and the
10259 script will be C<SCX_Unknown>. Any other combination of unassigned code points
10260 in the input sequence will result in the function treating the input as not
10261 being a script run.
10263 The returned script will be C<SCX_Inherited> iff all the code points in it are
10264 from the Inherited script.
10266 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
10267 it are from the Inherited or Common scripts.
10274 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
10276 /* Basically, it looks at each character in the sequence to see if the
10277 * above conditions are met; if not it fails. It uses an inversion map to
10278 * find the enum corresponding to the script of each character. But this
10279 * is complicated by the fact that a few code points can be in any of
10280 * several scripts. The data has been constructed so that there are
10281 * additional enum values (all negative) for these situations. The
10282 * absolute value of those is an index into another table which contains
10283 * pointers to auxiliary tables for each such situation. Each aux array
10284 * lists all the scripts for the given situation. There is another,
10285 * parallel, table that gives the number of entries in each aux table.
10286 * These are all defined in charclass_invlists.h */
10288 /* XXX Here are the additional things UTS 39 says could be done:
10290 * Forbid sequences of the same nonspacing mark
10292 * Check to see that all the characters are in the sets of exemplar
10293 * characters for at least one language in the Unicode Common Locale Data
10294 * Repository [CLDR]. */
10297 /* Things that match /\d/u */
10298 SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
10299 UV * decimals_array = invlist_array(decimals_invlist);
10301 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
10302 * not currently known) */
10303 UV zero_of_run = 0;
10305 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
10306 SCX_enum script_of_char = SCX_INVALID;
10308 /* If the script remains not fully determined from iteration to iteration,
10309 * this is the current intersection of the possiblities. */
10310 SCX_enum * intersection = NULL;
10311 PERL_UINT_FAST8_T intersection_len = 0;
10313 bool retval = TRUE;
10314 SCX_enum * ret_script = NULL;
10318 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
10320 /* All code points in 0..255 are either Common or Latin, so must be a
10321 * script run. We can return immediately unless we need to know which
10323 if (! utf8_target && LIKELY(send > s)) {
10324 if (ret_script == NULL) {
10328 /* If any character is Latin, the run is Latin */
10330 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
10331 *ret_script = SCX_Latin;
10336 /* Here, all are Common */
10337 *ret_script = SCX_Common;
10341 /* Look at each character in the sequence */
10343 /* If the current character being examined is a digit, this is the code
10344 * point of the zero for its sequence of 10 */
10349 /* The code allows all scripts to use the ASCII digits. This is
10350 * because they are used in commerce even in scripts that have their
10351 * own set. Hence any ASCII ones found are ok, unless and until a
10352 * digit from another set has already been encountered. (The other
10353 * digit ranges in Common are not similarly blessed) */
10354 if (UNLIKELY(isDIGIT(*s))) {
10355 if (UNLIKELY(script_of_run == SCX_Unknown)) {
10360 if (zero_of_run != '0') {
10372 /* Here, isn't an ASCII digit. Find the code point of the character */
10373 if (! UTF8_IS_INVARIANT(*s)) {
10375 cp = valid_utf8_to_uvchr((U8 *) s, &len);
10382 /* If is within the range [+0 .. +9] of the script's zero, it also is a
10383 * digit in that script. We can skip the rest of this code for this
10385 if (UNLIKELY( zero_of_run
10386 && cp >= zero_of_run
10387 && cp - zero_of_run <= 9))
10392 /* Find the character's script. The correct values are hard-coded here
10393 * for small-enough code points. */
10394 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
10395 unlikely to change */
10397 || ( isALPHA_L1(cp)
10398 && LIKELY(cp != MICRO_SIGN_NATIVE)))
10400 script_of_char = SCX_Latin;
10403 script_of_char = SCX_Common;
10407 script_of_char = _Perl_SCX_invmap[
10408 _invlist_search(PL_SCX_invlist, cp)];
10411 /* We arbitrarily accept a single unassigned character, but not in
10412 * combination with anything else, and not a run of them. */
10413 if ( UNLIKELY(script_of_run == SCX_Unknown)
10414 || UNLIKELY( script_of_run != SCX_INVALID
10415 && script_of_char == SCX_Unknown))
10421 /* For the first character, or the run is inherited, the run's script
10422 * is set to the char's */
10423 if ( UNLIKELY(script_of_run == SCX_INVALID)
10424 || UNLIKELY(script_of_run == SCX_Inherited))
10426 script_of_run = script_of_char;
10429 /* For the character's script to be Unknown, it must be the first
10430 * character in the sequence (for otherwise a test above would have
10431 * prevented us from reaching here), and we have set the run's script
10432 * to it. Nothing further to be done for this character */
10433 if (UNLIKELY(script_of_char == SCX_Unknown)) {
10437 /* We accept 'inherited' script characters currently even at the
10438 * beginning. (We know that no characters in Inherited are digits, or
10439 * we'd have to check for that) */
10440 if (UNLIKELY(script_of_char == SCX_Inherited)) {
10444 /* If the run so far is Common, and the new character isn't, change the
10445 * run's script to that of this character */
10446 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
10448 /* But Common contains several sets of digits. Only the '0' set
10449 * can be part of another script. */
10450 if (zero_of_run && zero_of_run != '0') {
10455 script_of_run = script_of_char;
10458 /* Now we can see if the script of the character is the same as that of
10460 if (LIKELY(script_of_char == script_of_run)) {
10461 /* By far the most common case */
10462 goto scripts_match;
10465 /* Here, the script of the run isn't Common. But characters in Common
10466 * match any script */
10467 if (script_of_char == SCX_Common) {
10468 goto scripts_match;
10471 #ifndef HAS_SCX_AUX_TABLES
10473 /* Too early a Unicode version to have a code point belonging to more
10474 * than one script, so, if the scripts don't exactly match, fail */
10475 PERL_UNUSED_VAR(intersection_len);
10481 /* Here there is no exact match between the character's script and the
10482 * run's. And we've handled the special cases of scripts Unknown,
10483 * Inherited, and Common.
10485 * Negative script numbers signify that the value may be any of several
10486 * scripts, and we need to look at auxiliary information to make our
10487 * deterimination. But if both are non-negative, we can fail now */
10488 if (LIKELY(script_of_char >= 0)) {
10489 const SCX_enum * search_in;
10490 PERL_UINT_FAST8_T search_in_len;
10491 PERL_UINT_FAST8_T i;
10493 if (LIKELY(script_of_run >= 0)) {
10498 /* Use the previously constructed set of possible scripts, if any.
10500 if (intersection) {
10501 search_in = intersection;
10502 search_in_len = intersection_len;
10505 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
10506 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
10509 for (i = 0; i < search_in_len; i++) {
10510 if (search_in[i] == script_of_char) {
10511 script_of_run = script_of_char;
10512 goto scripts_match;
10519 else if (LIKELY(script_of_run >= 0)) {
10520 /* script of character could be one of several, but run is a single
10522 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
10523 const PERL_UINT_FAST8_T search_in_len
10524 = SCX_AUX_TABLE_lengths[-script_of_char];
10525 PERL_UINT_FAST8_T i;
10527 for (i = 0; i < search_in_len; i++) {
10528 if (search_in[i] == script_of_run) {
10529 script_of_char = script_of_run;
10530 goto scripts_match;
10538 /* Both run and char could be in one of several scripts. If the
10539 * intersection is empty, then this character isn't in this script
10540 * run. Otherwise, we need to calculate the intersection to use
10541 * for future iterations of the loop, unless we are already at the
10542 * final character */
10543 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
10544 const PERL_UINT_FAST8_T char_len
10545 = SCX_AUX_TABLE_lengths[-script_of_char];
10546 const SCX_enum * search_run;
10547 PERL_UINT_FAST8_T run_len;
10549 SCX_enum * new_overlap = NULL;
10550 PERL_UINT_FAST8_T i, j;
10552 if (intersection) {
10553 search_run = intersection;
10554 run_len = intersection_len;
10557 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
10558 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
10561 intersection_len = 0;
10563 for (i = 0; i < run_len; i++) {
10564 for (j = 0; j < char_len; j++) {
10565 if (search_run[i] == search_char[j]) {
10567 /* Here, the script at i,j matches. That means this
10568 * character is in the run. But continue on to find
10569 * the complete intersection, for the next loop
10570 * iteration, and for the digit check after it.
10572 * On the first found common script, we malloc space
10573 * for the intersection list for the worst case of the
10574 * intersection, which is the minimum of the number of
10575 * scripts remaining in each set. */
10576 if (intersection_len == 0) {
10578 MIN(run_len - i, char_len - j),
10581 new_overlap[intersection_len++] = search_run[i];
10586 /* Here we've looked through everything. If they have no scripts
10587 * in common, not a run */
10588 if (intersection_len == 0) {
10593 /* If there is only a single script in common, set to that.
10594 * Otherwise, use the intersection going forward */
10595 Safefree(intersection);
10596 intersection = NULL;
10597 if (intersection_len == 1) {
10598 script_of_run = script_of_char = new_overlap[0];
10599 Safefree(new_overlap);
10600 new_overlap = NULL;
10603 intersection = new_overlap;
10611 /* Here, the script of the character is compatible with that of the
10612 * run. That means that in most cases, it continues the script run.
10613 * Either it and the run match exactly, or one or both can be in any of
10614 * several scripts, and the intersection is not empty. However, if the
10615 * character is a decimal digit, it could still mean failure if it is
10616 * from the wrong sequence of 10. So, we need to look at if it's a
10617 * digit. We've already handled the 10 decimal digits, and the next
10618 * lowest one is this one: */
10619 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
10620 continue; /* Not a digit; this character is part of the run */
10623 /* If we have a definitive '0' for the script of this character, we
10624 * know that for this to be a digit, it must be in the range of +0..+9
10626 if ( script_of_char >= 0
10627 && (zero_of_char = script_zeros[script_of_char]))
10629 if ( cp < zero_of_char
10630 || cp > zero_of_char + 9)
10632 continue; /* Not a digit; this character is part of the run
10637 else { /* Need to look up if this character is a digit or not */
10638 SSize_t index_of_zero_of_char;
10639 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
10640 if ( UNLIKELY(index_of_zero_of_char < 0)
10641 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
10643 continue; /* Not a digit; this character is part of the run.
10647 zero_of_char = decimals_array[index_of_zero_of_char];
10650 /* Here, the character is a decimal digit, and the zero of its sequence
10651 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
10652 * they better be the same. */
10654 if (zero_of_run != zero_of_char) {
10659 else if (script_of_char == SCX_Common && script_of_run != SCX_Common) {
10661 /* Here, the script run isn't Common, but the current digit is in
10662 * Common, and isn't '0'-'9' (those were handled earlier). Only
10663 * '0'-'9' are acceptable in non-Common scripts. */
10667 else { /* Otherwise we now have a zero for this run */
10668 zero_of_run = zero_of_char;
10670 } /* end of looping through CLOSESR text */
10672 Safefree(intersection);
10674 if (ret_script != NULL) {
10676 *ret_script = script_of_run;
10679 *ret_script = SCX_INVALID;
10686 #endif /* ifndef PERL_IN_XSUB_RE */
10689 * ex: set ts=8 sts=4 sw=4 et: