5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
86 static const char b_utf8_locale_required[] =
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88 " Assuming a UTF-8 locale";
90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
92 if (! IN_UTF8_CTYPE_LOCALE) { \
93 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
94 b_utf8_locale_required); \
98 static const char sets_utf8_locale_required[] =
99 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
103 if (! IN_UTF8_CTYPE_LOCALE && (FLAGS(n) & ANYOFL_UTF8_LOCALE_REQD)){\
104 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
105 sets_utf8_locale_required); \
110 /* At least one required character in the target string is expressible only in
112 static const char non_utf8_target_but_utf8_required[]
113 = "Can't match, because target string needs to be in UTF-8\n";
116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
117 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
122 #define STATIC static
129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131 #define HOPc(pos,off) \
132 (char *)(reginfo->is_utf8_target \
133 ? reghop3((U8*)pos, off, \
134 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138 #define HOPBACK3(pos, off, lim) \
139 (reginfo->is_utf8_target \
140 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141 : (pos - off >= lim) \
145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150 /* lim must be +ve. Returns NULL on overshoot */
151 #define HOPMAYBE3(pos,off,lim) \
152 (reginfo->is_utf8_target \
153 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
154 : ((U8*)pos + off <= lim) \
158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159 * off must be >=0; args should be vars rather than expressions */
160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162 : (U8*)((pos + off) > lim ? lim : (pos + off)))
163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173 /* for use after a quantifier and before an EXACT-like node -- japhy */
174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176 * NOTE that *nothing* that affects backtracking should be in here, specifically
177 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
178 * node that is in between two EXACT like nodes when ascertaining what the required
179 * "follow" character is. This should probably be moved to regex compile time
180 * although it may be done at run time beause of the REF possibility - more
181 * investigation required. -- demerphq
183 #define JUMPABLE(rn) ( \
185 (OP(rn) == CLOSE && \
186 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
188 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
189 OP(rn) == PLUS || OP(rn) == MINMOD || \
191 (REGNODE_TYPE(OP(rn)) == CURLY && ARG1(rn) > 0) \
193 #define IS_EXACT(rn) (REGNODE_TYPE(OP(rn)) == EXACT)
195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || REGNODE_TYPE(OP(rn)) == REF )
198 Search for mandatory following text node; for lookahead, the text must
199 follow but for lookbehind (rn->flags != 0) we skip to the next step.
201 #define FIND_NEXT_IMPT(rn) STMT_START { \
202 while (JUMPABLE(rn)) { \
203 const OPCODE type = OP(rn); \
204 if (type == SUSPEND || REGNODE_TYPE(type) == CURLY) \
205 rn = REGNODE_AFTER_opcode(rn,type); \
206 else if (type == PLUS) \
207 rn = REGNODE_AFTER_type(rn,tregnode_PLUS); \
208 else if (type == IFMATCH) \
209 rn = (rn->flags == 0) ? REGNODE_AFTER_type(rn,tregnode_IFMATCH) : rn + ARG(rn); \
210 else rn += NEXT_OFF(rn); \
214 #define SLAB_FIRST(s) (&(s)->states[0])
215 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
219 static regmatch_state * S_push_slab(pTHX);
221 #define REGCP_PAREN_ELEMS 3
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
230 const int retval = PL_savestack_ix;
231 const int paren_elems_to_push =
232 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
233 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
236 DECLARE_AND_GET_RE_DEBUG_FLAGS;
238 PERL_ARGS_ASSERT_REGCPPUSH;
240 if (paren_elems_to_push < 0)
241 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242 (int)paren_elems_to_push, (int)maxopenparen,
243 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
245 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
246 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
247 " out of range (%lu-%ld)",
249 (unsigned long)maxopenparen,
252 SSGROW(total_elems + REGCP_FRAME_ELEMS);
255 if ((int)maxopenparen > (int)parenfloor)
256 Perl_re_exec_indentf( aTHX_
257 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
263 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
265 SSPUSHIV(rex->offs[p].end);
266 SSPUSHIV(rex->offs[p].start);
267 SSPUSHINT(rex->offs[p].start_tmp);
268 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
269 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
272 (IV)rex->offs[p].start,
273 (IV)rex->offs[p].start_tmp,
277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
278 SSPUSHINT(maxopenparen);
279 SSPUSHINT(rex->lastparen);
280 SSPUSHINT(rex->lastcloseparen);
281 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
286 /* These are needed since we do not localize EVAL nodes: */
287 #define REGCP_SET(cp) \
289 Perl_re_exec_indentf( aTHX_ \
290 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
291 depth, (IV)PL_savestack_ix \
296 #define REGCP_UNWIND(cp) \
298 if (cp != PL_savestack_ix) \
299 Perl_re_exec_indentf( aTHX_ \
300 "Clearing an EVAL scope, savestack=%" \
301 IVdf "..%" IVdf "\n", \
302 depth, (IV)(cp), (IV)PL_savestack_ix \
307 /* set the start and end positions of capture ix */
308 #define CLOSE_CAPTURE(ix, s, e) \
309 rex->offs[ix].start = s; \
310 rex->offs[ix].end = e; \
311 if (ix > rex->lastparen) \
312 rex->lastparen = ix; \
313 rex->lastcloseparen = ix; \
314 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
315 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
320 (IV)rex->offs[ix].start, \
321 (IV)rex->offs[ix].end, \
325 #define UNWIND_PAREN(lp, lcp) \
326 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
327 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
332 (UV)(rex->lastparen), \
335 for (n = rex->lastparen; n > lp; n--) \
336 rex->offs[n].end = -1; \
337 rex->lastparen = n; \
338 rex->lastcloseparen = lcp;
342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
346 DECLARE_AND_GET_RE_DEBUG_FLAGS;
348 PERL_ARGS_ASSERT_REGCPPOP;
350 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
352 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
354 rex->lastcloseparen = SSPOPINT;
355 rex->lastparen = SSPOPINT;
356 *maxopenparen_p = SSPOPINT;
358 i -= REGCP_OTHER_ELEMS;
359 /* Now restore the parentheses context. */
361 if (i || rex->lastparen + 1 <= rex->nparens)
362 Perl_re_exec_indentf( aTHX_
363 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
369 paren = *maxopenparen_p;
370 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
372 rex->offs[paren].start_tmp = SSPOPINT;
373 rex->offs[paren].start = SSPOPIV;
375 if (paren <= rex->lastparen)
376 rex->offs[paren].end = tmps;
377 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
378 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 if (i > *maxopenparen_p)
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
402 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
403 " \\%" UVuf ": %s ..-1 undeffing\n",
406 (i > *maxopenparen_p) ? "-1" : " "
412 /* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
418 I32 tmpix = PL_savestack_ix;
419 PERL_ARGS_ASSERT_REGCP_RESTORE;
421 PL_savestack_ix = ix;
422 regcppop(rex, maxopenparen_p);
423 PL_savestack_ix = tmpix;
426 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
431 /* Returns a boolean as to whether or not 'character' is a member of the
432 * Posix character class given by 'classnum' that should be equivalent to a
433 * value in the typedef 'char_class_number_'.
435 * Ideally this could be replaced by a just an array of function pointers
436 * to the C library functions that implement the macros this calls.
437 * However, to compile, the precise function signatures are required, and
438 * these may vary from platform to platform. To avoid having to figure
439 * out what those all are on each platform, I (khw) am using this method,
440 * which adds an extra layer of function call overhead (unless the C
441 * optimizer strips it away). But we don't particularly care about
442 * performance with locales anyway. */
444 if (IN_UTF8_CTYPE_LOCALE) {
445 return cBOOL(generic_isCC_(character, classnum));
448 switch ((char_class_number_) classnum) {
449 case CC_ENUM_ALPHANUMERIC_: return isU8_ALPHANUMERIC_LC(character);
450 case CC_ENUM_ALPHA_: return isU8_ALPHA_LC(character);
451 case CC_ENUM_ASCII_: return isU8_ASCII_LC(character);
452 case CC_ENUM_BLANK_: return isU8_BLANK_LC(character);
453 case CC_ENUM_CASED_: return isU8_CASED_LC(character);
454 case CC_ENUM_CNTRL_: return isU8_CNTRL_LC(character);
455 case CC_ENUM_DIGIT_: return isU8_DIGIT_LC(character);
456 case CC_ENUM_GRAPH_: return isU8_GRAPH_LC(character);
457 case CC_ENUM_LOWER_: return isU8_LOWER_LC(character);
458 case CC_ENUM_PRINT_: return isU8_PRINT_LC(character);
459 case CC_ENUM_PUNCT_: return isU8_PUNCT_LC(character);
460 case CC_ENUM_SPACE_: return isU8_SPACE_LC(character);
461 case CC_ENUM_UPPER_: return isU8_UPPER_LC(character);
462 case CC_ENUM_WORDCHAR_: return isU8_WORDCHAR_LC(character);
463 case CC_ENUM_XDIGIT_: return isU8_XDIGIT_LC(character);
464 default: /* VERTSPACE should never occur in locales */
469 "panic: isFOO_lc() has an unexpected character class '%d'",
472 NOT_REACHED; /* NOTREACHED */
476 PERL_STATIC_INLINE I32
477 S_foldEQ_latin1_s2_folded(pTHX_ const char *s1, const char *s2, I32 len)
479 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
480 * folded. Works on all folds representable without UTF-8, except for
481 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
482 * check that the strings each have at least 'len' characters.
484 * There is almost an identical API function where s2 need not be folded:
485 * Perl_foldEQ_latin1() */
487 const U8 *a = (const U8 *)s1;
488 const U8 *b = (const U8 *)s2;
490 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
495 assert(! isUPPER_L1(*b));
496 if (toLOWER_L1(*a) != *b) {
505 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
507 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
508 * 'character' is a member of the Posix character class given by 'classnum'
509 * that should be equivalent to a value in the typedef
510 * 'char_class_number_'.
512 * This just calls isFOO_lc on the code point for the character if it is in
513 * the range 0-255. Outside that range, all characters use Unicode
514 * rules, ignoring any locale. So use the Unicode function if this class
515 * requires an inversion list, and use the Unicode macro otherwise. */
518 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
520 if (UTF8_IS_INVARIANT(*character)) {
521 return isFOO_lc(classnum, *character);
523 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
524 return isFOO_lc(classnum,
525 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
528 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
530 switch ((char_class_number_) classnum) {
531 case CC_ENUM_SPACE_: return is_XPERLSPACE_high(character);
532 case CC_ENUM_BLANK_: return is_HORIZWS_high(character);
533 case CC_ENUM_XDIGIT_: return is_XDIGIT_high(character);
534 case CC_ENUM_VERTSPACE_: return is_VERTWS_high(character);
536 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
537 utf8_to_uvchr_buf(character, e, NULL));
540 return FALSE; /* Things like CNTRL are always below 256 */
544 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
546 /* Returns the position of the first byte in the sequence between 's' and
547 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
550 PERL_ARGS_ASSERT_FIND_SPAN_END;
554 if ((STRLEN) (send - s) >= PERL_WORDSIZE
555 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
556 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
558 PERL_UINTMAX_T span_word;
560 /* Process per-byte until reach word boundary. XXX This loop could be
561 * eliminated if we knew that this platform had fast unaligned reads */
562 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
563 if (*s != span_byte) {
569 /* Create a word filled with the bytes we are spanning */
570 span_word = PERL_COUNT_MULTIPLIER * span_byte;
572 /* Process per-word as long as we have at least a full word left */
575 /* Keep going if the whole word is composed of 'span_byte's */
576 if ((* (PERL_UINTMAX_T *) s) == span_word) {
581 /* Here, at least one byte in the word isn't 'span_byte'. */
589 /* This xor leaves 1 bits only in those non-matching bytes */
590 span_word ^= * (PERL_UINTMAX_T *) s;
592 /* Make sure the upper bit of each non-matching byte is set. This
593 * makes each such byte look like an ASCII platform variant byte */
594 span_word |= span_word << 1;
595 span_word |= span_word << 2;
596 span_word |= span_word << 4;
598 /* That reduces the problem to what this function solves */
599 return s + variant_byte_number(span_word);
603 } while (s + PERL_WORDSIZE <= send);
606 /* Process the straggler bytes beyond the final word boundary */
608 if (*s != span_byte) {
618 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
620 /* Returns the position of the first byte in the sequence between 's'
621 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
622 * returns 'send' if none found. It uses word-level operations instead of
623 * byte to speed up the process */
625 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
628 assert((byte & mask) == byte);
632 if ((STRLEN) (send - s) >= PERL_WORDSIZE
633 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
634 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
636 PERL_UINTMAX_T word, mask_word;
638 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
639 if (((*s) & mask) == byte) {
645 word = PERL_COUNT_MULTIPLIER * byte;
646 mask_word = PERL_COUNT_MULTIPLIER * mask;
649 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
651 /* If 'masked' contains bytes with the bit pattern of 'byte' within
652 * it, xoring with 'word' will leave each of the 8 bits in such
653 * bytes be 0, and no byte containing any other bit pattern will be
657 /* This causes the most significant bit to be set to 1 for any
658 * bytes in the word that aren't completely 0 */
659 masked |= masked << 1;
660 masked |= masked << 2;
661 masked |= masked << 4;
663 /* The msbits are the same as what marks a byte as variant, so we
664 * can use this mask. If all msbits are 1, the word doesn't
666 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
671 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
672 * and any that are, are 0. Complement and re-AND to swap that */
674 masked &= PERL_VARIANTS_WORD_MASK;
676 /* This reduces the problem to that solved by this function */
677 s += variant_byte_number(masked);
680 } while (s + PERL_WORDSIZE <= send);
686 if (((*s) & mask) == byte) {
696 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
698 /* Returns the position of the first byte in the sequence between 's' and
699 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
700 * 'span_byte' should have been ANDed with 'mask' in the call of this
701 * function. Returns 'send' if none found. Works like find_span_end(),
702 * except for the AND */
704 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
707 assert((span_byte & mask) == span_byte);
709 if ((STRLEN) (send - s) >= PERL_WORDSIZE
710 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
711 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
713 PERL_UINTMAX_T span_word, mask_word;
715 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
716 if (((*s) & mask) != span_byte) {
722 span_word = PERL_COUNT_MULTIPLIER * span_byte;
723 mask_word = PERL_COUNT_MULTIPLIER * mask;
726 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
728 if (masked == span_word) {
740 masked |= masked << 1;
741 masked |= masked << 2;
742 masked |= masked << 4;
743 return s + variant_byte_number(masked);
747 } while (s + PERL_WORDSIZE <= send);
751 if (((*s) & mask) != span_byte) {
761 * pregexec and friends
764 #ifndef PERL_IN_XSUB_RE
766 - pregexec - match a regexp against a string
769 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
770 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
771 /* stringarg: the point in the string at which to begin matching */
772 /* strend: pointer to null at end of string */
773 /* strbeg: real beginning of string */
774 /* minend: end of match must be >= minend bytes after stringarg. */
775 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
776 * itself is accessed via the pointers above */
777 /* nosave: For optimizations. */
779 PERL_ARGS_ASSERT_PREGEXEC;
782 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
783 nosave ? 0 : REXEC_COPY_STR);
789 /* re_intuit_start():
791 * Based on some optimiser hints, try to find the earliest position in the
792 * string where the regex could match.
794 * rx: the regex to match against
795 * sv: the SV being matched: only used for utf8 flag; the string
796 * itself is accessed via the pointers below. Note that on
797 * something like an overloaded SV, SvPOK(sv) may be false
798 * and the string pointers may point to something unrelated to
800 * strbeg: real beginning of string
801 * strpos: the point in the string at which to begin matching
802 * strend: pointer to the byte following the last char of the string
803 * flags currently unused; set to 0
804 * data: currently unused; set to NULL
806 * The basic idea of re_intuit_start() is to use some known information
807 * about the pattern, namely:
809 * a) the longest known anchored substring (i.e. one that's at a
810 * constant offset from the beginning of the pattern; but not
811 * necessarily at a fixed offset from the beginning of the
813 * b) the longest floating substring (i.e. one that's not at a constant
814 * offset from the beginning of the pattern);
815 * c) Whether the pattern is anchored to the string; either
816 * an absolute anchor: /^../, or anchored to \n: /^.../m,
817 * or anchored to pos(): /\G/;
818 * d) A start class: a real or synthetic character class which
819 * represents which characters are legal at the start of the pattern;
821 * to either quickly reject the match, or to find the earliest position
822 * within the string at which the pattern might match, thus avoiding
823 * running the full NFA engine at those earlier locations, only to
824 * eventually fail and retry further along.
826 * Returns NULL if the pattern can't match, or returns the address within
827 * the string which is the earliest place the match could occur.
829 * The longest of the anchored and floating substrings is called 'check'
830 * and is checked first. The other is called 'other' and is checked
831 * second. The 'other' substring may not be present. For example,
833 * /(abc|xyz)ABC\d{0,3}DEFG/
837 * check substr (float) = "DEFG", offset 6..9 chars
838 * other substr (anchored) = "ABC", offset 3..3 chars
841 * Be aware that during the course of this function, sometimes 'anchored'
842 * refers to a substring being anchored relative to the start of the
843 * pattern, and sometimes to the pattern itself being anchored relative to
844 * the string. For example:
846 * /\dabc/: "abc" is anchored to the pattern;
847 * /^\dabc/: "abc" is anchored to the pattern and the string;
848 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
849 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
850 * but the pattern is anchored to the string.
854 Perl_re_intuit_start(pTHX_
857 const char * const strbeg,
861 re_scream_pos_data *data)
863 struct regexp *const prog = ReANY(rx);
864 SSize_t start_shift = prog->check_offset_min;
865 /* Should be nonnegative! */
866 SSize_t end_shift = 0;
867 /* current lowest pos in string where the regex can start matching */
868 char *rx_origin = strpos;
870 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
871 U8 other_ix = 1 - prog->substrs->check_ix;
873 char *other_last = strpos;/* latest pos 'other' substr already checked to */
874 char *check_at = NULL; /* check substr found at this pos */
875 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
876 RXi_GET_DECL(prog,progi);
877 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
878 regmatch_info *const reginfo = ®info_buf;
879 DECLARE_AND_GET_RE_DEBUG_FLAGS;
881 PERL_ARGS_ASSERT_RE_INTUIT_START;
882 PERL_UNUSED_ARG(flags);
883 PERL_UNUSED_ARG(data);
885 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
886 "Intuit: trying to determine minimum start position...\n"));
888 /* for now, assume that all substr offsets are positive. If at some point
889 * in the future someone wants to do clever things with lookbehind and
890 * -ve offsets, they'll need to fix up any code in this function
891 * which uses these offsets. See the thread beginning
892 * <20140113145929.GF27210@iabyn.com>
894 assert(prog->substrs->data[0].min_offset >= 0);
895 assert(prog->substrs->data[0].max_offset >= 0);
896 assert(prog->substrs->data[1].min_offset >= 0);
897 assert(prog->substrs->data[1].max_offset >= 0);
898 assert(prog->substrs->data[2].min_offset >= 0);
899 assert(prog->substrs->data[2].max_offset >= 0);
901 /* for now, assume that if both present, that the floating substring
902 * doesn't start before the anchored substring.
903 * If you break this assumption (e.g. doing better optimisations
904 * with lookahead/behind), then you'll need to audit the code in this
905 * function carefully first
908 ! ( (prog->anchored_utf8 || prog->anchored_substr)
909 && (prog->float_utf8 || prog->float_substr))
910 || (prog->float_min_offset >= prog->anchored_offset));
912 /* byte rather than char calculation for efficiency. It fails
913 * to quickly reject some cases that can't match, but will reject
914 * them later after doing full char arithmetic */
915 if (prog->minlen > strend - strpos) {
916 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
917 " String too short...\n"));
921 RXp_MATCH_UTF8_set(prog, utf8_target);
922 reginfo->is_utf8_target = cBOOL(utf8_target);
923 reginfo->info_aux = NULL;
924 reginfo->strbeg = strbeg;
925 reginfo->strend = strend;
926 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
928 /* not actually used within intuit, but zero for safety anyway */
929 reginfo->poscache_maxiter = 0;
932 if ((!prog->anchored_utf8 && prog->anchored_substr)
933 || (!prog->float_utf8 && prog->float_substr))
934 to_utf8_substr(prog);
935 check = prog->check_utf8;
937 if (!prog->check_substr && prog->check_utf8) {
938 if (! to_byte_substr(prog)) {
939 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
942 check = prog->check_substr;
945 /* dump the various substring data */
946 DEBUG_OPTIMISE_MORE_r({
948 for (i=0; i<=2; i++) {
949 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
950 : prog->substrs->data[i].substr);
954 Perl_re_printf( aTHX_
955 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
956 " useful=%" IVdf " utf8=%d [%s]\n",
958 (IV)prog->substrs->data[i].min_offset,
959 (IV)prog->substrs->data[i].max_offset,
960 (IV)prog->substrs->data[i].end_shift,
967 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
969 /* ml_anch: check after \n?
971 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
972 * with /.*.../, these flags will have been added by the
974 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
975 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
977 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
978 && !(prog->intflags & PREGf_IMPLICIT);
980 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
981 /* we are only allowed to match at BOS or \G */
983 /* trivially reject if there's a BOS anchor and we're not at BOS.
985 * Note that we don't try to do a similar quick reject for
986 * \G, since generally the caller will have calculated strpos
987 * based on pos() and gofs, so the string is already correctly
988 * anchored by definition; and handling the exceptions would
989 * be too fiddly (e.g. REXEC_IGNOREPOS).
991 if ( strpos != strbeg
992 && (prog->intflags & PREGf_ANCH_SBOL))
994 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
995 " Not at start...\n"));
999 /* in the presence of an anchor, the anchored (relative to the
1000 * start of the regex) substr must also be anchored relative
1001 * to strpos. So quickly reject if substr isn't found there.
1002 * This works for \G too, because the caller will already have
1003 * subtracted gofs from pos, and gofs is the offset from the
1004 * \G to the start of the regex. For example, in /.abc\Gdef/,
1005 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
1006 * caller will have set strpos=pos()-4; we look for the substr
1007 * at position pos()-4+1, which lines up with the "a" */
1009 if (prog->check_offset_min == prog->check_offset_max) {
1010 /* Substring at constant offset from beg-of-str... */
1011 SSize_t slen = SvCUR(check);
1012 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1014 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1015 " Looking for check substr at fixed offset %" IVdf "...\n",
1016 (IV)prog->check_offset_min));
1018 if (SvTAIL(check)) {
1019 /* In this case, the regex is anchored at the end too.
1020 * Unless it's a multiline match, the lengths must match
1021 * exactly, give or take a \n. NB: slen >= 1 since
1022 * the last char of check is \n */
1024 && ( strend - s > slen
1025 || strend - s < slen - 1
1026 || (strend - s == slen && strend[-1] != '\n')))
1028 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1029 " String too long...\n"));
1032 /* Now should match s[0..slen-2] */
1035 if (slen && (strend - s < slen
1036 || *SvPVX_const(check) != *s
1037 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1039 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1040 " String not equal...\n"));
1045 goto success_at_start;
1050 end_shift = prog->check_end_shift;
1052 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1054 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1055 (IV)end_shift, RX_PRECOMP(rx));
1060 /* This is the (re)entry point of the main loop in this function.
1061 * The goal of this loop is to:
1062 * 1) find the "check" substring in the region rx_origin..strend
1063 * (adjusted by start_shift / end_shift). If not found, reject
1065 * 2) If it exists, look for the "other" substr too if defined; for
1066 * example, if the check substr maps to the anchored substr, then
1067 * check the floating substr, and vice-versa. If not found, go
1068 * back to (1) with rx_origin suitably incremented.
1069 * 3) If we find an rx_origin position that doesn't contradict
1070 * either of the substrings, then check the possible additional
1071 * constraints on rx_origin of /^.../m or a known start class.
1072 * If these fail, then depending on which constraints fail, jump
1073 * back to here, or to various other re-entry points further along
1074 * that skip some of the first steps.
1075 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1076 * substring. If the start position was determined to be at the
1077 * beginning of the string - so, not rejected, but not optimised,
1078 * since we have to run regmatch from position 0 - decrement the
1079 * BmUSEFUL() count. Otherwise increment it.
1083 /* first, look for the 'check' substring */
1089 DEBUG_OPTIMISE_MORE_r({
1090 Perl_re_printf( aTHX_
1091 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1092 " Start shift: %" IVdf " End shift %" IVdf
1093 " Real end Shift: %" IVdf "\n",
1094 (IV)(rx_origin - strbeg),
1095 (IV)prog->check_offset_min,
1098 (IV)prog->check_end_shift);
1101 end_point = HOPBACK3(strend, end_shift, rx_origin);
1104 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1109 /* If the regex is absolutely anchored to either the start of the
1110 * string (SBOL) or to pos() (ANCH_GPOS), then
1111 * check_offset_max represents an upper bound on the string where
1112 * the substr could start. For the ANCH_GPOS case, we assume that
1113 * the caller of intuit will have already set strpos to
1114 * pos()-gofs, so in this case strpos + offset_max will still be
1115 * an upper bound on the substr.
1118 && prog->intflags & PREGf_ANCH
1119 && prog->check_offset_max != SSize_t_MAX)
1121 SSize_t check_len = SvCUR(check) - cBOOL(SvTAIL(check));
1122 const char * const anchor =
1123 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1124 SSize_t targ_len = (char*)end_point - anchor;
1126 if (check_len > targ_len) {
1127 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1128 "Target string too short to match required substring...\n"));
1132 /* do a bytes rather than chars comparison. It's conservative;
1133 * so it skips doing the HOP if the result can't possibly end
1134 * up earlier than the old value of end_point.
1136 assert(anchor + check_len <= (char *)end_point);
1137 if (prog->check_offset_max + check_len < targ_len) {
1138 end_point = HOP3lim((U8*)anchor,
1139 prog->check_offset_max,
1140 end_point - check_len
1143 if (end_point < start_point)
1148 check_at = fbm_instr( start_point, end_point,
1149 check, multiline ? FBMrf_MULTILINE : 0);
1151 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1152 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1153 (IV)((char*)start_point - strbeg),
1154 (IV)((char*)end_point - strbeg),
1155 (IV)(check_at ? check_at - strbeg : -1)
1158 /* Update the count-of-usability, remove useless subpatterns,
1162 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1163 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1164 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1165 (check_at ? "Found" : "Did not find"),
1166 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1167 ? "anchored" : "floating"),
1170 (check_at ? " at offset " : "...\n") );
1175 /* set rx_origin to the minimum position where the regex could start
1176 * matching, given the constraint of the just-matched check substring.
1177 * But don't set it lower than previously.
1180 if (check_at - rx_origin > prog->check_offset_max)
1181 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1182 /* Finish the diagnostic message */
1183 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1184 "%ld (rx_origin now %" IVdf ")...\n",
1185 (long)(check_at - strbeg),
1186 (IV)(rx_origin - strbeg)
1191 /* now look for the 'other' substring if defined */
1193 if (prog->substrs->data[other_ix].utf8_substr
1194 || prog->substrs->data[other_ix].substr)
1196 /* Take into account the "other" substring. */
1200 struct reg_substr_datum *other;
1203 other = &prog->substrs->data[other_ix];
1204 if (!utf8_target && !other->substr) {
1205 if (!to_byte_substr(prog)) {
1206 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1210 /* if "other" is anchored:
1211 * we've previously found a floating substr starting at check_at.
1212 * This means that the regex origin must lie somewhere
1213 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1214 * and max: HOP3(check_at, -check_offset_min)
1215 * (except that min will be >= strpos)
1216 * So the fixed substr must lie somewhere between
1217 * HOP3(min, anchored_offset)
1218 * HOP3(max, anchored_offset) + SvCUR(substr)
1221 /* if "other" is floating
1222 * Calculate last1, the absolute latest point where the
1223 * floating substr could start in the string, ignoring any
1224 * constraints from the earlier fixed match. It is calculated
1227 * strend - prog->minlen (in chars) is the absolute latest
1228 * position within the string where the origin of the regex
1229 * could appear. The latest start point for the floating
1230 * substr is float_min_offset(*) on from the start of the
1231 * regex. last1 simply combines thee two offsets.
1233 * (*) You might think the latest start point should be
1234 * float_max_offset from the regex origin, and technically
1235 * you'd be correct. However, consider
1237 * Here, float min, max are 3,5 and minlen is 7.
1238 * This can match either
1242 * In the first case, the regex matches minlen chars; in the
1243 * second, minlen+1, in the third, minlen+2.
1244 * In the first case, the floating offset is 3 (which equals
1245 * float_min), in the second, 4, and in the third, 5 (which
1246 * equals float_max). In all cases, the floating string bcd
1247 * can never start more than 4 chars from the end of the
1248 * string, which equals minlen - float_min. As the substring
1249 * starts to match more than float_min from the start of the
1250 * regex, it makes the regex match more than minlen chars,
1251 * and the two cancel each other out. So we can always use
1252 * float_min - minlen, rather than float_max - minlen for the
1253 * latest position in the string.
1255 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1256 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1259 assert(prog->minlen >= other->min_offset);
1260 last1 = HOP3c(strend,
1261 other->min_offset - prog->minlen, strbeg);
1263 if (other_ix) {/* i.e. if (other-is-float) */
1264 /* last is the latest point where the floating substr could
1265 * start, *given* any constraints from the earlier fixed
1266 * match. This constraint is that the floating string starts
1267 * <= float_max_offset chars from the regex origin (rx_origin).
1268 * If this value is less than last1, use it instead.
1270 assert(rx_origin <= last1);
1272 /* this condition handles the offset==infinity case, and
1273 * is a short-cut otherwise. Although it's comparing a
1274 * byte offset to a char length, it does so in a safe way,
1275 * since 1 char always occupies 1 or more bytes,
1276 * so if a string range is (last1 - rx_origin) bytes,
1277 * it will be less than or equal to (last1 - rx_origin)
1278 * chars; meaning it errs towards doing the accurate HOP3
1279 * rather than just using last1 as a short-cut */
1280 (last1 - rx_origin) < other->max_offset
1282 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1285 assert(strpos + start_shift <= check_at);
1286 last = HOP4c(check_at, other->min_offset - start_shift,
1290 s = HOP3c(rx_origin, other->min_offset, strend);
1291 if (s < other_last) /* These positions already checked */
1294 must = utf8_target ? other->utf8_substr : other->substr;
1295 assert(SvPOK(must));
1298 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1304 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1305 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1306 (IV)(from - strbeg),
1312 (unsigned char*)from,
1315 multiline ? FBMrf_MULTILINE : 0
1317 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1318 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1319 (IV)(from - strbeg),
1321 (IV)(s ? s - strbeg : -1)
1327 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1328 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1329 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1330 s ? "Found" : "Contradicts",
1331 other_ix ? "floating" : "anchored",
1332 quoted, RE_SV_TAIL(must));
1337 /* last1 is latest possible substr location. If we didn't
1338 * find it before there, we never will */
1339 if (last >= last1) {
1340 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1341 "; giving up...\n"));
1345 /* try to find the check substr again at a later
1346 * position. Maybe next time we'll find the "other" substr
1348 other_last = HOP3c(last, 1, strend) /* highest failure */;
1350 other_ix /* i.e. if other-is-float */
1351 ? HOP3c(rx_origin, 1, strend)
1352 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1353 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1354 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1355 (other_ix ? "floating" : "anchored"),
1356 (long)(HOP3c(check_at, 1, strend) - strbeg),
1357 (IV)(rx_origin - strbeg)
1362 if (other_ix) { /* if (other-is-float) */
1363 /* other_last is set to s, not s+1, since its possible for
1364 * a floating substr to fail first time, then succeed
1365 * second time at the same floating position; e.g.:
1366 * "-AB--AABZ" =~ /\wAB\d*Z/
1367 * The first time round, anchored and float match at
1368 * "-(AB)--AAB(Z)" then fail on the initial \w character
1369 * class. Second time round, they match at "-AB--A(AB)(Z)".
1374 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1375 other_last = HOP3c(s, 1, strend);
1377 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1378 " at offset %ld (rx_origin now %" IVdf ")...\n",
1380 (IV)(rx_origin - strbeg)
1386 DEBUG_OPTIMISE_MORE_r(
1387 Perl_re_printf( aTHX_
1388 " Check-only match: offset min:%" IVdf " max:%" IVdf
1389 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1390 " strend:%" IVdf "\n",
1391 (IV)prog->check_offset_min,
1392 (IV)prog->check_offset_max,
1393 (IV)(check_at-strbeg),
1394 (IV)(rx_origin-strbeg),
1395 (IV)(rx_origin-check_at),
1401 postprocess_substr_matches:
1403 /* handle the extra constraint of /^.../m if present */
1405 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1408 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1409 " looking for /^/m anchor"));
1411 /* we have failed the constraint of a \n before rx_origin.
1412 * Find the next \n, if any, even if it's beyond the current
1413 * anchored and/or floating substrings. Whether we should be
1414 * scanning ahead for the next \n or the next substr is debatable.
1415 * On the one hand you'd expect rare substrings to appear less
1416 * often than \n's. On the other hand, searching for \n means
1417 * we're effectively flipping between check_substr and "\n" on each
1418 * iteration as the current "rarest" candidate string, which
1419 * means for example that we'll quickly reject the whole string if
1420 * hasn't got a \n, rather than trying every substr position
1424 s = HOP3c(strend, - prog->minlen, strpos);
1425 if (s <= rx_origin ||
1426 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1428 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1429 " Did not find /%s^%s/m...\n",
1430 PL_colors[0], PL_colors[1]));
1434 /* earliest possible origin is 1 char after the \n.
1435 * (since *rx_origin == '\n', it's safe to ++ here rather than
1436 * HOP(rx_origin, 1)) */
1439 if (prog->substrs->check_ix == 0 /* check is anchored */
1440 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1442 /* Position contradicts check-string; either because
1443 * check was anchored (and thus has no wiggle room),
1444 * or check was float and rx_origin is above the float range */
1445 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1446 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1447 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1451 /* if we get here, the check substr must have been float,
1452 * is in range, and we may or may not have had an anchored
1453 * "other" substr which still contradicts */
1454 assert(prog->substrs->check_ix); /* check is float */
1456 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1457 /* whoops, the anchored "other" substr exists, so we still
1458 * contradict. On the other hand, the float "check" substr
1459 * didn't contradict, so just retry the anchored "other"
1461 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1462 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1463 PL_colors[0], PL_colors[1],
1464 (IV)(rx_origin - strbeg + prog->anchored_offset),
1465 (IV)(rx_origin - strbeg)
1467 goto do_other_substr;
1470 /* success: we don't contradict the found floating substring
1471 * (and there's no anchored substr). */
1472 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1473 " Found /%s^%s/m with rx_origin %ld...\n",
1474 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1477 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1478 " (multiline anchor test skipped)\n"));
1484 /* if we have a starting character class, then test that extra constraint.
1485 * (trie stclasses are too expensive to use here, we are better off to
1486 * leave it to regmatch itself) */
1488 if (progi->regstclass && REGNODE_TYPE(OP(progi->regstclass))!=TRIE) {
1489 const U8* const str = (U8*)STRING(progi->regstclass);
1491 /* XXX this value could be pre-computed */
1492 const SSize_t cl_l = (REGNODE_TYPE(OP(progi->regstclass)) == EXACT
1493 ? (reginfo->is_utf8_pat
1494 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1495 : (SSize_t)STR_LEN(progi->regstclass))
1499 /* latest pos that a matching float substr constrains rx start to */
1500 char *rx_max_float = NULL;
1502 /* if the current rx_origin is anchored, either by satisfying an
1503 * anchored substring constraint, or a /^.../m constraint, then we
1504 * can reject the current origin if the start class isn't found
1505 * at the current position. If we have a float-only match, then
1506 * rx_origin is constrained to a range; so look for the start class
1507 * in that range. if neither, then look for the start class in the
1508 * whole rest of the string */
1510 /* XXX DAPM it's not clear what the minlen test is for, and why
1511 * it's not used in the floating case. Nothing in the test suite
1512 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1513 * Here are some old comments, which may or may not be correct:
1515 * minlen == 0 is possible if regstclass is \b or \B,
1516 * and the fixed substr is ''$.
1517 * Since minlen is already taken into account, rx_origin+1 is
1518 * before strend; accidentally, minlen >= 1 guaranties no false
1519 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1520 * 0) below assumes that regstclass does not come from lookahead...
1521 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1522 * This leaves EXACTF-ish only, which are dealt with in
1526 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1527 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1528 else if (prog->float_substr || prog->float_utf8) {
1529 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1530 endpos = HOP3clim(rx_max_float, cl_l, strend);
1535 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1536 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1537 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1538 (IV)start_shift, (IV)(check_at - strbeg),
1539 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1541 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1544 if (endpos == strend) {
1545 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1546 " Could not match STCLASS...\n") );
1549 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1550 " This position contradicts STCLASS...\n") );
1551 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1552 && !(prog->intflags & PREGf_IMPLICIT))
1555 /* Contradict one of substrings */
1556 if (prog->anchored_substr || prog->anchored_utf8) {
1557 if (prog->substrs->check_ix == 1) { /* check is float */
1558 /* Have both, check_string is floating */
1559 assert(rx_origin + start_shift <= check_at);
1560 if (rx_origin + start_shift != check_at) {
1561 /* not at latest position float substr could match:
1562 * Recheck anchored substring, but not floating.
1563 * The condition above is in bytes rather than
1564 * chars for efficiency. It's conservative, in
1565 * that it errs on the side of doing 'goto
1566 * do_other_substr'. In this case, at worst,
1567 * an extra anchored search may get done, but in
1568 * practice the extra fbm_instr() is likely to
1569 * get skipped anyway. */
1570 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1571 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1572 (long)(other_last - strbeg),
1573 (IV)(rx_origin - strbeg)
1575 goto do_other_substr;
1583 /* In the presence of ml_anch, we might be able to
1584 * find another \n without breaking the current float
1587 /* strictly speaking this should be HOP3c(..., 1, ...),
1588 * but since we goto a block of code that's going to
1589 * search for the next \n if any, its safe here */
1591 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1592 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1593 PL_colors[0], PL_colors[1],
1594 (long)(rx_origin - strbeg)) );
1595 goto postprocess_substr_matches;
1598 /* strictly speaking this can never be true; but might
1599 * be if we ever allow intuit without substrings */
1600 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1603 rx_origin = rx_max_float;
1606 /* at this point, any matching substrings have been
1607 * contradicted. Start again... */
1609 rx_origin = HOP3c(rx_origin, 1, strend);
1611 /* uses bytes rather than char calculations for efficiency.
1612 * It's conservative: it errs on the side of doing 'goto restart',
1613 * where there is code that does a proper char-based test */
1614 if (rx_origin + start_shift + end_shift > strend) {
1615 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1616 " Could not match STCLASS...\n") );
1619 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1620 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1621 (prog->substrs->check_ix ? "floating" : "anchored"),
1622 (long)(rx_origin + start_shift - strbeg),
1623 (IV)(rx_origin - strbeg)
1630 if (rx_origin != s) {
1631 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1632 " By STCLASS: moving %ld --> %ld\n",
1633 (long)(rx_origin - strbeg), (long)(s - strbeg))
1637 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1638 " Does not contradict STCLASS...\n");
1643 /* Decide whether using the substrings helped */
1645 if (rx_origin != strpos) {
1646 /* Fixed substring is found far enough so that the match
1647 cannot start at strpos. */
1649 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1650 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1653 /* The found rx_origin position does not prohibit matching at
1654 * strpos, so calling intuit didn't gain us anything. Decrement
1655 * the BmUSEFUL() count on the check substring, and if we reach
1657 if (!(prog->intflags & PREGf_NAUGHTY)
1659 prog->check_utf8 /* Could be deleted already */
1660 && --BmUSEFUL(prog->check_utf8) < 0
1661 && (prog->check_utf8 == prog->float_utf8)
1663 prog->check_substr /* Could be deleted already */
1664 && --BmUSEFUL(prog->check_substr) < 0
1665 && (prog->check_substr == prog->float_substr)
1668 /* If flags & SOMETHING - do not do it many times on the same match */
1669 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1670 /* XXX Does the destruction order has to change with utf8_target? */
1671 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1672 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1673 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1674 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1675 check = NULL; /* abort */
1676 /* XXXX This is a remnant of the old implementation. It
1677 looks wasteful, since now INTUIT can use many
1678 other heuristics. */
1679 prog->extflags &= ~RXf_USE_INTUIT;
1683 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1684 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1685 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1689 fail_finish: /* Substring not found */
1690 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1691 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1693 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1694 PL_colors[4], PL_colors[5]));
1699 #define DECL_TRIE_TYPE(scan) \
1700 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1701 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1702 trie_utf8l, trie_flu8, trie_flu8_latin } \
1703 trie_type = ((scan->flags == EXACT) \
1704 ? (utf8_target ? trie_utf8 : trie_plain) \
1705 : (scan->flags == EXACTL) \
1706 ? (utf8_target ? trie_utf8l : trie_plain) \
1707 : (scan->flags == EXACTFAA) \
1709 ? trie_utf8_exactfa_fold \
1710 : trie_latin_utf8_exactfa_fold) \
1711 : (scan->flags == EXACTFLU8 \
1714 : trie_flu8_latin) \
1717 : trie_latin_utf8_fold)))
1719 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1720 * 'foldbuf+sizeof(foldbuf)' */
1721 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1724 U8 flags = FOLD_FLAGS_FULL; \
1725 switch (trie_type) { \
1727 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1728 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1729 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1731 goto do_trie_utf8_fold; \
1732 case trie_utf8_exactfa_fold: \
1733 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1735 case trie_utf8_fold: \
1736 do_trie_utf8_fold: \
1737 if ( foldlen>0 ) { \
1738 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1743 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1745 len = UTF8_SAFE_SKIP(uc, uc_end); \
1746 skiplen = UVCHR_SKIP( uvc ); \
1747 foldlen -= skiplen; \
1748 uscan = foldbuf + skiplen; \
1751 case trie_flu8_latin: \
1752 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1753 goto do_trie_latin_utf8_fold; \
1754 case trie_latin_utf8_exactfa_fold: \
1755 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1757 case trie_latin_utf8_fold: \
1758 do_trie_latin_utf8_fold: \
1759 if ( foldlen>0 ) { \
1760 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1766 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1767 skiplen = UVCHR_SKIP( uvc ); \
1768 foldlen -= skiplen; \
1769 uscan = foldbuf + skiplen; \
1773 CHECK_AND_WARN_PROBLEMATIC_LOCALE_; \
1774 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1775 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1779 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1786 charid = trie->charmap[ uvc ]; \
1790 if (widecharmap) { \
1791 SV** const svpp = hv_fetch(widecharmap, \
1792 (char*)&uvc, sizeof(UV), 0); \
1794 charid = (U16)SvIV(*svpp); \
1799 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1800 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1801 startpos, doutf8, depth)
1803 #define GET_ANYOFH_INVLIST(prog, n) \
1804 GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0, NULL, NULL)
1806 #define REXEC_FBC_UTF8_SCAN(CODE) \
1808 while (s < strend) { \
1810 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1814 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1816 while (s < strend) { \
1822 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1824 while (s < strend) { \
1825 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1829 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
1831 while (s < strend) { \
1832 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1836 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1839 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1840 previous_occurrence_end = s; \
1846 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1850 previous_occurrence_end = s; \
1856 /* We keep track of where the next character should start after an occurrence
1857 * of the one we're looking for. Knowing that, we can see right away if the
1858 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1859 * don't accept the 2nd and succeeding adjacent occurrences */
1860 #define FBC_CHECK_AND_TRY \
1862 || s != previous_occurrence_end) \
1863 && ( reginfo->intuit \
1864 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1870 /* These differ from the above macros in that they call a function which
1871 * returns the next occurrence of the thing being looked for in 's'; and
1872 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c)
1874 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
1875 while (s < strend) { \
1877 if (s >= strend) { \
1883 previous_occurrence_end = s; \
1886 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1887 while (s < strend) { \
1889 if (s >= strend) { \
1895 previous_occurrence_end = s; \
1898 /* This is like the above macro except the function returns NULL if there is no
1899 * occurrence, and there is a further condition that must be matched besides
1901 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \
1902 while (s < strend) { \
1905 s = (char *) strend; \
1911 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1912 previous_occurrence_end = s; \
1919 /* This differs from the above macros in that it is passed a single byte that
1920 * is known to begin the next occurrence of the thing being looked for in 's'.
1921 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1922 * at that position. */
1923 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1924 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \
1927 /* This is like the function above, but takes an entire string to look for
1928 * instead of a single byte */
1929 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \
1930 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \
1931 ninstr(s, strend, substr, substr_end), \
1934 /* The four macros below are slightly different versions of the same logic.
1936 * The first is for /a and /aa when the target string is UTF-8. This can only
1937 * match ascii, but it must advance based on UTF-8. The other three handle
1938 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
1939 * looking for the boundary (or non-boundary) between a word and non-word
1940 * character. The utf8 and non-utf8 cases have the same logic, but the details
1941 * must be different. Find the "wordness" of the character just prior to this
1942 * one, and compare it with the wordness of this one. If they differ, we have
1943 * a boundary. At the beginning of the string, pretend that the previous
1944 * character was a new-line.
1946 * All these macros uncleanly have side-effects with each other and outside
1947 * variables. So far it's been too much trouble to clean-up
1949 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1950 * a word character or not.
1951 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1953 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1955 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1956 * are looking for a boundary or for a non-boundary. If we are looking for a
1957 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1958 * see if this tentative match actually works, and if so, to quit the loop
1959 * here. And vice-versa if we are looking for a non-boundary.
1961 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1962 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1963 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1964 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1965 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1966 * complement. But in that branch we complement tmp, meaning that at the
1967 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1968 * which means at the top of the loop in the next iteration, it is
1969 * TEST_NON_UTF8(s-1) */
1970 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1971 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1972 tmp = TEST_NON_UTF8(tmp); \
1973 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1974 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1976 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1983 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1984 * TEST_UTF8 is a macro that for the same input code points returns identically
1985 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1986 * end pointer as well) */
1987 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1988 if (s == reginfo->strbeg) { \
1991 else { /* Back-up to the start of the previous character */ \
1992 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1993 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1994 0, UTF8_ALLOW_DEFAULT); \
1996 tmp = TEST_UV(tmp); \
1997 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
1998 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
2007 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
2008 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
2009 * set-up by the FBC_BOUND, etc macros below */
2010 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2012 /* Here, things have been set up by the previous code so that tmp is the \
2013 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
2014 * against the EOS, which we treat as a \n */ \
2015 if (tmp == ! TEST_NON_UTF8('\n')) { \
2022 /* Same as the macro above, but the target isn't UTF-8 */
2023 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2024 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2025 tmp = TEST_NON_UTF8(tmp); \
2026 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2027 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2035 /* Here, things have been set up by the previous code so that tmp is \
2036 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2037 * matches against the EOS, which we treat as a \n */ \
2038 if (tmp == ! TEST_NON_UTF8('\n')) { \
2045 /* This is the macro to use when we want to see if something that looks like it
2046 * could match, actually does, and if so exits the loop. It needs to be used
2047 * only for bounds checking macros, as it allows for matching beyond the end of
2048 * string (which should be zero length without having to look at the string
2050 #define REXEC_FBC_TRYIT \
2051 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
2054 /* The only difference between the BOUND and NBOUND cases is that
2055 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2056 * NBOUND. This is accomplished by passing it as either the if or else clause,
2057 * with the other one being empty (PLACEHOLDER is defined as empty).
2059 * The TEST_FOO parameters are for operating on different forms of input, but
2060 * all should be ones that return identically for the same underlying code
2063 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2064 FBC_BOUND_COMMON_UTF8( \
2065 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2066 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2068 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2069 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2071 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2072 FBC_BOUND_COMMON_UTF8( \
2073 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2074 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2076 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2077 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2079 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2080 FBC_BOUND_COMMON_UTF8( \
2081 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2082 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2084 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2085 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2087 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2088 FBC_BOUND_COMMON_UTF8( \
2089 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2090 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2092 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2093 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2097 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2098 IV cp_out = _invlist_search(invlist, cp_in);
2099 assert(cp_out >= 0);
2102 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2103 invmap[S_get_break_val_cp_checked(invlist, cp)]
2105 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2106 invmap[_invlist_search(invlist, cp)]
2109 /* Takes a pointer to an inversion list, a pointer to its corresponding
2110 * inversion map, and a code point, and returns the code point's value
2111 * according to the two arrays. It assumes that all code points have a value.
2112 * This is used as the base macro for macros for particular properties */
2113 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2114 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2116 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2117 * of a code point, returning the value for the first code point in the string.
2118 * And it takes the particular macro name that finds the desired value given a
2119 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2120 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2121 (__ASSERT_(pos < strend) \
2122 /* Note assumes is valid UTF-8 */ \
2123 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2125 /* Returns the GCB value for the input code point */
2126 #define getGCB_VAL_CP(cp) \
2127 _generic_GET_BREAK_VAL_CP( \
2132 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2133 * bounded by pos and strend */
2134 #define getGCB_VAL_UTF8(pos, strend) \
2135 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2137 /* Returns the LB value for the input code point */
2138 #define getLB_VAL_CP(cp) \
2139 _generic_GET_BREAK_VAL_CP( \
2144 /* Returns the LB value for the first code point in the UTF-8 encoded string
2145 * bounded by pos and strend */
2146 #define getLB_VAL_UTF8(pos, strend) \
2147 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2150 /* Returns the SB value for the input code point */
2151 #define getSB_VAL_CP(cp) \
2152 _generic_GET_BREAK_VAL_CP( \
2157 /* Returns the SB value for the first code point in the UTF-8 encoded string
2158 * bounded by pos and strend */
2159 #define getSB_VAL_UTF8(pos, strend) \
2160 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2162 /* Returns the WB value for the input code point */
2163 #define getWB_VAL_CP(cp) \
2164 _generic_GET_BREAK_VAL_CP( \
2169 /* Returns the WB value for the first code point in the UTF-8 encoded string
2170 * bounded by pos and strend */
2171 #define getWB_VAL_UTF8(pos, strend) \
2172 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2174 /* We know what class REx starts with. Try to find this position... */
2175 /* if reginfo->intuit, its a dryrun */
2176 /* annoyingly all the vars in this routine have different names from their counterparts
2177 in regmatch. /grrr */
2179 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2180 const char *strend, regmatch_info *reginfo)
2183 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2184 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2186 char *pat_string; /* The pattern's exactish string */
2187 char *pat_end; /* ptr to end char of pat_string */
2188 re_fold_t folder; /* Function for computing non-utf8 folds */
2189 const U8 *fold_array; /* array for folding ords < 256 */
2196 /* In some cases we accept only the first occurence of 'x' in a sequence of
2197 * them. This variable points to just beyond the end of the previous
2198 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2199 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2201 char * previous_occurrence_end = 0;
2203 I32 tmp; /* Scratch variable */
2204 const bool utf8_target = reginfo->is_utf8_target;
2205 UV utf8_fold_flags = 0;
2206 const bool is_utf8_pat = reginfo->is_utf8_pat;
2207 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2208 with a result inverts that result, as 0^1 =
2210 char_class_number_ classnum;
2212 RXi_GET_DECL(prog,progi);
2214 PERL_ARGS_ASSERT_FIND_BYCLASS;
2216 /* We know what class it must start with. The case statements below have
2217 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2218 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2219 * ('p8' and 'pb'. */
2220 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2223 case ANYOFPOSIXL_t8_pb:
2224 case ANYOFPOSIXL_t8_p8:
2227 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2228 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2236 REXEC_FBC_UTF8_CLASS_SCAN(
2237 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2240 case ANYOFPOSIXL_tb_pb:
2241 case ANYOFPOSIXL_tb_p8:
2244 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2245 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2253 if (! ANYOF_FLAGS(c) && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(c)) {
2254 /* We know that s is in the bitmap range since the target isn't
2255 * UTF-8, so what happens for out-of-range values is not relevant,
2256 * so exclude that from the flags */
2257 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2260 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2265 case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2267 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2268 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2273 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2274 * we do anyway for performance reasons, as otherwise we would have to
2275 * examine all the continuation characters */
2276 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2277 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2282 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2283 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2287 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2289 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2290 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2291 (U8) ARG(c), FLAGS(c)));
2294 /* These nodes all require at least one code point to be in UTF-8 to
2300 case ANYOFHbbm_tb_pb:
2301 case ANYOFHbbm_tb_p8:
2306 case EXACTFLU8_tb_pb:
2307 case EXACTFLU8_tb_p8:
2308 case EXACTFU_REQ8_tb_pb:
2309 case EXACTFU_REQ8_tb_p8:
2314 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2315 REXEC_FBC_UTF8_CLASS_SCAN(
2316 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2317 && _invlist_contains_cp(anyofh_list,
2318 utf8_to_uvchr_buf((U8 *) s,
2326 /* We know what the first byte of any matched string should be. */
2327 U8 first_byte = FLAGS(c);
2329 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2330 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2331 _invlist_contains_cp(anyofh_list,
2332 utf8_to_uvchr_buf((U8 *) s,
2338 case ANYOFHbbm_t8_pb:
2339 case ANYOFHbbm_t8_p8:
2341 /* We know what the first byte of any matched string should be. */
2342 U8 first_byte = FLAGS(c);
2344 /* And a bitmap defines all the legal 2nd byte matches */
2345 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2347 && BITMAP_TEST(((struct regnode_bbm *) c)->bitmap,
2348 (U8) s[1] & UTF_CONTINUATION_MASK)));
2354 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2355 REXEC_FBC_UTF8_CLASS_SCAN(
2356 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2357 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2358 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2359 && _invlist_contains_cp(anyofh_list,
2360 utf8_to_uvchr_buf((U8 *) s,
2367 anyofh_list = GET_ANYOFH_INVLIST(prog, c);
2368 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2369 ((struct regnode_anyofhs *) c)->string,
2370 /* Note FLAGS is the string length in this regnode */
2371 ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2372 _invlist_contains_cp(anyofh_list,
2373 utf8_to_uvchr_buf((U8 *) s,
2380 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2381 ANYOFRbase(c), ANYOFRdelta(c)));
2386 REXEC_FBC_UTF8_CLASS_SCAN(
2387 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2388 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2391 ANYOFRbase(c), ANYOFRdelta(c))));
2396 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2397 ANYOFRbase(c), ANYOFRdelta(c)));
2402 { /* We know what the first byte of any matched string should be */
2403 U8 first_byte = FLAGS(c);
2405 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2406 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2409 ANYOFRbase(c), ANYOFRdelta(c)));
2413 case EXACTFAA_tb_pb:
2415 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2416 * which these functions don't handle anyway */
2417 fold_array = PL_fold_latin1;
2418 folder = S_foldEQ_latin1_s2_folded;
2419 goto do_exactf_non_utf8;
2422 fold_array = PL_fold;
2423 folder = Perl_foldEQ;
2424 goto do_exactf_non_utf8;
2427 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2429 if (IN_UTF8_CTYPE_LOCALE) {
2430 utf8_fold_flags = FOLDEQ_LOCALE;
2431 goto do_exactf_utf8;
2434 fold_array = PL_fold_locale;
2435 folder = Perl_foldEQ_locale;
2436 goto do_exactf_non_utf8;
2439 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2440 * don't have to worry here about this single special case in the
2442 fold_array = PL_fold_latin1;
2443 folder = S_foldEQ_latin1_s2_folded;
2447 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2448 are no glitches with fold-length differences
2449 between the target string and pattern */
2451 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2452 * character of the EXACTF* node and then, if necessary,
2453 * case-insensitively compare the full text of the node. c1 is the
2454 * first character. c2 is its fold. This logic will not work for
2455 * Unicode semantics and the german sharp ss, which hence should not be
2456 * compiled into a node that gets here. */
2457 pat_string = STRINGs(c);
2458 ln = STR_LENs(c); /* length to match in octets/bytes */
2460 /* We know that we have to match at least 'ln' bytes (which is the same
2461 * as characters, since not utf8). If we have to match 3 characters,
2462 * and there are only 2 availabe, we know without trying that it will
2463 * fail; so don't start a match past the required minimum number from
2465 e = HOP3c(strend, -((SSize_t)ln), s);
2470 c2 = fold_array[c1];
2471 if (c1 == c2) { /* If char and fold are the same */
2473 s = (char *) memchr(s, c1, e + 1 - s);
2478 /* Check that the rest of the node matches */
2479 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2480 && (reginfo->intuit || regtry(reginfo, &s)) )
2488 U8 bits_differing = c1 ^ c2;
2490 /* If the folds differ in one bit position only, we can mask to
2491 * match either of them, and can use this faster find method. Both
2492 * ASCII and EBCDIC tend to have their case folds differ in only
2493 * one position, so this is very likely */
2494 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2495 bits_differing = ~ bits_differing;
2497 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2498 (c1 & bits_differing), bits_differing);
2503 if ( (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2504 && (reginfo->intuit || regtry(reginfo, &s)) )
2511 else { /* Otherwise, stuck with looking byte-at-a-time. This
2512 should actually happen only in EXACTFL nodes */
2514 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2515 && (ln == 1 || folder(aTHX_ s + 1, pat_string + 1, ln - 1))
2516 && (reginfo->intuit || regtry(reginfo, &s)) )
2526 case EXACTFAA_tb_p8:
2527 case EXACTFAA_t8_p8:
2528 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2529 |FOLDEQ_S2_ALREADY_FOLDED
2530 |FOLDEQ_S2_FOLDS_SANE;
2531 goto do_exactf_utf8;
2533 case EXACTFAA_NO_TRIE_tb_pb:
2534 case EXACTFAA_NO_TRIE_t8_pb:
2535 case EXACTFAA_t8_pb:
2537 /* Here, and elsewhere in this file, the reason we can't consider a
2538 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2539 * is because any MICRO SIGN in the pattern won't be folded. Since the
2540 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2541 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2542 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2543 goto do_exactf_utf8;
2548 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2549 utf8_fold_flags = FOLDEQ_LOCALE;
2550 goto do_exactf_utf8;
2552 case EXACTFLU8_t8_pb:
2553 case EXACTFLU8_t8_p8:
2554 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2555 | FOLDEQ_S2_FOLDS_SANE;
2556 goto do_exactf_utf8;
2558 case EXACTFU_REQ8_t8_p8:
2559 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2560 goto do_exactf_utf8;
2565 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2566 goto do_exactf_utf8;
2568 /* The following are problematic even though pattern isn't UTF-8. Use
2569 * full functionality normally not done except for UTF-8. */
2571 case EXACTFUP_tb_pb:
2572 case EXACTFUP_t8_pb:
2578 /* If one of the operands is in utf8, we can't use the simpler
2579 * folding above, due to the fact that many different characters
2580 * can have the same fold, or portion of a fold, or different-
2582 pat_string = STRINGs(c);
2583 ln = STR_LENs(c); /* length to match in octets/bytes */
2584 pat_end = pat_string + ln;
2585 lnc = is_utf8_pat /* length to match in characters */
2586 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2589 /* We have 'lnc' characters to match in the pattern, but because of
2590 * multi-character folding, each character in the target can match
2591 * up to 3 characters (Unicode guarantees it will never exceed
2592 * this) if it is utf8-encoded; and up to 2 if not (based on the
2593 * fact that the Latin 1 folds are already determined, and the only
2594 * multi-char fold in that range is the sharp-s folding to 'ss'.
2595 * Thus, a pattern character can match as little as 1/3 of a string
2596 * character. Adjust lnc accordingly, rounding up, so that if we
2597 * need to match at least 4+1/3 chars, that really is 5. */
2598 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2599 lnc = (lnc + expansion - 1) / expansion;
2601 /* As in the non-UTF8 case, if we have to match 3 characters, and
2602 * only 2 are left, it's guaranteed to fail, so don't start a match
2603 * that would require us to go beyond the end of the string */
2604 e = HOP3c(strend, -((SSize_t)lnc), s);
2606 /* XXX Note that we could recalculate e to stop the loop earlier,
2607 * as the worst case expansion above will rarely be met, and as we
2608 * go along we would usually find that e moves further to the left.
2609 * This would happen only after we reached the point in the loop
2610 * where if there were no expansion we should fail. Unclear if
2611 * worth the expense */
2614 char *my_strend= (char *)strend;
2615 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2616 pat_string, NULL, ln, is_utf8_pat,
2618 && (reginfo->intuit || regtry(reginfo, &s)) )
2622 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2629 case BOUND_tb_pb: /* /d without utf8 target is /a */
2631 /* regcomp.c makes sure that these only have the traditional \b
2633 assert(FLAGS(c) == TRADITIONAL_BOUND);
2635 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2638 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2640 /* regcomp.c makes sure that these only have the traditional \b
2642 assert(FLAGS(c) == TRADITIONAL_BOUND);
2644 FBC_BOUND_A_UTF8(isWORDCHAR_A);
2649 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2651 /* regcomp.c makes sure that these only have the traditional \b
2653 assert(FLAGS(c) == TRADITIONAL_BOUND);
2655 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2658 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2660 /* regcomp.c makes sure that these only have the traditional \b
2662 assert(FLAGS(c) == TRADITIONAL_BOUND);
2664 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2669 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2670 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2675 goto do_boundu_non_utf8;
2679 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2680 if (FLAGS(c) == TRADITIONAL_BOUND) {
2681 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2685 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2688 goto do_boundu_non_utf8;
2692 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2693 if (FLAGS(c) == TRADITIONAL_BOUND) {
2694 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2698 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2700 goto do_boundu_non_utf8;
2704 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2705 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2710 if (s == reginfo->strbeg) {
2711 if (reginfo->intuit || regtry(reginfo, &s))
2716 /* Didn't match. Try at the next position (if there is one) */
2718 if (UNLIKELY(s >= reginfo->strend)) {
2723 switch((bound_type) FLAGS(c)) {
2724 case TRADITIONAL_BOUND: /* Should have already been handled */
2729 /* Not utf8. Everything is a GCB except between CR and LF */
2730 while (s < strend) {
2731 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2732 || UCHARAT(s) != '\n'))
2733 && (reginfo->intuit || regtry(reginfo, &s)))
2744 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2745 while (s < strend) {
2746 LB_enum after = getLB_VAL_CP((U8) *s);
2747 if (to_complement ^ isLB(before,
2749 (U8*) reginfo->strbeg,
2751 (U8*) reginfo->strend,
2752 0 /* target not utf8 */ )
2753 && (reginfo->intuit || regtry(reginfo, &s)))
2766 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2767 while (s < strend) {
2768 SB_enum after = getSB_VAL_CP((U8) *s);
2769 if ((to_complement ^ isSB(before,
2771 (U8*) reginfo->strbeg,
2773 (U8*) reginfo->strend,
2774 0 /* target not utf8 */ ))
2775 && (reginfo->intuit || regtry(reginfo, &s)))
2788 WB_enum previous = WB_UNKNOWN;
2789 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2790 while (s < strend) {
2791 WB_enum after = getWB_VAL_CP((U8) *s);
2792 if ((to_complement ^ isWB(previous,
2795 (U8*) reginfo->strbeg,
2797 (U8*) reginfo->strend,
2798 0 /* target not utf8 */ ))
2799 && (reginfo->intuit || regtry(reginfo, &s)))
2810 /* Here are at the final position in the target string, which is a
2811 * boundary by definition, so matches, depending on other constraints.
2813 if ( reginfo->intuit
2814 || (s <= reginfo->strend && regtry(reginfo, &s)))
2823 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2824 if (FLAGS(c) == TRADITIONAL_BOUND) {
2825 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2826 isWORDCHAR_LC_utf8_safe);
2830 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2833 goto do_boundu_utf8;
2837 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
2838 if (FLAGS(c) == TRADITIONAL_BOUND) {
2839 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2840 isWORDCHAR_LC_utf8_safe);
2844 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2847 goto do_boundu_utf8;
2851 /* regcomp.c makes sure that these only have the traditional \b
2853 assert(FLAGS(c) == TRADITIONAL_BOUND);
2859 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2860 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2861 isWORDCHAR_utf8_safe);
2866 goto do_boundu_utf8;
2870 /* regcomp.c makes sure that these only have the traditional \b
2872 assert(FLAGS(c) == TRADITIONAL_BOUND);
2878 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2879 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2884 if (s == reginfo->strbeg) {
2885 if (reginfo->intuit || regtry(reginfo, &s))
2890 /* Didn't match. Try at the next position (if there is one) */
2891 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2892 if (UNLIKELY(s >= reginfo->strend)) {
2897 switch((bound_type) FLAGS(c)) {
2898 case TRADITIONAL_BOUND: /* Should have already been handled */
2904 GCB_enum before = getGCB_VAL_UTF8(
2906 (U8*)(reginfo->strbeg)),
2907 (U8*) reginfo->strend);
2908 while (s < strend) {
2909 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2910 (U8*) reginfo->strend);
2911 if ( (to_complement ^ isGCB(before,
2913 (U8*) reginfo->strbeg,
2915 1 /* target is utf8 */ ))
2916 && (reginfo->intuit || regtry(reginfo, &s)))
2921 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2928 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2930 (U8*)(reginfo->strbeg)),
2931 (U8*) reginfo->strend);
2932 while (s < strend) {
2933 LB_enum after = getLB_VAL_UTF8((U8*) s,
2934 (U8*) reginfo->strend);
2935 if (to_complement ^ isLB(before,
2937 (U8*) reginfo->strbeg,
2939 (U8*) reginfo->strend,
2940 1 /* target is utf8 */ )
2941 && (reginfo->intuit || regtry(reginfo, &s)))
2946 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2954 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2956 (U8*)(reginfo->strbeg)),
2957 (U8*) reginfo->strend);
2958 while (s < strend) {
2959 SB_enum after = getSB_VAL_UTF8((U8*) s,
2960 (U8*) reginfo->strend);
2961 if ((to_complement ^ isSB(before,
2963 (U8*) reginfo->strbeg,
2965 (U8*) reginfo->strend,
2966 1 /* target is utf8 */ ))
2967 && (reginfo->intuit || regtry(reginfo, &s)))
2972 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2980 /* We are at a boundary between char_sub_0 and char_sub_1.
2981 * We also keep track of the value for char_sub_-1 as we
2982 * loop through the line. Context may be needed to make a
2983 * determination, and if so, this can save having to
2985 WB_enum previous = WB_UNKNOWN;
2986 WB_enum before = getWB_VAL_UTF8(
2989 (U8*)(reginfo->strbeg)),
2990 (U8*) reginfo->strend);
2991 while (s < strend) {
2992 WB_enum after = getWB_VAL_UTF8((U8*) s,
2993 (U8*) reginfo->strend);
2994 if ((to_complement ^ isWB(previous,
2997 (U8*) reginfo->strbeg,
2999 (U8*) reginfo->strend,
3000 1 /* target is utf8 */ ))
3001 && (reginfo->intuit || regtry(reginfo, &s)))
3007 s += UTF8_SAFE_SKIP(s, reginfo->strend);
3012 /* Here are at the final position in the target string, which is a
3013 * boundary by definition, so matches, depending on other constraints.
3016 if ( reginfo->intuit
3017 || (s <= reginfo->strend && regtry(reginfo, &s)))
3025 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
3030 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
3033 /* The argument to all the POSIX node types is the class number to pass
3034 * to generic_isCC_() to build a mask for searching in PL_charclass[] */
3043 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3044 REXEC_FBC_UTF8_CLASS_SCAN(
3045 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3056 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
3057 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3058 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3063 /* The complement of something that matches only ASCII matches all
3064 * non-ASCII, plus everything in ASCII that isn't in the class. */
3065 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3066 || ! generic_isCC_A_(*s, FLAGS(c)));
3071 /* Don't need to worry about utf8, as it can match only a single
3072 * byte invariant character. But we do anyway for performance reasons,
3073 * as otherwise we would have to examine all the continuation
3075 REXEC_FBC_UTF8_CLASS_SCAN(generic_isCC_A_(*s, FLAGS(c)));
3089 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3090 to_complement ^ cBOOL(generic_isCC_A_(*s, FLAGS(c))));
3100 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3101 to_complement ^ cBOOL(generic_isCC_(*s,
3116 classnum = (char_class_number_) FLAGS(c);
3119 REXEC_FBC_UTF8_CLASS_SCAN(
3120 to_complement ^ cBOOL(_invlist_contains_cp(
3121 PL_XPosix_ptrs[classnum],
3122 utf8_to_uvchr_buf((U8 *) s,
3127 case CC_ENUM_SPACE_:
3128 REXEC_FBC_UTF8_CLASS_SCAN(
3129 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3132 case CC_ENUM_BLANK_:
3133 REXEC_FBC_UTF8_CLASS_SCAN(
3134 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3137 case CC_ENUM_XDIGIT_:
3138 REXEC_FBC_UTF8_CLASS_SCAN(
3139 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3142 case CC_ENUM_VERTSPACE_:
3143 REXEC_FBC_UTF8_CLASS_SCAN(
3144 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3147 case CC_ENUM_CNTRL_:
3148 REXEC_FBC_UTF8_CLASS_SCAN(
3149 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3154 case AHOCORASICKC_tb_pb:
3155 case AHOCORASICKC_tb_p8:
3156 case AHOCORASICKC_t8_pb:
3157 case AHOCORASICKC_t8_p8:
3158 case AHOCORASICK_tb_pb:
3159 case AHOCORASICK_tb_p8:
3160 case AHOCORASICK_t8_pb:
3161 case AHOCORASICK_t8_p8:
3164 /* what trie are we using right now */
3165 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3166 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3167 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3169 const char *last_start = strend - trie->minlen;
3171 const char *real_start = s;
3173 STRLEN maxlen = trie->maxlen;
3175 U8 **points; /* map of where we were in the input string
3176 when reading a given char. For ASCII this
3177 is unnecessary overhead as the relationship
3178 is always 1:1, but for Unicode, especially
3179 case folded Unicode this is not true. */
3180 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3184 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3186 /* We can't just allocate points here. We need to wrap it in
3187 * an SV so it gets freed properly if there is a croak while
3188 * running the match */
3191 sv_points=newSV(maxlen * sizeof(U8 *));
3192 SvCUR_set(sv_points,
3193 maxlen * sizeof(U8 *));
3194 SvPOK_on(sv_points);
3195 sv_2mortal(sv_points);
3196 points=(U8**)SvPV_nolen(sv_points );
3197 if ( trie_type != trie_utf8_fold
3198 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3201 bitmap=(U8*)trie->bitmap;
3203 bitmap=(U8*)ANYOF_BITMAP(c);
3205 /* this is the Aho-Corasick algorithm modified a touch
3206 to include special handling for long "unknown char" sequences.
3207 The basic idea being that we use AC as long as we are dealing
3208 with a possible matching char, when we encounter an unknown char
3209 (and we have not encountered an accepting state) we scan forward
3210 until we find a legal starting char.
3211 AC matching is basically that of trie matching, except that when
3212 we encounter a failing transition, we fall back to the current
3213 states "fail state", and try the current char again, a process
3214 we repeat until we reach the root state, state 1, or a legal
3215 transition. If we fail on the root state then we can either
3216 terminate if we have reached an accepting state previously, or
3217 restart the entire process from the beginning if we have not.
3220 while (s <= last_start) {
3221 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3229 U8 *uscan = (U8*)NULL;
3230 U8 *leftmost = NULL;
3232 U32 accepted_word= 0;
3236 while ( state && uc <= (U8*)strend ) {
3238 U32 word = aho->states[ state ].wordnum;
3242 DEBUG_TRIE_EXECUTE_r(
3243 if ( uc <= (U8*)last_start
3244 && !BITMAP_TEST(bitmap,*uc) )
3246 dump_exec_pos( (char *)uc, c, strend,
3248 (char *)uc, utf8_target, 0 );
3249 Perl_re_printf( aTHX_
3250 " Scanning for legal start char...\n");
3254 while ( uc <= (U8*)last_start
3255 && !BITMAP_TEST(bitmap,*uc) )
3260 while ( uc <= (U8*)last_start
3261 && ! BITMAP_TEST(bitmap,*uc) )
3268 if (uc >(U8*)last_start) break;
3272 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3274 if (!leftmost || lpos < leftmost) {
3275 DEBUG_r(accepted_word=word);
3281 points[pointpos++ % maxlen]= uc;
3282 if (foldlen || uc < (U8*)strend) {
3283 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3284 (U8 *) strend, uscan, len, uvc,
3285 charid, foldlen, foldbuf,
3287 DEBUG_TRIE_EXECUTE_r({
3288 dump_exec_pos( (char *)uc, c, strend,
3289 real_start, s, utf8_target, 0);
3290 Perl_re_printf( aTHX_
3291 " Charid:%3u CP:%4" UVxf " ",
3303 word = aho->states[ state ].wordnum;
3305 base = aho->states[ state ].trans.base;
3307 DEBUG_TRIE_EXECUTE_r({
3309 dump_exec_pos((char *)uc, c, strend, real_start,
3310 s, utf8_target, 0 );
3311 Perl_re_printf( aTHX_
3312 "%sState: %4" UVxf ", word=%" UVxf,
3313 failed ? " Fail transition to " : "",
3314 (UV)state, (UV)word);
3320 ( ((offset = base + charid
3321 - 1 - trie->uniquecharcount)) >= 0)
3322 && ((U32)offset < trie->lasttrans)
3323 && trie->trans[offset].check == state
3324 && (tmp=trie->trans[offset].next))
3326 DEBUG_TRIE_EXECUTE_r(
3327 Perl_re_printf( aTHX_ " - legal\n"));
3332 DEBUG_TRIE_EXECUTE_r(
3333 Perl_re_printf( aTHX_ " - fail\n"));
3335 state = aho->fail[state];
3339 /* we must be accepting here */
3340 DEBUG_TRIE_EXECUTE_r(
3341 Perl_re_printf( aTHX_ " - accepting\n"));
3350 if (!state) state = 1;
3353 if ( aho->states[ state ].wordnum ) {
3354 U8 *lpos = points[ (pointpos
3355 - trie->wordinfo[aho->states[ state ]
3356 .wordnum].len) % maxlen ];
3357 if (!leftmost || lpos < leftmost) {
3358 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3363 s = (char*)leftmost;
3364 DEBUG_TRIE_EXECUTE_r({
3365 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3366 " at position %" IVdf ". Trying full"
3368 (UV)accepted_word, (IV)(s - real_start)
3371 if (reginfo->intuit || regtry(reginfo, &s)) {
3376 if (s < reginfo->strend) {
3379 DEBUG_TRIE_EXECUTE_r({
3380 Perl_re_printf( aTHX_
3381 "Pattern failed. Looking for new start"
3385 DEBUG_TRIE_EXECUTE_r(
3386 Perl_re_printf( aTHX_ "No match.\n"));
3395 case EXACTFU_REQ8_t8_pb:
3396 case EXACTFUP_tb_p8:
3397 case EXACTFUP_t8_p8:
3399 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3400 case EXACTFAA_NO_TRIE_tb_p8:
3401 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3406 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3407 } /* End of switch on node type */
3415 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3416 * flags have same meanings as with regexec_flags() */
3419 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3426 struct regexp *const prog = ReANY(rx);
3428 if (flags & REXEC_COPY_STR) {
3431 DEBUG_C(Perl_re_printf( aTHX_
3432 "Copy on write: regexp capture, type %d\n",
3434 /* Create a new COW SV to share the match string and store
3435 * in saved_copy, unless the current COW SV in saved_copy
3436 * is valid and suitable for our purpose */
3437 if (( prog->saved_copy
3438 && SvIsCOW(prog->saved_copy)
3439 && SvPOKp(prog->saved_copy)
3442 && SvPVX(sv) == SvPVX(prog->saved_copy)))
3444 /* just reuse saved_copy SV */
3445 if (RXp_MATCH_COPIED(prog)) {
3446 Safefree(prog->subbeg);
3447 RXp_MATCH_COPIED_off(prog);
3451 /* create new COW SV to share string */
3452 RXp_MATCH_COPY_FREE(prog);
3453 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3455 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3456 assert (SvPOKp(prog->saved_copy));
3457 prog->sublen = strend - strbeg;
3458 prog->suboffset = 0;
3459 prog->subcoffset = 0;
3464 SSize_t max = strend - strbeg;
3467 if ( (flags & REXEC_COPY_SKIP_POST)
3468 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3469 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3470 ) { /* don't copy $' part of string */
3473 /* calculate the right-most part of the string covered
3474 * by a capture. Due to lookahead, this may be to
3475 * the right of $&, so we have to scan all captures */
3476 while (n <= prog->lastparen) {
3477 if (prog->offs[n].end > max)
3478 max = prog->offs[n].end;
3482 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3483 ? prog->offs[0].start
3485 assert(max >= 0 && max <= strend - strbeg);
3488 if ( (flags & REXEC_COPY_SKIP_PRE)
3489 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3490 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3491 ) { /* don't copy $` part of string */
3494 /* calculate the left-most part of the string covered
3495 * by a capture. Due to lookbehind, this may be to
3496 * the left of $&, so we have to scan all captures */
3497 while (min && n <= prog->lastparen) {
3498 if ( prog->offs[n].start != -1
3499 && prog->offs[n].start < min)
3501 min = prog->offs[n].start;
3505 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3506 && min > prog->offs[0].end
3508 min = prog->offs[0].end;
3512 assert(min >= 0 && min <= max && min <= strend - strbeg);
3515 if (RXp_MATCH_COPIED(prog)) {
3516 if (sublen > prog->sublen)
3518 (char*)saferealloc(prog->subbeg, sublen+1);
3521 prog->subbeg = (char*)safemalloc(sublen+1);
3522 Copy(strbeg + min, prog->subbeg, sublen, char);
3523 prog->subbeg[sublen] = '\0';
3524 prog->suboffset = min;
3525 prog->sublen = sublen;
3526 RXp_MATCH_COPIED_on(prog);
3528 prog->subcoffset = prog->suboffset;
3529 if (prog->suboffset && utf8_target) {
3530 /* Convert byte offset to chars.
3531 * XXX ideally should only compute this if @-/@+
3532 * has been seen, a la PL_sawampersand ??? */
3534 /* If there's a direct correspondence between the
3535 * string which we're matching and the original SV,
3536 * then we can use the utf8 len cache associated with
3537 * the SV. In particular, it means that under //g,
3538 * sv_pos_b2u() will use the previously cached
3539 * position to speed up working out the new length of
3540 * subcoffset, rather than counting from the start of
3541 * the string each time. This stops
3542 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3543 * from going quadratic */
3544 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3545 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3546 SV_GMAGIC|SV_CONST_RETURN);
3548 prog->subcoffset = utf8_length((U8*)strbeg,
3549 (U8*)(strbeg+prog->suboffset));
3553 RXp_MATCH_COPY_FREE(prog);
3554 prog->subbeg = strbeg;
3555 prog->suboffset = 0;
3556 prog->subcoffset = 0;
3557 prog->sublen = strend - strbeg;
3565 - regexec_flags - match a regexp against a string
3568 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3569 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3570 /* stringarg: the point in the string at which to begin matching */
3571 /* strend: pointer to null at end of string */
3572 /* strbeg: real beginning of string */
3573 /* minend: end of match must be >= minend bytes after stringarg. */
3574 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3575 * itself is accessed via the pointers above */
3576 /* data: May be used for some additional optimizations.
3577 Currently unused. */
3578 /* flags: For optimizations. See REXEC_* in regexp.h */
3581 struct regexp *const prog = ReANY(rx);
3585 SSize_t minlen; /* must match at least this many chars */
3586 SSize_t dontbother = 0; /* how many characters not to try at end */
3587 const bool utf8_target = cBOOL(DO_UTF8(sv));
3589 RXi_GET_DECL(prog,progi);
3590 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3591 regmatch_info *const reginfo = ®info_buf;
3592 regexp_paren_pair *swap = NULL;
3594 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3596 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3597 PERL_UNUSED_ARG(data);
3599 /* Be paranoid... */
3601 Perl_croak(aTHX_ "NULL regexp parameter");
3605 debug_start_match(rx, utf8_target, stringarg, strend,
3609 startpos = stringarg;
3611 /* set these early as they may be used by the HOP macros below */
3612 reginfo->strbeg = strbeg;
3613 reginfo->strend = strend;
3614 reginfo->is_utf8_target = cBOOL(utf8_target);
3616 if (prog->intflags & PREGf_GPOS_SEEN) {
3619 /* set reginfo->ganch, the position where \G can match */
3622 (flags & REXEC_IGNOREPOS)
3623 ? stringarg /* use start pos rather than pos() */
3624 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3625 /* Defined pos(): */
3626 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3627 : strbeg; /* pos() not defined; use start of string */
3629 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3630 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3632 /* in the presence of \G, we may need to start looking earlier in
3633 * the string than the suggested start point of stringarg:
3634 * if prog->gofs is set, then that's a known, fixed minimum
3637 * /ab|c\G/: gofs = 1
3638 * or if the minimum offset isn't known, then we have to go back
3639 * to the start of the string, e.g. /w+\G/
3642 if (prog->intflags & PREGf_ANCH_GPOS) {
3644 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3646 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3648 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3649 "fail: ganch-gofs before earliest possible start\n"));
3654 startpos = reginfo->ganch;
3656 else if (prog->gofs) {
3657 startpos = HOPBACKc(startpos, prog->gofs);
3661 else if (prog->intflags & PREGf_GPOS_FLOAT)
3665 minlen = prog->minlen;
3666 if ((startpos + minlen) > strend || startpos < strbeg) {
3667 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3668 "Regex match can't succeed, so not even tried\n"));
3672 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3673 * which will call destuctors to reset PL_regmatch_state, free higher
3674 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3675 * regmatch_info_aux_eval */
3677 oldsave = PL_savestack_ix;
3681 if ((prog->extflags & RXf_USE_INTUIT)
3682 && !(flags & REXEC_CHECKED))
3684 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3689 if (prog->extflags & RXf_CHECK_ALL) {
3690 /* we can match based purely on the result of INTUIT.
3691 * Set up captures etc just for $& and $-[0]
3692 * (an intuit-only match wont have $1,$2,..) */
3693 assert(!prog->nparens);
3695 /* s/// doesn't like it if $& is earlier than where we asked it to
3696 * start searching (which can happen on something like /.\G/) */
3697 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3700 /* this should only be possible under \G */
3701 assert(prog->intflags & PREGf_GPOS_SEEN);
3702 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3703 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3707 /* match via INTUIT shouldn't have any captures.
3708 * Let @-, @+, $^N know */
3709 prog->lastparen = prog->lastcloseparen = 0;
3710 RXp_MATCH_UTF8_set(prog, utf8_target);
3711 prog->offs[0].start = s - strbeg;
3712 prog->offs[0].end = utf8_target
3713 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3714 : s - strbeg + prog->minlenret;
3715 if ( !(flags & REXEC_NOT_FIRST) )
3716 S_reg_set_capture_string(aTHX_ rx,
3718 sv, flags, utf8_target);
3724 multiline = prog->extflags & RXf_PMf_MULTILINE;
3726 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3727 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3728 "String too short [regexec_flags]...\n"));
3732 /* Check validity of program. */
3733 if (UCHARAT(progi->program) != REG_MAGIC) {
3734 Perl_croak(aTHX_ "corrupted regexp program");
3737 RXp_MATCH_TAINTED_off(prog);
3738 RXp_MATCH_UTF8_set(prog, utf8_target);
3740 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3741 reginfo->intuit = 0;
3742 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3743 reginfo->warned = FALSE;
3745 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3746 /* see how far we have to get to not match where we matched before */
3747 reginfo->till = stringarg + minend;
3749 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3750 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3751 S_cleanup_regmatch_info_aux has executed (registered by
3752 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3753 magic belonging to this SV.
3754 Not newSVsv, either, as it does not COW.
3756 reginfo->sv = newSV_type(SVt_NULL);
3757 SvSetSV_nosteal(reginfo->sv, sv);
3758 SAVEFREESV(reginfo->sv);
3761 /* reserve next 2 or 3 slots in PL_regmatch_state:
3762 * slot N+0: may currently be in use: skip it
3763 * slot N+1: use for regmatch_info_aux struct
3764 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3765 * slot N+3: ready for use by regmatch()
3769 regmatch_state *old_regmatch_state;
3770 regmatch_slab *old_regmatch_slab;
3771 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3773 /* on first ever match, allocate first slab */
3774 if (!PL_regmatch_slab) {
3775 Newx(PL_regmatch_slab, 1, regmatch_slab);
3776 PL_regmatch_slab->prev = NULL;
3777 PL_regmatch_slab->next = NULL;
3778 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3781 old_regmatch_state = PL_regmatch_state;
3782 old_regmatch_slab = PL_regmatch_slab;
3784 for (i=0; i <= max; i++) {
3786 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3788 reginfo->info_aux_eval =
3789 reginfo->info_aux->info_aux_eval =
3790 &(PL_regmatch_state->u.info_aux_eval);
3792 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3793 PL_regmatch_state = S_push_slab(aTHX);
3796 /* note initial PL_regmatch_state position; at end of match we'll
3797 * pop back to there and free any higher slabs */
3799 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3800 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3801 reginfo->info_aux->poscache = NULL;
3803 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3805 if ((prog->extflags & RXf_EVAL_SEEN))
3806 S_setup_eval_state(aTHX_ reginfo);
3808 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3811 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3812 /* We have to be careful. If the previous successful match
3813 was from this regex we don't want a subsequent partially
3814 successful match to clobber the old results.
3815 So when we detect this possibility we add a swap buffer
3816 to the re, and switch the buffer each match. If we fail,
3817 we switch it back; otherwise we leave it swapped.
3820 /* avoid leak if we die, or clean up anyway if match completes */
3822 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3823 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3824 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3832 if (prog->recurse_locinput)
3833 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3835 /* Simplest case: anchored match (but not \G) need be tried only once,
3836 * or with MBOL, only at the beginning of each line.
3838 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3839 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3840 * match at the start of the string then it won't match anywhere else
3841 * either; while with /.*.../, if it doesn't match at the beginning,
3842 * the earliest it could match is at the start of the next line */
3844 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3847 if (regtry(reginfo, &s))
3850 if (!(prog->intflags & PREGf_ANCH_MBOL))
3853 /* didn't match at start, try at other newline positions */
3856 dontbother = minlen - 1;
3857 end = HOP3c(strend, -dontbother, strbeg) - 1;
3859 /* skip to next newline */
3861 while (s <= end) { /* note it could be possible to match at the end of the string */
3862 /* NB: newlines are the same in unicode as they are in latin */
3865 if (prog->check_substr || prog->check_utf8) {
3866 /* note that with PREGf_IMPLICIT, intuit can only fail
3867 * or return the start position, so it's of limited utility.
3868 * Nevertheless, I made the decision that the potential for
3869 * quick fail was still worth it - DAPM */
3870 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3874 if (regtry(reginfo, &s))
3878 } /* end anchored search */
3880 /* anchored \G match */
3881 if (prog->intflags & PREGf_ANCH_GPOS)
3883 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3884 assert(prog->intflags & PREGf_GPOS_SEEN);
3885 /* For anchored \G, the only position it can match from is
3886 * (ganch-gofs); we already set startpos to this above; if intuit
3887 * moved us on from there, we can't possibly succeed */
3888 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3889 if (s == startpos && regtry(reginfo, &s))
3894 /* Messy cases: unanchored match. */
3896 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3897 /* we have /x+whatever/ */
3898 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3904 if (! prog->anchored_utf8) {
3905 to_utf8_substr(prog);
3907 ch = SvPVX_const(prog->anchored_utf8)[0];
3908 REXEC_FBC_UTF8_SCAN(
3910 DEBUG_EXECUTE_r( did_match = 1 );
3911 if (regtry(reginfo, &s)) goto got_it;
3912 s += UTF8_SAFE_SKIP(s, strend);
3913 while (s < strend && *s == ch)
3920 if (! prog->anchored_substr) {
3921 if (! to_byte_substr(prog)) {
3922 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3925 ch = SvPVX_const(prog->anchored_substr)[0];
3926 REXEC_FBC_NON_UTF8_SCAN(
3928 DEBUG_EXECUTE_r( did_match = 1 );
3929 if (regtry(reginfo, &s)) goto got_it;
3931 while (s < strend && *s == ch)
3936 DEBUG_EXECUTE_r(if (!did_match)
3937 Perl_re_printf( aTHX_
3938 "Did not find anchored character...\n")
3941 else if (prog->anchored_substr != NULL
3942 || prog->anchored_utf8 != NULL
3943 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3944 && prog->float_max_offset < strend - s)) {
3949 char *last1; /* Last position checked before */
3953 if (prog->anchored_substr || prog->anchored_utf8) {
3955 if (! prog->anchored_utf8) {
3956 to_utf8_substr(prog);
3958 must = prog->anchored_utf8;
3961 if (! prog->anchored_substr) {
3962 if (! to_byte_substr(prog)) {
3963 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3966 must = prog->anchored_substr;
3968 back_max = back_min = prog->anchored_offset;
3971 if (! prog->float_utf8) {
3972 to_utf8_substr(prog);
3974 must = prog->float_utf8;
3977 if (! prog->float_substr) {
3978 if (! to_byte_substr(prog)) {
3979 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3982 must = prog->float_substr;
3984 back_max = prog->float_max_offset;
3985 back_min = prog->float_min_offset;
3991 last = HOP3c(strend, /* Cannot start after this */
3992 -(SSize_t)(CHR_SVLEN(must)
3993 - (SvTAIL(must) != 0) + back_min), strbeg);
3995 if (s > reginfo->strbeg)
3996 last1 = HOPc(s, -1);
3998 last1 = s - 1; /* bogus */
4000 /* XXXX check_substr already used to find "s", can optimize if
4001 check_substr==must. */
4003 strend = HOPc(strend, -dontbother);
4004 while ( (s <= last) &&
4005 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
4006 (unsigned char*)strend, must,
4007 multiline ? FBMrf_MULTILINE : 0)) ) {
4008 DEBUG_EXECUTE_r( did_match = 1 );
4009 if (HOPc(s, -back_max) > last1) {
4010 last1 = HOPc(s, -back_min);
4011 s = HOPc(s, -back_max);
4014 char * const t = (last1 >= reginfo->strbeg)
4015 ? HOPc(last1, 1) : last1 + 1;
4017 last1 = HOPc(s, -back_min);
4021 while (s <= last1) {
4022 if (regtry(reginfo, &s))
4025 s++; /* to break out of outer loop */
4032 while (s <= last1) {
4033 if (regtry(reginfo, &s))
4039 DEBUG_EXECUTE_r(if (!did_match) {
4040 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4041 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4042 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
4043 ((must == prog->anchored_substr || must == prog->anchored_utf8)
4044 ? "anchored" : "floating"),
4045 quoted, RE_SV_TAIL(must));
4049 else if ( (c = progi->regstclass) ) {
4051 const OPCODE op = OP(progi->regstclass);
4052 /* don't bother with what can't match */
4053 if (REGNODE_TYPE(op) != EXACT && REGNODE_TYPE(op) != TRIE)
4054 strend = HOPc(strend, -(minlen - 1));
4057 SV * const prop = sv_newmortal();
4058 regprop(prog, prop, c, reginfo, NULL);
4060 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4061 s,strend-s,PL_dump_re_max_len);
4062 Perl_re_printf( aTHX_
4063 "Matching stclass %.*s against %s (%d bytes)\n",
4064 (int)SvCUR(prop), SvPVX_const(prop),
4065 quoted, (int)(strend - s));
4068 if (find_byclass(prog, c, s, strend, reginfo))
4070 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
4074 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4082 if (! prog->float_utf8) {
4083 to_utf8_substr(prog);
4085 float_real = prog->float_utf8;
4088 if (! prog->float_substr) {
4089 if (! to_byte_substr(prog)) {
4090 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4093 float_real = prog->float_substr;
4096 little = SvPV_const(float_real, len);
4097 if (SvTAIL(float_real)) {
4098 /* This means that float_real contains an artificial \n on
4099 * the end due to the presence of something like this:
4100 * /foo$/ where we can match both "foo" and "foo\n" at the
4101 * end of the string. So we have to compare the end of the
4102 * string first against the float_real without the \n and
4103 * then against the full float_real with the string. We
4104 * have to watch out for cases where the string might be
4105 * smaller than the float_real or the float_real without
4107 char *checkpos= strend - len;
4109 Perl_re_printf( aTHX_
4110 "%sChecking for float_real.%s\n",
4111 PL_colors[4], PL_colors[5]));
4112 if (checkpos + 1 < strbeg) {
4113 /* can't match, even if we remove the trailing \n
4114 * string is too short to match */
4116 Perl_re_printf( aTHX_
4117 "%sString shorter than required trailing substring, cannot match.%s\n",
4118 PL_colors[4], PL_colors[5]));
4120 } else if (memEQ(checkpos + 1, little, len - 1)) {
4121 /* can match, the end of the string matches without the
4123 last = checkpos + 1;
4124 } else if (checkpos < strbeg) {
4125 /* cant match, string is too short when the "\n" is
4128 Perl_re_printf( aTHX_
4129 "%sString does not contain required trailing substring, cannot match.%s\n",
4130 PL_colors[4], PL_colors[5]));
4132 } else if (!multiline) {
4133 /* non multiline match, so compare with the "\n" at the
4134 * end of the string */
4135 if (memEQ(checkpos, little, len)) {
4139 Perl_re_printf( aTHX_
4140 "%sString does not contain required trailing substring, cannot match.%s\n",
4141 PL_colors[4], PL_colors[5]));
4145 /* multiline match, so we have to search for a place
4146 * where the full string is located */
4152 last = rninstr(s, strend, little, little + len);
4154 last = strend; /* matching "$" */
4157 /* at one point this block contained a comment which was
4158 * probably incorrect, which said that this was a "should not
4159 * happen" case. Even if it was true when it was written I am
4160 * pretty sure it is not anymore, so I have removed the comment
4161 * and replaced it with this one. Yves */
4163 Perl_re_printf( aTHX_
4164 "%sString does not contain required substring, cannot match.%s\n",
4165 PL_colors[4], PL_colors[5]
4169 dontbother = strend - last + prog->float_min_offset;
4171 if (minlen && (dontbother < minlen))
4172 dontbother = minlen - 1;
4173 strend -= dontbother; /* this one's always in bytes! */
4174 /* We don't know much -- general case. */
4177 if (regtry(reginfo, &s))
4186 if (regtry(reginfo, &s))
4188 } while (s++ < strend);
4196 /* s/// doesn't like it if $& is earlier than where we asked it to
4197 * start searching (which can happen on something like /.\G/) */
4198 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
4199 && (prog->offs[0].start < stringarg - strbeg))
4201 /* this should only be possible under \G */
4202 assert(prog->intflags & PREGf_GPOS_SEEN);
4203 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4204 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4208 /* clean up; this will trigger destructors that will free all slabs
4209 * above the current one, and cleanup the regmatch_info_aux
4210 * and regmatch_info_aux_eval sructs */
4212 LEAVE_SCOPE(oldsave);
4214 if (RXp_PAREN_NAMES(prog))
4215 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4217 /* make sure $`, $&, $', and $digit will work later */
4218 if ( !(flags & REXEC_NOT_FIRST) )
4219 S_reg_set_capture_string(aTHX_ rx,
4220 strbeg, reginfo->strend,
4221 sv, flags, utf8_target);
4226 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
4227 PL_colors[4], PL_colors[5]));
4230 /* we failed :-( roll it back.
4231 * Since the swap buffer will be freed on scope exit which follows
4232 * shortly, restore the old captures by copying 'swap's original
4233 * data to the new offs buffer
4235 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4236 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4243 Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4246 /* clean up; this will trigger destructors that will free all slabs
4247 * above the current one, and cleanup the regmatch_info_aux
4248 * and regmatch_info_aux_eval sructs */
4250 LEAVE_SCOPE(oldsave);
4256 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4257 * Do inc before dec, in case old and new rex are the same */
4258 #define SET_reg_curpm(Re2) \
4259 if (reginfo->info_aux_eval) { \
4260 (void)ReREFCNT_inc(Re2); \
4261 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
4262 PM_SETRE((PL_reg_curpm), (Re2)); \
4267 - regtry - try match at specific point
4269 STATIC bool /* 0 failure, 1 success */
4270 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4273 REGEXP *const rx = reginfo->prog;
4274 regexp *const prog = ReANY(rx);
4277 U32 depth = 0; /* used by REGCP_SET */
4279 RXi_GET_DECL(prog,progi);
4280 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4282 PERL_ARGS_ASSERT_REGTRY;
4284 reginfo->cutpoint=NULL;
4286 prog->offs[0].start = *startposp - reginfo->strbeg;
4287 prog->lastparen = 0;
4288 prog->lastcloseparen = 0;
4290 /* XXXX What this code is doing here?!!! There should be no need
4291 to do this again and again, prog->lastparen should take care of
4294 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4295 * Actually, the code in regcppop() (which Ilya may be meaning by
4296 * prog->lastparen), is not needed at all by the test suite
4297 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4298 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4299 * Meanwhile, this code *is* needed for the
4300 * above-mentioned test suite tests to succeed. The common theme
4301 * on those tests seems to be returning null fields from matches.
4302 * --jhi updated by dapm */
4304 /* After encountering a variant of the issue mentioned above I think
4305 * the point Ilya was making is that if we properly unwind whenever
4306 * we set lastparen to a smaller value then we should not need to do
4307 * this every time, only when needed. So if we have tests that fail if
4308 * we remove this, then it suggests somewhere else we are improperly
4309 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4310 * places it is called, and related regcp() routines. - Yves */
4312 if (prog->nparens) {
4313 regexp_paren_pair *pp = prog->offs;
4315 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4323 result = regmatch(reginfo, *startposp, progi->program + 1);
4325 prog->offs[0].end = result;
4328 if (reginfo->cutpoint)
4329 *startposp= reginfo->cutpoint;
4330 REGCP_UNWIND(lastcp);
4334 /* this is used to determine how far from the left messages like
4335 'failed...' are printed in regexec.c. It should be set such that
4336 messages are inline with the regop output that created them.
4338 #define REPORT_CODE_OFF 29
4339 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4342 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4346 PerlIO *f= Perl_debug_log;
4347 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4348 va_start(ap, depth);
4349 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4350 result = PerlIO_vprintf(f, fmt, ap);
4354 #endif /* DEBUGGING */
4356 /* grab a new slab and return the first slot in it */
4358 STATIC regmatch_state *
4361 regmatch_slab *s = PL_regmatch_slab->next;
4363 Newx(s, 1, regmatch_slab);
4364 s->prev = PL_regmatch_slab;
4366 PL_regmatch_slab->next = s;
4368 PL_regmatch_slab = s;
4369 return SLAB_FIRST(s);
4375 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4376 const char *start, const char *end, const char *blurb)
4378 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4380 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4385 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4386 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4388 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4389 start, end - start, PL_dump_re_max_len);
4391 Perl_re_printf( aTHX_
4392 "%s%s REx%s %s against %s\n",
4393 PL_colors[4], blurb, PL_colors[5], s0, s1);
4395 if (utf8_target||utf8_pat)
4396 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4397 utf8_pat ? "pattern" : "",
4398 utf8_pat && utf8_target ? " and " : "",
4399 utf8_target ? "string" : ""
4405 S_dump_exec_pos(pTHX_ const char *locinput,
4406 const regnode *scan,
4407 const char *loc_regeol,
4408 const char *loc_bostr,
4409 const char *loc_reg_starttry,
4410 const bool utf8_target,
4414 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4415 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4416 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4417 /* The part of the string before starttry has one color
4418 (pref0_len chars), between starttry and current
4419 position another one (pref_len - pref0_len chars),
4420 after the current position the third one.
4421 We assume that pref0_len <= pref_len, otherwise we
4422 decrease pref0_len. */
4423 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4424 ? (5 + taill) - l : locinput - loc_bostr;
4427 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4430 while (UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) {
4434 pref0_len = pref_len - (locinput - loc_reg_starttry);
4435 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4436 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4437 ? (5 + taill) - pref_len : loc_regeol - locinput);
4439 while (UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) {
4445 if (pref0_len > pref_len)
4446 pref0_len = pref_len;
4448 const int is_uni = utf8_target ? 1 : 0;
4450 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4451 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4453 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4454 (locinput - pref_len + pref0_len),
4455 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4457 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4458 locinput, loc_regeol - locinput, 10, 0, 1);
4460 const STRLEN tlen=len0+len1+len2;
4461 Perl_re_printf( aTHX_
4462 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4463 (IV)(locinput - loc_bostr),
4466 (docolor ? "" : "> <"),
4468 (int)(tlen > 19 ? 0 : 19 - tlen),
4476 /* reg_check_named_buff_matched()
4477 * Checks to see if a named buffer has matched. The data array of
4478 * buffer numbers corresponding to the buffer is expected to reside
4479 * in the regexp->data->data array in the slot stored in the ARG() of
4480 * node involved. Note that this routine doesn't actually care about the
4481 * name, that information is not preserved from compilation to execution.
4482 * Returns the index of the leftmost defined buffer with the given name
4483 * or 0 if non of the buffers matched.
4486 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4489 RXi_GET_DECL(rex,rexi);
4490 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4491 I32 *nums=(I32*)SvPVX(sv_dat);
4493 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4495 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4496 if ((I32)rex->lastparen >= nums[n] &&
4497 rex->offs[nums[n]].end != -1)
4506 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4507 struct next_matchable_info * m,
4508 regmatch_info *reginfo)
4510 /* This function determines various characteristics about every possible
4511 * initial match of the passed-in EXACTish <text_node>, and stores them in
4514 * That includes a match string and a parallel mask, such that if you AND
4515 * the target string with the mask and compare with the match string,
4516 * you'll have a pretty good idea, perhaps even perfect, if that portion of
4517 * the target matches or not.
4519 * The motivation behind this function is to allow the caller to set up
4520 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where
4521 * B is an arbitrary EXACTish node. To find the end of .*, we look for the
4522 * beginning oF B, which is the passed in <text_node> That's where this
4523 * function comes in. The values it returns can quickly be used to rule
4524 * out many, or all, cases of possible matches not actually being the
4525 * beginning of B, <text_node>. It is also used in regrepeat() where we
4526 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently
4527 * determine where the span of 'A's stop.
4529 * If <text_node> is of type EXACT, there is only one possible character
4530 * that can match its first character, and so the situation is quite
4531 * simple. But things can get much more complicated if folding is
4532 * involved. It may be that the first character of an EXACTFish node
4533 * doesn't participate in any possible fold, e.g., punctuation, so it can
4534 * be matched only by itself. The vast majority of characters that are in
4535 * folds match just two things, their lower and upper-case equivalents.
4536 * But not all are like that; some have multiple possible matches, or match
4537 * sequences of more than one character. This function sorts all that out.
4539 * It returns information about all possibilities of what the first
4540 * character(s) of <text_node> could look like. Again, if <text_node> is a
4541 * plain EXACT node, that's just the actual first bytes of the first
4542 * character; but otherwise it is the bytes, that when masked, match all
4543 * possible combinations of all the initial bytes of all the characters
4544 * that could match, folded. (Actually, this is a slight over promise. It
4545 * handles only up to the initial 5 bytes, which is enough for all Unicode
4546 * characters, but not for all non-Unicode ones.)
4548 * Here's an example to clarify. Suppose the first character of
4549 * <text_node> is the letter 'C', and we are under /i matching. That means
4550 * 'c' also matches. The representations of these two characters differ in
4551 * just one bit, so the mask would be a zero in that position and ones in
4552 * the other 7. And the returned string would be the AND of these two
4553 * characters, and would be one byte long, since these characters are each
4554 * a single byte. ANDing the target <text_node> with this mask will yield
4555 * the returned string if and only if <text_node> begins with one of these
4556 * two characters. So, the function would also return that the definitive
4557 * length matched is 1 byte.
4559 * Now, suppose instead of the letter 'C', <text_node> begins with the
4560 * letter 'F'. The situation is much more complicated because there are
4561 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4562 * begins with 'f', and hence could match. We add these into the returned
4563 * string and mask, but the result isn't definitive; the caller has to
4564 * check further if its AND and compare pass. But the failure of that
4565 * compare will quickly rule out most possible inputs.
4567 * Much of this could be done in regcomp.c at compile time, except for
4568 * locale-dependent, and UTF-8 target dependent data. Extra data fields
4569 * could be used for one or the other eventualities.
4571 * If this function determines that no possible character in the target
4572 * string can match, it returns FALSE; otherwise TRUE. (The FALSE
4573 * situation occurs if the first character in <text_node> requires UTF-8 to
4574 * represent, and the target string isn't in UTF-8.)
4576 * Some analysis is in GH #18414, located at the time of this writing at:
4577 * https://github.com/Perl/perl5/issues/18414
4580 const bool utf8_target = reginfo->is_utf8_target;
4581 bool utf8_pat = reginfo->is_utf8_pat;
4583 PERL_UINT_FAST8_T i;
4585 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4587 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4588 U8 lengths[MAX_MATCHES] = { 0 };
4590 U8 index_of_longest = 0;
4592 U8 *pat = (U8*)STRING(text_node);
4593 Size_t pat_len = STR_LEN(text_node);
4594 U8 op = OP(text_node);
4596 U8 byte_mask[5] = {0};
4597 U8 byte_anded[5] = {0};
4599 /* There are some folds in Unicode to multiple characters. This will hold
4600 * such characters that could fold to the beginning of 'text_node' */
4601 UV multi_fold_from = 0;
4603 /* We may have to create a modified copy of the pattern */
4604 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4607 m->min_length = 255;
4610 /* Even if the first character in the node can match something in Latin1,
4611 * if there is anything in the node that can't, the match must fail */
4612 if (! utf8_target && isEXACT_REQ8(op)) {
4616 /* Define a temporary op for use in this function, using an existing one that
4617 * should never be a real op during execution */
4618 #define TURKISH PSEUDO
4620 /* What to do about these two nodes had to be deferred to runtime (which is
4621 * now). If the extra information we now have so indicates, turn them into
4623 if ( (op == EXACTF && utf8_target)
4624 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4626 if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4633 /* And certain situations are better handled if we create a modified
4634 * version of the pattern */
4635 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4636 specific problematic characters */
4637 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4639 /* The node could start with characters that are the first ones
4640 * of a multi-character fold. */
4642 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4643 if (multi_fold_from) {
4645 /* Here, they do form a sequence that matches the fold of a
4646 * single character. That single character then is a
4647 * possible match. Below we will look again at this, but
4648 * the code below is expecting every character in the
4649 * pattern to be folded, which the input isn't required to
4650 * be in this case. So, just fold the single character,
4651 * and the result will be in the expected form. */
4652 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4656 /* Turkish has a couple extra possibilities. */
4657 else if ( UNLIKELY(op == TURKISH)
4659 && isALPHA_FOLD_EQ(pat[0], 'f')
4660 && ( memBEGINs(pat + 1, pat_len - 1,
4661 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4663 && isALPHA_FOLD_EQ(pat[1], 'f')
4664 && memBEGINs(pat + 2, pat_len - 2,
4665 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4667 /* The macros for finding a multi-char fold don't include
4668 * the Turkish possibilities, in which U+130 folds to 'i'.
4669 * Hard-code these. It's very unlikely that Unicode will
4670 * ever add any others. */
4671 if (pat[1] == 'f') {
4673 Copy("ffi", mod_pat, pat_len, U8);
4677 Copy("fi", mod_pat, pat_len, U8);
4681 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat)
4682 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4683 && LIKELY(memNEs(pat, pat_len,
4684 LATIN_SMALL_LETTER_SHARP_S_UTF8))
4685 && (LIKELY(op != TURKISH || *pat != 'I')))
4687 /* For all cases of things between 0-255, except the ones
4688 * in the conditional above, the fold is just the lower
4689 * case, which is faster than the more general case. */
4690 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4696 else { /* Code point above 255, or needs special handling */
4697 _to_utf8_fold_flags(pat, pat + pat_len,
4699 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4704 else if /* Below is not a UTF-8 pattern; there's a somewhat different
4705 set of problematic characters */
4707 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4709 /* We may have to canonicalize a multi-char fold, as in the UTF-8
4711 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4715 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4716 mod_pat[0] = mod_pat[1] = 's';
4718 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4719 chars, and speeds copying */
4722 else if (LIKELY(op != TURKISH || *pat != 'I')) {
4723 mod_pat[0] = toLOWER_L1(*pat);
4728 else if /* Below isn't a node that we convert to UTF-8 */
4731 && op == EXACTFAA_NO_TRIE
4732 && *pat == LATIN_SMALL_LETTER_SHARP_S)
4734 /* A very special case. Folding U+DF goes to U+17F under /iaa. We
4735 * did this at compile time when the pattern was UTF-8 , but otherwise
4736 * we couldn't do it earlier, because it requires a UTF-8 target for
4737 * this match to be legal. */
4738 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4739 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4740 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4745 /* Here, we have taken care of the initial work for a few very problematic
4746 * situations, possibly creating a modified pattern.
4748 * Now ready for the general case. We build up all the possible things
4749 * that could match the first character of the pattern into the elements of
4752 * Everything generally matches at least itself. But if there is a
4753 * UTF8ness mismatch, we have to convert to that of the target string. */
4754 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
4755 matches[0][0] = pat[0];
4759 else if (utf8_target) {
4761 lengths[0] = UTF8SKIP(pat);
4762 Copy(pat, matches[0], lengths[0], U8);
4765 else { /* target is UTF-8, pattern isn't */
4766 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4767 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4772 else if (! utf8_pat) { /* Neither is UTF-8 */
4773 matches[0][0] = pat[0];
4777 else /* target isn't UTF-8; pattern is. No match possible unless the
4778 pattern's first character can fit in a byte */
4779 if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4781 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4786 /* Here we have taken care of any necessary node-type changes */
4789 m->max_length = lengths[0];
4790 m->min_length = lengths[0];
4793 /* For non-folding nodes, there are no other possible candidate matches,
4794 * but for foldable ones, we have to look further. */
4795 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4796 UV folded; /* The first character in the pattern, folded */
4797 U32 first_fold_from; /* A character that folds to it */
4798 const U32 * remaining_fold_froms; /* The remaining characters that
4799 fold to it, if any */
4800 Size_t folds_to_count; /* The total number of characters that fold to
4803 /* If the node begins with a sequence of more than one character that
4804 * together form the fold of a single character, it is called a
4805 * 'multi-character fold', and the normal functions don't handle this
4806 * case. We set 'multi_fold_from' to the single folded-from character,
4807 * which is handled in an extra iteration below */
4809 folded = valid_utf8_to_uvchr(pat, NULL);
4811 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4816 /* This may generate illegal combinations for things like EXACTF,
4817 * but rather than repeat the logic and exclude them here, all such
4818 * illegalities are checked for and skipped below in the loop */
4820 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4823 /* Everything matches at least itself; initialize to that because the
4824 * only the branches below that set it are the ones where the number
4828 /* There are a few special cases for locale-dependent nodes, where the
4829 * run-time context was needed before we could know what matched */
4830 if (UNLIKELY(op == EXACTFL) && folded < 256) {
4831 first_fold_from = PL_fold_locale[folded];
4833 else if ( op == EXACTFL && utf8_target && utf8_pat
4834 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4835 LATIN_SMALL_LETTER_LONG_S_UTF8))
4837 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4839 else if (UNLIKELY( op == TURKISH
4840 && ( isALPHA_FOLD_EQ(folded, 'i')
4842 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4843 LATIN_SMALL_LETTER_DOTLESS_I))))
4844 { /* Turkish folding requires special handling */
4846 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4847 else if (folded == 'I')
4848 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4849 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4850 first_fold_from = 'i';
4851 else first_fold_from = 'I';
4854 /* Here, isn't a special case: use the generic function to
4855 * calculate what folds to this */
4857 /* Look up what code points (besides itself) fold to 'folded';
4858 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4859 folds_to_count = _inverse_folds(folded, &first_fold_from,
4860 &remaining_fold_froms);
4863 /* Add each character that folds to 'folded' to the list of them,
4864 * subject to limitations based on the node type and target UTF8ness.
4865 * If there was a character that folded to multiple characters, do an
4866 * extra iteration for it. (Note the extra iteration if there is a
4867 * multi-character fold) */
4868 for (i = 0; i < folds_to_count
4869 + UNLIKELY(multi_fold_from != 0); i++)
4873 if (i >= folds_to_count) { /* Final iteration: handle the
4875 fold_from = multi_fold_from;
4878 fold_from = first_fold_from;
4880 else if (i < folds_to_count) {
4881 fold_from = remaining_fold_froms[i-1];
4884 if (folded == fold_from) { /* We already added the character
4889 /* EXACTF doesn't have any non-ascii folds */
4890 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4894 /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4896 if ( isASCII(folded) != isASCII(fold_from)
4897 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4903 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4904 * locale, but those have been converted to EXACTFU above) */
4906 && (folded < 256) != (fold_from < 256))
4911 /* If this triggers, it likely is because of the unlikely case
4912 * where a new Unicode standard has changed what MAX_MATCHES should
4914 assert(m->count < MAX_MATCHES);
4916 /* Add this character to the list of possible matches */
4918 uvchr_to_utf8(matches[(U8) m->count], fold_from);
4919 lengths[m->count] = UVCHR_SKIP(fold_from);
4922 else { /* Non-UTF8 target: no code point above 255 can appear in it
4924 if (fold_from > 255) {
4928 matches[m->count][0] = fold_from;
4929 lengths[m->count] = 1;
4933 /* Update min and mlengths */
4934 if (m->min_length > lengths[m->count-1]) {
4935 m->min_length = lengths[m->count-1];
4938 if (m->max_length < lengths[m->count-1]) {
4939 index_of_longest = m->count - 1;
4940 m->max_length = lengths[index_of_longest];
4942 } /* looped through each potential fold */
4944 /* If there is something that folded to an initial multi-character
4945 * fold, repeat, using it. This catches some edge cases. An example
4946 * of one is /ss/i when UTF-8 encoded. The function
4947 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4948 * (LATIN SMALL SHARP S). If it returned a list of characters, this
4949 * code wouldn't be needed. But since it doesn't, we have to look what
4950 * folds to the U+DF. In this case, U+1E9E does, and has to be added.
4952 if (multi_fold_from) {
4953 folded = multi_fold_from;
4954 multi_fold_from = 0;
4957 } /* End of finding things that participate in this fold */
4959 if (m->count == 0) { /* If nothing found, can't match */
4964 /* Have calculated all possible matches. Now calculate the mask and AND
4966 m->initial_exact = 0;
4967 m->initial_definitive = 0;
4970 unsigned int mask_ones = 0;
4971 unsigned int possible_ones = 0;
4974 /* For each byte that is in all possible matches ... */
4975 for (j = 0; j < MIN(m->min_length, 5); j++) {
4977 /* Initialize the accumulator for this byte */
4978 byte_mask[j] = 0xFF;
4979 byte_anded[j] = matches[0][j];
4981 /* Then the rest of the rows (folds). The mask is based on, like,
4982 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4983 * where they differ. */
4984 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4985 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]);
4986 byte_anded[j] &= matches[i][j];
4989 /* Keep track of the number of initial mask bytes that are all one
4990 * bits. The code calling this can use this number to know that
4991 * a string that matches this number of bytes in the pattern is an
4992 * exact match of that pattern for this number of bytes. But also
4993 * counted are the number of initial bytes that in total have a
4994 * single zero bit. If a string matches those, masked, it must be
4995 * one of two possibilites, both of which this function has
4996 * determined are legal. (But if that single 0 is one of the
4997 * initial bits for masking a UTF-8 start byte, that could
4998 * incorrectly lead to different length strings appearing to be
4999 * equivalent, so only do this optimization when the matchables are
5000 * all the same length. This was uncovered by testing
5002 if (m->min_length == m->max_length) {
5003 mask_ones += PL_bitcount[byte_mask[j]];
5005 if (mask_ones + 1 >= possible_ones) {
5006 m->initial_definitive++;
5007 if (mask_ones >= possible_ones) {
5015 /* The first byte is separate for speed */
5016 m->first_byte_mask = byte_mask[0];
5017 m->first_byte_anded = byte_anded[0];
5019 /* Then pack up to the next 4 bytes into a word */
5020 m->mask32 = m->anded32 = 0;
5021 for (i = 1; i < MIN(m->min_length, 5); i++) {
5023 U8 shift = (which - 1) * 8;
5024 m->mask32 |= (U32) byte_mask[i] << shift;
5025 m->anded32 |= (U32) byte_anded[i] << shift;
5028 /* Finally, take the match strings and place them sequentially into a
5029 * one-dimensional array. (This is done to save significant space in the
5030 * structure.) Sort so the longest (presumably the least likely) is last.
5031 * XXX When this gets moved to regcomp, may want to fully sort shortest
5032 * first, but above we generally used the folded code point first, and
5033 * those tend to be no longer than their upper case values, so this is
5034 * already pretty well sorted by size.
5036 * If the asserts fail, it's most likely because a new version of the
5037 * Unicode standard requires more space; simply increase the declaration
5041 U8 output_index = 0;
5043 if (m->count > 1) { /* No need to sort a single entry */
5044 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
5046 /* Keep the same order for all but the longest. (If the
5047 * asserts fail, it could be because m->matches is declared too
5048 * short, either because of a new Unicode release, or an
5049 * overlooked test case, or it could be a bug.) */
5050 if (i != index_of_longest) {
5051 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5052 Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5053 cur_pos += lengths[i];
5054 m->lengths[output_index++] = lengths[i];
5059 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5060 Copy(matches[index_of_longest], m->matches + cur_pos,
5061 lengths[index_of_longest], U8);
5063 /* Place the longest match last */
5064 m->lengths[output_index] = lengths[index_of_longest];
5071 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */
5073 S_test_EXACTISH_ST(const char * loc,
5074 struct next_matchable_info info)
5076 /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5077 * bytes starting at 'loc' can match based on 'next_matchable_info' */
5081 /* Check the first byte */
5082 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5085 /* Pack the next up-to-4 bytes into a 32 bit word */
5086 switch (info.min_length) {
5088 input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5091 input32 |= (U8) loc[3] << 2 * 8;
5094 input32 |= (U8) loc[2] << 1 * 8;
5097 input32 |= (U8) loc[1];
5100 return TRUE; /* We already tested and passed the 0th byte */
5105 /* And AND that with the mask and compare that with the assembled ANDED
5107 return (input32 & info.mask32) == info.anded32;
5111 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5113 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5114 * between the inputs. See https://www.unicode.org/reports/tr29/. */
5116 PERL_ARGS_ASSERT_ISGCB;
5118 switch (GCB_table[before][after]) {
5125 case GCB_RI_then_RI:
5128 U8 * temp_pos = (U8 *) curpos;
5130 /* Do not break within emoji flag sequences. That is, do not
5131 * break between regional indicator (RI) symbols if there is an
5132 * odd number of RI characters before the break point.
5133 * GB12 sot (RI RI)* RI × RI
5134 * GB13 [^RI] (RI RI)* RI × RI */
5136 while (backup_one_GCB(strbeg,
5138 utf8_target) == GCB_Regional_Indicator)
5143 return RI_count % 2 != 1;
5146 case GCB_EX_then_EM:
5148 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
5150 U8 * temp_pos = (U8 *) curpos;
5154 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5156 while (prev == GCB_Extend);
5158 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5161 case GCB_Maybe_Emoji_NonBreak:
5165 /* Do not break within emoji modifier sequences or emoji zwj sequences.
5166 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5168 U8 * temp_pos = (U8 *) curpos;
5172 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5174 while (prev == GCB_Extend);
5176 return prev != GCB_ExtPict_XX;
5184 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5185 before, after, GCB_table[before][after]);
5192 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5196 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5198 if (*curpos < strbeg) {
5203 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5204 U8 * prev_prev_char_pos;
5206 if (! prev_char_pos) {
5210 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5211 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5212 *curpos = prev_char_pos;
5213 prev_char_pos = prev_prev_char_pos;
5216 *curpos = (U8 *) strbeg;
5221 if (*curpos - 2 < strbeg) {
5222 *curpos = (U8 *) strbeg;
5226 gcb = getGCB_VAL_CP(*(*curpos - 1));
5232 /* Combining marks attach to most classes that precede them, but this defines
5233 * the exceptions (from TR14) */
5234 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
5235 || prev == LB_Mandatory_Break \
5236 || prev == LB_Carriage_Return \
5237 || prev == LB_Line_Feed \
5238 || prev == LB_Next_Line \
5239 || prev == LB_Space \
5240 || prev == LB_ZWSpace))
5243 S_isLB(pTHX_ LB_enum before,
5245 const U8 * const strbeg,
5246 const U8 * const curpos,
5247 const U8 * const strend,
5248 const bool utf8_target)
5250 U8 * temp_pos = (U8 *) curpos;
5251 LB_enum prev = before;
5253 /* Is the boundary between 'before' and 'after' line-breakable?
5254 * Most of this is just a table lookup of a generated table from Unicode
5255 * rules. But some rules require context to decide, and so have to be
5256 * implemented in code */
5258 PERL_ARGS_ASSERT_ISLB;
5260 /* Rule numbers in the comments below are as of Unicode 9.0 */
5264 switch (LB_table[before][after]) {
5269 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5272 case LB_SP_foo + LB_BREAKABLE:
5273 case LB_SP_foo + LB_NOBREAK:
5274 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5276 /* When we have something following a SP, we have to look at the
5277 * context in order to know what to do.
5279 * SP SP should not reach here because LB7: Do not break before
5280 * spaces. (For two spaces in a row there is nothing that
5281 * overrides that) */
5282 assert(after != LB_Space);
5284 /* Here we have a space followed by a non-space. Mostly this is a
5285 * case of LB18: "Break after spaces". But there are complications
5286 * as the handling of spaces is somewhat tricky. They are in a
5287 * number of rules, which have to be applied in priority order, but
5288 * something earlier in the string can cause a rule to be skipped
5289 * and a lower priority rule invoked. A prime example is LB7 which
5290 * says don't break before a space. But rule LB8 (lower priority)
5291 * says that the first break opportunity after a ZW is after any
5292 * span of spaces immediately after it. If a ZW comes before a SP
5293 * in the input, rule LB8 applies, and not LB7. Other such rules
5294 * involve combining marks which are rules 9 and 10, but they may
5295 * override higher priority rules if they come earlier in the
5296 * string. Since we're doing random access into the middle of the
5297 * string, we have to look for rules that should get applied based
5298 * on both string position and priority. Combining marks do not
5299 * attach to either ZW nor SP, so we don't have to consider them
5302 * To check for LB8, we have to find the first non-space character
5303 * before this span of spaces */
5305 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5307 while (prev == LB_Space);
5309 /* LB8 Break before any character following a zero-width space,
5310 * even if one or more spaces intervene.
5312 * So if we have a ZW just before this span, and to get here this
5313 * is the final space in the span. */
5314 if (prev == LB_ZWSpace) {
5318 /* Here, not ZW SP+. There are several rules that have higher
5319 * priority than LB18 and can be resolved now, as they don't depend
5320 * on anything earlier in the string (except ZW, which we have
5321 * already handled). One of these rules is LB11 Do not break
5322 * before Word joiner, but we have specially encoded that in the
5323 * lookup table so it is caught by the single test below which
5324 * catches the other ones. */
5325 if (LB_table[LB_Space][after] - LB_SP_foo
5326 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5331 /* If we get here, we have to XXX consider combining marks. */
5332 if (prev == LB_Combining_Mark) {
5334 /* What happens with these depends on the character they
5337 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5339 while (prev == LB_Combining_Mark);
5341 /* Most times these attach to and inherit the characteristics
5342 * of that character, but not always, and when not, they are to
5343 * be treated as AL by rule LB10. */
5344 if (! LB_CM_ATTACHES_TO(prev)) {
5345 prev = LB_Alphabetic;
5349 /* Here, we have the character preceding the span of spaces all set
5350 * up. We follow LB18: "Break after spaces" unless the table shows
5351 * that is overriden */
5352 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5356 /* We don't know how to treat the CM except by looking at the first
5357 * non-CM character preceding it. ZWJ is treated as CM */
5359 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5361 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5363 /* Here, 'prev' is that first earlier non-CM character. If the CM
5364 * attatches to it, then it inherits the behavior of 'prev'. If it
5365 * doesn't attach, it is to be treated as an AL */
5366 if (! LB_CM_ATTACHES_TO(prev)) {
5367 prev = LB_Alphabetic;
5372 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5373 case LB_HY_or_BA_then_foo + LB_NOBREAK:
5375 /* LB21a Don't break after Hebrew + Hyphen.
5376 * HL (HY | BA) × */
5378 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5379 == LB_Hebrew_Letter)
5384 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5386 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5387 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5389 /* LB25a (PR | PO) × ( OP | HY )? NU */
5390 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5394 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5397 case LB_SY_or_IS_then_various + LB_BREAKABLE:
5398 case LB_SY_or_IS_then_various + LB_NOBREAK:
5400 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5402 LB_enum temp = prev;
5404 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5406 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5407 if (temp == LB_Numeric) {
5411 return LB_table[prev][after] - LB_SY_or_IS_then_various
5415 case LB_various_then_PO_or_PR + LB_BREAKABLE:
5416 case LB_various_then_PO_or_PR + LB_NOBREAK:
5418 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5420 LB_enum temp = prev;
5421 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5423 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5425 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5426 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5428 if (temp == LB_Numeric) {
5431 return LB_various_then_PO_or_PR;
5434 case LB_RI_then_RI + LB_NOBREAK:
5435 case LB_RI_then_RI + LB_BREAKABLE:
5439 /* LB30a Break between two regional indicator symbols if and
5440 * only if there are an even number of regional indicators
5441 * preceding the position of the break.
5443 * sot (RI RI)* RI × RI
5444 * [^RI] (RI RI)* RI × RI */
5446 while (backup_one_LB(strbeg,
5448 utf8_target) == LB_Regional_Indicator)
5453 return RI_count % 2 == 0;
5461 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5462 before, after, LB_table[before][after]);
5469 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5474 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5476 if (*curpos >= strend) {
5481 *curpos += UTF8SKIP(*curpos);
5482 if (*curpos >= strend) {
5485 lb = getLB_VAL_UTF8(*curpos, strend);
5489 if (*curpos >= strend) {
5492 lb = getLB_VAL_CP(**curpos);
5499 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5503 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5505 if (*curpos < strbeg) {
5510 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5511 U8 * prev_prev_char_pos;
5513 if (! prev_char_pos) {
5517 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5518 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5519 *curpos = prev_char_pos;
5520 prev_char_pos = prev_prev_char_pos;
5523 *curpos = (U8 *) strbeg;
5528 if (*curpos - 2 < strbeg) {
5529 *curpos = (U8 *) strbeg;
5533 lb = getLB_VAL_CP(*(*curpos - 1));
5540 S_isSB(pTHX_ SB_enum before,
5542 const U8 * const strbeg,
5543 const U8 * const curpos,
5544 const U8 * const strend,
5545 const bool utf8_target)
5547 /* returns a boolean indicating if there is a Sentence Boundary Break
5548 * between the inputs. See https://www.unicode.org/reports/tr29/ */
5550 U8 * lpos = (U8 *) curpos;
5551 bool has_para_sep = FALSE;
5552 bool has_sp = FALSE;
5554 PERL_ARGS_ASSERT_ISSB;
5556 /* Break at the start and end of text.
5559 But unstated in Unicode is don't break if the text is empty */
5560 if (before == SB_EDGE || after == SB_EDGE) {
5561 return before != after;
5564 /* SB 3: Do not break within CRLF. */
5565 if (before == SB_CR && after == SB_LF) {
5569 /* Break after paragraph separators. CR and LF are considered
5570 * so because Unicode views text as like word processing text where there
5571 * are no newlines except between paragraphs, and the word processor takes
5572 * care of wrapping without there being hard line-breaks in the text *./
5573 SB4. Sep | CR | LF ÷ */
5574 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5578 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5579 * (See Section 6.2, Replacing Ignore Rules.)
5580 SB5. X (Extend | Format)* → X */
5581 if (after == SB_Extend || after == SB_Format) {
5583 /* Implied is that the these characters attach to everything
5584 * immediately prior to them except for those separator-type
5585 * characters. And the rules earlier have already handled the case
5586 * when one of those immediately precedes the extend char */
5590 if (before == SB_Extend || before == SB_Format) {
5591 U8 * temp_pos = lpos;
5592 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5593 if ( backup != SB_EDGE
5602 /* Here, both 'before' and 'backup' are these types; implied is that we
5603 * don't break between them */
5604 if (backup == SB_Extend || backup == SB_Format) {
5609 /* Do not break after ambiguous terminators like period, if they are
5610 * immediately followed by a number or lowercase letter, if they are
5611 * between uppercase letters, if the first following letter (optionally
5612 * after certain punctuation) is lowercase, or if they are followed by
5613 * "continuation" punctuation such as comma, colon, or semicolon. For
5614 * example, a period may be an abbreviation or numeric period, and thus may
5615 * not mark the end of a sentence.
5617 * SB6. ATerm × Numeric */
5618 if (before == SB_ATerm && after == SB_Numeric) {
5622 /* SB7. (Upper | Lower) ATerm × Upper */
5623 if (before == SB_ATerm && after == SB_Upper) {
5624 U8 * temp_pos = lpos;
5625 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5626 if (backup == SB_Upper || backup == SB_Lower) {
5631 /* The remaining rules that aren't the final one, all require an STerm or
5632 * an ATerm after having backed up over some Close* Sp*, and in one case an
5633 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5634 * So do that backup now, setting flags if either Sp or a paragraph
5635 * separator are found */
5637 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5638 has_para_sep = TRUE;
5639 before = backup_one_SB(strbeg, &lpos, utf8_target);
5642 if (before == SB_Sp) {
5645 before = backup_one_SB(strbeg, &lpos, utf8_target);
5647 while (before == SB_Sp);
5650 while (before == SB_Close) {
5651 before = backup_one_SB(strbeg, &lpos, utf8_target);
5654 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5655 * most cases an STerm */
5656 if (before == SB_STerm || before == SB_ATerm) {
5658 /* So, here the lhs matches
5659 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5660 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5661 * The rules that apply here are:
5663 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5664 | LF | STerm | ATerm) )* Lower
5665 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5666 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5667 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5668 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5671 /* And all but SB11 forbid having seen a paragraph separator */
5672 if (! has_para_sep) {
5673 if (before == SB_ATerm) { /* SB8 */
5674 U8 * rpos = (U8 *) curpos;
5675 SB_enum later = after;
5677 while ( later != SB_OLetter
5678 && later != SB_Upper
5679 && later != SB_Lower
5683 && later != SB_STerm
5684 && later != SB_ATerm
5685 && later != SB_EDGE)
5687 later = advance_one_SB(&rpos, strend, utf8_target);
5689 if (later == SB_Lower) {
5694 if ( after == SB_SContinue /* SB8a */
5695 || after == SB_STerm
5696 || after == SB_ATerm)
5701 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5702 if ( after == SB_Close
5712 /* SB10. This and SB9 could probably be combined some way, but khw
5713 * has decided to follow the Unicode rule book precisely for
5714 * simplified maintenance */
5728 /* Otherwise, do not break.
5735 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5739 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5741 if (*curpos >= strend) {
5747 *curpos += UTF8SKIP(*curpos);
5748 if (*curpos >= strend) {
5751 sb = getSB_VAL_UTF8(*curpos, strend);
5752 } while (sb == SB_Extend || sb == SB_Format);
5757 if (*curpos >= strend) {
5760 sb = getSB_VAL_CP(**curpos);
5761 } while (sb == SB_Extend || sb == SB_Format);
5768 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5772 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5774 if (*curpos < strbeg) {
5779 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5780 if (! prev_char_pos) {
5784 /* Back up over Extend and Format. curpos is always just to the right
5785 * of the characater whose value we are getting */
5787 U8 * prev_prev_char_pos;
5788 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5791 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5792 *curpos = prev_char_pos;
5793 prev_char_pos = prev_prev_char_pos;
5796 *curpos = (U8 *) strbeg;
5799 } while (sb == SB_Extend || sb == SB_Format);
5803 if (*curpos - 2 < strbeg) {
5804 *curpos = (U8 *) strbeg;
5808 sb = getSB_VAL_CP(*(*curpos - 1));
5809 } while (sb == SB_Extend || sb == SB_Format);
5816 S_isWB(pTHX_ WB_enum previous,
5819 const U8 * const strbeg,
5820 const U8 * const curpos,
5821 const U8 * const strend,
5822 const bool utf8_target)
5824 /* Return a boolean as to if the boundary between 'before' and 'after' is
5825 * a Unicode word break, using their published algorithm, but tailored for
5826 * Perl by treating spans of white space as one unit. Context may be
5827 * needed to make this determination. If the value for the character
5828 * before 'before' is known, it is passed as 'previous'; otherwise that
5829 * should be set to WB_UNKNOWN. The other input parameters give the
5830 * boundaries and current position in the matching of the string. That
5831 * is, 'curpos' marks the position where the character whose wb value is
5832 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5834 U8 * before_pos = (U8 *) curpos;
5835 U8 * after_pos = (U8 *) curpos;
5836 WB_enum prev = before;
5839 PERL_ARGS_ASSERT_ISWB;
5841 /* Rule numbers in the comments below are as of Unicode 9.0 */
5845 switch (WB_table[before][after]) {
5852 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5853 next = advance_one_WB(&after_pos, strend, utf8_target,
5854 FALSE /* Don't skip Extend nor Format */ );
5855 /* A space immediately preceding an Extend or Format is attached
5856 * to by them, and hence gets separated from previous spaces.
5857 * Otherwise don't break between horizontal white space */
5858 return next == WB_Extend || next == WB_Format;
5860 /* WB4 Ignore Format and Extend characters, except when they appear at
5861 * the beginning of a region of text. This code currently isn't
5862 * general purpose, but it works as the rules are currently and likely
5863 * to be laid out. The reason it works is that when 'they appear at
5864 * the beginning of a region of text', the rule is to break before
5865 * them, just like any other character. Therefore, the default rule
5866 * applies and we don't have to look in more depth. Should this ever
5867 * change, we would have to have 2 'case' statements, like in the rules
5868 * below, and backup a single character (not spacing over the extend
5869 * ones) and then see if that is one of the region-end characters and
5871 case WB_Ex_or_FO_or_ZWJ_then_foo:
5872 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5875 case WB_DQ_then_HL + WB_BREAKABLE:
5876 case WB_DQ_then_HL + WB_NOBREAK:
5878 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5880 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5881 == WB_Hebrew_Letter)
5886 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5888 case WB_HL_then_DQ + WB_BREAKABLE:
5889 case WB_HL_then_DQ + WB_NOBREAK:
5891 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5893 if (advance_one_WB(&after_pos, strend, utf8_target,
5894 TRUE /* Do skip Extend and Format */ )
5895 == WB_Hebrew_Letter)
5900 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5902 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5903 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5905 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5906 * | Single_Quote) (ALetter | Hebrew_Letter) */
5908 next = advance_one_WB(&after_pos, strend, utf8_target,
5909 TRUE /* Do skip Extend and Format */ );
5911 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5916 return WB_table[before][after]
5917 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5919 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5920 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5922 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5923 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5925 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5926 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5931 return WB_table[before][after]
5932 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5934 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5935 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5937 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5940 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5946 return WB_table[before][after]
5947 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5949 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5950 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5952 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5954 if (advance_one_WB(&after_pos, strend, utf8_target,
5955 TRUE /* Do skip Extend and Format */ )
5961 return WB_table[before][after]
5962 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5964 case WB_RI_then_RI + WB_NOBREAK:
5965 case WB_RI_then_RI + WB_BREAKABLE:
5969 /* Do not break within emoji flag sequences. That is, do not
5970 * break between regional indicator (RI) symbols if there is an
5971 * odd number of RI characters before the potential break
5974 * WB15 sot (RI RI)* RI × RI
5975 * WB16 [^RI] (RI RI)* RI × RI */
5977 while (backup_one_WB(&previous,
5980 utf8_target) == WB_Regional_Indicator)
5985 return RI_count % 2 != 1;
5993 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5994 before, after, WB_table[before][after]);
6001 S_advance_one_WB(pTHX_ U8 ** curpos,
6002 const U8 * const strend,
6003 const bool utf8_target,
6004 const bool skip_Extend_Format)
6008 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
6010 if (*curpos >= strend) {
6016 /* Advance over Extend and Format */
6018 *curpos += UTF8SKIP(*curpos);
6019 if (*curpos >= strend) {
6022 wb = getWB_VAL_UTF8(*curpos, strend);
6023 } while ( skip_Extend_Format
6024 && (wb == WB_Extend || wb == WB_Format));
6029 if (*curpos >= strend) {
6032 wb = getWB_VAL_CP(**curpos);
6033 } while ( skip_Extend_Format
6034 && (wb == WB_Extend || wb == WB_Format));
6041 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
6045 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
6047 /* If we know what the previous character's break value is, don't have
6049 if (*previous != WB_UNKNOWN) {
6052 /* But we need to move backwards by one */
6054 *curpos = reghopmaybe3(*curpos, -1, strbeg);
6056 *previous = WB_EDGE;
6057 *curpos = (U8 *) strbeg;
6060 *previous = WB_UNKNOWN;
6065 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6068 /* And we always back up over these three types */
6069 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6074 if (*curpos < strbeg) {
6079 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6080 if (! prev_char_pos) {
6084 /* Back up over Extend and Format. curpos is always just to the right
6085 * of the characater whose value we are getting */
6087 U8 * prev_prev_char_pos;
6088 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6092 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6093 *curpos = prev_char_pos;
6094 prev_char_pos = prev_prev_char_pos;
6097 *curpos = (U8 *) strbeg;
6100 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6104 if (*curpos - 2 < strbeg) {
6105 *curpos = (U8 *) strbeg;
6109 wb = getWB_VAL_CP(*(*curpos - 1));
6110 } while (wb == WB_Extend || wb == WB_Format);
6116 /* Macros for regmatch(), using its internal variables */
6117 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6118 #define NEXTCHR_IS_EOS (nextbyte < 0)
6120 #define SET_nextchr \
6121 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6123 #define SET_locinput(p) \
6127 #define sayYES goto yes
6128 #define sayNO goto no
6129 #define sayNO_SILENT goto no_silent
6131 /* we dont use STMT_START/END here because it leads to
6132 "unreachable code" warnings, which are bogus, but distracting. */
6133 #define CACHEsayNO \
6134 if (ST.cache_mask) \
6135 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6138 #define EVAL_CLOSE_PAREN_IS(st,expr) \
6141 ( ( st )->u.eval.close_paren ) && \
6142 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6145 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
6148 ( ( st )->u.eval.close_paren ) && \
6150 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6154 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6155 (st)->u.eval.close_paren = ( (expr) + 1 )
6157 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6158 (st)->u.eval.close_paren = 0
6160 /* push a new state then goto it */
6162 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
6163 pushinput = input; \
6167 st->resume_state = state; \
6170 /* push a new state with success backtracking, then goto it */
6172 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
6173 pushinput = input; \
6177 st->resume_state = state; \
6178 goto push_yes_state;
6180 #define DEBUG_STATE_pp(pp) \
6182 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6183 Perl_re_printf( aTHX_ \
6184 "%*s" pp " %s%s%s%s%s\n", \
6185 INDENT_CHARS(depth), "", \
6186 REGNODE_NAME(st->resume_state), \
6187 ((st==yes_state||st==mark_state) ? "[" : ""), \
6188 ((st==yes_state) ? "Y" : ""), \
6189 ((st==mark_state) ? "M" : ""), \
6190 ((st==yes_state||st==mark_state) ? "]" : "") \
6196 regmatch() - main matching routine
6198 This is basically one big switch statement in a loop. We execute an op,
6199 set 'next' to point the next op, and continue. If we come to a point which
6200 we may need to backtrack to on failure such as (A|B|C), we push a
6201 backtrack state onto the backtrack stack. On failure, we pop the top
6202 state, and re-enter the loop at the state indicated. If there are no more
6203 states to pop, we return failure.
6205 Sometimes we also need to backtrack on success; for example /A+/, where
6206 after successfully matching one A, we need to go back and try to
6207 match another one; similarly for lookahead assertions: if the assertion
6208 completes successfully, we backtrack to the state just before the assertion
6209 and then carry on. In these cases, the pushed state is marked as
6210 'backtrack on success too'. This marking is in fact done by a chain of
6211 pointers, each pointing to the previous 'yes' state. On success, we pop to
6212 the nearest yes state, discarding any intermediate failure-only states.
6213 Sometimes a yes state is pushed just to force some cleanup code to be
6214 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6215 it to free the inner regex.
6217 Note that failure backtracking rewinds the cursor position, while
6218 success backtracking leaves it alone.
6220 A pattern is complete when the END op is executed, while a subpattern
6221 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6222 ops trigger the "pop to last yes state if any, otherwise return true"
6225 A common convention in this function is to use A and B to refer to the two
6226 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6227 the subpattern to be matched possibly multiple times, while B is the entire
6228 rest of the pattern. Variable and state names reflect this convention.
6230 The states in the main switch are the union of ops and failure/success of
6231 substates associated with that op. For example, IFMATCH is the op
6232 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6233 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6234 successfully matched A and IFMATCH_A_fail is a state saying that we have
6235 just failed to match A. Resume states always come in pairs. The backtrack
6236 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6237 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6238 on success or failure.
6240 The struct that holds a backtracking state is actually a big union, with
6241 one variant for each major type of op. The variable st points to the
6242 top-most backtrack struct. To make the code clearer, within each
6243 block of code we #define ST to alias the relevant union.
6245 Here's a concrete example of a (vastly oversimplified) IFMATCH
6251 #define ST st->u.ifmatch
6253 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6254 ST.foo = ...; // some state we wish to save
6256 // push a yes backtrack state with a resume value of
6257 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6259 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6262 case IFMATCH_A: // we have successfully executed A; now continue with B
6264 bar = ST.foo; // do something with the preserved value
6267 case IFMATCH_A_fail: // A failed, so the assertion failed
6268 ...; // do some housekeeping, then ...
6269 sayNO; // propagate the failure
6276 For any old-timers reading this who are familiar with the old recursive
6277 approach, the code above is equivalent to:
6279 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6288 ...; // do some housekeeping, then ...
6289 sayNO; // propagate the failure
6292 The topmost backtrack state, pointed to by st, is usually free. If you
6293 want to claim it, populate any ST.foo fields in it with values you wish to
6294 save, then do one of
6296 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6297 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6299 which sets that backtrack state's resume value to 'resume_state', pushes a
6300 new free entry to the top of the backtrack stack, then goes to 'node'.
6301 On backtracking, the free slot is popped, and the saved state becomes the
6302 new free state. An ST.foo field in this new top state can be temporarily
6303 accessed to retrieve values, but once the main loop is re-entered, it
6304 becomes available for reuse.
6306 Note that the depth of the backtrack stack constantly increases during the
6307 left-to-right execution of the pattern, rather than going up and down with
6308 the pattern nesting. For example the stack is at its maximum at Z at the
6309 end of the pattern, rather than at X in the following:
6311 /(((X)+)+)+....(Y)+....Z/
6313 The only exceptions to this are lookahead/behind assertions and the cut,
6314 (?>A), which pop all the backtrack states associated with A before
6317 Backtrack state structs are allocated in slabs of about 4K in size.
6318 PL_regmatch_state and st always point to the currently active state,
6319 and PL_regmatch_slab points to the slab currently containing
6320 PL_regmatch_state. The first time regmatch() is called, the first slab is
6321 allocated, and is never freed until interpreter destruction. When the slab
6322 is full, a new one is allocated and chained to the end. At exit from
6323 regmatch(), slabs allocated since entry are freed.
6325 In order to work with variable length lookbehinds, an upper limit is placed on
6326 lookbehinds which is set to where the match position is at the end of where the
6327 lookbehind would get to. Nothing in the lookbehind should match above that,
6328 except we should be able to look beyond if for things like \b, which need the
6329 next character in the string to be able to determine if this is a boundary or
6330 not. We also can't match the end of string/line unless we are also at the end
6331 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6332 that match a width, we have to add a condition that they are within the legal
6333 bounds of our window into the string.
6337 /* returns -1 on failure, $+[0] on success */
6339 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6341 const bool utf8_target = reginfo->is_utf8_target;
6342 const U32 uniflags = UTF8_ALLOW_DEFAULT;
6343 REGEXP *rex_sv = reginfo->prog;
6344 regexp *rex = ReANY(rex_sv);
6345 RXi_GET_DECL(rex,rexi);
6346 /* the current state. This is a cached copy of PL_regmatch_state */
6348 /* cache heavy used fields of st in registers */
6351 U32 n = 0; /* general value; init to avoid compiler warning */
6352 U32 utmp = 0; /* tmp variable - valid for at most one opcode */
6353 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
6354 SSize_t endref = 0; /* offset of end of backref when ln is start */
6355 char *locinput = startpos;
6356 char *loceol = reginfo->strend;
6357 char *pushinput; /* where to continue after a PUSH */
6358 char *pusheol; /* where to stop matching (loceol) after a PUSH */
6359 U8 *pushsr0; /* save starting pos of script run */
6360 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1
6363 bool result = 0; /* return value of S_regmatch */
6364 U32 depth = 0; /* depth of backtrack stack */
6365 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6366 const U32 max_nochange_depth =
6367 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6368 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6369 regmatch_state *yes_state = NULL; /* state to pop to on success of
6371 /* mark_state piggy backs on the yes_state logic so that when we unwind
6372 the stack on success we can update the mark_state as we go */
6373 regmatch_state *mark_state = NULL; /* last mark state we have seen */
6374 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6375 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
6377 bool no_final = 0; /* prevent failure from backtracking? */
6378 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
6379 char *startpoint = locinput;
6380 SV *popmark = NULL; /* are we looking for a mark? */
6381 SV *sv_commit = NULL; /* last mark name seen in failure */
6382 SV *sv_yes_mark = NULL; /* last mark name we have seen
6383 during a successful match */
6384 U32 lastopen = 0; /* last open we saw */
6385 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6386 SV* const oreplsv = GvSVn(PL_replgv);
6387 /* these three flags are set by various ops to signal information to
6388 * the very next op. They have a useful lifetime of exactly one loop
6389 * iteration, and are not preserved or restored by state pushes/pops
6391 bool sw = 0; /* the condition value in (?(cond)a|b) */
6392 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
6393 int logical = 0; /* the following EVAL is:
6397 or the following IFMATCH/UNLESSM is:
6398 false: plain (?=foo)
6399 true: used as a condition: (?(?=foo))
6401 PAD* last_pad = NULL;
6403 U8 gimme = G_SCALAR;
6404 CV *caller_cv = NULL; /* who called us */
6405 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
6406 U32 maxopenparen = 0; /* max '(' index seen so far */
6407 int to_complement; /* Invert the result? */
6408 char_class_number_ classnum;
6409 bool is_utf8_pat = reginfo->is_utf8_pat;
6411 I32 orig_savestack_ix = PL_savestack_ix;
6412 U8 * script_run_begin = NULL;
6413 char *match_end= NULL; /* where a match MUST end to be considered successful */
6414 bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6415 re_fold_t folder = NULL; /* used by various EXACTish regops */
6416 const U8 * fold_array = NULL; /* used by various EXACTish regops */
6418 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6419 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6420 # define SOLARIS_BAD_OPTIMIZER
6421 const U32 *pl_charclass_dup = PL_charclass;
6422 # define PL_charclass pl_charclass_dup
6426 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6429 /* protect against undef(*^R) */
6430 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6432 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6433 multicall_oldcatch = 0;
6434 PERL_UNUSED_VAR(multicall_cop);
6436 PERL_ARGS_ASSERT_REGMATCH;
6438 st = PL_regmatch_state;
6440 /* Note that nextbyte is a byte even in UTF */
6444 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6445 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6446 Perl_re_printf( aTHX_ "regmatch start\n" );
6449 while (scan != NULL) {
6450 next = scan + NEXT_OFF(scan);
6453 state_num = OP(scan);
6457 if (state_num <= REGNODE_MAX) {
6458 SV * const prop = sv_newmortal();
6459 regnode *rnext = regnext(scan);
6461 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6462 regprop(rex, prop, scan, reginfo, NULL);
6463 Perl_re_printf( aTHX_
6464 "%*s%" IVdf ":%s(%" IVdf ")\n",
6465 INDENT_CHARS(depth), "",
6466 (IV)(scan - rexi->program),
6468 (REGNODE_TYPE(OP(scan)) == END || !rnext) ?
6469 0 : (IV)(rnext - rexi->program));
6476 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6478 switch (state_num) {
6481 case SBOL: /* /^../ and /\A../ */
6482 if (locinput == reginfo->strbeg)
6486 case MBOL: /* /^../m */
6487 if (locinput == reginfo->strbeg ||
6488 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6495 if (locinput == reginfo->ganch)
6499 case KEEPS: /* \K */
6500 /* update the startpoint */
6501 st->u.keeper.val = rex->offs[0].start;
6502 rex->offs[0].start = locinput - reginfo->strbeg;
6503 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6505 NOT_REACHED; /* NOTREACHED */
6507 case KEEPS_next_fail:
6508 /* rollback the start point change */
6509 rex->offs[0].start = st->u.keeper.val;
6511 NOT_REACHED; /* NOTREACHED */
6513 case MEOL: /* /..$/m */
6514 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6518 case SEOL: /* /..$/ */
6519 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6521 if (reginfo->strend - locinput > 1)
6526 if (!NEXTCHR_IS_EOS)
6530 case SANY: /* /./s */
6531 if (NEXTCHR_IS_EOS || locinput >= loceol)
6533 goto increment_locinput;
6535 case REG_ANY: /* /./ */
6537 || locinput >= loceol
6538 || nextbyte == '\n')
6542 goto increment_locinput;
6546 #define ST st->u.trie
6547 case TRIEC: /* (ab|cd) with known charclass */
6548 /* In this case the charclass data is available inline so
6549 we can fail fast without a lot of extra overhead.
6551 if ( ! NEXTCHR_IS_EOS
6552 && locinput < loceol
6553 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6556 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6557 depth, PL_colors[4], PL_colors[5])
6560 NOT_REACHED; /* NOTREACHED */
6563 case TRIE: /* (ab|cd) */
6564 /* the basic plan of execution of the trie is:
6565 * At the beginning, run though all the states, and
6566 * find the longest-matching word. Also remember the position
6567 * of the shortest matching word. For example, this pattern:
6570 * when matched against the string "abcde", will generate
6571 * accept states for all words except 3, with the longest
6572 * matching word being 4, and the shortest being 2 (with
6573 * the position being after char 1 of the string).
6575 * Then for each matching word, in word order (i.e. 1,2,4,5),
6576 * we run the remainder of the pattern; on each try setting
6577 * the current position to the character following the word,
6578 * returning to try the next word on failure.
6580 * We avoid having to build a list of words at runtime by
6581 * using a compile-time structure, wordinfo[].prev, which
6582 * gives, for each word, the previous accepting word (if any).
6583 * In the case above it would contain the mappings 1->2, 2->0,
6584 * 3->0, 4->5, 5->1. We can use this table to generate, from
6585 * the longest word (4 above), a list of all words, by
6586 * following the list of prev pointers; this gives us the
6587 * unordered list 4,5,1,2. Then given the current word we have
6588 * just tried, we can go through the list and find the
6589 * next-biggest word to try (so if we just failed on word 2,
6590 * the next in the list is 4).
6592 * Since at runtime we don't record the matching position in
6593 * the string for each word, we have to work that out for
6594 * each word we're about to process. The wordinfo table holds
6595 * the character length of each word; given that we recorded
6596 * at the start: the position of the shortest word and its
6597 * length in chars, we just need to move the pointer the
6598 * difference between the two char lengths. Depending on
6599 * Unicode status and folding, that's cheap or expensive.
6601 * This algorithm is optimised for the case where are only a
6602 * small number of accept states, i.e. 0,1, or maybe 2.
6603 * With lots of accepts states, and having to try all of them,
6604 * it becomes quadratic on number of accept states to find all
6609 /* what type of TRIE am I? (utf8 makes this contextual) */
6610 DECL_TRIE_TYPE(scan);
6612 /* what trie are we using right now */
6613 reg_trie_data * const trie
6614 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6615 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6616 U32 state = trie->startstate;
6618 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6619 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
6622 && UTF8_IS_ABOVE_LATIN1(nextbyte)
6623 && scan->flags == EXACTL)
6625 /* We only output for EXACTL, as we let the folder
6626 * output this message for EXACTFLU8 to avoid
6628 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6634 || locinput >= loceol
6635 || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6637 if (trie->states[ state ].wordnum) {
6639 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
6640 depth, PL_colors[4], PL_colors[5])
6646 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6647 depth, PL_colors[4], PL_colors[5])
6654 U8 *uc = ( U8* )locinput;
6658 U8 *uscan = (U8*)NULL;
6659 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6660 U32 charcount = 0; /* how many input chars we have matched */
6661 U32 accepted = 0; /* have we seen any accepting states? */
6663 ST.jump = trie->jump;
6666 ST.longfold = FALSE; /* char longer if folded => it's harder */
6669 /* fully traverse the TRIE; note the position of the
6670 shortest accept state and the wordnum of the longest
6673 while ( state && uc <= (U8*)(loceol) ) {
6674 U32 base = trie->states[ state ].trans.base;
6678 wordnum = trie->states[ state ].wordnum;
6680 if (wordnum) { /* it's an accept state */
6683 /* record first match position */
6685 ST.firstpos = (U8*)locinput;
6690 ST.firstchars = charcount;
6693 if (!ST.nextword || wordnum < ST.nextword)
6694 ST.nextword = wordnum;
6695 ST.topword = wordnum;
6698 DEBUG_TRIE_EXECUTE_r({
6699 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6701 PerlIO_printf( Perl_debug_log,
6702 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6703 INDENT_CHARS(depth), "", PL_colors[4],
6704 (UV)state, (accepted ? 'Y' : 'N'));
6707 /* read a char and goto next state */
6708 if ( base && (foldlen || uc < (U8*)(loceol))) {
6710 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6711 (U8 *) loceol, uscan,
6712 len, uvc, charid, foldlen,
6719 base + charid - 1 - trie->uniquecharcount)) >= 0)
6721 && ((U32)offset < trie->lasttrans)
6722 && trie->trans[offset].check == state)
6724 state = trie->trans[offset].next;
6735 DEBUG_TRIE_EXECUTE_r(
6736 Perl_re_printf( aTHX_
6737 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6738 charid, uvc, (UV)state, PL_colors[5] );
6744 /* calculate total number of accept states */
6749 w = trie->wordinfo[w].prev;
6752 ST.accepted = accepted;
6756 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6758 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6760 goto trie_first_try; /* jump into the fail handler */
6762 NOT_REACHED; /* NOTREACHED */
6764 case TRIE_next_fail: /* we failed - try next alternative */
6768 /* undo any captures done in the tail part of a branch,
6770 * /(?:X(.)(.)|Y(.)).../
6771 * where the trie just matches X then calls out to do the
6772 * rest of the branch */
6773 REGCP_UNWIND(ST.cp);
6774 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6776 if (!--ST.accepted) {
6778 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6786 /* Find next-highest word to process. Note that this code
6787 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6790 U16 const nextword = ST.nextword;
6791 reg_trie_wordinfo * const wordinfo
6792 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6793 for (word=ST.topword; word; word=wordinfo[word].prev) {
6794 if (word > nextword && (!min || word < min))
6807 ST.lastparen = rex->lastparen;
6808 ST.lastcloseparen = rex->lastcloseparen;
6812 /* find start char of end of current word */
6814 U32 chars; /* how many chars to skip */
6815 reg_trie_data * const trie
6816 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6818 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6820 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6825 /* the hard option - fold each char in turn and find
6826 * its folded length (which may be different */
6827 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6835 /* XXX This assumes the length is well-formed, as
6836 * does the UTF8SKIP below */
6837 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6845 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6850 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6859 uc = utf8_hop(uc, chars);
6865 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6866 ? ST.jump[ST.nextword]
6870 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6878 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6879 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6881 NOT_REACHED; /* NOTREACHED */
6883 /* only one choice left - just continue */
6885 AV *const trie_words
6886 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6887 SV ** const tmp = trie_words
6888 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6889 SV *sv= tmp ? sv_newmortal() : NULL;
6891 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6892 depth, PL_colors[4],
6894 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6895 PL_colors[0], PL_colors[1],
6896 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6898 : "not compiled under -Dr",
6902 locinput = (char*)uc;
6903 continue; /* execute rest of RE */
6909 if (! utf8_target) {
6919 ln = STR_LENl(scan);
6920 goto join_short_long_exact;
6922 case EXACTL: /* /abc/l */
6923 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
6925 /* Complete checking would involve going through every character
6926 * matched by the string to see if any is above latin1. But the
6927 * comparision otherwise might very well be a fast assembly
6928 * language routine, and I (khw) don't think slowing things down
6929 * just to check for this warning is worth it. So this just checks
6930 * the first character */
6931 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6932 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6936 if (! utf8_target) {
6941 case EXACT: /* /abc/ */
6944 ln = STR_LENs(scan);
6946 join_short_long_exact:
6947 if (utf8_target != is_utf8_pat) {
6948 /* The target and the pattern have differing utf8ness. */
6950 const char * const e = s + ln;
6953 /* The target is utf8, the pattern is not utf8.
6954 * Above-Latin1 code points can't match the pattern;
6955 * invariants match exactly, and the other Latin1 ones need
6956 * to be downgraded to a single byte in order to do the
6957 * comparison. (If we could be confident that the target
6958 * is not malformed, this could be refactored to have fewer
6959 * tests by just assuming that if the first bytes match, it
6960 * is an invariant, but there are tests in the test suite
6961 * dealing with (??{...}) which violate this) */
6964 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6968 if (UTF8_IS_INVARIANT(*(U8*)l)) {
6975 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6985 /* The target is not utf8, the pattern is utf8. */
6988 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6992 if (UTF8_IS_INVARIANT(*(U8*)s)) {
6999 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
7011 /* The target and the pattern have the same utf8ness. */
7012 /* Inline the first character, for speed. */
7013 if ( loceol - locinput < ln
7014 || UCHARAT(s) != nextbyte
7015 || (ln > 1 && memNE(s, locinput, ln)))
7024 case EXACTFL: /* /abc/il */
7027 U32 fold_utf8_flags;
7029 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7030 folder = Perl_foldEQ_locale;
7031 fold_array = PL_fold_locale;
7032 fold_utf8_flags = FOLDEQ_LOCALE;
7035 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
7036 is effectively /u; hence to match, target
7038 if (! utf8_target) {
7041 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
7042 | FOLDEQ_S2_FOLDS_SANE;
7043 folder = S_foldEQ_latin1_s2_folded;
7044 fold_array = PL_fold_latin1;
7047 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
7048 if (! utf8_target) {
7051 assert(is_utf8_pat);
7052 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7054 /* this is only used in an assert check, so we restrict it to DEBUGGING mode.
7055 * In theory neither of these variables should be used in this mode. */
7061 case EXACTFUP: /* /foo/iu, and something is problematic in
7062 'foo' so can't take shortcuts. */
7063 assert(! is_utf8_pat);
7064 folder = Perl_foldEQ_latin1;
7065 fold_array = PL_fold_latin1;
7066 fold_utf8_flags = 0;
7069 case EXACTFU: /* /abc/iu */
7070 folder = S_foldEQ_latin1_s2_folded;
7071 fold_array = PL_fold_latin1;
7072 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7075 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
7077 assert(! is_utf8_pat);
7079 case EXACTFAA: /* /abc/iaa */
7080 folder = S_foldEQ_latin1_s2_folded;
7081 fold_array = PL_fold_latin1;
7082 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7083 if (is_utf8_pat || ! utf8_target) {
7085 /* The possible presence of a MICRO SIGN in the pattern forbids
7086 * us to view a non-UTF-8 pattern as folded when there is a
7088 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7089 |FOLDEQ_S2_FOLDS_SANE;
7094 case EXACTF: /* /abc/i This node only generated for
7095 non-utf8 patterns */
7096 assert(! is_utf8_pat);
7097 folder = Perl_foldEQ;
7098 fold_array = PL_fold;
7099 fold_utf8_flags = 0;
7103 ln = STR_LENs(scan);
7107 || state_num == EXACTFUP
7108 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7110 /* Either target or the pattern are utf8, or has the issue where
7111 * the fold lengths may differ. */
7112 const char * const l = locinput;
7115 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
7116 s, 0, ln, is_utf8_pat,fold_utf8_flags))
7124 /* Neither the target nor the pattern are utf8 */
7126 if (UCHARAT(s) != nextbyte
7128 && UCHARAT(s) != fold_array[nextbyte])
7132 if (loceol - locinput < ln)
7135 if (ln > 1 && ! folder(aTHX_ locinput, s, ln))
7141 case NBOUNDL: /* /\B/l */
7145 case BOUNDL: /* /\b/l */
7148 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7150 if (FLAGS(scan) != TRADITIONAL_BOUND) {
7151 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7156 if (locinput == reginfo->strbeg)
7157 b1 = isWORDCHAR_LC('\n');
7159 U8 *p = reghop3((U8*)locinput, -1,
7160 (U8*)(reginfo->strbeg));
7161 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7163 b2 = (NEXTCHR_IS_EOS)
7164 ? isWORDCHAR_LC('\n')
7165 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7166 (U8*) reginfo->strend);
7168 else { /* Here the string isn't utf8 */
7169 b1 = (locinput == reginfo->strbeg)
7170 ? isWORDCHAR_LC('\n')
7171 : isWORDCHAR_LC(UCHARAT(locinput - 1));
7172 b2 = (NEXTCHR_IS_EOS)
7173 ? isWORDCHAR_LC('\n')
7174 : isWORDCHAR_LC(nextbyte);
7176 if (to_complement ^ (b1 == b2)) {
7182 case NBOUND: /* /\B/ */
7186 case BOUND: /* /\b/ */
7190 goto bound_ascii_match_only;
7192 case NBOUNDA: /* /\B/a */
7196 case BOUNDA: /* /\b/a */
7200 bound_ascii_match_only:
7201 /* Here the string isn't utf8, or is utf8 and only ascii characters
7202 * are to match \w. In the latter case looking at the byte just
7203 * prior to the current one may be just the final byte of a
7204 * multi-byte character. This is ok. There are two cases:
7205 * 1) it is a single byte character, and then the test is doing
7206 * just what it's supposed to.
7207 * 2) it is a multi-byte character, in which case the final byte is
7208 * never mistakable for ASCII, and so the test will say it is
7209 * not a word character, which is the correct answer. */
7210 b1 = (locinput == reginfo->strbeg)
7211 ? isWORDCHAR_A('\n')
7212 : isWORDCHAR_A(UCHARAT(locinput - 1));
7213 b2 = (NEXTCHR_IS_EOS)
7214 ? isWORDCHAR_A('\n')
7215 : isWORDCHAR_A(nextbyte);
7216 if (to_complement ^ (b1 == b2)) {
7222 case NBOUNDU: /* /\B/u */
7226 case BOUNDU: /* /\b/u */
7229 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7232 else if (utf8_target) {
7234 switch((bound_type) FLAGS(scan)) {
7235 case TRADITIONAL_BOUND:
7238 if (locinput == reginfo->strbeg) {
7239 b1 = 0 /* isWORDCHAR_L1('\n') */;
7242 U8 *p = reghop3((U8*)locinput, -1,
7243 (U8*)(reginfo->strbeg));
7245 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7247 b2 = (NEXTCHR_IS_EOS)
7248 ? 0 /* isWORDCHAR_L1('\n') */
7249 : isWORDCHAR_utf8_safe((U8*)locinput,
7250 (U8*) reginfo->strend);
7251 match = cBOOL(b1 != b2);
7255 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7256 match = TRUE; /* GCB always matches at begin and
7260 /* Find the gcb values of previous and current
7261 * chars, then see if is a break point */
7262 match = isGCB(getGCB_VAL_UTF8(
7263 reghop3((U8*)locinput,
7265 (U8*)(reginfo->strbeg)),
7266 (U8*) reginfo->strend),
7267 getGCB_VAL_UTF8((U8*) locinput,
7268 (U8*) reginfo->strend),
7269 (U8*) reginfo->strbeg,
7276 if (locinput == reginfo->strbeg) {
7279 else if (NEXTCHR_IS_EOS) {
7283 match = isLB(getLB_VAL_UTF8(
7284 reghop3((U8*)locinput,
7286 (U8*)(reginfo->strbeg)),
7287 (U8*) reginfo->strend),
7288 getLB_VAL_UTF8((U8*) locinput,
7289 (U8*) reginfo->strend),
7290 (U8*) reginfo->strbeg,
7292 (U8*) reginfo->strend,
7297 case SB_BOUND: /* Always matches at begin and end */
7298 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7302 match = isSB(getSB_VAL_UTF8(
7303 reghop3((U8*)locinput,
7305 (U8*)(reginfo->strbeg)),
7306 (U8*) reginfo->strend),
7307 getSB_VAL_UTF8((U8*) locinput,
7308 (U8*) reginfo->strend),
7309 (U8*) reginfo->strbeg,
7311 (U8*) reginfo->strend,
7317 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7321 match = isWB(WB_UNKNOWN,
7323 reghop3((U8*)locinput,
7325 (U8*)(reginfo->strbeg)),
7326 (U8*) reginfo->strend),
7327 getWB_VAL_UTF8((U8*) locinput,
7328 (U8*) reginfo->strend),
7329 (U8*) reginfo->strbeg,
7331 (U8*) reginfo->strend,
7337 else { /* Not utf8 target */
7338 switch((bound_type) FLAGS(scan)) {
7339 case TRADITIONAL_BOUND:
7342 b1 = (locinput == reginfo->strbeg)
7343 ? 0 /* isWORDCHAR_L1('\n') */
7344 : isWORDCHAR_L1(UCHARAT(locinput - 1));
7345 b2 = (NEXTCHR_IS_EOS)
7346 ? 0 /* isWORDCHAR_L1('\n') */
7347 : isWORDCHAR_L1(nextbyte);
7348 match = cBOOL(b1 != b2);
7353 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7354 match = TRUE; /* GCB always matches at begin and
7357 else { /* Only CR-LF combo isn't a GCB in 0-255
7359 match = UCHARAT(locinput - 1) != '\r'
7360 || UCHARAT(locinput) != '\n';
7365 if (locinput == reginfo->strbeg) {
7368 else if (NEXTCHR_IS_EOS) {
7372 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7373 getLB_VAL_CP(UCHARAT(locinput)),
7374 (U8*) reginfo->strbeg,
7376 (U8*) reginfo->strend,
7381 case SB_BOUND: /* Always matches at begin and end */
7382 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7386 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7387 getSB_VAL_CP(UCHARAT(locinput)),
7388 (U8*) reginfo->strbeg,
7390 (U8*) reginfo->strend,
7396 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7400 match = isWB(WB_UNKNOWN,
7401 getWB_VAL_CP(UCHARAT(locinput -1)),
7402 getWB_VAL_CP(UCHARAT(locinput)),
7403 (U8*) reginfo->strbeg,
7405 (U8*) reginfo->strend,
7412 if (to_complement ^ ! match) {
7418 case ANYOFL: /* /[abc]/l */
7419 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7420 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7423 case ANYOFD: /* /[abc]/d */
7424 case ANYOF: /* /[abc]/ */
7425 if (NEXTCHR_IS_EOS || locinput >= loceol)
7427 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7428 && ! ANYOF_FLAGS(scan)
7429 && ANYOF_MATCHES_NONE_OUTSIDE_BITMAP(scan))
7431 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7437 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7442 goto increment_locinput;
7448 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7449 || locinput >= loceol)
7453 locinput++; /* ANYOFM is always single byte */
7458 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7459 || locinput >= loceol)
7463 goto increment_locinput;
7469 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7470 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7471 || ! _invlist_contains_cp(anyofh_list,
7472 utf8_to_uvchr_buf((U8 *) locinput,
7478 goto increment_locinput;
7484 || ANYOF_FLAGS(scan) != (U8) *locinput
7485 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7486 || ! _invlist_contains_cp(anyofh_list,
7487 utf8_to_uvchr_buf((U8 *) locinput,
7493 goto increment_locinput;
7499 || ANYOF_FLAGS(scan) != (U8) locinput[0]
7500 || locinput >= reginfo->strend
7501 || ! BITMAP_TEST(( (struct regnode_bbm *) scan)->bitmap,
7502 (U8) locinput[1] & UTF_CONTINUATION_MASK))
7506 goto increment_locinput;
7512 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7513 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7514 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7515 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7516 || ! _invlist_contains_cp(anyofh_list,
7517 utf8_to_uvchr_buf((U8 *) locinput,
7523 goto increment_locinput;
7529 || loceol - locinput < FLAGS(scan)
7530 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7531 || ! (anyofh_list = GET_ANYOFH_INVLIST(rex, scan))
7532 || ! _invlist_contains_cp(anyofh_list,
7533 utf8_to_uvchr_buf((U8 *) locinput,
7539 goto increment_locinput;
7543 if (NEXTCHR_IS_EOS) {
7548 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7549 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7550 (U8 *) reginfo->strend,
7552 ANYOFRbase(scan), ANYOFRdelta(scan)))
7558 if (! withinCOUNT((U8) *locinput,
7559 ANYOFRbase(scan), ANYOFRdelta(scan)))
7564 goto increment_locinput;
7568 if (NEXTCHR_IS_EOS) {
7573 if ( ANYOF_FLAGS(scan) != (U8) *locinput
7574 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7575 (U8 *) reginfo->strend,
7577 ANYOFRbase(scan), ANYOFRdelta(scan)))
7583 if (! withinCOUNT((U8) *locinput,
7584 ANYOFRbase(scan), ANYOFRdelta(scan)))
7589 goto increment_locinput;
7592 /* The argument (FLAGS) to all the POSIX node types is the class number
7595 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
7599 case POSIXL: /* \w or [:punct:] etc. under /l */
7600 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7601 if (NEXTCHR_IS_EOS || locinput >= loceol)
7604 /* Use isFOO_lc() for characters within Latin1. (Note that
7605 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7606 * wouldn't be invariant) */
7607 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7608 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7616 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7617 /* An above Latin-1 code point, or malformed */
7618 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7620 goto utf8_posix_above_latin1;
7623 /* Here is a UTF-8 variant code point below 256 and the target is
7625 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7626 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7627 *(locinput + 1))))))
7632 goto increment_locinput;
7634 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
7638 case POSIXD: /* \w or [:punct:] etc. under /d */
7644 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
7646 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7650 /* All UTF-8 variants match */
7651 if (! UTF8_IS_INVARIANT(nextbyte)) {
7652 goto increment_locinput;
7658 case POSIXA: /* \w or [:punct:] etc. under /a */
7661 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7662 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7663 * character is a single byte */
7665 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7671 if (! (to_complement ^ cBOOL(generic_isCC_A_(nextbyte,
7677 /* Here we are either not in utf8, or we matched a utf8-invariant,
7678 * so the next char is the next byte */
7682 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
7686 case POSIXU: /* \w or [:punct:] etc. under /u */
7688 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7692 /* Use generic_isCC_() for characters within Latin1. (Note that
7693 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7694 * wouldn't be invariant) */
7695 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7696 if (! (to_complement ^ cBOOL(generic_isCC_(nextbyte,
7703 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7704 if (! (to_complement
7705 ^ cBOOL(generic_isCC_(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7713 else { /* Handle above Latin-1 code points */
7714 utf8_posix_above_latin1:
7715 classnum = (char_class_number_) FLAGS(scan);
7718 if (! (to_complement
7719 ^ cBOOL(_invlist_contains_cp(
7720 PL_XPosix_ptrs[classnum],
7721 utf8_to_uvchr_buf((U8 *) locinput,
7722 (U8 *) reginfo->strend,
7728 case CC_ENUM_SPACE_:
7729 if (! (to_complement
7730 ^ cBOOL(is_XPERLSPACE_high(locinput))))
7735 case CC_ENUM_BLANK_:
7736 if (! (to_complement
7737 ^ cBOOL(is_HORIZWS_high(locinput))))
7742 case CC_ENUM_XDIGIT_:
7743 if (! (to_complement
7744 ^ cBOOL(is_XDIGIT_high(locinput))))
7749 case CC_ENUM_VERTSPACE_:
7750 if (! (to_complement
7751 ^ cBOOL(is_VERTWS_high(locinput))))
7756 case CC_ENUM_CNTRL_: /* These can't match above Latin1 */
7757 case CC_ENUM_ASCII_:
7758 if (! to_complement) {
7763 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7767 case CLUMP: /* Match \X: logical Unicode character. This is defined as
7768 a Unicode extended Grapheme Cluster */
7769 if (NEXTCHR_IS_EOS || locinput >= loceol)
7771 if (! utf8_target) {
7773 /* Match either CR LF or '.', as all the other possibilities
7775 locinput++; /* Match the . or CR */
7776 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7778 && locinput < loceol
7779 && UCHARAT(locinput) == '\n')
7786 /* Get the gcb type for the current character */
7787 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7788 (U8*) reginfo->strend);
7790 /* Then scan through the input until we get to the first
7791 * character whose type is supposed to be a gcb with the
7792 * current character. (There is always a break at the
7794 locinput += UTF8SKIP(locinput);
7795 while (locinput < loceol) {
7796 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7797 (U8*) reginfo->strend);
7798 if (isGCB(prev_gcb, cur_gcb,
7799 (U8*) reginfo->strbeg, (U8*) locinput,
7806 locinput += UTF8SKIP(locinput);
7813 case REFFLN: /* /\g{name}/il */
7814 { /* The capture buffer cases. The ones beginning with N for the
7815 named buffers just convert to the equivalent numbered and
7816 pretend they were called as the corresponding numbered buffer
7818 /* don't initialize these in the declaration, it makes C++
7823 const U8 *fold_array;
7826 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7827 folder = Perl_foldEQ_locale;
7828 fold_array = PL_fold_locale;
7830 utf8_fold_flags = FOLDEQ_LOCALE;
7833 case REFFAN: /* /\g{name}/iaa */
7834 folder = Perl_foldEQ_latin1;
7835 fold_array = PL_fold_latin1;
7837 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7840 case REFFUN: /* /\g{name}/iu */
7841 folder = Perl_foldEQ_latin1;
7842 fold_array = PL_fold_latin1;
7844 utf8_fold_flags = 0;
7847 case REFFN: /* /\g{name}/i */
7848 folder = Perl_foldEQ;
7849 fold_array = PL_fold;
7851 utf8_fold_flags = 0;
7854 case REFN: /* /\g{name}/ */
7858 utf8_fold_flags = 0;
7861 /* For the named back references, find the corresponding buffer
7863 n = reg_check_named_buff_matched(rex,scan);
7868 goto do_nref_ref_common;
7870 case REFFL: /* /\1/il */
7871 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
7872 folder = Perl_foldEQ_locale;
7873 fold_array = PL_fold_locale;
7874 utf8_fold_flags = FOLDEQ_LOCALE;
7877 case REFFA: /* /\1/iaa */
7878 folder = Perl_foldEQ_latin1;
7879 fold_array = PL_fold_latin1;
7880 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7883 case REFFU: /* /\1/iu */
7884 folder = Perl_foldEQ_latin1;
7885 fold_array = PL_fold_latin1;
7886 utf8_fold_flags = 0;
7889 case REFF: /* /\1/i */
7890 folder = Perl_foldEQ;
7891 fold_array = PL_fold;
7892 utf8_fold_flags = 0;
7895 case REF: /* /\1/ */
7898 utf8_fold_flags = 0;
7902 n = ARG(scan); /* which paren pair */
7905 ln = rex->offs[n].start;
7906 endref = rex->offs[n].end;
7907 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7908 if (rex->lastparen < n || ln == -1 || endref == -1)
7909 sayNO; /* Do not match unless seen CLOSEn. */
7913 s = reginfo->strbeg + ln;
7914 if (type != REF /* REF can do byte comparison */
7915 && (utf8_target || type == REFFU || type == REFFL))
7917 char * limit = loceol;
7919 /* This call case insensitively compares the entire buffer
7920 * at s, with the current input starting at locinput, but
7921 * not going off the end given by loceol, and
7922 * returns in <limit> upon success, how much of the
7923 * current input was matched */
7924 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7925 locinput, &limit, 0, utf8_target, utf8_fold_flags))
7933 /* Not utf8: Inline the first character, for speed. */
7934 if ( ! NEXTCHR_IS_EOS
7935 && locinput < loceol
7936 && UCHARAT(s) != nextbyte
7938 || UCHARAT(s) != fold_array[nextbyte]))
7943 if (locinput + ln > loceol)
7945 if (ln > 1 && (type == REF
7946 ? memNE(s, locinput, ln)
7947 : ! folder(aTHX_ locinput, s, ln)))
7953 case NOTHING: /* null op; e.g. the 'nothing' following
7954 * the '*' in m{(a+|b)*}' */
7956 case TAIL: /* placeholder while compiling (A|B|C) */
7960 #define ST st->u.eval
7961 #define CUR_EVAL cur_eval->u.eval
7967 regexp_internal *rei;
7968 regnode *startpoint;
7971 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
7972 arg= (U32)ARG(scan);
7973 if (cur_eval && cur_eval->locinput == locinput) {
7974 if ( ++nochange_depth > max_nochange_depth )
7976 "Pattern subroutine nesting without pos change"
7977 " exceeded limit in regex");
7984 startpoint = scan + ARG2L(scan);
7985 EVAL_CLOSE_PAREN_SET( st, arg );
7986 /* Detect infinite recursion
7988 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7989 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7990 * So we track the position in the string we are at each time
7991 * we recurse and if we try to enter the same routine twice from
7992 * the same position we throw an error.
7994 if ( rex->recurse_locinput[arg] == locinput ) {
7995 /* FIXME: we should show the regop that is failing as part
7996 * of the error message. */
7997 Perl_croak(aTHX_ "Infinite recursion in regex");
7999 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
8000 rex->recurse_locinput[arg]= locinput;
8003 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8005 Perl_re_exec_indentf( aTHX_
8006 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
8007 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
8013 /* Save all the positions seen so far. */
8014 ST.cp = regcppush(rex, 0, maxopenparen);
8015 REGCP_SET(ST.lastcp);
8017 /* and then jump to the code we share with EVAL */
8018 goto eval_recurse_doit;
8021 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
8022 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
8023 if ( ++nochange_depth > max_nochange_depth )
8024 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
8029 /* execute the code in the {...} */
8033 OP * const oop = PL_op;
8034 COP * const ocurcop = PL_curcop;
8038 /* save *all* paren positions */
8039 regcppush(rex, 0, maxopenparen);
8040 REGCP_SET(ST.lastcp);
8043 caller_cv = find_runcv(NULL);
8047 if (rexi->data->what[n] == 'r') { /* code from an external qr */
8049 (REGEXP*)(rexi->data->data[n])
8051 nop = (OP*)rexi->data->data[n+1];
8053 else if (rexi->data->what[n] == 'l') { /* literal code */
8055 nop = (OP*)rexi->data->data[n];
8056 assert(CvDEPTH(newcv));
8059 /* literal with own CV */
8060 assert(rexi->data->what[n] == 'L');
8061 newcv = rex->qr_anoncv;
8062 nop = (OP*)rexi->data->data[n];
8065 /* Some notes about MULTICALL and the context and save stacks.
8068 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
8069 * since codeblocks don't introduce a new scope (so that
8070 * local() etc accumulate), at the end of a successful
8071 * match there will be a SAVEt_CLEARSV on the savestack
8072 * for each of $x, $y, $z. If the three code blocks above
8073 * happen to have come from different CVs (e.g. via
8074 * embedded qr//s), then we must ensure that during any
8075 * savestack unwinding, PL_comppad always points to the
8076 * right pad at each moment. We achieve this by
8077 * interleaving SAVEt_COMPPAD's on the savestack whenever
8078 * there is a change of pad.
8079 * In theory whenever we call a code block, we should
8080 * push a CXt_SUB context, then pop it on return from
8081 * that code block. This causes a bit of an issue in that
8082 * normally popping a context also clears the savestack
8083 * back to cx->blk_oldsaveix, but here we specifically
8084 * don't want to clear the save stack on exit from the
8086 * Also for efficiency we don't want to keep pushing and
8087 * popping the single SUB context as we backtrack etc.
8088 * So instead, we push a single context the first time
8089 * we need, it, then hang onto it until the end of this
8090 * function. Whenever we encounter a new code block, we
8091 * update the CV etc if that's changed. During the times
8092 * in this function where we're not executing a code
8093 * block, having the SUB context still there is a bit
8094 * naughty - but we hope that no-one notices.
8095 * When the SUB context is initially pushed, we fake up
8096 * cx->blk_oldsaveix to be as if we'd pushed this context
8097 * on first entry to S_regmatch rather than at some random
8098 * point during the regexe execution. That way if we
8099 * croak, popping the context stack will ensure that
8100 * *everything* SAVEd by this function is undone and then
8101 * the context popped, rather than e.g., popping the
8102 * context (and restoring the original PL_comppad) then
8103 * popping more of the savestack and restoring a bad
8107 /* If this is the first EVAL, push a MULTICALL. On
8108 * subsequent calls, if we're executing a different CV, or
8109 * if PL_comppad has got messed up from backtracking
8110 * through SAVECOMPPADs, then refresh the context.
8112 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8114 U8 flags = (CXp_SUB_RE |
8115 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8117 if (last_pushed_cv) {
8118 CHANGE_MULTICALL_FLAGS(newcv, flags);
8121 PUSH_MULTICALL_FLAGS(newcv, flags);
8123 /* see notes above */
8124 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8126 last_pushed_cv = newcv;
8129 /* these assignments are just to silence compiler
8131 multicall_cop = NULL;
8133 last_pad = PL_comppad;
8135 /* the initial nextstate you would normally execute
8136 * at the start of an eval (which would cause error
8137 * messages to come from the eval), may be optimised
8138 * away from the execution path in the regex code blocks;
8139 * so manually set PL_curcop to it initially */
8141 OP *o = cUNOPx(nop)->op_first;
8142 assert(o->op_type == OP_NULL);
8143 if (o->op_targ == OP_SCOPE) {
8144 o = cUNOPo->op_first;
8147 assert(o->op_targ == OP_LEAVE);
8148 o = cUNOPo->op_first;
8149 assert(o->op_type == OP_ENTER);
8153 if (o->op_type != OP_STUB) {
8154 assert( o->op_type == OP_NEXTSTATE
8155 || o->op_type == OP_DBSTATE
8156 || (o->op_type == OP_NULL
8157 && ( o->op_targ == OP_NEXTSTATE
8158 || o->op_targ == OP_DBSTATE
8162 PL_curcop = (COP*)o;
8167 DEBUG_STATE_r( Perl_re_printf( aTHX_
8168 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8170 rex->offs[0].end = locinput - reginfo->strbeg;
8171 if (reginfo->info_aux_eval->pos_magic)
8172 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8173 reginfo->sv, reginfo->strbeg,
8174 locinput - reginfo->strbeg);
8177 SV *sv_mrk = get_sv("REGMARK", 1);
8178 sv_setsv(sv_mrk, sv_yes_mark);
8181 /* we don't use MULTICALL here as we want to call the
8182 * first op of the block of interest, rather than the
8183 * first op of the sub. Also, we don't want to free
8184 * the savestack frame */
8185 before = (IV)(SP-PL_stack_base);
8187 CALLRUNOPS(aTHX); /* Scalar context. */
8189 if ((IV)(SP-PL_stack_base) == before)
8190 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8196 /* before restoring everything, evaluate the returned
8197 * value, so that 'uninit' warnings don't use the wrong
8198 * PL_op or pad. Also need to process any magic vars
8199 * (e.g. $1) *before* parentheses are restored */
8204 if (logical == 0) { /* (?{})/ */
8205 SV *replsv = save_scalar(PL_replgv);
8206 sv_setsv(replsv, ret); /* $^R */
8209 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
8210 sw = cBOOL(SvTRUE_NN(ret));
8213 else { /* /(??{}) */
8214 /* if its overloaded, let the regex compiler handle
8215 * it; otherwise extract regex, or stringify */
8216 if (SvGMAGICAL(ret))
8217 ret = sv_mortalcopy(ret);
8218 if (!SvAMAGIC(ret)) {
8222 if (SvTYPE(sv) == SVt_REGEXP)
8223 re_sv = (REGEXP*) sv;
8224 else if (SvSMAGICAL(ret)) {
8225 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8227 re_sv = (REGEXP *) mg->mg_obj;
8230 /* force any undef warnings here */
8231 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8232 ret = sv_mortalcopy(ret);
8233 (void) SvPV_force_nolen(ret);
8239 /* *** Note that at this point we don't restore
8240 * PL_comppad, (or pop the CxSUB) on the assumption it may
8241 * be used again soon. This is safe as long as nothing
8242 * in the regexp code uses the pad ! */
8244 PL_curcop = ocurcop;
8245 regcp_restore(rex, ST.lastcp, &maxopenparen);
8246 PL_curpm_under = PL_curpm;
8247 PL_curpm = PL_reg_curpm;
8250 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8256 /* only /(??{})/ from now on */
8259 /* extract RE object from returned value; compiling if
8263 re_sv = reg_temp_copy(NULL, re_sv);
8268 if (SvUTF8(ret) && IN_BYTES) {
8269 /* In use 'bytes': make a copy of the octet
8270 * sequence, but without the flag on */
8272 const char *const p = SvPV(ret, len);
8273 ret = newSVpvn_flags(p, len, SVs_TEMP);
8275 if (rex->intflags & PREGf_USE_RE_EVAL)
8276 pm_flags |= PMf_USE_RE_EVAL;
8278 /* if we got here, it should be an engine which
8279 * supports compiling code blocks and stuff */
8280 assert(rex->engine && rex->engine->op_comp);
8281 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8282 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8283 rex->engine, NULL, NULL,
8284 /* copy /msixn etc to inner pattern */
8289 & (SVs_TEMP | SVs_GMG | SVf_ROK))
8290 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8291 /* This isn't a first class regexp. Instead, it's
8292 caching a regexp onto an existing, Perl visible
8294 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8300 RXp_MATCH_COPIED_off(re);
8301 re->subbeg = rex->subbeg;
8302 re->sublen = rex->sublen;
8303 re->suboffset = rex->suboffset;
8304 re->subcoffset = rex->subcoffset;
8306 re->lastcloseparen = 0;
8309 debug_start_match(re_sv, utf8_target, locinput,
8310 reginfo->strend, "EVAL/GOSUB: Matching embedded");
8312 startpoint = rei->program + 1;
8313 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8314 * close_paren only for GOSUB */
8315 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8316 /* Save all the seen positions so far. */
8317 ST.cp = regcppush(rex, 0, maxopenparen);
8318 REGCP_SET(ST.lastcp);
8319 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8321 /* run the pattern returned from (??{...}) */
8323 eval_recurse_doit: /* Share code with GOSUB below this line
8324 * At this point we expect the stack context to be
8325 * set up correctly */
8327 /* invalidate the S-L poscache. We're now executing a
8328 * different set of WHILEM ops (and their associated
8329 * indexes) against the same string, so the bits in the
8330 * cache are meaningless. Setting maxiter to zero forces
8331 * the cache to be invalidated and zeroed before reuse.
8332 * XXX This is too dramatic a measure. Ideally we should
8333 * save the old cache and restore when running the outer
8335 reginfo->poscache_maxiter = 0;
8337 /* the new regexp might have a different is_utf8_pat than we do */
8338 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8340 ST.prev_rex = rex_sv;
8341 ST.prev_curlyx = cur_curlyx;
8343 SET_reg_curpm(rex_sv);
8348 ST.prev_eval = cur_eval;
8350 /* now continue from first node in postoned RE */
8351 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8352 loceol, script_run_begin);
8353 NOT_REACHED; /* NOTREACHED */
8356 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8357 /* note: this is called twice; first after popping B, then A */
8359 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
8360 depth, cur_eval, ST.prev_eval);
8363 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8364 if ( cur_eval && CUR_EVAL.close_paren ) {\
8366 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8368 CUR_EVAL.close_paren - 1,\
8372 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8375 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8377 rex_sv = ST.prev_rex;
8378 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8379 SET_reg_curpm(rex_sv);
8380 rex = ReANY(rex_sv);
8381 rexi = RXi_GET(rex);
8383 /* preserve $^R across LEAVE's. See Bug 121070. */
8384 SV *save_sv= GvSV(PL_replgv);
8386 SvREFCNT_inc(save_sv);
8387 regcpblow(ST.cp); /* LEAVE in disguise */
8388 /* don't move this initialization up */
8389 replsv = GvSV(PL_replgv);
8390 sv_setsv(replsv, save_sv);
8392 SvREFCNT_dec(save_sv);
8394 cur_eval = ST.prev_eval;
8395 cur_curlyx = ST.prev_curlyx;
8397 /* Invalidate cache. See "invalidate" comment above. */
8398 reginfo->poscache_maxiter = 0;
8399 if ( nochange_depth )
8402 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8406 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8407 REGCP_UNWIND(ST.lastcp);
8410 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8411 /* note: this is called twice; first after popping B, then A */
8413 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8414 depth, cur_eval, ST.prev_eval);
8417 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8419 rex_sv = ST.prev_rex;
8420 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8421 SET_reg_curpm(rex_sv);
8422 rex = ReANY(rex_sv);
8423 rexi = RXi_GET(rex);
8425 REGCP_UNWIND(ST.lastcp);
8426 regcppop(rex, &maxopenparen);
8427 cur_eval = ST.prev_eval;
8428 cur_curlyx = ST.prev_curlyx;
8430 /* Invalidate cache. See "invalidate" comment above. */
8431 reginfo->poscache_maxiter = 0;
8432 if ( nochange_depth )
8435 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8440 n = ARG(scan); /* which paren pair */
8441 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8442 if (n > maxopenparen)
8444 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8445 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8450 (IV)rex->offs[n].start_tmp,
8456 case SROPEN: /* (*SCRIPT_RUN: */
8457 script_run_begin = (U8 *) locinput;
8462 n = ARG(scan); /* which paren pair */
8463 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8464 locinput - reginfo->strbeg);
8465 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8470 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
8472 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8480 case ACCEPT: /* (*ACCEPT) */
8483 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8484 utmp = (U32)ARG2L(scan);
8490 cursor && ( OP(cursor) != END );
8491 cursor = ( REGNODE_TYPE( OP(cursor) ) == END )
8492 ? REGNODE_AFTER(cursor)
8495 if ( OP(cursor) != CLOSE )
8500 if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8501 continue; /* so skip this one */
8503 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8504 locinput - reginfo->strbeg);
8506 if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8513 case GROUPP: /* (?(1)) */
8514 n = ARG(scan); /* which paren pair */
8515 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8518 case GROUPPN: /* (?(<name>)) */
8519 /* reg_check_named_buff_matched returns 0 for no match */
8520 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8523 case INSUBP: /* (?(R)) */
8525 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8526 * of SCAN is already set up as matches a eval.close_paren */
8527 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8530 case DEFINEP: /* (?(DEFINE)) */
8534 case IFTHEN: /* (?(cond)A|B) */
8535 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8537 next = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
8539 next = scan + ARG(scan);
8540 if (OP(next) == IFTHEN) /* Fake one. */
8541 next = REGNODE_AFTER_type(next,tregnode_IFTHEN);
8545 case LOGICAL: /* modifier for EVAL and IFMATCH */
8546 logical = scan->flags;
8549 /*******************************************************************
8551 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8552 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8553 STAR/PLUS/CURLY/CURLYN are used instead.)
8555 A*B is compiled as <CURLYX><A><WHILEM><B>
8557 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8558 state, which contains the current count, initialised to -1. It also sets
8559 cur_curlyx to point to this state, with any previous value saved in the
8562 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8563 since the pattern may possibly match zero times (i.e. it's a while {} loop
8564 rather than a do {} while loop).
8566 Each entry to WHILEM represents a successful match of A. The count in the
8567 CURLYX block is incremented, another WHILEM state is pushed, and execution
8568 passes to A or B depending on greediness and the current count.
8570 For example, if matching against the string a1a2a3b (where the aN are
8571 substrings that match /A/), then the match progresses as follows: (the
8572 pushed states are interspersed with the bits of strings matched so far):
8575 <CURLYX cnt=0><WHILEM>
8576 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8577 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8578 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8579 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8581 (Contrast this with something like CURLYM, which maintains only a single
8585 a1 <CURLYM cnt=1> a2
8586 a1 a2 <CURLYM cnt=2> a3
8587 a1 a2 a3 <CURLYM cnt=3> b
8590 Each WHILEM state block marks a point to backtrack to upon partial failure
8591 of A or B, and also contains some minor state data related to that
8592 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
8593 overall state, such as the count, and pointers to the A and B ops.
8595 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8596 must always point to the *current* CURLYX block, the rules are:
8598 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8599 and set cur_curlyx to point the new block.
8601 When popping the CURLYX block after a successful or unsuccessful match,
8602 restore the previous cur_curlyx.
8604 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8605 to the outer one saved in the CURLYX block.
8607 When popping the WHILEM block after a successful or unsuccessful B match,
8608 restore the previous cur_curlyx.
8610 Here's an example for the pattern (AI* BI)*BO
8611 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8614 curlyx backtrack stack
8615 ------ ---------------
8617 CO <CO prev=NULL> <WO>
8618 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8619 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8620 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8622 At this point the pattern succeeds, and we work back down the stack to
8623 clean up, restoring as we go:
8625 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8626 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8627 CO <CO prev=NULL> <WO>
8630 *******************************************************************/
8632 #define ST st->u.curlyx
8634 case CURLYX: /* start of /A*B/ (for complex A) */
8636 /* No need to save/restore up to this paren */
8637 I32 parenfloor = scan->flags;
8639 assert(next); /* keep Coverity happy */
8640 if (OP(REGNODE_BEFORE(next)) == NOTHING) /* LONGJMP */
8643 /* XXXX Probably it is better to teach regpush to support
8644 parenfloor > maxopenparen ... */
8645 if (parenfloor > (I32)rex->lastparen)
8646 parenfloor = rex->lastparen; /* Pessimization... */
8648 ST.prev_curlyx= cur_curlyx;
8650 ST.cp = PL_savestack_ix;
8652 /* these fields contain the state of the current curly.
8653 * they are accessed by subsequent WHILEMs */
8654 ST.parenfloor = parenfloor;
8659 ST.count = -1; /* this will be updated by WHILEM */
8660 ST.lastloc = NULL; /* this will be updated by WHILEM */
8662 PUSH_YES_STATE_GOTO(CURLYX_end, REGNODE_BEFORE(next), locinput, loceol,
8664 NOT_REACHED; /* NOTREACHED */
8667 case CURLYX_end: /* just finished matching all of A*B */
8668 cur_curlyx = ST.prev_curlyx;
8670 NOT_REACHED; /* NOTREACHED */
8672 case CURLYX_end_fail: /* just failed to match all of A*B */
8674 cur_curlyx = ST.prev_curlyx;
8676 NOT_REACHED; /* NOTREACHED */
8680 #define ST st->u.whilem
8682 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
8684 /* see the discussion above about CURLYX/WHILEM */
8689 assert(cur_curlyx); /* keep Coverity happy */
8691 min = ARG1(cur_curlyx->u.curlyx.me);
8692 max = ARG2(cur_curlyx->u.curlyx.me);
8693 A = REGNODE_AFTER(cur_curlyx->u.curlyx.me);
8694 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8695 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8696 ST.cache_offset = 0;
8700 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
8701 depth, (long)n, min, max)
8704 /* First just match a string of min A's. */
8707 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8708 cur_curlyx->u.curlyx.lastloc = locinput;
8709 REGCP_SET(ST.lastcp);
8711 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8713 NOT_REACHED; /* NOTREACHED */
8716 /* If degenerate A matches "", assume A done. */
8718 if (locinput == cur_curlyx->u.curlyx.lastloc) {
8719 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
8722 goto do_whilem_B_max;
8725 /* super-linear cache processing.
8727 * The idea here is that for certain types of CURLYX/WHILEM -
8728 * principally those whose upper bound is infinity (and
8729 * excluding regexes that have things like \1 and other very
8730 * non-regular expresssiony things), then if a pattern like
8731 * /....A*.../ fails and we backtrack to the WHILEM, then we
8732 * make a note that this particular WHILEM op was at string
8733 * position 47 (say) when the rest of pattern failed. Then, if
8734 * we ever find ourselves back at that WHILEM, and at string
8735 * position 47 again, we can just fail immediately rather than
8736 * running the rest of the pattern again.
8738 * This is very handy when patterns start to go
8739 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8740 * with a combinatorial explosion of backtracking.
8742 * The cache is implemented as a bit array, with one bit per
8743 * string byte position per WHILEM op (up to 16) - so its
8744 * between 0.25 and 2x the string size.
8746 * To avoid allocating a poscache buffer every time, we do an
8747 * initially countdown; only after we have executed a WHILEM
8748 * op (string-length x #WHILEMs) times do we allocate the
8751 * The top 4 bits of scan->flags byte say how many different
8752 * relevant CURLLYX/WHILEM op pairs there are, while the
8753 * bottom 4-bits is the identifying index number of this
8759 if (!reginfo->poscache_maxiter) {
8760 /* start the countdown: Postpone detection until we
8761 * know the match is not *that* much linear. */
8762 reginfo->poscache_maxiter
8763 = (reginfo->strend - reginfo->strbeg + 1)
8765 /* possible overflow for long strings and many CURLYX's */
8766 if (reginfo->poscache_maxiter < 0)
8767 reginfo->poscache_maxiter = I32_MAX;
8768 reginfo->poscache_iter = reginfo->poscache_maxiter;
8771 if (reginfo->poscache_iter-- == 0) {
8772 /* initialise cache */
8773 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8774 regmatch_info_aux *const aux = reginfo->info_aux;
8775 if (aux->poscache) {
8776 if ((SSize_t)reginfo->poscache_size < size) {
8777 Renew(aux->poscache, size, char);
8778 reginfo->poscache_size = size;
8780 Zero(aux->poscache, size, char);
8783 reginfo->poscache_size = size;
8784 Newxz(aux->poscache, size, char);
8786 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8787 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8788 PL_colors[4], PL_colors[5])
8792 if (reginfo->poscache_iter < 0) {
8793 /* have we already failed at this position? */
8794 SSize_t offset, mask;
8796 reginfo->poscache_iter = -1; /* stop eventual underflow */
8797 offset = (scan->flags & 0xf) - 1
8798 + (locinput - reginfo->strbeg)
8800 mask = 1 << (offset % 8);
8802 if (reginfo->info_aux->poscache[offset] & mask) {
8803 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
8806 cur_curlyx->u.curlyx.count--;
8807 sayNO; /* cache records failure */
8809 ST.cache_offset = offset;
8810 ST.cache_mask = mask;
8814 /* Prefer B over A for minimal matching. */
8816 if (cur_curlyx->u.curlyx.minmod) {
8817 ST.save_curlyx = cur_curlyx;
8818 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8819 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8820 locinput, loceol, script_run_begin);
8821 NOT_REACHED; /* NOTREACHED */
8824 /* Prefer A over B for maximal matching. */
8826 if (n < max) { /* More greed allowed? */
8827 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8829 cur_curlyx->u.curlyx.lastloc = locinput;
8830 REGCP_SET(ST.lastcp);
8831 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8833 NOT_REACHED; /* NOTREACHED */
8835 goto do_whilem_B_max;
8837 NOT_REACHED; /* NOTREACHED */
8839 case WHILEM_B_min: /* just matched B in a minimal match */
8840 case WHILEM_B_max: /* just matched B in a maximal match */
8841 cur_curlyx = ST.save_curlyx;
8843 NOT_REACHED; /* NOTREACHED */
8845 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8846 cur_curlyx = ST.save_curlyx;
8847 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8848 cur_curlyx->u.curlyx.count--;
8850 NOT_REACHED; /* NOTREACHED */
8852 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8854 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8855 REGCP_UNWIND(ST.lastcp);
8856 regcppop(rex, &maxopenparen);
8857 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8858 cur_curlyx->u.curlyx.count--;
8860 NOT_REACHED; /* NOTREACHED */
8862 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8863 REGCP_UNWIND(ST.lastcp);
8864 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8865 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
8871 ST.save_curlyx = cur_curlyx;
8872 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8873 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8874 locinput, loceol, script_run_begin);
8875 NOT_REACHED; /* NOTREACHED */
8877 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8878 cur_curlyx = ST.save_curlyx;
8880 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8881 /* Maximum greed exceeded */
8882 cur_curlyx->u.curlyx.count--;
8886 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
8888 /* Try grabbing another A and see if it helps. */
8889 cur_curlyx->u.curlyx.lastloc = locinput;
8890 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8892 REGCP_SET(ST.lastcp);
8893 PUSH_STATE_GOTO(WHILEM_A_min,
8894 /*A*/ REGNODE_AFTER(ST.save_curlyx->u.curlyx.me),
8895 locinput, loceol, script_run_begin);
8896 NOT_REACHED; /* NOTREACHED */
8899 #define ST st->u.branch
8901 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
8902 next = scan + ARG(scan);
8907 case BRANCH: /* /(...|A|...)/ */
8908 scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
8910 ST.lastparen = rex->lastparen;
8911 ST.lastcloseparen = rex->lastcloseparen;
8912 ST.next_branch = next;
8915 /* Now go into the branch */
8917 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8920 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8923 NOT_REACHED; /* NOTREACHED */
8925 case CUTGROUP: /* /(*THEN)/ */
8926 sv_yes_mark = st->u.mark.mark_name = scan->flags
8927 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8929 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8931 NOT_REACHED; /* NOTREACHED */
8933 case CUTGROUP_next_fail:
8936 if (st->u.mark.mark_name)
8937 sv_commit = st->u.mark.mark_name;
8939 NOT_REACHED; /* NOTREACHED */
8943 NOT_REACHED; /* NOTREACHED */
8945 case BRANCH_next_fail: /* that branch failed; try the next, if any */
8950 REGCP_UNWIND(ST.cp);
8951 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8952 scan = ST.next_branch;
8953 /* no more branches? */
8954 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8956 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
8963 continue; /* execute next BRANCH[J] op */
8966 case MINMOD: /* next op will be non-greedy, e.g. A*? */
8971 #define ST st->u.curlym
8973 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
8975 /* This is an optimisation of CURLYX that enables us to push
8976 * only a single backtracking state, no matter how many matches
8977 * there are in {m,n}. It relies on the pattern being constant
8978 * length, with no parens to influence future backrefs
8982 scan = REGNODE_AFTER_type(scan, tregnode_CURLYM);
8984 ST.lastparen = rex->lastparen;
8985 ST.lastcloseparen = rex->lastcloseparen;
8987 /* if paren positive, emulate an OPEN/CLOSE around A */
8989 U32 paren = ST.me->flags;
8991 if (paren > maxopenparen)
8992 maxopenparen = paren;
8993 scan += NEXT_OFF(scan); /* Skip former OPEN. */
9001 ST.Binfo.count = -1;
9004 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
9007 curlym_do_A: /* execute the A in /A{m,n}B/ */
9008 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
9010 NOT_REACHED; /* NOTREACHED */
9012 case CURLYM_A: /* we've just matched an A */
9014 /* after first match, determine A's length: u.curlym.alen */
9015 if (ST.count == 1) {
9016 if (reginfo->is_utf8_target) {
9017 char *s = st->locinput;
9018 while (s < locinput) {
9024 ST.alen = locinput - st->locinput;
9027 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
9030 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
9031 depth, (IV) ST.count, (IV)ST.alen)
9034 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9039 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
9040 if ( max == REG_INFTY || ST.count < max )
9041 goto curlym_do_A; /* try to match another A */
9043 goto curlym_do_B; /* try to match B */
9045 case CURLYM_A_fail: /* just failed to match an A */
9046 REGCP_UNWIND(ST.cp);
9049 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
9050 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9053 curlym_do_B: /* execute the B in /A{m,n}B/ */
9055 goto curlym_close_B;
9057 if (ST.Binfo.count < 0) {
9058 /* calculate possible match of 1st char following curly */
9060 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
9061 regnode *text_node = ST.B;
9062 if (! HAS_TEXT(text_node))
9063 FIND_NEXT_IMPT(text_node);
9064 if (REGNODE_TYPE(OP(text_node)) == EXACT) {
9065 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9066 &ST.Binfo, reginfo))
9075 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
9076 depth, (IV)ST.count)
9078 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
9079 assert(ST.Binfo.count > 0);
9081 /* Do a quick test to hopefully rule out most non-matches */
9082 if ( locinput + ST.Binfo.min_length > loceol
9083 || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
9086 Perl_re_exec_indentf( aTHX_
9087 "CURLYM Fast bail next target=0x%X anded==0x%X"
9090 (int) nextbyte, ST.Binfo.first_byte_anded,
9091 ST.Binfo.first_byte_mask)
9093 state_num = CURLYM_B_fail;
9094 goto reenter_switch;
9100 /* emulate CLOSE: mark current A as captured */
9101 U32 paren = (U32)ST.me->flags;
9102 if (ST.count || is_accepted) {
9103 CLOSE_CAPTURE(paren,
9104 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9105 locinput - reginfo->strbeg);
9108 rex->offs[paren].end = -1;
9109 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9111 if (ST.count || is_accepted)
9121 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
9123 NOT_REACHED; /* NOTREACHED */
9125 case CURLYM_B_fail: /* just failed to match a B */
9126 REGCP_UNWIND(ST.cp);
9127 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9129 I32 max = ARG2(ST.me);
9130 if (max != REG_INFTY && ST.count == max)
9132 goto curlym_do_A; /* try to match a further A */
9134 /* backtrack one A */
9135 if (ST.count == ARG1(ST.me) /* min */)
9138 SET_locinput(HOPc(locinput, -ST.alen));
9139 goto curlym_do_B; /* try to match B */
9142 #define ST st->u.curly
9144 #define CURLY_SETPAREN(paren, success) \
9147 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9148 locinput - reginfo->strbeg); \
9151 rex->offs[paren].end = -1; \
9152 rex->lastparen = ST.lastparen; \
9153 rex->lastcloseparen = ST.lastcloseparen; \
9157 case STAR: /* /A*B/ where A is width 1 char */
9161 scan = REGNODE_AFTER_type(scan,tregnode_STAR);
9164 case PLUS: /* /A+B/ where A is width 1 char */
9168 scan = REGNODE_AFTER_type(scan,tregnode_PLUS);
9171 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
9172 ST.paren = scan->flags; /* Which paren to set */
9173 ST.lastparen = rex->lastparen;
9174 ST.lastcloseparen = rex->lastcloseparen;
9175 if (ST.paren > maxopenparen)
9176 maxopenparen = ST.paren;
9177 ST.min = ARG1(scan); /* min to match */
9178 ST.max = ARG2(scan); /* max to match */
9179 scan = regnext(REGNODE_AFTER_type(scan, tregnode_CURLYN));
9181 /* handle the single-char capture called as a GOSUB etc */
9182 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9184 char *li = locinput;
9185 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9193 case CURLY: /* /A{m,n}B/ where A is width 1 char */
9195 ST.min = ARG1(scan); /* min to match */
9196 ST.max = ARG2(scan); /* max to match */
9197 scan = REGNODE_AFTER_type(scan, tregnode_CURLY);
9200 * Lookahead to avoid useless match attempts
9201 * when we know what character comes next.
9203 * Used to only do .*x and .*?x, but now it allows
9204 * for )'s, ('s and (?{ ... })'s to be in the way
9205 * of the quantifier and the EXACT-like node. -- japhy
9208 assert(ST.min <= ST.max);
9209 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9213 regnode *text_node = next;
9215 if (! HAS_TEXT(text_node))
9216 FIND_NEXT_IMPT(text_node);
9218 if (! HAS_TEXT(text_node))
9221 if ( REGNODE_TYPE(OP(text_node)) != EXACT ) {
9225 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9226 &ST.Binfo, reginfo))
9237 char *li = locinput;
9240 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9247 if (ST.Binfo.count <= 0)
9248 goto curly_try_B_min;
9250 ST.oldloc = locinput;
9252 /* set ST.maxpos to the furthest point along the
9253 * string that could possibly match, i.e., that a match could
9255 if (ST.max == REG_INFTY) {
9256 ST.maxpos = loceol - 1;
9258 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9261 else if (utf8_target) {
9262 int m = ST.max - ST.min;
9263 for (ST.maxpos = locinput;
9264 m >0 && ST.maxpos < loceol; m--)
9265 ST.maxpos += UTF8SKIP(ST.maxpos);
9268 ST.maxpos = locinput + ST.max - ST.min;
9269 if (ST.maxpos >= loceol)
9270 ST.maxpos = loceol - 1;
9272 goto curly_try_B_min_known;
9276 /* avoid taking address of locinput, so it can remain
9278 char *li = locinput;
9279 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9280 if (ST.count < ST.min)
9283 if ((ST.count > ST.min)
9284 && (REGNODE_TYPE(OP(ST.B)) == EOL) && (OP(ST.B) != MEOL))
9286 /* A{m,n} must come at the end of the string, there's
9287 * no point in backing off ... */
9289 /* ...except that $ and \Z can match before *and* after
9290 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
9291 We may back off by one in this case. */
9292 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9296 goto curly_try_B_max;
9298 NOT_REACHED; /* NOTREACHED */
9300 case CURLY_B_min_fail:
9301 /* failed to find B in a non-greedy match. */
9303 REGCP_UNWIND(ST.cp);
9305 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9308 if (ST.Binfo.count == 0) {
9309 /* failed -- move forward one */
9310 char *li = locinput;
9311 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9316 if (!( ST.count <= ST.max
9317 /* count overflow ? */
9318 || (ST.max == REG_INFTY && ST.count > 0))
9324 /* Couldn't or didn't -- move forward. */
9325 ST.oldloc = locinput;
9327 locinput += UTF8SKIP(locinput);
9332 curly_try_B_min_known:
9333 /* find the next place where 'B' could work, then call B */
9334 if (locinput + ST.Binfo.initial_exact < loceol) {
9335 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9337 /* Here, the mask is all 1's for the entire length of
9338 * any possible match. (That actually means that there
9339 * is only one possible match.) Look for the next
9341 locinput = ninstr(locinput, loceol,
9342 (char *) ST.Binfo.matches,
9343 (char *) ST.Binfo.matches
9344 + ST.Binfo.initial_exact);
9345 if (locinput == NULL) {
9350 /* If the first byte(s) of the mask are all ones, it
9351 * means those bytes must match identically, so can use
9352 * ninstr() to find the next possible matchpoint */
9353 if (ST.Binfo.initial_exact > 0) {
9354 locinput = ninstr(locinput, loceol,
9355 (char *) ST.Binfo.matches,
9356 (char *) ST.Binfo.matches
9357 + ST.Binfo.initial_exact);
9359 else { /* Otherwise find the next byte that matches,
9361 locinput = (char *) find_next_masked(
9362 (U8 *) locinput, (U8 *) loceol,
9363 ST.Binfo.first_byte_anded,
9364 ST.Binfo.first_byte_mask);
9365 /* Advance to the end of a multi-byte character */
9367 while ( locinput < loceol
9368 && UTF8_IS_CONTINUATION(*locinput))
9374 if ( locinput == NULL
9375 || locinput + ST.Binfo.min_length > loceol)
9380 /* Here, we have found a possible match point; if can't
9381 * rule it out, quit the loop so can check fully */
9382 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9386 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9388 } while (locinput <= ST.maxpos);
9391 if (locinput > ST.maxpos)
9395 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9396 : (STRLEN) (locinput - ST.oldloc);
9399 /* Here is at the beginning of a character that meets the mask
9400 * criteria. Need to make sure that some real possibility */
9403 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9404 * at what may be the beginning of b; check that everything
9405 * between oldloc and locinput matches */
9406 char *li = ST.oldloc;
9408 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9410 assert(n == REG_INFTY || locinput == li);
9415 CURLY_SETPAREN(ST.paren, ST.count);
9416 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9418 NOT_REACHED; /* NOTREACHED */
9422 /* a successful greedy match: now try to match B */
9423 if ( ST.Binfo.count <= 0
9424 || ( ST.Binfo.count > 0
9425 && locinput + ST.Binfo.min_length <= loceol
9426 && S_test_EXACTISH_ST(locinput, ST.Binfo)))
9428 CURLY_SETPAREN(ST.paren, ST.count);
9429 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9431 NOT_REACHED; /* NOTREACHED */
9435 case CURLY_B_max_fail:
9436 /* failed to find B in a greedy match */
9438 REGCP_UNWIND(ST.cp);
9440 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9443 if (--ST.count < ST.min)
9445 locinput = HOPc(locinput, -1);
9446 goto curly_try_B_max;
9450 case END: /* last op of main pattern */
9453 /* we've just finished A in /(??{A})B/; now continue with B */
9455 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9456 st->u.eval.prev_rex = rex_sv; /* inner */
9458 /* Save *all* the positions. */
9459 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9460 rex_sv = CUR_EVAL.prev_rex;
9461 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9462 SET_reg_curpm(rex_sv);
9463 rex = ReANY(rex_sv);
9464 rexi = RXi_GET(rex);
9466 st->u.eval.prev_curlyx = cur_curlyx;
9467 cur_curlyx = CUR_EVAL.prev_curlyx;
9469 REGCP_SET(st->u.eval.lastcp);
9471 /* Restore parens of the outer rex without popping the
9473 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9475 st->u.eval.prev_eval = cur_eval;
9476 cur_eval = CUR_EVAL.prev_eval;
9478 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
9480 if ( nochange_depth )
9483 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9485 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
9486 st->u.eval.prev_eval->u.eval.B,
9487 locinput, loceol, script_run_begin);
9490 if (locinput < reginfo->till) {
9491 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9492 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9494 (long)(locinput - startpos),
9495 (long)(reginfo->till - startpos),
9498 sayNO_SILENT; /* Cannot match: too short. */
9500 sayYES; /* Success! */
9502 case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9503 matches end at the right spot, required for
9504 variable length matches. */
9505 if (match_end && locinput != match_end)
9508 Perl_re_exec_indentf( aTHX_
9509 "%sLOOKBEHIND_END: subpattern failed...%s\n",
9510 depth, PL_colors[4], PL_colors[5]));
9511 sayNO; /* Variable length match didn't line up */
9515 case SUCCEED: /* successful SUSPEND/CURLYM and
9516 *lookahead* IFMATCH/UNLESSM*/
9518 Perl_re_exec_indentf( aTHX_
9519 "%sSUCCEED: subpattern success...%s\n",
9520 depth, PL_colors[4], PL_colors[5]));
9521 sayYES; /* Success! */
9524 #define ST st->u.ifmatch
9526 case SUSPEND: /* (?>A) */
9528 ST.start = locinput;
9533 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9535 goto ifmatch_trivial_fail_test;
9537 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9539 ifmatch_trivial_fail_test:
9540 ST.prev_match_end= match_end;
9541 ST.count = scan->next_off + 1; /* next_off repurposed to be
9542 lookbehind count, requires
9544 if (! scan->flags) { /* 'flags' zero means lookahed */
9546 /* Lookahead starts here and ends at the normal place */
9547 ST.start = locinput;
9552 PERL_UINT_FAST8_T back_count = scan->flags;
9554 match_end = locinput;
9556 /* Lookbehind can look beyond the current position */
9559 /* ... and starts at the first place in the input that is in
9560 * the range of the possible start positions */
9561 for (; ST.count > 0; ST.count--, back_count--) {
9562 s = HOPBACKc(locinput, back_count);
9569 /* If the lookbehind doesn't start in the actual string, is a
9570 * trivial match failure */
9571 match_end = ST.prev_match_end;
9574 sw = 1 - cBOOL(ST.wanted);
9579 /* Here, we didn't want it to match, so is actually success */
9580 next = scan + ARG(scan);
9588 ST.logical = logical;
9589 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9591 /* execute body of (?...A) */
9592 PUSH_YES_STATE_GOTO(IFMATCH_A, REGNODE_AFTER(scan), ST.start,
9593 ST.end, script_run_begin);
9594 NOT_REACHED; /* NOTREACHED */
9599 case IFMATCH_A_fail: /* body of (?...A) failed */
9600 if (! ST.logical && ST.count > 1) {
9602 /* It isn't a real failure until we've tried all starting
9603 * positions. Move to the next starting position and retry */
9605 ST.start = HOPc(ST.start, 1);
9607 logical = ST.logical;
9611 /* Here, all starting positions have been tried. */
9615 case IFMATCH_A: /* body of (?...A) succeeded */
9618 sw = matched == ST.wanted;
9619 match_end = ST.prev_match_end;
9620 if (! ST.logical && !sw) {
9624 if (OP(ST.me) != SUSPEND) {
9625 /* restore old position except for (?>...) */
9626 locinput = st->locinput;
9627 loceol = st->loceol;
9628 script_run_begin = st->sr0;
9630 scan = ST.me + ARG(ST.me);
9633 continue; /* execute B */
9638 case LONGJMP: /* alternative with many branches compiles to
9639 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9640 next = scan + ARG(scan);
9645 case COMMIT: /* (*COMMIT) */
9646 reginfo->cutpoint = loceol;
9649 case PRUNE: /* (*PRUNE) */
9651 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9652 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9654 NOT_REACHED; /* NOTREACHED */
9656 case COMMIT_next_fail:
9660 NOT_REACHED; /* NOTREACHED */
9662 case OPFAIL: /* (*FAIL) */
9664 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9666 /* deal with (?(?!)X|Y) properly,
9667 * make sure we trigger the no branch
9668 * of the trailing IFTHEN structure*/
9674 NOT_REACHED; /* NOTREACHED */
9676 #define ST st->u.mark
9677 case MARKPOINT: /* (*MARK:foo) */
9678 ST.prev_mark = mark_state;
9679 ST.mark_name = sv_commit = sv_yes_mark
9680 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9682 ST.mark_loc = locinput;
9683 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9685 NOT_REACHED; /* NOTREACHED */
9687 case MARKPOINT_next:
9688 mark_state = ST.prev_mark;
9690 NOT_REACHED; /* NOTREACHED */
9692 case MARKPOINT_next_fail:
9693 if (popmark && sv_eq(ST.mark_name,popmark))
9695 if (ST.mark_loc > startpoint)
9696 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9697 popmark = NULL; /* we found our mark */
9698 sv_commit = ST.mark_name;
9701 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9703 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9706 mark_state = ST.prev_mark;
9707 sv_yes_mark = mark_state ?
9708 mark_state->u.mark.mark_name : NULL;
9710 NOT_REACHED; /* NOTREACHED */
9712 case SKIP: /* (*SKIP) */
9714 /* (*SKIP) : if we fail we cut here*/
9715 ST.mark_name = NULL;
9716 ST.mark_loc = locinput;
9717 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9720 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9721 otherwise do nothing. Meaning we need to scan
9723 regmatch_state *cur = mark_state;
9724 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9727 if ( sv_eq( cur->u.mark.mark_name,
9730 ST.mark_name = find;
9731 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9734 cur = cur->u.mark.prev_mark;
9737 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9740 case SKIP_next_fail:
9742 /* (*CUT:NAME) - Set up to search for the name as we
9743 collapse the stack*/
9744 popmark = ST.mark_name;
9746 /* (*CUT) - No name, we cut here.*/
9747 if (ST.mark_loc > startpoint)
9748 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9749 /* but we set sv_commit to latest mark_name if there
9750 is one so they can test to see how things lead to this
9753 sv_commit=mark_state->u.mark.mark_name;
9757 NOT_REACHED; /* NOTREACHED */
9760 case LNBREAK: /* \R */
9761 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9768 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9769 PTR2UV(scan), OP(scan));
9770 Perl_croak(aTHX_ "regexp memory corruption");
9772 /* this is a point to jump to in order to increment
9773 * locinput by one character */
9775 assert(!NEXTCHR_IS_EOS);
9777 locinput += PL_utf8skip[nextbyte];
9778 /* locinput is allowed to go 1 char off the end (signifying
9779 * EOS), but not 2+ */
9780 if (locinput > loceol)
9789 /* switch break jumps here */
9790 scan = next; /* prepare to execute the next op and ... */
9791 continue; /* ... jump back to the top, reusing st */
9795 /* push a state that backtracks on success */
9796 st->u.yes.prev_yes_state = yes_state;
9800 /* push a new regex state, then continue at scan */
9802 regmatch_state *newst;
9803 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9805 DEBUG_r( /* DEBUG_STACK_r */
9806 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9807 regmatch_state *cur = st;
9808 regmatch_state *curyes = yes_state;
9810 regmatch_slab *slab = PL_regmatch_slab;
9811 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9812 if (cur < SLAB_FIRST(slab)) {
9814 cur = SLAB_LAST(slab);
9816 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9819 depth - i, REGNODE_NAME(cur->resume_state),
9820 (curyes == cur) ? "yes" : ""
9823 curyes = cur->u.yes.prev_yes_state;
9826 DEBUG_STATE_pp("push")
9829 st->locinput = locinput;
9830 st->loceol = loceol;
9831 st->sr0 = script_run_begin;
9833 if (newst > SLAB_LAST(PL_regmatch_slab))
9834 newst = S_push_slab(aTHX);
9835 PL_regmatch_state = newst;
9837 locinput = pushinput;
9839 script_run_begin = pushsr0;
9845 #ifdef SOLARIS_BAD_OPTIMIZER
9846 # undef PL_charclass
9850 * We get here only if there's trouble -- normally "case END" is
9851 * the terminating point.
9853 Perl_croak(aTHX_ "corrupted regexp pointers");
9854 NOT_REACHED; /* NOTREACHED */
9858 /* we have successfully completed a subexpression, but we must now
9859 * pop to the state marked by yes_state and continue from there */
9860 assert(st != yes_state);
9862 while (st != yes_state) {
9864 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9865 PL_regmatch_slab = PL_regmatch_slab->prev;
9866 st = SLAB_LAST(PL_regmatch_slab);
9870 DEBUG_STATE_pp("pop (no final)");
9872 DEBUG_STATE_pp("pop (yes)");
9878 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9879 || yes_state > SLAB_LAST(PL_regmatch_slab))
9881 /* not in this slab, pop slab */
9882 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9883 PL_regmatch_slab = PL_regmatch_slab->prev;
9884 st = SLAB_LAST(PL_regmatch_slab);
9886 depth -= (st - yes_state);
9889 yes_state = st->u.yes.prev_yes_state;
9890 PL_regmatch_state = st;
9893 locinput= st->locinput;
9895 script_run_begin = st->sr0;
9897 state_num = st->resume_state + no_final;
9898 goto reenter_switch;
9901 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
9902 PL_colors[4], PL_colors[5]));
9904 if (reginfo->info_aux_eval) {
9905 /* each successfully executed (?{...}) block does the equivalent of
9906 * local $^R = do {...}
9907 * When popping the save stack, all these locals would be undone;
9908 * bypass this by setting the outermost saved $^R to the latest
9910 /* I dont know if this is needed or works properly now.
9911 * see code related to PL_replgv elsewhere in this file.
9914 if (oreplsv != GvSV(PL_replgv)) {
9915 sv_setsv(oreplsv, GvSV(PL_replgv));
9916 SvSETMAGIC(oreplsv);
9924 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
9926 PL_colors[4], PL_colors[5])
9938 /* there's a previous state to backtrack to */
9940 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9941 PL_regmatch_slab = PL_regmatch_slab->prev;
9942 st = SLAB_LAST(PL_regmatch_slab);
9944 PL_regmatch_state = st;
9945 locinput= st->locinput;
9947 script_run_begin = st->sr0;
9949 DEBUG_STATE_pp("pop");
9951 if (yes_state == st)
9952 yes_state = st->u.yes.prev_yes_state;
9954 state_num = st->resume_state + 1; /* failure = success + 1 */
9956 goto reenter_switch;
9961 if (rex->intflags & PREGf_VERBARG_SEEN) {
9962 SV *sv_err = get_sv("REGERROR", 1);
9963 SV *sv_mrk = get_sv("REGMARK", 1);
9965 sv_commit = &PL_sv_no;
9967 sv_yes_mark = &PL_sv_yes;
9970 sv_commit = &PL_sv_yes;
9971 sv_yes_mark = &PL_sv_no;
9975 sv_setsv(sv_err, sv_commit);
9976 sv_setsv(sv_mrk, sv_yes_mark);
9980 if (last_pushed_cv) {
9982 /* see "Some notes about MULTICALL" above */
9984 PERL_UNUSED_VAR(SP);
9987 LEAVE_SCOPE(orig_savestack_ix);
9989 assert(!result || locinput - reginfo->strbeg >= 0);
9990 return result ? locinput - reginfo->strbeg : -1;
9994 - regrepeat - repeatedly match something simple, report how many
9996 * What 'simple' means is a node which can be the operand of a quantifier like
9999 * startposp - pointer to a pointer to the start position. This is updated
10000 * to point to the byte following the highest successful
10002 * p - the regnode to be repeatedly matched against.
10003 * loceol - pointer to the end position beyond which we aren't supposed to
10005 * reginfo - struct holding match state, such as utf8_target
10006 * max - maximum number of things to match.
10007 * depth - (for debugging) backtracking depth.
10010 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
10011 char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
10013 char *scan; /* Pointer to current position in target string */
10015 char *this_eol = loceol; /* potentially adjusted version. */
10016 I32 hardcount = 0; /* How many matches so far */
10017 bool utf8_target = reginfo->is_utf8_target;
10018 unsigned int to_complement = 0; /* Invert the result? */
10019 char_class_number_ classnum;
10021 PERL_ARGS_ASSERT_REGREPEAT;
10023 /* This routine is structured so that we switch on the input OP. Each OP
10024 * case: statement contains a loop to repeatedly apply the OP, advancing
10025 * the input until it fails, or reaches the end of the input, or until it
10026 * reaches the upper limit of matches. */
10029 if (max == REG_INFTY) /* This is a special marker to go to the platform's
10032 else if (! utf8_target && this_eol - scan > max)
10033 this_eol = scan + max;
10035 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
10036 * down to the maximum of how far we should go in it (but leaving it set to
10037 * the real end if the maximum permissible would take us beyond that).
10038 * This allows us to make the loop exit condition that we haven't gone past
10039 * <this_eol> to also mean that we haven't exceeded the max permissible
10040 * count, saving a test each time through the loop. But it assumes that
10041 * the OP matches a single byte, which is true for most of the OPs below
10042 * when applied to a non-UTF-8 target. Those relatively few OPs that don't
10043 * have this characteristic have to compensate.
10045 * There is no such adjustment for UTF-8 targets, since the number of bytes
10046 * per character can vary. OPs will have to test both that the count is
10047 * less than the max permissible (using <hardcount> to keep track), and
10048 * that we are still within the bounds of the string (using <this_eol>. A
10049 * few OPs match a single byte no matter what the encoding. They can omit
10050 * the max test if, for the UTF-8 case, they do the adjustment that was
10053 * Thus, the code above sets things up for the common case; and exceptional
10054 * cases need extra work; the common case is to make sure <scan> doesn't go
10055 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
10056 * count doesn't exceed the maximum permissible */
10058 switch (with_t_UTF8ness(OP(p), utf8_target)) {
10062 while (scan < this_eol && hardcount < max && *scan != '\n') {
10063 scan += UTF8SKIP(scan);
10069 scan = (char *) memchr(scan, '\n', this_eol - scan);
10076 while (scan < this_eol && hardcount < max) {
10077 scan += UTF8SKIP(scan);
10086 case EXACT_REQ8_tb:
10087 case LEXACT_REQ8_tb:
10088 case EXACTFU_REQ8_tb:
10092 if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10093 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10102 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10105 case EXACT_REQ8_t8:
10106 case LEXACT_REQ8_t8:
10107 case EXACTFU_REQ8_t8:
10114 case EXACTFAA_NO_TRIE_t8:
10115 case EXACTFAA_NO_TRIE_tb:
10124 struct next_matchable_info Binfo;
10125 PERL_UINT_FAST8_T definitive_len;
10127 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10129 /* Set up termination info, and quit if we can rule out that we've
10130 * gotten a match of the termination criteria */
10131 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10132 || scan + Binfo.min_length > this_eol
10133 || ! S_test_EXACTISH_ST(scan, Binfo))
10138 definitive_len = Binfo.initial_definitive;
10140 /* Here there are potential matches, and the first byte(s) matched our
10143 * If we got a definitive match of some initial bytes, there is no
10144 * possibility of false positives as far as it got */
10145 if (definitive_len > 0) {
10147 /* If as far as it got is the maximum possible, there were no false
10148 * positives at all. Since we have everything set up, see how many
10149 * repeats there are. */
10150 if (definitive_len >= Binfo.max_length) {
10152 /* We've already found one match */
10153 scan += definitive_len;
10156 /* If want more than the one match, and there is room for more,
10157 * see if there are any */
10158 if (hardcount < max && scan + definitive_len <= this_eol) {
10160 /* If the character is only a single byte long, just span
10161 * all such bytes. */
10162 if (definitive_len == 1) {
10163 const char * orig_scan = scan;
10165 if (this_eol - (scan - hardcount) > max) {
10166 this_eol = scan - hardcount + max;
10169 /* Use different routines depending on whether it's an
10170 * exact match or matches with a mask */
10171 if (Binfo.initial_exact == 1) {
10172 scan = (char *) find_span_end((U8 *) scan,
10177 scan = (char *) find_span_end_mask(
10180 Binfo.first_byte_anded,
10181 Binfo.first_byte_mask);
10184 hardcount += scan - orig_scan;
10186 else { /* Here, the full character definitive match is more
10188 while ( hardcount < max
10189 && scan + definitive_len <= this_eol
10190 && S_test_EXACTISH_ST(scan, Binfo))
10192 scan += definitive_len;
10199 } /* End of a full character is definitively matched */
10201 /* Here, an initial portion of the character matched definitively,
10202 * and the rest matched as well, but could have false positives */
10206 U8 * matches = Binfo.matches;
10208 /* The first bytes were definitive. Look at the remaining */
10209 for (i = 0; i < Binfo.count; i++) {
10210 if (memEQ(scan + definitive_len,
10211 matches + definitive_len,
10212 Binfo.lengths[i] - definitive_len))
10214 goto found_a_completion;
10217 matches += Binfo.lengths[i];
10220 /* Didn't find anything to complete our initial match. Stop
10224 found_a_completion:
10226 /* Here, matched a full character, Include it in the result,
10227 * and then look to see if the next char matches */
10229 scan += Binfo.lengths[i];
10231 } while ( hardcount < max
10232 && scan + definitive_len < this_eol
10233 && S_test_EXACTISH_ST(scan, Binfo));
10235 /* Here, have advanced as far as possible */
10237 } /* End of found some initial bytes that definitively matched */
10239 /* Here, we can't rule out that we have found the beginning of 'B', but
10240 * there were no initial bytes that could rule out anything
10241 * definitively. Use brute force to examine all the possibilities */
10242 while (scan < this_eol && hardcount < max) {
10244 U8 * matches = Binfo.matches;
10246 for (i = 0; i < Binfo.count; i++) {
10247 if (memEQ(scan, matches, Binfo.lengths[i])) {
10251 matches += Binfo.lengths[i];
10258 scan += Binfo.lengths[i];
10264 case ANYOFPOSIXL_t8:
10266 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10267 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10272 while ( hardcount < max
10274 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10276 scan += UTF8SKIP(scan);
10281 case ANYOFPOSIXL_tb:
10283 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10284 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10289 if (ANYOF_FLAGS(p) || ANYOF_HAS_AUX(p)) {
10290 while ( scan < this_eol
10291 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10295 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10301 if (this_eol - scan > max) {
10303 /* We didn't adjust <this_eol> at the beginning of this routine
10304 * because is UTF-8, but it is actually ok to do so, since here, to
10305 * match, 1 char == 1 byte. */
10306 this_eol = scan + max;
10311 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10312 (U8) ARG(p), FLAGS(p));
10316 while ( hardcount < max
10318 && (*scan & FLAGS(p)) != ARG(p))
10320 scan += UTF8SKIP(scan);
10326 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10327 (U8) ARG(p), FLAGS(p));
10330 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10338 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10339 while ( hardcount < max
10341 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10342 && _invlist_contains_cp(anyofh_list,
10343 utf8_to_uvchr_buf((U8 *) scan,
10347 scan += UTF8SKIP(scan);
10353 /* we know the first byte must be the FLAGS field */
10354 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10355 while ( hardcount < max
10357 && (U8) *scan == ANYOF_FLAGS(p)
10358 && _invlist_contains_cp(anyofh_list,
10359 utf8_to_uvchr_buf((U8 *) scan,
10363 scan += UTF8SKIP(scan);
10369 while ( hardcount < max
10370 && scan + 1 < this_eol
10371 && (U8) *scan == ANYOF_FLAGS(p)
10372 && BITMAP_TEST(( (struct regnode_bbm *) p)->bitmap,
10373 (U8) scan[1] & UTF_CONTINUATION_MASK))
10375 scan += 2; /* This node only matces 2-byte UTF-8 */
10381 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10382 while ( hardcount < max
10384 && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10385 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10386 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10387 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10388 && _invlist_contains_cp(anyofh_list,
10389 utf8_to_uvchr_buf((U8 *) scan,
10393 scan += UTF8SKIP(scan);
10399 anyofh_list = GET_ANYOFH_INVLIST(prog, p);
10400 while ( hardcount < max
10401 && scan + FLAGS(p) < this_eol
10402 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10403 && _invlist_contains_cp(anyofh_list,
10404 utf8_to_uvchr_buf((U8 *) scan,
10408 scan += UTF8SKIP(scan);
10414 while ( hardcount < max
10416 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10417 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10420 ANYOFRbase(p), ANYOFRdelta(p)))
10422 scan += UTF8SKIP(scan);
10428 while ( hardcount < max
10430 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10438 while ( hardcount < max
10440 && (U8) *scan == ANYOF_FLAGS(p)
10441 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10444 ANYOFRbase(p), ANYOFRdelta(p)))
10446 scan += UTF8SKIP(scan);
10452 while ( hardcount < max
10454 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10461 /* The argument (FLAGS) to all the POSIX node types is the class number */
10468 CHECK_AND_WARN_PROBLEMATIC_LOCALE_;
10469 while ( scan < this_eol
10470 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10481 while ( hardcount < max && scan < this_eol
10482 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10486 scan += UTF8SKIP(scan);
10495 if (this_eol - scan > max) {
10497 /* We didn't adjust <this_eol> at the beginning of this routine
10498 * because is UTF-8, but it is actually ok to do so, since here, to
10499 * match, 1 char == 1 byte. */
10500 this_eol = scan + max;
10505 while (scan < this_eol && generic_isCC_A_((U8) *scan, FLAGS(p))) {
10514 while (scan < this_eol && ! generic_isCC_A_((U8) *scan, FLAGS(p))) {
10521 /* The complement of something that matches only ASCII matches all
10522 * non-ASCII, plus everything in ASCII that isn't in the class. */
10523 while ( hardcount < max && scan < this_eol
10524 && ( ! isASCII_utf8_safe(scan, loceol)
10525 || ! generic_isCC_A_((U8) *scan, FLAGS(p))))
10527 scan += UTF8SKIP(scan);
10537 while ( scan < this_eol
10538 && to_complement ^ cBOOL(generic_isCC_((U8) *scan, FLAGS(p))))
10551 classnum = (char_class_number_) FLAGS(p);
10552 switch (classnum) {
10554 while ( hardcount < max && scan < this_eol
10556 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10557 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10559 scan += UTF8SKIP(scan);
10564 /* For the classes below, the knowledge of how to handle every code
10565 * point is compiled into Perl via a macro. This code is written
10566 * for making the loops as tight as possible. It could be
10567 * refactored to save space instead. */
10569 case CC_ENUM_SPACE_:
10570 while ( hardcount < max
10573 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10575 scan += UTF8SKIP(scan);
10579 case CC_ENUM_BLANK_:
10580 while ( hardcount < max
10583 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10585 scan += UTF8SKIP(scan);
10589 case CC_ENUM_XDIGIT_:
10590 while ( hardcount < max
10593 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10595 scan += UTF8SKIP(scan);
10599 case CC_ENUM_VERTSPACE_:
10600 while ( hardcount < max
10603 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10605 scan += UTF8SKIP(scan);
10609 case CC_ENUM_CNTRL_:
10610 while ( hardcount < max
10613 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10615 scan += UTF8SKIP(scan);
10623 while ( hardcount < max && scan < this_eol
10624 && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10632 /* LNBREAK can match one or two latin chars, which is ok, but we have
10633 * to use hardcount in this situation, and throw away the adjustment to
10634 * <this_eol> done before the switch statement */
10636 hardcount < max && scan < loceol
10637 && (c = is_LNBREAK_latin1_safe(scan, loceol))
10645 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10646 " node type %d='%s'", OP(p), REGNODE_NAME(OP(p)));
10647 NOT_REACHED; /* NOTREACHED */
10654 c = scan - *startposp;
10658 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10660 SV * const prop = sv_newmortal();
10661 regprop(prog, prop, p, reginfo, NULL);
10662 Perl_re_exec_indentf( aTHX_
10663 "%s can match %" IVdf " times out of %" IVdf "...\n",
10664 depth, SvPVX_const(prop),(IV)c,(IV)max);
10672 - reginclass - determine if a character falls into a character class
10674 n is the ANYOF-type regnode
10675 p is the target string
10676 p_end points to one byte beyond the end of the target string
10677 utf8_target tells whether p is in UTF-8.
10679 Returns true if matched; false otherwise.
10681 Note that this can be a synthetic start class, a combination of various
10682 nodes, so things you think might be mutually exclusive, such as locale,
10683 aren't. It can match both locale and non-locale
10688 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10690 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10693 bool match = FALSE;
10696 PERL_ARGS_ASSERT_REGINCLASS;
10698 /* If c is not already the code point, get it. Note that
10699 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10700 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10702 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10703 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10704 if (c_len == (STRLEN)-1) {
10705 _force_out_malformed_utf8_message(p, p_end,
10707 1 /* 1 means die */ );
10708 NOT_REACHED; /* NOTREACHED */
10711 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10712 && ! (flags & ANYOFL_UTF8_LOCALE_REQD))
10714 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10718 /* If this character is potentially in the bitmap, check it */
10719 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10720 if (ANYOF_BITMAP_TEST(n, c))
10722 else if ( (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared)
10729 else if (flags & ANYOF_LOCALE_FLAGS) {
10730 if ( (flags & ANYOFL_FOLD)
10732 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10736 else if ( ANYOF_POSIXL_TEST_ANY_SET(n)
10737 && c <= U8_MAX /* param to isFOO_lc() */
10739 /* The data structure is arranged so bits 0, 2, 4, ... are set
10740 * if the class includes the Posix character class given by
10741 * bit/2; and 1, 3, 5, ... are set if the class includes the
10742 * complemented Posix class given by int(bit/2), so the
10743 * remainder modulo 2 tells us if to complement or not.
10745 * Note that this code assumes that all the classes are closed
10746 * under folding. For example, if a character matches \w, then
10747 * its fold does too; and vice versa. This should be true for
10748 * any well-behaved locale for all the currently defined Posix
10749 * classes, except for :lower: and :upper:, which are handled
10750 * by the pseudo-class :cased: which matches if either of the
10751 * other two does. To get rid of this assumption, an outer
10752 * loop could be used below to iterate over both the source
10753 * character, and its fold (if different) */
10755 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10758 /* Find the next set bit indicating a class to try matching
10760 U8 bit_pos = lsbit_pos32(posixl_bits);
10762 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10767 /* Remove this class from consideration; repeat */
10768 POSIXL_CLEAR(posixl_bits, bit_pos);
10769 } while(posixl_bits != 0);
10774 /* If the bitmap didn't (or couldn't) match, and something outside the
10775 * bitmap could match, try that. */
10777 if ( c >= NUM_ANYOF_CODE_POINTS
10778 && ANYOF_ONLY_HAS_BITMAP(n)
10779 && ! (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES))
10781 /* In this case, the ARG is set up so that the final bit indicates
10782 * whether it matches or not */
10783 match = ARG(n) & 1;
10786 /* Here, the main way it could match is if the code point is
10787 * outside the bitmap and an inversion list exists to handle such
10788 * things. The other ways all occur when a flag is set to indicate
10789 * we need special handling. That handling doesn't come in to
10790 * effect for ANYOFD nodes unless the target string is UTF-8 and
10791 * that matters to code point being matched. */
10792 if ( c >= NUM_ANYOF_CODE_POINTS
10793 || ( (flags & ANYOF_HAS_EXTRA_RUNTIME_MATCHES)
10794 && ( UNLIKELY(OP(n) != ANYOFD)
10795 || (utf8_target && ! isASCII_uvchr(c)
10796 # if NUM_ANYOF_CODE_POINTS > 256
10801 /* Here, we have an inversion list for outside-the-bitmap code
10802 * points and/or the flag is set indicating special handling is
10803 * needed. The flag is set when there is auxiliary data beyond the
10804 * normal inversion list, or if there is the possibility of a
10805 * special Turkic locale match, even without auxiliary data.
10807 * First check if there is an inversion list and/or auxiliary data.
10809 if (ANYOF_HAS_AUX(n)) {
10810 SV* only_utf8_locale = NULL;
10812 /* This call will return in 'definition' the union of the base
10813 * non-bitmap inversion list, if any, plus the deferred
10814 * definitions of user-defined properties, if any. It croaks
10815 * if there is such a property but which still has no definition
10817 SV * const definition = GET_REGCLASS_AUX_DATA(prog, n, TRUE, 0,
10818 &only_utf8_locale, NULL);
10820 /* Most likely is the outside-the-bitmap inversion list. */
10821 if (_invlist_contains_cp(definition, c)) {
10824 else /* Failing that, hardcode the two tests for a Turkic
10826 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10827 && isALPHA_FOLD_EQ(*p, 'i'))
10829 /* Turkish locales have these hard-coded rules
10830 * overriding normal ones */
10832 if (_invlist_contains_cp(definition,
10833 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10838 else if (_invlist_contains_cp(definition,
10839 LATIN_SMALL_LETTER_DOTLESS_I))
10846 if ( UNLIKELY(only_utf8_locale)
10847 && UNLIKELY(IN_UTF8_CTYPE_LOCALE)
10850 match = _invlist_contains_cp(only_utf8_locale, c);
10854 /* In a Turkic locale under folding, hard-code the I i case pair
10855 * matches; these wouldn't have the ANYOF_HAS_EXTRA_RUNTIME_MATCHES
10856 * flag set unless [Ii] were match possibilities */
10857 if (UNLIKELY(PL_in_utf8_turkic_locale) && ! match) {
10859 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10860 if (ANYOF_BITMAP_TEST(n, 'i')) {
10864 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10865 if (ANYOF_BITMAP_TEST(n, 'I')) {
10871 #if NUM_ANYOF_CODE_POINTS > 256
10872 /* Larger bitmap means these special cases aren't handled
10873 * outside the bitmap above. */
10875 if (ANYOF_BITMAP_TEST(n,
10876 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10881 else if (*p == 'I') {
10882 if (ANYOF_BITMAP_TEST(n, LATIN_SMALL_LETTER_DOTLESS_I)) {
10890 if ( UNICODE_IS_SUPER(c)
10891 && (flags & ANYOF_WARN_SUPER__shared)
10893 && ckWARN_d(WARN_NON_UNICODE))
10895 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10896 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10900 #if ANYOF_INVERT != 1
10901 /* Depending on compiler optimization cBOOL takes time, so if don't have to
10903 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10906 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10907 return (flags & ANYOF_INVERT) ^ match;
10911 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10913 /* return the position 'off' UTF-8 characters away from 's', forward if
10914 * 'off' >= 0, backwards if negative. But don't go outside of position
10915 * 'lim', which better be < s if off < 0 */
10917 PERL_ARGS_ASSERT_REGHOP3;
10920 while (off-- && s < lim) {
10921 /* XXX could check well-formedness here */
10922 U8 *new_s = s + UTF8SKIP(s);
10923 if (new_s > lim) /* lim may be in the middle of a long character */
10929 while (off++ && s > lim) {
10931 if (UTF8_IS_CONTINUED(*s)) {
10932 while (s > lim && UTF8_IS_CONTINUATION(*s))
10934 if (! UTF8_IS_START(*s)) {
10935 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10938 /* XXX could check well-formedness here */
10945 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10947 PERL_ARGS_ASSERT_REGHOP4;
10950 while (off-- && s < rlim) {
10951 /* XXX could check well-formedness here */
10956 while (off++ && s > llim) {
10958 if (UTF8_IS_CONTINUED(*s)) {
10959 while (s > llim && UTF8_IS_CONTINUATION(*s))
10961 if (! UTF8_IS_START(*s)) {
10962 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10965 /* XXX could check well-formedness here */
10971 /* like reghop3, but returns NULL on overrun, rather than returning last
10975 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10977 PERL_ARGS_ASSERT_REGHOPMAYBE3;
10980 while (off-- && s < lim) {
10981 /* XXX could check well-formedness here */
10988 while (off++ && s > lim) {
10990 if (UTF8_IS_CONTINUED(*s)) {
10991 while (s > lim && UTF8_IS_CONTINUATION(*s))
10993 if (! UTF8_IS_START(*s)) {
10994 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10997 /* XXX could check well-formedness here */
11006 /* when executing a regex that may have (?{}), extra stuff needs setting
11007 up that will be visible to the called code, even before the current
11008 match has finished. In particular:
11010 * $_ is localised to the SV currently being matched;
11011 * pos($_) is created if necessary, ready to be updated on each call-out
11013 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
11014 isn't set until the current pattern is successfully finished), so that
11015 $1 etc of the match-so-far can be seen;
11016 * save the old values of subbeg etc of the current regex, and set then
11017 to the current string (again, this is normally only done at the end
11022 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
11025 regexp *const rex = ReANY(reginfo->prog);
11026 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
11028 eval_state->rex = rex;
11029 eval_state->sv = reginfo->sv;
11032 /* Make $_ available to executed code. */
11033 if (reginfo->sv != DEFSV) {
11035 DEFSV_set(reginfo->sv);
11037 /* will be dec'd by S_cleanup_regmatch_info_aux */
11038 SvREFCNT_inc_NN(reginfo->sv);
11040 if (!(mg = mg_find_mglob(reginfo->sv))) {
11041 /* prepare for quick setting of pos */
11042 mg = sv_magicext_mglob(reginfo->sv);
11045 eval_state->pos_magic = mg;
11046 eval_state->pos = mg->mg_len;
11047 eval_state->pos_flags = mg->mg_flags;
11050 eval_state->pos_magic = NULL;
11052 if (!PL_reg_curpm) {
11053 /* PL_reg_curpm is a fake PMOP that we can attach the current
11054 * regex to and point PL_curpm at, so that $1 et al are visible
11055 * within a /(?{})/. It's just allocated once per interpreter the
11056 * first time its needed */
11057 Newxz(PL_reg_curpm, 1, PMOP);
11058 #ifdef USE_ITHREADS
11060 SV* const repointer = &PL_sv_undef;
11061 /* this regexp is also owned by the new PL_reg_curpm, which
11062 will try to free it. */
11063 av_push(PL_regex_padav, repointer);
11064 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
11065 PL_regex_pad = AvARRAY(PL_regex_padav);
11069 SET_reg_curpm(reginfo->prog);
11070 eval_state->curpm = PL_curpm;
11071 PL_curpm_under = PL_curpm;
11072 PL_curpm = PL_reg_curpm;
11073 if (RXp_MATCH_COPIED(rex)) {
11074 /* Here is a serious problem: we cannot rewrite subbeg,
11075 since it may be needed if this match fails. Thus
11076 $` inside (?{}) could fail... */
11077 eval_state->subbeg = rex->subbeg;
11078 eval_state->sublen = rex->sublen;
11079 eval_state->suboffset = rex->suboffset;
11080 eval_state->subcoffset = rex->subcoffset;
11081 #ifdef PERL_ANY_COW
11082 eval_state->saved_copy = rex->saved_copy;
11084 RXp_MATCH_COPIED_off(rex);
11087 eval_state->subbeg = NULL;
11088 rex->subbeg = (char *)reginfo->strbeg;
11089 rex->suboffset = 0;
11090 rex->subcoffset = 0;
11091 rex->sublen = reginfo->strend - reginfo->strbeg;
11095 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
11098 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
11100 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
11101 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
11104 Safefree(aux->poscache);
11108 /* undo the effects of S_setup_eval_state() */
11110 if (eval_state->subbeg) {
11111 regexp * const rex = eval_state->rex;
11112 rex->subbeg = eval_state->subbeg;
11113 rex->sublen = eval_state->sublen;
11114 rex->suboffset = eval_state->suboffset;
11115 rex->subcoffset = eval_state->subcoffset;
11116 #ifdef PERL_ANY_COW
11117 rex->saved_copy = eval_state->saved_copy;
11119 RXp_MATCH_COPIED_on(rex);
11121 if (eval_state->pos_magic)
11123 eval_state->pos_magic->mg_len = eval_state->pos;
11124 eval_state->pos_magic->mg_flags =
11125 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
11126 | (eval_state->pos_flags & MGf_BYTES);
11129 PL_curpm = eval_state->curpm;
11130 SvREFCNT_dec(eval_state->sv);
11133 PL_regmatch_state = aux->old_regmatch_state;
11134 PL_regmatch_slab = aux->old_regmatch_slab;
11136 /* free all slabs above current one - this must be the last action
11137 * of this function, as aux and eval_state are allocated within
11138 * slabs and may be freed here */
11140 s = PL_regmatch_slab->next;
11142 PL_regmatch_slab->next = NULL;
11144 regmatch_slab * const osl = s;
11153 S_to_utf8_substr(pTHX_ regexp *prog)
11155 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11156 * on the converted value */
11160 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11163 if (prog->substrs->data[i].substr
11164 && !prog->substrs->data[i].utf8_substr) {
11165 SV* const sv = newSVsv(prog->substrs->data[i].substr);
11166 prog->substrs->data[i].utf8_substr = sv;
11167 sv_utf8_upgrade(sv);
11168 if (SvVALID(prog->substrs->data[i].substr)) {
11169 if (SvTAIL(prog->substrs->data[i].substr)) {
11170 /* Trim the trailing \n that fbm_compile added last
11172 SvCUR_set(sv, SvCUR(sv) - 1);
11173 /* Whilst this makes the SV technically "invalid" (as its
11174 buffer is no longer followed by "\0") when fbm_compile()
11175 adds the "\n" back, a "\0" is restored. */
11176 fbm_compile(sv, FBMcf_TAIL);
11178 fbm_compile(sv, 0);
11180 if (prog->substrs->data[i].substr == prog->check_substr)
11181 prog->check_utf8 = sv;
11187 S_to_byte_substr(pTHX_ regexp *prog)
11189 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11190 * on the converted value; returns FALSE if can't be converted. */
11194 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11197 if (prog->substrs->data[i].utf8_substr
11198 && !prog->substrs->data[i].substr) {
11199 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11200 if (! sv_utf8_downgrade(sv, TRUE)) {
11201 SvREFCNT_dec_NN(sv);
11204 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11205 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11206 /* Trim the trailing \n that fbm_compile added last
11208 SvCUR_set(sv, SvCUR(sv) - 1);
11209 fbm_compile(sv, FBMcf_TAIL);
11211 fbm_compile(sv, 0);
11213 prog->substrs->data[i].substr = sv;
11214 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11215 prog->check_substr = sv;
11222 #ifndef PERL_IN_XSUB_RE
11225 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11227 /* Temporary helper function for toke.c. Verify that the code point 'cp'
11228 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
11229 * the larger string bounded by 'strbeg' and 'strend'.
11231 * 'cp' needs to be assigned (if not, a future version of the Unicode
11232 * Standard could make it something that combines with adjacent characters,
11233 * so code using it would then break), and there has to be a GCB break
11234 * before and after the character. */
11237 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11238 const U8 * prev_cp_start;
11240 PERL_ARGS_ASSERT_IS_GRAPHEME;
11242 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
11243 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11245 /* These are considered graphemes */
11249 /* Otherwise, unassigned code points are forbidden */
11250 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11251 _invlist_search(PL_Assigned_invlist, cp))))
11256 cp_gcb_val = getGCB_VAL_CP(cp);
11258 /* Find the GCB value of the previous code point in the input */
11259 prev_cp_start = utf8_hop_back(s, -1, strbeg);
11260 if (UNLIKELY(prev_cp_start == s)) {
11261 prev_cp_gcb_val = GCB_EDGE;
11264 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11267 /* And check that is a grapheme boundary */
11268 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11269 TRUE /* is UTF-8 encoded */ ))
11274 /* Similarly verify there is a break between the current character and the
11278 next_cp_gcb_val = GCB_EDGE;
11281 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11284 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11288 =for apidoc_section $unicode
11290 =for apidoc isSCRIPT_RUN
11292 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11293 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
11294 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
11295 two degenerate cases given below, this function returns TRUE iff all code
11296 points in it come from any combination of three "scripts" given by the Unicode
11297 "Script Extensions" property: Common, Inherited, and possibly one other.
11298 Additionally all decimal digits must come from the same consecutive sequence of
11301 For example, if all the characters in the sequence are Greek, or Common, or
11302 Inherited, this function will return TRUE, provided any decimal digits in it
11303 are from the same block of digits in Common. (These are the ASCII digits
11304 "0".."9" and additionally a block for full width forms of these, and several
11305 others used in mathematical notation.) For scripts (unlike Greek) that have
11306 their own digits defined this will accept either digits from that set or from
11307 one of the Common digit sets, but not a combination of the two. Some scripts,
11308 such as Arabic, have more than one set of digits. All digits must come from
11309 the same set for this function to return TRUE.
11311 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11312 contain the script found, using the C<SCX_enum> typedef. Its value will be
11313 C<SCX_INVALID> if the function returns FALSE.
11315 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11316 will be C<SCX_INVALID>.
11318 If the sequence contains a single code point which is unassigned to a character
11319 in the version of Unicode being used, the function will return TRUE, and the
11320 script will be C<SCX_Unknown>. Any other combination of unassigned code points
11321 in the input sequence will result in the function treating the input as not
11322 being a script run.
11324 The returned script will be C<SCX_Inherited> iff all the code points in it are
11325 from the Inherited script.
11327 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11328 it are from the Inherited or Common scripts.
11335 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11337 /* Basically, it looks at each character in the sequence to see if the
11338 * above conditions are met; if not it fails. It uses an inversion map to
11339 * find the enum corresponding to the script of each character. But this
11340 * is complicated by the fact that a few code points can be in any of
11341 * several scripts. The data has been constructed so that there are
11342 * additional enum values (all negative) for these situations. The
11343 * absolute value of those is an index into another table which contains
11344 * pointers to auxiliary tables for each such situation. Each aux array
11345 * lists all the scripts for the given situation. There is another,
11346 * parallel, table that gives the number of entries in each aux table.
11347 * These are all defined in charclass_invlists.h */
11349 /* XXX Here are the additional things UTS 39 says could be done:
11351 * Forbid sequences of the same nonspacing mark
11353 * Check to see that all the characters are in the sets of exemplar
11354 * characters for at least one language in the Unicode Common Locale Data
11355 * Repository [CLDR]. */
11358 /* Things that match /\d/u */
11359 SV * decimals_invlist = PL_XPosix_ptrs[CC_DIGIT_];
11360 UV * decimals_array = invlist_array(decimals_invlist);
11362 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11363 * not currently known) */
11364 UV zero_of_run = 0;
11366 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
11367 SCX_enum script_of_char = SCX_INVALID;
11369 /* If the script remains not fully determined from iteration to iteration,
11370 * this is the current intersection of the possiblities. */
11371 SCX_enum * intersection = NULL;
11372 PERL_UINT_FAST8_T intersection_len = 0;
11374 bool retval = TRUE;
11375 SCX_enum * ret_script = NULL;
11379 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11381 /* All code points in 0..255 are either Common or Latin, so must be a
11382 * script run. We can return immediately unless we need to know which
11384 if (! utf8_target && LIKELY(send > s)) {
11385 if (ret_script == NULL) {
11389 /* If any character is Latin, the run is Latin */
11391 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11392 *ret_script = SCX_Latin;
11397 /* Here, all are Common */
11398 *ret_script = SCX_Common;
11402 /* Look at each character in the sequence */
11404 /* If the current character being examined is a digit, this is the code
11405 * point of the zero for its sequence of 10 */
11410 /* The code allows all scripts to use the ASCII digits. This is
11411 * because they are in the Common script. Hence any ASCII ones found
11412 * are ok, unless and until a digit from another set has already been
11413 * encountered. digit ranges in Common are not similarly blessed) */
11414 if (UNLIKELY(isDIGIT(*s))) {
11415 if (UNLIKELY(script_of_run == SCX_Unknown)) {
11420 if (zero_of_run != '0') {
11432 /* Here, isn't an ASCII digit. Find the code point of the character */
11433 if (! UTF8_IS_INVARIANT(*s)) {
11435 cp = valid_utf8_to_uvchr((U8 *) s, &len);
11442 /* If is within the range [+0 .. +9] of the script's zero, it also is a
11443 * digit in that script. We can skip the rest of this code for this
11445 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11449 /* Find the character's script. The correct values are hard-coded here
11450 * for small-enough code points. */
11451 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
11452 unlikely to change */
11454 || ( isALPHA_L1(cp)
11455 && LIKELY(cp != MICRO_SIGN_NATIVE)))
11457 script_of_char = SCX_Latin;
11460 script_of_char = SCX_Common;
11464 script_of_char = _Perl_SCX_invmap[
11465 _invlist_search(PL_SCX_invlist, cp)];
11468 /* We arbitrarily accept a single unassigned character, but not in
11469 * combination with anything else, and not a run of them. */
11470 if ( UNLIKELY(script_of_run == SCX_Unknown)
11471 || UNLIKELY( script_of_run != SCX_INVALID
11472 && script_of_char == SCX_Unknown))
11478 /* For the first character, or the run is inherited, the run's script
11479 * is set to the char's */
11480 if ( UNLIKELY(script_of_run == SCX_INVALID)
11481 || UNLIKELY(script_of_run == SCX_Inherited))
11483 script_of_run = script_of_char;
11486 /* For the character's script to be Unknown, it must be the first
11487 * character in the sequence (for otherwise a test above would have
11488 * prevented us from reaching here), and we have set the run's script
11489 * to it. Nothing further to be done for this character */
11490 if (UNLIKELY(script_of_char == SCX_Unknown)) {
11494 /* We accept 'inherited' script characters currently even at the
11495 * beginning. (We know that no characters in Inherited are digits, or
11496 * we'd have to check for that) */
11497 if (UNLIKELY(script_of_char == SCX_Inherited)) {
11501 /* If the run so far is Common, and the new character isn't, change the
11502 * run's script to that of this character */
11503 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11504 script_of_run = script_of_char;
11507 /* Now we can see if the script of the new character is the same as
11508 * that of the run */
11509 if (LIKELY(script_of_char == script_of_run)) {
11510 /* By far the most common case */
11511 goto scripts_match;
11514 /* Here, the script of the run isn't Common. But characters in Common
11515 * match any script */
11516 if (script_of_char == SCX_Common) {
11517 goto scripts_match;
11520 #ifndef HAS_SCX_AUX_TABLES
11522 /* Too early a Unicode version to have a code point belonging to more
11523 * than one script, so, if the scripts don't exactly match, fail */
11524 PERL_UNUSED_VAR(intersection_len);
11530 /* Here there is no exact match between the character's script and the
11531 * run's. And we've handled the special cases of scripts Unknown,
11532 * Inherited, and Common.
11534 * Negative script numbers signify that the value may be any of several
11535 * scripts, and we need to look at auxiliary information to make our
11536 * deterimination. But if both are non-negative, we can fail now */
11537 if (LIKELY(script_of_char >= 0)) {
11538 const SCX_enum * search_in;
11539 PERL_UINT_FAST8_T search_in_len;
11540 PERL_UINT_FAST8_T i;
11542 if (LIKELY(script_of_run >= 0)) {
11547 /* Use the previously constructed set of possible scripts, if any.
11549 if (intersection) {
11550 search_in = intersection;
11551 search_in_len = intersection_len;
11554 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11555 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11558 for (i = 0; i < search_in_len; i++) {
11559 if (search_in[i] == script_of_char) {
11560 script_of_run = script_of_char;
11561 goto scripts_match;
11568 else if (LIKELY(script_of_run >= 0)) {
11569 /* script of character could be one of several, but run is a single
11571 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11572 const PERL_UINT_FAST8_T search_in_len
11573 = SCX_AUX_TABLE_lengths[-script_of_char];
11574 PERL_UINT_FAST8_T i;
11576 for (i = 0; i < search_in_len; i++) {
11577 if (search_in[i] == script_of_run) {
11578 script_of_char = script_of_run;
11579 goto scripts_match;
11587 /* Both run and char could be in one of several scripts. If the
11588 * intersection is empty, then this character isn't in this script
11589 * run. Otherwise, we need to calculate the intersection to use
11590 * for future iterations of the loop, unless we are already at the
11591 * final character */
11592 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11593 const PERL_UINT_FAST8_T char_len
11594 = SCX_AUX_TABLE_lengths[-script_of_char];
11595 const SCX_enum * search_run;
11596 PERL_UINT_FAST8_T run_len;
11598 SCX_enum * new_overlap = NULL;
11599 PERL_UINT_FAST8_T i, j;
11601 if (intersection) {
11602 search_run = intersection;
11603 run_len = intersection_len;
11606 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11607 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11610 intersection_len = 0;
11612 for (i = 0; i < run_len; i++) {
11613 for (j = 0; j < char_len; j++) {
11614 if (search_run[i] == search_char[j]) {
11616 /* Here, the script at i,j matches. That means this
11617 * character is in the run. But continue on to find
11618 * the complete intersection, for the next loop
11619 * iteration, and for the digit check after it.
11621 * On the first found common script, we malloc space
11622 * for the intersection list for the worst case of the
11623 * intersection, which is the minimum of the number of
11624 * scripts remaining in each set. */
11625 if (intersection_len == 0) {
11627 MIN(run_len - i, char_len - j),
11630 new_overlap[intersection_len++] = search_run[i];
11635 /* Here we've looked through everything. If they have no scripts
11636 * in common, not a run */
11637 if (intersection_len == 0) {
11642 /* If there is only a single script in common, set to that.
11643 * Otherwise, use the intersection going forward */
11644 Safefree(intersection);
11645 intersection = NULL;
11646 if (intersection_len == 1) {
11647 script_of_run = script_of_char = new_overlap[0];
11648 Safefree(new_overlap);
11649 new_overlap = NULL;
11652 intersection = new_overlap;
11660 /* Here, the script of the character is compatible with that of the
11661 * run. That means that in most cases, it continues the script run.
11662 * Either it and the run match exactly, or one or both can be in any of
11663 * several scripts, and the intersection is not empty. However, if the
11664 * character is a decimal digit, it could still mean failure if it is
11665 * from the wrong sequence of 10. So, we need to look at if it's a
11666 * digit. We've already handled the 10 digits [0-9], and the next
11667 * lowest one is this one: */
11668 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11669 continue; /* Not a digit; this character is part of the run */
11672 /* If we have a definitive '0' for the script of this character, we
11673 * know that for this to be a digit, it must be in the range of +0..+9
11675 if ( script_of_char >= 0
11676 && (zero_of_char = script_zeros[script_of_char]))
11678 if (! withinCOUNT(cp, zero_of_char, 9)) {
11679 continue; /* Not a digit; this character is part of the run
11684 else { /* Need to look up if this character is a digit or not */
11685 SSize_t index_of_zero_of_char;
11686 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11687 if ( UNLIKELY(index_of_zero_of_char < 0)
11688 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11690 continue; /* Not a digit; this character is part of the run.
11694 zero_of_char = decimals_array[index_of_zero_of_char];
11697 /* Here, the character is a decimal digit, and the zero of its sequence
11698 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
11699 * they better be the same. */
11701 if (zero_of_run != zero_of_char) {
11706 else { /* Otherwise we now have a zero for this run */
11707 zero_of_run = zero_of_char;
11709 } /* end of looping through CLOSESR text */
11711 Safefree(intersection);
11713 if (ret_script != NULL) {
11715 *ret_script = script_of_run;
11718 *ret_script = SCX_INVALID;
11725 #endif /* ifndef PERL_IN_XSUB_RE */
11728 * ex: set ts=8 sts=4 sw=4 et: