5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
86 static const char b_utf8_locale_required[] =
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88 " Assuming a UTF-8 locale";
90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
92 if (! IN_UTF8_CTYPE_LOCALE) { \
93 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
94 b_utf8_locale_required); \
98 static const char sets_utf8_locale_required[] =
99 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
103 if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \
104 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
105 sets_utf8_locale_required); \
110 /* At least one required character in the target string is expressible only in
112 static const char non_utf8_target_but_utf8_required[]
113 = "Can't match, because target string needs to be in UTF-8\n";
116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
117 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
122 #define STATIC static
129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131 #define HOPc(pos,off) \
132 (char *)(reginfo->is_utf8_target \
133 ? reghop3((U8*)pos, off, \
134 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138 #define HOPBACK3(pos, off, lim) \
139 (reginfo->is_utf8_target \
140 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141 : (pos - off >= lim) \
145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150 /* lim must be +ve. Returns NULL on overshoot */
151 #define HOPMAYBE3(pos,off,lim) \
152 (reginfo->is_utf8_target \
153 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
154 : ((U8*)pos + off <= lim) \
158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159 * off must be >=0; args should be vars rather than expressions */
160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162 : (U8*)((pos + off) > lim ? lim : (pos + off)))
163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173 /* for use after a quantifier and before an EXACT-like node -- japhy */
174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176 * NOTE that *nothing* that affects backtracking should be in here, specifically
177 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
178 * node that is in between two EXACT like nodes when ascertaining what the required
179 * "follow" character is. This should probably be moved to regex compile time
180 * although it may be done at run time beause of the REF possibility - more
181 * investigation required. -- demerphq
183 #define JUMPABLE(rn) ( \
185 (OP(rn) == CLOSE && \
186 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
188 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
189 OP(rn) == PLUS || OP(rn) == MINMOD || \
191 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
198 Search for mandatory following text node; for lookahead, the text must
199 follow but for lookbehind (rn->flags != 0) we skip to the next step.
201 #define FIND_NEXT_IMPT(rn) STMT_START { \
202 while (JUMPABLE(rn)) { \
203 const OPCODE type = OP(rn); \
204 if (type == SUSPEND || PL_regkind[type] == CURLY) \
205 rn = NEXTOPER(NEXTOPER(rn)); \
206 else if (type == PLUS) \
208 else if (type == IFMATCH) \
209 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
210 else rn += NEXT_OFF(rn); \
214 #define SLAB_FIRST(s) (&(s)->states[0])
215 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
219 static regmatch_state * S_push_slab(pTHX);
221 #define REGCP_PAREN_ELEMS 3
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
230 const int retval = PL_savestack_ix;
231 const int paren_elems_to_push =
232 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
233 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
236 DECLARE_AND_GET_RE_DEBUG_FLAGS;
238 PERL_ARGS_ASSERT_REGCPPUSH;
240 if (paren_elems_to_push < 0)
241 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242 (int)paren_elems_to_push, (int)maxopenparen,
243 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
245 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
246 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
247 " out of range (%lu-%ld)",
249 (unsigned long)maxopenparen,
252 SSGROW(total_elems + REGCP_FRAME_ELEMS);
255 if ((int)maxopenparen > (int)parenfloor)
256 Perl_re_exec_indentf( aTHX_
257 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
263 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
265 SSPUSHIV(rex->offs[p].end);
266 SSPUSHIV(rex->offs[p].start);
267 SSPUSHINT(rex->offs[p].start_tmp);
268 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
269 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
272 (IV)rex->offs[p].start,
273 (IV)rex->offs[p].start_tmp,
277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
278 SSPUSHINT(maxopenparen);
279 SSPUSHINT(rex->lastparen);
280 SSPUSHINT(rex->lastcloseparen);
281 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
286 /* These are needed since we do not localize EVAL nodes: */
287 #define REGCP_SET(cp) \
289 Perl_re_exec_indentf( aTHX_ \
290 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
291 depth, (IV)PL_savestack_ix \
296 #define REGCP_UNWIND(cp) \
298 if (cp != PL_savestack_ix) \
299 Perl_re_exec_indentf( aTHX_ \
300 "Clearing an EVAL scope, savestack=%" \
301 IVdf "..%" IVdf "\n", \
302 depth, (IV)(cp), (IV)PL_savestack_ix \
307 /* set the start and end positions of capture ix */
308 #define CLOSE_CAPTURE(ix, s, e) \
309 rex->offs[ix].start = s; \
310 rex->offs[ix].end = e; \
311 if (ix > rex->lastparen) \
312 rex->lastparen = ix; \
313 rex->lastcloseparen = ix; \
314 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
315 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
320 (IV)rex->offs[ix].start, \
321 (IV)rex->offs[ix].end, \
325 #define UNWIND_PAREN(lp, lcp) \
326 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
327 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
332 (UV)(rex->lastparen), \
335 for (n = rex->lastparen; n > lp; n--) \
336 rex->offs[n].end = -1; \
337 rex->lastparen = n; \
338 rex->lastcloseparen = lcp;
342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
346 DECLARE_AND_GET_RE_DEBUG_FLAGS;
348 PERL_ARGS_ASSERT_REGCPPOP;
350 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
352 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
354 rex->lastcloseparen = SSPOPINT;
355 rex->lastparen = SSPOPINT;
356 *maxopenparen_p = SSPOPINT;
358 i -= REGCP_OTHER_ELEMS;
359 /* Now restore the parentheses context. */
361 if (i || rex->lastparen + 1 <= rex->nparens)
362 Perl_re_exec_indentf( aTHX_
363 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
369 paren = *maxopenparen_p;
370 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
372 rex->offs[paren].start_tmp = SSPOPINT;
373 rex->offs[paren].start = SSPOPIV;
375 if (paren <= rex->lastparen)
376 rex->offs[paren].end = tmps;
377 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
378 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 if (i > *maxopenparen_p)
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
402 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
403 " \\%" UVuf ": %s ..-1 undeffing\n",
406 (i > *maxopenparen_p) ? "-1" : " "
412 /* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
418 I32 tmpix = PL_savestack_ix;
419 PERL_ARGS_ASSERT_REGCP_RESTORE;
421 PL_savestack_ix = ix;
422 regcppop(rex, maxopenparen_p);
423 PL_savestack_ix = tmpix;
426 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
431 /* Returns a boolean as to whether or not 'character' is a member of the
432 * Posix character class given by 'classnum' that should be equivalent to a
433 * value in the typedef '_char_class_number'.
435 * Ideally this could be replaced by a just an array of function pointers
436 * to the C library functions that implement the macros this calls.
437 * However, to compile, the precise function signatures are required, and
438 * these may vary from platform to platform. To avoid having to figure
439 * out what those all are on each platform, I (khw) am using this method,
440 * which adds an extra layer of function call overhead (unless the C
441 * optimizer strips it away). But we don't particularly care about
442 * performance with locales anyway. */
444 switch ((_char_class_number) classnum) {
445 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
446 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
447 case _CC_ENUM_ASCII: return isASCII_LC(character);
448 case _CC_ENUM_BLANK: return isBLANK_LC(character);
449 case _CC_ENUM_CASED: return isLOWER_LC(character)
450 || isUPPER_LC(character);
451 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
452 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
453 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
454 case _CC_ENUM_LOWER: return isLOWER_LC(character);
455 case _CC_ENUM_PRINT: return isPRINT_LC(character);
456 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
457 case _CC_ENUM_SPACE: return isSPACE_LC(character);
458 case _CC_ENUM_UPPER: return isUPPER_LC(character);
459 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
460 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
461 default: /* VERTSPACE should never occur in locales */
462 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
465 NOT_REACHED; /* NOTREACHED */
469 PERL_STATIC_INLINE I32
470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
472 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
473 * folded. Works on all folds representable without UTF-8, except for
474 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
475 * check that the strings each have at least 'len' characters.
477 * There is almost an identical API function where s2 need not be folded:
478 * Perl_foldEQ_latin1() */
480 const U8 *a = (const U8 *)s1;
481 const U8 *b = (const U8 *)s2;
483 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
488 assert(! isUPPER_L1(*b));
489 if (toLOWER_L1(*a) != *b) {
498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501 * 'character' is a member of the Posix character class given by 'classnum'
502 * that should be equivalent to a value in the typedef
503 * '_char_class_number'.
505 * This just calls isFOO_lc on the code point for the character if it is in
506 * the range 0-255. Outside that range, all characters use Unicode
507 * rules, ignoring any locale. So use the Unicode function if this class
508 * requires an inversion list, and use the Unicode macro otherwise. */
511 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
513 if (UTF8_IS_INVARIANT(*character)) {
514 return isFOO_lc(classnum, *character);
516 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
517 return isFOO_lc(classnum,
518 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
521 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
523 switch ((_char_class_number) classnum) {
524 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
525 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
526 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
527 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
529 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
530 utf8_to_uvchr_buf(character, e, NULL));
533 return FALSE; /* Things like CNTRL are always below 256 */
537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
539 /* Returns the position of the first byte in the sequence between 's' and
540 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
543 PERL_ARGS_ASSERT_FIND_SPAN_END;
547 if ((STRLEN) (send - s) >= PERL_WORDSIZE
548 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
549 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
551 PERL_UINTMAX_T span_word;
553 /* Process per-byte until reach word boundary. XXX This loop could be
554 * eliminated if we knew that this platform had fast unaligned reads */
555 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
556 if (*s != span_byte) {
562 /* Create a word filled with the bytes we are spanning */
563 span_word = PERL_COUNT_MULTIPLIER * span_byte;
565 /* Process per-word as long as we have at least a full word left */
568 /* Keep going if the whole word is composed of 'span_byte's */
569 if ((* (PERL_UINTMAX_T *) s) == span_word) {
574 /* Here, at least one byte in the word isn't 'span_byte'. */
582 /* This xor leaves 1 bits only in those non-matching bytes */
583 span_word ^= * (PERL_UINTMAX_T *) s;
585 /* Make sure the upper bit of each non-matching byte is set. This
586 * makes each such byte look like an ASCII platform variant byte */
587 span_word |= span_word << 1;
588 span_word |= span_word << 2;
589 span_word |= span_word << 4;
591 /* That reduces the problem to what this function solves */
592 return s + variant_byte_number(span_word);
596 } while (s + PERL_WORDSIZE <= send);
599 /* Process the straggler bytes beyond the final word boundary */
601 if (*s != span_byte) {
611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
613 /* Returns the position of the first byte in the sequence between 's'
614 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
615 * returns 'send' if none found. It uses word-level operations instead of
616 * byte to speed up the process */
618 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
621 assert((byte & mask) == byte);
625 if ((STRLEN) (send - s) >= PERL_WORDSIZE
626 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
627 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
629 PERL_UINTMAX_T word, mask_word;
631 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
632 if (((*s) & mask) == byte) {
638 word = PERL_COUNT_MULTIPLIER * byte;
639 mask_word = PERL_COUNT_MULTIPLIER * mask;
642 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
644 /* If 'masked' contains bytes with the bit pattern of 'byte' within
645 * it, xoring with 'word' will leave each of the 8 bits in such
646 * bytes be 0, and no byte containing any other bit pattern will be
650 /* This causes the most significant bit to be set to 1 for any
651 * bytes in the word that aren't completely 0 */
652 masked |= masked << 1;
653 masked |= masked << 2;
654 masked |= masked << 4;
656 /* The msbits are the same as what marks a byte as variant, so we
657 * can use this mask. If all msbits are 1, the word doesn't
659 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
664 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
665 * and any that are, are 0. Complement and re-AND to swap that */
667 masked &= PERL_VARIANTS_WORD_MASK;
669 /* This reduces the problem to that solved by this function */
670 s += variant_byte_number(masked);
673 } while (s + PERL_WORDSIZE <= send);
679 if (((*s) & mask) == byte) {
689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
691 /* Returns the position of the first byte in the sequence between 's' and
692 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
693 * 'span_byte' should have been ANDed with 'mask' in the call of this
694 * function. Returns 'send' if none found. Works like find_span_end(),
695 * except for the AND */
697 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
700 assert((span_byte & mask) == span_byte);
702 if ((STRLEN) (send - s) >= PERL_WORDSIZE
703 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
704 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
706 PERL_UINTMAX_T span_word, mask_word;
708 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
709 if (((*s) & mask) != span_byte) {
715 span_word = PERL_COUNT_MULTIPLIER * span_byte;
716 mask_word = PERL_COUNT_MULTIPLIER * mask;
719 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
721 if (masked == span_word) {
733 masked |= masked << 1;
734 masked |= masked << 2;
735 masked |= masked << 4;
736 return s + variant_byte_number(masked);
740 } while (s + PERL_WORDSIZE <= send);
744 if (((*s) & mask) != span_byte) {
754 * pregexec and friends
757 #ifndef PERL_IN_XSUB_RE
759 - pregexec - match a regexp against a string
762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
763 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
764 /* stringarg: the point in the string at which to begin matching */
765 /* strend: pointer to null at end of string */
766 /* strbeg: real beginning of string */
767 /* minend: end of match must be >= minend bytes after stringarg. */
768 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
769 * itself is accessed via the pointers above */
770 /* nosave: For optimizations. */
772 PERL_ARGS_ASSERT_PREGEXEC;
775 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
776 nosave ? 0 : REXEC_COPY_STR);
782 /* re_intuit_start():
784 * Based on some optimiser hints, try to find the earliest position in the
785 * string where the regex could match.
787 * rx: the regex to match against
788 * sv: the SV being matched: only used for utf8 flag; the string
789 * itself is accessed via the pointers below. Note that on
790 * something like an overloaded SV, SvPOK(sv) may be false
791 * and the string pointers may point to something unrelated to
793 * strbeg: real beginning of string
794 * strpos: the point in the string at which to begin matching
795 * strend: pointer to the byte following the last char of the string
796 * flags currently unused; set to 0
797 * data: currently unused; set to NULL
799 * The basic idea of re_intuit_start() is to use some known information
800 * about the pattern, namely:
802 * a) the longest known anchored substring (i.e. one that's at a
803 * constant offset from the beginning of the pattern; but not
804 * necessarily at a fixed offset from the beginning of the
806 * b) the longest floating substring (i.e. one that's not at a constant
807 * offset from the beginning of the pattern);
808 * c) Whether the pattern is anchored to the string; either
809 * an absolute anchor: /^../, or anchored to \n: /^.../m,
810 * or anchored to pos(): /\G/;
811 * d) A start class: a real or synthetic character class which
812 * represents which characters are legal at the start of the pattern;
814 * to either quickly reject the match, or to find the earliest position
815 * within the string at which the pattern might match, thus avoiding
816 * running the full NFA engine at those earlier locations, only to
817 * eventually fail and retry further along.
819 * Returns NULL if the pattern can't match, or returns the address within
820 * the string which is the earliest place the match could occur.
822 * The longest of the anchored and floating substrings is called 'check'
823 * and is checked first. The other is called 'other' and is checked
824 * second. The 'other' substring may not be present. For example,
826 * /(abc|xyz)ABC\d{0,3}DEFG/
830 * check substr (float) = "DEFG", offset 6..9 chars
831 * other substr (anchored) = "ABC", offset 3..3 chars
834 * Be aware that during the course of this function, sometimes 'anchored'
835 * refers to a substring being anchored relative to the start of the
836 * pattern, and sometimes to the pattern itself being anchored relative to
837 * the string. For example:
839 * /\dabc/: "abc" is anchored to the pattern;
840 * /^\dabc/: "abc" is anchored to the pattern and the string;
841 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
842 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
843 * but the pattern is anchored to the string.
847 Perl_re_intuit_start(pTHX_
850 const char * const strbeg,
854 re_scream_pos_data *data)
856 struct regexp *const prog = ReANY(rx);
857 SSize_t start_shift = prog->check_offset_min;
858 /* Should be nonnegative! */
859 SSize_t end_shift = 0;
860 /* current lowest pos in string where the regex can start matching */
861 char *rx_origin = strpos;
863 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
864 U8 other_ix = 1 - prog->substrs->check_ix;
866 char *other_last = strpos;/* latest pos 'other' substr already checked to */
867 char *check_at = NULL; /* check substr found at this pos */
868 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
869 RXi_GET_DECL(prog,progi);
870 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
871 regmatch_info *const reginfo = ®info_buf;
872 DECLARE_AND_GET_RE_DEBUG_FLAGS;
874 PERL_ARGS_ASSERT_RE_INTUIT_START;
875 PERL_UNUSED_ARG(flags);
876 PERL_UNUSED_ARG(data);
878 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
879 "Intuit: trying to determine minimum start position...\n"));
881 /* for now, assume that all substr offsets are positive. If at some point
882 * in the future someone wants to do clever things with lookbehind and
883 * -ve offsets, they'll need to fix up any code in this function
884 * which uses these offsets. See the thread beginning
885 * <20140113145929.GF27210@iabyn.com>
887 assert(prog->substrs->data[0].min_offset >= 0);
888 assert(prog->substrs->data[0].max_offset >= 0);
889 assert(prog->substrs->data[1].min_offset >= 0);
890 assert(prog->substrs->data[1].max_offset >= 0);
891 assert(prog->substrs->data[2].min_offset >= 0);
892 assert(prog->substrs->data[2].max_offset >= 0);
894 /* for now, assume that if both present, that the floating substring
895 * doesn't start before the anchored substring.
896 * If you break this assumption (e.g. doing better optimisations
897 * with lookahead/behind), then you'll need to audit the code in this
898 * function carefully first
901 ! ( (prog->anchored_utf8 || prog->anchored_substr)
902 && (prog->float_utf8 || prog->float_substr))
903 || (prog->float_min_offset >= prog->anchored_offset));
905 /* byte rather than char calculation for efficiency. It fails
906 * to quickly reject some cases that can't match, but will reject
907 * them later after doing full char arithmetic */
908 if (prog->minlen > strend - strpos) {
909 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
910 " String too short...\n"));
914 RXp_MATCH_UTF8_set(prog, utf8_target);
915 reginfo->is_utf8_target = cBOOL(utf8_target);
916 reginfo->info_aux = NULL;
917 reginfo->strbeg = strbeg;
918 reginfo->strend = strend;
919 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
921 /* not actually used within intuit, but zero for safety anyway */
922 reginfo->poscache_maxiter = 0;
925 if ((!prog->anchored_utf8 && prog->anchored_substr)
926 || (!prog->float_utf8 && prog->float_substr))
927 to_utf8_substr(prog);
928 check = prog->check_utf8;
930 if (!prog->check_substr && prog->check_utf8) {
931 if (! to_byte_substr(prog)) {
932 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
935 check = prog->check_substr;
938 /* dump the various substring data */
939 DEBUG_OPTIMISE_MORE_r({
941 for (i=0; i<=2; i++) {
942 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
943 : prog->substrs->data[i].substr);
947 Perl_re_printf( aTHX_
948 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
949 " useful=%" IVdf " utf8=%d [%s]\n",
951 (IV)prog->substrs->data[i].min_offset,
952 (IV)prog->substrs->data[i].max_offset,
953 (IV)prog->substrs->data[i].end_shift,
960 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
962 /* ml_anch: check after \n?
964 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
965 * with /.*.../, these flags will have been added by the
967 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
968 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
970 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
971 && !(prog->intflags & PREGf_IMPLICIT);
973 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
974 /* we are only allowed to match at BOS or \G */
976 /* trivially reject if there's a BOS anchor and we're not at BOS.
978 * Note that we don't try to do a similar quick reject for
979 * \G, since generally the caller will have calculated strpos
980 * based on pos() and gofs, so the string is already correctly
981 * anchored by definition; and handling the exceptions would
982 * be too fiddly (e.g. REXEC_IGNOREPOS).
984 if ( strpos != strbeg
985 && (prog->intflags & PREGf_ANCH_SBOL))
987 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
988 " Not at start...\n"));
992 /* in the presence of an anchor, the anchored (relative to the
993 * start of the regex) substr must also be anchored relative
994 * to strpos. So quickly reject if substr isn't found there.
995 * This works for \G too, because the caller will already have
996 * subtracted gofs from pos, and gofs is the offset from the
997 * \G to the start of the regex. For example, in /.abc\Gdef/,
998 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
999 * caller will have set strpos=pos()-4; we look for the substr
1000 * at position pos()-4+1, which lines up with the "a" */
1002 if (prog->check_offset_min == prog->check_offset_max) {
1003 /* Substring at constant offset from beg-of-str... */
1004 SSize_t slen = SvCUR(check);
1005 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1007 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1008 " Looking for check substr at fixed offset %" IVdf "...\n",
1009 (IV)prog->check_offset_min));
1011 if (SvTAIL(check)) {
1012 /* In this case, the regex is anchored at the end too.
1013 * Unless it's a multiline match, the lengths must match
1014 * exactly, give or take a \n. NB: slen >= 1 since
1015 * the last char of check is \n */
1017 && ( strend - s > slen
1018 || strend - s < slen - 1
1019 || (strend - s == slen && strend[-1] != '\n')))
1021 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1022 " String too long...\n"));
1025 /* Now should match s[0..slen-2] */
1028 if (slen && (strend - s < slen
1029 || *SvPVX_const(check) != *s
1030 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1032 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033 " String not equal...\n"));
1038 goto success_at_start;
1043 end_shift = prog->check_end_shift;
1045 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1047 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1048 (IV)end_shift, RX_PRECOMP(rx));
1053 /* This is the (re)entry point of the main loop in this function.
1054 * The goal of this loop is to:
1055 * 1) find the "check" substring in the region rx_origin..strend
1056 * (adjusted by start_shift / end_shift). If not found, reject
1058 * 2) If it exists, look for the "other" substr too if defined; for
1059 * example, if the check substr maps to the anchored substr, then
1060 * check the floating substr, and vice-versa. If not found, go
1061 * back to (1) with rx_origin suitably incremented.
1062 * 3) If we find an rx_origin position that doesn't contradict
1063 * either of the substrings, then check the possible additional
1064 * constraints on rx_origin of /^.../m or a known start class.
1065 * If these fail, then depending on which constraints fail, jump
1066 * back to here, or to various other re-entry points further along
1067 * that skip some of the first steps.
1068 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1069 * substring. If the start position was determined to be at the
1070 * beginning of the string - so, not rejected, but not optimised,
1071 * since we have to run regmatch from position 0 - decrement the
1072 * BmUSEFUL() count. Otherwise increment it.
1076 /* first, look for the 'check' substring */
1082 DEBUG_OPTIMISE_MORE_r({
1083 Perl_re_printf( aTHX_
1084 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1085 " Start shift: %" IVdf " End shift %" IVdf
1086 " Real end Shift: %" IVdf "\n",
1087 (IV)(rx_origin - strbeg),
1088 (IV)prog->check_offset_min,
1091 (IV)prog->check_end_shift);
1094 end_point = HOPBACK3(strend, end_shift, rx_origin);
1097 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1102 /* If the regex is absolutely anchored to either the start of the
1103 * string (SBOL) or to pos() (ANCH_GPOS), then
1104 * check_offset_max represents an upper bound on the string where
1105 * the substr could start. For the ANCH_GPOS case, we assume that
1106 * the caller of intuit will have already set strpos to
1107 * pos()-gofs, so in this case strpos + offset_max will still be
1108 * an upper bound on the substr.
1111 && prog->intflags & PREGf_ANCH
1112 && prog->check_offset_max != SSize_t_MAX)
1114 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1115 const char * const anchor =
1116 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1117 SSize_t targ_len = (char*)end_point - anchor;
1119 if (check_len > targ_len) {
1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121 "Target string too short to match required substring...\n"));
1125 /* do a bytes rather than chars comparison. It's conservative;
1126 * so it skips doing the HOP if the result can't possibly end
1127 * up earlier than the old value of end_point.
1129 assert(anchor + check_len <= (char *)end_point);
1130 if (prog->check_offset_max + check_len < targ_len) {
1131 end_point = HOP3lim((U8*)anchor,
1132 prog->check_offset_max,
1133 end_point - check_len
1136 if (end_point < start_point)
1141 check_at = fbm_instr( start_point, end_point,
1142 check, multiline ? FBMrf_MULTILINE : 0);
1144 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1146 (IV)((char*)start_point - strbeg),
1147 (IV)((char*)end_point - strbeg),
1148 (IV)(check_at ? check_at - strbeg : -1)
1151 /* Update the count-of-usability, remove useless subpatterns,
1155 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1156 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1157 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1158 (check_at ? "Found" : "Did not find"),
1159 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1160 ? "anchored" : "floating"),
1163 (check_at ? " at offset " : "...\n") );
1168 /* set rx_origin to the minimum position where the regex could start
1169 * matching, given the constraint of the just-matched check substring.
1170 * But don't set it lower than previously.
1173 if (check_at - rx_origin > prog->check_offset_max)
1174 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1175 /* Finish the diagnostic message */
1176 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1177 "%ld (rx_origin now %" IVdf ")...\n",
1178 (long)(check_at - strbeg),
1179 (IV)(rx_origin - strbeg)
1184 /* now look for the 'other' substring if defined */
1186 if (prog->substrs->data[other_ix].utf8_substr
1187 || prog->substrs->data[other_ix].substr)
1189 /* Take into account the "other" substring. */
1193 struct reg_substr_datum *other;
1196 other = &prog->substrs->data[other_ix];
1197 if (!utf8_target && !other->substr) {
1198 if (!to_byte_substr(prog)) {
1199 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1203 /* if "other" is anchored:
1204 * we've previously found a floating substr starting at check_at.
1205 * This means that the regex origin must lie somewhere
1206 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1207 * and max: HOP3(check_at, -check_offset_min)
1208 * (except that min will be >= strpos)
1209 * So the fixed substr must lie somewhere between
1210 * HOP3(min, anchored_offset)
1211 * HOP3(max, anchored_offset) + SvCUR(substr)
1214 /* if "other" is floating
1215 * Calculate last1, the absolute latest point where the
1216 * floating substr could start in the string, ignoring any
1217 * constraints from the earlier fixed match. It is calculated
1220 * strend - prog->minlen (in chars) is the absolute latest
1221 * position within the string where the origin of the regex
1222 * could appear. The latest start point for the floating
1223 * substr is float_min_offset(*) on from the start of the
1224 * regex. last1 simply combines thee two offsets.
1226 * (*) You might think the latest start point should be
1227 * float_max_offset from the regex origin, and technically
1228 * you'd be correct. However, consider
1230 * Here, float min, max are 3,5 and minlen is 7.
1231 * This can match either
1235 * In the first case, the regex matches minlen chars; in the
1236 * second, minlen+1, in the third, minlen+2.
1237 * In the first case, the floating offset is 3 (which equals
1238 * float_min), in the second, 4, and in the third, 5 (which
1239 * equals float_max). In all cases, the floating string bcd
1240 * can never start more than 4 chars from the end of the
1241 * string, which equals minlen - float_min. As the substring
1242 * starts to match more than float_min from the start of the
1243 * regex, it makes the regex match more than minlen chars,
1244 * and the two cancel each other out. So we can always use
1245 * float_min - minlen, rather than float_max - minlen for the
1246 * latest position in the string.
1248 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1249 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1252 assert(prog->minlen >= other->min_offset);
1253 last1 = HOP3c(strend,
1254 other->min_offset - prog->minlen, strbeg);
1256 if (other_ix) {/* i.e. if (other-is-float) */
1257 /* last is the latest point where the floating substr could
1258 * start, *given* any constraints from the earlier fixed
1259 * match. This constraint is that the floating string starts
1260 * <= float_max_offset chars from the regex origin (rx_origin).
1261 * If this value is less than last1, use it instead.
1263 assert(rx_origin <= last1);
1265 /* this condition handles the offset==infinity case, and
1266 * is a short-cut otherwise. Although it's comparing a
1267 * byte offset to a char length, it does so in a safe way,
1268 * since 1 char always occupies 1 or more bytes,
1269 * so if a string range is (last1 - rx_origin) bytes,
1270 * it will be less than or equal to (last1 - rx_origin)
1271 * chars; meaning it errs towards doing the accurate HOP3
1272 * rather than just using last1 as a short-cut */
1273 (last1 - rx_origin) < other->max_offset
1275 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1278 assert(strpos + start_shift <= check_at);
1279 last = HOP4c(check_at, other->min_offset - start_shift,
1283 s = HOP3c(rx_origin, other->min_offset, strend);
1284 if (s < other_last) /* These positions already checked */
1287 must = utf8_target ? other->utf8_substr : other->substr;
1288 assert(SvPOK(must));
1291 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1299 (IV)(from - strbeg),
1305 (unsigned char*)from,
1308 multiline ? FBMrf_MULTILINE : 0
1310 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1311 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1312 (IV)(from - strbeg),
1314 (IV)(s ? s - strbeg : -1)
1320 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1321 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1322 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1323 s ? "Found" : "Contradicts",
1324 other_ix ? "floating" : "anchored",
1325 quoted, RE_SV_TAIL(must));
1330 /* last1 is latest possible substr location. If we didn't
1331 * find it before there, we never will */
1332 if (last >= last1) {
1333 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1334 "; giving up...\n"));
1338 /* try to find the check substr again at a later
1339 * position. Maybe next time we'll find the "other" substr
1341 other_last = HOP3c(last, 1, strend) /* highest failure */;
1343 other_ix /* i.e. if other-is-float */
1344 ? HOP3c(rx_origin, 1, strend)
1345 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1346 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1347 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1348 (other_ix ? "floating" : "anchored"),
1349 (long)(HOP3c(check_at, 1, strend) - strbeg),
1350 (IV)(rx_origin - strbeg)
1355 if (other_ix) { /* if (other-is-float) */
1356 /* other_last is set to s, not s+1, since its possible for
1357 * a floating substr to fail first time, then succeed
1358 * second time at the same floating position; e.g.:
1359 * "-AB--AABZ" =~ /\wAB\d*Z/
1360 * The first time round, anchored and float match at
1361 * "-(AB)--AAB(Z)" then fail on the initial \w character
1362 * class. Second time round, they match at "-AB--A(AB)(Z)".
1367 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1368 other_last = HOP3c(s, 1, strend);
1370 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371 " at offset %ld (rx_origin now %" IVdf ")...\n",
1373 (IV)(rx_origin - strbeg)
1379 DEBUG_OPTIMISE_MORE_r(
1380 Perl_re_printf( aTHX_
1381 " Check-only match: offset min:%" IVdf " max:%" IVdf
1382 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1383 " strend:%" IVdf "\n",
1384 (IV)prog->check_offset_min,
1385 (IV)prog->check_offset_max,
1386 (IV)(check_at-strbeg),
1387 (IV)(rx_origin-strbeg),
1388 (IV)(rx_origin-check_at),
1394 postprocess_substr_matches:
1396 /* handle the extra constraint of /^.../m if present */
1398 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1401 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1402 " looking for /^/m anchor"));
1404 /* we have failed the constraint of a \n before rx_origin.
1405 * Find the next \n, if any, even if it's beyond the current
1406 * anchored and/or floating substrings. Whether we should be
1407 * scanning ahead for the next \n or the next substr is debatable.
1408 * On the one hand you'd expect rare substrings to appear less
1409 * often than \n's. On the other hand, searching for \n means
1410 * we're effectively flipping between check_substr and "\n" on each
1411 * iteration as the current "rarest" candidate string, which
1412 * means for example that we'll quickly reject the whole string if
1413 * hasn't got a \n, rather than trying every substr position
1417 s = HOP3c(strend, - prog->minlen, strpos);
1418 if (s <= rx_origin ||
1419 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422 " Did not find /%s^%s/m...\n",
1423 PL_colors[0], PL_colors[1]));
1427 /* earliest possible origin is 1 char after the \n.
1428 * (since *rx_origin == '\n', it's safe to ++ here rather than
1429 * HOP(rx_origin, 1)) */
1432 if (prog->substrs->check_ix == 0 /* check is anchored */
1433 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1435 /* Position contradicts check-string; either because
1436 * check was anchored (and thus has no wiggle room),
1437 * or check was float and rx_origin is above the float range */
1438 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1440 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1444 /* if we get here, the check substr must have been float,
1445 * is in range, and we may or may not have had an anchored
1446 * "other" substr which still contradicts */
1447 assert(prog->substrs->check_ix); /* check is float */
1449 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1450 /* whoops, the anchored "other" substr exists, so we still
1451 * contradict. On the other hand, the float "check" substr
1452 * didn't contradict, so just retry the anchored "other"
1454 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1455 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1456 PL_colors[0], PL_colors[1],
1457 (IV)(rx_origin - strbeg + prog->anchored_offset),
1458 (IV)(rx_origin - strbeg)
1460 goto do_other_substr;
1463 /* success: we don't contradict the found floating substring
1464 * (and there's no anchored substr). */
1465 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466 " Found /%s^%s/m with rx_origin %ld...\n",
1467 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1470 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1471 " (multiline anchor test skipped)\n"));
1477 /* if we have a starting character class, then test that extra constraint.
1478 * (trie stclasses are too expensive to use here, we are better off to
1479 * leave it to regmatch itself) */
1481 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1482 const U8* const str = (U8*)STRING(progi->regstclass);
1484 /* XXX this value could be pre-computed */
1485 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1486 ? (reginfo->is_utf8_pat
1487 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1488 : (SSize_t)STR_LEN(progi->regstclass))
1492 /* latest pos that a matching float substr constrains rx start to */
1493 char *rx_max_float = NULL;
1495 /* if the current rx_origin is anchored, either by satisfying an
1496 * anchored substring constraint, or a /^.../m constraint, then we
1497 * can reject the current origin if the start class isn't found
1498 * at the current position. If we have a float-only match, then
1499 * rx_origin is constrained to a range; so look for the start class
1500 * in that range. if neither, then look for the start class in the
1501 * whole rest of the string */
1503 /* XXX DAPM it's not clear what the minlen test is for, and why
1504 * it's not used in the floating case. Nothing in the test suite
1505 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1506 * Here are some old comments, which may or may not be correct:
1508 * minlen == 0 is possible if regstclass is \b or \B,
1509 * and the fixed substr is ''$.
1510 * Since minlen is already taken into account, rx_origin+1 is
1511 * before strend; accidentally, minlen >= 1 guaranties no false
1512 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1513 * 0) below assumes that regstclass does not come from lookahead...
1514 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1515 * This leaves EXACTF-ish only, which are dealt with in
1519 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1520 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1521 else if (prog->float_substr || prog->float_utf8) {
1522 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1523 endpos = HOP3clim(rx_max_float, cl_l, strend);
1528 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1529 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1530 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1531 (IV)start_shift, (IV)(check_at - strbeg),
1532 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1534 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1537 if (endpos == strend) {
1538 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1539 " Could not match STCLASS...\n") );
1542 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1543 " This position contradicts STCLASS...\n") );
1544 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1545 && !(prog->intflags & PREGf_IMPLICIT))
1548 /* Contradict one of substrings */
1549 if (prog->anchored_substr || prog->anchored_utf8) {
1550 if (prog->substrs->check_ix == 1) { /* check is float */
1551 /* Have both, check_string is floating */
1552 assert(rx_origin + start_shift <= check_at);
1553 if (rx_origin + start_shift != check_at) {
1554 /* not at latest position float substr could match:
1555 * Recheck anchored substring, but not floating.
1556 * The condition above is in bytes rather than
1557 * chars for efficiency. It's conservative, in
1558 * that it errs on the side of doing 'goto
1559 * do_other_substr'. In this case, at worst,
1560 * an extra anchored search may get done, but in
1561 * practice the extra fbm_instr() is likely to
1562 * get skipped anyway. */
1563 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1564 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1565 (long)(other_last - strbeg),
1566 (IV)(rx_origin - strbeg)
1568 goto do_other_substr;
1576 /* In the presence of ml_anch, we might be able to
1577 * find another \n without breaking the current float
1580 /* strictly speaking this should be HOP3c(..., 1, ...),
1581 * but since we goto a block of code that's going to
1582 * search for the next \n if any, its safe here */
1584 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1585 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1586 PL_colors[0], PL_colors[1],
1587 (long)(rx_origin - strbeg)) );
1588 goto postprocess_substr_matches;
1591 /* strictly speaking this can never be true; but might
1592 * be if we ever allow intuit without substrings */
1593 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1596 rx_origin = rx_max_float;
1599 /* at this point, any matching substrings have been
1600 * contradicted. Start again... */
1602 rx_origin = HOP3c(rx_origin, 1, strend);
1604 /* uses bytes rather than char calculations for efficiency.
1605 * It's conservative: it errs on the side of doing 'goto restart',
1606 * where there is code that does a proper char-based test */
1607 if (rx_origin + start_shift + end_shift > strend) {
1608 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1609 " Could not match STCLASS...\n") );
1612 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1613 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1614 (prog->substrs->check_ix ? "floating" : "anchored"),
1615 (long)(rx_origin + start_shift - strbeg),
1616 (IV)(rx_origin - strbeg)
1623 if (rx_origin != s) {
1624 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625 " By STCLASS: moving %ld --> %ld\n",
1626 (long)(rx_origin - strbeg), (long)(s - strbeg))
1630 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1631 " Does not contradict STCLASS...\n");
1636 /* Decide whether using the substrings helped */
1638 if (rx_origin != strpos) {
1639 /* Fixed substring is found far enough so that the match
1640 cannot start at strpos. */
1642 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1643 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1646 /* The found rx_origin position does not prohibit matching at
1647 * strpos, so calling intuit didn't gain us anything. Decrement
1648 * the BmUSEFUL() count on the check substring, and if we reach
1650 if (!(prog->intflags & PREGf_NAUGHTY)
1652 prog->check_utf8 /* Could be deleted already */
1653 && --BmUSEFUL(prog->check_utf8) < 0
1654 && (prog->check_utf8 == prog->float_utf8)
1656 prog->check_substr /* Could be deleted already */
1657 && --BmUSEFUL(prog->check_substr) < 0
1658 && (prog->check_substr == prog->float_substr)
1661 /* If flags & SOMETHING - do not do it many times on the same match */
1662 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1663 /* XXX Does the destruction order has to change with utf8_target? */
1664 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1665 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1666 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1667 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1668 check = NULL; /* abort */
1669 /* XXXX This is a remnant of the old implementation. It
1670 looks wasteful, since now INTUIT can use many
1671 other heuristics. */
1672 prog->extflags &= ~RXf_USE_INTUIT;
1676 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1677 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1678 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1682 fail_finish: /* Substring not found */
1683 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1684 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1687 PL_colors[4], PL_colors[5]));
1692 #define DECL_TRIE_TYPE(scan) \
1693 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1694 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1695 trie_utf8l, trie_flu8, trie_flu8_latin } \
1696 trie_type = ((scan->flags == EXACT) \
1697 ? (utf8_target ? trie_utf8 : trie_plain) \
1698 : (scan->flags == EXACTL) \
1699 ? (utf8_target ? trie_utf8l : trie_plain) \
1700 : (scan->flags == EXACTFAA) \
1702 ? trie_utf8_exactfa_fold \
1703 : trie_latin_utf8_exactfa_fold) \
1704 : (scan->flags == EXACTFLU8 \
1707 : trie_flu8_latin) \
1710 : trie_latin_utf8_fold)))
1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1713 * 'foldbuf+sizeof(foldbuf)' */
1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1717 U8 flags = FOLD_FLAGS_FULL; \
1718 switch (trie_type) { \
1720 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1721 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1722 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1724 goto do_trie_utf8_fold; \
1725 case trie_utf8_exactfa_fold: \
1726 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1728 case trie_utf8_fold: \
1729 do_trie_utf8_fold: \
1730 if ( foldlen>0 ) { \
1731 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1736 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1738 len = UTF8_SAFE_SKIP(uc, uc_end); \
1739 skiplen = UVCHR_SKIP( uvc ); \
1740 foldlen -= skiplen; \
1741 uscan = foldbuf + skiplen; \
1744 case trie_flu8_latin: \
1745 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1746 goto do_trie_latin_utf8_fold; \
1747 case trie_latin_utf8_exactfa_fold: \
1748 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1750 case trie_latin_utf8_fold: \
1751 do_trie_latin_utf8_fold: \
1752 if ( foldlen>0 ) { \
1753 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1759 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1760 skiplen = UVCHR_SKIP( uvc ); \
1761 foldlen -= skiplen; \
1762 uscan = foldbuf + skiplen; \
1766 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1767 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1768 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1772 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1779 charid = trie->charmap[ uvc ]; \
1783 if (widecharmap) { \
1784 SV** const svpp = hv_fetch(widecharmap, \
1785 (char*)&uvc, sizeof(UV), 0); \
1787 charid = (U16)SvIV(*svpp); \
1792 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1793 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1794 startpos, doutf8, depth)
1796 #define REXEC_FBC_UTF8_SCAN(CODE) \
1798 while (s < strend) { \
1800 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1806 while (s < strend) { \
1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1814 while (s < strend) { \
1815 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
1821 while (s < strend) { \
1822 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1829 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1830 previous_occurrence_end = s; \
1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1840 previous_occurrence_end = s; \
1846 /* We keep track of where the next character should start after an occurrence
1847 * of the one we're looking for. Knowing that, we can see right away if the
1848 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1849 * don't accept the 2nd and succeeding adjacent occurrences */
1850 #define FBC_CHECK_AND_TRY \
1852 || s != previous_occurrence_end) \
1853 && ( reginfo->intuit \
1854 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1860 /* These differ from the above macros in that they call a function which
1861 * returns the next occurrence of the thing being looked for in 's'; and
1862 * 'strend' if there is no such occurrence. 'f' is something like fcn(a,b,c)
1864 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
1865 while (s < strend) { \
1867 if (s >= strend) { \
1873 previous_occurrence_end = s; \
1876 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1877 while (s < strend) { \
1879 if (s >= strend) { \
1885 previous_occurrence_end = s; \
1888 /* This is like the above macro except the function returns NULL if there is no
1889 * occurrence, and there is a further condition that must be matched besides
1891 #define REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(f, COND) \
1892 while (s < strend) { \
1895 s = (char *) strend; \
1901 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1902 previous_occurrence_end = s; \
1909 /* This differs from the above macros in that it is passed a single byte that
1910 * is known to begin the next occurrence of the thing being looked for in 's'.
1911 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1912 * at that position. */
1913 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1914 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND(memchr(s, byte, strend - s), \
1917 /* This is like the function above, but takes an entire string to look for
1918 * instead of a single byte */
1919 #define REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(substr, substr_end, COND) \
1920 REXEC_FBC_FIND_NEXT_UTF8_SCAN_COND( \
1921 ninstr(s, strend, substr, substr_end), \
1924 /* The four macros below are slightly different versions of the same logic.
1926 * The first is for /a and /aa when the target string is UTF-8. This can only
1927 * match ascii, but it must advance based on UTF-8. The other three handle
1928 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
1929 * looking for the boundary (or non-boundary) between a word and non-word
1930 * character. The utf8 and non-utf8 cases have the same logic, but the details
1931 * must be different. Find the "wordness" of the character just prior to this
1932 * one, and compare it with the wordness of this one. If they differ, we have
1933 * a boundary. At the beginning of the string, pretend that the previous
1934 * character was a new-line.
1936 * All these macros uncleanly have side-effects with each other and outside
1937 * variables. So far it's been too much trouble to clean-up
1939 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1940 * a word character or not.
1941 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1943 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1945 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1946 * are looking for a boundary or for a non-boundary. If we are looking for a
1947 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1948 * see if this tentative match actually works, and if so, to quit the loop
1949 * here. And vice-versa if we are looking for a non-boundary.
1951 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1952 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1953 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1954 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1955 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1956 * complement. But in that branch we complement tmp, meaning that at the
1957 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1958 * which means at the top of the loop in the next iteration, it is
1959 * TEST_NON_UTF8(s-1) */
1960 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1961 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1962 tmp = TEST_NON_UTF8(tmp); \
1963 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1964 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1966 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1973 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1974 * TEST_UTF8 is a macro that for the same input code points returns identically
1975 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1976 * end pointer as well) */
1977 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1978 if (s == reginfo->strbeg) { \
1981 else { /* Back-up to the start of the previous character */ \
1982 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1983 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1984 0, UTF8_ALLOW_DEFAULT); \
1986 tmp = TEST_UV(tmp); \
1987 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
1988 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
1997 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
1998 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
1999 * set-up by the FBC_BOUND, etc macros below */
2000 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2002 /* Here, things have been set up by the previous code so that tmp is the \
2003 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
2004 * against the EOS, which we treat as a \n */ \
2005 if (tmp == ! TEST_NON_UTF8('\n')) { \
2012 /* Same as the macro above, but the target isn't UTF-8 */
2013 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
2014 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2015 tmp = TEST_NON_UTF8(tmp); \
2016 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2017 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2025 /* Here, things have been set up by the previous code so that tmp is \
2026 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2027 * matches against the EOS, which we treat as a \n */ \
2028 if (tmp == ! TEST_NON_UTF8('\n')) { \
2035 /* This is the macro to use when we want to see if something that looks like it
2036 * could match, actually does, and if so exits the loop. It needs to be used
2037 * only for bounds checking macros, as it allows for matching beyond the end of
2038 * string (which should be zero length without having to look at the string
2040 #define REXEC_FBC_TRYIT \
2041 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
2044 /* The only difference between the BOUND and NBOUND cases is that
2045 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2046 * NBOUND. This is accomplished by passing it as either the if or else clause,
2047 * with the other one being empty (PLACEHOLDER is defined as empty).
2049 * The TEST_FOO parameters are for operating on different forms of input, but
2050 * all should be ones that return identically for the same underlying code
2053 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2054 FBC_BOUND_COMMON_UTF8( \
2055 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2056 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2058 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2059 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2061 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2062 FBC_BOUND_COMMON_UTF8( \
2063 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2064 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2066 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2067 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2069 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2070 FBC_BOUND_COMMON_UTF8( \
2071 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2072 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2074 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2075 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2077 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2078 FBC_BOUND_COMMON_UTF8( \
2079 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2080 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2082 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2083 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2087 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2088 IV cp_out = _invlist_search(invlist, cp_in);
2089 assert(cp_out >= 0);
2092 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2093 invmap[S_get_break_val_cp_checked(invlist, cp)]
2095 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2096 invmap[_invlist_search(invlist, cp)]
2099 /* Takes a pointer to an inversion list, a pointer to its corresponding
2100 * inversion map, and a code point, and returns the code point's value
2101 * according to the two arrays. It assumes that all code points have a value.
2102 * This is used as the base macro for macros for particular properties */
2103 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2104 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2106 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2107 * of a code point, returning the value for the first code point in the string.
2108 * And it takes the particular macro name that finds the desired value given a
2109 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2110 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2111 (__ASSERT_(pos < strend) \
2112 /* Note assumes is valid UTF-8 */ \
2113 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2115 /* Returns the GCB value for the input code point */
2116 #define getGCB_VAL_CP(cp) \
2117 _generic_GET_BREAK_VAL_CP( \
2122 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2123 * bounded by pos and strend */
2124 #define getGCB_VAL_UTF8(pos, strend) \
2125 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2127 /* Returns the LB value for the input code point */
2128 #define getLB_VAL_CP(cp) \
2129 _generic_GET_BREAK_VAL_CP( \
2134 /* Returns the LB value for the first code point in the UTF-8 encoded string
2135 * bounded by pos and strend */
2136 #define getLB_VAL_UTF8(pos, strend) \
2137 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2140 /* Returns the SB value for the input code point */
2141 #define getSB_VAL_CP(cp) \
2142 _generic_GET_BREAK_VAL_CP( \
2147 /* Returns the SB value for the first code point in the UTF-8 encoded string
2148 * bounded by pos and strend */
2149 #define getSB_VAL_UTF8(pos, strend) \
2150 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2152 /* Returns the WB value for the input code point */
2153 #define getWB_VAL_CP(cp) \
2154 _generic_GET_BREAK_VAL_CP( \
2159 /* Returns the WB value for the first code point in the UTF-8 encoded string
2160 * bounded by pos and strend */
2161 #define getWB_VAL_UTF8(pos, strend) \
2162 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2164 /* We know what class REx starts with. Try to find this position... */
2165 /* if reginfo->intuit, its a dryrun */
2166 /* annoyingly all the vars in this routine have different names from their counterparts
2167 in regmatch. /grrr */
2169 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2170 const char *strend, regmatch_info *reginfo)
2173 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2174 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2176 char *pat_string; /* The pattern's exactish string */
2177 char *pat_end; /* ptr to end char of pat_string */
2178 re_fold_t folder; /* Function for computing non-utf8 folds */
2179 const U8 *fold_array; /* array for folding ords < 256 */
2186 /* In some cases we accept only the first occurence of 'x' in a sequence of
2187 * them. This variable points to just beyond the end of the previous
2188 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2189 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2191 char * previous_occurrence_end = 0;
2193 I32 tmp; /* Scratch variable */
2194 const bool utf8_target = reginfo->is_utf8_target;
2195 UV utf8_fold_flags = 0;
2196 const bool is_utf8_pat = reginfo->is_utf8_pat;
2197 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2198 with a result inverts that result, as 0^1 =
2200 _char_class_number classnum;
2202 RXi_GET_DECL(prog,progi);
2204 PERL_ARGS_ASSERT_FIND_BYCLASS;
2206 /* We know what class it must start with. The case statements below have
2207 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2208 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2209 * ('p8' and 'pb'. */
2210 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2212 case ANYOFPOSIXL_t8_pb:
2213 case ANYOFPOSIXL_t8_p8:
2216 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2217 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2225 REXEC_FBC_UTF8_CLASS_SCAN(
2226 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2229 case ANYOFPOSIXL_tb_pb:
2230 case ANYOFPOSIXL_tb_p8:
2233 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2234 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2242 if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2243 /* We know that s is in the bitmap range since the target isn't
2244 * UTF-8, so what happens for out-of-range values is not relevant,
2245 * so exclude that from the flags */
2246 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2250 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2254 case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2256 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2257 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2262 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2263 * we do anyway for performance reasons, as otherwise we would have to
2264 * examine all the continuation characters */
2265 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2266 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2271 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2272 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2276 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2278 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2279 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2280 (U8) ARG(c), FLAGS(c)));
2283 /* These nodes all require at least one code point to be in UTF-8 to
2293 case EXACTFLU8_tb_pb:
2294 case EXACTFLU8_tb_p8:
2295 case EXACTFU_REQ8_tb_pb:
2296 case EXACTFU_REQ8_tb_p8:
2301 REXEC_FBC_UTF8_CLASS_SCAN(
2302 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2303 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2309 /* We know what the first byte of any matched string should be. */
2310 U8 first_byte = FLAGS(c);
2312 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2313 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2319 REXEC_FBC_UTF8_CLASS_SCAN(
2320 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2321 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2322 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2323 && reginclass(prog, c, (U8*)s, (U8*) strend,
2329 REXEC_FBC_FIND_NEXT_UTF8_STRING_SCAN(
2330 ((struct regnode_anyofhs *) c)->string,
2331 /* Note FLAGS is the string length in this regnode */
2332 ((struct regnode_anyofhs *) c)->string + FLAGS(c),
2333 reginclass(prog, c, (U8*)s, (U8*) strend,
2339 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2340 ANYOFRbase(c), ANYOFRdelta(c)));
2345 REXEC_FBC_UTF8_CLASS_SCAN(
2346 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2347 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2350 ANYOFRbase(c), ANYOFRdelta(c))));
2355 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2356 ANYOFRbase(c), ANYOFRdelta(c)));
2361 { /* We know what the first byte of any matched string should be */
2362 U8 first_byte = FLAGS(c);
2364 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2365 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2368 ANYOFRbase(c), ANYOFRdelta(c)));
2372 case EXACTFAA_tb_pb:
2374 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2375 * which these functions don't handle anyway */
2376 fold_array = PL_fold_latin1;
2377 folder = foldEQ_latin1_s2_folded;
2378 goto do_exactf_non_utf8;
2381 fold_array = PL_fold;
2383 goto do_exactf_non_utf8;
2386 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2388 if (IN_UTF8_CTYPE_LOCALE) {
2389 utf8_fold_flags = FOLDEQ_LOCALE;
2390 goto do_exactf_utf8;
2393 fold_array = PL_fold_locale;
2394 folder = foldEQ_locale;
2395 goto do_exactf_non_utf8;
2398 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2399 * don't have to worry here about this single special case in the
2401 fold_array = PL_fold_latin1;
2402 folder = foldEQ_latin1_s2_folded;
2406 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2407 are no glitches with fold-length differences
2408 between the target string and pattern */
2410 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2411 * character of the EXACTF* node and then, if necessary,
2412 * case-insensitively compare the full text of the node. c1 is the
2413 * first character. c2 is its fold. This logic will not work for
2414 * Unicode semantics and the german sharp ss, which hence should not be
2415 * compiled into a node that gets here. */
2416 pat_string = STRINGs(c);
2417 ln = STR_LENs(c); /* length to match in octets/bytes */
2419 /* We know that we have to match at least 'ln' bytes (which is the same
2420 * as characters, since not utf8). If we have to match 3 characters,
2421 * and there are only 2 availabe, we know without trying that it will
2422 * fail; so don't start a match past the required minimum number from
2424 e = HOP3c(strend, -((SSize_t)ln), s);
2429 c2 = fold_array[c1];
2430 if (c1 == c2) { /* If char and fold are the same */
2432 s = (char *) memchr(s, c1, e + 1 - s);
2437 /* Check that the rest of the node matches */
2438 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2439 && (reginfo->intuit || regtry(reginfo, &s)) )
2447 U8 bits_differing = c1 ^ c2;
2449 /* If the folds differ in one bit position only, we can mask to
2450 * match either of them, and can use this faster find method. Both
2451 * ASCII and EBCDIC tend to have their case folds differ in only
2452 * one position, so this is very likely */
2453 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2454 bits_differing = ~ bits_differing;
2456 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2457 (c1 & bits_differing), bits_differing);
2462 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2463 && (reginfo->intuit || regtry(reginfo, &s)) )
2470 else { /* Otherwise, stuck with looking byte-at-a-time. This
2471 should actually happen only in EXACTFL nodes */
2473 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2474 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2475 && (reginfo->intuit || regtry(reginfo, &s)) )
2485 case EXACTFAA_tb_p8:
2486 case EXACTFAA_t8_p8:
2487 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2488 |FOLDEQ_S2_ALREADY_FOLDED
2489 |FOLDEQ_S2_FOLDS_SANE;
2490 goto do_exactf_utf8;
2492 case EXACTFAA_NO_TRIE_tb_pb:
2493 case EXACTFAA_NO_TRIE_t8_pb:
2494 case EXACTFAA_t8_pb:
2496 /* Here, and elsewhere in this file, the reason we can't consider a
2497 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2498 * is because any MICRO SIGN in the pattern won't be folded. Since the
2499 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2500 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2501 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2502 goto do_exactf_utf8;
2507 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2508 utf8_fold_flags = FOLDEQ_LOCALE;
2509 goto do_exactf_utf8;
2511 case EXACTFLU8_t8_pb:
2512 case EXACTFLU8_t8_p8:
2513 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2514 | FOLDEQ_S2_FOLDS_SANE;
2515 goto do_exactf_utf8;
2517 case EXACTFU_REQ8_t8_p8:
2518 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2519 goto do_exactf_utf8;
2524 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2525 goto do_exactf_utf8;
2527 /* The following are problematic even though pattern isn't UTF-8. Use
2528 * full functionality normally not done except for UTF-8. */
2530 case EXACTFUP_tb_pb:
2531 case EXACTFUP_t8_pb:
2537 /* If one of the operands is in utf8, we can't use the simpler
2538 * folding above, due to the fact that many different characters
2539 * can have the same fold, or portion of a fold, or different-
2541 pat_string = STRINGs(c);
2542 ln = STR_LENs(c); /* length to match in octets/bytes */
2543 pat_end = pat_string + ln;
2544 lnc = is_utf8_pat /* length to match in characters */
2545 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2548 /* We have 'lnc' characters to match in the pattern, but because of
2549 * multi-character folding, each character in the target can match
2550 * up to 3 characters (Unicode guarantees it will never exceed
2551 * this) if it is utf8-encoded; and up to 2 if not (based on the
2552 * fact that the Latin 1 folds are already determined, and the only
2553 * multi-char fold in that range is the sharp-s folding to 'ss'.
2554 * Thus, a pattern character can match as little as 1/3 of a string
2555 * character. Adjust lnc accordingly, rounding up, so that if we
2556 * need to match at least 4+1/3 chars, that really is 5. */
2557 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2558 lnc = (lnc + expansion - 1) / expansion;
2560 /* As in the non-UTF8 case, if we have to match 3 characters, and
2561 * only 2 are left, it's guaranteed to fail, so don't start a match
2562 * that would require us to go beyond the end of the string */
2563 e = HOP3c(strend, -((SSize_t)lnc), s);
2565 /* XXX Note that we could recalculate e to stop the loop earlier,
2566 * as the worst case expansion above will rarely be met, and as we
2567 * go along we would usually find that e moves further to the left.
2568 * This would happen only after we reached the point in the loop
2569 * where if there were no expansion we should fail. Unclear if
2570 * worth the expense */
2573 char *my_strend= (char *)strend;
2574 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2575 pat_string, NULL, ln, is_utf8_pat,
2577 && (reginfo->intuit || regtry(reginfo, &s)) )
2581 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2588 case BOUND_tb_pb: /* /d without utf8 target is /a */
2590 /* regcomp.c makes sure that these only have the traditional \b
2592 assert(FLAGS(c) == TRADITIONAL_BOUND);
2594 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2597 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2599 /* regcomp.c makes sure that these only have the traditional \b
2601 assert(FLAGS(c) == TRADITIONAL_BOUND);
2603 FBC_BOUND_A_UTF8(isWORDCHAR_A);
2608 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2610 /* regcomp.c makes sure that these only have the traditional \b
2612 assert(FLAGS(c) == TRADITIONAL_BOUND);
2614 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2617 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2619 /* regcomp.c makes sure that these only have the traditional \b
2621 assert(FLAGS(c) == TRADITIONAL_BOUND);
2623 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2628 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2629 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2634 goto do_boundu_non_utf8;
2638 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2639 if (FLAGS(c) == TRADITIONAL_BOUND) {
2640 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2644 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2647 goto do_boundu_non_utf8;
2651 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2652 if (FLAGS(c) == TRADITIONAL_BOUND) {
2653 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2657 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2659 goto do_boundu_non_utf8;
2663 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2664 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2669 if (s == reginfo->strbeg) {
2670 if (reginfo->intuit || regtry(reginfo, &s))
2675 /* Didn't match. Try at the next position (if there is one) */
2677 if (UNLIKELY(s >= reginfo->strend)) {
2682 switch((bound_type) FLAGS(c)) {
2683 case TRADITIONAL_BOUND: /* Should have already been handled */
2688 /* Not utf8. Everything is a GCB except between CR and LF */
2689 while (s < strend) {
2690 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2691 || UCHARAT(s) != '\n'))
2692 && (reginfo->intuit || regtry(reginfo, &s)))
2703 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2704 while (s < strend) {
2705 LB_enum after = getLB_VAL_CP((U8) *s);
2706 if (to_complement ^ isLB(before,
2708 (U8*) reginfo->strbeg,
2710 (U8*) reginfo->strend,
2711 0 /* target not utf8 */ )
2712 && (reginfo->intuit || regtry(reginfo, &s)))
2725 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2726 while (s < strend) {
2727 SB_enum after = getSB_VAL_CP((U8) *s);
2728 if ((to_complement ^ isSB(before,
2730 (U8*) reginfo->strbeg,
2732 (U8*) reginfo->strend,
2733 0 /* target not utf8 */ ))
2734 && (reginfo->intuit || regtry(reginfo, &s)))
2747 WB_enum previous = WB_UNKNOWN;
2748 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2749 while (s < strend) {
2750 WB_enum after = getWB_VAL_CP((U8) *s);
2751 if ((to_complement ^ isWB(previous,
2754 (U8*) reginfo->strbeg,
2756 (U8*) reginfo->strend,
2757 0 /* target not utf8 */ ))
2758 && (reginfo->intuit || regtry(reginfo, &s)))
2769 /* Here are at the final position in the target string, which is a
2770 * boundary by definition, so matches, depending on other constraints.
2772 if ( reginfo->intuit
2773 || (s <= reginfo->strend && regtry(reginfo, &s)))
2782 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2783 if (FLAGS(c) == TRADITIONAL_BOUND) {
2784 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2785 isWORDCHAR_LC_utf8_safe);
2789 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2792 goto do_boundu_utf8;
2796 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2797 if (FLAGS(c) == TRADITIONAL_BOUND) {
2798 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2799 isWORDCHAR_LC_utf8_safe);
2803 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2806 goto do_boundu_utf8;
2810 /* regcomp.c makes sure that these only have the traditional \b
2812 assert(FLAGS(c) == TRADITIONAL_BOUND);
2818 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2819 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2820 isWORDCHAR_utf8_safe);
2825 goto do_boundu_utf8;
2829 /* regcomp.c makes sure that these only have the traditional \b
2831 assert(FLAGS(c) == TRADITIONAL_BOUND);
2837 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2838 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2843 if (s == reginfo->strbeg) {
2844 if (reginfo->intuit || regtry(reginfo, &s))
2849 /* Didn't match. Try at the next position (if there is one) */
2850 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2851 if (UNLIKELY(s >= reginfo->strend)) {
2856 switch((bound_type) FLAGS(c)) {
2857 case TRADITIONAL_BOUND: /* Should have already been handled */
2863 GCB_enum before = getGCB_VAL_UTF8(
2865 (U8*)(reginfo->strbeg)),
2866 (U8*) reginfo->strend);
2867 while (s < strend) {
2868 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2869 (U8*) reginfo->strend);
2870 if ( (to_complement ^ isGCB(before,
2872 (U8*) reginfo->strbeg,
2874 1 /* target is utf8 */ ))
2875 && (reginfo->intuit || regtry(reginfo, &s)))
2880 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2887 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2889 (U8*)(reginfo->strbeg)),
2890 (U8*) reginfo->strend);
2891 while (s < strend) {
2892 LB_enum after = getLB_VAL_UTF8((U8*) s,
2893 (U8*) reginfo->strend);
2894 if (to_complement ^ isLB(before,
2896 (U8*) reginfo->strbeg,
2898 (U8*) reginfo->strend,
2899 1 /* target is utf8 */ )
2900 && (reginfo->intuit || regtry(reginfo, &s)))
2905 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2913 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2915 (U8*)(reginfo->strbeg)),
2916 (U8*) reginfo->strend);
2917 while (s < strend) {
2918 SB_enum after = getSB_VAL_UTF8((U8*) s,
2919 (U8*) reginfo->strend);
2920 if ((to_complement ^ isSB(before,
2922 (U8*) reginfo->strbeg,
2924 (U8*) reginfo->strend,
2925 1 /* target is utf8 */ ))
2926 && (reginfo->intuit || regtry(reginfo, &s)))
2931 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2939 /* We are at a boundary between char_sub_0 and char_sub_1.
2940 * We also keep track of the value for char_sub_-1 as we
2941 * loop through the line. Context may be needed to make a
2942 * determination, and if so, this can save having to
2944 WB_enum previous = WB_UNKNOWN;
2945 WB_enum before = getWB_VAL_UTF8(
2948 (U8*)(reginfo->strbeg)),
2949 (U8*) reginfo->strend);
2950 while (s < strend) {
2951 WB_enum after = getWB_VAL_UTF8((U8*) s,
2952 (U8*) reginfo->strend);
2953 if ((to_complement ^ isWB(previous,
2956 (U8*) reginfo->strbeg,
2958 (U8*) reginfo->strend,
2959 1 /* target is utf8 */ ))
2960 && (reginfo->intuit || regtry(reginfo, &s)))
2966 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2971 /* Here are at the final position in the target string, which is a
2972 * boundary by definition, so matches, depending on other constraints.
2975 if ( reginfo->intuit
2976 || (s <= reginfo->strend && regtry(reginfo, &s)))
2984 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2989 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
2992 /* The argument to all the POSIX node types is the class number to pass
2993 * to _generic_isCC() to build a mask for searching in PL_charclass[] */
3002 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3003 REXEC_FBC_UTF8_CLASS_SCAN(
3004 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3015 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3016 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3017 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3022 /* The complement of something that matches only ASCII matches all
3023 * non-ASCII, plus everything in ASCII that isn't in the class. */
3024 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3025 || ! _generic_isCC_A(*s, FLAGS(c)));
3030 /* Don't need to worry about utf8, as it can match only a single
3031 * byte invariant character. But we do anyway for performance reasons,
3032 * as otherwise we would have to examine all the continuation
3034 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3048 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3049 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
3059 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3060 to_complement ^ cBOOL(_generic_isCC(*s,
3075 classnum = (_char_class_number) FLAGS(c);
3078 REXEC_FBC_UTF8_CLASS_SCAN(
3079 to_complement ^ cBOOL(_invlist_contains_cp(
3080 PL_XPosix_ptrs[classnum],
3081 utf8_to_uvchr_buf((U8 *) s,
3086 case _CC_ENUM_SPACE:
3087 REXEC_FBC_UTF8_CLASS_SCAN(
3088 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3091 case _CC_ENUM_BLANK:
3092 REXEC_FBC_UTF8_CLASS_SCAN(
3093 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3096 case _CC_ENUM_XDIGIT:
3097 REXEC_FBC_UTF8_CLASS_SCAN(
3098 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3101 case _CC_ENUM_VERTSPACE:
3102 REXEC_FBC_UTF8_CLASS_SCAN(
3103 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3106 case _CC_ENUM_CNTRL:
3107 REXEC_FBC_UTF8_CLASS_SCAN(
3108 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3113 case AHOCORASICKC_tb_pb:
3114 case AHOCORASICKC_tb_p8:
3115 case AHOCORASICKC_t8_pb:
3116 case AHOCORASICKC_t8_p8:
3117 case AHOCORASICK_tb_pb:
3118 case AHOCORASICK_tb_p8:
3119 case AHOCORASICK_t8_pb:
3120 case AHOCORASICK_t8_p8:
3123 /* what trie are we using right now */
3124 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3125 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3126 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3128 const char *last_start = strend - trie->minlen;
3130 const char *real_start = s;
3132 STRLEN maxlen = trie->maxlen;
3134 U8 **points; /* map of where we were in the input string
3135 when reading a given char. For ASCII this
3136 is unnecessary overhead as the relationship
3137 is always 1:1, but for Unicode, especially
3138 case folded Unicode this is not true. */
3139 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3143 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3145 /* We can't just allocate points here. We need to wrap it in
3146 * an SV so it gets freed properly if there is a croak while
3147 * running the match */
3150 sv_points=newSV(maxlen * sizeof(U8 *));
3151 SvCUR_set(sv_points,
3152 maxlen * sizeof(U8 *));
3153 SvPOK_on(sv_points);
3154 sv_2mortal(sv_points);
3155 points=(U8**)SvPV_nolen(sv_points );
3156 if ( trie_type != trie_utf8_fold
3157 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3160 bitmap=(U8*)trie->bitmap;
3162 bitmap=(U8*)ANYOF_BITMAP(c);
3164 /* this is the Aho-Corasick algorithm modified a touch
3165 to include special handling for long "unknown char" sequences.
3166 The basic idea being that we use AC as long as we are dealing
3167 with a possible matching char, when we encounter an unknown char
3168 (and we have not encountered an accepting state) we scan forward
3169 until we find a legal starting char.
3170 AC matching is basically that of trie matching, except that when
3171 we encounter a failing transition, we fall back to the current
3172 states "fail state", and try the current char again, a process
3173 we repeat until we reach the root state, state 1, or a legal
3174 transition. If we fail on the root state then we can either
3175 terminate if we have reached an accepting state previously, or
3176 restart the entire process from the beginning if we have not.
3179 while (s <= last_start) {
3180 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3188 U8 *uscan = (U8*)NULL;
3189 U8 *leftmost = NULL;
3191 U32 accepted_word= 0;
3195 while ( state && uc <= (U8*)strend ) {
3197 U32 word = aho->states[ state ].wordnum;
3201 DEBUG_TRIE_EXECUTE_r(
3202 if ( uc <= (U8*)last_start
3203 && !BITMAP_TEST(bitmap,*uc) )
3205 dump_exec_pos( (char *)uc, c, strend,
3207 (char *)uc, utf8_target, 0 );
3208 Perl_re_printf( aTHX_
3209 " Scanning for legal start char...\n");
3213 while ( uc <= (U8*)last_start
3214 && !BITMAP_TEST(bitmap,*uc) )
3219 while ( uc <= (U8*)last_start
3220 && ! BITMAP_TEST(bitmap,*uc) )
3227 if (uc >(U8*)last_start) break;
3231 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3233 if (!leftmost || lpos < leftmost) {
3234 DEBUG_r(accepted_word=word);
3240 points[pointpos++ % maxlen]= uc;
3241 if (foldlen || uc < (U8*)strend) {
3242 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3243 (U8 *) strend, uscan, len, uvc,
3244 charid, foldlen, foldbuf,
3246 DEBUG_TRIE_EXECUTE_r({
3247 dump_exec_pos( (char *)uc, c, strend,
3248 real_start, s, utf8_target, 0);
3249 Perl_re_printf( aTHX_
3250 " Charid:%3u CP:%4" UVxf " ",
3262 word = aho->states[ state ].wordnum;
3264 base = aho->states[ state ].trans.base;
3266 DEBUG_TRIE_EXECUTE_r({
3268 dump_exec_pos((char *)uc, c, strend, real_start,
3269 s, utf8_target, 0 );
3270 Perl_re_printf( aTHX_
3271 "%sState: %4" UVxf ", word=%" UVxf,
3272 failed ? " Fail transition to " : "",
3273 (UV)state, (UV)word);
3279 ( ((offset = base + charid
3280 - 1 - trie->uniquecharcount)) >= 0)
3281 && ((U32)offset < trie->lasttrans)
3282 && trie->trans[offset].check == state
3283 && (tmp=trie->trans[offset].next))
3285 DEBUG_TRIE_EXECUTE_r(
3286 Perl_re_printf( aTHX_ " - legal\n"));
3291 DEBUG_TRIE_EXECUTE_r(
3292 Perl_re_printf( aTHX_ " - fail\n"));
3294 state = aho->fail[state];
3298 /* we must be accepting here */
3299 DEBUG_TRIE_EXECUTE_r(
3300 Perl_re_printf( aTHX_ " - accepting\n"));
3309 if (!state) state = 1;
3312 if ( aho->states[ state ].wordnum ) {
3313 U8 *lpos = points[ (pointpos
3314 - trie->wordinfo[aho->states[ state ]
3315 .wordnum].len) % maxlen ];
3316 if (!leftmost || lpos < leftmost) {
3317 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3322 s = (char*)leftmost;
3323 DEBUG_TRIE_EXECUTE_r({
3324 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3325 " at position %" IVdf ". Trying full"
3327 (UV)accepted_word, (IV)(s - real_start)
3330 if (reginfo->intuit || regtry(reginfo, &s)) {
3335 if (s < reginfo->strend) {
3338 DEBUG_TRIE_EXECUTE_r({
3339 Perl_re_printf( aTHX_
3340 "Pattern failed. Looking for new start"
3344 DEBUG_TRIE_EXECUTE_r(
3345 Perl_re_printf( aTHX_ "No match.\n"));
3354 case EXACTFU_REQ8_t8_pb:
3355 case EXACTFUP_tb_p8:
3356 case EXACTFUP_t8_p8:
3358 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3359 case EXACTFAA_NO_TRIE_tb_p8:
3360 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3365 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3366 } /* End of switch on node type */
3374 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3375 * flags have same meanings as with regexec_flags() */
3378 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3385 struct regexp *const prog = ReANY(rx);
3387 if (flags & REXEC_COPY_STR) {
3390 DEBUG_C(Perl_re_printf( aTHX_
3391 "Copy on write: regexp capture, type %d\n",
3393 /* Create a new COW SV to share the match string and store
3394 * in saved_copy, unless the current COW SV in saved_copy
3395 * is valid and suitable for our purpose */
3396 if (( prog->saved_copy
3397 && SvIsCOW(prog->saved_copy)
3398 && SvPOKp(prog->saved_copy)
3401 && SvPVX(sv) == SvPVX(prog->saved_copy)))
3403 /* just reuse saved_copy SV */
3404 if (RXp_MATCH_COPIED(prog)) {
3405 Safefree(prog->subbeg);
3406 RXp_MATCH_COPIED_off(prog);
3410 /* create new COW SV to share string */
3411 RXp_MATCH_COPY_FREE(prog);
3412 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3414 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3415 assert (SvPOKp(prog->saved_copy));
3416 prog->sublen = strend - strbeg;
3417 prog->suboffset = 0;
3418 prog->subcoffset = 0;
3423 SSize_t max = strend - strbeg;
3426 if ( (flags & REXEC_COPY_SKIP_POST)
3427 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3428 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3429 ) { /* don't copy $' part of string */
3432 /* calculate the right-most part of the string covered
3433 * by a capture. Due to lookahead, this may be to
3434 * the right of $&, so we have to scan all captures */
3435 while (n <= prog->lastparen) {
3436 if (prog->offs[n].end > max)
3437 max = prog->offs[n].end;
3441 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3442 ? prog->offs[0].start
3444 assert(max >= 0 && max <= strend - strbeg);
3447 if ( (flags & REXEC_COPY_SKIP_PRE)
3448 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3449 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3450 ) { /* don't copy $` part of string */
3453 /* calculate the left-most part of the string covered
3454 * by a capture. Due to lookbehind, this may be to
3455 * the left of $&, so we have to scan all captures */
3456 while (min && n <= prog->lastparen) {
3457 if ( prog->offs[n].start != -1
3458 && prog->offs[n].start < min)
3460 min = prog->offs[n].start;
3464 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3465 && min > prog->offs[0].end
3467 min = prog->offs[0].end;
3471 assert(min >= 0 && min <= max && min <= strend - strbeg);
3474 if (RXp_MATCH_COPIED(prog)) {
3475 if (sublen > prog->sublen)
3477 (char*)saferealloc(prog->subbeg, sublen+1);
3480 prog->subbeg = (char*)safemalloc(sublen+1);
3481 Copy(strbeg + min, prog->subbeg, sublen, char);
3482 prog->subbeg[sublen] = '\0';
3483 prog->suboffset = min;
3484 prog->sublen = sublen;
3485 RXp_MATCH_COPIED_on(prog);
3487 prog->subcoffset = prog->suboffset;
3488 if (prog->suboffset && utf8_target) {
3489 /* Convert byte offset to chars.
3490 * XXX ideally should only compute this if @-/@+
3491 * has been seen, a la PL_sawampersand ??? */
3493 /* If there's a direct correspondence between the
3494 * string which we're matching and the original SV,
3495 * then we can use the utf8 len cache associated with
3496 * the SV. In particular, it means that under //g,
3497 * sv_pos_b2u() will use the previously cached
3498 * position to speed up working out the new length of
3499 * subcoffset, rather than counting from the start of
3500 * the string each time. This stops
3501 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3502 * from going quadratic */
3503 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3504 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3505 SV_GMAGIC|SV_CONST_RETURN);
3507 prog->subcoffset = utf8_length((U8*)strbeg,
3508 (U8*)(strbeg+prog->suboffset));
3512 RXp_MATCH_COPY_FREE(prog);
3513 prog->subbeg = strbeg;
3514 prog->suboffset = 0;
3515 prog->subcoffset = 0;
3516 prog->sublen = strend - strbeg;
3524 - regexec_flags - match a regexp against a string
3527 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3528 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3529 /* stringarg: the point in the string at which to begin matching */
3530 /* strend: pointer to null at end of string */
3531 /* strbeg: real beginning of string */
3532 /* minend: end of match must be >= minend bytes after stringarg. */
3533 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3534 * itself is accessed via the pointers above */
3535 /* data: May be used for some additional optimizations.
3536 Currently unused. */
3537 /* flags: For optimizations. See REXEC_* in regexp.h */
3540 struct regexp *const prog = ReANY(rx);
3544 SSize_t minlen; /* must match at least this many chars */
3545 SSize_t dontbother = 0; /* how many characters not to try at end */
3546 const bool utf8_target = cBOOL(DO_UTF8(sv));
3548 RXi_GET_DECL(prog,progi);
3549 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3550 regmatch_info *const reginfo = ®info_buf;
3551 regexp_paren_pair *swap = NULL;
3553 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3555 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3556 PERL_UNUSED_ARG(data);
3558 /* Be paranoid... */
3560 Perl_croak(aTHX_ "NULL regexp parameter");
3564 debug_start_match(rx, utf8_target, stringarg, strend,
3568 startpos = stringarg;
3570 /* set these early as they may be used by the HOP macros below */
3571 reginfo->strbeg = strbeg;
3572 reginfo->strend = strend;
3573 reginfo->is_utf8_target = cBOOL(utf8_target);
3575 if (prog->intflags & PREGf_GPOS_SEEN) {
3578 /* set reginfo->ganch, the position where \G can match */
3581 (flags & REXEC_IGNOREPOS)
3582 ? stringarg /* use start pos rather than pos() */
3583 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3584 /* Defined pos(): */
3585 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3586 : strbeg; /* pos() not defined; use start of string */
3588 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3589 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3591 /* in the presence of \G, we may need to start looking earlier in
3592 * the string than the suggested start point of stringarg:
3593 * if prog->gofs is set, then that's a known, fixed minimum
3596 * /ab|c\G/: gofs = 1
3597 * or if the minimum offset isn't known, then we have to go back
3598 * to the start of the string, e.g. /w+\G/
3601 if (prog->intflags & PREGf_ANCH_GPOS) {
3603 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3605 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3607 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3608 "fail: ganch-gofs before earliest possible start\n"));
3613 startpos = reginfo->ganch;
3615 else if (prog->gofs) {
3616 startpos = HOPBACKc(startpos, prog->gofs);
3620 else if (prog->intflags & PREGf_GPOS_FLOAT)
3624 minlen = prog->minlen;
3625 if ((startpos + minlen) > strend || startpos < strbeg) {
3626 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3627 "Regex match can't succeed, so not even tried\n"));
3631 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3632 * which will call destuctors to reset PL_regmatch_state, free higher
3633 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3634 * regmatch_info_aux_eval */
3636 oldsave = PL_savestack_ix;
3640 if ((prog->extflags & RXf_USE_INTUIT)
3641 && !(flags & REXEC_CHECKED))
3643 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3648 if (prog->extflags & RXf_CHECK_ALL) {
3649 /* we can match based purely on the result of INTUIT.
3650 * Set up captures etc just for $& and $-[0]
3651 * (an intuit-only match wont have $1,$2,..) */
3652 assert(!prog->nparens);
3654 /* s/// doesn't like it if $& is earlier than where we asked it to
3655 * start searching (which can happen on something like /.\G/) */
3656 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3659 /* this should only be possible under \G */
3660 assert(prog->intflags & PREGf_GPOS_SEEN);
3661 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3662 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3666 /* match via INTUIT shouldn't have any captures.
3667 * Let @-, @+, $^N know */
3668 prog->lastparen = prog->lastcloseparen = 0;
3669 RXp_MATCH_UTF8_set(prog, utf8_target);
3670 prog->offs[0].start = s - strbeg;
3671 prog->offs[0].end = utf8_target
3672 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3673 : s - strbeg + prog->minlenret;
3674 if ( !(flags & REXEC_NOT_FIRST) )
3675 S_reg_set_capture_string(aTHX_ rx,
3677 sv, flags, utf8_target);
3683 multiline = prog->extflags & RXf_PMf_MULTILINE;
3685 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3687 "String too short [regexec_flags]...\n"));
3691 /* Check validity of program. */
3692 if (UCHARAT(progi->program) != REG_MAGIC) {
3693 Perl_croak(aTHX_ "corrupted regexp program");
3696 RXp_MATCH_TAINTED_off(prog);
3697 RXp_MATCH_UTF8_set(prog, utf8_target);
3699 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3700 reginfo->intuit = 0;
3701 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3702 reginfo->warned = FALSE;
3704 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3705 /* see how far we have to get to not match where we matched before */
3706 reginfo->till = stringarg + minend;
3708 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3709 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3710 S_cleanup_regmatch_info_aux has executed (registered by
3711 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3712 magic belonging to this SV.
3713 Not newSVsv, either, as it does not COW.
3715 reginfo->sv = newSV_type(SVt_NULL);
3716 SvSetSV_nosteal(reginfo->sv, sv);
3717 SAVEFREESV(reginfo->sv);
3720 /* reserve next 2 or 3 slots in PL_regmatch_state:
3721 * slot N+0: may currently be in use: skip it
3722 * slot N+1: use for regmatch_info_aux struct
3723 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3724 * slot N+3: ready for use by regmatch()
3728 regmatch_state *old_regmatch_state;
3729 regmatch_slab *old_regmatch_slab;
3730 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3732 /* on first ever match, allocate first slab */
3733 if (!PL_regmatch_slab) {
3734 Newx(PL_regmatch_slab, 1, regmatch_slab);
3735 PL_regmatch_slab->prev = NULL;
3736 PL_regmatch_slab->next = NULL;
3737 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3740 old_regmatch_state = PL_regmatch_state;
3741 old_regmatch_slab = PL_regmatch_slab;
3743 for (i=0; i <= max; i++) {
3745 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3747 reginfo->info_aux_eval =
3748 reginfo->info_aux->info_aux_eval =
3749 &(PL_regmatch_state->u.info_aux_eval);
3751 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3752 PL_regmatch_state = S_push_slab(aTHX);
3755 /* note initial PL_regmatch_state position; at end of match we'll
3756 * pop back to there and free any higher slabs */
3758 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3759 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3760 reginfo->info_aux->poscache = NULL;
3762 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3764 if ((prog->extflags & RXf_EVAL_SEEN))
3765 S_setup_eval_state(aTHX_ reginfo);
3767 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3770 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3771 /* We have to be careful. If the previous successful match
3772 was from this regex we don't want a subsequent partially
3773 successful match to clobber the old results.
3774 So when we detect this possibility we add a swap buffer
3775 to the re, and switch the buffer each match. If we fail,
3776 we switch it back; otherwise we leave it swapped.
3779 /* avoid leak if we die, or clean up anyway if match completes */
3781 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3782 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3783 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3791 if (prog->recurse_locinput)
3792 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3794 /* Simplest case: anchored match (but not \G) need be tried only once,
3795 * or with MBOL, only at the beginning of each line.
3797 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3798 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3799 * match at the start of the string then it won't match anywhere else
3800 * either; while with /.*.../, if it doesn't match at the beginning,
3801 * the earliest it could match is at the start of the next line */
3803 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3806 if (regtry(reginfo, &s))
3809 if (!(prog->intflags & PREGf_ANCH_MBOL))
3812 /* didn't match at start, try at other newline positions */
3815 dontbother = minlen - 1;
3816 end = HOP3c(strend, -dontbother, strbeg) - 1;
3818 /* skip to next newline */
3820 while (s <= end) { /* note it could be possible to match at the end of the string */
3821 /* NB: newlines are the same in unicode as they are in latin */
3824 if (prog->check_substr || prog->check_utf8) {
3825 /* note that with PREGf_IMPLICIT, intuit can only fail
3826 * or return the start position, so it's of limited utility.
3827 * Nevertheless, I made the decision that the potential for
3828 * quick fail was still worth it - DAPM */
3829 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3833 if (regtry(reginfo, &s))
3837 } /* end anchored search */
3839 /* anchored \G match */
3840 if (prog->intflags & PREGf_ANCH_GPOS)
3842 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3843 assert(prog->intflags & PREGf_GPOS_SEEN);
3844 /* For anchored \G, the only position it can match from is
3845 * (ganch-gofs); we already set startpos to this above; if intuit
3846 * moved us on from there, we can't possibly succeed */
3847 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3848 if (s == startpos && regtry(reginfo, &s))
3853 /* Messy cases: unanchored match. */
3855 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3856 /* we have /x+whatever/ */
3857 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3863 if (! prog->anchored_utf8) {
3864 to_utf8_substr(prog);
3866 ch = SvPVX_const(prog->anchored_utf8)[0];
3867 REXEC_FBC_UTF8_SCAN(
3869 DEBUG_EXECUTE_r( did_match = 1 );
3870 if (regtry(reginfo, &s)) goto got_it;
3871 s += UTF8_SAFE_SKIP(s, strend);
3872 while (s < strend && *s == ch)
3879 if (! prog->anchored_substr) {
3880 if (! to_byte_substr(prog)) {
3881 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3884 ch = SvPVX_const(prog->anchored_substr)[0];
3885 REXEC_FBC_NON_UTF8_SCAN(
3887 DEBUG_EXECUTE_r( did_match = 1 );
3888 if (regtry(reginfo, &s)) goto got_it;
3890 while (s < strend && *s == ch)
3895 DEBUG_EXECUTE_r(if (!did_match)
3896 Perl_re_printf( aTHX_
3897 "Did not find anchored character...\n")
3900 else if (prog->anchored_substr != NULL
3901 || prog->anchored_utf8 != NULL
3902 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3903 && prog->float_max_offset < strend - s)) {
3908 char *last1; /* Last position checked before */
3912 if (prog->anchored_substr || prog->anchored_utf8) {
3914 if (! prog->anchored_utf8) {
3915 to_utf8_substr(prog);
3917 must = prog->anchored_utf8;
3920 if (! prog->anchored_substr) {
3921 if (! to_byte_substr(prog)) {
3922 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3925 must = prog->anchored_substr;
3927 back_max = back_min = prog->anchored_offset;
3930 if (! prog->float_utf8) {
3931 to_utf8_substr(prog);
3933 must = prog->float_utf8;
3936 if (! prog->float_substr) {
3937 if (! to_byte_substr(prog)) {
3938 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3941 must = prog->float_substr;
3943 back_max = prog->float_max_offset;
3944 back_min = prog->float_min_offset;
3950 last = HOP3c(strend, /* Cannot start after this */
3951 -(SSize_t)(CHR_SVLEN(must)
3952 - (SvTAIL(must) != 0) + back_min), strbeg);
3954 if (s > reginfo->strbeg)
3955 last1 = HOPc(s, -1);
3957 last1 = s - 1; /* bogus */
3959 /* XXXX check_substr already used to find "s", can optimize if
3960 check_substr==must. */
3962 strend = HOPc(strend, -dontbother);
3963 while ( (s <= last) &&
3964 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3965 (unsigned char*)strend, must,
3966 multiline ? FBMrf_MULTILINE : 0)) ) {
3967 DEBUG_EXECUTE_r( did_match = 1 );
3968 if (HOPc(s, -back_max) > last1) {
3969 last1 = HOPc(s, -back_min);
3970 s = HOPc(s, -back_max);
3973 char * const t = (last1 >= reginfo->strbeg)
3974 ? HOPc(last1, 1) : last1 + 1;
3976 last1 = HOPc(s, -back_min);
3980 while (s <= last1) {
3981 if (regtry(reginfo, &s))
3984 s++; /* to break out of outer loop */
3991 while (s <= last1) {
3992 if (regtry(reginfo, &s))
3998 DEBUG_EXECUTE_r(if (!did_match) {
3999 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
4000 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
4001 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
4002 ((must == prog->anchored_substr || must == prog->anchored_utf8)
4003 ? "anchored" : "floating"),
4004 quoted, RE_SV_TAIL(must));
4008 else if ( (c = progi->regstclass) ) {
4010 const OPCODE op = OP(progi->regstclass);
4011 /* don't bother with what can't match */
4012 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
4013 strend = HOPc(strend, -(minlen - 1));
4016 SV * const prop = sv_newmortal();
4017 regprop(prog, prop, c, reginfo, NULL);
4019 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4020 s,strend-s,PL_dump_re_max_len);
4021 Perl_re_printf( aTHX_
4022 "Matching stclass %.*s against %s (%d bytes)\n",
4023 (int)SvCUR(prop), SvPVX_const(prop),
4024 quoted, (int)(strend - s));
4027 if (find_byclass(prog, c, s, strend, reginfo))
4029 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
4033 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4041 if (! prog->float_utf8) {
4042 to_utf8_substr(prog);
4044 float_real = prog->float_utf8;
4047 if (! prog->float_substr) {
4048 if (! to_byte_substr(prog)) {
4049 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4052 float_real = prog->float_substr;
4055 little = SvPV_const(float_real, len);
4056 if (SvTAIL(float_real)) {
4057 /* This means that float_real contains an artificial \n on
4058 * the end due to the presence of something like this:
4059 * /foo$/ where we can match both "foo" and "foo\n" at the
4060 * end of the string. So we have to compare the end of the
4061 * string first against the float_real without the \n and
4062 * then against the full float_real with the string. We
4063 * have to watch out for cases where the string might be
4064 * smaller than the float_real or the float_real without
4066 char *checkpos= strend - len;
4068 Perl_re_printf( aTHX_
4069 "%sChecking for float_real.%s\n",
4070 PL_colors[4], PL_colors[5]));
4071 if (checkpos + 1 < strbeg) {
4072 /* can't match, even if we remove the trailing \n
4073 * string is too short to match */
4075 Perl_re_printf( aTHX_
4076 "%sString shorter than required trailing substring, cannot match.%s\n",
4077 PL_colors[4], PL_colors[5]));
4079 } else if (memEQ(checkpos + 1, little, len - 1)) {
4080 /* can match, the end of the string matches without the
4082 last = checkpos + 1;
4083 } else if (checkpos < strbeg) {
4084 /* cant match, string is too short when the "\n" is
4087 Perl_re_printf( aTHX_
4088 "%sString does not contain required trailing substring, cannot match.%s\n",
4089 PL_colors[4], PL_colors[5]));
4091 } else if (!multiline) {
4092 /* non multiline match, so compare with the "\n" at the
4093 * end of the string */
4094 if (memEQ(checkpos, little, len)) {
4098 Perl_re_printf( aTHX_
4099 "%sString does not contain required trailing substring, cannot match.%s\n",
4100 PL_colors[4], PL_colors[5]));
4104 /* multiline match, so we have to search for a place
4105 * where the full string is located */
4111 last = rninstr(s, strend, little, little + len);
4113 last = strend; /* matching "$" */
4116 /* at one point this block contained a comment which was
4117 * probably incorrect, which said that this was a "should not
4118 * happen" case. Even if it was true when it was written I am
4119 * pretty sure it is not anymore, so I have removed the comment
4120 * and replaced it with this one. Yves */
4122 Perl_re_printf( aTHX_
4123 "%sString does not contain required substring, cannot match.%s\n",
4124 PL_colors[4], PL_colors[5]
4128 dontbother = strend - last + prog->float_min_offset;
4130 if (minlen && (dontbother < minlen))
4131 dontbother = minlen - 1;
4132 strend -= dontbother; /* this one's always in bytes! */
4133 /* We don't know much -- general case. */
4136 if (regtry(reginfo, &s))
4145 if (regtry(reginfo, &s))
4147 } while (s++ < strend);
4155 /* s/// doesn't like it if $& is earlier than where we asked it to
4156 * start searching (which can happen on something like /.\G/) */
4157 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
4158 && (prog->offs[0].start < stringarg - strbeg))
4160 /* this should only be possible under \G */
4161 assert(prog->intflags & PREGf_GPOS_SEEN);
4162 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4163 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4167 /* clean up; this will trigger destructors that will free all slabs
4168 * above the current one, and cleanup the regmatch_info_aux
4169 * and regmatch_info_aux_eval sructs */
4171 LEAVE_SCOPE(oldsave);
4173 if (RXp_PAREN_NAMES(prog))
4174 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4176 /* make sure $`, $&, $', and $digit will work later */
4177 if ( !(flags & REXEC_NOT_FIRST) )
4178 S_reg_set_capture_string(aTHX_ rx,
4179 strbeg, reginfo->strend,
4180 sv, flags, utf8_target);
4185 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
4186 PL_colors[4], PL_colors[5]));
4189 /* we failed :-( roll it back.
4190 * Since the swap buffer will be freed on scope exit which follows
4191 * shortly, restore the old captures by copying 'swap's original
4192 * data to the new offs buffer
4194 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4195 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4202 Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4205 /* clean up; this will trigger destructors that will free all slabs
4206 * above the current one, and cleanup the regmatch_info_aux
4207 * and regmatch_info_aux_eval sructs */
4209 LEAVE_SCOPE(oldsave);
4215 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4216 * Do inc before dec, in case old and new rex are the same */
4217 #define SET_reg_curpm(Re2) \
4218 if (reginfo->info_aux_eval) { \
4219 (void)ReREFCNT_inc(Re2); \
4220 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
4221 PM_SETRE((PL_reg_curpm), (Re2)); \
4226 - regtry - try match at specific point
4228 STATIC bool /* 0 failure, 1 success */
4229 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4232 REGEXP *const rx = reginfo->prog;
4233 regexp *const prog = ReANY(rx);
4236 U32 depth = 0; /* used by REGCP_SET */
4238 RXi_GET_DECL(prog,progi);
4239 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4241 PERL_ARGS_ASSERT_REGTRY;
4243 reginfo->cutpoint=NULL;
4245 prog->offs[0].start = *startposp - reginfo->strbeg;
4246 prog->lastparen = 0;
4247 prog->lastcloseparen = 0;
4249 /* XXXX What this code is doing here?!!! There should be no need
4250 to do this again and again, prog->lastparen should take care of
4253 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4254 * Actually, the code in regcppop() (which Ilya may be meaning by
4255 * prog->lastparen), is not needed at all by the test suite
4256 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4257 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4258 * Meanwhile, this code *is* needed for the
4259 * above-mentioned test suite tests to succeed. The common theme
4260 * on those tests seems to be returning null fields from matches.
4261 * --jhi updated by dapm */
4263 /* After encountering a variant of the issue mentioned above I think
4264 * the point Ilya was making is that if we properly unwind whenever
4265 * we set lastparen to a smaller value then we should not need to do
4266 * this every time, only when needed. So if we have tests that fail if
4267 * we remove this, then it suggests somewhere else we are improperly
4268 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4269 * places it is called, and related regcp() routines. - Yves */
4271 if (prog->nparens) {
4272 regexp_paren_pair *pp = prog->offs;
4274 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4282 result = regmatch(reginfo, *startposp, progi->program + 1);
4284 prog->offs[0].end = result;
4287 if (reginfo->cutpoint)
4288 *startposp= reginfo->cutpoint;
4289 REGCP_UNWIND(lastcp);
4293 /* this is used to determine how far from the left messages like
4294 'failed...' are printed in regexec.c. It should be set such that
4295 messages are inline with the regop output that created them.
4297 #define REPORT_CODE_OFF 29
4298 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4301 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4305 PerlIO *f= Perl_debug_log;
4306 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4307 va_start(ap, depth);
4308 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4309 result = PerlIO_vprintf(f, fmt, ap);
4313 #endif /* DEBUGGING */
4315 /* grab a new slab and return the first slot in it */
4317 STATIC regmatch_state *
4320 regmatch_slab *s = PL_regmatch_slab->next;
4322 Newx(s, 1, regmatch_slab);
4323 s->prev = PL_regmatch_slab;
4325 PL_regmatch_slab->next = s;
4327 PL_regmatch_slab = s;
4328 return SLAB_FIRST(s);
4334 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4335 const char *start, const char *end, const char *blurb)
4337 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4339 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4344 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4345 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4347 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4348 start, end - start, PL_dump_re_max_len);
4350 Perl_re_printf( aTHX_
4351 "%s%s REx%s %s against %s\n",
4352 PL_colors[4], blurb, PL_colors[5], s0, s1);
4354 if (utf8_target||utf8_pat)
4355 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4356 utf8_pat ? "pattern" : "",
4357 utf8_pat && utf8_target ? " and " : "",
4358 utf8_target ? "string" : ""
4364 S_dump_exec_pos(pTHX_ const char *locinput,
4365 const regnode *scan,
4366 const char *loc_regeol,
4367 const char *loc_bostr,
4368 const char *loc_reg_starttry,
4369 const bool utf8_target,
4373 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4374 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4375 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4376 /* The part of the string before starttry has one color
4377 (pref0_len chars), between starttry and current
4378 position another one (pref_len - pref0_len chars),
4379 after the current position the third one.
4380 We assume that pref0_len <= pref_len, otherwise we
4381 decrease pref0_len. */
4382 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4383 ? (5 + taill) - l : locinput - loc_bostr;
4386 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4388 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4390 pref0_len = pref_len - (locinput - loc_reg_starttry);
4391 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4392 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4393 ? (5 + taill) - pref_len : loc_regeol - locinput);
4394 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4398 if (pref0_len > pref_len)
4399 pref0_len = pref_len;
4401 const int is_uni = utf8_target ? 1 : 0;
4403 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4404 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4406 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4407 (locinput - pref_len + pref0_len),
4408 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4410 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4411 locinput, loc_regeol - locinput, 10, 0, 1);
4413 const STRLEN tlen=len0+len1+len2;
4414 Perl_re_printf( aTHX_
4415 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4" UVuf "| ",
4416 (IV)(locinput - loc_bostr),
4419 (docolor ? "" : "> <"),
4421 (int)(tlen > 19 ? 0 : 19 - tlen),
4429 /* reg_check_named_buff_matched()
4430 * Checks to see if a named buffer has matched. The data array of
4431 * buffer numbers corresponding to the buffer is expected to reside
4432 * in the regexp->data->data array in the slot stored in the ARG() of
4433 * node involved. Note that this routine doesn't actually care about the
4434 * name, that information is not preserved from compilation to execution.
4435 * Returns the index of the leftmost defined buffer with the given name
4436 * or 0 if non of the buffers matched.
4439 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4442 RXi_GET_DECL(rex,rexi);
4443 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4444 I32 *nums=(I32*)SvPVX(sv_dat);
4446 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4448 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4449 if ((I32)rex->lastparen >= nums[n] &&
4450 rex->offs[nums[n]].end != -1)
4459 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4460 struct next_matchable_info * m,
4461 regmatch_info *reginfo)
4463 /* This function determines various characteristics about every possible
4464 * initial match of the passed-in EXACTish <text_node>, and stores them in
4467 * That includes a match string and a parallel mask, such that if you AND
4468 * the target string with the mask and compare with the match string,
4469 * you'll have a pretty good idea, perhaps even perfect, if that portion of
4470 * the target matches or not.
4472 * The motivation behind this function is to allow the caller to set up
4473 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where
4474 * B is an arbitrary EXACTish node. To find the end of .*, we look for the
4475 * beginning oF B, which is the passed in <text_node> That's where this
4476 * function comes in. The values it returns can quickly be used to rule
4477 * out many, or all, cases of possible matches not actually being the
4478 * beginning of B, <text_node>. It is also used in regrepeat() where we
4479 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently
4480 * determine where the span of 'A's stop.
4482 * If <text_node> is of type EXACT, there is only one possible character
4483 * that can match its first character, and so the situation is quite
4484 * simple. But things can get much more complicated if folding is
4485 * involved. It may be that the first character of an EXACTFish node
4486 * doesn't participate in any possible fold, e.g., punctuation, so it can
4487 * be matched only by itself. The vast majority of characters that are in
4488 * folds match just two things, their lower and upper-case equivalents.
4489 * But not all are like that; some have multiple possible matches, or match
4490 * sequences of more than one character. This function sorts all that out.
4492 * It returns information about all possibilities of what the first
4493 * character(s) of <text_node> could look like. Again, if <text_node> is a
4494 * plain EXACT node, that's just the actual first bytes of the first
4495 * character; but otherwise it is the bytes, that when masked, match all
4496 * possible combinations of all the initial bytes of all the characters
4497 * that could match, folded. (Actually, this is a slight over promise. It
4498 * handles only up to the initial 5 bytes, which is enough for all Unicode
4499 * characters, but not for all non-Unicode ones.)
4501 * Here's an example to clarify. Suppose the first character of
4502 * <text_node> is the letter 'C', and we are under /i matching. That means
4503 * 'c' also matches. The representations of these two characters differ in
4504 * just one bit, so the mask would be a zero in that position and ones in
4505 * the other 7. And the returned string would be the AND of these two
4506 * characters, and would be one byte long, since these characters are each
4507 * a single byte. ANDing the target <text_node> with this mask will yield
4508 * the returned string if and only if <text_node> begins with one of these
4509 * two characters. So, the function would also return that the definitive
4510 * length matched is 1 byte.
4512 * Now, suppose instead of the letter 'C', <text_node> begins with the
4513 * letter 'F'. The situation is much more complicated because there are
4514 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4515 * begins with 'f', and hence could match. We add these into the returned
4516 * string and mask, but the result isn't definitive; the caller has to
4517 * check further if its AND and compare pass. But the failure of that
4518 * compare will quickly rule out most possible inputs.
4520 * Much of this could be done in regcomp.c at compile time, except for
4521 * locale-dependent, and UTF-8 target dependent data. Extra data fields
4522 * could be used for one or the other eventualities.
4524 * If this function determines that no possible character in the target
4525 * string can match, it returns FALSE; otherwise TRUE. (The FALSE
4526 * situation occurs if the first character in <text_node> requires UTF-8 to
4527 * represent, and the target string isn't in UTF-8.)
4529 * Some analysis is in GH #18414, located at the time of this writing at:
4530 * https://github.com/Perl/perl5/issues/18414
4533 const bool utf8_target = reginfo->is_utf8_target;
4534 bool utf8_pat = reginfo->is_utf8_pat;
4536 PERL_UINT_FAST8_T i;
4538 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4540 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4541 U8 lengths[MAX_MATCHES] = { 0 };
4543 U8 index_of_longest = 0;
4545 U8 *pat = (U8*)STRING(text_node);
4546 Size_t pat_len = STR_LEN(text_node);
4547 U8 op = OP(text_node);
4549 U8 byte_mask[5] = {0};
4550 U8 byte_anded[5] = {0};
4552 /* There are some folds in Unicode to multiple characters. This will hold
4553 * such characters that could fold to the beginning of 'text_node' */
4554 UV multi_fold_from = 0;
4556 /* We may have to create a modified copy of the pattern */
4557 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4560 m->min_length = 255;
4563 /* Even if the first character in the node can match something in Latin1,
4564 * if there is anything in the node that can't, the match must fail */
4565 if (! utf8_target && isEXACT_REQ8(op)) {
4569 /* Define a temporary op for use in this function, using an existing one that
4570 * should never be a real op during execution */
4571 #define TURKISH PSEUDO
4573 /* What to do about these two nodes had to be deferred to runtime (which is
4574 * now). If the extra information we now have so indicates, turn them into
4576 if ( (op == EXACTF && utf8_target)
4577 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4579 if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4586 /* And certain situations are better handled if we create a modified
4587 * version of the pattern */
4588 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4589 specific problematic characters */
4590 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4592 /* The node could start with characters that are the first ones
4593 * of a multi-character fold. */
4595 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4596 if (multi_fold_from) {
4598 /* Here, they do form a sequence that matches the fold of a
4599 * single character. That single character then is a
4600 * possible match. Below we will look again at this, but
4601 * the code below is expecting every character in the
4602 * pattern to be folded, which the input isn't required to
4603 * be in this case. So, just fold the single character,
4604 * and the result will be in the expected form. */
4605 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4609 /* Turkish has a couple extra possibilities. */
4610 else if ( UNLIKELY(op == TURKISH)
4612 && isALPHA_FOLD_EQ(pat[0], 'f')
4613 && ( memBEGINs(pat + 1, pat_len - 1,
4614 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4616 && isALPHA_FOLD_EQ(pat[1], 'f')
4617 && memBEGINs(pat + 2, pat_len - 2,
4618 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4620 /* The macros for finding a multi-char fold don't include
4621 * the Turkish possibilities, in which U+130 folds to 'i'.
4622 * Hard-code these. It's very unlikely that Unicode will
4623 * ever add any others. */
4624 if (pat[1] == 'f') {
4626 Copy("ffi", mod_pat, pat_len, U8);
4630 Copy("fi", mod_pat, pat_len, U8);
4634 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat)
4635 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4636 && LIKELY(memNEs(pat, pat_len,
4637 LATIN_SMALL_LETTER_SHARP_S_UTF8))
4638 && (LIKELY(op != TURKISH || *pat != 'I')))
4640 /* For all cases of things between 0-255, except the ones
4641 * in the conditional above, the fold is just the lower
4642 * case, which is faster than the more general case. */
4643 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4649 else { /* Code point above 255, or needs special handling */
4650 _to_utf8_fold_flags(pat, pat + pat_len,
4652 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4657 else if /* Below is not a UTF-8 pattern; there's a somewhat different
4658 set of problematic characters */
4660 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4662 /* We may have to canonicalize a multi-char fold, as in the UTF-8
4664 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4668 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4669 mod_pat[0] = mod_pat[1] = 's';
4671 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4672 chars, and speeds copying */
4675 else if (LIKELY(op != TURKISH || *pat != 'I')) {
4676 mod_pat[0] = toLOWER_L1(*pat);
4681 else if /* Below isn't a node that we convert to UTF-8 */
4684 && op == EXACTFAA_NO_TRIE
4685 && *pat == LATIN_SMALL_LETTER_SHARP_S)
4687 /* A very special case. Folding U+DF goes to U+17F under /iaa. We
4688 * did this at compile time when the pattern was UTF-8 , but otherwise
4689 * we couldn't do it earlier, because it requires a UTF-8 target for
4690 * this match to be legal. */
4691 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4692 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4693 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4698 /* Here, we have taken care of the initial work for a few very problematic
4699 * situations, possibly creating a modified pattern.
4701 * Now ready for the general case. We build up all the possible things
4702 * that could match the first character of the pattern into the elements of
4705 * Everything generally matches at least itself. But if there is a
4706 * UTF8ness mismatch, we have to convert to that of the target string. */
4707 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
4708 matches[0][0] = pat[0];
4712 else if (utf8_target) {
4714 lengths[0] = UTF8SKIP(pat);
4715 Copy(pat, matches[0], lengths[0], U8);
4718 else { /* target is UTF-8, pattern isn't */
4719 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4720 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4725 else if (! utf8_pat) { /* Neither is UTF-8 */
4726 matches[0][0] = pat[0];
4730 else /* target isn't UTF-8; pattern is. No match possible unless the
4731 pattern's first character can fit in a byte */
4732 if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4734 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4739 /* Here we have taken care of any necessary node-type changes */
4742 m->max_length = lengths[0];
4743 m->min_length = lengths[0];
4746 /* For non-folding nodes, there are no other possible candidate matches,
4747 * but for foldable ones, we have to look further. */
4748 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4749 UV folded; /* The first character in the pattern, folded */
4750 U32 first_fold_from; /* A character that folds to it */
4751 const U32 * remaining_fold_froms; /* The remaining characters that
4752 fold to it, if any */
4753 Size_t folds_to_count; /* The total number of characters that fold to
4756 /* If the node begins with a sequence of more than one character that
4757 * together form the fold of a single character, it is called a
4758 * 'multi-character fold', and the normal functions don't handle this
4759 * case. We set 'multi_fold_from' to the single folded-from character,
4760 * which is handled in an extra iteration below */
4762 folded = valid_utf8_to_uvchr(pat, NULL);
4764 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4769 /* This may generate illegal combinations for things like EXACTF,
4770 * but rather than repeat the logic and exclude them here, all such
4771 * illegalities are checked for and skipped below in the loop */
4773 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4776 /* Everything matches at least itself; initialize to that because the
4777 * only the branches below that set it are the ones where the number
4781 /* There are a few special cases for locale-dependent nodes, where the
4782 * run-time context was needed before we could know what matched */
4783 if (UNLIKELY(op == EXACTFL) && folded < 256) {
4784 first_fold_from = PL_fold_locale[folded];
4786 else if ( op == EXACTFL && utf8_target && utf8_pat
4787 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4788 LATIN_SMALL_LETTER_LONG_S_UTF8))
4790 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4792 else if (UNLIKELY( op == TURKISH
4793 && ( isALPHA_FOLD_EQ(folded, 'i')
4795 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4796 LATIN_SMALL_LETTER_DOTLESS_I))))
4797 { /* Turkish folding requires special handling */
4799 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4800 else if (folded == 'I')
4801 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4802 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4803 first_fold_from = 'i';
4804 else first_fold_from = 'I';
4807 /* Here, isn't a special case: use the generic function to
4808 * calculate what folds to this */
4810 /* Look up what code points (besides itself) fold to 'folded';
4811 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4812 folds_to_count = _inverse_folds(folded, &first_fold_from,
4813 &remaining_fold_froms);
4816 /* Add each character that folds to 'folded' to the list of them,
4817 * subject to limitations based on the node type and target UTF8ness.
4818 * If there was a character that folded to multiple characters, do an
4819 * extra iteration for it. (Note the extra iteration if there is a
4820 * multi-character fold) */
4821 for (i = 0; i < folds_to_count
4822 + UNLIKELY(multi_fold_from != 0); i++)
4826 if (i >= folds_to_count) { /* Final iteration: handle the
4828 fold_from = multi_fold_from;
4831 fold_from = first_fold_from;
4833 else if (i < folds_to_count) {
4834 fold_from = remaining_fold_froms[i-1];
4837 if (folded == fold_from) { /* We already added the character
4842 /* EXACTF doesn't have any non-ascii folds */
4843 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4847 /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4849 if ( isASCII(folded) != isASCII(fold_from)
4850 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4856 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4857 * locale, but those have been converted to EXACTFU above) */
4859 && (folded < 256) != (fold_from < 256))
4864 /* If this triggers, it likely is because of the unlikely case
4865 * where a new Unicode standard has changed what MAX_MATCHES should
4867 assert(m->count < MAX_MATCHES);
4869 /* Add this character to the list of possible matches */
4871 uvchr_to_utf8(matches[m->count], fold_from);
4872 lengths[m->count] = UVCHR_SKIP(fold_from);
4875 else { /* Non-UTF8 target: no code point above 255 can appear in it
4877 if (fold_from > 255) {
4881 matches[m->count][0] = fold_from;
4882 lengths[m->count] = 1;
4886 /* Update min and mlengths */
4887 if (m->min_length > lengths[m->count-1]) {
4888 m->min_length = lengths[m->count-1];
4891 if (m->max_length < lengths[m->count-1]) {
4892 index_of_longest = m->count - 1;
4893 m->max_length = lengths[index_of_longest];
4895 } /* looped through each potential fold */
4897 /* If there is something that folded to an initial multi-character
4898 * fold, repeat, using it. This catches some edge cases. An example
4899 * of one is /ss/i when UTF-8 encoded. The function
4900 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4901 * (LATIN SMALL SHARP S). If it returned a list of characters, this
4902 * code wouldn't be needed. But since it doesn't, we have to look what
4903 * folds to the U+DF. In this case, U+1E9E does, and has to be added.
4905 if (multi_fold_from) {
4906 folded = multi_fold_from;
4907 multi_fold_from = 0;
4910 } /* End of finding things that participate in this fold */
4912 if (m->count == 0) { /* If nothing found, can't match */
4917 /* Have calculated all possible matches. Now calculate the mask and AND
4919 m->initial_exact = 0;
4920 m->initial_definitive = 0;
4923 unsigned int mask_ones = 0;
4924 unsigned int possible_ones = 0;
4927 /* For each byte that is in all possible matches ... */
4928 for (j = 0; j < MIN(m->min_length, 5); j++) {
4930 /* Initialize the accumulator for this byte */
4931 byte_mask[j] = 0xFF;
4932 byte_anded[j] = matches[0][j];
4934 /* Then the rest of the rows (folds). The mask is based on, like,
4935 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4936 * where they differ. */
4937 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4938 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]);
4939 byte_anded[j] &= matches[i][j];
4942 /* Keep track of the number of initial mask bytes that are all one
4943 * bits. The code calling this can use this number to know that
4944 * a string that matches this number of bytes in the pattern is an
4945 * exact match of that pattern for this number of bytes. But also
4946 * counted are the number of initial bytes that in total have a
4947 * single zero bit. If a string matches those, masked, it must be
4948 * one of two possibilites, both of which this function has
4949 * determined are legal. (But if that single 0 is one of the
4950 * initial bits for masking a UTF-8 start byte, that could
4951 * incorrectly lead to different length strings appearing to be
4952 * equivalent, so only do this optimization when the matchables are
4953 * all the same length. This was uncovered by testing
4955 if (m->min_length == m->max_length) {
4956 mask_ones += PL_bitcount[byte_mask[j]];
4958 if (mask_ones + 1 >= possible_ones) {
4959 m->initial_definitive++;
4960 if (mask_ones >= possible_ones) {
4968 /* The first byte is separate for speed */
4969 m->first_byte_mask = byte_mask[0];
4970 m->first_byte_anded = byte_anded[0];
4972 /* Then pack up to the next 4 bytes into a word */
4973 m->mask32 = m->anded32 = 0;
4974 for (i = 1; i < MIN(m->min_length, 5); i++) {
4976 U8 shift = (which - 1) * 8;
4977 m->mask32 |= (U32) byte_mask[i] << shift;
4978 m->anded32 |= (U32) byte_anded[i] << shift;
4981 /* Finally, take the match strings and place them sequentially into a
4982 * one-dimensional array. (This is done to save significant space in the
4983 * structure.) Sort so the longest (presumably the least likely) is last.
4984 * XXX When this gets moved to regcomp, may want to fully sort shortest
4985 * first, but above we generally used the folded code point first, and
4986 * those tend to be no longer than their upper case values, so this is
4987 * already pretty well sorted by size.
4989 * If the asserts fail, it's most likely because a new version of the
4990 * Unicode standard requires more space; simply increase the declaration
4994 U8 output_index = 0;
4996 if (m->count > 1) { /* No need to sort a single entry */
4997 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
4999 /* Keep the same order for all but the longest. (If the
5000 * asserts fail, it could be because m->matches is declared too
5001 * short, either because of a new Unicode release, or an
5002 * overlooked test case, or it could be a bug.) */
5003 if (i != index_of_longest) {
5004 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
5005 Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
5006 cur_pos += lengths[i];
5007 m->lengths[output_index++] = lengths[i];
5012 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
5013 Copy(matches[index_of_longest], m->matches + cur_pos,
5014 lengths[index_of_longest], U8);
5016 /* Place the longest match last */
5017 m->lengths[output_index] = lengths[index_of_longest];
5024 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */
5026 S_test_EXACTISH_ST(const char * loc,
5027 struct next_matchable_info info)
5029 /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5030 * bytes starting at 'loc' can match based on 'next_matchable_info' */
5034 /* Check the first byte */
5035 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5038 /* Pack the next up-to-4 bytes into a 32 bit word */
5039 switch (info.min_length) {
5041 input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5044 input32 |= (U8) loc[3] << 2 * 8;
5047 input32 |= (U8) loc[2] << 1 * 8;
5050 input32 |= (U8) loc[1];
5053 return TRUE; /* We already tested and passed the 0th byte */
5058 /* And AND that with the mask and compare that with the assembled ANDED
5060 return (input32 & info.mask32) == info.anded32;
5064 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5066 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5067 * between the inputs. See https://www.unicode.org/reports/tr29/. */
5069 PERL_ARGS_ASSERT_ISGCB;
5071 switch (GCB_table[before][after]) {
5078 case GCB_RI_then_RI:
5081 U8 * temp_pos = (U8 *) curpos;
5083 /* Do not break within emoji flag sequences. That is, do not
5084 * break between regional indicator (RI) symbols if there is an
5085 * odd number of RI characters before the break point.
5086 * GB12 sot (RI RI)* RI × RI
5087 * GB13 [^RI] (RI RI)* RI × RI */
5089 while (backup_one_GCB(strbeg,
5091 utf8_target) == GCB_Regional_Indicator)
5096 return RI_count % 2 != 1;
5099 case GCB_EX_then_EM:
5101 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
5103 U8 * temp_pos = (U8 *) curpos;
5107 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5109 while (prev == GCB_Extend);
5111 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5114 case GCB_Maybe_Emoji_NonBreak:
5118 /* Do not break within emoji modifier sequences or emoji zwj sequences.
5119 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5121 U8 * temp_pos = (U8 *) curpos;
5125 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5127 while (prev == GCB_Extend);
5129 return prev != GCB_ExtPict_XX;
5137 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5138 before, after, GCB_table[before][after]);
5145 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5149 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5151 if (*curpos < strbeg) {
5156 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5157 U8 * prev_prev_char_pos;
5159 if (! prev_char_pos) {
5163 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5164 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5165 *curpos = prev_char_pos;
5166 prev_char_pos = prev_prev_char_pos;
5169 *curpos = (U8 *) strbeg;
5174 if (*curpos - 2 < strbeg) {
5175 *curpos = (U8 *) strbeg;
5179 gcb = getGCB_VAL_CP(*(*curpos - 1));
5185 /* Combining marks attach to most classes that precede them, but this defines
5186 * the exceptions (from TR14) */
5187 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
5188 || prev == LB_Mandatory_Break \
5189 || prev == LB_Carriage_Return \
5190 || prev == LB_Line_Feed \
5191 || prev == LB_Next_Line \
5192 || prev == LB_Space \
5193 || prev == LB_ZWSpace))
5196 S_isLB(pTHX_ LB_enum before,
5198 const U8 * const strbeg,
5199 const U8 * const curpos,
5200 const U8 * const strend,
5201 const bool utf8_target)
5203 U8 * temp_pos = (U8 *) curpos;
5204 LB_enum prev = before;
5206 /* Is the boundary between 'before' and 'after' line-breakable?
5207 * Most of this is just a table lookup of a generated table from Unicode
5208 * rules. But some rules require context to decide, and so have to be
5209 * implemented in code */
5211 PERL_ARGS_ASSERT_ISLB;
5213 /* Rule numbers in the comments below are as of Unicode 9.0 */
5217 switch (LB_table[before][after]) {
5222 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5225 case LB_SP_foo + LB_BREAKABLE:
5226 case LB_SP_foo + LB_NOBREAK:
5227 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5229 /* When we have something following a SP, we have to look at the
5230 * context in order to know what to do.
5232 * SP SP should not reach here because LB7: Do not break before
5233 * spaces. (For two spaces in a row there is nothing that
5234 * overrides that) */
5235 assert(after != LB_Space);
5237 /* Here we have a space followed by a non-space. Mostly this is a
5238 * case of LB18: "Break after spaces". But there are complications
5239 * as the handling of spaces is somewhat tricky. They are in a
5240 * number of rules, which have to be applied in priority order, but
5241 * something earlier in the string can cause a rule to be skipped
5242 * and a lower priority rule invoked. A prime example is LB7 which
5243 * says don't break before a space. But rule LB8 (lower priority)
5244 * says that the first break opportunity after a ZW is after any
5245 * span of spaces immediately after it. If a ZW comes before a SP
5246 * in the input, rule LB8 applies, and not LB7. Other such rules
5247 * involve combining marks which are rules 9 and 10, but they may
5248 * override higher priority rules if they come earlier in the
5249 * string. Since we're doing random access into the middle of the
5250 * string, we have to look for rules that should get applied based
5251 * on both string position and priority. Combining marks do not
5252 * attach to either ZW nor SP, so we don't have to consider them
5255 * To check for LB8, we have to find the first non-space character
5256 * before this span of spaces */
5258 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5260 while (prev == LB_Space);
5262 /* LB8 Break before any character following a zero-width space,
5263 * even if one or more spaces intervene.
5265 * So if we have a ZW just before this span, and to get here this
5266 * is the final space in the span. */
5267 if (prev == LB_ZWSpace) {
5271 /* Here, not ZW SP+. There are several rules that have higher
5272 * priority than LB18 and can be resolved now, as they don't depend
5273 * on anything earlier in the string (except ZW, which we have
5274 * already handled). One of these rules is LB11 Do not break
5275 * before Word joiner, but we have specially encoded that in the
5276 * lookup table so it is caught by the single test below which
5277 * catches the other ones. */
5278 if (LB_table[LB_Space][after] - LB_SP_foo
5279 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5284 /* If we get here, we have to XXX consider combining marks. */
5285 if (prev == LB_Combining_Mark) {
5287 /* What happens with these depends on the character they
5290 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5292 while (prev == LB_Combining_Mark);
5294 /* Most times these attach to and inherit the characteristics
5295 * of that character, but not always, and when not, they are to
5296 * be treated as AL by rule LB10. */
5297 if (! LB_CM_ATTACHES_TO(prev)) {
5298 prev = LB_Alphabetic;
5302 /* Here, we have the character preceding the span of spaces all set
5303 * up. We follow LB18: "Break after spaces" unless the table shows
5304 * that is overriden */
5305 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5309 /* We don't know how to treat the CM except by looking at the first
5310 * non-CM character preceding it. ZWJ is treated as CM */
5312 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5314 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5316 /* Here, 'prev' is that first earlier non-CM character. If the CM
5317 * attatches to it, then it inherits the behavior of 'prev'. If it
5318 * doesn't attach, it is to be treated as an AL */
5319 if (! LB_CM_ATTACHES_TO(prev)) {
5320 prev = LB_Alphabetic;
5325 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5326 case LB_HY_or_BA_then_foo + LB_NOBREAK:
5328 /* LB21a Don't break after Hebrew + Hyphen.
5329 * HL (HY | BA) × */
5331 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5332 == LB_Hebrew_Letter)
5337 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5339 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5340 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5342 /* LB25a (PR | PO) × ( OP | HY )? NU */
5343 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5347 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5350 case LB_SY_or_IS_then_various + LB_BREAKABLE:
5351 case LB_SY_or_IS_then_various + LB_NOBREAK:
5353 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5355 LB_enum temp = prev;
5357 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5359 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5360 if (temp == LB_Numeric) {
5364 return LB_table[prev][after] - LB_SY_or_IS_then_various
5368 case LB_various_then_PO_or_PR + LB_BREAKABLE:
5369 case LB_various_then_PO_or_PR + LB_NOBREAK:
5371 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5373 LB_enum temp = prev;
5374 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5376 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5378 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5379 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5381 if (temp == LB_Numeric) {
5384 return LB_various_then_PO_or_PR;
5387 case LB_RI_then_RI + LB_NOBREAK:
5388 case LB_RI_then_RI + LB_BREAKABLE:
5392 /* LB30a Break between two regional indicator symbols if and
5393 * only if there are an even number of regional indicators
5394 * preceding the position of the break.
5396 * sot (RI RI)* RI × RI
5397 * [^RI] (RI RI)* RI × RI */
5399 while (backup_one_LB(strbeg,
5401 utf8_target) == LB_Regional_Indicator)
5406 return RI_count % 2 == 0;
5414 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5415 before, after, LB_table[before][after]);
5422 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5427 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5429 if (*curpos >= strend) {
5434 *curpos += UTF8SKIP(*curpos);
5435 if (*curpos >= strend) {
5438 lb = getLB_VAL_UTF8(*curpos, strend);
5442 if (*curpos >= strend) {
5445 lb = getLB_VAL_CP(**curpos);
5452 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5456 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5458 if (*curpos < strbeg) {
5463 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5464 U8 * prev_prev_char_pos;
5466 if (! prev_char_pos) {
5470 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5471 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5472 *curpos = prev_char_pos;
5473 prev_char_pos = prev_prev_char_pos;
5476 *curpos = (U8 *) strbeg;
5481 if (*curpos - 2 < strbeg) {
5482 *curpos = (U8 *) strbeg;
5486 lb = getLB_VAL_CP(*(*curpos - 1));
5493 S_isSB(pTHX_ SB_enum before,
5495 const U8 * const strbeg,
5496 const U8 * const curpos,
5497 const U8 * const strend,
5498 const bool utf8_target)
5500 /* returns a boolean indicating if there is a Sentence Boundary Break
5501 * between the inputs. See https://www.unicode.org/reports/tr29/ */
5503 U8 * lpos = (U8 *) curpos;
5504 bool has_para_sep = FALSE;
5505 bool has_sp = FALSE;
5507 PERL_ARGS_ASSERT_ISSB;
5509 /* Break at the start and end of text.
5512 But unstated in Unicode is don't break if the text is empty */
5513 if (before == SB_EDGE || after == SB_EDGE) {
5514 return before != after;
5517 /* SB 3: Do not break within CRLF. */
5518 if (before == SB_CR && after == SB_LF) {
5522 /* Break after paragraph separators. CR and LF are considered
5523 * so because Unicode views text as like word processing text where there
5524 * are no newlines except between paragraphs, and the word processor takes
5525 * care of wrapping without there being hard line-breaks in the text *./
5526 SB4. Sep | CR | LF ÷ */
5527 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5531 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5532 * (See Section 6.2, Replacing Ignore Rules.)
5533 SB5. X (Extend | Format)* → X */
5534 if (after == SB_Extend || after == SB_Format) {
5536 /* Implied is that the these characters attach to everything
5537 * immediately prior to them except for those separator-type
5538 * characters. And the rules earlier have already handled the case
5539 * when one of those immediately precedes the extend char */
5543 if (before == SB_Extend || before == SB_Format) {
5544 U8 * temp_pos = lpos;
5545 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5546 if ( backup != SB_EDGE
5555 /* Here, both 'before' and 'backup' are these types; implied is that we
5556 * don't break between them */
5557 if (backup == SB_Extend || backup == SB_Format) {
5562 /* Do not break after ambiguous terminators like period, if they are
5563 * immediately followed by a number or lowercase letter, if they are
5564 * between uppercase letters, if the first following letter (optionally
5565 * after certain punctuation) is lowercase, or if they are followed by
5566 * "continuation" punctuation such as comma, colon, or semicolon. For
5567 * example, a period may be an abbreviation or numeric period, and thus may
5568 * not mark the end of a sentence.
5570 * SB6. ATerm × Numeric */
5571 if (before == SB_ATerm && after == SB_Numeric) {
5575 /* SB7. (Upper | Lower) ATerm × Upper */
5576 if (before == SB_ATerm && after == SB_Upper) {
5577 U8 * temp_pos = lpos;
5578 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5579 if (backup == SB_Upper || backup == SB_Lower) {
5584 /* The remaining rules that aren't the final one, all require an STerm or
5585 * an ATerm after having backed up over some Close* Sp*, and in one case an
5586 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5587 * So do that backup now, setting flags if either Sp or a paragraph
5588 * separator are found */
5590 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5591 has_para_sep = TRUE;
5592 before = backup_one_SB(strbeg, &lpos, utf8_target);
5595 if (before == SB_Sp) {
5598 before = backup_one_SB(strbeg, &lpos, utf8_target);
5600 while (before == SB_Sp);
5603 while (before == SB_Close) {
5604 before = backup_one_SB(strbeg, &lpos, utf8_target);
5607 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5608 * most cases an STerm */
5609 if (before == SB_STerm || before == SB_ATerm) {
5611 /* So, here the lhs matches
5612 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5613 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5614 * The rules that apply here are:
5616 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5617 | LF | STerm | ATerm) )* Lower
5618 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5619 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5620 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5621 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5624 /* And all but SB11 forbid having seen a paragraph separator */
5625 if (! has_para_sep) {
5626 if (before == SB_ATerm) { /* SB8 */
5627 U8 * rpos = (U8 *) curpos;
5628 SB_enum later = after;
5630 while ( later != SB_OLetter
5631 && later != SB_Upper
5632 && later != SB_Lower
5636 && later != SB_STerm
5637 && later != SB_ATerm
5638 && later != SB_EDGE)
5640 later = advance_one_SB(&rpos, strend, utf8_target);
5642 if (later == SB_Lower) {
5647 if ( after == SB_SContinue /* SB8a */
5648 || after == SB_STerm
5649 || after == SB_ATerm)
5654 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5655 if ( after == SB_Close
5665 /* SB10. This and SB9 could probably be combined some way, but khw
5666 * has decided to follow the Unicode rule book precisely for
5667 * simplified maintenance */
5681 /* Otherwise, do not break.
5688 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5692 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5694 if (*curpos >= strend) {
5700 *curpos += UTF8SKIP(*curpos);
5701 if (*curpos >= strend) {
5704 sb = getSB_VAL_UTF8(*curpos, strend);
5705 } while (sb == SB_Extend || sb == SB_Format);
5710 if (*curpos >= strend) {
5713 sb = getSB_VAL_CP(**curpos);
5714 } while (sb == SB_Extend || sb == SB_Format);
5721 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5725 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5727 if (*curpos < strbeg) {
5732 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5733 if (! prev_char_pos) {
5737 /* Back up over Extend and Format. curpos is always just to the right
5738 * of the characater whose value we are getting */
5740 U8 * prev_prev_char_pos;
5741 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5744 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5745 *curpos = prev_char_pos;
5746 prev_char_pos = prev_prev_char_pos;
5749 *curpos = (U8 *) strbeg;
5752 } while (sb == SB_Extend || sb == SB_Format);
5756 if (*curpos - 2 < strbeg) {
5757 *curpos = (U8 *) strbeg;
5761 sb = getSB_VAL_CP(*(*curpos - 1));
5762 } while (sb == SB_Extend || sb == SB_Format);
5769 S_isWB(pTHX_ WB_enum previous,
5772 const U8 * const strbeg,
5773 const U8 * const curpos,
5774 const U8 * const strend,
5775 const bool utf8_target)
5777 /* Return a boolean as to if the boundary between 'before' and 'after' is
5778 * a Unicode word break, using their published algorithm, but tailored for
5779 * Perl by treating spans of white space as one unit. Context may be
5780 * needed to make this determination. If the value for the character
5781 * before 'before' is known, it is passed as 'previous'; otherwise that
5782 * should be set to WB_UNKNOWN. The other input parameters give the
5783 * boundaries and current position in the matching of the string. That
5784 * is, 'curpos' marks the position where the character whose wb value is
5785 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5787 U8 * before_pos = (U8 *) curpos;
5788 U8 * after_pos = (U8 *) curpos;
5789 WB_enum prev = before;
5792 PERL_ARGS_ASSERT_ISWB;
5794 /* Rule numbers in the comments below are as of Unicode 9.0 */
5798 switch (WB_table[before][after]) {
5805 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5806 next = advance_one_WB(&after_pos, strend, utf8_target,
5807 FALSE /* Don't skip Extend nor Format */ );
5808 /* A space immediately preceding an Extend or Format is attached
5809 * to by them, and hence gets separated from previous spaces.
5810 * Otherwise don't break between horizontal white space */
5811 return next == WB_Extend || next == WB_Format;
5813 /* WB4 Ignore Format and Extend characters, except when they appear at
5814 * the beginning of a region of text. This code currently isn't
5815 * general purpose, but it works as the rules are currently and likely
5816 * to be laid out. The reason it works is that when 'they appear at
5817 * the beginning of a region of text', the rule is to break before
5818 * them, just like any other character. Therefore, the default rule
5819 * applies and we don't have to look in more depth. Should this ever
5820 * change, we would have to have 2 'case' statements, like in the rules
5821 * below, and backup a single character (not spacing over the extend
5822 * ones) and then see if that is one of the region-end characters and
5824 case WB_Ex_or_FO_or_ZWJ_then_foo:
5825 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5828 case WB_DQ_then_HL + WB_BREAKABLE:
5829 case WB_DQ_then_HL + WB_NOBREAK:
5831 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5833 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5834 == WB_Hebrew_Letter)
5839 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5841 case WB_HL_then_DQ + WB_BREAKABLE:
5842 case WB_HL_then_DQ + WB_NOBREAK:
5844 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5846 if (advance_one_WB(&after_pos, strend, utf8_target,
5847 TRUE /* Do skip Extend and Format */ )
5848 == WB_Hebrew_Letter)
5853 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5855 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5856 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5858 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5859 * | Single_Quote) (ALetter | Hebrew_Letter) */
5861 next = advance_one_WB(&after_pos, strend, utf8_target,
5862 TRUE /* Do skip Extend and Format */ );
5864 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5869 return WB_table[before][after]
5870 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5872 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5873 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5875 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5876 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5878 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5879 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5884 return WB_table[before][after]
5885 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5887 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5888 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5890 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5893 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5899 return WB_table[before][after]
5900 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5902 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5903 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5905 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5907 if (advance_one_WB(&after_pos, strend, utf8_target,
5908 TRUE /* Do skip Extend and Format */ )
5914 return WB_table[before][after]
5915 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5917 case WB_RI_then_RI + WB_NOBREAK:
5918 case WB_RI_then_RI + WB_BREAKABLE:
5922 /* Do not break within emoji flag sequences. That is, do not
5923 * break between regional indicator (RI) symbols if there is an
5924 * odd number of RI characters before the potential break
5927 * WB15 sot (RI RI)* RI × RI
5928 * WB16 [^RI] (RI RI)* RI × RI */
5930 while (backup_one_WB(&previous,
5933 utf8_target) == WB_Regional_Indicator)
5938 return RI_count % 2 != 1;
5946 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5947 before, after, WB_table[before][after]);
5954 S_advance_one_WB(pTHX_ U8 ** curpos,
5955 const U8 * const strend,
5956 const bool utf8_target,
5957 const bool skip_Extend_Format)
5961 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5963 if (*curpos >= strend) {
5969 /* Advance over Extend and Format */
5971 *curpos += UTF8SKIP(*curpos);
5972 if (*curpos >= strend) {
5975 wb = getWB_VAL_UTF8(*curpos, strend);
5976 } while ( skip_Extend_Format
5977 && (wb == WB_Extend || wb == WB_Format));
5982 if (*curpos >= strend) {
5985 wb = getWB_VAL_CP(**curpos);
5986 } while ( skip_Extend_Format
5987 && (wb == WB_Extend || wb == WB_Format));
5994 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5998 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
6000 /* If we know what the previous character's break value is, don't have
6002 if (*previous != WB_UNKNOWN) {
6005 /* But we need to move backwards by one */
6007 *curpos = reghopmaybe3(*curpos, -1, strbeg);
6009 *previous = WB_EDGE;
6010 *curpos = (U8 *) strbeg;
6013 *previous = WB_UNKNOWN;
6018 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6021 /* And we always back up over these three types */
6022 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6027 if (*curpos < strbeg) {
6032 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6033 if (! prev_char_pos) {
6037 /* Back up over Extend and Format. curpos is always just to the right
6038 * of the characater whose value we are getting */
6040 U8 * prev_prev_char_pos;
6041 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6045 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6046 *curpos = prev_char_pos;
6047 prev_char_pos = prev_prev_char_pos;
6050 *curpos = (U8 *) strbeg;
6053 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6057 if (*curpos - 2 < strbeg) {
6058 *curpos = (U8 *) strbeg;
6062 wb = getWB_VAL_CP(*(*curpos - 1));
6063 } while (wb == WB_Extend || wb == WB_Format);
6069 /* Macros for regmatch(), using its internal variables */
6070 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6071 #define NEXTCHR_IS_EOS (nextbyte < 0)
6073 #define SET_nextchr \
6074 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6076 #define SET_locinput(p) \
6080 #define sayYES goto yes
6081 #define sayNO goto no
6082 #define sayNO_SILENT goto no_silent
6084 /* we dont use STMT_START/END here because it leads to
6085 "unreachable code" warnings, which are bogus, but distracting. */
6086 #define CACHEsayNO \
6087 if (ST.cache_mask) \
6088 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6091 #define EVAL_CLOSE_PAREN_IS(st,expr) \
6094 ( ( st )->u.eval.close_paren ) && \
6095 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6098 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
6101 ( ( st )->u.eval.close_paren ) && \
6103 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6107 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6108 (st)->u.eval.close_paren = ( (expr) + 1 )
6110 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6111 (st)->u.eval.close_paren = 0
6113 /* push a new state then goto it */
6115 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
6116 pushinput = input; \
6120 st->resume_state = state; \
6123 /* push a new state with success backtracking, then goto it */
6125 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
6126 pushinput = input; \
6130 st->resume_state = state; \
6131 goto push_yes_state;
6133 #define DEBUG_STATE_pp(pp) \
6135 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6136 Perl_re_printf( aTHX_ \
6137 "%*s" pp " %s%s%s%s%s\n", \
6138 INDENT_CHARS(depth), "", \
6139 PL_reg_name[st->resume_state], \
6140 ((st==yes_state||st==mark_state) ? "[" : ""), \
6141 ((st==yes_state) ? "Y" : ""), \
6142 ((st==mark_state) ? "M" : ""), \
6143 ((st==yes_state||st==mark_state) ? "]" : "") \
6149 regmatch() - main matching routine
6151 This is basically one big switch statement in a loop. We execute an op,
6152 set 'next' to point the next op, and continue. If we come to a point which
6153 we may need to backtrack to on failure such as (A|B|C), we push a
6154 backtrack state onto the backtrack stack. On failure, we pop the top
6155 state, and re-enter the loop at the state indicated. If there are no more
6156 states to pop, we return failure.
6158 Sometimes we also need to backtrack on success; for example /A+/, where
6159 after successfully matching one A, we need to go back and try to
6160 match another one; similarly for lookahead assertions: if the assertion
6161 completes successfully, we backtrack to the state just before the assertion
6162 and then carry on. In these cases, the pushed state is marked as
6163 'backtrack on success too'. This marking is in fact done by a chain of
6164 pointers, each pointing to the previous 'yes' state. On success, we pop to
6165 the nearest yes state, discarding any intermediate failure-only states.
6166 Sometimes a yes state is pushed just to force some cleanup code to be
6167 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6168 it to free the inner regex.
6170 Note that failure backtracking rewinds the cursor position, while
6171 success backtracking leaves it alone.
6173 A pattern is complete when the END op is executed, while a subpattern
6174 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6175 ops trigger the "pop to last yes state if any, otherwise return true"
6178 A common convention in this function is to use A and B to refer to the two
6179 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6180 the subpattern to be matched possibly multiple times, while B is the entire
6181 rest of the pattern. Variable and state names reflect this convention.
6183 The states in the main switch are the union of ops and failure/success of
6184 substates associated with that op. For example, IFMATCH is the op
6185 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6186 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6187 successfully matched A and IFMATCH_A_fail is a state saying that we have
6188 just failed to match A. Resume states always come in pairs. The backtrack
6189 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6190 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6191 on success or failure.
6193 The struct that holds a backtracking state is actually a big union, with
6194 one variant for each major type of op. The variable st points to the
6195 top-most backtrack struct. To make the code clearer, within each
6196 block of code we #define ST to alias the relevant union.
6198 Here's a concrete example of a (vastly oversimplified) IFMATCH
6204 #define ST st->u.ifmatch
6206 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6207 ST.foo = ...; // some state we wish to save
6209 // push a yes backtrack state with a resume value of
6210 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6212 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6215 case IFMATCH_A: // we have successfully executed A; now continue with B
6217 bar = ST.foo; // do something with the preserved value
6220 case IFMATCH_A_fail: // A failed, so the assertion failed
6221 ...; // do some housekeeping, then ...
6222 sayNO; // propagate the failure
6229 For any old-timers reading this who are familiar with the old recursive
6230 approach, the code above is equivalent to:
6232 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6241 ...; // do some housekeeping, then ...
6242 sayNO; // propagate the failure
6245 The topmost backtrack state, pointed to by st, is usually free. If you
6246 want to claim it, populate any ST.foo fields in it with values you wish to
6247 save, then do one of
6249 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6250 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6252 which sets that backtrack state's resume value to 'resume_state', pushes a
6253 new free entry to the top of the backtrack stack, then goes to 'node'.
6254 On backtracking, the free slot is popped, and the saved state becomes the
6255 new free state. An ST.foo field in this new top state can be temporarily
6256 accessed to retrieve values, but once the main loop is re-entered, it
6257 becomes available for reuse.
6259 Note that the depth of the backtrack stack constantly increases during the
6260 left-to-right execution of the pattern, rather than going up and down with
6261 the pattern nesting. For example the stack is at its maximum at Z at the
6262 end of the pattern, rather than at X in the following:
6264 /(((X)+)+)+....(Y)+....Z/
6266 The only exceptions to this are lookahead/behind assertions and the cut,
6267 (?>A), which pop all the backtrack states associated with A before
6270 Backtrack state structs are allocated in slabs of about 4K in size.
6271 PL_regmatch_state and st always point to the currently active state,
6272 and PL_regmatch_slab points to the slab currently containing
6273 PL_regmatch_state. The first time regmatch() is called, the first slab is
6274 allocated, and is never freed until interpreter destruction. When the slab
6275 is full, a new one is allocated and chained to the end. At exit from
6276 regmatch(), slabs allocated since entry are freed.
6278 In order to work with variable length lookbehinds, an upper limit is placed on
6279 lookbehinds which is set to where the match position is at the end of where the
6280 lookbehind would get to. Nothing in the lookbehind should match above that,
6281 except we should be able to look beyond if for things like \b, which need the
6282 next character in the string to be able to determine if this is a boundary or
6283 not. We also can't match the end of string/line unless we are also at the end
6284 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6285 that match a width, we have to add a condition that they are within the legal
6286 bounds of our window into the string.
6290 /* returns -1 on failure, $+[0] on success */
6292 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6294 const bool utf8_target = reginfo->is_utf8_target;
6295 const U32 uniflags = UTF8_ALLOW_DEFAULT;
6296 REGEXP *rex_sv = reginfo->prog;
6297 regexp *rex = ReANY(rex_sv);
6298 RXi_GET_DECL(rex,rexi);
6299 /* the current state. This is a cached copy of PL_regmatch_state */
6301 /* cache heavy used fields of st in registers */
6304 U32 n = 0; /* general value; init to avoid compiler warning */
6305 U32 utmp = 0; /* tmp variable - valid for at most one opcode */
6306 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
6307 SSize_t endref = 0; /* offset of end of backref when ln is start */
6308 char *locinput = startpos;
6309 char *loceol = reginfo->strend;
6310 char *pushinput; /* where to continue after a PUSH */
6311 char *pusheol; /* where to stop matching (loceol) after a PUSH */
6312 U8 *pushsr0; /* save starting pos of script run */
6313 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1
6316 bool result = 0; /* return value of S_regmatch */
6317 U32 depth = 0; /* depth of backtrack stack */
6318 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6319 const U32 max_nochange_depth =
6320 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6321 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6322 regmatch_state *yes_state = NULL; /* state to pop to on success of
6324 /* mark_state piggy backs on the yes_state logic so that when we unwind
6325 the stack on success we can update the mark_state as we go */
6326 regmatch_state *mark_state = NULL; /* last mark state we have seen */
6327 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6328 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
6330 bool no_final = 0; /* prevent failure from backtracking? */
6331 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
6332 char *startpoint = locinput;
6333 SV *popmark = NULL; /* are we looking for a mark? */
6334 SV *sv_commit = NULL; /* last mark name seen in failure */
6335 SV *sv_yes_mark = NULL; /* last mark name we have seen
6336 during a successful match */
6337 U32 lastopen = 0; /* last open we saw */
6338 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6339 SV* const oreplsv = GvSVn(PL_replgv);
6340 /* these three flags are set by various ops to signal information to
6341 * the very next op. They have a useful lifetime of exactly one loop
6342 * iteration, and are not preserved or restored by state pushes/pops
6344 bool sw = 0; /* the condition value in (?(cond)a|b) */
6345 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
6346 int logical = 0; /* the following EVAL is:
6350 or the following IFMATCH/UNLESSM is:
6351 false: plain (?=foo)
6352 true: used as a condition: (?(?=foo))
6354 PAD* last_pad = NULL;
6356 U8 gimme = G_SCALAR;
6357 CV *caller_cv = NULL; /* who called us */
6358 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
6359 U32 maxopenparen = 0; /* max '(' index seen so far */
6360 int to_complement; /* Invert the result? */
6361 _char_class_number classnum;
6362 bool is_utf8_pat = reginfo->is_utf8_pat;
6364 I32 orig_savestack_ix = PL_savestack_ix;
6365 U8 * script_run_begin = NULL;
6366 char *match_end= NULL; /* where a match MUST end to be considered successful */
6367 bool is_accepted = FALSE; /* have we hit an ACCEPT opcode? */
6369 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6370 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6371 # define SOLARIS_BAD_OPTIMIZER
6372 const U32 *pl_charclass_dup = PL_charclass;
6373 # define PL_charclass pl_charclass_dup
6377 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6380 /* protect against undef(*^R) */
6381 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6383 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6384 multicall_oldcatch = 0;
6385 PERL_UNUSED_VAR(multicall_cop);
6387 PERL_ARGS_ASSERT_REGMATCH;
6389 st = PL_regmatch_state;
6391 /* Note that nextbyte is a byte even in UTF */
6395 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6396 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6397 Perl_re_printf( aTHX_ "regmatch start\n" );
6400 while (scan != NULL) {
6401 next = scan + NEXT_OFF(scan);
6404 state_num = OP(scan);
6408 if (state_num <= REGNODE_MAX) {
6409 SV * const prop = sv_newmortal();
6410 regnode *rnext = regnext(scan);
6412 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6413 regprop(rex, prop, scan, reginfo, NULL);
6414 Perl_re_printf( aTHX_
6415 "%*s%" IVdf ":%s(%" IVdf ")\n",
6416 INDENT_CHARS(depth), "",
6417 (IV)(scan - rexi->program),
6419 (PL_regkind[OP(scan)] == END || !rnext) ?
6420 0 : (IV)(rnext - rexi->program));
6427 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6429 switch (state_num) {
6430 case SBOL: /* /^../ and /\A../ */
6431 if (locinput == reginfo->strbeg)
6435 case MBOL: /* /^../m */
6436 if (locinput == reginfo->strbeg ||
6437 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6444 if (locinput == reginfo->ganch)
6448 case KEEPS: /* \K */
6449 /* update the startpoint */
6450 st->u.keeper.val = rex->offs[0].start;
6451 rex->offs[0].start = locinput - reginfo->strbeg;
6452 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6454 NOT_REACHED; /* NOTREACHED */
6456 case KEEPS_next_fail:
6457 /* rollback the start point change */
6458 rex->offs[0].start = st->u.keeper.val;
6460 NOT_REACHED; /* NOTREACHED */
6462 case MEOL: /* /..$/m */
6463 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6467 case SEOL: /* /..$/ */
6468 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6470 if (reginfo->strend - locinput > 1)
6475 if (!NEXTCHR_IS_EOS)
6479 case SANY: /* /./s */
6480 if (NEXTCHR_IS_EOS || locinput >= loceol)
6482 goto increment_locinput;
6484 case REG_ANY: /* /./ */
6486 || locinput >= loceol
6487 || nextbyte == '\n')
6491 goto increment_locinput;
6495 #define ST st->u.trie
6496 case TRIEC: /* (ab|cd) with known charclass */
6497 /* In this case the charclass data is available inline so
6498 we can fail fast without a lot of extra overhead.
6500 if ( ! NEXTCHR_IS_EOS
6501 && locinput < loceol
6502 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6505 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6506 depth, PL_colors[4], PL_colors[5])
6509 NOT_REACHED; /* NOTREACHED */
6512 case TRIE: /* (ab|cd) */
6513 /* the basic plan of execution of the trie is:
6514 * At the beginning, run though all the states, and
6515 * find the longest-matching word. Also remember the position
6516 * of the shortest matching word. For example, this pattern:
6519 * when matched against the string "abcde", will generate
6520 * accept states for all words except 3, with the longest
6521 * matching word being 4, and the shortest being 2 (with
6522 * the position being after char 1 of the string).
6524 * Then for each matching word, in word order (i.e. 1,2,4,5),
6525 * we run the remainder of the pattern; on each try setting
6526 * the current position to the character following the word,
6527 * returning to try the next word on failure.
6529 * We avoid having to build a list of words at runtime by
6530 * using a compile-time structure, wordinfo[].prev, which
6531 * gives, for each word, the previous accepting word (if any).
6532 * In the case above it would contain the mappings 1->2, 2->0,
6533 * 3->0, 4->5, 5->1. We can use this table to generate, from
6534 * the longest word (4 above), a list of all words, by
6535 * following the list of prev pointers; this gives us the
6536 * unordered list 4,5,1,2. Then given the current word we have
6537 * just tried, we can go through the list and find the
6538 * next-biggest word to try (so if we just failed on word 2,
6539 * the next in the list is 4).
6541 * Since at runtime we don't record the matching position in
6542 * the string for each word, we have to work that out for
6543 * each word we're about to process. The wordinfo table holds
6544 * the character length of each word; given that we recorded
6545 * at the start: the position of the shortest word and its
6546 * length in chars, we just need to move the pointer the
6547 * difference between the two char lengths. Depending on
6548 * Unicode status and folding, that's cheap or expensive.
6550 * This algorithm is optimised for the case where are only a
6551 * small number of accept states, i.e. 0,1, or maybe 2.
6552 * With lots of accepts states, and having to try all of them,
6553 * it becomes quadratic on number of accept states to find all
6558 /* what type of TRIE am I? (utf8 makes this contextual) */
6559 DECL_TRIE_TYPE(scan);
6561 /* what trie are we using right now */
6562 reg_trie_data * const trie
6563 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6564 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6565 U32 state = trie->startstate;
6567 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6568 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6571 && UTF8_IS_ABOVE_LATIN1(nextbyte)
6572 && scan->flags == EXACTL)
6574 /* We only output for EXACTL, as we let the folder
6575 * output this message for EXACTFLU8 to avoid
6577 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6583 || locinput >= loceol
6584 || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6586 if (trie->states[ state ].wordnum) {
6588 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
6589 depth, PL_colors[4], PL_colors[5])
6595 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6596 depth, PL_colors[4], PL_colors[5])
6603 U8 *uc = ( U8* )locinput;
6607 U8 *uscan = (U8*)NULL;
6608 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6609 U32 charcount = 0; /* how many input chars we have matched */
6610 U32 accepted = 0; /* have we seen any accepting states? */
6612 ST.jump = trie->jump;
6615 ST.longfold = FALSE; /* char longer if folded => it's harder */
6618 /* fully traverse the TRIE; note the position of the
6619 shortest accept state and the wordnum of the longest
6622 while ( state && uc <= (U8*)(loceol) ) {
6623 U32 base = trie->states[ state ].trans.base;
6627 wordnum = trie->states[ state ].wordnum;
6629 if (wordnum) { /* it's an accept state */
6632 /* record first match position */
6634 ST.firstpos = (U8*)locinput;
6639 ST.firstchars = charcount;
6642 if (!ST.nextword || wordnum < ST.nextword)
6643 ST.nextword = wordnum;
6644 ST.topword = wordnum;
6647 DEBUG_TRIE_EXECUTE_r({
6648 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6650 PerlIO_printf( Perl_debug_log,
6651 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6652 INDENT_CHARS(depth), "", PL_colors[4],
6653 (UV)state, (accepted ? 'Y' : 'N'));
6656 /* read a char and goto next state */
6657 if ( base && (foldlen || uc < (U8*)(loceol))) {
6659 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6660 (U8 *) loceol, uscan,
6661 len, uvc, charid, foldlen,
6668 base + charid - 1 - trie->uniquecharcount)) >= 0)
6670 && ((U32)offset < trie->lasttrans)
6671 && trie->trans[offset].check == state)
6673 state = trie->trans[offset].next;
6684 DEBUG_TRIE_EXECUTE_r(
6685 Perl_re_printf( aTHX_
6686 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6687 charid, uvc, (UV)state, PL_colors[5] );
6693 /* calculate total number of accept states */
6698 w = trie->wordinfo[w].prev;
6701 ST.accepted = accepted;
6705 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6707 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6709 goto trie_first_try; /* jump into the fail handler */
6711 NOT_REACHED; /* NOTREACHED */
6713 case TRIE_next_fail: /* we failed - try next alternative */
6717 /* undo any captures done in the tail part of a branch,
6719 * /(?:X(.)(.)|Y(.)).../
6720 * where the trie just matches X then calls out to do the
6721 * rest of the branch */
6722 REGCP_UNWIND(ST.cp);
6723 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6725 if (!--ST.accepted) {
6727 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6735 /* Find next-highest word to process. Note that this code
6736 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6739 U16 const nextword = ST.nextword;
6740 reg_trie_wordinfo * const wordinfo
6741 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6742 for (word=ST.topword; word; word=wordinfo[word].prev) {
6743 if (word > nextword && (!min || word < min))
6756 ST.lastparen = rex->lastparen;
6757 ST.lastcloseparen = rex->lastcloseparen;
6761 /* find start char of end of current word */
6763 U32 chars; /* how many chars to skip */
6764 reg_trie_data * const trie
6765 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6767 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6769 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6774 /* the hard option - fold each char in turn and find
6775 * its folded length (which may be different */
6776 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6784 /* XXX This assumes the length is well-formed, as
6785 * does the UTF8SKIP below */
6786 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6794 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6799 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6808 uc = utf8_hop(uc, chars);
6814 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6815 ? ST.jump[ST.nextword]
6819 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6827 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6828 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6830 NOT_REACHED; /* NOTREACHED */
6832 /* only one choice left - just continue */
6834 AV *const trie_words
6835 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6836 SV ** const tmp = trie_words
6837 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6838 SV *sv= tmp ? sv_newmortal() : NULL;
6840 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6841 depth, PL_colors[4],
6843 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6844 PL_colors[0], PL_colors[1],
6845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6847 : "not compiled under -Dr",
6851 locinput = (char*)uc;
6852 continue; /* execute rest of RE */
6858 if (! utf8_target) {
6868 ln = STR_LENl(scan);
6869 goto join_short_long_exact;
6871 case EXACTL: /* /abc/l */
6872 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6874 /* Complete checking would involve going through every character
6875 * matched by the string to see if any is above latin1. But the
6876 * comparision otherwise might very well be a fast assembly
6877 * language routine, and I (khw) don't think slowing things down
6878 * just to check for this warning is worth it. So this just checks
6879 * the first character */
6880 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6881 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6885 if (! utf8_target) {
6890 case EXACT: /* /abc/ */
6893 ln = STR_LENs(scan);
6895 join_short_long_exact:
6896 if (utf8_target != is_utf8_pat) {
6897 /* The target and the pattern have differing utf8ness. */
6899 const char * const e = s + ln;
6902 /* The target is utf8, the pattern is not utf8.
6903 * Above-Latin1 code points can't match the pattern;
6904 * invariants match exactly, and the other Latin1 ones need
6905 * to be downgraded to a single byte in order to do the
6906 * comparison. (If we could be confident that the target
6907 * is not malformed, this could be refactored to have fewer
6908 * tests by just assuming that if the first bytes match, it
6909 * is an invariant, but there are tests in the test suite
6910 * dealing with (??{...}) which violate this) */
6913 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6917 if (UTF8_IS_INVARIANT(*(U8*)l)) {
6924 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6934 /* The target is not utf8, the pattern is utf8. */
6937 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6941 if (UTF8_IS_INVARIANT(*(U8*)s)) {
6948 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6960 /* The target and the pattern have the same utf8ness. */
6961 /* Inline the first character, for speed. */
6962 if ( loceol - locinput < ln
6963 || UCHARAT(s) != nextbyte
6964 || (ln > 1 && memNE(s, locinput, ln)))
6973 case EXACTFL: /* /abc/il */
6976 const U8 * fold_array;
6978 U32 fold_utf8_flags;
6980 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6981 folder = foldEQ_locale;
6982 fold_array = PL_fold_locale;
6983 fold_utf8_flags = FOLDEQ_LOCALE;
6986 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
6987 is effectively /u; hence to match, target
6989 if (! utf8_target) {
6992 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6993 | FOLDEQ_S2_FOLDS_SANE;
6994 folder = foldEQ_latin1_s2_folded;
6995 fold_array = PL_fold_latin1;
6998 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
6999 if (! utf8_target) {
7002 assert(is_utf8_pat);
7003 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7006 case EXACTFUP: /* /foo/iu, and something is problematic in
7007 'foo' so can't take shortcuts. */
7008 assert(! is_utf8_pat);
7009 folder = foldEQ_latin1;
7010 fold_array = PL_fold_latin1;
7011 fold_utf8_flags = 0;
7014 case EXACTFU: /* /abc/iu */
7015 folder = foldEQ_latin1_s2_folded;
7016 fold_array = PL_fold_latin1;
7017 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7020 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
7022 assert(! is_utf8_pat);
7024 case EXACTFAA: /* /abc/iaa */
7025 folder = foldEQ_latin1_s2_folded;
7026 fold_array = PL_fold_latin1;
7027 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7028 if (is_utf8_pat || ! utf8_target) {
7030 /* The possible presence of a MICRO SIGN in the pattern forbids
7031 * us to view a non-UTF-8 pattern as folded when there is a
7033 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7034 |FOLDEQ_S2_FOLDS_SANE;
7039 case EXACTF: /* /abc/i This node only generated for
7040 non-utf8 patterns */
7041 assert(! is_utf8_pat);
7043 fold_array = PL_fold;
7044 fold_utf8_flags = 0;
7048 ln = STR_LENs(scan);
7052 || state_num == EXACTFUP
7053 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7055 /* Either target or the pattern are utf8, or has the issue where
7056 * the fold lengths may differ. */
7057 const char * const l = locinput;
7060 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
7061 s, 0, ln, is_utf8_pat,fold_utf8_flags))
7069 /* Neither the target nor the pattern are utf8 */
7070 if (UCHARAT(s) != nextbyte
7072 && UCHARAT(s) != fold_array[nextbyte])
7076 if (loceol - locinput < ln)
7078 if (ln > 1 && ! folder(locinput, s, ln))
7084 case NBOUNDL: /* /\B/l */
7088 case BOUNDL: /* /\b/l */
7091 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7093 if (FLAGS(scan) != TRADITIONAL_BOUND) {
7094 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7099 if (locinput == reginfo->strbeg)
7100 b1 = isWORDCHAR_LC('\n');
7102 U8 *p = reghop3((U8*)locinput, -1,
7103 (U8*)(reginfo->strbeg));
7104 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7106 b2 = (NEXTCHR_IS_EOS)
7107 ? isWORDCHAR_LC('\n')
7108 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7109 (U8*) reginfo->strend);
7111 else { /* Here the string isn't utf8 */
7112 b1 = (locinput == reginfo->strbeg)
7113 ? isWORDCHAR_LC('\n')
7114 : isWORDCHAR_LC(UCHARAT(locinput - 1));
7115 b2 = (NEXTCHR_IS_EOS)
7116 ? isWORDCHAR_LC('\n')
7117 : isWORDCHAR_LC(nextbyte);
7119 if (to_complement ^ (b1 == b2)) {
7125 case NBOUND: /* /\B/ */
7129 case BOUND: /* /\b/ */
7133 goto bound_ascii_match_only;
7135 case NBOUNDA: /* /\B/a */
7139 case BOUNDA: /* /\b/a */
7143 bound_ascii_match_only:
7144 /* Here the string isn't utf8, or is utf8 and only ascii characters
7145 * are to match \w. In the latter case looking at the byte just
7146 * prior to the current one may be just the final byte of a
7147 * multi-byte character. This is ok. There are two cases:
7148 * 1) it is a single byte character, and then the test is doing
7149 * just what it's supposed to.
7150 * 2) it is a multi-byte character, in which case the final byte is
7151 * never mistakable for ASCII, and so the test will say it is
7152 * not a word character, which is the correct answer. */
7153 b1 = (locinput == reginfo->strbeg)
7154 ? isWORDCHAR_A('\n')
7155 : isWORDCHAR_A(UCHARAT(locinput - 1));
7156 b2 = (NEXTCHR_IS_EOS)
7157 ? isWORDCHAR_A('\n')
7158 : isWORDCHAR_A(nextbyte);
7159 if (to_complement ^ (b1 == b2)) {
7165 case NBOUNDU: /* /\B/u */
7169 case BOUNDU: /* /\b/u */
7172 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7175 else if (utf8_target) {
7177 switch((bound_type) FLAGS(scan)) {
7178 case TRADITIONAL_BOUND:
7181 if (locinput == reginfo->strbeg) {
7182 b1 = 0 /* isWORDCHAR_L1('\n') */;
7185 U8 *p = reghop3((U8*)locinput, -1,
7186 (U8*)(reginfo->strbeg));
7188 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7190 b2 = (NEXTCHR_IS_EOS)
7191 ? 0 /* isWORDCHAR_L1('\n') */
7192 : isWORDCHAR_utf8_safe((U8*)locinput,
7193 (U8*) reginfo->strend);
7194 match = cBOOL(b1 != b2);
7198 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7199 match = TRUE; /* GCB always matches at begin and
7203 /* Find the gcb values of previous and current
7204 * chars, then see if is a break point */
7205 match = isGCB(getGCB_VAL_UTF8(
7206 reghop3((U8*)locinput,
7208 (U8*)(reginfo->strbeg)),
7209 (U8*) reginfo->strend),
7210 getGCB_VAL_UTF8((U8*) locinput,
7211 (U8*) reginfo->strend),
7212 (U8*) reginfo->strbeg,
7219 if (locinput == reginfo->strbeg) {
7222 else if (NEXTCHR_IS_EOS) {
7226 match = isLB(getLB_VAL_UTF8(
7227 reghop3((U8*)locinput,
7229 (U8*)(reginfo->strbeg)),
7230 (U8*) reginfo->strend),
7231 getLB_VAL_UTF8((U8*) locinput,
7232 (U8*) reginfo->strend),
7233 (U8*) reginfo->strbeg,
7235 (U8*) reginfo->strend,
7240 case SB_BOUND: /* Always matches at begin and end */
7241 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7245 match = isSB(getSB_VAL_UTF8(
7246 reghop3((U8*)locinput,
7248 (U8*)(reginfo->strbeg)),
7249 (U8*) reginfo->strend),
7250 getSB_VAL_UTF8((U8*) locinput,
7251 (U8*) reginfo->strend),
7252 (U8*) reginfo->strbeg,
7254 (U8*) reginfo->strend,
7260 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7264 match = isWB(WB_UNKNOWN,
7266 reghop3((U8*)locinput,
7268 (U8*)(reginfo->strbeg)),
7269 (U8*) reginfo->strend),
7270 getWB_VAL_UTF8((U8*) locinput,
7271 (U8*) reginfo->strend),
7272 (U8*) reginfo->strbeg,
7274 (U8*) reginfo->strend,
7280 else { /* Not utf8 target */
7281 switch((bound_type) FLAGS(scan)) {
7282 case TRADITIONAL_BOUND:
7285 b1 = (locinput == reginfo->strbeg)
7286 ? 0 /* isWORDCHAR_L1('\n') */
7287 : isWORDCHAR_L1(UCHARAT(locinput - 1));
7288 b2 = (NEXTCHR_IS_EOS)
7289 ? 0 /* isWORDCHAR_L1('\n') */
7290 : isWORDCHAR_L1(nextbyte);
7291 match = cBOOL(b1 != b2);
7296 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7297 match = TRUE; /* GCB always matches at begin and
7300 else { /* Only CR-LF combo isn't a GCB in 0-255
7302 match = UCHARAT(locinput - 1) != '\r'
7303 || UCHARAT(locinput) != '\n';
7308 if (locinput == reginfo->strbeg) {
7311 else if (NEXTCHR_IS_EOS) {
7315 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7316 getLB_VAL_CP(UCHARAT(locinput)),
7317 (U8*) reginfo->strbeg,
7319 (U8*) reginfo->strend,
7324 case SB_BOUND: /* Always matches at begin and end */
7325 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7329 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7330 getSB_VAL_CP(UCHARAT(locinput)),
7331 (U8*) reginfo->strbeg,
7333 (U8*) reginfo->strend,
7339 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7343 match = isWB(WB_UNKNOWN,
7344 getWB_VAL_CP(UCHARAT(locinput -1)),
7345 getWB_VAL_CP(UCHARAT(locinput)),
7346 (U8*) reginfo->strbeg,
7348 (U8*) reginfo->strend,
7355 if (to_complement ^ ! match) {
7361 case ANYOFL: /* /[abc]/l */
7362 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7363 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7366 case ANYOFD: /* /[abc]/d */
7367 case ANYOF: /* /[abc]/ */
7368 if (NEXTCHR_IS_EOS || locinput >= loceol)
7370 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7371 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7373 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7379 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7384 goto increment_locinput;
7390 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7391 || locinput >= loceol)
7395 locinput++; /* ANYOFM is always single byte */
7400 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7401 || locinput >= loceol)
7405 goto increment_locinput;
7411 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7412 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7417 goto increment_locinput;
7423 || ANYOF_FLAGS(scan) != (U8) *locinput
7424 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7429 goto increment_locinput;
7435 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7436 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7437 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7438 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7443 goto increment_locinput;
7449 || loceol - locinput < FLAGS(scan)
7450 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7451 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7456 goto increment_locinput;
7460 if (NEXTCHR_IS_EOS) {
7465 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7466 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7467 (U8 *) reginfo->strend,
7469 ANYOFRbase(scan), ANYOFRdelta(scan)))
7475 if (! withinCOUNT((U8) *locinput,
7476 ANYOFRbase(scan), ANYOFRdelta(scan)))
7481 goto increment_locinput;
7485 if (NEXTCHR_IS_EOS) {
7490 if ( ANYOF_FLAGS(scan) != (U8) *locinput
7491 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7492 (U8 *) reginfo->strend,
7494 ANYOFRbase(scan), ANYOFRdelta(scan)))
7500 if (! withinCOUNT((U8) *locinput,
7501 ANYOFRbase(scan), ANYOFRdelta(scan)))
7506 goto increment_locinput;
7509 /* The argument (FLAGS) to all the POSIX node types is the class number
7512 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
7516 case POSIXL: /* \w or [:punct:] etc. under /l */
7517 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7518 if (NEXTCHR_IS_EOS || locinput >= loceol)
7521 /* Use isFOO_lc() for characters within Latin1. (Note that
7522 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7523 * wouldn't be invariant) */
7524 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7525 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7533 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7534 /* An above Latin-1 code point, or malformed */
7535 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7537 goto utf8_posix_above_latin1;
7540 /* Here is a UTF-8 variant code point below 256 and the target is
7542 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7543 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7544 *(locinput + 1))))))
7549 goto increment_locinput;
7551 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
7555 case POSIXD: /* \w or [:punct:] etc. under /d */
7561 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
7563 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7567 /* All UTF-8 variants match */
7568 if (! UTF8_IS_INVARIANT(nextbyte)) {
7569 goto increment_locinput;
7575 case POSIXA: /* \w or [:punct:] etc. under /a */
7578 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7579 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7580 * character is a single byte */
7582 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7588 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
7594 /* Here we are either not in utf8, or we matched a utf8-invariant,
7595 * so the next char is the next byte */
7599 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
7603 case POSIXU: /* \w or [:punct:] etc. under /u */
7605 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7609 /* Use _generic_isCC() for characters within Latin1. (Note that
7610 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7611 * wouldn't be invariant) */
7612 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7613 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
7620 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7621 if (! (to_complement
7622 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7630 else { /* Handle above Latin-1 code points */
7631 utf8_posix_above_latin1:
7632 classnum = (_char_class_number) FLAGS(scan);
7635 if (! (to_complement
7636 ^ cBOOL(_invlist_contains_cp(
7637 PL_XPosix_ptrs[classnum],
7638 utf8_to_uvchr_buf((U8 *) locinput,
7639 (U8 *) reginfo->strend,
7645 case _CC_ENUM_SPACE:
7646 if (! (to_complement
7647 ^ cBOOL(is_XPERLSPACE_high(locinput))))
7652 case _CC_ENUM_BLANK:
7653 if (! (to_complement
7654 ^ cBOOL(is_HORIZWS_high(locinput))))
7659 case _CC_ENUM_XDIGIT:
7660 if (! (to_complement
7661 ^ cBOOL(is_XDIGIT_high(locinput))))
7666 case _CC_ENUM_VERTSPACE:
7667 if (! (to_complement
7668 ^ cBOOL(is_VERTWS_high(locinput))))
7673 case _CC_ENUM_CNTRL: /* These can't match above Latin1 */
7674 case _CC_ENUM_ASCII:
7675 if (! to_complement) {
7680 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7684 case CLUMP: /* Match \X: logical Unicode character. This is defined as
7685 a Unicode extended Grapheme Cluster */
7686 if (NEXTCHR_IS_EOS || locinput >= loceol)
7688 if (! utf8_target) {
7690 /* Match either CR LF or '.', as all the other possibilities
7692 locinput++; /* Match the . or CR */
7693 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7695 && locinput < loceol
7696 && UCHARAT(locinput) == '\n')
7703 /* Get the gcb type for the current character */
7704 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7705 (U8*) reginfo->strend);
7707 /* Then scan through the input until we get to the first
7708 * character whose type is supposed to be a gcb with the
7709 * current character. (There is always a break at the
7711 locinput += UTF8SKIP(locinput);
7712 while (locinput < loceol) {
7713 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7714 (U8*) reginfo->strend);
7715 if (isGCB(prev_gcb, cur_gcb,
7716 (U8*) reginfo->strbeg, (U8*) locinput,
7723 locinput += UTF8SKIP(locinput);
7730 case REFFLN: /* /\g{name}/il */
7731 { /* The capture buffer cases. The ones beginning with N for the
7732 named buffers just convert to the equivalent numbered and
7733 pretend they were called as the corresponding numbered buffer
7735 /* don't initialize these in the declaration, it makes C++
7740 const U8 *fold_array;
7743 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7744 folder = foldEQ_locale;
7745 fold_array = PL_fold_locale;
7747 utf8_fold_flags = FOLDEQ_LOCALE;
7750 case REFFAN: /* /\g{name}/iaa */
7751 folder = foldEQ_latin1;
7752 fold_array = PL_fold_latin1;
7754 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7757 case REFFUN: /* /\g{name}/iu */
7758 folder = foldEQ_latin1;
7759 fold_array = PL_fold_latin1;
7761 utf8_fold_flags = 0;
7764 case REFFN: /* /\g{name}/i */
7766 fold_array = PL_fold;
7768 utf8_fold_flags = 0;
7771 case REFN: /* /\g{name}/ */
7775 utf8_fold_flags = 0;
7778 /* For the named back references, find the corresponding buffer
7780 n = reg_check_named_buff_matched(rex,scan);
7785 goto do_nref_ref_common;
7787 case REFFL: /* /\1/il */
7788 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7789 folder = foldEQ_locale;
7790 fold_array = PL_fold_locale;
7791 utf8_fold_flags = FOLDEQ_LOCALE;
7794 case REFFA: /* /\1/iaa */
7795 folder = foldEQ_latin1;
7796 fold_array = PL_fold_latin1;
7797 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7800 case REFFU: /* /\1/iu */
7801 folder = foldEQ_latin1;
7802 fold_array = PL_fold_latin1;
7803 utf8_fold_flags = 0;
7806 case REFF: /* /\1/i */
7808 fold_array = PL_fold;
7809 utf8_fold_flags = 0;
7812 case REF: /* /\1/ */
7815 utf8_fold_flags = 0;
7819 n = ARG(scan); /* which paren pair */
7822 ln = rex->offs[n].start;
7823 endref = rex->offs[n].end;
7824 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7825 if (rex->lastparen < n || ln == -1 || endref == -1)
7826 sayNO; /* Do not match unless seen CLOSEn. */
7830 s = reginfo->strbeg + ln;
7831 if (type != REF /* REF can do byte comparison */
7832 && (utf8_target || type == REFFU || type == REFFL))
7834 char * limit = loceol;
7836 /* This call case insensitively compares the entire buffer
7837 * at s, with the current input starting at locinput, but
7838 * not going off the end given by loceol, and
7839 * returns in <limit> upon success, how much of the
7840 * current input was matched */
7841 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7842 locinput, &limit, 0, utf8_target, utf8_fold_flags))
7850 /* Not utf8: Inline the first character, for speed. */
7851 if ( ! NEXTCHR_IS_EOS
7852 && locinput < loceol
7853 && UCHARAT(s) != nextbyte
7855 || UCHARAT(s) != fold_array[nextbyte]))
7860 if (locinput + ln > loceol)
7862 if (ln > 1 && (type == REF
7863 ? memNE(s, locinput, ln)
7864 : ! folder(locinput, s, ln)))
7870 case NOTHING: /* null op; e.g. the 'nothing' following
7871 * the '*' in m{(a+|b)*}' */
7873 case TAIL: /* placeholder while compiling (A|B|C) */
7877 #define ST st->u.eval
7878 #define CUR_EVAL cur_eval->u.eval
7884 regexp_internal *rei;
7885 regnode *startpoint;
7888 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
7889 arg= (U32)ARG(scan);
7890 if (cur_eval && cur_eval->locinput == locinput) {
7891 if ( ++nochange_depth > max_nochange_depth )
7893 "Pattern subroutine nesting without pos change"
7894 " exceeded limit in regex");
7901 startpoint = scan + ARG2L(scan);
7902 EVAL_CLOSE_PAREN_SET( st, arg );
7903 /* Detect infinite recursion
7905 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7906 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7907 * So we track the position in the string we are at each time
7908 * we recurse and if we try to enter the same routine twice from
7909 * the same position we throw an error.
7911 if ( rex->recurse_locinput[arg] == locinput ) {
7912 /* FIXME: we should show the regop that is failing as part
7913 * of the error message. */
7914 Perl_croak(aTHX_ "Infinite recursion in regex");
7916 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7917 rex->recurse_locinput[arg]= locinput;
7920 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7922 Perl_re_exec_indentf( aTHX_
7923 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7924 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7930 /* Save all the positions seen so far. */
7931 ST.cp = regcppush(rex, 0, maxopenparen);
7932 REGCP_SET(ST.lastcp);
7934 /* and then jump to the code we share with EVAL */
7935 goto eval_recurse_doit;
7938 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
7939 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7940 if ( ++nochange_depth > max_nochange_depth )
7941 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7946 /* execute the code in the {...} */
7950 OP * const oop = PL_op;
7951 COP * const ocurcop = PL_curcop;
7955 /* save *all* paren positions */
7956 regcppush(rex, 0, maxopenparen);
7957 REGCP_SET(ST.lastcp);
7960 caller_cv = find_runcv(NULL);
7964 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7966 (REGEXP*)(rexi->data->data[n])
7968 nop = (OP*)rexi->data->data[n+1];
7970 else if (rexi->data->what[n] == 'l') { /* literal code */
7972 nop = (OP*)rexi->data->data[n];
7973 assert(CvDEPTH(newcv));
7976 /* literal with own CV */
7977 assert(rexi->data->what[n] == 'L');
7978 newcv = rex->qr_anoncv;
7979 nop = (OP*)rexi->data->data[n];
7982 /* Some notes about MULTICALL and the context and save stacks.
7985 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7986 * since codeblocks don't introduce a new scope (so that
7987 * local() etc accumulate), at the end of a successful
7988 * match there will be a SAVEt_CLEARSV on the savestack
7989 * for each of $x, $y, $z. If the three code blocks above
7990 * happen to have come from different CVs (e.g. via
7991 * embedded qr//s), then we must ensure that during any
7992 * savestack unwinding, PL_comppad always points to the
7993 * right pad at each moment. We achieve this by
7994 * interleaving SAVEt_COMPPAD's on the savestack whenever
7995 * there is a change of pad.
7996 * In theory whenever we call a code block, we should
7997 * push a CXt_SUB context, then pop it on return from
7998 * that code block. This causes a bit of an issue in that
7999 * normally popping a context also clears the savestack
8000 * back to cx->blk_oldsaveix, but here we specifically
8001 * don't want to clear the save stack on exit from the
8003 * Also for efficiency we don't want to keep pushing and
8004 * popping the single SUB context as we backtrack etc.
8005 * So instead, we push a single context the first time
8006 * we need, it, then hang onto it until the end of this
8007 * function. Whenever we encounter a new code block, we
8008 * update the CV etc if that's changed. During the times
8009 * in this function where we're not executing a code
8010 * block, having the SUB context still there is a bit
8011 * naughty - but we hope that no-one notices.
8012 * When the SUB context is initially pushed, we fake up
8013 * cx->blk_oldsaveix to be as if we'd pushed this context
8014 * on first entry to S_regmatch rather than at some random
8015 * point during the regexe execution. That way if we
8016 * croak, popping the context stack will ensure that
8017 * *everything* SAVEd by this function is undone and then
8018 * the context popped, rather than e.g., popping the
8019 * context (and restoring the original PL_comppad) then
8020 * popping more of the savestack and restoring a bad
8024 /* If this is the first EVAL, push a MULTICALL. On
8025 * subsequent calls, if we're executing a different CV, or
8026 * if PL_comppad has got messed up from backtracking
8027 * through SAVECOMPPADs, then refresh the context.
8029 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8031 U8 flags = (CXp_SUB_RE |
8032 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8034 if (last_pushed_cv) {
8035 CHANGE_MULTICALL_FLAGS(newcv, flags);
8038 PUSH_MULTICALL_FLAGS(newcv, flags);
8040 /* see notes above */
8041 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8043 last_pushed_cv = newcv;
8046 /* these assignments are just to silence compiler
8048 multicall_cop = NULL;
8050 last_pad = PL_comppad;
8052 /* the initial nextstate you would normally execute
8053 * at the start of an eval (which would cause error
8054 * messages to come from the eval), may be optimised
8055 * away from the execution path in the regex code blocks;
8056 * so manually set PL_curcop to it initially */
8058 OP *o = cUNOPx(nop)->op_first;
8059 assert(o->op_type == OP_NULL);
8060 if (o->op_targ == OP_SCOPE) {
8061 o = cUNOPo->op_first;
8064 assert(o->op_targ == OP_LEAVE);
8065 o = cUNOPo->op_first;
8066 assert(o->op_type == OP_ENTER);
8070 if (o->op_type != OP_STUB) {
8071 assert( o->op_type == OP_NEXTSTATE
8072 || o->op_type == OP_DBSTATE
8073 || (o->op_type == OP_NULL
8074 && ( o->op_targ == OP_NEXTSTATE
8075 || o->op_targ == OP_DBSTATE
8079 PL_curcop = (COP*)o;
8084 DEBUG_STATE_r( Perl_re_printf( aTHX_
8085 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8087 rex->offs[0].end = locinput - reginfo->strbeg;
8088 if (reginfo->info_aux_eval->pos_magic)
8089 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8090 reginfo->sv, reginfo->strbeg,
8091 locinput - reginfo->strbeg);
8094 SV *sv_mrk = get_sv("REGMARK", 1);
8095 sv_setsv(sv_mrk, sv_yes_mark);
8098 /* we don't use MULTICALL here as we want to call the
8099 * first op of the block of interest, rather than the
8100 * first op of the sub. Also, we don't want to free
8101 * the savestack frame */
8102 before = (IV)(SP-PL_stack_base);
8104 CALLRUNOPS(aTHX); /* Scalar context. */
8106 if ((IV)(SP-PL_stack_base) == before)
8107 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8113 /* before restoring everything, evaluate the returned
8114 * value, so that 'uninit' warnings don't use the wrong
8115 * PL_op or pad. Also need to process any magic vars
8116 * (e.g. $1) *before* parentheses are restored */
8121 if (logical == 0) { /* (?{})/ */
8122 SV *replsv = save_scalar(PL_replgv);
8123 sv_setsv(replsv, ret); /* $^R */
8126 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
8127 sw = cBOOL(SvTRUE_NN(ret));
8130 else { /* /(??{}) */
8131 /* if its overloaded, let the regex compiler handle
8132 * it; otherwise extract regex, or stringify */
8133 if (SvGMAGICAL(ret))
8134 ret = sv_mortalcopy(ret);
8135 if (!SvAMAGIC(ret)) {
8139 if (SvTYPE(sv) == SVt_REGEXP)
8140 re_sv = (REGEXP*) sv;
8141 else if (SvSMAGICAL(ret)) {
8142 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8144 re_sv = (REGEXP *) mg->mg_obj;
8147 /* force any undef warnings here */
8148 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8149 ret = sv_mortalcopy(ret);
8150 (void) SvPV_force_nolen(ret);
8156 /* *** Note that at this point we don't restore
8157 * PL_comppad, (or pop the CxSUB) on the assumption it may
8158 * be used again soon. This is safe as long as nothing
8159 * in the regexp code uses the pad ! */
8161 PL_curcop = ocurcop;
8162 regcp_restore(rex, ST.lastcp, &maxopenparen);
8163 PL_curpm_under = PL_curpm;
8164 PL_curpm = PL_reg_curpm;
8167 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8173 /* only /(??{})/ from now on */
8176 /* extract RE object from returned value; compiling if
8180 re_sv = reg_temp_copy(NULL, re_sv);
8185 if (SvUTF8(ret) && IN_BYTES) {
8186 /* In use 'bytes': make a copy of the octet
8187 * sequence, but without the flag on */
8189 const char *const p = SvPV(ret, len);
8190 ret = newSVpvn_flags(p, len, SVs_TEMP);
8192 if (rex->intflags & PREGf_USE_RE_EVAL)
8193 pm_flags |= PMf_USE_RE_EVAL;
8195 /* if we got here, it should be an engine which
8196 * supports compiling code blocks and stuff */
8197 assert(rex->engine && rex->engine->op_comp);
8198 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8199 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8200 rex->engine, NULL, NULL,
8201 /* copy /msixn etc to inner pattern */
8206 & (SVs_TEMP | SVs_GMG | SVf_ROK))
8207 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8208 /* This isn't a first class regexp. Instead, it's
8209 caching a regexp onto an existing, Perl visible
8211 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8217 RXp_MATCH_COPIED_off(re);
8218 re->subbeg = rex->subbeg;
8219 re->sublen = rex->sublen;
8220 re->suboffset = rex->suboffset;
8221 re->subcoffset = rex->subcoffset;
8223 re->lastcloseparen = 0;
8226 debug_start_match(re_sv, utf8_target, locinput,
8227 reginfo->strend, "EVAL/GOSUB: Matching embedded");
8229 startpoint = rei->program + 1;
8230 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8231 * close_paren only for GOSUB */
8232 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8233 /* Save all the seen positions so far. */
8234 ST.cp = regcppush(rex, 0, maxopenparen);
8235 REGCP_SET(ST.lastcp);
8236 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8238 /* run the pattern returned from (??{...}) */
8240 eval_recurse_doit: /* Share code with GOSUB below this line
8241 * At this point we expect the stack context to be
8242 * set up correctly */
8244 /* invalidate the S-L poscache. We're now executing a
8245 * different set of WHILEM ops (and their associated
8246 * indexes) against the same string, so the bits in the
8247 * cache are meaningless. Setting maxiter to zero forces
8248 * the cache to be invalidated and zeroed before reuse.
8249 * XXX This is too dramatic a measure. Ideally we should
8250 * save the old cache and restore when running the outer
8252 reginfo->poscache_maxiter = 0;
8254 /* the new regexp might have a different is_utf8_pat than we do */
8255 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8257 ST.prev_rex = rex_sv;
8258 ST.prev_curlyx = cur_curlyx;
8260 SET_reg_curpm(rex_sv);
8265 ST.prev_eval = cur_eval;
8267 /* now continue from first node in postoned RE */
8268 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8269 loceol, script_run_begin);
8270 NOT_REACHED; /* NOTREACHED */
8273 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8274 /* note: this is called twice; first after popping B, then A */
8276 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
8277 depth, cur_eval, ST.prev_eval);
8280 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8281 if ( cur_eval && CUR_EVAL.close_paren ) {\
8283 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8285 CUR_EVAL.close_paren - 1,\
8289 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8292 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8294 rex_sv = ST.prev_rex;
8295 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8296 SET_reg_curpm(rex_sv);
8297 rex = ReANY(rex_sv);
8298 rexi = RXi_GET(rex);
8300 /* preserve $^R across LEAVE's. See Bug 121070. */
8301 SV *save_sv= GvSV(PL_replgv);
8303 SvREFCNT_inc(save_sv);
8304 regcpblow(ST.cp); /* LEAVE in disguise */
8305 /* don't move this initialization up */
8306 replsv = GvSV(PL_replgv);
8307 sv_setsv(replsv, save_sv);
8309 SvREFCNT_dec(save_sv);
8311 cur_eval = ST.prev_eval;
8312 cur_curlyx = ST.prev_curlyx;
8314 /* Invalidate cache. See "invalidate" comment above. */
8315 reginfo->poscache_maxiter = 0;
8316 if ( nochange_depth )
8319 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8323 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8324 REGCP_UNWIND(ST.lastcp);
8327 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8328 /* note: this is called twice; first after popping B, then A */
8330 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8331 depth, cur_eval, ST.prev_eval);
8334 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8336 rex_sv = ST.prev_rex;
8337 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8338 SET_reg_curpm(rex_sv);
8339 rex = ReANY(rex_sv);
8340 rexi = RXi_GET(rex);
8342 REGCP_UNWIND(ST.lastcp);
8343 regcppop(rex, &maxopenparen);
8344 cur_eval = ST.prev_eval;
8345 cur_curlyx = ST.prev_curlyx;
8347 /* Invalidate cache. See "invalidate" comment above. */
8348 reginfo->poscache_maxiter = 0;
8349 if ( nochange_depth )
8352 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8357 n = ARG(scan); /* which paren pair */
8358 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8359 if (n > maxopenparen)
8361 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8362 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8367 (IV)rex->offs[n].start_tmp,
8373 case SROPEN: /* (*SCRIPT_RUN: */
8374 script_run_begin = (U8 *) locinput;
8379 n = ARG(scan); /* which paren pair */
8380 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8381 locinput - reginfo->strbeg);
8382 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8387 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
8389 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8397 case ACCEPT: /* (*ACCEPT) */
8400 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8401 utmp = (U32)ARG2L(scan);
8407 cursor && ( OP(cursor) != END );
8408 cursor = ( PL_regkind[ OP(cursor) ] == END )
8412 if ( OP(cursor) != CLOSE )
8417 if ( n > lastopen ) /* might be OPEN/CLOSE in the way */
8418 continue; /* so skip this one */
8420 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8421 locinput - reginfo->strbeg);
8423 if ( n == utmp || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8430 case GROUPP: /* (?(1)) */
8431 n = ARG(scan); /* which paren pair */
8432 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8435 case GROUPPN: /* (?(<name>)) */
8436 /* reg_check_named_buff_matched returns 0 for no match */
8437 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8440 case INSUBP: /* (?(R)) */
8442 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8443 * of SCAN is already set up as matches a eval.close_paren */
8444 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8447 case DEFINEP: /* (?(DEFINE)) */
8451 case IFTHEN: /* (?(cond)A|B) */
8452 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8454 next = NEXTOPER(NEXTOPER(scan));
8456 next = scan + ARG(scan);
8457 if (OP(next) == IFTHEN) /* Fake one. */
8458 next = NEXTOPER(NEXTOPER(next));
8462 case LOGICAL: /* modifier for EVAL and IFMATCH */
8463 logical = scan->flags;
8466 /*******************************************************************
8468 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8469 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8470 STAR/PLUS/CURLY/CURLYN are used instead.)
8472 A*B is compiled as <CURLYX><A><WHILEM><B>
8474 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8475 state, which contains the current count, initialised to -1. It also sets
8476 cur_curlyx to point to this state, with any previous value saved in the
8479 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8480 since the pattern may possibly match zero times (i.e. it's a while {} loop
8481 rather than a do {} while loop).
8483 Each entry to WHILEM represents a successful match of A. The count in the
8484 CURLYX block is incremented, another WHILEM state is pushed, and execution
8485 passes to A or B depending on greediness and the current count.
8487 For example, if matching against the string a1a2a3b (where the aN are
8488 substrings that match /A/), then the match progresses as follows: (the
8489 pushed states are interspersed with the bits of strings matched so far):
8492 <CURLYX cnt=0><WHILEM>
8493 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8494 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8495 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8496 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8498 (Contrast this with something like CURLYM, which maintains only a single
8502 a1 <CURLYM cnt=1> a2
8503 a1 a2 <CURLYM cnt=2> a3
8504 a1 a2 a3 <CURLYM cnt=3> b
8507 Each WHILEM state block marks a point to backtrack to upon partial failure
8508 of A or B, and also contains some minor state data related to that
8509 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
8510 overall state, such as the count, and pointers to the A and B ops.
8512 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8513 must always point to the *current* CURLYX block, the rules are:
8515 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8516 and set cur_curlyx to point the new block.
8518 When popping the CURLYX block after a successful or unsuccessful match,
8519 restore the previous cur_curlyx.
8521 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8522 to the outer one saved in the CURLYX block.
8524 When popping the WHILEM block after a successful or unsuccessful B match,
8525 restore the previous cur_curlyx.
8527 Here's an example for the pattern (AI* BI)*BO
8528 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8531 curlyx backtrack stack
8532 ------ ---------------
8534 CO <CO prev=NULL> <WO>
8535 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8536 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8537 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8539 At this point the pattern succeeds, and we work back down the stack to
8540 clean up, restoring as we go:
8542 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8543 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8544 CO <CO prev=NULL> <WO>
8547 *******************************************************************/
8549 #define ST st->u.curlyx
8551 case CURLYX: /* start of /A*B/ (for complex A) */
8553 /* No need to save/restore up to this paren */
8554 I32 parenfloor = scan->flags;
8556 assert(next); /* keep Coverity happy */
8557 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8560 /* XXXX Probably it is better to teach regpush to support
8561 parenfloor > maxopenparen ... */
8562 if (parenfloor > (I32)rex->lastparen)
8563 parenfloor = rex->lastparen; /* Pessimization... */
8565 ST.prev_curlyx= cur_curlyx;
8567 ST.cp = PL_savestack_ix;
8569 /* these fields contain the state of the current curly.
8570 * they are accessed by subsequent WHILEMs */
8571 ST.parenfloor = parenfloor;
8576 ST.count = -1; /* this will be updated by WHILEM */
8577 ST.lastloc = NULL; /* this will be updated by WHILEM */
8579 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8581 NOT_REACHED; /* NOTREACHED */
8584 case CURLYX_end: /* just finished matching all of A*B */
8585 cur_curlyx = ST.prev_curlyx;
8587 NOT_REACHED; /* NOTREACHED */
8589 case CURLYX_end_fail: /* just failed to match all of A*B */
8591 cur_curlyx = ST.prev_curlyx;
8593 NOT_REACHED; /* NOTREACHED */
8597 #define ST st->u.whilem
8599 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
8601 /* see the discussion above about CURLYX/WHILEM */
8606 assert(cur_curlyx); /* keep Coverity happy */
8608 min = ARG1(cur_curlyx->u.curlyx.me);
8609 max = ARG2(cur_curlyx->u.curlyx.me);
8610 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8611 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8612 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8613 ST.cache_offset = 0;
8617 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
8618 depth, (long)n, min, max)
8621 /* First just match a string of min A's. */
8624 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8625 cur_curlyx->u.curlyx.lastloc = locinput;
8626 REGCP_SET(ST.lastcp);
8628 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8630 NOT_REACHED; /* NOTREACHED */
8633 /* If degenerate A matches "", assume A done. */
8635 if (locinput == cur_curlyx->u.curlyx.lastloc) {
8636 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
8639 goto do_whilem_B_max;
8642 /* super-linear cache processing.
8644 * The idea here is that for certain types of CURLYX/WHILEM -
8645 * principally those whose upper bound is infinity (and
8646 * excluding regexes that have things like \1 and other very
8647 * non-regular expresssiony things), then if a pattern like
8648 * /....A*.../ fails and we backtrack to the WHILEM, then we
8649 * make a note that this particular WHILEM op was at string
8650 * position 47 (say) when the rest of pattern failed. Then, if
8651 * we ever find ourselves back at that WHILEM, and at string
8652 * position 47 again, we can just fail immediately rather than
8653 * running the rest of the pattern again.
8655 * This is very handy when patterns start to go
8656 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8657 * with a combinatorial explosion of backtracking.
8659 * The cache is implemented as a bit array, with one bit per
8660 * string byte position per WHILEM op (up to 16) - so its
8661 * between 0.25 and 2x the string size.
8663 * To avoid allocating a poscache buffer every time, we do an
8664 * initially countdown; only after we have executed a WHILEM
8665 * op (string-length x #WHILEMs) times do we allocate the
8668 * The top 4 bits of scan->flags byte say how many different
8669 * relevant CURLLYX/WHILEM op pairs there are, while the
8670 * bottom 4-bits is the identifying index number of this
8676 if (!reginfo->poscache_maxiter) {
8677 /* start the countdown: Postpone detection until we
8678 * know the match is not *that* much linear. */
8679 reginfo->poscache_maxiter
8680 = (reginfo->strend - reginfo->strbeg + 1)
8682 /* possible overflow for long strings and many CURLYX's */
8683 if (reginfo->poscache_maxiter < 0)
8684 reginfo->poscache_maxiter = I32_MAX;
8685 reginfo->poscache_iter = reginfo->poscache_maxiter;
8688 if (reginfo->poscache_iter-- == 0) {
8689 /* initialise cache */
8690 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8691 regmatch_info_aux *const aux = reginfo->info_aux;
8692 if (aux->poscache) {
8693 if ((SSize_t)reginfo->poscache_size < size) {
8694 Renew(aux->poscache, size, char);
8695 reginfo->poscache_size = size;
8697 Zero(aux->poscache, size, char);
8700 reginfo->poscache_size = size;
8701 Newxz(aux->poscache, size, char);
8703 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8704 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8705 PL_colors[4], PL_colors[5])
8709 if (reginfo->poscache_iter < 0) {
8710 /* have we already failed at this position? */
8711 SSize_t offset, mask;
8713 reginfo->poscache_iter = -1; /* stop eventual underflow */
8714 offset = (scan->flags & 0xf) - 1
8715 + (locinput - reginfo->strbeg)
8717 mask = 1 << (offset % 8);
8719 if (reginfo->info_aux->poscache[offset] & mask) {
8720 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
8723 cur_curlyx->u.curlyx.count--;
8724 sayNO; /* cache records failure */
8726 ST.cache_offset = offset;
8727 ST.cache_mask = mask;
8731 /* Prefer B over A for minimal matching. */
8733 if (cur_curlyx->u.curlyx.minmod) {
8734 ST.save_curlyx = cur_curlyx;
8735 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8736 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8737 locinput, loceol, script_run_begin);
8738 NOT_REACHED; /* NOTREACHED */
8741 /* Prefer A over B for maximal matching. */
8743 if (n < max) { /* More greed allowed? */
8744 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8746 cur_curlyx->u.curlyx.lastloc = locinput;
8747 REGCP_SET(ST.lastcp);
8748 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8750 NOT_REACHED; /* NOTREACHED */
8752 goto do_whilem_B_max;
8754 NOT_REACHED; /* NOTREACHED */
8756 case WHILEM_B_min: /* just matched B in a minimal match */
8757 case WHILEM_B_max: /* just matched B in a maximal match */
8758 cur_curlyx = ST.save_curlyx;
8760 NOT_REACHED; /* NOTREACHED */
8762 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8763 cur_curlyx = ST.save_curlyx;
8764 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8765 cur_curlyx->u.curlyx.count--;
8767 NOT_REACHED; /* NOTREACHED */
8769 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8771 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8772 REGCP_UNWIND(ST.lastcp);
8773 regcppop(rex, &maxopenparen);
8774 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8775 cur_curlyx->u.curlyx.count--;
8777 NOT_REACHED; /* NOTREACHED */
8779 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8780 REGCP_UNWIND(ST.lastcp);
8781 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8782 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
8788 ST.save_curlyx = cur_curlyx;
8789 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8790 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8791 locinput, loceol, script_run_begin);
8792 NOT_REACHED; /* NOTREACHED */
8794 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8795 cur_curlyx = ST.save_curlyx;
8797 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
8799 /* Try grabbing another A and see if it helps. */
8800 cur_curlyx->u.curlyx.lastloc = locinput;
8801 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8803 REGCP_SET(ST.lastcp);
8804 PUSH_STATE_GOTO(WHILEM_A_min,
8805 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8806 locinput, loceol, script_run_begin);
8807 NOT_REACHED; /* NOTREACHED */
8810 #define ST st->u.branch
8812 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
8813 next = scan + ARG(scan);
8816 scan = NEXTOPER(scan);
8819 case BRANCH: /* /(...|A|...)/ */
8820 scan = NEXTOPER(scan); /* scan now points to inner node */
8821 ST.lastparen = rex->lastparen;
8822 ST.lastcloseparen = rex->lastcloseparen;
8823 ST.next_branch = next;
8826 /* Now go into the branch */
8828 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8831 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8834 NOT_REACHED; /* NOTREACHED */
8836 case CUTGROUP: /* /(*THEN)/ */
8837 sv_yes_mark = st->u.mark.mark_name = scan->flags
8838 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8840 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8842 NOT_REACHED; /* NOTREACHED */
8844 case CUTGROUP_next_fail:
8847 if (st->u.mark.mark_name)
8848 sv_commit = st->u.mark.mark_name;
8850 NOT_REACHED; /* NOTREACHED */
8854 NOT_REACHED; /* NOTREACHED */
8856 case BRANCH_next_fail: /* that branch failed; try the next, if any */
8861 REGCP_UNWIND(ST.cp);
8862 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8863 scan = ST.next_branch;
8864 /* no more branches? */
8865 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8867 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
8874 continue; /* execute next BRANCH[J] op */
8877 case MINMOD: /* next op will be non-greedy, e.g. A*? */
8882 #define ST st->u.curlym
8884 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
8886 /* This is an optimisation of CURLYX that enables us to push
8887 * only a single backtracking state, no matter how many matches
8888 * there are in {m,n}. It relies on the pattern being constant
8889 * length, with no parens to influence future backrefs
8893 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8895 ST.lastparen = rex->lastparen;
8896 ST.lastcloseparen = rex->lastcloseparen;
8898 /* if paren positive, emulate an OPEN/CLOSE around A */
8900 U32 paren = ST.me->flags;
8902 if (paren > maxopenparen)
8903 maxopenparen = paren;
8904 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8912 ST.Binfo.count = -1;
8915 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8918 curlym_do_A: /* execute the A in /A{m,n}B/ */
8919 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8921 NOT_REACHED; /* NOTREACHED */
8923 case CURLYM_A: /* we've just matched an A */
8925 /* after first match, determine A's length: u.curlym.alen */
8926 if (ST.count == 1) {
8927 if (reginfo->is_utf8_target) {
8928 char *s = st->locinput;
8929 while (s < locinput) {
8935 ST.alen = locinput - st->locinput;
8938 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8941 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8942 depth, (IV) ST.count, (IV)ST.alen)
8945 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8950 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8951 if ( max == REG_INFTY || ST.count < max )
8952 goto curlym_do_A; /* try to match another A */
8954 goto curlym_do_B; /* try to match B */
8956 case CURLYM_A_fail: /* just failed to match an A */
8957 REGCP_UNWIND(ST.cp);
8960 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8961 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8964 curlym_do_B: /* execute the B in /A{m,n}B/ */
8966 goto curlym_close_B;
8968 if (ST.Binfo.count < 0) {
8969 /* calculate possible match of 1st char following curly */
8971 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8972 regnode *text_node = ST.B;
8973 if (! HAS_TEXT(text_node))
8974 FIND_NEXT_IMPT(text_node);
8975 if (PL_regkind[OP(text_node)] == EXACT) {
8976 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
8977 &ST.Binfo, reginfo))
8986 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
8987 depth, (IV)ST.count)
8989 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
8990 assert(ST.Binfo.count > 0);
8992 /* Do a quick test to hopefully rule out most non-matches */
8993 if ( locinput + ST.Binfo.min_length > loceol
8994 || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
8997 Perl_re_exec_indentf( aTHX_
8998 "CURLYM Fast bail next target=0x%X anded==0x%X"
9001 (int) nextbyte, ST.Binfo.first_byte_anded,
9002 ST.Binfo.first_byte_mask)
9004 state_num = CURLYM_B_fail;
9005 goto reenter_switch;
9011 /* emulate CLOSE: mark current A as captured */
9012 U32 paren = (U32)ST.me->flags;
9013 if (ST.count || is_accepted) {
9014 CLOSE_CAPTURE(paren,
9015 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9016 locinput - reginfo->strbeg);
9019 rex->offs[paren].end = -1;
9020 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9022 if (ST.count || is_accepted)
9032 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
9034 NOT_REACHED; /* NOTREACHED */
9036 case CURLYM_B_fail: /* just failed to match a B */
9037 REGCP_UNWIND(ST.cp);
9038 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9040 I32 max = ARG2(ST.me);
9041 if (max != REG_INFTY && ST.count == max)
9043 goto curlym_do_A; /* try to match a further A */
9045 /* backtrack one A */
9046 if (ST.count == ARG1(ST.me) /* min */)
9049 SET_locinput(HOPc(locinput, -ST.alen));
9050 goto curlym_do_B; /* try to match B */
9053 #define ST st->u.curly
9055 #define CURLY_SETPAREN(paren, success) \
9058 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9059 locinput - reginfo->strbeg); \
9062 rex->offs[paren].end = -1; \
9063 rex->lastparen = ST.lastparen; \
9064 rex->lastcloseparen = ST.lastcloseparen; \
9068 case STAR: /* /A*B/ where A is width 1 char */
9072 scan = NEXTOPER(scan);
9075 case PLUS: /* /A+B/ where A is width 1 char */
9079 scan = NEXTOPER(scan);
9082 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
9083 ST.paren = scan->flags; /* Which paren to set */
9084 ST.lastparen = rex->lastparen;
9085 ST.lastcloseparen = rex->lastcloseparen;
9086 if (ST.paren > maxopenparen)
9087 maxopenparen = ST.paren;
9088 ST.min = ARG1(scan); /* min to match */
9089 ST.max = ARG2(scan); /* max to match */
9090 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
9092 /* handle the single-char capture called as a GOSUB etc */
9093 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9095 char *li = locinput;
9096 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9104 case CURLY: /* /A{m,n}B/ where A is width 1 char */
9106 ST.min = ARG1(scan); /* min to match */
9107 ST.max = ARG2(scan); /* max to match */
9108 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
9111 * Lookahead to avoid useless match attempts
9112 * when we know what character comes next.
9114 * Used to only do .*x and .*?x, but now it allows
9115 * for )'s, ('s and (?{ ... })'s to be in the way
9116 * of the quantifier and the EXACT-like node. -- japhy
9119 assert(ST.min <= ST.max);
9120 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9124 regnode *text_node = next;
9126 if (! HAS_TEXT(text_node))
9127 FIND_NEXT_IMPT(text_node);
9129 if (! HAS_TEXT(text_node))
9132 if ( PL_regkind[OP(text_node)] != EXACT ) {
9136 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9137 &ST.Binfo, reginfo))
9148 char *li = locinput;
9151 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9158 if (ST.Binfo.count <= 0)
9159 goto curly_try_B_min;
9161 ST.oldloc = locinput;
9163 /* set ST.maxpos to the furthest point along the
9164 * string that could possibly match, i.e., that a match could
9166 if (ST.max == REG_INFTY) {
9167 ST.maxpos = loceol - 1;
9169 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9172 else if (utf8_target) {
9173 int m = ST.max - ST.min;
9174 for (ST.maxpos = locinput;
9175 m >0 && ST.maxpos < loceol; m--)
9176 ST.maxpos += UTF8SKIP(ST.maxpos);
9179 ST.maxpos = locinput + ST.max - ST.min;
9180 if (ST.maxpos >= loceol)
9181 ST.maxpos = loceol - 1;
9183 goto curly_try_B_min_known;
9187 /* avoid taking address of locinput, so it can remain
9189 char *li = locinput;
9190 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9191 if (ST.count < ST.min)
9194 if ((ST.count > ST.min)
9195 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
9197 /* A{m,n} must come at the end of the string, there's
9198 * no point in backing off ... */
9200 /* ...except that $ and \Z can match before *and* after
9201 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
9202 We may back off by one in this case. */
9203 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9207 goto curly_try_B_max;
9209 NOT_REACHED; /* NOTREACHED */
9211 case CURLY_B_min_fail:
9212 /* failed to find B in a non-greedy match. */
9214 REGCP_UNWIND(ST.cp);
9216 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9219 if (ST.Binfo.count == 0) {
9220 /* failed -- move forward one */
9221 char *li = locinput;
9222 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9227 if (!( ST.count <= ST.max
9228 /* count overflow ? */
9229 || (ST.max == REG_INFTY && ST.count > 0))
9235 /* Couldn't or didn't -- move forward. */
9236 ST.oldloc = locinput;
9238 locinput += UTF8SKIP(locinput);
9243 curly_try_B_min_known:
9244 /* find the next place where 'B' could work, then call B */
9245 if (locinput + ST.Binfo.initial_exact < loceol) {
9246 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9248 /* Here, the mask is all 1's for the entire length of
9249 * any possible match. (That actually means that there
9250 * is only one possible match.) Look for the next
9252 locinput = ninstr(locinput, loceol,
9253 (char *) ST.Binfo.matches,
9254 (char *) ST.Binfo.matches
9255 + ST.Binfo.initial_exact);
9256 if (locinput == NULL) {
9261 /* If the first byte(s) of the mask are all ones, it
9262 * means those bytes must match identically, so can use
9263 * ninstr() to find the next possible matchpoint */
9264 if (ST.Binfo.initial_exact > 0) {
9265 locinput = ninstr(locinput, loceol,
9266 (char *) ST.Binfo.matches,
9267 (char *) ST.Binfo.matches
9268 + ST.Binfo.initial_exact);
9270 else { /* Otherwise find the next byte that matches,
9272 locinput = (char *) find_next_masked(
9273 (U8 *) locinput, (U8 *) loceol,
9274 ST.Binfo.first_byte_anded,
9275 ST.Binfo.first_byte_mask);
9276 /* Advance to the end of a multi-byte character */
9278 while ( locinput < loceol
9279 && UTF8_IS_CONTINUATION(*locinput))
9285 if ( locinput == NULL
9286 || locinput + ST.Binfo.min_length > loceol)
9291 /* Here, we have found a possible match point; if can't
9292 * rule it out, quit the loop so can check fully */
9293 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9297 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9299 } while (locinput <= ST.maxpos);
9302 if (locinput > ST.maxpos)
9306 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9307 : (STRLEN) (locinput - ST.oldloc);
9310 /* Here is at the beginning of a character that meets the mask
9311 * criteria. Need to make sure that some real possibility */
9314 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9315 * at what may be the beginning of b; check that everything
9316 * between oldloc and locinput matches */
9317 char *li = ST.oldloc;
9319 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9321 assert(n == REG_INFTY || locinput == li);
9326 CURLY_SETPAREN(ST.paren, ST.count);
9327 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9329 NOT_REACHED; /* NOTREACHED */
9333 /* a successful greedy match: now try to match B */
9334 if ( ST.Binfo.count <= 0
9335 || ( ST.Binfo.count > 0
9336 && locinput + ST.Binfo.min_length <= loceol
9337 && S_test_EXACTISH_ST(locinput, ST.Binfo)))
9339 CURLY_SETPAREN(ST.paren, ST.count);
9340 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9342 NOT_REACHED; /* NOTREACHED */
9346 case CURLY_B_max_fail:
9347 /* failed to find B in a greedy match */
9349 REGCP_UNWIND(ST.cp);
9351 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9354 if (--ST.count < ST.min)
9356 locinput = HOPc(locinput, -1);
9357 goto curly_try_B_max;
9361 case END: /* last op of main pattern */
9364 /* we've just finished A in /(??{A})B/; now continue with B */
9366 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9367 st->u.eval.prev_rex = rex_sv; /* inner */
9369 /* Save *all* the positions. */
9370 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9371 rex_sv = CUR_EVAL.prev_rex;
9372 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9373 SET_reg_curpm(rex_sv);
9374 rex = ReANY(rex_sv);
9375 rexi = RXi_GET(rex);
9377 st->u.eval.prev_curlyx = cur_curlyx;
9378 cur_curlyx = CUR_EVAL.prev_curlyx;
9380 REGCP_SET(st->u.eval.lastcp);
9382 /* Restore parens of the outer rex without popping the
9384 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9386 st->u.eval.prev_eval = cur_eval;
9387 cur_eval = CUR_EVAL.prev_eval;
9389 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
9391 if ( nochange_depth )
9394 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9396 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
9397 st->u.eval.prev_eval->u.eval.B,
9398 locinput, loceol, script_run_begin);
9401 if (locinput < reginfo->till) {
9402 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9403 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9405 (long)(locinput - startpos),
9406 (long)(reginfo->till - startpos),
9409 sayNO_SILENT; /* Cannot match: too short. */
9411 sayYES; /* Success! */
9413 case LOOKBEHIND_END: /* validate that *lookbehind* UNLESSM/IFMATCH
9414 matches end at the right spot, required for
9415 variable length matches. */
9416 if (match_end && locinput != match_end)
9419 Perl_re_exec_indentf( aTHX_
9420 "%sLOOKBEHIND_END: subpattern failed...%s\n",
9421 depth, PL_colors[4], PL_colors[5]));
9422 sayNO; /* Variable length match didn't line up */
9426 case SUCCEED: /* successful SUSPEND/CURLYM and
9427 *lookahead* IFMATCH/UNLESSM*/
9429 Perl_re_exec_indentf( aTHX_
9430 "%sSUCCEED: subpattern success...%s\n",
9431 depth, PL_colors[4], PL_colors[5]));
9432 sayYES; /* Success! */
9435 #define ST st->u.ifmatch
9437 case SUSPEND: /* (?>A) */
9439 ST.start = locinput;
9444 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9446 goto ifmatch_trivial_fail_test;
9448 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9450 ifmatch_trivial_fail_test:
9451 ST.prev_match_end= match_end;
9452 ST.count = scan->next_off + 1; /* next_off repurposed to be
9453 lookbehind count, requires
9455 if (! scan->flags) { /* 'flags' zero means lookahed */
9457 /* Lookahead starts here and ends at the normal place */
9458 ST.start = locinput;
9463 PERL_UINT_FAST8_T back_count = scan->flags;
9465 match_end = locinput;
9467 /* Lookbehind can look beyond the current position */
9470 /* ... and starts at the first place in the input that is in
9471 * the range of the possible start positions */
9472 for (; ST.count > 0; ST.count--, back_count--) {
9473 s = HOPBACKc(locinput, back_count);
9480 /* If the lookbehind doesn't start in the actual string, is a
9481 * trivial match failure */
9482 match_end = ST.prev_match_end;
9485 sw = 1 - cBOOL(ST.wanted);
9490 /* Here, we didn't want it to match, so is actually success */
9491 next = scan + ARG(scan);
9499 ST.logical = logical;
9500 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9502 /* execute body of (?...A) */
9503 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9504 ST.end, script_run_begin);
9505 NOT_REACHED; /* NOTREACHED */
9510 case IFMATCH_A_fail: /* body of (?...A) failed */
9511 if (! ST.logical && ST.count > 1) {
9513 /* It isn't a real failure until we've tried all starting
9514 * positions. Move to the next starting position and retry */
9516 ST.start = HOPc(ST.start, 1);
9518 logical = ST.logical;
9522 /* Here, all starting positions have been tried. */
9526 case IFMATCH_A: /* body of (?...A) succeeded */
9529 sw = matched == ST.wanted;
9530 match_end = ST.prev_match_end;
9531 if (! ST.logical && !sw) {
9535 if (OP(ST.me) != SUSPEND) {
9536 /* restore old position except for (?>...) */
9537 locinput = st->locinput;
9538 loceol = st->loceol;
9539 script_run_begin = st->sr0;
9541 scan = ST.me + ARG(ST.me);
9544 continue; /* execute B */
9549 case LONGJMP: /* alternative with many branches compiles to
9550 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9551 next = scan + ARG(scan);
9556 case COMMIT: /* (*COMMIT) */
9557 reginfo->cutpoint = loceol;
9560 case PRUNE: /* (*PRUNE) */
9562 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9563 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9565 NOT_REACHED; /* NOTREACHED */
9567 case COMMIT_next_fail:
9571 NOT_REACHED; /* NOTREACHED */
9573 case OPFAIL: /* (*FAIL) */
9575 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9577 /* deal with (?(?!)X|Y) properly,
9578 * make sure we trigger the no branch
9579 * of the trailing IFTHEN structure*/
9585 NOT_REACHED; /* NOTREACHED */
9587 #define ST st->u.mark
9588 case MARKPOINT: /* (*MARK:foo) */
9589 ST.prev_mark = mark_state;
9590 ST.mark_name = sv_commit = sv_yes_mark
9591 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9593 ST.mark_loc = locinput;
9594 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9596 NOT_REACHED; /* NOTREACHED */
9598 case MARKPOINT_next:
9599 mark_state = ST.prev_mark;
9601 NOT_REACHED; /* NOTREACHED */
9603 case MARKPOINT_next_fail:
9604 if (popmark && sv_eq(ST.mark_name,popmark))
9606 if (ST.mark_loc > startpoint)
9607 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9608 popmark = NULL; /* we found our mark */
9609 sv_commit = ST.mark_name;
9612 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9614 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9617 mark_state = ST.prev_mark;
9618 sv_yes_mark = mark_state ?
9619 mark_state->u.mark.mark_name : NULL;
9621 NOT_REACHED; /* NOTREACHED */
9623 case SKIP: /* (*SKIP) */
9625 /* (*SKIP) : if we fail we cut here*/
9626 ST.mark_name = NULL;
9627 ST.mark_loc = locinput;
9628 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9631 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9632 otherwise do nothing. Meaning we need to scan
9634 regmatch_state *cur = mark_state;
9635 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9638 if ( sv_eq( cur->u.mark.mark_name,
9641 ST.mark_name = find;
9642 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9645 cur = cur->u.mark.prev_mark;
9648 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9651 case SKIP_next_fail:
9653 /* (*CUT:NAME) - Set up to search for the name as we
9654 collapse the stack*/
9655 popmark = ST.mark_name;
9657 /* (*CUT) - No name, we cut here.*/
9658 if (ST.mark_loc > startpoint)
9659 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9660 /* but we set sv_commit to latest mark_name if there
9661 is one so they can test to see how things lead to this
9664 sv_commit=mark_state->u.mark.mark_name;
9668 NOT_REACHED; /* NOTREACHED */
9671 case LNBREAK: /* \R */
9672 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9679 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9680 PTR2UV(scan), OP(scan));
9681 Perl_croak(aTHX_ "regexp memory corruption");
9683 /* this is a point to jump to in order to increment
9684 * locinput by one character */
9686 assert(!NEXTCHR_IS_EOS);
9688 locinput += PL_utf8skip[nextbyte];
9689 /* locinput is allowed to go 1 char off the end (signifying
9690 * EOS), but not 2+ */
9691 if (locinput > loceol)
9700 /* switch break jumps here */
9701 scan = next; /* prepare to execute the next op and ... */
9702 continue; /* ... jump back to the top, reusing st */
9706 /* push a state that backtracks on success */
9707 st->u.yes.prev_yes_state = yes_state;
9711 /* push a new regex state, then continue at scan */
9713 regmatch_state *newst;
9714 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9716 DEBUG_r( /* DEBUG_STACK_r */
9717 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9718 regmatch_state *cur = st;
9719 regmatch_state *curyes = yes_state;
9721 regmatch_slab *slab = PL_regmatch_slab;
9722 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9723 if (cur < SLAB_FIRST(slab)) {
9725 cur = SLAB_LAST(slab);
9727 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9730 depth - i, PL_reg_name[cur->resume_state],
9731 (curyes == cur) ? "yes" : ""
9734 curyes = cur->u.yes.prev_yes_state;
9737 DEBUG_STATE_pp("push")
9740 st->locinput = locinput;
9741 st->loceol = loceol;
9742 st->sr0 = script_run_begin;
9744 if (newst > SLAB_LAST(PL_regmatch_slab))
9745 newst = S_push_slab(aTHX);
9746 PL_regmatch_state = newst;
9748 locinput = pushinput;
9750 script_run_begin = pushsr0;
9756 #ifdef SOLARIS_BAD_OPTIMIZER
9757 # undef PL_charclass
9761 * We get here only if there's trouble -- normally "case END" is
9762 * the terminating point.
9764 Perl_croak(aTHX_ "corrupted regexp pointers");
9765 NOT_REACHED; /* NOTREACHED */
9769 /* we have successfully completed a subexpression, but we must now
9770 * pop to the state marked by yes_state and continue from there */
9771 assert(st != yes_state);
9773 while (st != yes_state) {
9775 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9776 PL_regmatch_slab = PL_regmatch_slab->prev;
9777 st = SLAB_LAST(PL_regmatch_slab);
9781 DEBUG_STATE_pp("pop (no final)");
9783 DEBUG_STATE_pp("pop (yes)");
9789 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9790 || yes_state > SLAB_LAST(PL_regmatch_slab))
9792 /* not in this slab, pop slab */
9793 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9794 PL_regmatch_slab = PL_regmatch_slab->prev;
9795 st = SLAB_LAST(PL_regmatch_slab);
9797 depth -= (st - yes_state);
9800 yes_state = st->u.yes.prev_yes_state;
9801 PL_regmatch_state = st;
9804 locinput= st->locinput;
9806 script_run_begin = st->sr0;
9808 state_num = st->resume_state + no_final;
9809 goto reenter_switch;
9812 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
9813 PL_colors[4], PL_colors[5]));
9815 if (reginfo->info_aux_eval) {
9816 /* each successfully executed (?{...}) block does the equivalent of
9817 * local $^R = do {...}
9818 * When popping the save stack, all these locals would be undone;
9819 * bypass this by setting the outermost saved $^R to the latest
9821 /* I dont know if this is needed or works properly now.
9822 * see code related to PL_replgv elsewhere in this file.
9825 if (oreplsv != GvSV(PL_replgv)) {
9826 sv_setsv(oreplsv, GvSV(PL_replgv));
9827 SvSETMAGIC(oreplsv);
9835 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
9837 PL_colors[4], PL_colors[5])
9849 /* there's a previous state to backtrack to */
9851 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9852 PL_regmatch_slab = PL_regmatch_slab->prev;
9853 st = SLAB_LAST(PL_regmatch_slab);
9855 PL_regmatch_state = st;
9856 locinput= st->locinput;
9858 script_run_begin = st->sr0;
9860 DEBUG_STATE_pp("pop");
9862 if (yes_state == st)
9863 yes_state = st->u.yes.prev_yes_state;
9865 state_num = st->resume_state + 1; /* failure = success + 1 */
9867 goto reenter_switch;
9872 if (rex->intflags & PREGf_VERBARG_SEEN) {
9873 SV *sv_err = get_sv("REGERROR", 1);
9874 SV *sv_mrk = get_sv("REGMARK", 1);
9876 sv_commit = &PL_sv_no;
9878 sv_yes_mark = &PL_sv_yes;
9881 sv_commit = &PL_sv_yes;
9882 sv_yes_mark = &PL_sv_no;
9886 sv_setsv(sv_err, sv_commit);
9887 sv_setsv(sv_mrk, sv_yes_mark);
9891 if (last_pushed_cv) {
9893 /* see "Some notes about MULTICALL" above */
9895 PERL_UNUSED_VAR(SP);
9898 LEAVE_SCOPE(orig_savestack_ix);
9900 assert(!result || locinput - reginfo->strbeg >= 0);
9901 return result ? locinput - reginfo->strbeg : -1;
9905 - regrepeat - repeatedly match something simple, report how many
9907 * What 'simple' means is a node which can be the operand of a quantifier like
9910 * startposp - pointer to a pointer to the start position. This is updated
9911 * to point to the byte following the highest successful
9913 * p - the regnode to be repeatedly matched against.
9914 * loceol - pointer to the end position beyond which we aren't supposed to
9916 * reginfo - struct holding match state, such as utf8_target
9917 * max - maximum number of things to match.
9918 * depth - (for debugging) backtracking depth.
9921 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9922 char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9924 char *scan; /* Pointer to current position in target string */
9926 char *this_eol = loceol; /* potentially adjusted version. */
9927 I32 hardcount = 0; /* How many matches so far */
9928 bool utf8_target = reginfo->is_utf8_target;
9929 unsigned int to_complement = 0; /* Invert the result? */
9930 _char_class_number classnum;
9932 PERL_ARGS_ASSERT_REGREPEAT;
9934 /* This routine is structured so that we switch on the input OP. Each OP
9935 * case: statement contains a loop to repeatedly apply the OP, advancing
9936 * the input until it fails, or reaches the end of the input, or until it
9937 * reaches the upper limit of matches. */
9940 if (max == REG_INFTY) /* This is a special marker to go to the platform's
9943 else if (! utf8_target && this_eol - scan > max)
9944 this_eol = scan + max;
9946 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol>
9947 * down to the maximum of how far we should go in it (but leaving it set to
9948 * the real end if the maximum permissible would take us beyond that).
9949 * This allows us to make the loop exit condition that we haven't gone past
9950 * <this_eol> to also mean that we haven't exceeded the max permissible
9951 * count, saving a test each time through the loop. But it assumes that
9952 * the OP matches a single byte, which is true for most of the OPs below
9953 * when applied to a non-UTF-8 target. Those relatively few OPs that don't
9954 * have this characteristic have to compensate.
9956 * There is no such adjustment for UTF-8 targets, since the number of bytes
9957 * per character can vary. OPs will have to test both that the count is
9958 * less than the max permissible (using <hardcount> to keep track), and
9959 * that we are still within the bounds of the string (using <this_eol>. A
9960 * few OPs match a single byte no matter what the encoding. They can omit
9961 * the max test if, for the UTF-8 case, they do the adjustment that was
9964 * Thus, the code above sets things up for the common case; and exceptional
9965 * cases need extra work; the common case is to make sure <scan> doesn't go
9966 * past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9967 * count doesn't exceed the maximum permissible */
9969 switch (with_t_UTF8ness(OP(p), utf8_target)) {
9971 while (scan < this_eol && hardcount < max && *scan != '\n') {
9972 scan += UTF8SKIP(scan);
9978 scan = (char *) memchr(scan, '\n', this_eol - scan);
9985 while (scan < this_eol && hardcount < max) {
9986 scan += UTF8SKIP(scan);
9996 case LEXACT_REQ8_tb:
9997 case EXACTFU_REQ8_tb:
10001 if (UTF8_IS_ABOVE_LATIN1(*scan)) {
10002 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
10011 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10014 case EXACT_REQ8_t8:
10015 case LEXACT_REQ8_t8:
10016 case EXACTFU_REQ8_t8:
10023 case EXACTFAA_NO_TRIE_t8:
10024 case EXACTFAA_NO_TRIE_tb:
10033 struct next_matchable_info Binfo;
10034 PERL_UINT_FAST8_T definitive_len;
10036 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
10038 /* Set up termination info, and quit if we can rule out that we've
10039 * gotten a match of the termination criteria */
10040 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10041 || scan + Binfo.min_length > this_eol
10042 || ! S_test_EXACTISH_ST(scan, Binfo))
10047 definitive_len = Binfo.initial_definitive;
10049 /* Here there are potential matches, and the first byte(s) matched our
10052 * If we got a definitive match of some initial bytes, there is no
10053 * possibility of false positives as far as it got */
10054 if (definitive_len > 0) {
10056 /* If as far as it got is the maximum possible, there were no false
10057 * positives at all. Since we have everything set up, see how many
10058 * repeats there are. */
10059 if (definitive_len >= Binfo.max_length) {
10061 /* We've already found one match */
10062 scan += definitive_len;
10065 /* If want more than the one match, and there is room for more,
10066 * see if there are any */
10067 if (hardcount < max && scan + definitive_len <= this_eol) {
10069 /* If the character is only a single byte long, just span
10070 * all such bytes. */
10071 if (definitive_len == 1) {
10072 const char * orig_scan = scan;
10074 if (this_eol - (scan - hardcount) > max) {
10075 this_eol = scan - hardcount + max;
10078 /* Use different routines depending on whether it's an
10079 * exact match or matches with a mask */
10080 if (Binfo.initial_exact == 1) {
10081 scan = (char *) find_span_end((U8 *) scan,
10086 scan = (char *) find_span_end_mask(
10089 Binfo.first_byte_anded,
10090 Binfo.first_byte_mask);
10093 hardcount += scan - orig_scan;
10095 else { /* Here, the full character definitive match is more
10097 while ( hardcount < max
10098 && scan + definitive_len <= this_eol
10099 && S_test_EXACTISH_ST(scan, Binfo))
10101 scan += definitive_len;
10108 } /* End of a full character is definitively matched */
10110 /* Here, an initial portion of the character matched definitively,
10111 * and the rest matched as well, but could have false positives */
10114 PERL_INT_FAST8_T i;
10115 U8 * matches = Binfo.matches;
10117 /* The first bytes were definitive. Look at the remaining */
10118 for (i = 0; i < Binfo.count; i++) {
10119 if (memEQ(scan + definitive_len,
10120 matches + definitive_len,
10121 Binfo.lengths[i] - definitive_len))
10123 goto found_a_completion;
10126 matches += Binfo.lengths[i];
10129 /* Didn't find anything to complete our initial match. Stop
10133 found_a_completion:
10135 /* Here, matched a full character, Include it in the result,
10136 * and then look to see if the next char matches */
10138 scan += Binfo.lengths[i];
10140 } while ( hardcount < max
10141 && scan + definitive_len < this_eol
10142 && S_test_EXACTISH_ST(scan, Binfo));
10144 /* Here, have advanced as far as possible */
10146 } /* End of found some initial bytes that definitively matched */
10148 /* Here, we can't rule out that we have found the beginning of 'B', but
10149 * there were no initial bytes that could rule out anything
10150 * definitively. Use brute force to examine all the possibilities */
10151 while (scan < this_eol && hardcount < max) {
10152 PERL_INT_FAST8_T i;
10153 U8 * matches = Binfo.matches;
10155 for (i = 0; i < Binfo.count; i++) {
10156 if (memEQ(scan, matches, Binfo.lengths[i])) {
10160 matches += Binfo.lengths[i];
10167 scan += Binfo.lengths[i];
10173 case ANYOFPOSIXL_t8:
10175 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10176 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10181 while ( hardcount < max
10183 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10185 scan += UTF8SKIP(scan);
10190 case ANYOFPOSIXL_tb:
10192 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10193 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10198 if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
10199 while ( scan < this_eol
10200 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10204 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10210 if (this_eol - scan > max) {
10212 /* We didn't adjust <this_eol> at the beginning of this routine
10213 * because is UTF-8, but it is actually ok to do so, since here, to
10214 * match, 1 char == 1 byte. */
10215 this_eol = scan + max;
10220 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol,
10221 (U8) ARG(p), FLAGS(p));
10225 while ( hardcount < max
10227 && (*scan & FLAGS(p)) != ARG(p))
10229 scan += UTF8SKIP(scan);
10235 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol,
10236 (U8) ARG(p), FLAGS(p));
10239 case ANYOFH_tb: /* ANYOFH only can match UTF-8 targets */
10246 while ( hardcount < max
10248 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10249 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10251 scan += UTF8SKIP(scan);
10257 /* we know the first byte must be the FLAGS field */
10258 while ( hardcount < max
10260 && (U8) *scan == ANYOF_FLAGS(p)
10261 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10263 scan += UTF8SKIP(scan);
10269 while ( hardcount < max
10271 && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10272 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10273 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10274 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10275 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10277 scan += UTF8SKIP(scan);
10283 while ( hardcount < max
10284 && scan + FLAGS(p) < this_eol
10285 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10286 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10288 scan += UTF8SKIP(scan);
10294 while ( hardcount < max
10296 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10297 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10300 ANYOFRbase(p), ANYOFRdelta(p)))
10302 scan += UTF8SKIP(scan);
10308 while ( hardcount < max
10310 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10318 while ( hardcount < max
10320 && (U8) *scan == ANYOF_FLAGS(p)
10321 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10324 ANYOFRbase(p), ANYOFRdelta(p)))
10326 scan += UTF8SKIP(scan);
10332 while ( hardcount < max
10334 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10341 /* The argument (FLAGS) to all the POSIX node types is the class number */
10348 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10349 while ( scan < this_eol
10350 && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan)))
10361 while ( hardcount < max && scan < this_eol
10362 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10366 scan += UTF8SKIP(scan);
10375 if (this_eol - scan > max) {
10377 /* We didn't adjust <this_eol> at the beginning of this routine
10378 * because is UTF-8, but it is actually ok to do so, since here, to
10379 * match, 1 char == 1 byte. */
10380 this_eol = scan + max;
10385 while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
10394 while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
10401 /* The complement of something that matches only ASCII matches all
10402 * non-ASCII, plus everything in ASCII that isn't in the class. */
10403 while ( hardcount < max && scan < this_eol
10404 && ( ! isASCII_utf8_safe(scan, loceol)
10405 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
10407 scan += UTF8SKIP(scan);
10417 while ( scan < this_eol
10418 && to_complement ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
10431 classnum = (_char_class_number) FLAGS(p);
10432 switch (classnum) {
10434 while ( hardcount < max && scan < this_eol
10436 ^ cBOOL(_invlist_contains_cp(PL_XPosix_ptrs[classnum],
10437 utf8_to_uvchr_buf((U8 *) scan, (U8 *) this_eol, NULL))))
10439 scan += UTF8SKIP(scan);
10444 /* For the classes below, the knowledge of how to handle every code
10445 * point is compiled into Perl via a macro. This code is written
10446 * for making the loops as tight as possible. It could be
10447 * refactored to save space instead. */
10449 case _CC_ENUM_SPACE:
10450 while ( hardcount < max
10453 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10455 scan += UTF8SKIP(scan);
10459 case _CC_ENUM_BLANK:
10460 while ( hardcount < max
10463 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10465 scan += UTF8SKIP(scan);
10469 case _CC_ENUM_XDIGIT:
10470 while ( hardcount < max
10473 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10475 scan += UTF8SKIP(scan);
10479 case _CC_ENUM_VERTSPACE:
10480 while ( hardcount < max
10483 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10485 scan += UTF8SKIP(scan);
10489 case _CC_ENUM_CNTRL:
10490 while ( hardcount < max
10493 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10495 scan += UTF8SKIP(scan);
10503 while ( hardcount < max && scan < this_eol
10504 && (c=is_LNBREAK_utf8_safe(scan, this_eol)))
10512 /* LNBREAK can match one or two latin chars, which is ok, but we have
10513 * to use hardcount in this situation, and throw away the adjustment to
10514 * <this_eol> done before the switch statement */
10516 hardcount < max && scan < loceol
10517 && (c = is_LNBREAK_latin1_safe(scan, loceol))
10525 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized"
10526 " node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10527 NOT_REACHED; /* NOTREACHED */
10534 c = scan - *startposp;
10538 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10540 SV * const prop = sv_newmortal();
10541 regprop(prog, prop, p, reginfo, NULL);
10542 Perl_re_exec_indentf( aTHX_
10543 "%s can match %" IVdf " times out of %" IVdf "...\n",
10544 depth, SvPVX_const(prop),(IV)c,(IV)max);
10552 - reginclass - determine if a character falls into a character class
10554 n is the ANYOF-type regnode
10555 p is the target string
10556 p_end points to one byte beyond the end of the target string
10557 utf8_target tells whether p is in UTF-8.
10559 Returns true if matched; false otherwise.
10561 Note that this can be a synthetic start class, a combination of various
10562 nodes, so things you think might be mutually exclusive, such as locale,
10563 aren't. It can match both locale and non-locale
10568 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10570 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10573 bool match = FALSE;
10576 PERL_ARGS_ASSERT_REGINCLASS;
10578 /* If c is not already the code point, get it. Note that
10579 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10580 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10582 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10583 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10584 if (c_len == (STRLEN)-1) {
10585 _force_out_malformed_utf8_message(p, p_end,
10587 1 /* 1 means die */ );
10588 NOT_REACHED; /* NOTREACHED */
10591 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10592 && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10594 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10598 /* If this character is potentially in the bitmap, check it */
10599 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10600 if (ANYOF_BITMAP_TEST(n, c))
10603 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10610 else if (flags & ANYOF_LOCALE_FLAGS) {
10611 if ( (flags & ANYOFL_FOLD)
10613 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10617 else if ( ANYOF_POSIXL_TEST_ANY_SET(n)
10618 && c <= U8_MAX /* param to isFOO_lc() */
10620 /* The data structure is arranged so bits 0, 2, 4, ... are set
10621 * if the class includes the Posix character class given by
10622 * bit/2; and 1, 3, 5, ... are set if the class includes the
10623 * complemented Posix class given by int(bit/2), so the
10624 * remainder modulo 2 tells us if to complement or not.
10626 * Note that this code assumes that all the classes are closed
10627 * under folding. For example, if a character matches \w, then
10628 * its fold does too; and vice versa. This should be true for
10629 * any well-behaved locale for all the currently defined Posix
10630 * classes, except for :lower: and :upper:, which are handled
10631 * by the pseudo-class :cased: which matches if either of the
10632 * other two does. To get rid of this assumption, an outer
10633 * loop could be used below to iterate over both the source
10634 * character, and its fold (if different) */
10636 U32 posixl_bits = ANYOF_POSIXL_BITMAP(n);
10639 /* Find the next set bit indicating a class to try matching
10641 U8 bit_pos = lsbit_pos32(posixl_bits);
10643 if (bit_pos % 2 ^ cBOOL(isFOO_lc(bit_pos/2, (U8) c))) {
10648 /* Remove this class from consideration; repeat */
10649 POSIXL_CLEAR(posixl_bits, bit_pos);
10650 } while(posixl_bits != 0);
10656 /* If the bitmap didn't (or couldn't) match, and something outside the
10657 * bitmap could match, try that. */
10659 if (c >= NUM_ANYOF_CODE_POINTS
10660 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10662 match = TRUE; /* Everything above the bitmap matches */
10664 /* Here doesn't match everything above the bitmap. If there is
10665 * some information available beyond the bitmap, we may find a
10666 * match in it. If so, this is most likely because the code point
10667 * is outside the bitmap range. But rarely, it could be because of
10668 * some other reason. If so, various flags are set to indicate
10669 * this possibility. On ANYOFD nodes, there may be matches that
10670 * happen only when the target string is UTF-8; or for other node
10671 * types, because runtime lookup is needed, regardless of the
10672 * UTF-8ness of the target string. Finally, under /il, there may
10673 * be some matches only possible if the locale is a UTF-8 one. */
10674 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
10675 && ( c >= NUM_ANYOF_CODE_POINTS
10676 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10677 && ( UNLIKELY(OP(n) != ANYOFD)
10678 || (utf8_target && ! isASCII_uni(c)
10679 # if NUM_ANYOF_CODE_POINTS > 256
10683 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10684 && IN_UTF8_CTYPE_LOCALE)))
10686 SV* only_utf8_locale = NULL;
10687 SV * const definition =
10688 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10689 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10690 &only_utf8_locale, NULL);
10692 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10693 &only_utf8_locale, NULL);
10700 } else { /* Convert to utf8 */
10701 utf8_p = utf8_buffer;
10702 append_utf8_from_native_byte(*p, &utf8_p);
10703 utf8_p = utf8_buffer;
10706 /* Turkish locales have these hard-coded rules overriding
10708 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10709 && isALPHA_FOLD_EQ(*p, 'i'))
10712 if (_invlist_contains_cp(definition,
10713 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10718 else if (*p == 'I') {
10719 if (_invlist_contains_cp(definition,
10720 LATIN_SMALL_LETTER_DOTLESS_I))
10726 else if (_invlist_contains_cp(definition, c)) {
10730 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10731 match = _invlist_contains_cp(only_utf8_locale, c);
10735 /* In a Turkic locale under folding, hard-code the I i case pair
10737 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10739 && (flags & ANYOFL_FOLD)
10742 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10743 if (ANYOF_BITMAP_TEST(n, 'i')) {
10747 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10748 if (ANYOF_BITMAP_TEST(n, 'I')) {
10754 if (UNICODE_IS_SUPER(c)
10756 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10758 && ckWARN_d(WARN_NON_UNICODE))
10760 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10761 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10765 #if ANYOF_INVERT != 1
10766 /* Depending on compiler optimization cBOOL takes time, so if don't have to
10768 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10771 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10772 return (flags & ANYOF_INVERT) ^ match;
10776 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10778 /* return the position 'off' UTF-8 characters away from 's', forward if
10779 * 'off' >= 0, backwards if negative. But don't go outside of position
10780 * 'lim', which better be < s if off < 0 */
10782 PERL_ARGS_ASSERT_REGHOP3;
10785 while (off-- && s < lim) {
10786 /* XXX could check well-formedness here */
10787 U8 *new_s = s + UTF8SKIP(s);
10788 if (new_s > lim) /* lim may be in the middle of a long character */
10794 while (off++ && s > lim) {
10796 if (UTF8_IS_CONTINUED(*s)) {
10797 while (s > lim && UTF8_IS_CONTINUATION(*s))
10799 if (! UTF8_IS_START(*s)) {
10800 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10803 /* XXX could check well-formedness here */
10810 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10812 PERL_ARGS_ASSERT_REGHOP4;
10815 while (off-- && s < rlim) {
10816 /* XXX could check well-formedness here */
10821 while (off++ && s > llim) {
10823 if (UTF8_IS_CONTINUED(*s)) {
10824 while (s > llim && UTF8_IS_CONTINUATION(*s))
10826 if (! UTF8_IS_START(*s)) {
10827 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10830 /* XXX could check well-formedness here */
10836 /* like reghop3, but returns NULL on overrun, rather than returning last
10840 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10842 PERL_ARGS_ASSERT_REGHOPMAYBE3;
10845 while (off-- && s < lim) {
10846 /* XXX could check well-formedness here */
10853 while (off++ && s > lim) {
10855 if (UTF8_IS_CONTINUED(*s)) {
10856 while (s > lim && UTF8_IS_CONTINUATION(*s))
10858 if (! UTF8_IS_START(*s)) {
10859 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10862 /* XXX could check well-formedness here */
10871 /* when executing a regex that may have (?{}), extra stuff needs setting
10872 up that will be visible to the called code, even before the current
10873 match has finished. In particular:
10875 * $_ is localised to the SV currently being matched;
10876 * pos($_) is created if necessary, ready to be updated on each call-out
10878 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10879 isn't set until the current pattern is successfully finished), so that
10880 $1 etc of the match-so-far can be seen;
10881 * save the old values of subbeg etc of the current regex, and set then
10882 to the current string (again, this is normally only done at the end
10887 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10890 regexp *const rex = ReANY(reginfo->prog);
10891 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10893 eval_state->rex = rex;
10894 eval_state->sv = reginfo->sv;
10897 /* Make $_ available to executed code. */
10898 if (reginfo->sv != DEFSV) {
10900 DEFSV_set(reginfo->sv);
10902 /* will be dec'd by S_cleanup_regmatch_info_aux */
10903 SvREFCNT_inc_NN(reginfo->sv);
10905 if (!(mg = mg_find_mglob(reginfo->sv))) {
10906 /* prepare for quick setting of pos */
10907 mg = sv_magicext_mglob(reginfo->sv);
10910 eval_state->pos_magic = mg;
10911 eval_state->pos = mg->mg_len;
10912 eval_state->pos_flags = mg->mg_flags;
10915 eval_state->pos_magic = NULL;
10917 if (!PL_reg_curpm) {
10918 /* PL_reg_curpm is a fake PMOP that we can attach the current
10919 * regex to and point PL_curpm at, so that $1 et al are visible
10920 * within a /(?{})/. It's just allocated once per interpreter the
10921 * first time its needed */
10922 Newxz(PL_reg_curpm, 1, PMOP);
10923 #ifdef USE_ITHREADS
10925 SV* const repointer = &PL_sv_undef;
10926 /* this regexp is also owned by the new PL_reg_curpm, which
10927 will try to free it. */
10928 av_push(PL_regex_padav, repointer);
10929 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
10930 PL_regex_pad = AvARRAY(PL_regex_padav);
10934 SET_reg_curpm(reginfo->prog);
10935 eval_state->curpm = PL_curpm;
10936 PL_curpm_under = PL_curpm;
10937 PL_curpm = PL_reg_curpm;
10938 if (RXp_MATCH_COPIED(rex)) {
10939 /* Here is a serious problem: we cannot rewrite subbeg,
10940 since it may be needed if this match fails. Thus
10941 $` inside (?{}) could fail... */
10942 eval_state->subbeg = rex->subbeg;
10943 eval_state->sublen = rex->sublen;
10944 eval_state->suboffset = rex->suboffset;
10945 eval_state->subcoffset = rex->subcoffset;
10946 #ifdef PERL_ANY_COW
10947 eval_state->saved_copy = rex->saved_copy;
10949 RXp_MATCH_COPIED_off(rex);
10952 eval_state->subbeg = NULL;
10953 rex->subbeg = (char *)reginfo->strbeg;
10954 rex->suboffset = 0;
10955 rex->subcoffset = 0;
10956 rex->sublen = reginfo->strend - reginfo->strbeg;
10960 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10963 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10965 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10966 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
10969 Safefree(aux->poscache);
10973 /* undo the effects of S_setup_eval_state() */
10975 if (eval_state->subbeg) {
10976 regexp * const rex = eval_state->rex;
10977 rex->subbeg = eval_state->subbeg;
10978 rex->sublen = eval_state->sublen;
10979 rex->suboffset = eval_state->suboffset;
10980 rex->subcoffset = eval_state->subcoffset;
10981 #ifdef PERL_ANY_COW
10982 rex->saved_copy = eval_state->saved_copy;
10984 RXp_MATCH_COPIED_on(rex);
10986 if (eval_state->pos_magic)
10988 eval_state->pos_magic->mg_len = eval_state->pos;
10989 eval_state->pos_magic->mg_flags =
10990 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10991 | (eval_state->pos_flags & MGf_BYTES);
10994 PL_curpm = eval_state->curpm;
10995 SvREFCNT_dec(eval_state->sv);
10998 PL_regmatch_state = aux->old_regmatch_state;
10999 PL_regmatch_slab = aux->old_regmatch_slab;
11001 /* free all slabs above current one - this must be the last action
11002 * of this function, as aux and eval_state are allocated within
11003 * slabs and may be freed here */
11005 s = PL_regmatch_slab->next;
11007 PL_regmatch_slab->next = NULL;
11009 regmatch_slab * const osl = s;
11018 S_to_utf8_substr(pTHX_ regexp *prog)
11020 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
11021 * on the converted value */
11025 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
11028 if (prog->substrs->data[i].substr
11029 && !prog->substrs->data[i].utf8_substr) {
11030 SV* const sv = newSVsv(prog->substrs->data[i].substr);
11031 prog->substrs->data[i].utf8_substr = sv;
11032 sv_utf8_upgrade(sv);
11033 if (SvVALID(prog->substrs->data[i].substr)) {
11034 if (SvTAIL(prog->substrs->data[i].substr)) {
11035 /* Trim the trailing \n that fbm_compile added last
11037 SvCUR_set(sv, SvCUR(sv) - 1);
11038 /* Whilst this makes the SV technically "invalid" (as its
11039 buffer is no longer followed by "\0") when fbm_compile()
11040 adds the "\n" back, a "\0" is restored. */
11041 fbm_compile(sv, FBMcf_TAIL);
11043 fbm_compile(sv, 0);
11045 if (prog->substrs->data[i].substr == prog->check_substr)
11046 prog->check_utf8 = sv;
11052 S_to_byte_substr(pTHX_ regexp *prog)
11054 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11055 * on the converted value; returns FALSE if can't be converted. */
11059 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11062 if (prog->substrs->data[i].utf8_substr
11063 && !prog->substrs->data[i].substr) {
11064 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11065 if (! sv_utf8_downgrade(sv, TRUE)) {
11066 SvREFCNT_dec_NN(sv);
11069 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11070 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11071 /* Trim the trailing \n that fbm_compile added last
11073 SvCUR_set(sv, SvCUR(sv) - 1);
11074 fbm_compile(sv, FBMcf_TAIL);
11076 fbm_compile(sv, 0);
11078 prog->substrs->data[i].substr = sv;
11079 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11080 prog->check_substr = sv;
11087 #ifndef PERL_IN_XSUB_RE
11090 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11092 /* Temporary helper function for toke.c. Verify that the code point 'cp'
11093 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
11094 * the larger string bounded by 'strbeg' and 'strend'.
11096 * 'cp' needs to be assigned (if not, a future version of the Unicode
11097 * Standard could make it something that combines with adjacent characters,
11098 * so code using it would then break), and there has to be a GCB break
11099 * before and after the character. */
11102 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11103 const U8 * prev_cp_start;
11105 PERL_ARGS_ASSERT_IS_GRAPHEME;
11107 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
11108 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11110 /* These are considered graphemes */
11114 /* Otherwise, unassigned code points are forbidden */
11115 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11116 _invlist_search(PL_Assigned_invlist, cp))))
11121 cp_gcb_val = getGCB_VAL_CP(cp);
11123 /* Find the GCB value of the previous code point in the input */
11124 prev_cp_start = utf8_hop_back(s, -1, strbeg);
11125 if (UNLIKELY(prev_cp_start == s)) {
11126 prev_cp_gcb_val = GCB_EDGE;
11129 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11132 /* And check that is a grapheme boundary */
11133 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11134 TRUE /* is UTF-8 encoded */ ))
11139 /* Similarly verify there is a break between the current character and the
11143 next_cp_gcb_val = GCB_EDGE;
11146 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11149 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11153 =for apidoc_section $unicode
11155 =for apidoc isSCRIPT_RUN
11157 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11158 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
11159 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
11160 two degenerate cases given below, this function returns TRUE iff all code
11161 points in it come from any combination of three "scripts" given by the Unicode
11162 "Script Extensions" property: Common, Inherited, and possibly one other.
11163 Additionally all decimal digits must come from the same consecutive sequence of
11166 For example, if all the characters in the sequence are Greek, or Common, or
11167 Inherited, this function will return TRUE, provided any decimal digits in it
11168 are from the same block of digits in Common. (These are the ASCII digits
11169 "0".."9" and additionally a block for full width forms of these, and several
11170 others used in mathematical notation.) For scripts (unlike Greek) that have
11171 their own digits defined this will accept either digits from that set or from
11172 one of the Common digit sets, but not a combination of the two. Some scripts,
11173 such as Arabic, have more than one set of digits. All digits must come from
11174 the same set for this function to return TRUE.
11176 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11177 contain the script found, using the C<SCX_enum> typedef. Its value will be
11178 C<SCX_INVALID> if the function returns FALSE.
11180 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11181 will be C<SCX_INVALID>.
11183 If the sequence contains a single code point which is unassigned to a character
11184 in the version of Unicode being used, the function will return TRUE, and the
11185 script will be C<SCX_Unknown>. Any other combination of unassigned code points
11186 in the input sequence will result in the function treating the input as not
11187 being a script run.
11189 The returned script will be C<SCX_Inherited> iff all the code points in it are
11190 from the Inherited script.
11192 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11193 it are from the Inherited or Common scripts.
11200 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11202 /* Basically, it looks at each character in the sequence to see if the
11203 * above conditions are met; if not it fails. It uses an inversion map to
11204 * find the enum corresponding to the script of each character. But this
11205 * is complicated by the fact that a few code points can be in any of
11206 * several scripts. The data has been constructed so that there are
11207 * additional enum values (all negative) for these situations. The
11208 * absolute value of those is an index into another table which contains
11209 * pointers to auxiliary tables for each such situation. Each aux array
11210 * lists all the scripts for the given situation. There is another,
11211 * parallel, table that gives the number of entries in each aux table.
11212 * These are all defined in charclass_invlists.h */
11214 /* XXX Here are the additional things UTS 39 says could be done:
11216 * Forbid sequences of the same nonspacing mark
11218 * Check to see that all the characters are in the sets of exemplar
11219 * characters for at least one language in the Unicode Common Locale Data
11220 * Repository [CLDR]. */
11223 /* Things that match /\d/u */
11224 SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
11225 UV * decimals_array = invlist_array(decimals_invlist);
11227 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11228 * not currently known) */
11229 UV zero_of_run = 0;
11231 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
11232 SCX_enum script_of_char = SCX_INVALID;
11234 /* If the script remains not fully determined from iteration to iteration,
11235 * this is the current intersection of the possiblities. */
11236 SCX_enum * intersection = NULL;
11237 PERL_UINT_FAST8_T intersection_len = 0;
11239 bool retval = TRUE;
11240 SCX_enum * ret_script = NULL;
11244 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11246 /* All code points in 0..255 are either Common or Latin, so must be a
11247 * script run. We can return immediately unless we need to know which
11249 if (! utf8_target && LIKELY(send > s)) {
11250 if (ret_script == NULL) {
11254 /* If any character is Latin, the run is Latin */
11256 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11257 *ret_script = SCX_Latin;
11262 /* Here, all are Common */
11263 *ret_script = SCX_Common;
11267 /* Look at each character in the sequence */
11269 /* If the current character being examined is a digit, this is the code
11270 * point of the zero for its sequence of 10 */
11275 /* The code allows all scripts to use the ASCII digits. This is
11276 * because they are in the Common script. Hence any ASCII ones found
11277 * are ok, unless and until a digit from another set has already been
11278 * encountered. digit ranges in Common are not similarly blessed) */
11279 if (UNLIKELY(isDIGIT(*s))) {
11280 if (UNLIKELY(script_of_run == SCX_Unknown)) {
11285 if (zero_of_run != '0') {
11297 /* Here, isn't an ASCII digit. Find the code point of the character */
11298 if (! UTF8_IS_INVARIANT(*s)) {
11300 cp = valid_utf8_to_uvchr((U8 *) s, &len);
11307 /* If is within the range [+0 .. +9] of the script's zero, it also is a
11308 * digit in that script. We can skip the rest of this code for this
11310 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11314 /* Find the character's script. The correct values are hard-coded here
11315 * for small-enough code points. */
11316 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
11317 unlikely to change */
11319 || ( isALPHA_L1(cp)
11320 && LIKELY(cp != MICRO_SIGN_NATIVE)))
11322 script_of_char = SCX_Latin;
11325 script_of_char = SCX_Common;
11329 script_of_char = _Perl_SCX_invmap[
11330 _invlist_search(PL_SCX_invlist, cp)];
11333 /* We arbitrarily accept a single unassigned character, but not in
11334 * combination with anything else, and not a run of them. */
11335 if ( UNLIKELY(script_of_run == SCX_Unknown)
11336 || UNLIKELY( script_of_run != SCX_INVALID
11337 && script_of_char == SCX_Unknown))
11343 /* For the first character, or the run is inherited, the run's script
11344 * is set to the char's */
11345 if ( UNLIKELY(script_of_run == SCX_INVALID)
11346 || UNLIKELY(script_of_run == SCX_Inherited))
11348 script_of_run = script_of_char;
11351 /* For the character's script to be Unknown, it must be the first
11352 * character in the sequence (for otherwise a test above would have
11353 * prevented us from reaching here), and we have set the run's script
11354 * to it. Nothing further to be done for this character */
11355 if (UNLIKELY(script_of_char == SCX_Unknown)) {
11359 /* We accept 'inherited' script characters currently even at the
11360 * beginning. (We know that no characters in Inherited are digits, or
11361 * we'd have to check for that) */
11362 if (UNLIKELY(script_of_char == SCX_Inherited)) {
11366 /* If the run so far is Common, and the new character isn't, change the
11367 * run's script to that of this character */
11368 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11369 script_of_run = script_of_char;
11372 /* Now we can see if the script of the new character is the same as
11373 * that of the run */
11374 if (LIKELY(script_of_char == script_of_run)) {
11375 /* By far the most common case */
11376 goto scripts_match;
11379 /* Here, the script of the run isn't Common. But characters in Common
11380 * match any script */
11381 if (script_of_char == SCX_Common) {
11382 goto scripts_match;
11385 #ifndef HAS_SCX_AUX_TABLES
11387 /* Too early a Unicode version to have a code point belonging to more
11388 * than one script, so, if the scripts don't exactly match, fail */
11389 PERL_UNUSED_VAR(intersection_len);
11395 /* Here there is no exact match between the character's script and the
11396 * run's. And we've handled the special cases of scripts Unknown,
11397 * Inherited, and Common.
11399 * Negative script numbers signify that the value may be any of several
11400 * scripts, and we need to look at auxiliary information to make our
11401 * deterimination. But if both are non-negative, we can fail now */
11402 if (LIKELY(script_of_char >= 0)) {
11403 const SCX_enum * search_in;
11404 PERL_UINT_FAST8_T search_in_len;
11405 PERL_UINT_FAST8_T i;
11407 if (LIKELY(script_of_run >= 0)) {
11412 /* Use the previously constructed set of possible scripts, if any.
11414 if (intersection) {
11415 search_in = intersection;
11416 search_in_len = intersection_len;
11419 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11420 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11423 for (i = 0; i < search_in_len; i++) {
11424 if (search_in[i] == script_of_char) {
11425 script_of_run = script_of_char;
11426 goto scripts_match;
11433 else if (LIKELY(script_of_run >= 0)) {
11434 /* script of character could be one of several, but run is a single
11436 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11437 const PERL_UINT_FAST8_T search_in_len
11438 = SCX_AUX_TABLE_lengths[-script_of_char];
11439 PERL_UINT_FAST8_T i;
11441 for (i = 0; i < search_in_len; i++) {
11442 if (search_in[i] == script_of_run) {
11443 script_of_char = script_of_run;
11444 goto scripts_match;
11452 /* Both run and char could be in one of several scripts. If the
11453 * intersection is empty, then this character isn't in this script
11454 * run. Otherwise, we need to calculate the intersection to use
11455 * for future iterations of the loop, unless we are already at the
11456 * final character */
11457 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11458 const PERL_UINT_FAST8_T char_len
11459 = SCX_AUX_TABLE_lengths[-script_of_char];
11460 const SCX_enum * search_run;
11461 PERL_UINT_FAST8_T run_len;
11463 SCX_enum * new_overlap = NULL;
11464 PERL_UINT_FAST8_T i, j;
11466 if (intersection) {
11467 search_run = intersection;
11468 run_len = intersection_len;
11471 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11472 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11475 intersection_len = 0;
11477 for (i = 0; i < run_len; i++) {
11478 for (j = 0; j < char_len; j++) {
11479 if (search_run[i] == search_char[j]) {
11481 /* Here, the script at i,j matches. That means this
11482 * character is in the run. But continue on to find
11483 * the complete intersection, for the next loop
11484 * iteration, and for the digit check after it.
11486 * On the first found common script, we malloc space
11487 * for the intersection list for the worst case of the
11488 * intersection, which is the minimum of the number of
11489 * scripts remaining in each set. */
11490 if (intersection_len == 0) {
11492 MIN(run_len - i, char_len - j),
11495 new_overlap[intersection_len++] = search_run[i];
11500 /* Here we've looked through everything. If they have no scripts
11501 * in common, not a run */
11502 if (intersection_len == 0) {
11507 /* If there is only a single script in common, set to that.
11508 * Otherwise, use the intersection going forward */
11509 Safefree(intersection);
11510 intersection = NULL;
11511 if (intersection_len == 1) {
11512 script_of_run = script_of_char = new_overlap[0];
11513 Safefree(new_overlap);
11514 new_overlap = NULL;
11517 intersection = new_overlap;
11525 /* Here, the script of the character is compatible with that of the
11526 * run. That means that in most cases, it continues the script run.
11527 * Either it and the run match exactly, or one or both can be in any of
11528 * several scripts, and the intersection is not empty. However, if the
11529 * character is a decimal digit, it could still mean failure if it is
11530 * from the wrong sequence of 10. So, we need to look at if it's a
11531 * digit. We've already handled the 10 digits [0-9], and the next
11532 * lowest one is this one: */
11533 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11534 continue; /* Not a digit; this character is part of the run */
11537 /* If we have a definitive '0' for the script of this character, we
11538 * know that for this to be a digit, it must be in the range of +0..+9
11540 if ( script_of_char >= 0
11541 && (zero_of_char = script_zeros[script_of_char]))
11543 if (! withinCOUNT(cp, zero_of_char, 9)) {
11544 continue; /* Not a digit; this character is part of the run
11549 else { /* Need to look up if this character is a digit or not */
11550 SSize_t index_of_zero_of_char;
11551 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11552 if ( UNLIKELY(index_of_zero_of_char < 0)
11553 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11555 continue; /* Not a digit; this character is part of the run.
11559 zero_of_char = decimals_array[index_of_zero_of_char];
11562 /* Here, the character is a decimal digit, and the zero of its sequence
11563 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
11564 * they better be the same. */
11566 if (zero_of_run != zero_of_char) {
11571 else { /* Otherwise we now have a zero for this run */
11572 zero_of_run = zero_of_char;
11574 } /* end of looping through CLOSESR text */
11576 Safefree(intersection);
11578 if (ret_script != NULL) {
11580 *ret_script = script_of_run;
11583 *ret_script = SCX_INVALID;
11590 #endif /* ifndef PERL_IN_XSUB_RE */
11593 * ex: set ts=8 sts=4 sw=4 et: