5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
86 static const char b_utf8_locale_required[] =
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88 " Assuming a UTF-8 locale";
90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
92 if (! IN_UTF8_CTYPE_LOCALE) { \
93 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
94 b_utf8_locale_required); \
98 static const char sets_utf8_locale_required[] =
99 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
103 if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \
104 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
105 sets_utf8_locale_required); \
110 /* At least one required character in the target string is expressible only in
112 static const char non_utf8_target_but_utf8_required[]
113 = "Can't match, because target string needs to be in UTF-8\n";
116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
117 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
122 #define STATIC static
129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131 #define HOPc(pos,off) \
132 (char *)(reginfo->is_utf8_target \
133 ? reghop3((U8*)pos, off, \
134 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138 #define HOPBACK3(pos, off, lim) \
139 (reginfo->is_utf8_target \
140 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141 : (pos - off >= lim) \
145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150 /* lim must be +ve. Returns NULL on overshoot */
151 #define HOPMAYBE3(pos,off,lim) \
152 (reginfo->is_utf8_target \
153 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
154 : ((U8*)pos + off <= lim) \
158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159 * off must be >=0; args should be vars rather than expressions */
160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162 : (U8*)((pos + off) > lim ? lim : (pos + off)))
163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173 /* for use after a quantifier and before an EXACT-like node -- japhy */
174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176 * NOTE that *nothing* that affects backtracking should be in here, specifically
177 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
178 * node that is in between two EXACT like nodes when ascertaining what the required
179 * "follow" character is. This should probably be moved to regex compile time
180 * although it may be done at run time beause of the REF possibility - more
181 * investigation required. -- demerphq
183 #define JUMPABLE(rn) ( \
185 (OP(rn) == CLOSE && \
186 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
188 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
189 OP(rn) == PLUS || OP(rn) == MINMOD || \
191 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
198 Search for mandatory following text node; for lookahead, the text must
199 follow but for lookbehind (rn->flags != 0) we skip to the next step.
201 #define FIND_NEXT_IMPT(rn) STMT_START { \
202 while (JUMPABLE(rn)) { \
203 const OPCODE type = OP(rn); \
204 if (type == SUSPEND || PL_regkind[type] == CURLY) \
205 rn = NEXTOPER(NEXTOPER(rn)); \
206 else if (type == PLUS) \
208 else if (type == IFMATCH) \
209 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
210 else rn += NEXT_OFF(rn); \
214 #define SLAB_FIRST(s) (&(s)->states[0])
215 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
219 static regmatch_state * S_push_slab(pTHX);
221 #define REGCP_PAREN_ELEMS 3
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
230 const int retval = PL_savestack_ix;
231 const int paren_elems_to_push =
232 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
233 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
236 DECLARE_AND_GET_RE_DEBUG_FLAGS;
238 PERL_ARGS_ASSERT_REGCPPUSH;
240 if (paren_elems_to_push < 0)
241 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242 (int)paren_elems_to_push, (int)maxopenparen,
243 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
245 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
246 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
247 " out of range (%lu-%ld)",
249 (unsigned long)maxopenparen,
252 SSGROW(total_elems + REGCP_FRAME_ELEMS);
255 if ((int)maxopenparen > (int)parenfloor)
256 Perl_re_exec_indentf( aTHX_
257 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
263 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
265 SSPUSHIV(rex->offs[p].end);
266 SSPUSHIV(rex->offs[p].start);
267 SSPUSHINT(rex->offs[p].start_tmp);
268 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
269 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
272 (IV)rex->offs[p].start,
273 (IV)rex->offs[p].start_tmp,
277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
278 SSPUSHINT(maxopenparen);
279 SSPUSHINT(rex->lastparen);
280 SSPUSHINT(rex->lastcloseparen);
281 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
286 /* These are needed since we do not localize EVAL nodes: */
287 #define REGCP_SET(cp) \
289 Perl_re_exec_indentf( aTHX_ \
290 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
291 depth, (IV)PL_savestack_ix \
296 #define REGCP_UNWIND(cp) \
298 if (cp != PL_savestack_ix) \
299 Perl_re_exec_indentf( aTHX_ \
300 "Clearing an EVAL scope, savestack=%" \
301 IVdf "..%" IVdf "\n", \
302 depth, (IV)(cp), (IV)PL_savestack_ix \
307 /* set the start and end positions of capture ix */
308 #define CLOSE_CAPTURE(ix, s, e) \
309 rex->offs[ix].start = s; \
310 rex->offs[ix].end = e; \
311 if (ix > rex->lastparen) \
312 rex->lastparen = ix; \
313 rex->lastcloseparen = ix; \
314 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
315 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
320 (IV)rex->offs[ix].start, \
321 (IV)rex->offs[ix].end, \
325 #define UNWIND_PAREN(lp, lcp) \
326 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
327 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
332 (UV)(rex->lastparen), \
335 for (n = rex->lastparen; n > lp; n--) \
336 rex->offs[n].end = -1; \
337 rex->lastparen = n; \
338 rex->lastcloseparen = lcp;
342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
346 DECLARE_AND_GET_RE_DEBUG_FLAGS;
348 PERL_ARGS_ASSERT_REGCPPOP;
350 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
352 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
354 rex->lastcloseparen = SSPOPINT;
355 rex->lastparen = SSPOPINT;
356 *maxopenparen_p = SSPOPINT;
358 i -= REGCP_OTHER_ELEMS;
359 /* Now restore the parentheses context. */
361 if (i || rex->lastparen + 1 <= rex->nparens)
362 Perl_re_exec_indentf( aTHX_
363 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
369 paren = *maxopenparen_p;
370 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
372 rex->offs[paren].start_tmp = SSPOPINT;
373 rex->offs[paren].start = SSPOPIV;
375 if (paren <= rex->lastparen)
376 rex->offs[paren].end = tmps;
377 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
378 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 if (i > *maxopenparen_p)
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
402 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
403 " \\%" UVuf ": %s ..-1 undeffing\n",
406 (i > *maxopenparen_p) ? "-1" : " "
412 /* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
418 I32 tmpix = PL_savestack_ix;
419 PERL_ARGS_ASSERT_REGCP_RESTORE;
421 PL_savestack_ix = ix;
422 regcppop(rex, maxopenparen_p);
423 PL_savestack_ix = tmpix;
426 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
431 /* Returns a boolean as to whether or not 'character' is a member of the
432 * Posix character class given by 'classnum' that should be equivalent to a
433 * value in the typedef '_char_class_number'.
435 * Ideally this could be replaced by a just an array of function pointers
436 * to the C library functions that implement the macros this calls.
437 * However, to compile, the precise function signatures are required, and
438 * these may vary from platform to platform. To avoid having to figure
439 * out what those all are on each platform, I (khw) am using this method,
440 * which adds an extra layer of function call overhead (unless the C
441 * optimizer strips it away). But we don't particularly care about
442 * performance with locales anyway. */
444 switch ((_char_class_number) classnum) {
445 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
446 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
447 case _CC_ENUM_ASCII: return isASCII_LC(character);
448 case _CC_ENUM_BLANK: return isBLANK_LC(character);
449 case _CC_ENUM_CASED: return isLOWER_LC(character)
450 || isUPPER_LC(character);
451 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
452 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
453 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
454 case _CC_ENUM_LOWER: return isLOWER_LC(character);
455 case _CC_ENUM_PRINT: return isPRINT_LC(character);
456 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
457 case _CC_ENUM_SPACE: return isSPACE_LC(character);
458 case _CC_ENUM_UPPER: return isUPPER_LC(character);
459 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
460 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
461 default: /* VERTSPACE should never occur in locales */
462 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
465 NOT_REACHED; /* NOTREACHED */
469 PERL_STATIC_INLINE I32
470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
472 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
473 * folded. Works on all folds representable without UTF-8, except for
474 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
475 * check that the strings each have at least 'len' characters.
477 * There is almost an identical API function where s2 need not be folded:
478 * Perl_foldEQ_latin1() */
480 const U8 *a = (const U8 *)s1;
481 const U8 *b = (const U8 *)s2;
483 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
488 assert(! isUPPER_L1(*b));
489 if (toLOWER_L1(*a) != *b) {
498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501 * 'character' is a member of the Posix character class given by 'classnum'
502 * that should be equivalent to a value in the typedef
503 * '_char_class_number'.
505 * This just calls isFOO_lc on the code point for the character if it is in
506 * the range 0-255. Outside that range, all characters use Unicode
507 * rules, ignoring any locale. So use the Unicode function if this class
508 * requires an inversion list, and use the Unicode macro otherwise. */
511 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
513 if (UTF8_IS_INVARIANT(*character)) {
514 return isFOO_lc(classnum, *character);
516 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
517 return isFOO_lc(classnum,
518 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
521 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
523 switch ((_char_class_number) classnum) {
524 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
525 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
526 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
527 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
529 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
530 utf8_to_uvchr_buf(character, e, NULL));
533 return FALSE; /* Things like CNTRL are always below 256 */
537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
539 /* Returns the position of the first byte in the sequence between 's' and
540 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
543 PERL_ARGS_ASSERT_FIND_SPAN_END;
547 if ((STRLEN) (send - s) >= PERL_WORDSIZE
548 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
549 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
551 PERL_UINTMAX_T span_word;
553 /* Process per-byte until reach word boundary. XXX This loop could be
554 * eliminated if we knew that this platform had fast unaligned reads */
555 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
556 if (*s != span_byte) {
562 /* Create a word filled with the bytes we are spanning */
563 span_word = PERL_COUNT_MULTIPLIER * span_byte;
565 /* Process per-word as long as we have at least a full word left */
568 /* Keep going if the whole word is composed of 'span_byte's */
569 if ((* (PERL_UINTMAX_T *) s) == span_word) {
574 /* Here, at least one byte in the word isn't 'span_byte'. */
582 /* This xor leaves 1 bits only in those non-matching bytes */
583 span_word ^= * (PERL_UINTMAX_T *) s;
585 /* Make sure the upper bit of each non-matching byte is set. This
586 * makes each such byte look like an ASCII platform variant byte */
587 span_word |= span_word << 1;
588 span_word |= span_word << 2;
589 span_word |= span_word << 4;
591 /* That reduces the problem to what this function solves */
592 return s + variant_byte_number(span_word);
596 } while (s + PERL_WORDSIZE <= send);
599 /* Process the straggler bytes beyond the final word boundary */
601 if (*s != span_byte) {
611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
613 /* Returns the position of the first byte in the sequence between 's'
614 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
615 * returns 'send' if none found. It uses word-level operations instead of
616 * byte to speed up the process */
618 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
621 assert((byte & mask) == byte);
625 if ((STRLEN) (send - s) >= PERL_WORDSIZE
626 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
627 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
629 PERL_UINTMAX_T word, mask_word;
631 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
632 if (((*s) & mask) == byte) {
638 word = PERL_COUNT_MULTIPLIER * byte;
639 mask_word = PERL_COUNT_MULTIPLIER * mask;
642 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
644 /* If 'masked' contains bytes with the bit pattern of 'byte' within
645 * it, xoring with 'word' will leave each of the 8 bits in such
646 * bytes be 0, and no byte containing any other bit pattern will be
650 /* This causes the most significant bit to be set to 1 for any
651 * bytes in the word that aren't completely 0 */
652 masked |= masked << 1;
653 masked |= masked << 2;
654 masked |= masked << 4;
656 /* The msbits are the same as what marks a byte as variant, so we
657 * can use this mask. If all msbits are 1, the word doesn't
659 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
664 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
665 * and any that are, are 0. Complement and re-AND to swap that */
667 masked &= PERL_VARIANTS_WORD_MASK;
669 /* This reduces the problem to that solved by this function */
670 s += variant_byte_number(masked);
673 } while (s + PERL_WORDSIZE <= send);
679 if (((*s) & mask) == byte) {
689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
691 /* Returns the position of the first byte in the sequence between 's' and
692 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
693 * 'span_byte' should have been ANDed with 'mask' in the call of this
694 * function. Returns 'send' if none found. Works like find_span_end(),
695 * except for the AND */
697 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
700 assert((span_byte & mask) == span_byte);
702 if ((STRLEN) (send - s) >= PERL_WORDSIZE
703 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
704 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
706 PERL_UINTMAX_T span_word, mask_word;
708 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
709 if (((*s) & mask) != span_byte) {
715 span_word = PERL_COUNT_MULTIPLIER * span_byte;
716 mask_word = PERL_COUNT_MULTIPLIER * mask;
719 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
721 if (masked == span_word) {
733 masked |= masked << 1;
734 masked |= masked << 2;
735 masked |= masked << 4;
736 return s + variant_byte_number(masked);
740 } while (s + PERL_WORDSIZE <= send);
744 if (((*s) & mask) != span_byte) {
754 * pregexec and friends
757 #ifndef PERL_IN_XSUB_RE
759 - pregexec - match a regexp against a string
762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
763 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
764 /* stringarg: the point in the string at which to begin matching */
765 /* strend: pointer to null at end of string */
766 /* strbeg: real beginning of string */
767 /* minend: end of match must be >= minend bytes after stringarg. */
768 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
769 * itself is accessed via the pointers above */
770 /* nosave: For optimizations. */
772 PERL_ARGS_ASSERT_PREGEXEC;
775 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
776 nosave ? 0 : REXEC_COPY_STR);
782 /* re_intuit_start():
784 * Based on some optimiser hints, try to find the earliest position in the
785 * string where the regex could match.
787 * rx: the regex to match against
788 * sv: the SV being matched: only used for utf8 flag; the string
789 * itself is accessed via the pointers below. Note that on
790 * something like an overloaded SV, SvPOK(sv) may be false
791 * and the string pointers may point to something unrelated to
793 * strbeg: real beginning of string
794 * strpos: the point in the string at which to begin matching
795 * strend: pointer to the byte following the last char of the string
796 * flags currently unused; set to 0
797 * data: currently unused; set to NULL
799 * The basic idea of re_intuit_start() is to use some known information
800 * about the pattern, namely:
802 * a) the longest known anchored substring (i.e. one that's at a
803 * constant offset from the beginning of the pattern; but not
804 * necessarily at a fixed offset from the beginning of the
806 * b) the longest floating substring (i.e. one that's not at a constant
807 * offset from the beginning of the pattern);
808 * c) Whether the pattern is anchored to the string; either
809 * an absolute anchor: /^../, or anchored to \n: /^.../m,
810 * or anchored to pos(): /\G/;
811 * d) A start class: a real or synthetic character class which
812 * represents which characters are legal at the start of the pattern;
814 * to either quickly reject the match, or to find the earliest position
815 * within the string at which the pattern might match, thus avoiding
816 * running the full NFA engine at those earlier locations, only to
817 * eventually fail and retry further along.
819 * Returns NULL if the pattern can't match, or returns the address within
820 * the string which is the earliest place the match could occur.
822 * The longest of the anchored and floating substrings is called 'check'
823 * and is checked first. The other is called 'other' and is checked
824 * second. The 'other' substring may not be present. For example,
826 * /(abc|xyz)ABC\d{0,3}DEFG/
830 * check substr (float) = "DEFG", offset 6..9 chars
831 * other substr (anchored) = "ABC", offset 3..3 chars
834 * Be aware that during the course of this function, sometimes 'anchored'
835 * refers to a substring being anchored relative to the start of the
836 * pattern, and sometimes to the pattern itself being anchored relative to
837 * the string. For example:
839 * /\dabc/: "abc" is anchored to the pattern;
840 * /^\dabc/: "abc" is anchored to the pattern and the string;
841 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
842 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
843 * but the pattern is anchored to the string.
847 Perl_re_intuit_start(pTHX_
850 const char * const strbeg,
854 re_scream_pos_data *data)
856 struct regexp *const prog = ReANY(rx);
857 SSize_t start_shift = prog->check_offset_min;
858 /* Should be nonnegative! */
859 SSize_t end_shift = 0;
860 /* current lowest pos in string where the regex can start matching */
861 char *rx_origin = strpos;
863 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
864 U8 other_ix = 1 - prog->substrs->check_ix;
866 char *other_last = strpos;/* latest pos 'other' substr already checked to */
867 char *check_at = NULL; /* check substr found at this pos */
868 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
869 RXi_GET_DECL(prog,progi);
870 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
871 regmatch_info *const reginfo = ®info_buf;
872 DECLARE_AND_GET_RE_DEBUG_FLAGS;
874 PERL_ARGS_ASSERT_RE_INTUIT_START;
875 PERL_UNUSED_ARG(flags);
876 PERL_UNUSED_ARG(data);
878 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
879 "Intuit: trying to determine minimum start position...\n"));
881 /* for now, assume that all substr offsets are positive. If at some point
882 * in the future someone wants to do clever things with lookbehind and
883 * -ve offsets, they'll need to fix up any code in this function
884 * which uses these offsets. See the thread beginning
885 * <20140113145929.GF27210@iabyn.com>
887 assert(prog->substrs->data[0].min_offset >= 0);
888 assert(prog->substrs->data[0].max_offset >= 0);
889 assert(prog->substrs->data[1].min_offset >= 0);
890 assert(prog->substrs->data[1].max_offset >= 0);
891 assert(prog->substrs->data[2].min_offset >= 0);
892 assert(prog->substrs->data[2].max_offset >= 0);
894 /* for now, assume that if both present, that the floating substring
895 * doesn't start before the anchored substring.
896 * If you break this assumption (e.g. doing better optimisations
897 * with lookahead/behind), then you'll need to audit the code in this
898 * function carefully first
901 ! ( (prog->anchored_utf8 || prog->anchored_substr)
902 && (prog->float_utf8 || prog->float_substr))
903 || (prog->float_min_offset >= prog->anchored_offset));
905 /* byte rather than char calculation for efficiency. It fails
906 * to quickly reject some cases that can't match, but will reject
907 * them later after doing full char arithmetic */
908 if (prog->minlen > strend - strpos) {
909 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
910 " String too short...\n"));
914 RXp_MATCH_UTF8_set(prog, utf8_target);
915 reginfo->is_utf8_target = cBOOL(utf8_target);
916 reginfo->info_aux = NULL;
917 reginfo->strbeg = strbeg;
918 reginfo->strend = strend;
919 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
921 /* not actually used within intuit, but zero for safety anyway */
922 reginfo->poscache_maxiter = 0;
925 if ((!prog->anchored_utf8 && prog->anchored_substr)
926 || (!prog->float_utf8 && prog->float_substr))
927 to_utf8_substr(prog);
928 check = prog->check_utf8;
930 if (!prog->check_substr && prog->check_utf8) {
931 if (! to_byte_substr(prog)) {
932 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
935 check = prog->check_substr;
938 /* dump the various substring data */
939 DEBUG_OPTIMISE_MORE_r({
941 for (i=0; i<=2; i++) {
942 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
943 : prog->substrs->data[i].substr);
947 Perl_re_printf( aTHX_
948 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
949 " useful=%" IVdf " utf8=%d [%s]\n",
951 (IV)prog->substrs->data[i].min_offset,
952 (IV)prog->substrs->data[i].max_offset,
953 (IV)prog->substrs->data[i].end_shift,
960 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
962 /* ml_anch: check after \n?
964 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
965 * with /.*.../, these flags will have been added by the
967 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
968 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
970 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
971 && !(prog->intflags & PREGf_IMPLICIT);
973 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
974 /* we are only allowed to match at BOS or \G */
976 /* trivially reject if there's a BOS anchor and we're not at BOS.
978 * Note that we don't try to do a similar quick reject for
979 * \G, since generally the caller will have calculated strpos
980 * based on pos() and gofs, so the string is already correctly
981 * anchored by definition; and handling the exceptions would
982 * be too fiddly (e.g. REXEC_IGNOREPOS).
984 if ( strpos != strbeg
985 && (prog->intflags & PREGf_ANCH_SBOL))
987 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
988 " Not at start...\n"));
992 /* in the presence of an anchor, the anchored (relative to the
993 * start of the regex) substr must also be anchored relative
994 * to strpos. So quickly reject if substr isn't found there.
995 * This works for \G too, because the caller will already have
996 * subtracted gofs from pos, and gofs is the offset from the
997 * \G to the start of the regex. For example, in /.abc\Gdef/,
998 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
999 * caller will have set strpos=pos()-4; we look for the substr
1000 * at position pos()-4+1, which lines up with the "a" */
1002 if (prog->check_offset_min == prog->check_offset_max) {
1003 /* Substring at constant offset from beg-of-str... */
1004 SSize_t slen = SvCUR(check);
1005 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1007 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1008 " Looking for check substr at fixed offset %" IVdf "...\n",
1009 (IV)prog->check_offset_min));
1011 if (SvTAIL(check)) {
1012 /* In this case, the regex is anchored at the end too.
1013 * Unless it's a multiline match, the lengths must match
1014 * exactly, give or take a \n. NB: slen >= 1 since
1015 * the last char of check is \n */
1017 && ( strend - s > slen
1018 || strend - s < slen - 1
1019 || (strend - s == slen && strend[-1] != '\n')))
1021 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1022 " String too long...\n"));
1025 /* Now should match s[0..slen-2] */
1028 if (slen && (strend - s < slen
1029 || *SvPVX_const(check) != *s
1030 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1032 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033 " String not equal...\n"));
1038 goto success_at_start;
1043 end_shift = prog->check_end_shift;
1045 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1047 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1048 (IV)end_shift, RX_PRECOMP(rx));
1053 /* This is the (re)entry point of the main loop in this function.
1054 * The goal of this loop is to:
1055 * 1) find the "check" substring in the region rx_origin..strend
1056 * (adjusted by start_shift / end_shift). If not found, reject
1058 * 2) If it exists, look for the "other" substr too if defined; for
1059 * example, if the check substr maps to the anchored substr, then
1060 * check the floating substr, and vice-versa. If not found, go
1061 * back to (1) with rx_origin suitably incremented.
1062 * 3) If we find an rx_origin position that doesn't contradict
1063 * either of the substrings, then check the possible additional
1064 * constraints on rx_origin of /^.../m or a known start class.
1065 * If these fail, then depending on which constraints fail, jump
1066 * back to here, or to various other re-entry points further along
1067 * that skip some of the first steps.
1068 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1069 * substring. If the start position was determined to be at the
1070 * beginning of the string - so, not rejected, but not optimised,
1071 * since we have to run regmatch from position 0 - decrement the
1072 * BmUSEFUL() count. Otherwise increment it.
1076 /* first, look for the 'check' substring */
1082 DEBUG_OPTIMISE_MORE_r({
1083 Perl_re_printf( aTHX_
1084 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1085 " Start shift: %" IVdf " End shift %" IVdf
1086 " Real end Shift: %" IVdf "\n",
1087 (IV)(rx_origin - strbeg),
1088 (IV)prog->check_offset_min,
1091 (IV)prog->check_end_shift);
1094 end_point = HOPBACK3(strend, end_shift, rx_origin);
1097 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1102 /* If the regex is absolutely anchored to either the start of the
1103 * string (SBOL) or to pos() (ANCH_GPOS), then
1104 * check_offset_max represents an upper bound on the string where
1105 * the substr could start. For the ANCH_GPOS case, we assume that
1106 * the caller of intuit will have already set strpos to
1107 * pos()-gofs, so in this case strpos + offset_max will still be
1108 * an upper bound on the substr.
1111 && prog->intflags & PREGf_ANCH
1112 && prog->check_offset_max != SSize_t_MAX)
1114 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1115 const char * const anchor =
1116 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1117 SSize_t targ_len = (char*)end_point - anchor;
1119 if (check_len > targ_len) {
1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121 "Target string too short to match required substring...\n"));
1125 /* do a bytes rather than chars comparison. It's conservative;
1126 * so it skips doing the HOP if the result can't possibly end
1127 * up earlier than the old value of end_point.
1129 assert(anchor + check_len <= (char *)end_point);
1130 if (prog->check_offset_max + check_len < targ_len) {
1131 end_point = HOP3lim((U8*)anchor,
1132 prog->check_offset_max,
1133 end_point - check_len
1136 if (end_point < start_point)
1141 check_at = fbm_instr( start_point, end_point,
1142 check, multiline ? FBMrf_MULTILINE : 0);
1144 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1146 (IV)((char*)start_point - strbeg),
1147 (IV)((char*)end_point - strbeg),
1148 (IV)(check_at ? check_at - strbeg : -1)
1151 /* Update the count-of-usability, remove useless subpatterns,
1155 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1156 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1157 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1158 (check_at ? "Found" : "Did not find"),
1159 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1160 ? "anchored" : "floating"),
1163 (check_at ? " at offset " : "...\n") );
1168 /* set rx_origin to the minimum position where the regex could start
1169 * matching, given the constraint of the just-matched check substring.
1170 * But don't set it lower than previously.
1173 if (check_at - rx_origin > prog->check_offset_max)
1174 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1175 /* Finish the diagnostic message */
1176 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1177 "%ld (rx_origin now %" IVdf ")...\n",
1178 (long)(check_at - strbeg),
1179 (IV)(rx_origin - strbeg)
1184 /* now look for the 'other' substring if defined */
1186 if (prog->substrs->data[other_ix].utf8_substr
1187 || prog->substrs->data[other_ix].substr)
1189 /* Take into account the "other" substring. */
1193 struct reg_substr_datum *other;
1196 other = &prog->substrs->data[other_ix];
1197 if (!utf8_target && !other->substr) {
1198 if (!to_byte_substr(prog)) {
1199 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1203 /* if "other" is anchored:
1204 * we've previously found a floating substr starting at check_at.
1205 * This means that the regex origin must lie somewhere
1206 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1207 * and max: HOP3(check_at, -check_offset_min)
1208 * (except that min will be >= strpos)
1209 * So the fixed substr must lie somewhere between
1210 * HOP3(min, anchored_offset)
1211 * HOP3(max, anchored_offset) + SvCUR(substr)
1214 /* if "other" is floating
1215 * Calculate last1, the absolute latest point where the
1216 * floating substr could start in the string, ignoring any
1217 * constraints from the earlier fixed match. It is calculated
1220 * strend - prog->minlen (in chars) is the absolute latest
1221 * position within the string where the origin of the regex
1222 * could appear. The latest start point for the floating
1223 * substr is float_min_offset(*) on from the start of the
1224 * regex. last1 simply combines thee two offsets.
1226 * (*) You might think the latest start point should be
1227 * float_max_offset from the regex origin, and technically
1228 * you'd be correct. However, consider
1230 * Here, float min, max are 3,5 and minlen is 7.
1231 * This can match either
1235 * In the first case, the regex matches minlen chars; in the
1236 * second, minlen+1, in the third, minlen+2.
1237 * In the first case, the floating offset is 3 (which equals
1238 * float_min), in the second, 4, and in the third, 5 (which
1239 * equals float_max). In all cases, the floating string bcd
1240 * can never start more than 4 chars from the end of the
1241 * string, which equals minlen - float_min. As the substring
1242 * starts to match more than float_min from the start of the
1243 * regex, it makes the regex match more than minlen chars,
1244 * and the two cancel each other out. So we can always use
1245 * float_min - minlen, rather than float_max - minlen for the
1246 * latest position in the string.
1248 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1249 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1252 assert(prog->minlen >= other->min_offset);
1253 last1 = HOP3c(strend,
1254 other->min_offset - prog->minlen, strbeg);
1256 if (other_ix) {/* i.e. if (other-is-float) */
1257 /* last is the latest point where the floating substr could
1258 * start, *given* any constraints from the earlier fixed
1259 * match. This constraint is that the floating string starts
1260 * <= float_max_offset chars from the regex origin (rx_origin).
1261 * If this value is less than last1, use it instead.
1263 assert(rx_origin <= last1);
1265 /* this condition handles the offset==infinity case, and
1266 * is a short-cut otherwise. Although it's comparing a
1267 * byte offset to a char length, it does so in a safe way,
1268 * since 1 char always occupies 1 or more bytes,
1269 * so if a string range is (last1 - rx_origin) bytes,
1270 * it will be less than or equal to (last1 - rx_origin)
1271 * chars; meaning it errs towards doing the accurate HOP3
1272 * rather than just using last1 as a short-cut */
1273 (last1 - rx_origin) < other->max_offset
1275 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1278 assert(strpos + start_shift <= check_at);
1279 last = HOP4c(check_at, other->min_offset - start_shift,
1283 s = HOP3c(rx_origin, other->min_offset, strend);
1284 if (s < other_last) /* These positions already checked */
1287 must = utf8_target ? other->utf8_substr : other->substr;
1288 assert(SvPOK(must));
1291 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1299 (IV)(from - strbeg),
1305 (unsigned char*)from,
1308 multiline ? FBMrf_MULTILINE : 0
1310 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1311 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1312 (IV)(from - strbeg),
1314 (IV)(s ? s - strbeg : -1)
1320 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1321 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1322 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1323 s ? "Found" : "Contradicts",
1324 other_ix ? "floating" : "anchored",
1325 quoted, RE_SV_TAIL(must));
1330 /* last1 is latest possible substr location. If we didn't
1331 * find it before there, we never will */
1332 if (last >= last1) {
1333 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1334 "; giving up...\n"));
1338 /* try to find the check substr again at a later
1339 * position. Maybe next time we'll find the "other" substr
1341 other_last = HOP3c(last, 1, strend) /* highest failure */;
1343 other_ix /* i.e. if other-is-float */
1344 ? HOP3c(rx_origin, 1, strend)
1345 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1346 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1347 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1348 (other_ix ? "floating" : "anchored"),
1349 (long)(HOP3c(check_at, 1, strend) - strbeg),
1350 (IV)(rx_origin - strbeg)
1355 if (other_ix) { /* if (other-is-float) */
1356 /* other_last is set to s, not s+1, since its possible for
1357 * a floating substr to fail first time, then succeed
1358 * second time at the same floating position; e.g.:
1359 * "-AB--AABZ" =~ /\wAB\d*Z/
1360 * The first time round, anchored and float match at
1361 * "-(AB)--AAB(Z)" then fail on the initial \w character
1362 * class. Second time round, they match at "-AB--A(AB)(Z)".
1367 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1368 other_last = HOP3c(s, 1, strend);
1370 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371 " at offset %ld (rx_origin now %" IVdf ")...\n",
1373 (IV)(rx_origin - strbeg)
1379 DEBUG_OPTIMISE_MORE_r(
1380 Perl_re_printf( aTHX_
1381 " Check-only match: offset min:%" IVdf " max:%" IVdf
1382 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1383 " strend:%" IVdf "\n",
1384 (IV)prog->check_offset_min,
1385 (IV)prog->check_offset_max,
1386 (IV)(check_at-strbeg),
1387 (IV)(rx_origin-strbeg),
1388 (IV)(rx_origin-check_at),
1394 postprocess_substr_matches:
1396 /* handle the extra constraint of /^.../m if present */
1398 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1401 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1402 " looking for /^/m anchor"));
1404 /* we have failed the constraint of a \n before rx_origin.
1405 * Find the next \n, if any, even if it's beyond the current
1406 * anchored and/or floating substrings. Whether we should be
1407 * scanning ahead for the next \n or the next substr is debatable.
1408 * On the one hand you'd expect rare substrings to appear less
1409 * often than \n's. On the other hand, searching for \n means
1410 * we're effectively flipping between check_substr and "\n" on each
1411 * iteration as the current "rarest" candidate string, which
1412 * means for example that we'll quickly reject the whole string if
1413 * hasn't got a \n, rather than trying every substr position
1417 s = HOP3c(strend, - prog->minlen, strpos);
1418 if (s <= rx_origin ||
1419 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422 " Did not find /%s^%s/m...\n",
1423 PL_colors[0], PL_colors[1]));
1427 /* earliest possible origin is 1 char after the \n.
1428 * (since *rx_origin == '\n', it's safe to ++ here rather than
1429 * HOP(rx_origin, 1)) */
1432 if (prog->substrs->check_ix == 0 /* check is anchored */
1433 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1435 /* Position contradicts check-string; either because
1436 * check was anchored (and thus has no wiggle room),
1437 * or check was float and rx_origin is above the float range */
1438 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1440 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1444 /* if we get here, the check substr must have been float,
1445 * is in range, and we may or may not have had an anchored
1446 * "other" substr which still contradicts */
1447 assert(prog->substrs->check_ix); /* check is float */
1449 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1450 /* whoops, the anchored "other" substr exists, so we still
1451 * contradict. On the other hand, the float "check" substr
1452 * didn't contradict, so just retry the anchored "other"
1454 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1455 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1456 PL_colors[0], PL_colors[1],
1457 (IV)(rx_origin - strbeg + prog->anchored_offset),
1458 (IV)(rx_origin - strbeg)
1460 goto do_other_substr;
1463 /* success: we don't contradict the found floating substring
1464 * (and there's no anchored substr). */
1465 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466 " Found /%s^%s/m with rx_origin %ld...\n",
1467 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1470 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1471 " (multiline anchor test skipped)\n"));
1477 /* if we have a starting character class, then test that extra constraint.
1478 * (trie stclasses are too expensive to use here, we are better off to
1479 * leave it to regmatch itself) */
1481 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1482 const U8* const str = (U8*)STRING(progi->regstclass);
1484 /* XXX this value could be pre-computed */
1485 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1486 ? (reginfo->is_utf8_pat
1487 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1488 : (SSize_t)STR_LEN(progi->regstclass))
1492 /* latest pos that a matching float substr constrains rx start to */
1493 char *rx_max_float = NULL;
1495 /* if the current rx_origin is anchored, either by satisfying an
1496 * anchored substring constraint, or a /^.../m constraint, then we
1497 * can reject the current origin if the start class isn't found
1498 * at the current position. If we have a float-only match, then
1499 * rx_origin is constrained to a range; so look for the start class
1500 * in that range. if neither, then look for the start class in the
1501 * whole rest of the string */
1503 /* XXX DAPM it's not clear what the minlen test is for, and why
1504 * it's not used in the floating case. Nothing in the test suite
1505 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1506 * Here are some old comments, which may or may not be correct:
1508 * minlen == 0 is possible if regstclass is \b or \B,
1509 * and the fixed substr is ''$.
1510 * Since minlen is already taken into account, rx_origin+1 is
1511 * before strend; accidentally, minlen >= 1 guaranties no false
1512 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1513 * 0) below assumes that regstclass does not come from lookahead...
1514 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1515 * This leaves EXACTF-ish only, which are dealt with in
1519 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1520 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1521 else if (prog->float_substr || prog->float_utf8) {
1522 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1523 endpos = HOP3clim(rx_max_float, cl_l, strend);
1528 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1529 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1530 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1531 (IV)start_shift, (IV)(check_at - strbeg),
1532 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1534 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1537 if (endpos == strend) {
1538 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1539 " Could not match STCLASS...\n") );
1542 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1543 " This position contradicts STCLASS...\n") );
1544 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1545 && !(prog->intflags & PREGf_IMPLICIT))
1548 /* Contradict one of substrings */
1549 if (prog->anchored_substr || prog->anchored_utf8) {
1550 if (prog->substrs->check_ix == 1) { /* check is float */
1551 /* Have both, check_string is floating */
1552 assert(rx_origin + start_shift <= check_at);
1553 if (rx_origin + start_shift != check_at) {
1554 /* not at latest position float substr could match:
1555 * Recheck anchored substring, but not floating.
1556 * The condition above is in bytes rather than
1557 * chars for efficiency. It's conservative, in
1558 * that it errs on the side of doing 'goto
1559 * do_other_substr'. In this case, at worst,
1560 * an extra anchored search may get done, but in
1561 * practice the extra fbm_instr() is likely to
1562 * get skipped anyway. */
1563 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1564 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1565 (long)(other_last - strbeg),
1566 (IV)(rx_origin - strbeg)
1568 goto do_other_substr;
1576 /* In the presence of ml_anch, we might be able to
1577 * find another \n without breaking the current float
1580 /* strictly speaking this should be HOP3c(..., 1, ...),
1581 * but since we goto a block of code that's going to
1582 * search for the next \n if any, its safe here */
1584 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1585 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1586 PL_colors[0], PL_colors[1],
1587 (long)(rx_origin - strbeg)) );
1588 goto postprocess_substr_matches;
1591 /* strictly speaking this can never be true; but might
1592 * be if we ever allow intuit without substrings */
1593 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1596 rx_origin = rx_max_float;
1599 /* at this point, any matching substrings have been
1600 * contradicted. Start again... */
1602 rx_origin = HOP3c(rx_origin, 1, strend);
1604 /* uses bytes rather than char calculations for efficiency.
1605 * It's conservative: it errs on the side of doing 'goto restart',
1606 * where there is code that does a proper char-based test */
1607 if (rx_origin + start_shift + end_shift > strend) {
1608 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1609 " Could not match STCLASS...\n") );
1612 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1613 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1614 (prog->substrs->check_ix ? "floating" : "anchored"),
1615 (long)(rx_origin + start_shift - strbeg),
1616 (IV)(rx_origin - strbeg)
1623 if (rx_origin != s) {
1624 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625 " By STCLASS: moving %ld --> %ld\n",
1626 (long)(rx_origin - strbeg), (long)(s - strbeg))
1630 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1631 " Does not contradict STCLASS...\n");
1636 /* Decide whether using the substrings helped */
1638 if (rx_origin != strpos) {
1639 /* Fixed substring is found far enough so that the match
1640 cannot start at strpos. */
1642 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1643 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1646 /* The found rx_origin position does not prohibit matching at
1647 * strpos, so calling intuit didn't gain us anything. Decrement
1648 * the BmUSEFUL() count on the check substring, and if we reach
1650 if (!(prog->intflags & PREGf_NAUGHTY)
1652 prog->check_utf8 /* Could be deleted already */
1653 && --BmUSEFUL(prog->check_utf8) < 0
1654 && (prog->check_utf8 == prog->float_utf8)
1656 prog->check_substr /* Could be deleted already */
1657 && --BmUSEFUL(prog->check_substr) < 0
1658 && (prog->check_substr == prog->float_substr)
1661 /* If flags & SOMETHING - do not do it many times on the same match */
1662 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1663 /* XXX Does the destruction order has to change with utf8_target? */
1664 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1665 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1666 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1667 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1668 check = NULL; /* abort */
1669 /* XXXX This is a remnant of the old implementation. It
1670 looks wasteful, since now INTUIT can use many
1671 other heuristics. */
1672 prog->extflags &= ~RXf_USE_INTUIT;
1676 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1677 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1678 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1682 fail_finish: /* Substring not found */
1683 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1684 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1687 PL_colors[4], PL_colors[5]));
1692 #define DECL_TRIE_TYPE(scan) \
1693 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1694 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1695 trie_utf8l, trie_flu8, trie_flu8_latin } \
1696 trie_type = ((scan->flags == EXACT) \
1697 ? (utf8_target ? trie_utf8 : trie_plain) \
1698 : (scan->flags == EXACTL) \
1699 ? (utf8_target ? trie_utf8l : trie_plain) \
1700 : (scan->flags == EXACTFAA) \
1702 ? trie_utf8_exactfa_fold \
1703 : trie_latin_utf8_exactfa_fold) \
1704 : (scan->flags == EXACTFLU8 \
1707 : trie_flu8_latin) \
1710 : trie_latin_utf8_fold)))
1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1713 * 'foldbuf+sizeof(foldbuf)' */
1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1717 U8 flags = FOLD_FLAGS_FULL; \
1718 switch (trie_type) { \
1720 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1721 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1722 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1724 goto do_trie_utf8_fold; \
1725 case trie_utf8_exactfa_fold: \
1726 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1728 case trie_utf8_fold: \
1729 do_trie_utf8_fold: \
1730 if ( foldlen>0 ) { \
1731 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1736 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1738 len = UTF8_SAFE_SKIP(uc, uc_end); \
1739 skiplen = UVCHR_SKIP( uvc ); \
1740 foldlen -= skiplen; \
1741 uscan = foldbuf + skiplen; \
1744 case trie_flu8_latin: \
1745 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1746 goto do_trie_latin_utf8_fold; \
1747 case trie_latin_utf8_exactfa_fold: \
1748 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1750 case trie_latin_utf8_fold: \
1751 do_trie_latin_utf8_fold: \
1752 if ( foldlen>0 ) { \
1753 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1759 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1760 skiplen = UVCHR_SKIP( uvc ); \
1761 foldlen -= skiplen; \
1762 uscan = foldbuf + skiplen; \
1766 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1767 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1768 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1772 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1779 charid = trie->charmap[ uvc ]; \
1783 if (widecharmap) { \
1784 SV** const svpp = hv_fetch(widecharmap, \
1785 (char*)&uvc, sizeof(UV), 0); \
1787 charid = (U16)SvIV(*svpp); \
1792 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1793 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1794 startpos, doutf8, depth)
1796 #define REXEC_FBC_UTF8_SCAN(CODE) \
1798 while (s < strend) { \
1800 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1806 while (s < strend) { \
1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1814 while (s < strend) { \
1815 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
1821 while (s < strend) { \
1822 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1829 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1830 previous_occurrence_end = s; \
1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1840 previous_occurrence_end = s; \
1846 /* We keep track of where the next character should start after an occurrence
1847 * of the one we're looking for. Knowing that, we can see right away if the
1848 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1849 * don't accept the 2nd and succeeding adjacent occurrences */
1850 #define FBC_CHECK_AND_TRY \
1852 || s != previous_occurrence_end) \
1853 && ( reginfo->intuit \
1854 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1860 /* These differ from the above macros in that they call a function which
1861 * returns the next occurrence of the thing being looked for in 's'; and
1862 * 'strend' if there is no such occurrence. */
1863 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
1864 while (s < strend) { \
1866 if (s >= strend) { \
1872 previous_occurrence_end = s; \
1875 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1876 while (s < strend) { \
1878 if (s >= strend) { \
1884 previous_occurrence_end = s; \
1887 /* This differs from the above macros in that it is passed a single byte that
1888 * is known to begin the next occurrence of the thing being looked for in 's'.
1889 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1890 * at that position. */
1891 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1892 while (s < strend) { \
1893 s = (char *) memchr(s, byte, strend -s); \
1895 s = (char *) strend; \
1901 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1902 previous_occurrence_end = s; \
1909 /* The four macros below are slightly different versions of the same logic.
1911 * The first is for /a and /aa when the target string is UTF-8. This can only
1912 * match ascii, but it must advance based on UTF-8. The other three handle
1913 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
1914 * looking for the boundary (or non-boundary) between a word and non-word
1915 * character. The utf8 and non-utf8 cases have the same logic, but the details
1916 * must be different. Find the "wordness" of the character just prior to this
1917 * one, and compare it with the wordness of this one. If they differ, we have
1918 * a boundary. At the beginning of the string, pretend that the previous
1919 * character was a new-line.
1921 * All these macros uncleanly have side-effects with each other and outside
1922 * variables. So far it's been too much trouble to clean-up
1924 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1925 * a word character or not.
1926 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1928 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1930 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1931 * are looking for a boundary or for a non-boundary. If we are looking for a
1932 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1933 * see if this tentative match actually works, and if so, to quit the loop
1934 * here. And vice-versa if we are looking for a non-boundary.
1936 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1937 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1938 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1939 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1940 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1941 * complement. But in that branch we complement tmp, meaning that at the
1942 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1943 * which means at the top of the loop in the next iteration, it is
1944 * TEST_NON_UTF8(s-1) */
1945 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1946 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1947 tmp = TEST_NON_UTF8(tmp); \
1948 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1949 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1951 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1958 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1959 * TEST_UTF8 is a macro that for the same input code points returns identically
1960 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1961 * end pointer as well) */
1962 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1963 if (s == reginfo->strbeg) { \
1966 else { /* Back-up to the start of the previous character */ \
1967 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1968 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1969 0, UTF8_ALLOW_DEFAULT); \
1971 tmp = TEST_UV(tmp); \
1972 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
1973 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
1982 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
1983 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
1984 * set-up by the FBC_BOUND, etc macros below */
1985 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1987 /* Here, things have been set up by the previous code so that tmp is the \
1988 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
1989 * against the EOS, which we treat as a \n */ \
1990 if (tmp == ! TEST_NON_UTF8('\n')) { \
1997 /* Same as the macro above, but the target isn't UTF-8 */
1998 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1999 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2000 tmp = TEST_NON_UTF8(tmp); \
2001 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2002 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2010 /* Here, things have been set up by the previous code so that tmp is \
2011 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2012 * matches against the EOS, which we treat as a \n */ \
2013 if (tmp == ! TEST_NON_UTF8('\n')) { \
2020 /* This is the macro to use when we want to see if something that looks like it
2021 * could match, actually does, and if so exits the loop. It needs to be used
2022 * only for bounds checking macros, as it allows for matching beyond the end of
2023 * string (which should be zero length without having to look at the string
2025 #define REXEC_FBC_TRYIT \
2026 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
2029 /* The only difference between the BOUND and NBOUND cases is that
2030 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2031 * NBOUND. This is accomplished by passing it as either the if or else clause,
2032 * with the other one being empty (PLACEHOLDER is defined as empty).
2034 * The TEST_FOO parameters are for operating on different forms of input, but
2035 * all should be ones that return identically for the same underlying code
2038 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2039 FBC_BOUND_COMMON_UTF8( \
2040 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2041 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2043 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2044 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2046 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2047 FBC_BOUND_COMMON_UTF8( \
2048 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2049 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2051 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2052 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2054 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2055 FBC_BOUND_COMMON_UTF8( \
2056 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2057 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2059 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2060 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2062 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2063 FBC_BOUND_COMMON_UTF8( \
2064 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2065 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2067 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2068 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2072 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2073 IV cp_out = _invlist_search(invlist, cp_in);
2074 assert(cp_out >= 0);
2077 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2078 invmap[S_get_break_val_cp_checked(invlist, cp)]
2080 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2081 invmap[_invlist_search(invlist, cp)]
2084 /* Takes a pointer to an inversion list, a pointer to its corresponding
2085 * inversion map, and a code point, and returns the code point's value
2086 * according to the two arrays. It assumes that all code points have a value.
2087 * This is used as the base macro for macros for particular properties */
2088 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2089 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2091 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2092 * of a code point, returning the value for the first code point in the string.
2093 * And it takes the particular macro name that finds the desired value given a
2094 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2095 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2096 (__ASSERT_(pos < strend) \
2097 /* Note assumes is valid UTF-8 */ \
2098 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2100 /* Returns the GCB value for the input code point */
2101 #define getGCB_VAL_CP(cp) \
2102 _generic_GET_BREAK_VAL_CP( \
2107 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2108 * bounded by pos and strend */
2109 #define getGCB_VAL_UTF8(pos, strend) \
2110 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2112 /* Returns the LB value for the input code point */
2113 #define getLB_VAL_CP(cp) \
2114 _generic_GET_BREAK_VAL_CP( \
2119 /* Returns the LB value for the first code point in the UTF-8 encoded string
2120 * bounded by pos and strend */
2121 #define getLB_VAL_UTF8(pos, strend) \
2122 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2125 /* Returns the SB value for the input code point */
2126 #define getSB_VAL_CP(cp) \
2127 _generic_GET_BREAK_VAL_CP( \
2132 /* Returns the SB value for the first code point in the UTF-8 encoded string
2133 * bounded by pos and strend */
2134 #define getSB_VAL_UTF8(pos, strend) \
2135 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2137 /* Returns the WB value for the input code point */
2138 #define getWB_VAL_CP(cp) \
2139 _generic_GET_BREAK_VAL_CP( \
2144 /* Returns the WB value for the first code point in the UTF-8 encoded string
2145 * bounded by pos and strend */
2146 #define getWB_VAL_UTF8(pos, strend) \
2147 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2149 /* We know what class REx starts with. Try to find this position... */
2150 /* if reginfo->intuit, its a dryrun */
2151 /* annoyingly all the vars in this routine have different names from their counterparts
2152 in regmatch. /grrr */
2154 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2155 const char *strend, regmatch_info *reginfo)
2158 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2159 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2161 char *pat_string; /* The pattern's exactish string */
2162 char *pat_end; /* ptr to end char of pat_string */
2163 re_fold_t folder; /* Function for computing non-utf8 folds */
2164 const U8 *fold_array; /* array for folding ords < 256 */
2171 /* In some cases we accept only the first occurence of 'x' in a sequence of
2172 * them. This variable points to just beyond the end of the previous
2173 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2174 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2176 char * previous_occurrence_end = 0;
2178 I32 tmp; /* Scratch variable */
2179 const bool utf8_target = reginfo->is_utf8_target;
2180 UV utf8_fold_flags = 0;
2181 const bool is_utf8_pat = reginfo->is_utf8_pat;
2182 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2183 with a result inverts that result, as 0^1 =
2185 _char_class_number classnum;
2187 RXi_GET_DECL(prog,progi);
2189 PERL_ARGS_ASSERT_FIND_BYCLASS;
2191 /* We know what class it must start with. The case statements below have
2192 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2193 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2194 * ('p8' and 'pb'. */
2195 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2197 case ANYOFPOSIXL_t8_pb:
2198 case ANYOFPOSIXL_t8_p8:
2201 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2202 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2210 REXEC_FBC_UTF8_CLASS_SCAN(
2211 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2214 case ANYOFPOSIXL_tb_pb:
2215 case ANYOFPOSIXL_tb_p8:
2218 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2219 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2227 if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2228 /* We know that s is in the bitmap range since the target isn't
2229 * UTF-8, so what happens for out-of-range values is not relevant,
2230 * so exclude that from the flags */
2231 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2235 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2239 case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2241 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2242 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2247 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2248 * we do anyway for performance reasons, as otherwise we would have to
2249 * examine all the continuation characters */
2250 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2251 find_next_masked((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2256 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2257 find_span_end_mask((U8 *) s, (U8 *) strend, (U8) ARG(c), FLAGS(c)));
2261 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2263 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2264 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2265 (U8) ARG(c), FLAGS(c)));
2268 /* These nodes all require at least one code point to be in UTF-8 to
2278 case EXACTFLU8_tb_pb:
2279 case EXACTFLU8_tb_p8:
2280 case EXACTFU_REQ8_tb_pb:
2281 case EXACTFU_REQ8_tb_p8:
2286 REXEC_FBC_UTF8_CLASS_SCAN(
2287 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2288 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2294 /* We know what the first byte of any matched string should be. */
2295 U8 first_byte = FLAGS(c);
2297 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2298 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2304 REXEC_FBC_UTF8_CLASS_SCAN(
2305 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2306 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2307 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2308 && reginclass(prog, c, (U8*)s, (U8*) strend,
2314 REXEC_FBC_UTF8_CLASS_SCAN(
2315 ( strend -s >= FLAGS(c)
2316 && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2317 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2322 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2323 ANYOFRbase(c), ANYOFRdelta(c)));
2328 REXEC_FBC_UTF8_CLASS_SCAN(
2329 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2330 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2333 ANYOFRbase(c), ANYOFRdelta(c))));
2338 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2339 ANYOFRbase(c), ANYOFRdelta(c)));
2344 { /* We know what the first byte of any matched string should be */
2345 U8 first_byte = FLAGS(c);
2347 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2348 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2351 ANYOFRbase(c), ANYOFRdelta(c)));
2355 case EXACTFAA_tb_pb:
2357 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2358 * which these functions don't handle anyway */
2359 fold_array = PL_fold_latin1;
2360 folder = foldEQ_latin1_s2_folded;
2361 goto do_exactf_non_utf8;
2364 fold_array = PL_fold;
2366 goto do_exactf_non_utf8;
2369 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2371 if (IN_UTF8_CTYPE_LOCALE) {
2372 utf8_fold_flags = FOLDEQ_LOCALE;
2373 goto do_exactf_utf8;
2376 fold_array = PL_fold_locale;
2377 folder = foldEQ_locale;
2378 goto do_exactf_non_utf8;
2381 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2382 * don't have to worry here about this single special case in the
2384 fold_array = PL_fold_latin1;
2385 folder = foldEQ_latin1_s2_folded;
2389 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2390 are no glitches with fold-length differences
2391 between the target string and pattern */
2393 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2394 * character of the EXACTF* node and then, if necessary,
2395 * case-insensitively compare the full text of the node. c1 is the
2396 * first character. c2 is its fold. This logic will not work for
2397 * Unicode semantics and the german sharp ss, which hence should not be
2398 * compiled into a node that gets here. */
2399 pat_string = STRINGs(c);
2400 ln = STR_LENs(c); /* length to match in octets/bytes */
2402 /* We know that we have to match at least 'ln' bytes (which is the same
2403 * as characters, since not utf8). If we have to match 3 characters,
2404 * and there are only 2 availabe, we know without trying that it will
2405 * fail; so don't start a match past the required minimum number from
2407 e = HOP3c(strend, -((SSize_t)ln), s);
2412 c2 = fold_array[c1];
2413 if (c1 == c2) { /* If char and fold are the same */
2415 s = (char *) memchr(s, c1, e + 1 - s);
2420 /* Check that the rest of the node matches */
2421 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2422 && (reginfo->intuit || regtry(reginfo, &s)) )
2430 U8 bits_differing = c1 ^ c2;
2432 /* If the folds differ in one bit position only, we can mask to
2433 * match either of them, and can use this faster find method. Both
2434 * ASCII and EBCDIC tend to have their case folds differ in only
2435 * one position, so this is very likely */
2436 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2437 bits_differing = ~ bits_differing;
2439 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2440 (c1 & bits_differing), bits_differing);
2445 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2446 && (reginfo->intuit || regtry(reginfo, &s)) )
2453 else { /* Otherwise, stuck with looking byte-at-a-time. This
2454 should actually happen only in EXACTFL nodes */
2456 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2457 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2458 && (reginfo->intuit || regtry(reginfo, &s)) )
2468 case EXACTFAA_tb_p8:
2469 case EXACTFAA_t8_p8:
2470 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2471 |FOLDEQ_S2_ALREADY_FOLDED
2472 |FOLDEQ_S2_FOLDS_SANE;
2473 goto do_exactf_utf8;
2475 case EXACTFAA_NO_TRIE_tb_pb:
2476 case EXACTFAA_NO_TRIE_t8_pb:
2477 case EXACTFAA_t8_pb:
2479 /* Here, and elsewhere in this file, the reason we can't consider a
2480 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2481 * is because any MICRO SIGN in the pattern won't be folded. Since the
2482 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2483 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2484 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2485 goto do_exactf_utf8;
2490 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2491 utf8_fold_flags = FOLDEQ_LOCALE;
2492 goto do_exactf_utf8;
2494 case EXACTFLU8_t8_pb:
2495 case EXACTFLU8_t8_p8:
2496 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2497 | FOLDEQ_S2_FOLDS_SANE;
2498 goto do_exactf_utf8;
2500 case EXACTFU_REQ8_t8_p8:
2501 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2502 goto do_exactf_utf8;
2507 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2508 goto do_exactf_utf8;
2510 /* The following are problematic even though pattern isn't UTF-8. Use
2511 * full functionality normally not done except for UTF-8. */
2513 case EXACTFUP_tb_pb:
2514 case EXACTFUP_t8_pb:
2520 /* If one of the operands is in utf8, we can't use the simpler
2521 * folding above, due to the fact that many different characters
2522 * can have the same fold, or portion of a fold, or different-
2524 pat_string = STRINGs(c);
2525 ln = STR_LENs(c); /* length to match in octets/bytes */
2526 pat_end = pat_string + ln;
2527 lnc = is_utf8_pat /* length to match in characters */
2528 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2531 /* We have 'lnc' characters to match in the pattern, but because of
2532 * multi-character folding, each character in the target can match
2533 * up to 3 characters (Unicode guarantees it will never exceed
2534 * this) if it is utf8-encoded; and up to 2 if not (based on the
2535 * fact that the Latin 1 folds are already determined, and the only
2536 * multi-char fold in that range is the sharp-s folding to 'ss'.
2537 * Thus, a pattern character can match as little as 1/3 of a string
2538 * character. Adjust lnc accordingly, rounding up, so that if we
2539 * need to match at least 4+1/3 chars, that really is 5. */
2540 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2541 lnc = (lnc + expansion - 1) / expansion;
2543 /* As in the non-UTF8 case, if we have to match 3 characters, and
2544 * only 2 are left, it's guaranteed to fail, so don't start a match
2545 * that would require us to go beyond the end of the string */
2546 e = HOP3c(strend, -((SSize_t)lnc), s);
2548 /* XXX Note that we could recalculate e to stop the loop earlier,
2549 * as the worst case expansion above will rarely be met, and as we
2550 * go along we would usually find that e moves further to the left.
2551 * This would happen only after we reached the point in the loop
2552 * where if there were no expansion we should fail. Unclear if
2553 * worth the expense */
2556 char *my_strend= (char *)strend;
2557 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2558 pat_string, NULL, ln, is_utf8_pat,
2560 && (reginfo->intuit || regtry(reginfo, &s)) )
2564 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2571 case BOUND_tb_pb: /* /d without utf8 target is /a */
2573 /* regcomp.c makes sure that these only have the traditional \b
2575 assert(FLAGS(c) == TRADITIONAL_BOUND);
2577 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2580 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2582 /* regcomp.c makes sure that these only have the traditional \b
2584 assert(FLAGS(c) == TRADITIONAL_BOUND);
2586 FBC_BOUND_A_UTF8(isWORDCHAR_A);
2591 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2593 /* regcomp.c makes sure that these only have the traditional \b
2595 assert(FLAGS(c) == TRADITIONAL_BOUND);
2597 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2600 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2602 /* regcomp.c makes sure that these only have the traditional \b
2604 assert(FLAGS(c) == TRADITIONAL_BOUND);
2606 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2611 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2612 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2617 goto do_boundu_non_utf8;
2621 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2622 if (FLAGS(c) == TRADITIONAL_BOUND) {
2623 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2627 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2630 goto do_boundu_non_utf8;
2634 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2635 if (FLAGS(c) == TRADITIONAL_BOUND) {
2636 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2640 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2642 goto do_boundu_non_utf8;
2646 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2647 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2652 if (s == reginfo->strbeg) {
2653 if (reginfo->intuit || regtry(reginfo, &s))
2658 /* Didn't match. Try at the next position (if there is one) */
2660 if (UNLIKELY(s >= reginfo->strend)) {
2665 switch((bound_type) FLAGS(c)) {
2666 case TRADITIONAL_BOUND: /* Should have already been handled */
2671 /* Not utf8. Everything is a GCB except between CR and LF */
2672 while (s < strend) {
2673 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2674 || UCHARAT(s) != '\n'))
2675 && (reginfo->intuit || regtry(reginfo, &s)))
2686 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2687 while (s < strend) {
2688 LB_enum after = getLB_VAL_CP((U8) *s);
2689 if (to_complement ^ isLB(before,
2691 (U8*) reginfo->strbeg,
2693 (U8*) reginfo->strend,
2694 0 /* target not utf8 */ )
2695 && (reginfo->intuit || regtry(reginfo, &s)))
2708 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2709 while (s < strend) {
2710 SB_enum after = getSB_VAL_CP((U8) *s);
2711 if ((to_complement ^ isSB(before,
2713 (U8*) reginfo->strbeg,
2715 (U8*) reginfo->strend,
2716 0 /* target not utf8 */ ))
2717 && (reginfo->intuit || regtry(reginfo, &s)))
2730 WB_enum previous = WB_UNKNOWN;
2731 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2732 while (s < strend) {
2733 WB_enum after = getWB_VAL_CP((U8) *s);
2734 if ((to_complement ^ isWB(previous,
2737 (U8*) reginfo->strbeg,
2739 (U8*) reginfo->strend,
2740 0 /* target not utf8 */ ))
2741 && (reginfo->intuit || regtry(reginfo, &s)))
2752 /* Here are at the final position in the target string, which is a
2753 * boundary by definition, so matches, depending on other constraints.
2755 if ( reginfo->intuit
2756 || (s <= reginfo->strend && regtry(reginfo, &s)))
2765 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2766 if (FLAGS(c) == TRADITIONAL_BOUND) {
2767 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2768 isWORDCHAR_LC_utf8_safe);
2772 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2775 goto do_boundu_utf8;
2779 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2780 if (FLAGS(c) == TRADITIONAL_BOUND) {
2781 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2782 isWORDCHAR_LC_utf8_safe);
2786 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2789 goto do_boundu_utf8;
2793 /* regcomp.c makes sure that these only have the traditional \b
2795 assert(FLAGS(c) == TRADITIONAL_BOUND);
2801 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2802 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2803 isWORDCHAR_utf8_safe);
2808 goto do_boundu_utf8;
2812 /* regcomp.c makes sure that these only have the traditional \b
2814 assert(FLAGS(c) == TRADITIONAL_BOUND);
2820 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2821 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2826 if (s == reginfo->strbeg) {
2827 if (reginfo->intuit || regtry(reginfo, &s))
2832 /* Didn't match. Try at the next position (if there is one) */
2833 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2834 if (UNLIKELY(s >= reginfo->strend)) {
2839 switch((bound_type) FLAGS(c)) {
2840 case TRADITIONAL_BOUND: /* Should have already been handled */
2846 GCB_enum before = getGCB_VAL_UTF8(
2848 (U8*)(reginfo->strbeg)),
2849 (U8*) reginfo->strend);
2850 while (s < strend) {
2851 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2852 (U8*) reginfo->strend);
2853 if ( (to_complement ^ isGCB(before,
2855 (U8*) reginfo->strbeg,
2857 1 /* target is utf8 */ ))
2858 && (reginfo->intuit || regtry(reginfo, &s)))
2863 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2870 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2872 (U8*)(reginfo->strbeg)),
2873 (U8*) reginfo->strend);
2874 while (s < strend) {
2875 LB_enum after = getLB_VAL_UTF8((U8*) s,
2876 (U8*) reginfo->strend);
2877 if (to_complement ^ isLB(before,
2879 (U8*) reginfo->strbeg,
2881 (U8*) reginfo->strend,
2882 1 /* target is utf8 */ )
2883 && (reginfo->intuit || regtry(reginfo, &s)))
2888 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2896 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2898 (U8*)(reginfo->strbeg)),
2899 (U8*) reginfo->strend);
2900 while (s < strend) {
2901 SB_enum after = getSB_VAL_UTF8((U8*) s,
2902 (U8*) reginfo->strend);
2903 if ((to_complement ^ isSB(before,
2905 (U8*) reginfo->strbeg,
2907 (U8*) reginfo->strend,
2908 1 /* target is utf8 */ ))
2909 && (reginfo->intuit || regtry(reginfo, &s)))
2914 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2922 /* We are at a boundary between char_sub_0 and char_sub_1.
2923 * We also keep track of the value for char_sub_-1 as we
2924 * loop through the line. Context may be needed to make a
2925 * determination, and if so, this can save having to
2927 WB_enum previous = WB_UNKNOWN;
2928 WB_enum before = getWB_VAL_UTF8(
2931 (U8*)(reginfo->strbeg)),
2932 (U8*) reginfo->strend);
2933 while (s < strend) {
2934 WB_enum after = getWB_VAL_UTF8((U8*) s,
2935 (U8*) reginfo->strend);
2936 if ((to_complement ^ isWB(previous,
2939 (U8*) reginfo->strbeg,
2941 (U8*) reginfo->strend,
2942 1 /* target is utf8 */ ))
2943 && (reginfo->intuit || regtry(reginfo, &s)))
2949 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2954 /* Here are at the final position in the target string, which is a
2955 * boundary by definition, so matches, depending on other constraints.
2958 if ( reginfo->intuit
2959 || (s <= reginfo->strend && regtry(reginfo, &s)))
2967 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2972 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
2975 /* The argument to all the POSIX node types is the class number to pass
2976 * to _generic_isCC() to build a mask for searching in PL_charclass[] */
2985 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2986 REXEC_FBC_UTF8_CLASS_SCAN(
2987 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
2998 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2999 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3000 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3005 /* The complement of something that matches only ASCII matches all
3006 * non-ASCII, plus everything in ASCII that isn't in the class. */
3007 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3008 || ! _generic_isCC_A(*s, FLAGS(c)));
3013 /* Don't need to worry about utf8, as it can match only a single
3014 * byte invariant character. But we do anyway for performance reasons,
3015 * as otherwise we would have to examine all the continuation
3017 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3031 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3032 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
3042 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3043 to_complement ^ cBOOL(_generic_isCC(*s,
3058 classnum = (_char_class_number) FLAGS(c);
3061 REXEC_FBC_UTF8_CLASS_SCAN(
3062 to_complement ^ cBOOL(_invlist_contains_cp(
3063 PL_XPosix_ptrs[classnum],
3064 utf8_to_uvchr_buf((U8 *) s,
3069 case _CC_ENUM_SPACE:
3070 REXEC_FBC_UTF8_CLASS_SCAN(
3071 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3074 case _CC_ENUM_BLANK:
3075 REXEC_FBC_UTF8_CLASS_SCAN(
3076 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3079 case _CC_ENUM_XDIGIT:
3080 REXEC_FBC_UTF8_CLASS_SCAN(
3081 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3084 case _CC_ENUM_VERTSPACE:
3085 REXEC_FBC_UTF8_CLASS_SCAN(
3086 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3089 case _CC_ENUM_CNTRL:
3090 REXEC_FBC_UTF8_CLASS_SCAN(
3091 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3096 case AHOCORASICKC_tb_pb:
3097 case AHOCORASICKC_tb_p8:
3098 case AHOCORASICKC_t8_pb:
3099 case AHOCORASICKC_t8_p8:
3100 case AHOCORASICK_tb_pb:
3101 case AHOCORASICK_tb_p8:
3102 case AHOCORASICK_t8_pb:
3103 case AHOCORASICK_t8_p8:
3106 /* what trie are we using right now */
3107 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3108 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3109 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3111 const char *last_start = strend - trie->minlen;
3113 const char *real_start = s;
3115 STRLEN maxlen = trie->maxlen;
3117 U8 **points; /* map of where we were in the input string
3118 when reading a given char. For ASCII this
3119 is unnecessary overhead as the relationship
3120 is always 1:1, but for Unicode, especially
3121 case folded Unicode this is not true. */
3122 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3126 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3128 /* We can't just allocate points here. We need to wrap it in
3129 * an SV so it gets freed properly if there is a croak while
3130 * running the match */
3133 sv_points=newSV(maxlen * sizeof(U8 *));
3134 SvCUR_set(sv_points,
3135 maxlen * sizeof(U8 *));
3136 SvPOK_on(sv_points);
3137 sv_2mortal(sv_points);
3138 points=(U8**)SvPV_nolen(sv_points );
3139 if ( trie_type != trie_utf8_fold
3140 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3143 bitmap=(U8*)trie->bitmap;
3145 bitmap=(U8*)ANYOF_BITMAP(c);
3147 /* this is the Aho-Corasick algorithm modified a touch
3148 to include special handling for long "unknown char" sequences.
3149 The basic idea being that we use AC as long as we are dealing
3150 with a possible matching char, when we encounter an unknown char
3151 (and we have not encountered an accepting state) we scan forward
3152 until we find a legal starting char.
3153 AC matching is basically that of trie matching, except that when
3154 we encounter a failing transition, we fall back to the current
3155 states "fail state", and try the current char again, a process
3156 we repeat until we reach the root state, state 1, or a legal
3157 transition. If we fail on the root state then we can either
3158 terminate if we have reached an accepting state previously, or
3159 restart the entire process from the beginning if we have not.
3162 while (s <= last_start) {
3163 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3171 U8 *uscan = (U8*)NULL;
3172 U8 *leftmost = NULL;
3174 U32 accepted_word= 0;
3178 while ( state && uc <= (U8*)strend ) {
3180 U32 word = aho->states[ state ].wordnum;
3184 DEBUG_TRIE_EXECUTE_r(
3185 if ( uc <= (U8*)last_start
3186 && !BITMAP_TEST(bitmap,*uc) )
3188 dump_exec_pos( (char *)uc, c, strend,
3190 (char *)uc, utf8_target, 0 );
3191 Perl_re_printf( aTHX_
3192 " Scanning for legal start char...\n");
3196 while ( uc <= (U8*)last_start
3197 && !BITMAP_TEST(bitmap,*uc) )
3202 while ( uc <= (U8*)last_start
3203 && ! BITMAP_TEST(bitmap,*uc) )
3210 if (uc >(U8*)last_start) break;
3214 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3216 if (!leftmost || lpos < leftmost) {
3217 DEBUG_r(accepted_word=word);
3223 points[pointpos++ % maxlen]= uc;
3224 if (foldlen || uc < (U8*)strend) {
3225 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3226 (U8 *) strend, uscan, len, uvc,
3227 charid, foldlen, foldbuf,
3229 DEBUG_TRIE_EXECUTE_r({
3230 dump_exec_pos( (char *)uc, c, strend,
3231 real_start, s, utf8_target, 0);
3232 Perl_re_printf( aTHX_
3233 " Charid:%3u CP:%4" UVxf " ",
3245 word = aho->states[ state ].wordnum;
3247 base = aho->states[ state ].trans.base;
3249 DEBUG_TRIE_EXECUTE_r({
3251 dump_exec_pos((char *)uc, c, strend, real_start,
3252 s, utf8_target, 0 );
3253 Perl_re_printf( aTHX_
3254 "%sState: %4" UVxf ", word=%" UVxf,
3255 failed ? " Fail transition to " : "",
3256 (UV)state, (UV)word);
3262 ( ((offset = base + charid
3263 - 1 - trie->uniquecharcount)) >= 0)
3264 && ((U32)offset < trie->lasttrans)
3265 && trie->trans[offset].check == state
3266 && (tmp=trie->trans[offset].next))
3268 DEBUG_TRIE_EXECUTE_r(
3269 Perl_re_printf( aTHX_ " - legal\n"));
3274 DEBUG_TRIE_EXECUTE_r(
3275 Perl_re_printf( aTHX_ " - fail\n"));
3277 state = aho->fail[state];
3281 /* we must be accepting here */
3282 DEBUG_TRIE_EXECUTE_r(
3283 Perl_re_printf( aTHX_ " - accepting\n"));
3292 if (!state) state = 1;
3295 if ( aho->states[ state ].wordnum ) {
3296 U8 *lpos = points[ (pointpos
3297 - trie->wordinfo[aho->states[ state ]
3298 .wordnum].len) % maxlen ];
3299 if (!leftmost || lpos < leftmost) {
3300 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3305 s = (char*)leftmost;
3306 DEBUG_TRIE_EXECUTE_r({
3307 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3308 " at position %" IVdf ". Trying full"
3310 (UV)accepted_word, (IV)(s - real_start)
3313 if (reginfo->intuit || regtry(reginfo, &s)) {
3318 if (s < reginfo->strend) {
3321 DEBUG_TRIE_EXECUTE_r({
3322 Perl_re_printf( aTHX_
3323 "Pattern failed. Looking for new start"
3327 DEBUG_TRIE_EXECUTE_r(
3328 Perl_re_printf( aTHX_ "No match.\n"));
3337 case EXACTFU_REQ8_t8_pb:
3338 case EXACTFUP_tb_p8:
3339 case EXACTFUP_t8_p8:
3341 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3342 case EXACTFAA_NO_TRIE_tb_p8:
3343 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3348 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3349 } /* End of switch on node type */
3357 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3358 * flags have same meanings as with regexec_flags() */
3361 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3368 struct regexp *const prog = ReANY(rx);
3370 if (flags & REXEC_COPY_STR) {
3373 DEBUG_C(Perl_re_printf( aTHX_
3374 "Copy on write: regexp capture, type %d\n",
3376 /* Create a new COW SV to share the match string and store
3377 * in saved_copy, unless the current COW SV in saved_copy
3378 * is valid and suitable for our purpose */
3379 if (( prog->saved_copy
3380 && SvIsCOW(prog->saved_copy)
3381 && SvPOKp(prog->saved_copy)
3384 && SvPVX(sv) == SvPVX(prog->saved_copy)))
3386 /* just reuse saved_copy SV */
3387 if (RXp_MATCH_COPIED(prog)) {
3388 Safefree(prog->subbeg);
3389 RXp_MATCH_COPIED_off(prog);
3393 /* create new COW SV to share string */
3394 RXp_MATCH_COPY_FREE(prog);
3395 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3397 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3398 assert (SvPOKp(prog->saved_copy));
3399 prog->sublen = strend - strbeg;
3400 prog->suboffset = 0;
3401 prog->subcoffset = 0;
3406 SSize_t max = strend - strbeg;
3409 if ( (flags & REXEC_COPY_SKIP_POST)
3410 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3411 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3412 ) { /* don't copy $' part of string */
3415 /* calculate the right-most part of the string covered
3416 * by a capture. Due to lookahead, this may be to
3417 * the right of $&, so we have to scan all captures */
3418 while (n <= prog->lastparen) {
3419 if (prog->offs[n].end > max)
3420 max = prog->offs[n].end;
3424 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3425 ? prog->offs[0].start
3427 assert(max >= 0 && max <= strend - strbeg);
3430 if ( (flags & REXEC_COPY_SKIP_PRE)
3431 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3432 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3433 ) { /* don't copy $` part of string */
3436 /* calculate the left-most part of the string covered
3437 * by a capture. Due to lookbehind, this may be to
3438 * the left of $&, so we have to scan all captures */
3439 while (min && n <= prog->lastparen) {
3440 if ( prog->offs[n].start != -1
3441 && prog->offs[n].start < min)
3443 min = prog->offs[n].start;
3447 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3448 && min > prog->offs[0].end
3450 min = prog->offs[0].end;
3454 assert(min >= 0 && min <= max && min <= strend - strbeg);
3457 if (RXp_MATCH_COPIED(prog)) {
3458 if (sublen > prog->sublen)
3460 (char*)saferealloc(prog->subbeg, sublen+1);
3463 prog->subbeg = (char*)safemalloc(sublen+1);
3464 Copy(strbeg + min, prog->subbeg, sublen, char);
3465 prog->subbeg[sublen] = '\0';
3466 prog->suboffset = min;
3467 prog->sublen = sublen;
3468 RXp_MATCH_COPIED_on(prog);
3470 prog->subcoffset = prog->suboffset;
3471 if (prog->suboffset && utf8_target) {
3472 /* Convert byte offset to chars.
3473 * XXX ideally should only compute this if @-/@+
3474 * has been seen, a la PL_sawampersand ??? */
3476 /* If there's a direct correspondence between the
3477 * string which we're matching and the original SV,
3478 * then we can use the utf8 len cache associated with
3479 * the SV. In particular, it means that under //g,
3480 * sv_pos_b2u() will use the previously cached
3481 * position to speed up working out the new length of
3482 * subcoffset, rather than counting from the start of
3483 * the string each time. This stops
3484 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3485 * from going quadratic */
3486 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3487 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3488 SV_GMAGIC|SV_CONST_RETURN);
3490 prog->subcoffset = utf8_length((U8*)strbeg,
3491 (U8*)(strbeg+prog->suboffset));
3495 RXp_MATCH_COPY_FREE(prog);
3496 prog->subbeg = strbeg;
3497 prog->suboffset = 0;
3498 prog->subcoffset = 0;
3499 prog->sublen = strend - strbeg;
3507 - regexec_flags - match a regexp against a string
3510 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3511 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3512 /* stringarg: the point in the string at which to begin matching */
3513 /* strend: pointer to null at end of string */
3514 /* strbeg: real beginning of string */
3515 /* minend: end of match must be >= minend bytes after stringarg. */
3516 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3517 * itself is accessed via the pointers above */
3518 /* data: May be used for some additional optimizations.
3519 Currently unused. */
3520 /* flags: For optimizations. See REXEC_* in regexp.h */
3523 struct regexp *const prog = ReANY(rx);
3527 SSize_t minlen; /* must match at least this many chars */
3528 SSize_t dontbother = 0; /* how many characters not to try at end */
3529 const bool utf8_target = cBOOL(DO_UTF8(sv));
3531 RXi_GET_DECL(prog,progi);
3532 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3533 regmatch_info *const reginfo = ®info_buf;
3534 regexp_paren_pair *swap = NULL;
3536 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3538 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3539 PERL_UNUSED_ARG(data);
3541 /* Be paranoid... */
3543 Perl_croak(aTHX_ "NULL regexp parameter");
3547 debug_start_match(rx, utf8_target, stringarg, strend,
3551 startpos = stringarg;
3553 /* set these early as they may be used by the HOP macros below */
3554 reginfo->strbeg = strbeg;
3555 reginfo->strend = strend;
3556 reginfo->is_utf8_target = cBOOL(utf8_target);
3558 if (prog->intflags & PREGf_GPOS_SEEN) {
3561 /* set reginfo->ganch, the position where \G can match */
3564 (flags & REXEC_IGNOREPOS)
3565 ? stringarg /* use start pos rather than pos() */
3566 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3567 /* Defined pos(): */
3568 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3569 : strbeg; /* pos() not defined; use start of string */
3571 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3572 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3574 /* in the presence of \G, we may need to start looking earlier in
3575 * the string than the suggested start point of stringarg:
3576 * if prog->gofs is set, then that's a known, fixed minimum
3579 * /ab|c\G/: gofs = 1
3580 * or if the minimum offset isn't known, then we have to go back
3581 * to the start of the string, e.g. /w+\G/
3584 if (prog->intflags & PREGf_ANCH_GPOS) {
3586 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3588 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3590 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3591 "fail: ganch-gofs before earliest possible start\n"));
3596 startpos = reginfo->ganch;
3598 else if (prog->gofs) {
3599 startpos = HOPBACKc(startpos, prog->gofs);
3603 else if (prog->intflags & PREGf_GPOS_FLOAT)
3607 minlen = prog->minlen;
3608 if ((startpos + minlen) > strend || startpos < strbeg) {
3609 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3610 "Regex match can't succeed, so not even tried\n"));
3614 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3615 * which will call destuctors to reset PL_regmatch_state, free higher
3616 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3617 * regmatch_info_aux_eval */
3619 oldsave = PL_savestack_ix;
3623 if ((prog->extflags & RXf_USE_INTUIT)
3624 && !(flags & REXEC_CHECKED))
3626 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3631 if (prog->extflags & RXf_CHECK_ALL) {
3632 /* we can match based purely on the result of INTUIT.
3633 * Set up captures etc just for $& and $-[0]
3634 * (an intuit-only match wont have $1,$2,..) */
3635 assert(!prog->nparens);
3637 /* s/// doesn't like it if $& is earlier than where we asked it to
3638 * start searching (which can happen on something like /.\G/) */
3639 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3642 /* this should only be possible under \G */
3643 assert(prog->intflags & PREGf_GPOS_SEEN);
3644 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3645 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3649 /* match via INTUIT shouldn't have any captures.
3650 * Let @-, @+, $^N know */
3651 prog->lastparen = prog->lastcloseparen = 0;
3652 RXp_MATCH_UTF8_set(prog, utf8_target);
3653 prog->offs[0].start = s - strbeg;
3654 prog->offs[0].end = utf8_target
3655 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3656 : s - strbeg + prog->minlenret;
3657 if ( !(flags & REXEC_NOT_FIRST) )
3658 S_reg_set_capture_string(aTHX_ rx,
3660 sv, flags, utf8_target);
3666 multiline = prog->extflags & RXf_PMf_MULTILINE;
3668 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3669 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3670 "String too short [regexec_flags]...\n"));
3674 /* Check validity of program. */
3675 if (UCHARAT(progi->program) != REG_MAGIC) {
3676 Perl_croak(aTHX_ "corrupted regexp program");
3679 RXp_MATCH_TAINTED_off(prog);
3680 RXp_MATCH_UTF8_set(prog, utf8_target);
3682 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3683 reginfo->intuit = 0;
3684 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3685 reginfo->warned = FALSE;
3687 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3688 /* see how far we have to get to not match where we matched before */
3689 reginfo->till = stringarg + minend;
3691 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3692 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3693 S_cleanup_regmatch_info_aux has executed (registered by
3694 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3695 magic belonging to this SV.
3696 Not newSVsv, either, as it does not COW.
3698 reginfo->sv = newSV(0);
3699 SvSetSV_nosteal(reginfo->sv, sv);
3700 SAVEFREESV(reginfo->sv);
3703 /* reserve next 2 or 3 slots in PL_regmatch_state:
3704 * slot N+0: may currently be in use: skip it
3705 * slot N+1: use for regmatch_info_aux struct
3706 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3707 * slot N+3: ready for use by regmatch()
3711 regmatch_state *old_regmatch_state;
3712 regmatch_slab *old_regmatch_slab;
3713 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3715 /* on first ever match, allocate first slab */
3716 if (!PL_regmatch_slab) {
3717 Newx(PL_regmatch_slab, 1, regmatch_slab);
3718 PL_regmatch_slab->prev = NULL;
3719 PL_regmatch_slab->next = NULL;
3720 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3723 old_regmatch_state = PL_regmatch_state;
3724 old_regmatch_slab = PL_regmatch_slab;
3726 for (i=0; i <= max; i++) {
3728 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3730 reginfo->info_aux_eval =
3731 reginfo->info_aux->info_aux_eval =
3732 &(PL_regmatch_state->u.info_aux_eval);
3734 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3735 PL_regmatch_state = S_push_slab(aTHX);
3738 /* note initial PL_regmatch_state position; at end of match we'll
3739 * pop back to there and free any higher slabs */
3741 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3742 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3743 reginfo->info_aux->poscache = NULL;
3745 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3747 if ((prog->extflags & RXf_EVAL_SEEN))
3748 S_setup_eval_state(aTHX_ reginfo);
3750 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3753 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3754 /* We have to be careful. If the previous successful match
3755 was from this regex we don't want a subsequent partially
3756 successful match to clobber the old results.
3757 So when we detect this possibility we add a swap buffer
3758 to the re, and switch the buffer each match. If we fail,
3759 we switch it back; otherwise we leave it swapped.
3762 /* avoid leak if we die, or clean up anyway if match completes */
3764 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3765 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3766 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3774 if (prog->recurse_locinput)
3775 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3777 /* Simplest case: anchored match (but not \G) need be tried only once,
3778 * or with MBOL, only at the beginning of each line.
3780 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3781 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3782 * match at the start of the string then it won't match anywhere else
3783 * either; while with /.*.../, if it doesn't match at the beginning,
3784 * the earliest it could match is at the start of the next line */
3786 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3789 if (regtry(reginfo, &s))
3792 if (!(prog->intflags & PREGf_ANCH_MBOL))
3795 /* didn't match at start, try at other newline positions */
3798 dontbother = minlen - 1;
3799 end = HOP3c(strend, -dontbother, strbeg) - 1;
3801 /* skip to next newline */
3803 while (s <= end) { /* note it could be possible to match at the end of the string */
3804 /* NB: newlines are the same in unicode as they are in latin */
3807 if (prog->check_substr || prog->check_utf8) {
3808 /* note that with PREGf_IMPLICIT, intuit can only fail
3809 * or return the start position, so it's of limited utility.
3810 * Nevertheless, I made the decision that the potential for
3811 * quick fail was still worth it - DAPM */
3812 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3816 if (regtry(reginfo, &s))
3820 } /* end anchored search */
3822 /* anchored \G match */
3823 if (prog->intflags & PREGf_ANCH_GPOS)
3825 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3826 assert(prog->intflags & PREGf_GPOS_SEEN);
3827 /* For anchored \G, the only position it can match from is
3828 * (ganch-gofs); we already set startpos to this above; if intuit
3829 * moved us on from there, we can't possibly succeed */
3830 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3831 if (s == startpos && regtry(reginfo, &s))
3836 /* Messy cases: unanchored match. */
3838 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3839 /* we have /x+whatever/ */
3840 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3846 if (! prog->anchored_utf8) {
3847 to_utf8_substr(prog);
3849 ch = SvPVX_const(prog->anchored_utf8)[0];
3850 REXEC_FBC_UTF8_SCAN(
3852 DEBUG_EXECUTE_r( did_match = 1 );
3853 if (regtry(reginfo, &s)) goto got_it;
3854 s += UTF8_SAFE_SKIP(s, strend);
3855 while (s < strend && *s == ch)
3862 if (! prog->anchored_substr) {
3863 if (! to_byte_substr(prog)) {
3864 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3867 ch = SvPVX_const(prog->anchored_substr)[0];
3868 REXEC_FBC_NON_UTF8_SCAN(
3870 DEBUG_EXECUTE_r( did_match = 1 );
3871 if (regtry(reginfo, &s)) goto got_it;
3873 while (s < strend && *s == ch)
3878 DEBUG_EXECUTE_r(if (!did_match)
3879 Perl_re_printf( aTHX_
3880 "Did not find anchored character...\n")
3883 else if (prog->anchored_substr != NULL
3884 || prog->anchored_utf8 != NULL
3885 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3886 && prog->float_max_offset < strend - s)) {
3891 char *last1; /* Last position checked before */
3895 if (prog->anchored_substr || prog->anchored_utf8) {
3897 if (! prog->anchored_utf8) {
3898 to_utf8_substr(prog);
3900 must = prog->anchored_utf8;
3903 if (! prog->anchored_substr) {
3904 if (! to_byte_substr(prog)) {
3905 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3908 must = prog->anchored_substr;
3910 back_max = back_min = prog->anchored_offset;
3913 if (! prog->float_utf8) {
3914 to_utf8_substr(prog);
3916 must = prog->float_utf8;
3919 if (! prog->float_substr) {
3920 if (! to_byte_substr(prog)) {
3921 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3924 must = prog->float_substr;
3926 back_max = prog->float_max_offset;
3927 back_min = prog->float_min_offset;
3933 last = HOP3c(strend, /* Cannot start after this */
3934 -(SSize_t)(CHR_SVLEN(must)
3935 - (SvTAIL(must) != 0) + back_min), strbeg);
3937 if (s > reginfo->strbeg)
3938 last1 = HOPc(s, -1);
3940 last1 = s - 1; /* bogus */
3942 /* XXXX check_substr already used to find "s", can optimize if
3943 check_substr==must. */
3945 strend = HOPc(strend, -dontbother);
3946 while ( (s <= last) &&
3947 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3948 (unsigned char*)strend, must,
3949 multiline ? FBMrf_MULTILINE : 0)) ) {
3950 DEBUG_EXECUTE_r( did_match = 1 );
3951 if (HOPc(s, -back_max) > last1) {
3952 last1 = HOPc(s, -back_min);
3953 s = HOPc(s, -back_max);
3956 char * const t = (last1 >= reginfo->strbeg)
3957 ? HOPc(last1, 1) : last1 + 1;
3959 last1 = HOPc(s, -back_min);
3963 while (s <= last1) {
3964 if (regtry(reginfo, &s))
3967 s++; /* to break out of outer loop */
3974 while (s <= last1) {
3975 if (regtry(reginfo, &s))
3981 DEBUG_EXECUTE_r(if (!did_match) {
3982 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3983 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3984 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
3985 ((must == prog->anchored_substr || must == prog->anchored_utf8)
3986 ? "anchored" : "floating"),
3987 quoted, RE_SV_TAIL(must));
3991 else if ( (c = progi->regstclass) ) {
3993 const OPCODE op = OP(progi->regstclass);
3994 /* don't bother with what can't match */
3995 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3996 strend = HOPc(strend, -(minlen - 1));
3999 SV * const prop = sv_newmortal();
4000 regprop(prog, prop, c, reginfo, NULL);
4002 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4003 s,strend-s,PL_dump_re_max_len);
4004 Perl_re_printf( aTHX_
4005 "Matching stclass %.*s against %s (%d bytes)\n",
4006 (int)SvCUR(prop), SvPVX_const(prop),
4007 quoted, (int)(strend - s));
4010 if (find_byclass(prog, c, s, strend, reginfo))
4012 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
4016 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4024 if (! prog->float_utf8) {
4025 to_utf8_substr(prog);
4027 float_real = prog->float_utf8;
4030 if (! prog->float_substr) {
4031 if (! to_byte_substr(prog)) {
4032 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4035 float_real = prog->float_substr;
4038 little = SvPV_const(float_real, len);
4039 if (SvTAIL(float_real)) {
4040 /* This means that float_real contains an artificial \n on
4041 * the end due to the presence of something like this:
4042 * /foo$/ where we can match both "foo" and "foo\n" at the
4043 * end of the string. So we have to compare the end of the
4044 * string first against the float_real without the \n and
4045 * then against the full float_real with the string. We
4046 * have to watch out for cases where the string might be
4047 * smaller than the float_real or the float_real without
4049 char *checkpos= strend - len;
4051 Perl_re_printf( aTHX_
4052 "%sChecking for float_real.%s\n",
4053 PL_colors[4], PL_colors[5]));
4054 if (checkpos + 1 < strbeg) {
4055 /* can't match, even if we remove the trailing \n
4056 * string is too short to match */
4058 Perl_re_printf( aTHX_
4059 "%sString shorter than required trailing substring, cannot match.%s\n",
4060 PL_colors[4], PL_colors[5]));
4062 } else if (memEQ(checkpos + 1, little, len - 1)) {
4063 /* can match, the end of the string matches without the
4065 last = checkpos + 1;
4066 } else if (checkpos < strbeg) {
4067 /* cant match, string is too short when the "\n" is
4070 Perl_re_printf( aTHX_
4071 "%sString does not contain required trailing substring, cannot match.%s\n",
4072 PL_colors[4], PL_colors[5]));
4074 } else if (!multiline) {
4075 /* non multiline match, so compare with the "\n" at the
4076 * end of the string */
4077 if (memEQ(checkpos, little, len)) {
4081 Perl_re_printf( aTHX_
4082 "%sString does not contain required trailing substring, cannot match.%s\n",
4083 PL_colors[4], PL_colors[5]));
4087 /* multiline match, so we have to search for a place
4088 * where the full string is located */
4094 last = rninstr(s, strend, little, little + len);
4096 last = strend; /* matching "$" */
4099 /* at one point this block contained a comment which was
4100 * probably incorrect, which said that this was a "should not
4101 * happen" case. Even if it was true when it was written I am
4102 * pretty sure it is not anymore, so I have removed the comment
4103 * and replaced it with this one. Yves */
4105 Perl_re_printf( aTHX_
4106 "%sString does not contain required substring, cannot match.%s\n",
4107 PL_colors[4], PL_colors[5]
4111 dontbother = strend - last + prog->float_min_offset;
4113 if (minlen && (dontbother < minlen))
4114 dontbother = minlen - 1;
4115 strend -= dontbother; /* this one's always in bytes! */
4116 /* We don't know much -- general case. */
4119 if (regtry(reginfo, &s))
4128 if (regtry(reginfo, &s))
4130 } while (s++ < strend);
4138 /* s/// doesn't like it if $& is earlier than where we asked it to
4139 * start searching (which can happen on something like /.\G/) */
4140 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
4141 && (prog->offs[0].start < stringarg - strbeg))
4143 /* this should only be possible under \G */
4144 assert(prog->intflags & PREGf_GPOS_SEEN);
4145 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4146 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4150 /* clean up; this will trigger destructors that will free all slabs
4151 * above the current one, and cleanup the regmatch_info_aux
4152 * and regmatch_info_aux_eval sructs */
4154 LEAVE_SCOPE(oldsave);
4156 if (RXp_PAREN_NAMES(prog))
4157 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4159 /* make sure $`, $&, $', and $digit will work later */
4160 if ( !(flags & REXEC_NOT_FIRST) )
4161 S_reg_set_capture_string(aTHX_ rx,
4162 strbeg, reginfo->strend,
4163 sv, flags, utf8_target);
4168 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
4169 PL_colors[4], PL_colors[5]));
4172 /* we failed :-( roll it back.
4173 * Since the swap buffer will be freed on scope exit which follows
4174 * shortly, restore the old captures by copying 'swap's original
4175 * data to the new offs buffer
4177 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4178 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4185 Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4188 /* clean up; this will trigger destructors that will free all slabs
4189 * above the current one, and cleanup the regmatch_info_aux
4190 * and regmatch_info_aux_eval sructs */
4192 LEAVE_SCOPE(oldsave);
4198 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4199 * Do inc before dec, in case old and new rex are the same */
4200 #define SET_reg_curpm(Re2) \
4201 if (reginfo->info_aux_eval) { \
4202 (void)ReREFCNT_inc(Re2); \
4203 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
4204 PM_SETRE((PL_reg_curpm), (Re2)); \
4209 - regtry - try match at specific point
4211 STATIC bool /* 0 failure, 1 success */
4212 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4215 REGEXP *const rx = reginfo->prog;
4216 regexp *const prog = ReANY(rx);
4219 U32 depth = 0; /* used by REGCP_SET */
4221 RXi_GET_DECL(prog,progi);
4222 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4224 PERL_ARGS_ASSERT_REGTRY;
4226 reginfo->cutpoint=NULL;
4228 prog->offs[0].start = *startposp - reginfo->strbeg;
4229 prog->lastparen = 0;
4230 prog->lastcloseparen = 0;
4232 /* XXXX What this code is doing here?!!! There should be no need
4233 to do this again and again, prog->lastparen should take care of
4236 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4237 * Actually, the code in regcppop() (which Ilya may be meaning by
4238 * prog->lastparen), is not needed at all by the test suite
4239 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4240 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4241 * Meanwhile, this code *is* needed for the
4242 * above-mentioned test suite tests to succeed. The common theme
4243 * on those tests seems to be returning null fields from matches.
4244 * --jhi updated by dapm */
4246 /* After encountering a variant of the issue mentioned above I think
4247 * the point Ilya was making is that if we properly unwind whenever
4248 * we set lastparen to a smaller value then we should not need to do
4249 * this every time, only when needed. So if we have tests that fail if
4250 * we remove this, then it suggests somewhere else we are improperly
4251 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4252 * places it is called, and related regcp() routines. - Yves */
4254 if (prog->nparens) {
4255 regexp_paren_pair *pp = prog->offs;
4257 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4265 result = regmatch(reginfo, *startposp, progi->program + 1);
4267 prog->offs[0].end = result;
4270 if (reginfo->cutpoint)
4271 *startposp= reginfo->cutpoint;
4272 REGCP_UNWIND(lastcp);
4276 /* this is used to determine how far from the left messages like
4277 'failed...' are printed in regexec.c. It should be set such that
4278 messages are inline with the regop output that created them.
4280 #define REPORT_CODE_OFF 29
4281 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4284 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4288 PerlIO *f= Perl_debug_log;
4289 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4290 va_start(ap, depth);
4291 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4292 result = PerlIO_vprintf(f, fmt, ap);
4296 #endif /* DEBUGGING */
4298 /* grab a new slab and return the first slot in it */
4300 STATIC regmatch_state *
4303 regmatch_slab *s = PL_regmatch_slab->next;
4305 Newx(s, 1, regmatch_slab);
4306 s->prev = PL_regmatch_slab;
4308 PL_regmatch_slab->next = s;
4310 PL_regmatch_slab = s;
4311 return SLAB_FIRST(s);
4317 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4318 const char *start, const char *end, const char *blurb)
4320 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4322 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4327 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4328 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4330 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4331 start, end - start, PL_dump_re_max_len);
4333 Perl_re_printf( aTHX_
4334 "%s%s REx%s %s against %s\n",
4335 PL_colors[4], blurb, PL_colors[5], s0, s1);
4337 if (utf8_target||utf8_pat)
4338 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4339 utf8_pat ? "pattern" : "",
4340 utf8_pat && utf8_target ? " and " : "",
4341 utf8_target ? "string" : ""
4347 S_dump_exec_pos(pTHX_ const char *locinput,
4348 const regnode *scan,
4349 const char *loc_regeol,
4350 const char *loc_bostr,
4351 const char *loc_reg_starttry,
4352 const bool utf8_target,
4356 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4357 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4358 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4359 /* The part of the string before starttry has one color
4360 (pref0_len chars), between starttry and current
4361 position another one (pref_len - pref0_len chars),
4362 after the current position the third one.
4363 We assume that pref0_len <= pref_len, otherwise we
4364 decrease pref0_len. */
4365 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4366 ? (5 + taill) - l : locinput - loc_bostr;
4369 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4371 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4373 pref0_len = pref_len - (locinput - loc_reg_starttry);
4374 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4375 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4376 ? (5 + taill) - pref_len : loc_regeol - locinput);
4377 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4381 if (pref0_len > pref_len)
4382 pref0_len = pref_len;
4384 const int is_uni = utf8_target ? 1 : 0;
4386 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4387 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4389 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4390 (locinput - pref_len + pref0_len),
4391 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4393 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4394 locinput, loc_regeol - locinput, 10, 0, 1);
4396 const STRLEN tlen=len0+len1+len2;
4397 Perl_re_printf( aTHX_
4398 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4399 (IV)(locinput - loc_bostr),
4402 (docolor ? "" : "> <"),
4404 (int)(tlen > 19 ? 0 : 19 - tlen),
4412 /* reg_check_named_buff_matched()
4413 * Checks to see if a named buffer has matched. The data array of
4414 * buffer numbers corresponding to the buffer is expected to reside
4415 * in the regexp->data->data array in the slot stored in the ARG() of
4416 * node involved. Note that this routine doesn't actually care about the
4417 * name, that information is not preserved from compilation to execution.
4418 * Returns the index of the leftmost defined buffer with the given name
4419 * or 0 if non of the buffers matched.
4422 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4425 RXi_GET_DECL(rex,rexi);
4426 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4427 I32 *nums=(I32*)SvPVX(sv_dat);
4429 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4431 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4432 if ((I32)rex->lastparen >= nums[n] &&
4433 rex->offs[nums[n]].end != -1)
4442 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4443 struct next_matchable_info * m,
4444 regmatch_info *reginfo)
4446 /* This function determines various characteristics about every possible
4447 * initial match of the passed-in EXACTish <text_node>, and stores them in
4450 * That includes a match string and a parallel mask, such that if you AND
4451 * the target string with the mask and compare with the match string,
4452 * you'll have a pretty good idea, perhaps even perfect, if that portion of
4453 * the target matches or not.
4455 * The motivation behind this function is to allow the caller to set up
4456 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where
4457 * B is an arbitrary EXACTish node. To find the end of .*, we look for the
4458 * beginning oF B, which is the passed in <text_node> That's where this
4459 * function comes in. The values it returns can quickly be used to rule
4460 * out many, or all, cases of possible matches not actually being the
4461 * beginning of B, <text_node>. It is also used in regrepeat() where we
4462 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently
4463 * determine where the span of 'A's stop.
4465 * If <text_node> is of type EXACT, there is only one possible character
4466 * that can match its first character, and so the situation is quite
4467 * simple. But things can get much more complicated if folding is
4468 * involved. It may be that the first character of an EXACTFish node
4469 * doesn't participate in any possible fold, e.g., punctuation, so it can
4470 * be matched only by itself. The vast majority of characters that are in
4471 * folds match just two things, their lower and upper-case equivalents.
4472 * But not all are like that; some have multiple possible matches, or match
4473 * sequences of more than one character. This function sorts all that out.
4475 * It returns information about all possibilities of what the first
4476 * character(s) of <text_node> could look like. Again, if <text_node> is a
4477 * plain EXACT node, that's just the actual first bytes of the first
4478 * character; but otherwise it is the bytes, that when masked, match all
4479 * possible combinations of all the initial bytes of all the characters
4480 * that could match, folded. (Actually, this is a slight over promise. It
4481 * handles only up to the initial 5 bytes, which is enough for all Unicode
4482 * characters, but not for all non-Unicode ones.)
4484 * Here's an example to clarify. Suppose the first character of
4485 * <text_node> is the letter 'C', and we are under /i matching. That means
4486 * 'c' also matches. The representations of these two characters differ in
4487 * just one bit, so the mask would be a zero in that position and ones in
4488 * the other 7. And the returned string would be the AND of these two
4489 * characters, and would be one byte long, since these characters are each
4490 * a single byte. ANDing the target <text_node> with this mask will yield
4491 * the returned string if and only if <text_node> begins with one of these
4492 * two characters. So, the function would also return that the definitive
4493 * length matched is 1 byte.
4495 * Now, suppose instead of the letter 'C', <text_node> begins with the
4496 * letter 'F'. The situation is much more complicated because there are
4497 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4498 * begins with 'f', and hence could match. We add these into the returned
4499 * string and mask, but the result isn't definitive; the caller has to
4500 * check further if its AND and compare pass. But the failure of that
4501 * compare will quickly rule out most possible inputs.
4503 * Much of this could be done in regcomp.c at compile time, except for
4504 * locale-dependent, and UTF-8 target dependent data. Extra data fields
4505 * could be used for one or the other eventualities.
4507 * If this function determines that no possible character in the target
4508 * string can match, it returns FALSE; otherwise TRUE. (The FALSE
4509 * situation occurs if the first character in <text_node> requires UTF-8 to
4510 * represent, and the target string isn't in UTF-8.)
4512 * Some analysis is in GH #18414, located at the time of this writing at:
4513 * https://github.com/Perl/perl5/issues/18414
4516 const bool utf8_target = reginfo->is_utf8_target;
4517 bool utf8_pat = reginfo->is_utf8_pat;
4519 PERL_UINT_FAST8_T i;
4521 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4523 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4524 U8 lengths[MAX_MATCHES] = { 0 };
4526 U8 index_of_longest = 0;
4528 U8 *pat = (U8*)STRING(text_node);
4529 Size_t pat_len = STR_LEN(text_node);
4530 U8 op = OP(text_node);
4532 U8 byte_mask[5] = {0};
4533 U8 byte_anded[5] = {0};
4535 /* There are some folds in Unicode to multiple characters. This will hold
4536 * such characters that could fold to the beginning of 'text_node' */
4537 UV multi_fold_from = 0;
4539 /* We may have to create a modified copy of the pattern */
4540 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4543 m->min_length = 255;
4546 /* Even if the first character in the node can match something in Latin1,
4547 * if there is anything in the node that can't, the match must fail */
4548 if (! utf8_target && isEXACT_REQ8(op)) {
4552 /* Define a temporary op for use in this function, using an existing one that
4553 * should never be a real op during execution */
4554 #define TURKISH PSEUDO
4556 /* What to do about these two nodes had to be deferred to runtime (which is
4557 * now). If the extra information we now have so indicates, turn them into
4559 if ( (op == EXACTF && utf8_target)
4560 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4562 if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4569 /* And certain situations are better handled if we create a modified
4570 * version of the pattern */
4571 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4572 specific problematic characters */
4573 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4575 /* The node could start with characters that are the first ones
4576 * of a multi-character fold. */
4578 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4579 if (multi_fold_from) {
4581 /* Here, they do form a sequence that matches the fold of a
4582 * single character. That single character then is a
4583 * possible match. Below we will look again at this, but
4584 * the code below is expecting every character in the
4585 * pattern to be folded, which the input isn't required to
4586 * be in this case. So, just fold the single character,
4587 * and the result will be in the expected form. */
4588 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4592 /* Turkish has a couple extra possibilities. */
4593 else if ( UNLIKELY(op == TURKISH)
4595 && isALPHA_FOLD_EQ(pat[0], 'f')
4596 && ( memBEGINs(pat + 1, pat_len - 1,
4597 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4599 && isALPHA_FOLD_EQ(pat[1], 'f')
4600 && memBEGINs(pat + 2, pat_len - 2,
4601 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4603 /* The macros for finding a multi-char fold don't include
4604 * the Turkish possibilities, in which U+130 folds to 'i'.
4605 * Hard-code these. It's very unlikely that Unicode will
4606 * ever add any others. */
4607 if (pat[1] == 'f') {
4609 Copy("ffi", mod_pat, pat_len, U8);
4613 Copy("fi", mod_pat, pat_len, U8);
4617 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat)
4618 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4619 && LIKELY(memNEs(pat, pat_len,
4620 LATIN_SMALL_LETTER_SHARP_S_UTF8))
4621 && (LIKELY(op != TURKISH || *pat != 'I')))
4623 /* For all cases of things between 0-255, except the ones
4624 * in the conditional above, the fold is just the lower
4625 * case, which is faster than the more general case. */
4626 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4632 else { /* Code point above 255, or needs special handling */
4633 _to_utf8_fold_flags(pat, pat + pat_len,
4635 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4640 else if /* Below is not a UTF-8 pattern; there's a somewhat different
4641 set of problematic characters */
4643 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4645 /* We may have to canonicalize a multi-char fold, as in the UTF-8
4647 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4651 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4652 mod_pat[0] = mod_pat[1] = 's';
4654 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4655 chars, and speeds copying */
4658 else if (LIKELY(op != TURKISH || *pat != 'I')) {
4659 mod_pat[0] = toLOWER_L1(*pat);
4664 else if /* Below isn't a node that we convert to UTF-8 */
4667 && op == EXACTFAA_NO_TRIE
4668 && *pat == LATIN_SMALL_LETTER_SHARP_S)
4670 /* A very special case. Folding U+DF goes to U+17F under /iaa. We
4671 * did this at compile time when the pattern was UTF-8 , but otherwise
4672 * we couldn't do it earlier, because it requires a UTF-8 target for
4673 * this match to be legal. */
4674 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4675 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4676 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4681 /* Here, we have taken care of the initial work for a few very problematic
4682 * situations, possibly creating a modified pattern.
4684 * Now ready for the general case. We build up all the possible things
4685 * that could match the first character of the pattern into the elements of
4688 * Everything generally matches at least itself. But if there is a
4689 * UTF8ness mismatch, we have to convert to that of the target string. */
4690 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
4691 matches[0][0] = pat[0];
4695 else if (utf8_target) {
4697 lengths[0] = UTF8SKIP(pat);
4698 Copy(pat, matches[0], lengths[0], U8);
4701 else { /* target is UTF-8, pattern isn't */
4702 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4703 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4708 else if (! utf8_pat) { /* Neither is UTF-8 */
4709 matches[0][0] = pat[0];
4713 else /* target isn't UTF-8; pattern is. No match possible unless the
4714 pattern's first character can fit in a byte */
4715 if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4717 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4722 /* Here we have taken care of any necessary node-type changes */
4725 m->max_length = lengths[0];
4726 m->min_length = lengths[0];
4729 /* For non-folding nodes, there are no other possible candidate matches,
4730 * but for foldable ones, we have to look further. */
4731 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4732 UV folded; /* The first character in the pattern, folded */
4733 U32 first_fold_from; /* A character that folds to it */
4734 const U32 * remaining_fold_froms; /* The remaining characters that
4735 fold to it, if any */
4736 Size_t folds_to_count; /* The total number of characters that fold to
4739 /* If the node begins with a sequence of more than one character that
4740 * together form the fold of a single character, it is called a
4741 * 'multi-character fold', and the normal functions don't handle this
4742 * case. We set 'multi_fold_from' to the single folded-from character,
4743 * which is handled in an extra iteration below */
4745 folded = valid_utf8_to_uvchr(pat, NULL);
4747 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4752 /* This may generate illegal combinations for things like EXACTF,
4753 * but rather than repeat the logic and exclude them here, all such
4754 * illegalities are checked for and skipped below in the loop */
4756 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4759 /* Everything matches at least itself; initialize to that because the
4760 * only the branches below that set it are the ones where the number
4764 /* There are a few special cases for locale-dependent nodes, where the
4765 * run-time context was needed before we could know what matched */
4766 if (UNLIKELY(op == EXACTFL) && folded < 256) {
4767 first_fold_from = PL_fold_locale[folded];
4769 else if ( op == EXACTFL && utf8_target && utf8_pat
4770 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4771 LATIN_SMALL_LETTER_LONG_S_UTF8))
4773 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4775 else if (UNLIKELY( op == TURKISH
4776 && ( isALPHA_FOLD_EQ(folded, 'i')
4778 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4779 LATIN_SMALL_LETTER_DOTLESS_I))))
4780 { /* Turkish folding requires special handling */
4782 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4783 else if (folded == 'I')
4784 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4785 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4786 first_fold_from = 'i';
4787 else first_fold_from = 'I';
4790 /* Here, isn't a special case: use the generic function to
4791 * calculate what folds to this */
4793 /* Look up what code points (besides itself) fold to 'folded';
4794 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4795 folds_to_count = _inverse_folds(folded, &first_fold_from,
4796 &remaining_fold_froms);
4799 /* Add each character that folds to 'folded' to the list of them,
4800 * subject to limitations based on the node type and target UTF8ness.
4801 * If there was a character that folded to multiple characters, do an
4802 * extra iteration for it. (Note the extra iteration if there is a
4803 * multi-character fold) */
4804 for (i = 0; i < folds_to_count
4805 + UNLIKELY(multi_fold_from != 0); i++)
4809 if (i >= folds_to_count) { /* Final iteration: handle the
4811 fold_from = multi_fold_from;
4814 fold_from = first_fold_from;
4816 else if (i < folds_to_count) {
4817 fold_from = remaining_fold_froms[i-1];
4820 if (folded == fold_from) { /* We already added the character
4825 /* EXACTF doesn't have any non-ascii folds */
4826 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4830 /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4832 if ( isASCII(folded) != isASCII(fold_from)
4833 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4839 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4840 * locale, but those have been converted to EXACTFU above) */
4842 && (folded < 256) != (fold_from < 256))
4847 /* If this triggers, it likely is because of the unlikely case
4848 * where a new Unicode standard has changed what MAX_MATCHES should
4850 assert(m->count < MAX_MATCHES);
4852 /* Add this character to the list of possible matches */
4854 uvchr_to_utf8(matches[m->count], fold_from);
4855 lengths[m->count] = UVCHR_SKIP(fold_from);
4858 else { /* Non-UTF8 target: no code point above 255 can appear in it
4860 if (fold_from > 255) {
4864 matches[m->count][0] = fold_from;
4865 lengths[m->count] = 1;
4869 /* Update min and mlengths */
4870 if (m->min_length > lengths[m->count-1]) {
4871 m->min_length = lengths[m->count-1];
4874 if (m->max_length < lengths[m->count-1]) {
4875 index_of_longest = m->count - 1;
4876 m->max_length = lengths[index_of_longest];
4878 } /* looped through each potential fold */
4880 /* If there is something that folded to an initial multi-character
4881 * fold, repeat, using it. This catches some edge cases. An example
4882 * of one is /ss/i when UTF-8 encoded. The function
4883 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4884 * (LATIN SMALL SHARP S). If it returned a list of characters, this
4885 * code wouldn't be needed. But since it doesn't, we have to look what
4886 * folds to the U+DF. In this case, U+1E9E does, and has to be added.
4888 if (multi_fold_from) {
4889 folded = multi_fold_from;
4890 multi_fold_from = 0;
4893 } /* End of finding things that participate in this fold */
4895 if (m->count == 0) { /* If nothing found, can't match */
4900 /* Have calculated all possible matches. Now calculate the mask and AND
4902 m->initial_exact = 0;
4903 m->initial_definitive = 0;
4906 unsigned int mask_ones = 0;
4907 unsigned int possible_ones = 0;
4910 /* For each byte that is in all possible matches ... */
4911 for (j = 0; j < MIN(m->min_length, 5); j++) {
4913 /* Initialize the accumulator for this byte */
4914 byte_mask[j] = 0xFF;
4915 byte_anded[j] = matches[0][j];
4917 /* Then the rest of the rows (folds). The mask is based on, like,
4918 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4919 * where they differ. */
4920 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4921 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]);
4922 byte_anded[j] &= matches[i][j];
4925 /* Keep track of the number of initial mask bytes that are all one
4926 * bits. The code calling this can use this number to know that
4927 * a string that matches this number of bytes in the pattern is an
4928 * exact match of that pattern for this number of bytes. But also
4929 * counted are the number of initial bytes that in total have a
4930 * single zero bit. If a string matches those, masked, it must be
4931 * one of two possibilites, both of which this function has
4932 * determined are legal. (But if that single 0 is one of the
4933 * initial bits for masking a UTF-8 start byte, that could
4934 * incorrectly lead to different length strings appearing to be
4935 * equivalent, so only do this optimization when the matchables are
4936 * all the same length. This was uncovered by testing
4938 if (m->min_length == m->max_length) {
4939 mask_ones += PL_bitcount[byte_mask[j]];
4941 if (mask_ones + 1 >= possible_ones) {
4942 m->initial_definitive++;
4943 if (mask_ones >= possible_ones) {
4951 /* The first byte is separate for speed */
4952 m->first_byte_mask = byte_mask[0];
4953 m->first_byte_anded = byte_anded[0];
4955 /* Then pack up to the next 4 bytes into a word */
4956 m->mask32 = m->anded32 = 0;
4957 for (i = 1; i < MIN(m->min_length, 5); i++) {
4959 U8 shift = (which - 1) * 8;
4960 m->mask32 |= (U32) byte_mask[i] << shift;
4961 m->anded32 |= (U32) byte_anded[i] << shift;
4964 /* Finally, take the match strings and place them sequentially into a
4965 * one-dimensional array. (This is done to save significant space in the
4966 * structure.) Sort so the longest (presumably the least likely) is last.
4967 * XXX When this gets moved to regcomp, may want to fully sort shortest
4968 * first, but above we generally used the folded code point first, and
4969 * those tend to be no longer than their upper case values, so this is
4970 * already pretty well sorted by size.
4972 * If the asserts fail, it's most likely because a new version of the
4973 * Unicode standard requires more space; simply increase the declaration
4977 U8 output_index = 0;
4979 if (m->count > 1) { /* No need to sort a single entry */
4980 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
4982 /* Keep the same order for all but the longest. (If the
4983 * asserts fail, it could be because m->matches is declared too
4984 * short, either because of a new Unicode release, or an
4985 * overlooked test case, or it could be a bug.) */
4986 if (i != index_of_longest) {
4987 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
4988 Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
4989 cur_pos += lengths[i];
4990 m->lengths[output_index++] = lengths[i];
4995 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
4996 Copy(matches[index_of_longest], m->matches + cur_pos,
4997 lengths[index_of_longest], U8);
4999 /* Place the longest match last */
5000 m->lengths[output_index] = lengths[index_of_longest];
5007 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */
5009 S_test_EXACTISH_ST(const char * loc,
5010 struct next_matchable_info info)
5012 /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5013 * bytes starting at 'loc' can match based on 'next_matchable_info' */
5017 /* Check the first byte */
5018 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5021 /* Pack the next up-to-4 bytes into a 32 bit word */
5022 switch (info.min_length) {
5024 input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5027 input32 |= (U8) loc[3] << 2 * 8;
5030 input32 |= (U8) loc[2] << 1 * 8;
5033 input32 |= (U8) loc[1];
5036 return TRUE; /* We already tested and passed the 0th byte */
5041 /* And AND that with the mask and compare that with the assembled ANDED
5043 return (input32 & info.mask32) == info.anded32;
5047 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5049 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5050 * between the inputs. See https://www.unicode.org/reports/tr29/. */
5052 PERL_ARGS_ASSERT_ISGCB;
5054 switch (GCB_table[before][after]) {
5061 case GCB_RI_then_RI:
5064 U8 * temp_pos = (U8 *) curpos;
5066 /* Do not break within emoji flag sequences. That is, do not
5067 * break between regional indicator (RI) symbols if there is an
5068 * odd number of RI characters before the break point.
5069 * GB12 sot (RI RI)* RI × RI
5070 * GB13 [^RI] (RI RI)* RI × RI */
5072 while (backup_one_GCB(strbeg,
5074 utf8_target) == GCB_Regional_Indicator)
5079 return RI_count % 2 != 1;
5082 case GCB_EX_then_EM:
5084 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
5086 U8 * temp_pos = (U8 *) curpos;
5090 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5092 while (prev == GCB_Extend);
5094 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5097 case GCB_Maybe_Emoji_NonBreak:
5101 /* Do not break within emoji modifier sequences or emoji zwj sequences.
5102 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5104 U8 * temp_pos = (U8 *) curpos;
5108 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5110 while (prev == GCB_Extend);
5112 return prev != GCB_ExtPict_XX;
5120 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5121 before, after, GCB_table[before][after]);
5128 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5132 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5134 if (*curpos < strbeg) {
5139 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5140 U8 * prev_prev_char_pos;
5142 if (! prev_char_pos) {
5146 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5147 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5148 *curpos = prev_char_pos;
5149 prev_char_pos = prev_prev_char_pos;
5152 *curpos = (U8 *) strbeg;
5157 if (*curpos - 2 < strbeg) {
5158 *curpos = (U8 *) strbeg;
5162 gcb = getGCB_VAL_CP(*(*curpos - 1));
5168 /* Combining marks attach to most classes that precede them, but this defines
5169 * the exceptions (from TR14) */
5170 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
5171 || prev == LB_Mandatory_Break \
5172 || prev == LB_Carriage_Return \
5173 || prev == LB_Line_Feed \
5174 || prev == LB_Next_Line \
5175 || prev == LB_Space \
5176 || prev == LB_ZWSpace))
5179 S_isLB(pTHX_ LB_enum before,
5181 const U8 * const strbeg,
5182 const U8 * const curpos,
5183 const U8 * const strend,
5184 const bool utf8_target)
5186 U8 * temp_pos = (U8 *) curpos;
5187 LB_enum prev = before;
5189 /* Is the boundary between 'before' and 'after' line-breakable?
5190 * Most of this is just a table lookup of a generated table from Unicode
5191 * rules. But some rules require context to decide, and so have to be
5192 * implemented in code */
5194 PERL_ARGS_ASSERT_ISLB;
5196 /* Rule numbers in the comments below are as of Unicode 9.0 */
5200 switch (LB_table[before][after]) {
5205 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5208 case LB_SP_foo + LB_BREAKABLE:
5209 case LB_SP_foo + LB_NOBREAK:
5210 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5212 /* When we have something following a SP, we have to look at the
5213 * context in order to know what to do.
5215 * SP SP should not reach here because LB7: Do not break before
5216 * spaces. (For two spaces in a row there is nothing that
5217 * overrides that) */
5218 assert(after != LB_Space);
5220 /* Here we have a space followed by a non-space. Mostly this is a
5221 * case of LB18: "Break after spaces". But there are complications
5222 * as the handling of spaces is somewhat tricky. They are in a
5223 * number of rules, which have to be applied in priority order, but
5224 * something earlier in the string can cause a rule to be skipped
5225 * and a lower priority rule invoked. A prime example is LB7 which
5226 * says don't break before a space. But rule LB8 (lower priority)
5227 * says that the first break opportunity after a ZW is after any
5228 * span of spaces immediately after it. If a ZW comes before a SP
5229 * in the input, rule LB8 applies, and not LB7. Other such rules
5230 * involve combining marks which are rules 9 and 10, but they may
5231 * override higher priority rules if they come earlier in the
5232 * string. Since we're doing random access into the middle of the
5233 * string, we have to look for rules that should get applied based
5234 * on both string position and priority. Combining marks do not
5235 * attach to either ZW nor SP, so we don't have to consider them
5238 * To check for LB8, we have to find the first non-space character
5239 * before this span of spaces */
5241 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5243 while (prev == LB_Space);
5245 /* LB8 Break before any character following a zero-width space,
5246 * even if one or more spaces intervene.
5248 * So if we have a ZW just before this span, and to get here this
5249 * is the final space in the span. */
5250 if (prev == LB_ZWSpace) {
5254 /* Here, not ZW SP+. There are several rules that have higher
5255 * priority than LB18 and can be resolved now, as they don't depend
5256 * on anything earlier in the string (except ZW, which we have
5257 * already handled). One of these rules is LB11 Do not break
5258 * before Word joiner, but we have specially encoded that in the
5259 * lookup table so it is caught by the single test below which
5260 * catches the other ones. */
5261 if (LB_table[LB_Space][after] - LB_SP_foo
5262 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5267 /* If we get here, we have to XXX consider combining marks. */
5268 if (prev == LB_Combining_Mark) {
5270 /* What happens with these depends on the character they
5273 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5275 while (prev == LB_Combining_Mark);
5277 /* Most times these attach to and inherit the characteristics
5278 * of that character, but not always, and when not, they are to
5279 * be treated as AL by rule LB10. */
5280 if (! LB_CM_ATTACHES_TO(prev)) {
5281 prev = LB_Alphabetic;
5285 /* Here, we have the character preceding the span of spaces all set
5286 * up. We follow LB18: "Break after spaces" unless the table shows
5287 * that is overriden */
5288 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5292 /* We don't know how to treat the CM except by looking at the first
5293 * non-CM character preceding it. ZWJ is treated as CM */
5295 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5297 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5299 /* Here, 'prev' is that first earlier non-CM character. If the CM
5300 * attatches to it, then it inherits the behavior of 'prev'. If it
5301 * doesn't attach, it is to be treated as an AL */
5302 if (! LB_CM_ATTACHES_TO(prev)) {
5303 prev = LB_Alphabetic;
5308 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5309 case LB_HY_or_BA_then_foo + LB_NOBREAK:
5311 /* LB21a Don't break after Hebrew + Hyphen.
5312 * HL (HY | BA) × */
5314 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5315 == LB_Hebrew_Letter)
5320 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5322 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5323 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5325 /* LB25a (PR | PO) × ( OP | HY )? NU */
5326 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5330 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5333 case LB_SY_or_IS_then_various + LB_BREAKABLE:
5334 case LB_SY_or_IS_then_various + LB_NOBREAK:
5336 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5338 LB_enum temp = prev;
5340 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5342 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5343 if (temp == LB_Numeric) {
5347 return LB_table[prev][after] - LB_SY_or_IS_then_various
5351 case LB_various_then_PO_or_PR + LB_BREAKABLE:
5352 case LB_various_then_PO_or_PR + LB_NOBREAK:
5354 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5356 LB_enum temp = prev;
5357 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5359 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5361 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5362 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5364 if (temp == LB_Numeric) {
5367 return LB_various_then_PO_or_PR;
5370 case LB_RI_then_RI + LB_NOBREAK:
5371 case LB_RI_then_RI + LB_BREAKABLE:
5375 /* LB30a Break between two regional indicator symbols if and
5376 * only if there are an even number of regional indicators
5377 * preceding the position of the break.
5379 * sot (RI RI)* RI × RI
5380 * [^RI] (RI RI)* RI × RI */
5382 while (backup_one_LB(strbeg,
5384 utf8_target) == LB_Regional_Indicator)
5389 return RI_count % 2 == 0;
5397 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5398 before, after, LB_table[before][after]);
5405 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5410 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5412 if (*curpos >= strend) {
5417 *curpos += UTF8SKIP(*curpos);
5418 if (*curpos >= strend) {
5421 lb = getLB_VAL_UTF8(*curpos, strend);
5425 if (*curpos >= strend) {
5428 lb = getLB_VAL_CP(**curpos);
5435 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5439 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5441 if (*curpos < strbeg) {
5446 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5447 U8 * prev_prev_char_pos;
5449 if (! prev_char_pos) {
5453 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5454 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5455 *curpos = prev_char_pos;
5456 prev_char_pos = prev_prev_char_pos;
5459 *curpos = (U8 *) strbeg;
5464 if (*curpos - 2 < strbeg) {
5465 *curpos = (U8 *) strbeg;
5469 lb = getLB_VAL_CP(*(*curpos - 1));
5476 S_isSB(pTHX_ SB_enum before,
5478 const U8 * const strbeg,
5479 const U8 * const curpos,
5480 const U8 * const strend,
5481 const bool utf8_target)
5483 /* returns a boolean indicating if there is a Sentence Boundary Break
5484 * between the inputs. See https://www.unicode.org/reports/tr29/ */
5486 U8 * lpos = (U8 *) curpos;
5487 bool has_para_sep = FALSE;
5488 bool has_sp = FALSE;
5490 PERL_ARGS_ASSERT_ISSB;
5492 /* Break at the start and end of text.
5495 But unstated in Unicode is don't break if the text is empty */
5496 if (before == SB_EDGE || after == SB_EDGE) {
5497 return before != after;
5500 /* SB 3: Do not break within CRLF. */
5501 if (before == SB_CR && after == SB_LF) {
5505 /* Break after paragraph separators. CR and LF are considered
5506 * so because Unicode views text as like word processing text where there
5507 * are no newlines except between paragraphs, and the word processor takes
5508 * care of wrapping without there being hard line-breaks in the text *./
5509 SB4. Sep | CR | LF ÷ */
5510 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5514 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5515 * (See Section 6.2, Replacing Ignore Rules.)
5516 SB5. X (Extend | Format)* → X */
5517 if (after == SB_Extend || after == SB_Format) {
5519 /* Implied is that the these characters attach to everything
5520 * immediately prior to them except for those separator-type
5521 * characters. And the rules earlier have already handled the case
5522 * when one of those immediately precedes the extend char */
5526 if (before == SB_Extend || before == SB_Format) {
5527 U8 * temp_pos = lpos;
5528 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5529 if ( backup != SB_EDGE
5538 /* Here, both 'before' and 'backup' are these types; implied is that we
5539 * don't break between them */
5540 if (backup == SB_Extend || backup == SB_Format) {
5545 /* Do not break after ambiguous terminators like period, if they are
5546 * immediately followed by a number or lowercase letter, if they are
5547 * between uppercase letters, if the first following letter (optionally
5548 * after certain punctuation) is lowercase, or if they are followed by
5549 * "continuation" punctuation such as comma, colon, or semicolon. For
5550 * example, a period may be an abbreviation or numeric period, and thus may
5551 * not mark the end of a sentence.
5553 * SB6. ATerm × Numeric */
5554 if (before == SB_ATerm && after == SB_Numeric) {
5558 /* SB7. (Upper | Lower) ATerm × Upper */
5559 if (before == SB_ATerm && after == SB_Upper) {
5560 U8 * temp_pos = lpos;
5561 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5562 if (backup == SB_Upper || backup == SB_Lower) {
5567 /* The remaining rules that aren't the final one, all require an STerm or
5568 * an ATerm after having backed up over some Close* Sp*, and in one case an
5569 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5570 * So do that backup now, setting flags if either Sp or a paragraph
5571 * separator are found */
5573 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5574 has_para_sep = TRUE;
5575 before = backup_one_SB(strbeg, &lpos, utf8_target);
5578 if (before == SB_Sp) {
5581 before = backup_one_SB(strbeg, &lpos, utf8_target);
5583 while (before == SB_Sp);
5586 while (before == SB_Close) {
5587 before = backup_one_SB(strbeg, &lpos, utf8_target);
5590 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5591 * most cases an STerm */
5592 if (before == SB_STerm || before == SB_ATerm) {
5594 /* So, here the lhs matches
5595 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5596 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5597 * The rules that apply here are:
5599 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5600 | LF | STerm | ATerm) )* Lower
5601 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5602 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5603 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5604 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5607 /* And all but SB11 forbid having seen a paragraph separator */
5608 if (! has_para_sep) {
5609 if (before == SB_ATerm) { /* SB8 */
5610 U8 * rpos = (U8 *) curpos;
5611 SB_enum later = after;
5613 while ( later != SB_OLetter
5614 && later != SB_Upper
5615 && later != SB_Lower
5619 && later != SB_STerm
5620 && later != SB_ATerm
5621 && later != SB_EDGE)
5623 later = advance_one_SB(&rpos, strend, utf8_target);
5625 if (later == SB_Lower) {
5630 if ( after == SB_SContinue /* SB8a */
5631 || after == SB_STerm
5632 || after == SB_ATerm)
5637 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5638 if ( after == SB_Close
5648 /* SB10. This and SB9 could probably be combined some way, but khw
5649 * has decided to follow the Unicode rule book precisely for
5650 * simplified maintenance */
5664 /* Otherwise, do not break.
5671 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5675 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5677 if (*curpos >= strend) {
5683 *curpos += UTF8SKIP(*curpos);
5684 if (*curpos >= strend) {
5687 sb = getSB_VAL_UTF8(*curpos, strend);
5688 } while (sb == SB_Extend || sb == SB_Format);
5693 if (*curpos >= strend) {
5696 sb = getSB_VAL_CP(**curpos);
5697 } while (sb == SB_Extend || sb == SB_Format);
5704 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5708 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5710 if (*curpos < strbeg) {
5715 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5716 if (! prev_char_pos) {
5720 /* Back up over Extend and Format. curpos is always just to the right
5721 * of the characater whose value we are getting */
5723 U8 * prev_prev_char_pos;
5724 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5727 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5728 *curpos = prev_char_pos;
5729 prev_char_pos = prev_prev_char_pos;
5732 *curpos = (U8 *) strbeg;
5735 } while (sb == SB_Extend || sb == SB_Format);
5739 if (*curpos - 2 < strbeg) {
5740 *curpos = (U8 *) strbeg;
5744 sb = getSB_VAL_CP(*(*curpos - 1));
5745 } while (sb == SB_Extend || sb == SB_Format);
5752 S_isWB(pTHX_ WB_enum previous,
5755 const U8 * const strbeg,
5756 const U8 * const curpos,
5757 const U8 * const strend,
5758 const bool utf8_target)
5760 /* Return a boolean as to if the boundary between 'before' and 'after' is
5761 * a Unicode word break, using their published algorithm, but tailored for
5762 * Perl by treating spans of white space as one unit. Context may be
5763 * needed to make this determination. If the value for the character
5764 * before 'before' is known, it is passed as 'previous'; otherwise that
5765 * should be set to WB_UNKNOWN. The other input parameters give the
5766 * boundaries and current position in the matching of the string. That
5767 * is, 'curpos' marks the position where the character whose wb value is
5768 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5770 U8 * before_pos = (U8 *) curpos;
5771 U8 * after_pos = (U8 *) curpos;
5772 WB_enum prev = before;
5775 PERL_ARGS_ASSERT_ISWB;
5777 /* Rule numbers in the comments below are as of Unicode 9.0 */
5781 switch (WB_table[before][after]) {
5788 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5789 next = advance_one_WB(&after_pos, strend, utf8_target,
5790 FALSE /* Don't skip Extend nor Format */ );
5791 /* A space immediately preceeding an Extend or Format is attached
5792 * to by them, and hence gets separated from previous spaces.
5793 * Otherwise don't break between horizontal white space */
5794 return next == WB_Extend || next == WB_Format;
5796 /* WB4 Ignore Format and Extend characters, except when they appear at
5797 * the beginning of a region of text. This code currently isn't
5798 * general purpose, but it works as the rules are currently and likely
5799 * to be laid out. The reason it works is that when 'they appear at
5800 * the beginning of a region of text', the rule is to break before
5801 * them, just like any other character. Therefore, the default rule
5802 * applies and we don't have to look in more depth. Should this ever
5803 * change, we would have to have 2 'case' statements, like in the rules
5804 * below, and backup a single character (not spacing over the extend
5805 * ones) and then see if that is one of the region-end characters and
5807 case WB_Ex_or_FO_or_ZWJ_then_foo:
5808 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5811 case WB_DQ_then_HL + WB_BREAKABLE:
5812 case WB_DQ_then_HL + WB_NOBREAK:
5814 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5816 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5817 == WB_Hebrew_Letter)
5822 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5824 case WB_HL_then_DQ + WB_BREAKABLE:
5825 case WB_HL_then_DQ + WB_NOBREAK:
5827 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5829 if (advance_one_WB(&after_pos, strend, utf8_target,
5830 TRUE /* Do skip Extend and Format */ )
5831 == WB_Hebrew_Letter)
5836 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5838 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5839 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5841 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5842 * | Single_Quote) (ALetter | Hebrew_Letter) */
5844 next = advance_one_WB(&after_pos, strend, utf8_target,
5845 TRUE /* Do skip Extend and Format */ );
5847 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5852 return WB_table[before][after]
5853 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5855 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5856 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5858 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5859 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5861 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5862 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5867 return WB_table[before][after]
5868 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5870 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5871 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5873 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5876 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5882 return WB_table[before][after]
5883 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5885 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5886 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5888 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5890 if (advance_one_WB(&after_pos, strend, utf8_target,
5891 TRUE /* Do skip Extend and Format */ )
5897 return WB_table[before][after]
5898 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5900 case WB_RI_then_RI + WB_NOBREAK:
5901 case WB_RI_then_RI + WB_BREAKABLE:
5905 /* Do not break within emoji flag sequences. That is, do not
5906 * break between regional indicator (RI) symbols if there is an
5907 * odd number of RI characters before the potential break
5910 * WB15 sot (RI RI)* RI × RI
5911 * WB16 [^RI] (RI RI)* RI × RI */
5913 while (backup_one_WB(&previous,
5916 utf8_target) == WB_Regional_Indicator)
5921 return RI_count % 2 != 1;
5929 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5930 before, after, WB_table[before][after]);
5937 S_advance_one_WB(pTHX_ U8 ** curpos,
5938 const U8 * const strend,
5939 const bool utf8_target,
5940 const bool skip_Extend_Format)
5944 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5946 if (*curpos >= strend) {
5952 /* Advance over Extend and Format */
5954 *curpos += UTF8SKIP(*curpos);
5955 if (*curpos >= strend) {
5958 wb = getWB_VAL_UTF8(*curpos, strend);
5959 } while ( skip_Extend_Format
5960 && (wb == WB_Extend || wb == WB_Format));
5965 if (*curpos >= strend) {
5968 wb = getWB_VAL_CP(**curpos);
5969 } while ( skip_Extend_Format
5970 && (wb == WB_Extend || wb == WB_Format));
5977 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5981 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5983 /* If we know what the previous character's break value is, don't have
5985 if (*previous != WB_UNKNOWN) {
5988 /* But we need to move backwards by one */
5990 *curpos = reghopmaybe3(*curpos, -1, strbeg);
5992 *previous = WB_EDGE;
5993 *curpos = (U8 *) strbeg;
5996 *previous = WB_UNKNOWN;
6001 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6004 /* And we always back up over these three types */
6005 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6010 if (*curpos < strbeg) {
6015 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6016 if (! prev_char_pos) {
6020 /* Back up over Extend and Format. curpos is always just to the right
6021 * of the characater whose value we are getting */
6023 U8 * prev_prev_char_pos;
6024 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6028 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6029 *curpos = prev_char_pos;
6030 prev_char_pos = prev_prev_char_pos;
6033 *curpos = (U8 *) strbeg;
6036 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6040 if (*curpos - 2 < strbeg) {
6041 *curpos = (U8 *) strbeg;
6045 wb = getWB_VAL_CP(*(*curpos - 1));
6046 } while (wb == WB_Extend || wb == WB_Format);
6052 /* Macros for regmatch(), using its internal variables */
6053 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6054 #define NEXTCHR_IS_EOS (nextbyte < 0)
6056 #define SET_nextchr \
6057 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6059 #define SET_locinput(p) \
6063 #define sayYES goto yes
6064 #define sayNO goto no
6065 #define sayNO_SILENT goto no_silent
6067 /* we dont use STMT_START/END here because it leads to
6068 "unreachable code" warnings, which are bogus, but distracting. */
6069 #define CACHEsayNO \
6070 if (ST.cache_mask) \
6071 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6074 #define EVAL_CLOSE_PAREN_IS(st,expr) \
6077 ( ( st )->u.eval.close_paren ) && \
6078 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6081 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
6084 ( ( st )->u.eval.close_paren ) && \
6086 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6090 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6091 (st)->u.eval.close_paren = ( (expr) + 1 )
6093 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6094 (st)->u.eval.close_paren = 0
6096 /* push a new state then goto it */
6098 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
6099 pushinput = input; \
6103 st->resume_state = state; \
6106 /* push a new state with success backtracking, then goto it */
6108 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
6109 pushinput = input; \
6113 st->resume_state = state; \
6114 goto push_yes_state;
6116 #define DEBUG_STATE_pp(pp) \
6118 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6119 Perl_re_printf( aTHX_ \
6120 "%*s" pp " %s%s%s%s%s\n", \
6121 INDENT_CHARS(depth), "", \
6122 PL_reg_name[st->resume_state], \
6123 ((st==yes_state||st==mark_state) ? "[" : ""), \
6124 ((st==yes_state) ? "Y" : ""), \
6125 ((st==mark_state) ? "M" : ""), \
6126 ((st==yes_state||st==mark_state) ? "]" : "") \
6132 regmatch() - main matching routine
6134 This is basically one big switch statement in a loop. We execute an op,
6135 set 'next' to point the next op, and continue. If we come to a point which
6136 we may need to backtrack to on failure such as (A|B|C), we push a
6137 backtrack state onto the backtrack stack. On failure, we pop the top
6138 state, and re-enter the loop at the state indicated. If there are no more
6139 states to pop, we return failure.
6141 Sometimes we also need to backtrack on success; for example /A+/, where
6142 after successfully matching one A, we need to go back and try to
6143 match another one; similarly for lookahead assertions: if the assertion
6144 completes successfully, we backtrack to the state just before the assertion
6145 and then carry on. In these cases, the pushed state is marked as
6146 'backtrack on success too'. This marking is in fact done by a chain of
6147 pointers, each pointing to the previous 'yes' state. On success, we pop to
6148 the nearest yes state, discarding any intermediate failure-only states.
6149 Sometimes a yes state is pushed just to force some cleanup code to be
6150 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6151 it to free the inner regex.
6153 Note that failure backtracking rewinds the cursor position, while
6154 success backtracking leaves it alone.
6156 A pattern is complete when the END op is executed, while a subpattern
6157 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6158 ops trigger the "pop to last yes state if any, otherwise return true"
6161 A common convention in this function is to use A and B to refer to the two
6162 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6163 the subpattern to be matched possibly multiple times, while B is the entire
6164 rest of the pattern. Variable and state names reflect this convention.
6166 The states in the main switch are the union of ops and failure/success of
6167 substates associated with that op. For example, IFMATCH is the op
6168 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6169 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6170 successfully matched A and IFMATCH_A_fail is a state saying that we have
6171 just failed to match A. Resume states always come in pairs. The backtrack
6172 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6173 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6174 on success or failure.
6176 The struct that holds a backtracking state is actually a big union, with
6177 one variant for each major type of op. The variable st points to the
6178 top-most backtrack struct. To make the code clearer, within each
6179 block of code we #define ST to alias the relevant union.
6181 Here's a concrete example of a (vastly oversimplified) IFMATCH
6187 #define ST st->u.ifmatch
6189 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6190 ST.foo = ...; // some state we wish to save
6192 // push a yes backtrack state with a resume value of
6193 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6195 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6198 case IFMATCH_A: // we have successfully executed A; now continue with B
6200 bar = ST.foo; // do something with the preserved value
6203 case IFMATCH_A_fail: // A failed, so the assertion failed
6204 ...; // do some housekeeping, then ...
6205 sayNO; // propagate the failure
6212 For any old-timers reading this who are familiar with the old recursive
6213 approach, the code above is equivalent to:
6215 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6224 ...; // do some housekeeping, then ...
6225 sayNO; // propagate the failure
6228 The topmost backtrack state, pointed to by st, is usually free. If you
6229 want to claim it, populate any ST.foo fields in it with values you wish to
6230 save, then do one of
6232 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6233 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6235 which sets that backtrack state's resume value to 'resume_state', pushes a
6236 new free entry to the top of the backtrack stack, then goes to 'node'.
6237 On backtracking, the free slot is popped, and the saved state becomes the
6238 new free state. An ST.foo field in this new top state can be temporarily
6239 accessed to retrieve values, but once the main loop is re-entered, it
6240 becomes available for reuse.
6242 Note that the depth of the backtrack stack constantly increases during the
6243 left-to-right execution of the pattern, rather than going up and down with
6244 the pattern nesting. For example the stack is at its maximum at Z at the
6245 end of the pattern, rather than at X in the following:
6247 /(((X)+)+)+....(Y)+....Z/
6249 The only exceptions to this are lookahead/behind assertions and the cut,
6250 (?>A), which pop all the backtrack states associated with A before
6253 Backtrack state structs are allocated in slabs of about 4K in size.
6254 PL_regmatch_state and st always point to the currently active state,
6255 and PL_regmatch_slab points to the slab currently containing
6256 PL_regmatch_state. The first time regmatch() is called, the first slab is
6257 allocated, and is never freed until interpreter destruction. When the slab
6258 is full, a new one is allocated and chained to the end. At exit from
6259 regmatch(), slabs allocated since entry are freed.
6261 In order to work with variable length lookbehinds, an upper limit is placed on
6262 lookbehinds which is set to where the match position is at the end of where the
6263 lookbehind would get to. Nothing in the lookbehind should match above that,
6264 except we should be able to look beyond if for things like \b, which need the
6265 next character in the string to be able to determine if this is a boundary or
6266 not. We also can't match the end of string/line unless we are also at the end
6267 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6268 that match a width, we have to add a condition that they are within the legal
6269 bounds of our window into the string.
6273 /* returns -1 on failure, $+[0] on success */
6275 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6277 const bool utf8_target = reginfo->is_utf8_target;
6278 const U32 uniflags = UTF8_ALLOW_DEFAULT;
6279 REGEXP *rex_sv = reginfo->prog;
6280 regexp *rex = ReANY(rex_sv);
6281 RXi_GET_DECL(rex,rexi);
6282 /* the current state. This is a cached copy of PL_regmatch_state */
6284 /* cache heavy used fields of st in registers */
6287 U32 n = 0; /* general value; init to avoid compiler warning */
6288 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
6289 SSize_t endref = 0; /* offset of end of backref when ln is start */
6290 char *locinput = startpos;
6291 char *loceol = reginfo->strend;
6292 char *pushinput; /* where to continue after a PUSH */
6293 char *pusheol; /* where to stop matching (loceol) after a PUSH */
6294 U8 *pushsr0; /* save starting pos of script run */
6295 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1
6298 bool result = 0; /* return value of S_regmatch */
6299 U32 depth = 0; /* depth of backtrack stack */
6300 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6301 const U32 max_nochange_depth =
6302 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6303 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6304 regmatch_state *yes_state = NULL; /* state to pop to on success of
6306 /* mark_state piggy backs on the yes_state logic so that when we unwind
6307 the stack on success we can update the mark_state as we go */
6308 regmatch_state *mark_state = NULL; /* last mark state we have seen */
6309 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6310 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
6312 bool no_final = 0; /* prevent failure from backtracking? */
6313 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
6314 char *startpoint = locinput;
6315 SV *popmark = NULL; /* are we looking for a mark? */
6316 SV *sv_commit = NULL; /* last mark name seen in failure */
6317 SV *sv_yes_mark = NULL; /* last mark name we have seen
6318 during a successful match */
6319 U32 lastopen = 0; /* last open we saw */
6320 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6321 SV* const oreplsv = GvSVn(PL_replgv);
6322 /* these three flags are set by various ops to signal information to
6323 * the very next op. They have a useful lifetime of exactly one loop
6324 * iteration, and are not preserved or restored by state pushes/pops
6326 bool sw = 0; /* the condition value in (?(cond)a|b) */
6327 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
6328 int logical = 0; /* the following EVAL is:
6332 or the following IFMATCH/UNLESSM is:
6333 false: plain (?=foo)
6334 true: used as a condition: (?(?=foo))
6336 PAD* last_pad = NULL;
6338 U8 gimme = G_SCALAR;
6339 CV *caller_cv = NULL; /* who called us */
6340 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
6341 U32 maxopenparen = 0; /* max '(' index seen so far */
6342 int to_complement; /* Invert the result? */
6343 _char_class_number classnum;
6344 bool is_utf8_pat = reginfo->is_utf8_pat;
6346 I32 orig_savestack_ix = PL_savestack_ix;
6347 U8 * script_run_begin = NULL;
6349 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6350 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6351 # define SOLARIS_BAD_OPTIMIZER
6352 const U32 *pl_charclass_dup = PL_charclass;
6353 # define PL_charclass pl_charclass_dup
6357 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6360 /* protect against undef(*^R) */
6361 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6363 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6364 multicall_oldcatch = 0;
6365 PERL_UNUSED_VAR(multicall_cop);
6367 PERL_ARGS_ASSERT_REGMATCH;
6369 st = PL_regmatch_state;
6371 /* Note that nextbyte is a byte even in UTF */
6375 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6376 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6377 Perl_re_printf( aTHX_ "regmatch start\n" );
6380 while (scan != NULL) {
6381 next = scan + NEXT_OFF(scan);
6384 state_num = OP(scan);
6388 if (state_num <= REGNODE_MAX) {
6389 SV * const prop = sv_newmortal();
6390 regnode *rnext = regnext(scan);
6392 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6393 regprop(rex, prop, scan, reginfo, NULL);
6394 Perl_re_printf( aTHX_
6395 "%*s%" IVdf ":%s(%" IVdf ")\n",
6396 INDENT_CHARS(depth), "",
6397 (IV)(scan - rexi->program),
6399 (PL_regkind[OP(scan)] == END || !rnext) ?
6400 0 : (IV)(rnext - rexi->program));
6407 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6409 switch (state_num) {
6410 case SBOL: /* /^../ and /\A../ */
6411 if (locinput == reginfo->strbeg)
6415 case MBOL: /* /^../m */
6416 if (locinput == reginfo->strbeg ||
6417 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6424 if (locinput == reginfo->ganch)
6428 case KEEPS: /* \K */
6429 /* update the startpoint */
6430 st->u.keeper.val = rex->offs[0].start;
6431 rex->offs[0].start = locinput - reginfo->strbeg;
6432 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6434 NOT_REACHED; /* NOTREACHED */
6436 case KEEPS_next_fail:
6437 /* rollback the start point change */
6438 rex->offs[0].start = st->u.keeper.val;
6440 NOT_REACHED; /* NOTREACHED */
6442 case MEOL: /* /..$/m */
6443 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6447 case SEOL: /* /..$/ */
6448 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6450 if (reginfo->strend - locinput > 1)
6455 if (!NEXTCHR_IS_EOS)
6459 case SANY: /* /./s */
6460 if (NEXTCHR_IS_EOS || locinput >= loceol)
6462 goto increment_locinput;
6464 case REG_ANY: /* /./ */
6466 || locinput >= loceol
6467 || nextbyte == '\n')
6471 goto increment_locinput;
6475 #define ST st->u.trie
6476 case TRIEC: /* (ab|cd) with known charclass */
6477 /* In this case the charclass data is available inline so
6478 we can fail fast without a lot of extra overhead.
6480 if ( ! NEXTCHR_IS_EOS
6481 && locinput < loceol
6482 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6485 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6486 depth, PL_colors[4], PL_colors[5])
6489 NOT_REACHED; /* NOTREACHED */
6492 case TRIE: /* (ab|cd) */
6493 /* the basic plan of execution of the trie is:
6494 * At the beginning, run though all the states, and
6495 * find the longest-matching word. Also remember the position
6496 * of the shortest matching word. For example, this pattern:
6499 * when matched against the string "abcde", will generate
6500 * accept states for all words except 3, with the longest
6501 * matching word being 4, and the shortest being 2 (with
6502 * the position being after char 1 of the string).
6504 * Then for each matching word, in word order (i.e. 1,2,4,5),
6505 * we run the remainder of the pattern; on each try setting
6506 * the current position to the character following the word,
6507 * returning to try the next word on failure.
6509 * We avoid having to build a list of words at runtime by
6510 * using a compile-time structure, wordinfo[].prev, which
6511 * gives, for each word, the previous accepting word (if any).
6512 * In the case above it would contain the mappings 1->2, 2->0,
6513 * 3->0, 4->5, 5->1. We can use this table to generate, from
6514 * the longest word (4 above), a list of all words, by
6515 * following the list of prev pointers; this gives us the
6516 * unordered list 4,5,1,2. Then given the current word we have
6517 * just tried, we can go through the list and find the
6518 * next-biggest word to try (so if we just failed on word 2,
6519 * the next in the list is 4).
6521 * Since at runtime we don't record the matching position in
6522 * the string for each word, we have to work that out for
6523 * each word we're about to process. The wordinfo table holds
6524 * the character length of each word; given that we recorded
6525 * at the start: the position of the shortest word and its
6526 * length in chars, we just need to move the pointer the
6527 * difference between the two char lengths. Depending on
6528 * Unicode status and folding, that's cheap or expensive.
6530 * This algorithm is optimised for the case where are only a
6531 * small number of accept states, i.e. 0,1, or maybe 2.
6532 * With lots of accepts states, and having to try all of them,
6533 * it becomes quadratic on number of accept states to find all
6538 /* what type of TRIE am I? (utf8 makes this contextual) */
6539 DECL_TRIE_TYPE(scan);
6541 /* what trie are we using right now */
6542 reg_trie_data * const trie
6543 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6544 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6545 U32 state = trie->startstate;
6547 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6548 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6551 && UTF8_IS_ABOVE_LATIN1(nextbyte)
6552 && scan->flags == EXACTL)
6554 /* We only output for EXACTL, as we let the folder
6555 * output this message for EXACTFLU8 to avoid
6557 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6563 || locinput >= loceol
6564 || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6566 if (trie->states[ state ].wordnum) {
6568 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
6569 depth, PL_colors[4], PL_colors[5])
6575 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6576 depth, PL_colors[4], PL_colors[5])
6583 U8 *uc = ( U8* )locinput;
6587 U8 *uscan = (U8*)NULL;
6588 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6589 U32 charcount = 0; /* how many input chars we have matched */
6590 U32 accepted = 0; /* have we seen any accepting states? */
6592 ST.jump = trie->jump;
6595 ST.longfold = FALSE; /* char longer if folded => it's harder */
6598 /* fully traverse the TRIE; note the position of the
6599 shortest accept state and the wordnum of the longest
6602 while ( state && uc <= (U8*)(loceol) ) {
6603 U32 base = trie->states[ state ].trans.base;
6607 wordnum = trie->states[ state ].wordnum;
6609 if (wordnum) { /* it's an accept state */
6612 /* record first match position */
6614 ST.firstpos = (U8*)locinput;
6619 ST.firstchars = charcount;
6622 if (!ST.nextword || wordnum < ST.nextword)
6623 ST.nextword = wordnum;
6624 ST.topword = wordnum;
6627 DEBUG_TRIE_EXECUTE_r({
6628 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6630 PerlIO_printf( Perl_debug_log,
6631 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6632 INDENT_CHARS(depth), "", PL_colors[4],
6633 (UV)state, (accepted ? 'Y' : 'N'));
6636 /* read a char and goto next state */
6637 if ( base && (foldlen || uc < (U8*)(loceol))) {
6639 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6640 (U8 *) loceol, uscan,
6641 len, uvc, charid, foldlen,
6648 base + charid - 1 - trie->uniquecharcount)) >= 0)
6650 && ((U32)offset < trie->lasttrans)
6651 && trie->trans[offset].check == state)
6653 state = trie->trans[offset].next;
6664 DEBUG_TRIE_EXECUTE_r(
6665 Perl_re_printf( aTHX_
6666 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6667 charid, uvc, (UV)state, PL_colors[5] );
6673 /* calculate total number of accept states */
6678 w = trie->wordinfo[w].prev;
6681 ST.accepted = accepted;
6685 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6687 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6689 goto trie_first_try; /* jump into the fail handler */
6691 NOT_REACHED; /* NOTREACHED */
6693 case TRIE_next_fail: /* we failed - try next alternative */
6697 /* undo any captures done in the tail part of a branch,
6699 * /(?:X(.)(.)|Y(.)).../
6700 * where the trie just matches X then calls out to do the
6701 * rest of the branch */
6702 REGCP_UNWIND(ST.cp);
6703 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6705 if (!--ST.accepted) {
6707 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6715 /* Find next-highest word to process. Note that this code
6716 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6719 U16 const nextword = ST.nextword;
6720 reg_trie_wordinfo * const wordinfo
6721 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6722 for (word=ST.topword; word; word=wordinfo[word].prev) {
6723 if (word > nextword && (!min || word < min))
6736 ST.lastparen = rex->lastparen;
6737 ST.lastcloseparen = rex->lastcloseparen;
6741 /* find start char of end of current word */
6743 U32 chars; /* how many chars to skip */
6744 reg_trie_data * const trie
6745 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6747 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6749 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6754 /* the hard option - fold each char in turn and find
6755 * its folded length (which may be different */
6756 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6764 /* XXX This assumes the length is well-formed, as
6765 * does the UTF8SKIP below */
6766 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6774 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6779 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6788 uc = utf8_hop(uc, chars);
6794 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6795 ? ST.jump[ST.nextword]
6799 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6807 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6808 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6810 NOT_REACHED; /* NOTREACHED */
6812 /* only one choice left - just continue */
6814 AV *const trie_words
6815 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6816 SV ** const tmp = trie_words
6817 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6818 SV *sv= tmp ? sv_newmortal() : NULL;
6820 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6821 depth, PL_colors[4],
6823 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6824 PL_colors[0], PL_colors[1],
6825 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6827 : "not compiled under -Dr",
6831 locinput = (char*)uc;
6832 continue; /* execute rest of RE */
6838 if (! utf8_target) {
6848 ln = STR_LENl(scan);
6849 goto join_short_long_exact;
6851 case EXACTL: /* /abc/l */
6852 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6854 /* Complete checking would involve going through every character
6855 * matched by the string to see if any is above latin1. But the
6856 * comparision otherwise might very well be a fast assembly
6857 * language routine, and I (khw) don't think slowing things down
6858 * just to check for this warning is worth it. So this just checks
6859 * the first character */
6860 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6861 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6865 if (! utf8_target) {
6870 case EXACT: /* /abc/ */
6873 ln = STR_LENs(scan);
6875 join_short_long_exact:
6876 if (utf8_target != is_utf8_pat) {
6877 /* The target and the pattern have differing utf8ness. */
6879 const char * const e = s + ln;
6882 /* The target is utf8, the pattern is not utf8.
6883 * Above-Latin1 code points can't match the pattern;
6884 * invariants match exactly, and the other Latin1 ones need
6885 * to be downgraded to a single byte in order to do the
6886 * comparison. (If we could be confident that the target
6887 * is not malformed, this could be refactored to have fewer
6888 * tests by just assuming that if the first bytes match, it
6889 * is an invariant, but there are tests in the test suite
6890 * dealing with (??{...}) which violate this) */
6893 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6897 if (UTF8_IS_INVARIANT(*(U8*)l)) {
6904 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6914 /* The target is not utf8, the pattern is utf8. */
6917 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6921 if (UTF8_IS_INVARIANT(*(U8*)s)) {
6928 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6940 /* The target and the pattern have the same utf8ness. */
6941 /* Inline the first character, for speed. */
6942 if ( loceol - locinput < ln
6943 || UCHARAT(s) != nextbyte
6944 || (ln > 1 && memNE(s, locinput, ln)))
6953 case EXACTFL: /* /abc/il */
6956 const U8 * fold_array;
6958 U32 fold_utf8_flags;
6960 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6961 folder = foldEQ_locale;
6962 fold_array = PL_fold_locale;
6963 fold_utf8_flags = FOLDEQ_LOCALE;
6966 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
6967 is effectively /u; hence to match, target
6969 if (! utf8_target) {
6972 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6973 | FOLDEQ_S2_FOLDS_SANE;
6974 folder = foldEQ_latin1_s2_folded;
6975 fold_array = PL_fold_latin1;
6978 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
6979 if (! utf8_target) {
6982 assert(is_utf8_pat);
6983 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6986 case EXACTFUP: /* /foo/iu, and something is problematic in
6987 'foo' so can't take shortcuts. */
6988 assert(! is_utf8_pat);
6989 folder = foldEQ_latin1;
6990 fold_array = PL_fold_latin1;
6991 fold_utf8_flags = 0;
6994 case EXACTFU: /* /abc/iu */
6995 folder = foldEQ_latin1_s2_folded;
6996 fold_array = PL_fold_latin1;
6997 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7000 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
7002 assert(! is_utf8_pat);
7004 case EXACTFAA: /* /abc/iaa */
7005 folder = foldEQ_latin1_s2_folded;
7006 fold_array = PL_fold_latin1;
7007 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7008 if (is_utf8_pat || ! utf8_target) {
7010 /* The possible presence of a MICRO SIGN in the pattern forbids
7011 * us to view a non-UTF-8 pattern as folded when there is a
7013 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7014 |FOLDEQ_S2_FOLDS_SANE;
7019 case EXACTF: /* /abc/i This node only generated for
7020 non-utf8 patterns */
7021 assert(! is_utf8_pat);
7023 fold_array = PL_fold;
7024 fold_utf8_flags = 0;
7028 ln = STR_LENs(scan);
7032 || state_num == EXACTFUP
7033 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7035 /* Either target or the pattern are utf8, or has the issue where
7036 * the fold lengths may differ. */
7037 const char * const l = locinput;
7040 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
7041 s, 0, ln, is_utf8_pat,fold_utf8_flags))
7049 /* Neither the target nor the pattern are utf8 */
7050 if (UCHARAT(s) != nextbyte
7052 && UCHARAT(s) != fold_array[nextbyte])
7056 if (loceol - locinput < ln)
7058 if (ln > 1 && ! folder(locinput, s, ln))
7064 case NBOUNDL: /* /\B/l */
7068 case BOUNDL: /* /\b/l */
7071 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7073 if (FLAGS(scan) != TRADITIONAL_BOUND) {
7074 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7079 if (locinput == reginfo->strbeg)
7080 b1 = isWORDCHAR_LC('\n');
7082 U8 *p = reghop3((U8*)locinput, -1,
7083 (U8*)(reginfo->strbeg));
7084 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7086 b2 = (NEXTCHR_IS_EOS)
7087 ? isWORDCHAR_LC('\n')
7088 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7089 (U8*) reginfo->strend);
7091 else { /* Here the string isn't utf8 */
7092 b1 = (locinput == reginfo->strbeg)
7093 ? isWORDCHAR_LC('\n')
7094 : isWORDCHAR_LC(UCHARAT(locinput - 1));
7095 b2 = (NEXTCHR_IS_EOS)
7096 ? isWORDCHAR_LC('\n')
7097 : isWORDCHAR_LC(nextbyte);
7099 if (to_complement ^ (b1 == b2)) {
7105 case NBOUND: /* /\B/ */
7109 case BOUND: /* /\b/ */
7113 goto bound_ascii_match_only;
7115 case NBOUNDA: /* /\B/a */
7119 case BOUNDA: /* /\b/a */
7123 bound_ascii_match_only:
7124 /* Here the string isn't utf8, or is utf8 and only ascii characters
7125 * are to match \w. In the latter case looking at the byte just
7126 * prior to the current one may be just the final byte of a
7127 * multi-byte character. This is ok. There are two cases:
7128 * 1) it is a single byte character, and then the test is doing
7129 * just what it's supposed to.
7130 * 2) it is a multi-byte character, in which case the final byte is
7131 * never mistakable for ASCII, and so the test will say it is
7132 * not a word character, which is the correct answer. */
7133 b1 = (locinput == reginfo->strbeg)
7134 ? isWORDCHAR_A('\n')
7135 : isWORDCHAR_A(UCHARAT(locinput - 1));
7136 b2 = (NEXTCHR_IS_EOS)
7137 ? isWORDCHAR_A('\n')
7138 : isWORDCHAR_A(nextbyte);
7139 if (to_complement ^ (b1 == b2)) {
7145 case NBOUNDU: /* /\B/u */
7149 case BOUNDU: /* /\b/u */
7152 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7155 else if (utf8_target) {
7157 switch((bound_type) FLAGS(scan)) {
7158 case TRADITIONAL_BOUND:
7161 if (locinput == reginfo->strbeg) {
7162 b1 = 0 /* isWORDCHAR_L1('\n') */;
7165 U8 *p = reghop3((U8*)locinput, -1,
7166 (U8*)(reginfo->strbeg));
7168 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7170 b2 = (NEXTCHR_IS_EOS)
7171 ? 0 /* isWORDCHAR_L1('\n') */
7172 : isWORDCHAR_utf8_safe((U8*)locinput,
7173 (U8*) reginfo->strend);
7174 match = cBOOL(b1 != b2);
7178 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7179 match = TRUE; /* GCB always matches at begin and
7183 /* Find the gcb values of previous and current
7184 * chars, then see if is a break point */
7185 match = isGCB(getGCB_VAL_UTF8(
7186 reghop3((U8*)locinput,
7188 (U8*)(reginfo->strbeg)),
7189 (U8*) reginfo->strend),
7190 getGCB_VAL_UTF8((U8*) locinput,
7191 (U8*) reginfo->strend),
7192 (U8*) reginfo->strbeg,
7199 if (locinput == reginfo->strbeg) {
7202 else if (NEXTCHR_IS_EOS) {
7206 match = isLB(getLB_VAL_UTF8(
7207 reghop3((U8*)locinput,
7209 (U8*)(reginfo->strbeg)),
7210 (U8*) reginfo->strend),
7211 getLB_VAL_UTF8((U8*) locinput,
7212 (U8*) reginfo->strend),
7213 (U8*) reginfo->strbeg,
7215 (U8*) reginfo->strend,
7220 case SB_BOUND: /* Always matches at begin and end */
7221 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7225 match = isSB(getSB_VAL_UTF8(
7226 reghop3((U8*)locinput,
7228 (U8*)(reginfo->strbeg)),
7229 (U8*) reginfo->strend),
7230 getSB_VAL_UTF8((U8*) locinput,
7231 (U8*) reginfo->strend),
7232 (U8*) reginfo->strbeg,
7234 (U8*) reginfo->strend,
7240 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7244 match = isWB(WB_UNKNOWN,
7246 reghop3((U8*)locinput,
7248 (U8*)(reginfo->strbeg)),
7249 (U8*) reginfo->strend),
7250 getWB_VAL_UTF8((U8*) locinput,
7251 (U8*) reginfo->strend),
7252 (U8*) reginfo->strbeg,
7254 (U8*) reginfo->strend,
7260 else { /* Not utf8 target */
7261 switch((bound_type) FLAGS(scan)) {
7262 case TRADITIONAL_BOUND:
7265 b1 = (locinput == reginfo->strbeg)
7266 ? 0 /* isWORDCHAR_L1('\n') */
7267 : isWORDCHAR_L1(UCHARAT(locinput - 1));
7268 b2 = (NEXTCHR_IS_EOS)
7269 ? 0 /* isWORDCHAR_L1('\n') */
7270 : isWORDCHAR_L1(nextbyte);
7271 match = cBOOL(b1 != b2);
7276 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7277 match = TRUE; /* GCB always matches at begin and
7280 else { /* Only CR-LF combo isn't a GCB in 0-255
7282 match = UCHARAT(locinput - 1) != '\r'
7283 || UCHARAT(locinput) != '\n';
7288 if (locinput == reginfo->strbeg) {
7291 else if (NEXTCHR_IS_EOS) {
7295 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7296 getLB_VAL_CP(UCHARAT(locinput)),
7297 (U8*) reginfo->strbeg,
7299 (U8*) reginfo->strend,
7304 case SB_BOUND: /* Always matches at begin and end */
7305 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7309 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7310 getSB_VAL_CP(UCHARAT(locinput)),
7311 (U8*) reginfo->strbeg,
7313 (U8*) reginfo->strend,
7319 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7323 match = isWB(WB_UNKNOWN,
7324 getWB_VAL_CP(UCHARAT(locinput -1)),
7325 getWB_VAL_CP(UCHARAT(locinput)),
7326 (U8*) reginfo->strbeg,
7328 (U8*) reginfo->strend,
7335 if (to_complement ^ ! match) {
7341 case ANYOFL: /* /[abc]/l */
7342 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7343 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7346 case ANYOFD: /* /[abc]/d */
7347 case ANYOF: /* /[abc]/ */
7348 if (NEXTCHR_IS_EOS || locinput >= loceol)
7350 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7351 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7353 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7359 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7364 goto increment_locinput;
7370 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7371 || locinput >= loceol)
7375 locinput++; /* ANYOFM is always single byte */
7380 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7381 || locinput >= loceol)
7385 goto increment_locinput;
7391 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7392 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7397 goto increment_locinput;
7403 || ANYOF_FLAGS(scan) != (U8) *locinput
7404 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7409 goto increment_locinput;
7415 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7416 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7417 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7418 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7423 goto increment_locinput;
7429 || loceol - locinput < FLAGS(scan)
7430 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7431 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7436 goto increment_locinput;
7440 if (NEXTCHR_IS_EOS) {
7445 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7446 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7447 (U8 *) reginfo->strend,
7449 ANYOFRbase(scan), ANYOFRdelta(scan)))
7455 if (! withinCOUNT((U8) *locinput,
7456 ANYOFRbase(scan), ANYOFRdelta(scan)))
7461 goto increment_locinput;
7465 if (NEXTCHR_IS_EOS) {
7470 if ( ANYOF_FLAGS(scan) != (U8) *locinput
7471 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7472 (U8 *) reginfo->strend,
7474 ANYOFRbase(scan), ANYOFRdelta(scan)))
7480 if (! withinCOUNT((U8) *locinput,
7481 ANYOFRbase(scan), ANYOFRdelta(scan)))
7486 goto increment_locinput;
7489 /* The argument (FLAGS) to all the POSIX node types is the class number
7492 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
7496 case POSIXL: /* \w or [:punct:] etc. under /l */
7497 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7498 if (NEXTCHR_IS_EOS || locinput >= loceol)
7501 /* Use isFOO_lc() for characters within Latin1. (Note that
7502 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7503 * wouldn't be invariant) */
7504 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7505 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7513 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7514 /* An above Latin-1 code point, or malformed */
7515 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7517 goto utf8_posix_above_latin1;
7520 /* Here is a UTF-8 variant code point below 256 and the target is
7522 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7523 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7524 *(locinput + 1))))))
7529 goto increment_locinput;
7531 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
7535 case POSIXD: /* \w or [:punct:] etc. under /d */
7541 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
7543 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7547 /* All UTF-8 variants match */
7548 if (! UTF8_IS_INVARIANT(nextbyte)) {
7549 goto increment_locinput;
7555 case POSIXA: /* \w or [:punct:] etc. under /a */
7558 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7559 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7560 * character is a single byte */
7562 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7568 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
7574 /* Here we are either not in utf8, or we matched a utf8-invariant,
7575 * so the next char is the next byte */
7579 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
7583 case POSIXU: /* \w or [:punct:] etc. under /u */
7585 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7589 /* Use _generic_isCC() for characters within Latin1. (Note that
7590 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7591 * wouldn't be invariant) */
7592 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7593 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
7600 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7601 if (! (to_complement
7602 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7610 else { /* Handle above Latin-1 code points */
7611 utf8_posix_above_latin1:
7612 classnum = (_char_class_number) FLAGS(scan);
7615 if (! (to_complement
7616 ^ cBOOL(_invlist_contains_cp(
7617 PL_XPosix_ptrs[classnum],
7618 utf8_to_uvchr_buf((U8 *) locinput,
7619 (U8 *) reginfo->strend,
7625 case _CC_ENUM_SPACE:
7626 if (! (to_complement
7627 ^ cBOOL(is_XPERLSPACE_high(locinput))))
7632 case _CC_ENUM_BLANK:
7633 if (! (to_complement
7634 ^ cBOOL(is_HORIZWS_high(locinput))))
7639 case _CC_ENUM_XDIGIT:
7640 if (! (to_complement
7641 ^ cBOOL(is_XDIGIT_high(locinput))))
7646 case _CC_ENUM_VERTSPACE:
7647 if (! (to_complement
7648 ^ cBOOL(is_VERTWS_high(locinput))))
7653 case _CC_ENUM_CNTRL: /* These can't match above Latin1 */
7654 case _CC_ENUM_ASCII:
7655 if (! to_complement) {
7660 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7664 case CLUMP: /* Match \X: logical Unicode character. This is defined as
7665 a Unicode extended Grapheme Cluster */
7666 if (NEXTCHR_IS_EOS || locinput >= loceol)
7668 if (! utf8_target) {
7670 /* Match either CR LF or '.', as all the other possibilities
7672 locinput++; /* Match the . or CR */
7673 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7675 && locinput < loceol
7676 && UCHARAT(locinput) == '\n')
7683 /* Get the gcb type for the current character */
7684 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7685 (U8*) reginfo->strend);
7687 /* Then scan through the input until we get to the first
7688 * character whose type is supposed to be a gcb with the
7689 * current character. (There is always a break at the
7691 locinput += UTF8SKIP(locinput);
7692 while (locinput < loceol) {
7693 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7694 (U8*) reginfo->strend);
7695 if (isGCB(prev_gcb, cur_gcb,
7696 (U8*) reginfo->strbeg, (U8*) locinput,
7703 locinput += UTF8SKIP(locinput);
7710 case REFFLN: /* /\g{name}/il */
7711 { /* The capture buffer cases. The ones beginning with N for the
7712 named buffers just convert to the equivalent numbered and
7713 pretend they were called as the corresponding numbered buffer
7715 /* don't initialize these in the declaration, it makes C++
7720 const U8 *fold_array;
7723 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7724 folder = foldEQ_locale;
7725 fold_array = PL_fold_locale;
7727 utf8_fold_flags = FOLDEQ_LOCALE;
7730 case REFFAN: /* /\g{name}/iaa */
7731 folder = foldEQ_latin1;
7732 fold_array = PL_fold_latin1;
7734 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7737 case REFFUN: /* /\g{name}/iu */
7738 folder = foldEQ_latin1;
7739 fold_array = PL_fold_latin1;
7741 utf8_fold_flags = 0;
7744 case REFFN: /* /\g{name}/i */
7746 fold_array = PL_fold;
7748 utf8_fold_flags = 0;
7751 case REFN: /* /\g{name}/ */
7755 utf8_fold_flags = 0;
7758 /* For the named back references, find the corresponding buffer
7760 n = reg_check_named_buff_matched(rex,scan);
7765 goto do_nref_ref_common;
7767 case REFFL: /* /\1/il */
7768 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7769 folder = foldEQ_locale;
7770 fold_array = PL_fold_locale;
7771 utf8_fold_flags = FOLDEQ_LOCALE;
7774 case REFFA: /* /\1/iaa */
7775 folder = foldEQ_latin1;
7776 fold_array = PL_fold_latin1;
7777 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7780 case REFFU: /* /\1/iu */
7781 folder = foldEQ_latin1;
7782 fold_array = PL_fold_latin1;
7783 utf8_fold_flags = 0;
7786 case REFF: /* /\1/i */
7788 fold_array = PL_fold;
7789 utf8_fold_flags = 0;
7792 case REF: /* /\1/ */
7795 utf8_fold_flags = 0;
7799 n = ARG(scan); /* which paren pair */
7802 ln = rex->offs[n].start;
7803 endref = rex->offs[n].end;
7804 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7805 if (rex->lastparen < n || ln == -1 || endref == -1)
7806 sayNO; /* Do not match unless seen CLOSEn. */
7810 s = reginfo->strbeg + ln;
7811 if (type != REF /* REF can do byte comparison */
7812 && (utf8_target || type == REFFU || type == REFFL))
7814 char * limit = loceol;
7816 /* This call case insensitively compares the entire buffer
7817 * at s, with the current input starting at locinput, but
7818 * not going off the end given by loceol, and
7819 * returns in <limit> upon success, how much of the
7820 * current input was matched */
7821 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7822 locinput, &limit, 0, utf8_target, utf8_fold_flags))
7830 /* Not utf8: Inline the first character, for speed. */
7831 if ( ! NEXTCHR_IS_EOS
7832 && locinput < loceol
7833 && UCHARAT(s) != nextbyte
7835 || UCHARAT(s) != fold_array[nextbyte]))
7840 if (locinput + ln > loceol)
7842 if (ln > 1 && (type == REF
7843 ? memNE(s, locinput, ln)
7844 : ! folder(locinput, s, ln)))
7850 case NOTHING: /* null op; e.g. the 'nothing' following
7851 * the '*' in m{(a+|b)*}' */
7853 case TAIL: /* placeholder while compiling (A|B|C) */
7857 #define ST st->u.eval
7858 #define CUR_EVAL cur_eval->u.eval
7864 regexp_internal *rei;
7865 regnode *startpoint;
7868 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
7869 arg= (U32)ARG(scan);
7870 if (cur_eval && cur_eval->locinput == locinput) {
7871 if ( ++nochange_depth > max_nochange_depth )
7873 "Pattern subroutine nesting without pos change"
7874 " exceeded limit in regex");
7881 startpoint = scan + ARG2L(scan);
7882 EVAL_CLOSE_PAREN_SET( st, arg );
7883 /* Detect infinite recursion
7885 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7886 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7887 * So we track the position in the string we are at each time
7888 * we recurse and if we try to enter the same routine twice from
7889 * the same position we throw an error.
7891 if ( rex->recurse_locinput[arg] == locinput ) {
7892 /* FIXME: we should show the regop that is failing as part
7893 * of the error message. */
7894 Perl_croak(aTHX_ "Infinite recursion in regex");
7896 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7897 rex->recurse_locinput[arg]= locinput;
7900 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7902 Perl_re_exec_indentf( aTHX_
7903 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7904 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7910 /* Save all the positions seen so far. */
7911 ST.cp = regcppush(rex, 0, maxopenparen);
7912 REGCP_SET(ST.lastcp);
7914 /* and then jump to the code we share with EVAL */
7915 goto eval_recurse_doit;
7918 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
7919 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7920 if ( ++nochange_depth > max_nochange_depth )
7921 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7926 /* execute the code in the {...} */
7930 OP * const oop = PL_op;
7931 COP * const ocurcop = PL_curcop;
7935 /* save *all* paren positions */
7936 regcppush(rex, 0, maxopenparen);
7937 REGCP_SET(ST.lastcp);
7940 caller_cv = find_runcv(NULL);
7944 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7946 (REGEXP*)(rexi->data->data[n])
7948 nop = (OP*)rexi->data->data[n+1];
7950 else if (rexi->data->what[n] == 'l') { /* literal code */
7952 nop = (OP*)rexi->data->data[n];
7953 assert(CvDEPTH(newcv));
7956 /* literal with own CV */
7957 assert(rexi->data->what[n] == 'L');
7958 newcv = rex->qr_anoncv;
7959 nop = (OP*)rexi->data->data[n];
7962 /* Some notes about MULTICALL and the context and save stacks.
7965 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7966 * since codeblocks don't introduce a new scope (so that
7967 * local() etc accumulate), at the end of a successful
7968 * match there will be a SAVEt_CLEARSV on the savestack
7969 * for each of $x, $y, $z. If the three code blocks above
7970 * happen to have come from different CVs (e.g. via
7971 * embedded qr//s), then we must ensure that during any
7972 * savestack unwinding, PL_comppad always points to the
7973 * right pad at each moment. We achieve this by
7974 * interleaving SAVEt_COMPPAD's on the savestack whenever
7975 * there is a change of pad.
7976 * In theory whenever we call a code block, we should
7977 * push a CXt_SUB context, then pop it on return from
7978 * that code block. This causes a bit of an issue in that
7979 * normally popping a context also clears the savestack
7980 * back to cx->blk_oldsaveix, but here we specifically
7981 * don't want to clear the save stack on exit from the
7983 * Also for efficiency we don't want to keep pushing and
7984 * popping the single SUB context as we backtrack etc.
7985 * So instead, we push a single context the first time
7986 * we need, it, then hang onto it until the end of this
7987 * function. Whenever we encounter a new code block, we
7988 * update the CV etc if that's changed. During the times
7989 * in this function where we're not executing a code
7990 * block, having the SUB context still there is a bit
7991 * naughty - but we hope that no-one notices.
7992 * When the SUB context is initially pushed, we fake up
7993 * cx->blk_oldsaveix to be as if we'd pushed this context
7994 * on first entry to S_regmatch rather than at some random
7995 * point during the regexe execution. That way if we
7996 * croak, popping the context stack will ensure that
7997 * *everything* SAVEd by this function is undone and then
7998 * the context popped, rather than e.g., popping the
7999 * context (and restoring the original PL_comppad) then
8000 * popping more of the savestack and restoring a bad
8004 /* If this is the first EVAL, push a MULTICALL. On
8005 * subsequent calls, if we're executing a different CV, or
8006 * if PL_comppad has got messed up from backtracking
8007 * through SAVECOMPPADs, then refresh the context.
8009 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8011 U8 flags = (CXp_SUB_RE |
8012 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8014 if (last_pushed_cv) {
8015 CHANGE_MULTICALL_FLAGS(newcv, flags);
8018 PUSH_MULTICALL_FLAGS(newcv, flags);
8020 /* see notes above */
8021 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8023 last_pushed_cv = newcv;
8026 /* these assignments are just to silence compiler
8028 multicall_cop = NULL;
8030 last_pad = PL_comppad;
8032 /* the initial nextstate you would normally execute
8033 * at the start of an eval (which would cause error
8034 * messages to come from the eval), may be optimised
8035 * away from the execution path in the regex code blocks;
8036 * so manually set PL_curcop to it initially */
8038 OP *o = cUNOPx(nop)->op_first;
8039 assert(o->op_type == OP_NULL);
8040 if (o->op_targ == OP_SCOPE) {
8041 o = cUNOPo->op_first;
8044 assert(o->op_targ == OP_LEAVE);
8045 o = cUNOPo->op_first;
8046 assert(o->op_type == OP_ENTER);
8050 if (o->op_type != OP_STUB) {
8051 assert( o->op_type == OP_NEXTSTATE
8052 || o->op_type == OP_DBSTATE
8053 || (o->op_type == OP_NULL
8054 && ( o->op_targ == OP_NEXTSTATE
8055 || o->op_targ == OP_DBSTATE
8059 PL_curcop = (COP*)o;
8064 DEBUG_STATE_r( Perl_re_printf( aTHX_
8065 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8067 rex->offs[0].end = locinput - reginfo->strbeg;
8068 if (reginfo->info_aux_eval->pos_magic)
8069 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8070 reginfo->sv, reginfo->strbeg,
8071 locinput - reginfo->strbeg);
8074 SV *sv_mrk = get_sv("REGMARK", 1);
8075 sv_setsv(sv_mrk, sv_yes_mark);
8078 /* we don't use MULTICALL here as we want to call the
8079 * first op of the block of interest, rather than the
8080 * first op of the sub. Also, we don't want to free
8081 * the savestack frame */
8082 before = (IV)(SP-PL_stack_base);
8084 CALLRUNOPS(aTHX); /* Scalar context. */
8086 if ((IV)(SP-PL_stack_base) == before)
8087 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8093 /* before restoring everything, evaluate the returned
8094 * value, so that 'uninit' warnings don't use the wrong
8095 * PL_op or pad. Also need to process any magic vars
8096 * (e.g. $1) *before* parentheses are restored */
8101 if (logical == 0) { /* (?{})/ */
8102 SV *replsv = save_scalar(PL_replgv);
8103 sv_setsv(replsv, ret); /* $^R */
8106 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
8107 sw = cBOOL(SvTRUE_NN(ret));
8110 else { /* /(??{}) */
8111 /* if its overloaded, let the regex compiler handle
8112 * it; otherwise extract regex, or stringify */
8113 if (SvGMAGICAL(ret))
8114 ret = sv_mortalcopy(ret);
8115 if (!SvAMAGIC(ret)) {
8119 if (SvTYPE(sv) == SVt_REGEXP)
8120 re_sv = (REGEXP*) sv;
8121 else if (SvSMAGICAL(ret)) {
8122 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8124 re_sv = (REGEXP *) mg->mg_obj;
8127 /* force any undef warnings here */
8128 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8129 ret = sv_mortalcopy(ret);
8130 (void) SvPV_force_nolen(ret);
8136 /* *** Note that at this point we don't restore
8137 * PL_comppad, (or pop the CxSUB) on the assumption it may
8138 * be used again soon. This is safe as long as nothing
8139 * in the regexp code uses the pad ! */
8141 PL_curcop = ocurcop;
8142 regcp_restore(rex, ST.lastcp, &maxopenparen);
8143 PL_curpm_under = PL_curpm;
8144 PL_curpm = PL_reg_curpm;
8147 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8153 /* only /(??{})/ from now on */
8156 /* extract RE object from returned value; compiling if
8160 re_sv = reg_temp_copy(NULL, re_sv);
8165 if (SvUTF8(ret) && IN_BYTES) {
8166 /* In use 'bytes': make a copy of the octet
8167 * sequence, but without the flag on */
8169 const char *const p = SvPV(ret, len);
8170 ret = newSVpvn_flags(p, len, SVs_TEMP);
8172 if (rex->intflags & PREGf_USE_RE_EVAL)
8173 pm_flags |= PMf_USE_RE_EVAL;
8175 /* if we got here, it should be an engine which
8176 * supports compiling code blocks and stuff */
8177 assert(rex->engine && rex->engine->op_comp);
8178 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8179 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8180 rex->engine, NULL, NULL,
8181 /* copy /msixn etc to inner pattern */
8186 & (SVs_TEMP | SVs_GMG | SVf_ROK))
8187 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8188 /* This isn't a first class regexp. Instead, it's
8189 caching a regexp onto an existing, Perl visible
8191 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8197 RXp_MATCH_COPIED_off(re);
8198 re->subbeg = rex->subbeg;
8199 re->sublen = rex->sublen;
8200 re->suboffset = rex->suboffset;
8201 re->subcoffset = rex->subcoffset;
8203 re->lastcloseparen = 0;
8206 debug_start_match(re_sv, utf8_target, locinput,
8207 reginfo->strend, "EVAL/GOSUB: Matching embedded");
8209 startpoint = rei->program + 1;
8210 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8211 * close_paren only for GOSUB */
8212 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8213 /* Save all the seen positions so far. */
8214 ST.cp = regcppush(rex, 0, maxopenparen);
8215 REGCP_SET(ST.lastcp);
8216 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8218 /* run the pattern returned from (??{...}) */
8220 eval_recurse_doit: /* Share code with GOSUB below this line
8221 * At this point we expect the stack context to be
8222 * set up correctly */
8224 /* invalidate the S-L poscache. We're now executing a
8225 * different set of WHILEM ops (and their associated
8226 * indexes) against the same string, so the bits in the
8227 * cache are meaningless. Setting maxiter to zero forces
8228 * the cache to be invalidated and zeroed before reuse.
8229 * XXX This is too dramatic a measure. Ideally we should
8230 * save the old cache and restore when running the outer
8232 reginfo->poscache_maxiter = 0;
8234 /* the new regexp might have a different is_utf8_pat than we do */
8235 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8237 ST.prev_rex = rex_sv;
8238 ST.prev_curlyx = cur_curlyx;
8240 SET_reg_curpm(rex_sv);
8245 ST.prev_eval = cur_eval;
8247 /* now continue from first node in postoned RE */
8248 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8249 loceol, script_run_begin);
8250 NOT_REACHED; /* NOTREACHED */
8253 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8254 /* note: this is called twice; first after popping B, then A */
8256 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
8257 depth, cur_eval, ST.prev_eval);
8260 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8261 if ( cur_eval && CUR_EVAL.close_paren ) {\
8263 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8265 CUR_EVAL.close_paren - 1,\
8269 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8272 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8274 rex_sv = ST.prev_rex;
8275 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8276 SET_reg_curpm(rex_sv);
8277 rex = ReANY(rex_sv);
8278 rexi = RXi_GET(rex);
8280 /* preserve $^R across LEAVE's. See Bug 121070. */
8281 SV *save_sv= GvSV(PL_replgv);
8283 SvREFCNT_inc(save_sv);
8284 regcpblow(ST.cp); /* LEAVE in disguise */
8285 /* don't move this initialization up */
8286 replsv = GvSV(PL_replgv);
8287 sv_setsv(replsv, save_sv);
8289 SvREFCNT_dec(save_sv);
8291 cur_eval = ST.prev_eval;
8292 cur_curlyx = ST.prev_curlyx;
8294 /* Invalidate cache. See "invalidate" comment above. */
8295 reginfo->poscache_maxiter = 0;
8296 if ( nochange_depth )
8299 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8303 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8304 REGCP_UNWIND(ST.lastcp);
8307 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8308 /* note: this is called twice; first after popping B, then A */
8310 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8311 depth, cur_eval, ST.prev_eval);
8314 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8316 rex_sv = ST.prev_rex;
8317 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8318 SET_reg_curpm(rex_sv);
8319 rex = ReANY(rex_sv);
8320 rexi = RXi_GET(rex);
8322 REGCP_UNWIND(ST.lastcp);
8323 regcppop(rex, &maxopenparen);
8324 cur_eval = ST.prev_eval;
8325 cur_curlyx = ST.prev_curlyx;
8327 /* Invalidate cache. See "invalidate" comment above. */
8328 reginfo->poscache_maxiter = 0;
8329 if ( nochange_depth )
8332 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8337 n = ARG(scan); /* which paren pair */
8338 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8339 if (n > maxopenparen)
8341 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8342 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8347 (IV)rex->offs[n].start_tmp,
8353 case SROPEN: /* (*SCRIPT_RUN: */
8354 script_run_begin = (U8 *) locinput;
8359 n = ARG(scan); /* which paren pair */
8360 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8361 locinput - reginfo->strbeg);
8362 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8367 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
8369 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8377 case ACCEPT: /* (*ACCEPT) */
8379 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8383 cursor && OP(cursor)!=END;
8384 cursor=regnext(cursor))
8386 if ( OP(cursor)==CLOSE ){
8388 if ( n <= lastopen ) {
8389 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8390 locinput - reginfo->strbeg);
8391 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8400 case GROUPP: /* (?(1)) */
8401 n = ARG(scan); /* which paren pair */
8402 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8405 case GROUPPN: /* (?(<name>)) */
8406 /* reg_check_named_buff_matched returns 0 for no match */
8407 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8410 case INSUBP: /* (?(R)) */
8412 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8413 * of SCAN is already set up as matches a eval.close_paren */
8414 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8417 case DEFINEP: /* (?(DEFINE)) */
8421 case IFTHEN: /* (?(cond)A|B) */
8422 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8424 next = NEXTOPER(NEXTOPER(scan));
8426 next = scan + ARG(scan);
8427 if (OP(next) == IFTHEN) /* Fake one. */
8428 next = NEXTOPER(NEXTOPER(next));
8432 case LOGICAL: /* modifier for EVAL and IFMATCH */
8433 logical = scan->flags;
8436 /*******************************************************************
8438 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8439 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8440 STAR/PLUS/CURLY/CURLYN are used instead.)
8442 A*B is compiled as <CURLYX><A><WHILEM><B>
8444 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8445 state, which contains the current count, initialised to -1. It also sets
8446 cur_curlyx to point to this state, with any previous value saved in the
8449 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8450 since the pattern may possibly match zero times (i.e. it's a while {} loop
8451 rather than a do {} while loop).
8453 Each entry to WHILEM represents a successful match of A. The count in the
8454 CURLYX block is incremented, another WHILEM state is pushed, and execution
8455 passes to A or B depending on greediness and the current count.
8457 For example, if matching against the string a1a2a3b (where the aN are
8458 substrings that match /A/), then the match progresses as follows: (the
8459 pushed states are interspersed with the bits of strings matched so far):
8462 <CURLYX cnt=0><WHILEM>
8463 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8464 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8465 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8466 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8468 (Contrast this with something like CURLYM, which maintains only a single
8472 a1 <CURLYM cnt=1> a2
8473 a1 a2 <CURLYM cnt=2> a3
8474 a1 a2 a3 <CURLYM cnt=3> b
8477 Each WHILEM state block marks a point to backtrack to upon partial failure
8478 of A or B, and also contains some minor state data related to that
8479 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
8480 overall state, such as the count, and pointers to the A and B ops.
8482 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8483 must always point to the *current* CURLYX block, the rules are:
8485 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8486 and set cur_curlyx to point the new block.
8488 When popping the CURLYX block after a successful or unsuccessful match,
8489 restore the previous cur_curlyx.
8491 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8492 to the outer one saved in the CURLYX block.
8494 When popping the WHILEM block after a successful or unsuccessful B match,
8495 restore the previous cur_curlyx.
8497 Here's an example for the pattern (AI* BI)*BO
8498 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8501 curlyx backtrack stack
8502 ------ ---------------
8504 CO <CO prev=NULL> <WO>
8505 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8506 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8507 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8509 At this point the pattern succeeds, and we work back down the stack to
8510 clean up, restoring as we go:
8512 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8513 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8514 CO <CO prev=NULL> <WO>
8517 *******************************************************************/
8519 #define ST st->u.curlyx
8521 case CURLYX: /* start of /A*B/ (for complex A) */
8523 /* No need to save/restore up to this paren */
8524 I32 parenfloor = scan->flags;
8526 assert(next); /* keep Coverity happy */
8527 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8530 /* XXXX Probably it is better to teach regpush to support
8531 parenfloor > maxopenparen ... */
8532 if (parenfloor > (I32)rex->lastparen)
8533 parenfloor = rex->lastparen; /* Pessimization... */
8535 ST.prev_curlyx= cur_curlyx;
8537 ST.cp = PL_savestack_ix;
8539 /* these fields contain the state of the current curly.
8540 * they are accessed by subsequent WHILEMs */
8541 ST.parenfloor = parenfloor;
8546 ST.count = -1; /* this will be updated by WHILEM */
8547 ST.lastloc = NULL; /* this will be updated by WHILEM */
8549 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8551 NOT_REACHED; /* NOTREACHED */
8554 case CURLYX_end: /* just finished matching all of A*B */
8555 cur_curlyx = ST.prev_curlyx;
8557 NOT_REACHED; /* NOTREACHED */
8559 case CURLYX_end_fail: /* just failed to match all of A*B */
8561 cur_curlyx = ST.prev_curlyx;
8563 NOT_REACHED; /* NOTREACHED */
8567 #define ST st->u.whilem
8569 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
8571 /* see the discussion above about CURLYX/WHILEM */
8576 assert(cur_curlyx); /* keep Coverity happy */
8578 min = ARG1(cur_curlyx->u.curlyx.me);
8579 max = ARG2(cur_curlyx->u.curlyx.me);
8580 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8581 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8582 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8583 ST.cache_offset = 0;
8587 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
8588 depth, (long)n, min, max)
8591 /* First just match a string of min A's. */
8594 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8595 cur_curlyx->u.curlyx.lastloc = locinput;
8596 REGCP_SET(ST.lastcp);
8598 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8600 NOT_REACHED; /* NOTREACHED */
8603 /* If degenerate A matches "", assume A done. */
8605 if (locinput == cur_curlyx->u.curlyx.lastloc) {
8606 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
8609 goto do_whilem_B_max;
8612 /* super-linear cache processing.
8614 * The idea here is that for certain types of CURLYX/WHILEM -
8615 * principally those whose upper bound is infinity (and
8616 * excluding regexes that have things like \1 and other very
8617 * non-regular expresssiony things), then if a pattern like
8618 * /....A*.../ fails and we backtrack to the WHILEM, then we
8619 * make a note that this particular WHILEM op was at string
8620 * position 47 (say) when the rest of pattern failed. Then, if
8621 * we ever find ourselves back at that WHILEM, and at string
8622 * position 47 again, we can just fail immediately rather than
8623 * running the rest of the pattern again.
8625 * This is very handy when patterns start to go
8626 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8627 * with a combinatorial explosion of backtracking.
8629 * The cache is implemented as a bit array, with one bit per
8630 * string byte position per WHILEM op (up to 16) - so its
8631 * between 0.25 and 2x the string size.
8633 * To avoid allocating a poscache buffer every time, we do an
8634 * initially countdown; only after we have executed a WHILEM
8635 * op (string-length x #WHILEMs) times do we allocate the
8638 * The top 4 bits of scan->flags byte say how many different
8639 * relevant CURLLYX/WHILEM op pairs there are, while the
8640 * bottom 4-bits is the identifying index number of this
8646 if (!reginfo->poscache_maxiter) {
8647 /* start the countdown: Postpone detection until we
8648 * know the match is not *that* much linear. */
8649 reginfo->poscache_maxiter
8650 = (reginfo->strend - reginfo->strbeg + 1)
8652 /* possible overflow for long strings and many CURLYX's */
8653 if (reginfo->poscache_maxiter < 0)
8654 reginfo->poscache_maxiter = I32_MAX;
8655 reginfo->poscache_iter = reginfo->poscache_maxiter;
8658 if (reginfo->poscache_iter-- == 0) {
8659 /* initialise cache */
8660 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8661 regmatch_info_aux *const aux = reginfo->info_aux;
8662 if (aux->poscache) {
8663 if ((SSize_t)reginfo->poscache_size < size) {
8664 Renew(aux->poscache, size, char);
8665 reginfo->poscache_size = size;
8667 Zero(aux->poscache, size, char);
8670 reginfo->poscache_size = size;
8671 Newxz(aux->poscache, size, char);
8673 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8674 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8675 PL_colors[4], PL_colors[5])
8679 if (reginfo->poscache_iter < 0) {
8680 /* have we already failed at this position? */
8681 SSize_t offset, mask;
8683 reginfo->poscache_iter = -1; /* stop eventual underflow */
8684 offset = (scan->flags & 0xf) - 1
8685 + (locinput - reginfo->strbeg)
8687 mask = 1 << (offset % 8);
8689 if (reginfo->info_aux->poscache[offset] & mask) {
8690 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
8693 cur_curlyx->u.curlyx.count--;
8694 sayNO; /* cache records failure */
8696 ST.cache_offset = offset;
8697 ST.cache_mask = mask;
8701 /* Prefer B over A for minimal matching. */
8703 if (cur_curlyx->u.curlyx.minmod) {
8704 ST.save_curlyx = cur_curlyx;
8705 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8706 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8707 locinput, loceol, script_run_begin);
8708 NOT_REACHED; /* NOTREACHED */
8711 /* Prefer A over B for maximal matching. */
8713 if (n < max) { /* More greed allowed? */
8714 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8716 cur_curlyx->u.curlyx.lastloc = locinput;
8717 REGCP_SET(ST.lastcp);
8718 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8720 NOT_REACHED; /* NOTREACHED */
8722 goto do_whilem_B_max;
8724 NOT_REACHED; /* NOTREACHED */
8726 case WHILEM_B_min: /* just matched B in a minimal match */
8727 case WHILEM_B_max: /* just matched B in a maximal match */
8728 cur_curlyx = ST.save_curlyx;
8730 NOT_REACHED; /* NOTREACHED */
8732 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8733 cur_curlyx = ST.save_curlyx;
8734 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8735 cur_curlyx->u.curlyx.count--;
8737 NOT_REACHED; /* NOTREACHED */
8739 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8741 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8742 REGCP_UNWIND(ST.lastcp);
8743 regcppop(rex, &maxopenparen);
8744 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8745 cur_curlyx->u.curlyx.count--;
8747 NOT_REACHED; /* NOTREACHED */
8749 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8750 REGCP_UNWIND(ST.lastcp);
8751 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8752 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
8756 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8757 && ckWARN(WARN_REGEXP)
8758 && !reginfo->warned)
8760 reginfo->warned = TRUE;
8761 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8762 "Complex regular subexpression recursion limit (%d) "
8768 ST.save_curlyx = cur_curlyx;
8769 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8770 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8771 locinput, loceol, script_run_begin);
8772 NOT_REACHED; /* NOTREACHED */
8774 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8775 cur_curlyx = ST.save_curlyx;
8777 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8778 /* Maximum greed exceeded */
8779 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8780 && ckWARN(WARN_REGEXP)
8781 && !reginfo->warned)
8783 reginfo->warned = TRUE;
8784 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8785 "Complex regular subexpression recursion "
8786 "limit (%d) exceeded",
8789 cur_curlyx->u.curlyx.count--;
8793 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
8795 /* Try grabbing another A and see if it helps. */
8796 cur_curlyx->u.curlyx.lastloc = locinput;
8797 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8799 REGCP_SET(ST.lastcp);
8800 PUSH_STATE_GOTO(WHILEM_A_min,
8801 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8802 locinput, loceol, script_run_begin);
8803 NOT_REACHED; /* NOTREACHED */
8806 #define ST st->u.branch
8808 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
8809 next = scan + ARG(scan);
8812 scan = NEXTOPER(scan);
8815 case BRANCH: /* /(...|A|...)/ */
8816 scan = NEXTOPER(scan); /* scan now points to inner node */
8817 ST.lastparen = rex->lastparen;
8818 ST.lastcloseparen = rex->lastcloseparen;
8819 ST.next_branch = next;
8822 /* Now go into the branch */
8824 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8827 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8830 NOT_REACHED; /* NOTREACHED */
8832 case CUTGROUP: /* /(*THEN)/ */
8833 sv_yes_mark = st->u.mark.mark_name = scan->flags
8834 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8836 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8838 NOT_REACHED; /* NOTREACHED */
8840 case CUTGROUP_next_fail:
8843 if (st->u.mark.mark_name)
8844 sv_commit = st->u.mark.mark_name;
8846 NOT_REACHED; /* NOTREACHED */
8850 NOT_REACHED; /* NOTREACHED */
8852 case BRANCH_next_fail: /* that branch failed; try the next, if any */
8857 REGCP_UNWIND(ST.cp);
8858 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8859 scan = ST.next_branch;
8860 /* no more branches? */
8861 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8863 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
8870 continue; /* execute next BRANCH[J] op */
8873 case MINMOD: /* next op will be non-greedy, e.g. A*? */
8878 #define ST st->u.curlym
8880 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
8882 /* This is an optimisation of CURLYX that enables us to push
8883 * only a single backtracking state, no matter how many matches
8884 * there are in {m,n}. It relies on the pattern being constant
8885 * length, with no parens to influence future backrefs
8889 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8891 ST.lastparen = rex->lastparen;
8892 ST.lastcloseparen = rex->lastcloseparen;
8894 /* if paren positive, emulate an OPEN/CLOSE around A */
8896 U32 paren = ST.me->flags;
8897 if (paren > maxopenparen)
8898 maxopenparen = paren;
8899 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8907 ST.Binfo.count = -1;
8910 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8913 curlym_do_A: /* execute the A in /A{m,n}B/ */
8914 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8916 NOT_REACHED; /* NOTREACHED */
8918 case CURLYM_A: /* we've just matched an A */
8920 /* after first match, determine A's length: u.curlym.alen */
8921 if (ST.count == 1) {
8922 if (reginfo->is_utf8_target) {
8923 char *s = st->locinput;
8924 while (s < locinput) {
8930 ST.alen = locinput - st->locinput;
8933 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8936 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8937 depth, (IV) ST.count, (IV)ST.alen)
8940 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8944 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8945 if ( max == REG_INFTY || ST.count < max )
8946 goto curlym_do_A; /* try to match another A */
8948 goto curlym_do_B; /* try to match B */
8950 case CURLYM_A_fail: /* just failed to match an A */
8951 REGCP_UNWIND(ST.cp);
8954 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8955 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8958 curlym_do_B: /* execute the B in /A{m,n}B/ */
8959 if (ST.Binfo.count < 0) {
8960 /* calculate possible match of 1st char following curly */
8962 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8963 regnode *text_node = ST.B;
8964 if (! HAS_TEXT(text_node))
8965 FIND_NEXT_IMPT(text_node);
8966 if (PL_regkind[OP(text_node)] == EXACT) {
8967 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
8968 &ST.Binfo, reginfo))
8977 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
8978 depth, (IV)ST.count)
8980 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
8981 assert(ST.Binfo.count > 0);
8983 /* Do a quick test to hopefully rule out most non-matches */
8984 if ( locinput + ST.Binfo.min_length > loceol
8985 || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
8988 Perl_re_exec_indentf( aTHX_
8989 "CURLYM Fast bail next target=0x%X anded==0x%X"
8992 (int) nextbyte, ST.Binfo.first_byte_anded,
8993 ST.Binfo.first_byte_mask)
8995 state_num = CURLYM_B_fail;
8996 goto reenter_switch;
9001 /* emulate CLOSE: mark current A as captured */
9002 U32 paren = (U32)ST.me->flags;
9004 CLOSE_CAPTURE(paren,
9005 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9006 locinput - reginfo->strbeg);
9009 rex->offs[paren].end = -1;
9011 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9020 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
9022 NOT_REACHED; /* NOTREACHED */
9024 case CURLYM_B_fail: /* just failed to match a B */
9025 REGCP_UNWIND(ST.cp);
9026 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9028 I32 max = ARG2(ST.me);
9029 if (max != REG_INFTY && ST.count == max)
9031 goto curlym_do_A; /* try to match a further A */
9033 /* backtrack one A */
9034 if (ST.count == ARG1(ST.me) /* min */)
9037 SET_locinput(HOPc(locinput, -ST.alen));
9038 goto curlym_do_B; /* try to match B */
9041 #define ST st->u.curly
9043 #define CURLY_SETPAREN(paren, success) \
9046 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9047 locinput - reginfo->strbeg); \
9050 rex->offs[paren].end = -1; \
9051 rex->lastparen = ST.lastparen; \
9052 rex->lastcloseparen = ST.lastcloseparen; \
9056 case STAR: /* /A*B/ where A is width 1 char */
9060 scan = NEXTOPER(scan);
9063 case PLUS: /* /A+B/ where A is width 1 char */
9067 scan = NEXTOPER(scan);
9070 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
9071 ST.paren = scan->flags; /* Which paren to set */
9072 ST.lastparen = rex->lastparen;
9073 ST.lastcloseparen = rex->lastcloseparen;
9074 if (ST.paren > maxopenparen)
9075 maxopenparen = ST.paren;
9076 ST.min = ARG1(scan); /* min to match */
9077 ST.max = ARG2(scan); /* max to match */
9078 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
9080 /* handle the single-char capture called as a GOSUB etc */
9081 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9083 char *li = locinput;
9084 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9092 case CURLY: /* /A{m,n}B/ where A is width 1 char */
9094 ST.min = ARG1(scan); /* min to match */
9095 ST.max = ARG2(scan); /* max to match */
9096 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
9099 * Lookahead to avoid useless match attempts
9100 * when we know what character comes next.
9102 * Used to only do .*x and .*?x, but now it allows
9103 * for )'s, ('s and (?{ ... })'s to be in the way
9104 * of the quantifier and the EXACT-like node. -- japhy
9107 assert(ST.min <= ST.max);
9108 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9112 regnode *text_node = next;
9114 if (! HAS_TEXT(text_node))
9115 FIND_NEXT_IMPT(text_node);
9117 if (! HAS_TEXT(text_node))
9120 if ( PL_regkind[OP(text_node)] != EXACT ) {
9124 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9125 &ST.Binfo, reginfo))
9136 char *li = locinput;
9139 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9146 if (ST.Binfo.count <= 0)
9147 goto curly_try_B_min;
9149 ST.oldloc = locinput;
9151 /* set ST.maxpos to the furthest point along the
9152 * string that could possibly match, i.e., that a match could
9154 if (ST.max == REG_INFTY) {
9155 ST.maxpos = loceol - 1;
9157 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9160 else if (utf8_target) {
9161 int m = ST.max - ST.min;
9162 for (ST.maxpos = locinput;
9163 m >0 && ST.maxpos < loceol; m--)
9164 ST.maxpos += UTF8SKIP(ST.maxpos);
9167 ST.maxpos = locinput + ST.max - ST.min;
9168 if (ST.maxpos >= loceol)
9169 ST.maxpos = loceol - 1;
9171 goto curly_try_B_min_known;
9175 /* avoid taking address of locinput, so it can remain
9177 char *li = locinput;
9178 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9179 if (ST.count < ST.min)
9182 if ((ST.count > ST.min)
9183 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
9185 /* A{m,n} must come at the end of the string, there's
9186 * no point in backing off ... */
9188 /* ...except that $ and \Z can match before *and* after
9189 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
9190 We may back off by one in this case. */
9191 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9195 goto curly_try_B_max;
9197 NOT_REACHED; /* NOTREACHED */
9199 case CURLY_B_min_fail:
9200 /* failed to find B in a non-greedy match. */
9202 REGCP_UNWIND(ST.cp);
9204 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9207 if (ST.Binfo.count == 0) {
9208 /* failed -- move forward one */
9209 char *li = locinput;
9210 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9215 if (!( ST.count <= ST.max
9216 /* count overflow ? */
9217 || (ST.max == REG_INFTY && ST.count > 0))
9223 /* Couldn't or didn't -- move forward. */
9224 ST.oldloc = locinput;
9226 locinput += UTF8SKIP(locinput);
9231 curly_try_B_min_known:
9232 /* find the next place where 'B' could work, then call B */
9233 if (locinput + ST.Binfo.initial_exact < loceol) {
9234 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9236 /* Here, the mask is all 1's for the entire length of
9237 * any possible match. (That actually means that there
9238 * is only one possible match.) Look for the next
9240 locinput = ninstr(locinput, loceol,
9241 (char *) ST.Binfo.matches,
9242 (char *) ST.Binfo.matches
9243 + ST.Binfo.initial_exact);
9244 if (locinput == NULL) {
9249 /* If the first byte(s) of the mask are all ones, it
9250 * means those bytes must match identically, so can use
9251 * ninstr() to find the next possible matchpoint */
9252 if (ST.Binfo.initial_exact > 0) {
9253 locinput = ninstr(locinput, loceol,
9254 (char *) ST.Binfo.matches,
9255 (char *) ST.Binfo.matches
9256 + ST.Binfo.initial_exact);
9258 else { /* Otherwise find the next byte that matches,
9260 locinput = (char *) find_next_masked(
9261 (U8 *) locinput, (U8 *) loceol,
9262 ST.Binfo.first_byte_anded,
9263 ST.Binfo.first_byte_mask);
9264 /* Advance to the end of a multi-byte character */
9266 while ( locinput < loceol
9267 && UTF8_IS_CONTINUATION(*locinput))
9273 if ( locinput == NULL
9274 || locinput + ST.Binfo.min_length > loceol)
9279 /* Here, we have found a possible match point; if can't
9280 * rule it out, quit the loop so can check fully */
9281 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9285 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9287 } while (locinput <= ST.maxpos);
9290 if (locinput > ST.maxpos)
9294 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9295 : (STRLEN) (locinput - ST.oldloc);
9298 /* Here is at the beginning of a character that meets the mask
9299 * criteria. Need to make sure that some real possibility */
9302 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9303 * at what may be the beginning of b; check that everything
9304 * between oldloc and locinput matches */
9305 char *li = ST.oldloc;
9307 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9309 assert(n == REG_INFTY || locinput == li);
9314 CURLY_SETPAREN(ST.paren, ST.count);
9315 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9317 NOT_REACHED; /* NOTREACHED */
9321 /* a successful greedy match: now try to match B */
9322 if ( ST.Binfo.count <= 0
9323 || ( ST.Binfo.count > 0
9324 && locinput + ST.Binfo.min_length <= loceol
9325 && S_test_EXACTISH_ST(locinput, ST.Binfo)))
9327 CURLY_SETPAREN(ST.paren, ST.count);
9328 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9330 NOT_REACHED; /* NOTREACHED */
9334 case CURLY_B_max_fail:
9335 /* failed to find B in a greedy match */
9337 REGCP_UNWIND(ST.cp);
9339 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9342 if (--ST.count < ST.min)
9344 locinput = HOPc(locinput, -1);
9345 goto curly_try_B_max;
9349 case END: /* last op of main pattern */
9352 /* we've just finished A in /(??{A})B/; now continue with B */
9353 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9354 st->u.eval.prev_rex = rex_sv; /* inner */
9356 /* Save *all* the positions. */
9357 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9358 rex_sv = CUR_EVAL.prev_rex;
9359 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9360 SET_reg_curpm(rex_sv);
9361 rex = ReANY(rex_sv);
9362 rexi = RXi_GET(rex);
9364 st->u.eval.prev_curlyx = cur_curlyx;
9365 cur_curlyx = CUR_EVAL.prev_curlyx;
9367 REGCP_SET(st->u.eval.lastcp);
9369 /* Restore parens of the outer rex without popping the
9371 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9373 st->u.eval.prev_eval = cur_eval;
9374 cur_eval = CUR_EVAL.prev_eval;
9376 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
9378 if ( nochange_depth )
9381 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9383 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
9384 st->u.eval.prev_eval->u.eval.B,
9385 locinput, loceol, script_run_begin);
9388 if (locinput < reginfo->till) {
9389 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9390 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9392 (long)(locinput - startpos),
9393 (long)(reginfo->till - startpos),
9396 sayNO_SILENT; /* Cannot match: too short. */
9398 sayYES; /* Success! */
9400 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
9402 Perl_re_exec_indentf( aTHX_ "%sSUCCEED: subpattern success...%s\n",
9403 depth, PL_colors[4], PL_colors[5]));
9404 sayYES; /* Success! */
9407 #define ST st->u.ifmatch
9409 case SUSPEND: /* (?>A) */
9411 ST.start = locinput;
9416 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9418 goto ifmatch_trivial_fail_test;
9420 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9422 ifmatch_trivial_fail_test:
9423 ST.count = scan->next_off + 1; /* next_off repurposed to be
9424 lookbehind count, requires
9426 if (! scan->flags) { /* 'flags' zero means lookahed */
9428 /* Lookahead starts here and ends at the normal place */
9429 ST.start = locinput;
9433 PERL_UINT_FAST8_T back_count = scan->flags;
9436 /* Lookbehind can look beyond the current position */
9439 /* ... and starts at the first place in the input that is in
9440 * the range of the possible start positions */
9441 for (; ST.count > 0; ST.count--, back_count--) {
9442 s = HOPBACKc(locinput, back_count);
9449 /* If the lookbehind doesn't start in the actual string, is a
9450 * trivial match failure */
9453 sw = 1 - cBOOL(ST.wanted);
9458 /* Here, we didn't want it to match, so is actually success */
9459 next = scan + ARG(scan);
9467 ST.logical = logical;
9468 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9470 /* execute body of (?...A) */
9471 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9472 ST.end, script_run_begin);
9473 NOT_REACHED; /* NOTREACHED */
9478 case IFMATCH_A_fail: /* body of (?...A) failed */
9479 if (! ST.logical && ST.count > 1) {
9481 /* It isn't a real failure until we've tried all starting
9482 * positions. Move to the next starting position and retry */
9484 ST.start = HOPc(ST.start, 1);
9486 logical = ST.logical;
9490 /* Here, all starting positions have been tried. */
9494 case IFMATCH_A: /* body of (?...A) succeeded */
9497 sw = matched == ST.wanted;
9498 if (! ST.logical && !sw) {
9502 if (OP(ST.me) != SUSPEND) {
9503 /* restore old position except for (?>...) */
9504 locinput = st->locinput;
9505 loceol = st->loceol;
9506 script_run_begin = st->sr0;
9508 scan = ST.me + ARG(ST.me);
9511 continue; /* execute B */
9516 case LONGJMP: /* alternative with many branches compiles to
9517 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9518 next = scan + ARG(scan);
9523 case COMMIT: /* (*COMMIT) */
9524 reginfo->cutpoint = loceol;
9527 case PRUNE: /* (*PRUNE) */
9529 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9530 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9532 NOT_REACHED; /* NOTREACHED */
9534 case COMMIT_next_fail:
9538 NOT_REACHED; /* NOTREACHED */
9540 case OPFAIL: /* (*FAIL) */
9542 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9544 /* deal with (?(?!)X|Y) properly,
9545 * make sure we trigger the no branch
9546 * of the trailing IFTHEN structure*/
9552 NOT_REACHED; /* NOTREACHED */
9554 #define ST st->u.mark
9555 case MARKPOINT: /* (*MARK:foo) */
9556 ST.prev_mark = mark_state;
9557 ST.mark_name = sv_commit = sv_yes_mark
9558 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9560 ST.mark_loc = locinput;
9561 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9563 NOT_REACHED; /* NOTREACHED */
9565 case MARKPOINT_next:
9566 mark_state = ST.prev_mark;
9568 NOT_REACHED; /* NOTREACHED */
9570 case MARKPOINT_next_fail:
9571 if (popmark && sv_eq(ST.mark_name,popmark))
9573 if (ST.mark_loc > startpoint)
9574 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9575 popmark = NULL; /* we found our mark */
9576 sv_commit = ST.mark_name;
9579 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9581 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9584 mark_state = ST.prev_mark;
9585 sv_yes_mark = mark_state ?
9586 mark_state->u.mark.mark_name : NULL;
9588 NOT_REACHED; /* NOTREACHED */
9590 case SKIP: /* (*SKIP) */
9592 /* (*SKIP) : if we fail we cut here*/
9593 ST.mark_name = NULL;
9594 ST.mark_loc = locinput;
9595 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9598 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9599 otherwise do nothing. Meaning we need to scan
9601 regmatch_state *cur = mark_state;
9602 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9605 if ( sv_eq( cur->u.mark.mark_name,
9608 ST.mark_name = find;
9609 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9612 cur = cur->u.mark.prev_mark;
9615 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9618 case SKIP_next_fail:
9620 /* (*CUT:NAME) - Set up to search for the name as we
9621 collapse the stack*/
9622 popmark = ST.mark_name;
9624 /* (*CUT) - No name, we cut here.*/
9625 if (ST.mark_loc > startpoint)
9626 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9627 /* but we set sv_commit to latest mark_name if there
9628 is one so they can test to see how things lead to this
9631 sv_commit=mark_state->u.mark.mark_name;
9635 NOT_REACHED; /* NOTREACHED */
9638 case LNBREAK: /* \R */
9639 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9646 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9647 PTR2UV(scan), OP(scan));
9648 Perl_croak(aTHX_ "regexp memory corruption");
9650 /* this is a point to jump to in order to increment
9651 * locinput by one character */
9653 assert(!NEXTCHR_IS_EOS);
9655 locinput += PL_utf8skip[nextbyte];
9656 /* locinput is allowed to go 1 char off the end (signifying
9657 * EOS), but not 2+ */
9658 if (locinput > loceol)
9667 /* switch break jumps here */
9668 scan = next; /* prepare to execute the next op and ... */
9669 continue; /* ... jump back to the top, reusing st */
9673 /* push a state that backtracks on success */
9674 st->u.yes.prev_yes_state = yes_state;
9678 /* push a new regex state, then continue at scan */
9680 regmatch_state *newst;
9681 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9683 DEBUG_r( /* DEBUG_STACK_r */
9684 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9685 regmatch_state *cur = st;
9686 regmatch_state *curyes = yes_state;
9688 regmatch_slab *slab = PL_regmatch_slab;
9689 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9690 if (cur < SLAB_FIRST(slab)) {
9692 cur = SLAB_LAST(slab);
9694 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9697 depth - i, PL_reg_name[cur->resume_state],
9698 (curyes == cur) ? "yes" : ""
9701 curyes = cur->u.yes.prev_yes_state;
9704 DEBUG_STATE_pp("push")
9707 st->locinput = locinput;
9708 st->loceol = loceol;
9709 st->sr0 = script_run_begin;
9711 if (newst > SLAB_LAST(PL_regmatch_slab))
9712 newst = S_push_slab(aTHX);
9713 PL_regmatch_state = newst;
9715 locinput = pushinput;
9717 script_run_begin = pushsr0;
9723 #ifdef SOLARIS_BAD_OPTIMIZER
9724 # undef PL_charclass
9728 * We get here only if there's trouble -- normally "case END" is
9729 * the terminating point.
9731 Perl_croak(aTHX_ "corrupted regexp pointers");
9732 NOT_REACHED; /* NOTREACHED */
9736 /* we have successfully completed a subexpression, but we must now
9737 * pop to the state marked by yes_state and continue from there */
9738 assert(st != yes_state);
9740 while (st != yes_state) {
9742 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9743 PL_regmatch_slab = PL_regmatch_slab->prev;
9744 st = SLAB_LAST(PL_regmatch_slab);
9748 DEBUG_STATE_pp("pop (no final)");
9750 DEBUG_STATE_pp("pop (yes)");
9756 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9757 || yes_state > SLAB_LAST(PL_regmatch_slab))
9759 /* not in this slab, pop slab */
9760 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9761 PL_regmatch_slab = PL_regmatch_slab->prev;
9762 st = SLAB_LAST(PL_regmatch_slab);
9764 depth -= (st - yes_state);
9767 yes_state = st->u.yes.prev_yes_state;
9768 PL_regmatch_state = st;
9771 locinput= st->locinput;
9773 script_run_begin = st->sr0;
9775 state_num = st->resume_state + no_final;
9776 goto reenter_switch;
9779 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
9780 PL_colors[4], PL_colors[5]));
9782 if (reginfo->info_aux_eval) {
9783 /* each successfully executed (?{...}) block does the equivalent of
9784 * local $^R = do {...}
9785 * When popping the save stack, all these locals would be undone;
9786 * bypass this by setting the outermost saved $^R to the latest
9788 /* I dont know if this is needed or works properly now.
9789 * see code related to PL_replgv elsewhere in this file.
9792 if (oreplsv != GvSV(PL_replgv)) {
9793 sv_setsv(oreplsv, GvSV(PL_replgv));
9794 SvSETMAGIC(oreplsv);
9802 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
9804 PL_colors[4], PL_colors[5])
9816 /* there's a previous state to backtrack to */
9818 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9819 PL_regmatch_slab = PL_regmatch_slab->prev;
9820 st = SLAB_LAST(PL_regmatch_slab);
9822 PL_regmatch_state = st;
9823 locinput= st->locinput;
9825 script_run_begin = st->sr0;
9827 DEBUG_STATE_pp("pop");
9829 if (yes_state == st)
9830 yes_state = st->u.yes.prev_yes_state;
9832 state_num = st->resume_state + 1; /* failure = success + 1 */
9834 goto reenter_switch;
9839 if (rex->intflags & PREGf_VERBARG_SEEN) {
9840 SV *sv_err = get_sv("REGERROR", 1);
9841 SV *sv_mrk = get_sv("REGMARK", 1);
9843 sv_commit = &PL_sv_no;
9845 sv_yes_mark = &PL_sv_yes;
9848 sv_commit = &PL_sv_yes;
9849 sv_yes_mark = &PL_sv_no;
9853 sv_setsv(sv_err, sv_commit);
9854 sv_setsv(sv_mrk, sv_yes_mark);
9858 if (last_pushed_cv) {
9860 /* see "Some notes about MULTICALL" above */
9862 PERL_UNUSED_VAR(SP);
9865 LEAVE_SCOPE(orig_savestack_ix);
9867 assert(!result || locinput - reginfo->strbeg >= 0);
9868 return result ? locinput - reginfo->strbeg : -1;
9872 - regrepeat - repeatedly match something simple, report how many
9874 * What 'simple' means is a node which can be the operand of a quantifier like
9877 * startposp - pointer to a pointer to the start position. This is updated
9878 * to point to the byte following the highest successful
9880 * p - the regnode to be repeatedly matched against.
9881 * loceol - pointer to the end position beyond which we aren't supposed to
9883 * reginfo - struct holding match state, such as utf8_target
9884 * max - maximum number of things to match.
9885 * depth - (for debugging) backtracking depth.
9888 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9889 char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9891 char *scan; /* Pointer to current position in target string */
9893 char *this_eol = loceol; /* potentially adjusted version. */
9894 I32 hardcount = 0; /* How many matches so far */
9895 bool utf8_target = reginfo->is_utf8_target;
9896 unsigned int to_complement = 0; /* Invert the result? */
9897 _char_class_number classnum;
9899 PERL_ARGS_ASSERT_REGREPEAT;
9901 /* This routine is structured so that we switch on the input OP. Each OP
9902 * case: statement contains a loop to repeatedly apply the OP, advancing
9903 * the input until it fails, or reaches the end of the input, or until it
9904 * reaches the upper limit of matches. */
9907 if (max == REG_INFTY) /* This is a special marker to go to the platform's
9910 else if (! utf8_target && this_eol - scan > max)
9911 this_eol = scan + max;
9913 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9914 * to the maximum of how far we should go in it (but leaving it set to the
9915 * real end if the maximum permissible would take us beyond that). This
9916 * allows us to make the loop exit condition that we haven't gone past
9917 * <this_eol> to also mean that we haven't exceeded the max permissible
9918 * count, saving a test each time through the loop. But it assumes that
9919 * the OP matches a single byte, which is true for most of the OPs below
9920 * when applied to a non-UTF-8 target. Those relatively few OPs that don't
9921 * have this characteristic have to compensate.
9923 * There is no such adjustment for UTF-8 targets, sinc the number of bytes
9924 * per character can vary. OPs will have to test both that the count is
9925 * less than the max permissible (using <hardcount> to keep track), and
9926 * that we are still within the bounds of the string (using <this_eol>. A
9927 * few OPs match a single byte no matter what the encoding. They can omit
9928 * the max test if, for the UTF-8 case, they do the adjustment that was
9931 * Thus, the code above sets things up for the common case; and exceptional
9932 * cases need extra work; the common case is to make sure <scan> doesn't
9933 * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9934 * count doesn't exceed the maximum permissible */
9939 while (scan < this_eol && hardcount < max && *scan != '\n') {
9940 scan += UTF8SKIP(scan);
9944 scan = (char *) memchr(scan, '\n', this_eol - scan);
9952 while (scan < this_eol && hardcount < max) {
9953 scan += UTF8SKIP(scan);
9962 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9963 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9969 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9975 if (! utf8_target) {
9983 case EXACTFAA_NO_TRIE:
9989 struct next_matchable_info Binfo;
9990 PERL_UINT_FAST8_T definitive_len;
9992 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9994 /* Set up termination info, and quit if we can rule out that we've
9995 * gotten a match of the termination criteria */
9996 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
9997 || scan + Binfo.min_length > this_eol
9998 || ! S_test_EXACTISH_ST(scan, Binfo))
10003 definitive_len = Binfo.initial_definitive;
10005 /* Here there are potential matches, and the first byte(s) matched our
10008 * If we got a definitive match of some initial bytes, there is no
10009 * possibility of false positives as far as it got */
10010 if (definitive_len > 0) {
10012 /* If as far as it got is the maximum possible, there were no false
10013 * positives at all. Since we have everything set up, see how many
10014 * repeats there are. */
10015 if (definitive_len >= Binfo.max_length) {
10017 /* We've already found one match */
10018 scan += definitive_len;
10021 /* If want more than the one match, and there is room for more,
10022 * see if there are any */
10023 if (hardcount < max && scan + definitive_len <= this_eol) {
10025 /* If the character is only a single byte long, just span
10026 * all such bytes. */
10027 if (definitive_len == 1) {
10028 const char * orig_scan = scan;
10030 if (this_eol - (scan - hardcount) > max) {
10031 this_eol = scan - hardcount + max;
10034 /* Use different routines depending on whether it's an
10035 * exact match or matches with a mask */
10036 if (Binfo.initial_exact == 1) {
10037 scan = (char *) find_span_end((U8 *) scan,
10042 scan = (char *) find_span_end_mask(
10045 Binfo.first_byte_anded,
10046 Binfo.first_byte_mask);
10049 hardcount += scan - orig_scan;
10051 else { /* Here, the full character definitive match is more
10053 while ( hardcount < max
10054 && scan + definitive_len <= this_eol
10055 && S_test_EXACTISH_ST(scan, Binfo))
10057 scan += definitive_len;
10064 } /* End of a full character is definitively matched */
10066 /* Here, an initial portion of the character matched definitively,
10067 * and the rest matched as well, but could have false positives */
10070 PERL_INT_FAST8_T i;
10071 U8 * matches = Binfo.matches;
10073 /* The first bytes were definitive. Look at the remaining */
10074 for (i = 0; i < Binfo.count; i++) {
10075 if (memEQ(scan + definitive_len,
10076 matches + definitive_len,
10077 Binfo.lengths[i] - definitive_len))
10079 goto found_a_completion;
10082 matches += Binfo.lengths[i];
10085 /* Didn't find anything to complete our initial match. Stop
10089 found_a_completion:
10091 /* Here, matched a full character, Include it in the result,
10092 * and then look to see if the next char matches */
10094 scan += Binfo.lengths[i];
10096 } while ( hardcount < max
10097 && scan + definitive_len < this_eol
10098 && S_test_EXACTISH_ST(scan, Binfo));
10100 /* Here, have advanced as far as possible */
10102 } /* End of found some initial bytes that definitively matched */
10104 /* Here, we can't rule out that we have found the beginning of 'B', but
10105 * there were no initial bytes that could rule out anything
10106 * definitively. Use brute force to examine all the possibilities */
10107 while (scan < this_eol && hardcount < max) {
10108 PERL_INT_FAST8_T i;
10109 U8 * matches = Binfo.matches;
10111 for (i = 0; i < Binfo.count; i++) {
10112 if (memEQ(scan, matches, Binfo.lengths[i])) {
10116 matches += Binfo.lengths[i];
10123 scan += Binfo.lengths[i];
10130 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10131 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10137 while (hardcount < max
10139 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
10141 scan += UTF8SKIP(scan);
10145 else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
10146 while (scan < this_eol
10147 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10151 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10157 if (utf8_target && this_eol - scan > max) {
10159 /* We didn't adjust <this_eol> at the beginning of this routine
10160 * because is UTF-8, but it is actually ok to do so, since here, to
10161 * match, 1 char == 1 byte. */
10162 this_eol = scan + max;
10165 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
10170 while ( hardcount < max
10172 && (*scan & FLAGS(p)) != ARG(p))
10174 scan += UTF8SKIP(scan);
10179 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
10184 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
10185 while ( hardcount < max
10187 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10188 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10190 scan += UTF8SKIP(scan);
10197 if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */
10199 /* we know the first byte must be the FLAGS field */
10200 while ( hardcount < max
10202 && (U8) *scan == ANYOF_FLAGS(p)
10203 && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
10206 scan += UTF8SKIP(scan);
10213 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
10214 while ( hardcount < max
10216 && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10217 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10218 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10219 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10220 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10222 scan += UTF8SKIP(scan);
10229 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
10230 while ( hardcount < max
10231 && scan + FLAGS(p) < this_eol
10232 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10233 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10235 scan += UTF8SKIP(scan);
10243 while ( hardcount < max
10245 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10246 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10249 ANYOFRbase(p), ANYOFRdelta(p)))
10251 scan += UTF8SKIP(scan);
10256 while ( hardcount < max
10258 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10268 while ( hardcount < max
10270 && (U8) *scan == ANYOF_FLAGS(p)
10271 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10274 ANYOFRbase(p), ANYOFRdelta(p)))
10276 scan += UTF8SKIP(scan);
10281 while ( hardcount < max
10283 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10291 /* The argument (FLAGS) to all the POSIX node types is the class number */
10298 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10299 if (! utf8_target) {
10300 while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
10306 while (hardcount < max && scan < this_eol
10307 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10311 scan += UTF8SKIP(scan);
10324 if (utf8_target && this_eol - scan > max) {
10326 /* We didn't adjust <this_eol> at the beginning of this routine
10327 * because is UTF-8, but it is actually ok to do so, since here, to
10328 * match, 1 char == 1 byte. */
10329 this_eol = scan + max;
10331 while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
10344 if (! utf8_target) {
10345 while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
10351 /* The complement of something that matches only ASCII matches all
10352 * non-ASCII, plus everything in ASCII that isn't in the class. */
10353 while (hardcount < max && scan < this_eol
10354 && ( ! isASCII_utf8_safe(scan, loceol)
10355 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
10357 scan += UTF8SKIP(scan);
10368 if (! utf8_target) {
10369 while (scan < this_eol && to_complement
10370 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
10377 classnum = (_char_class_number) FLAGS(p);
10378 switch (classnum) {
10380 while ( hardcount < max && scan < this_eol
10381 && to_complement ^ cBOOL(_invlist_contains_cp(
10382 PL_XPosix_ptrs[classnum],
10383 utf8_to_uvchr_buf((U8 *) scan,
10387 scan += UTF8SKIP(scan);
10392 /* For the classes below, the knowledge of how to handle
10393 * every code point is compiled in to Perl via a macro.
10394 * This code is written for making the loops as tight as
10395 * possible. It could be refactored to save space instead.
10398 case _CC_ENUM_SPACE:
10399 while (hardcount < max
10402 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10404 scan += UTF8SKIP(scan);
10408 case _CC_ENUM_BLANK:
10409 while (hardcount < max
10412 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10414 scan += UTF8SKIP(scan);
10418 case _CC_ENUM_XDIGIT:
10419 while (hardcount < max
10422 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10424 scan += UTF8SKIP(scan);
10428 case _CC_ENUM_VERTSPACE:
10429 while (hardcount < max
10432 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10434 scan += UTF8SKIP(scan);
10438 case _CC_ENUM_CNTRL:
10439 while (hardcount < max
10442 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10444 scan += UTF8SKIP(scan);
10454 while (hardcount < max && scan < this_eol &&
10455 (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
10460 /* LNBREAK can match one or two latin chars, which is ok, but we
10461 * have to use hardcount in this situation, and throw away the
10462 * adjustment to <this_eol> done before the switch statement */
10463 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
10471 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10472 NOT_REACHED; /* NOTREACHED */
10479 c = scan - *startposp;
10483 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10485 SV * const prop = sv_newmortal();
10486 regprop(prog, prop, p, reginfo, NULL);
10487 Perl_re_exec_indentf( aTHX_ "%s can match %" IVdf " times out of %" IVdf "...\n",
10488 depth, SvPVX_const(prop),(IV)c,(IV)max);
10496 - reginclass - determine if a character falls into a character class
10498 n is the ANYOF-type regnode
10499 p is the target string
10500 p_end points to one byte beyond the end of the target string
10501 utf8_target tells whether p is in UTF-8.
10503 Returns true if matched; false otherwise.
10505 Note that this can be a synthetic start class, a combination of various
10506 nodes, so things you think might be mutually exclusive, such as locale,
10507 aren't. It can match both locale and non-locale
10512 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10514 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10517 bool match = FALSE;
10520 PERL_ARGS_ASSERT_REGINCLASS;
10522 /* If c is not already the code point, get it. Note that
10523 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10524 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10526 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10527 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10528 if (c_len == (STRLEN)-1) {
10529 _force_out_malformed_utf8_message(p, p_end,
10531 1 /* 1 means die */ );
10532 NOT_REACHED; /* NOTREACHED */
10535 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10536 && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10538 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10542 /* If this character is potentially in the bitmap, check it */
10543 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10544 if (ANYOF_BITMAP_TEST(n, c))
10547 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10554 else if (flags & ANYOF_LOCALE_FLAGS) {
10555 if ( (flags & ANYOFL_FOLD)
10557 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10561 else if ( ANYOF_POSIXL_TEST_ANY_SET(n)
10562 && c <= U8_MAX /* param to isFOO_lc() */
10565 /* The data structure is arranged so bits 0, 2, 4, ... are set
10566 * if the class includes the Posix character class given by
10567 * bit/2; and 1, 3, 5, ... are set if the class includes the
10568 * complemented Posix class given by int(bit/2). So we loop
10569 * through the bits, each time changing whether we complement
10570 * the result or not. Suppose for the sake of illustration
10571 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
10572 * is set, it means there is a match for this ANYOF node if the
10573 * character is in the class given by the expression (0 / 2 = 0
10574 * = \w). If it is in that class, isFOO_lc() will return 1,
10575 * and since 'to_complement' is 0, the result will stay TRUE,
10576 * and we exit the loop. Suppose instead that bit 0 is 0, but
10577 * bit 1 is 1. That means there is a match if the character
10578 * matches \W. We won't bother to call isFOO_lc() on bit 0,
10579 * but will on bit 1. On the second iteration 'to_complement'
10580 * will be 1, so the exclusive or will reverse things, so we
10581 * are testing for \W. On the third iteration, 'to_complement'
10582 * will be 0, and we would be testing for \s; the fourth
10583 * iteration would test for \S, etc.
10585 * Note that this code assumes that all the classes are closed
10586 * under folding. For example, if a character matches \w, then
10587 * its fold does too; and vice versa. This should be true for
10588 * any well-behaved locale for all the currently defined Posix
10589 * classes, except for :lower: and :upper:, which are handled
10590 * by the pseudo-class :cased: which matches if either of the
10591 * other two does. To get rid of this assumption, an outer
10592 * loop could be used below to iterate over both the source
10593 * character, and its fold (if different) */
10596 int to_complement = 0;
10598 while (count < ANYOF_MAX) {
10599 if (ANYOF_POSIXL_TEST(n, count)
10600 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
10606 to_complement ^= 1;
10613 /* If the bitmap didn't (or couldn't) match, and something outside the
10614 * bitmap could match, try that. */
10616 if (c >= NUM_ANYOF_CODE_POINTS
10617 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10619 match = TRUE; /* Everything above the bitmap matches */
10621 /* Here doesn't match everything above the bitmap. If there is
10622 * some information available beyond the bitmap, we may find a
10623 * match in it. If so, this is most likely because the code point
10624 * is outside the bitmap range. But rarely, it could be because of
10625 * some other reason. If so, various flags are set to indicate
10626 * this possibility. On ANYOFD nodes, there may be matches that
10627 * happen only when the target string is UTF-8; or for other node
10628 * types, because runtime lookup is needed, regardless of the
10629 * UTF-8ness of the target string. Finally, under /il, there may
10630 * be some matches only possible if the locale is a UTF-8 one. */
10631 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
10632 && ( c >= NUM_ANYOF_CODE_POINTS
10633 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10634 && ( UNLIKELY(OP(n) != ANYOFD)
10635 || (utf8_target && ! isASCII_uni(c)
10636 # if NUM_ANYOF_CODE_POINTS > 256
10640 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10641 && IN_UTF8_CTYPE_LOCALE)))
10643 SV* only_utf8_locale = NULL;
10644 SV * const definition =
10645 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10646 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10647 &only_utf8_locale, NULL);
10649 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10650 &only_utf8_locale, NULL);
10657 } else { /* Convert to utf8 */
10658 utf8_p = utf8_buffer;
10659 append_utf8_from_native_byte(*p, &utf8_p);
10660 utf8_p = utf8_buffer;
10663 /* Turkish locales have these hard-coded rules overriding
10665 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10666 && isALPHA_FOLD_EQ(*p, 'i'))
10669 if (_invlist_contains_cp(definition,
10670 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10675 else if (*p == 'I') {
10676 if (_invlist_contains_cp(definition,
10677 LATIN_SMALL_LETTER_DOTLESS_I))
10683 else if (_invlist_contains_cp(definition, c)) {
10687 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10688 match = _invlist_contains_cp(only_utf8_locale, c);
10692 /* In a Turkic locale under folding, hard-code the I i case pair
10694 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10696 && (flags & ANYOFL_FOLD)
10699 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10700 if (ANYOF_BITMAP_TEST(n, 'i')) {
10704 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10705 if (ANYOF_BITMAP_TEST(n, 'I')) {
10711 if (UNICODE_IS_SUPER(c)
10713 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10715 && ckWARN_d(WARN_NON_UNICODE))
10717 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10718 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10722 #if ANYOF_INVERT != 1
10723 /* Depending on compiler optimization cBOOL takes time, so if don't have to
10725 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10728 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10729 return (flags & ANYOF_INVERT) ^ match;
10733 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10735 /* return the position 'off' UTF-8 characters away from 's', forward if
10736 * 'off' >= 0, backwards if negative. But don't go outside of position
10737 * 'lim', which better be < s if off < 0 */
10739 PERL_ARGS_ASSERT_REGHOP3;
10742 while (off-- && s < lim) {
10743 /* XXX could check well-formedness here */
10744 U8 *new_s = s + UTF8SKIP(s);
10745 if (new_s > lim) /* lim may be in the middle of a long character */
10751 while (off++ && s > lim) {
10753 if (UTF8_IS_CONTINUED(*s)) {
10754 while (s > lim && UTF8_IS_CONTINUATION(*s))
10756 if (! UTF8_IS_START(*s)) {
10757 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10760 /* XXX could check well-formedness here */
10767 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10769 PERL_ARGS_ASSERT_REGHOP4;
10772 while (off-- && s < rlim) {
10773 /* XXX could check well-formedness here */
10778 while (off++ && s > llim) {
10780 if (UTF8_IS_CONTINUED(*s)) {
10781 while (s > llim && UTF8_IS_CONTINUATION(*s))
10783 if (! UTF8_IS_START(*s)) {
10784 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10787 /* XXX could check well-formedness here */
10793 /* like reghop3, but returns NULL on overrun, rather than returning last
10797 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10799 PERL_ARGS_ASSERT_REGHOPMAYBE3;
10802 while (off-- && s < lim) {
10803 /* XXX could check well-formedness here */
10810 while (off++ && s > lim) {
10812 if (UTF8_IS_CONTINUED(*s)) {
10813 while (s > lim && UTF8_IS_CONTINUATION(*s))
10815 if (! UTF8_IS_START(*s)) {
10816 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10819 /* XXX could check well-formedness here */
10828 /* when executing a regex that may have (?{}), extra stuff needs setting
10829 up that will be visible to the called code, even before the current
10830 match has finished. In particular:
10832 * $_ is localised to the SV currently being matched;
10833 * pos($_) is created if necessary, ready to be updated on each call-out
10835 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10836 isn't set until the current pattern is successfully finished), so that
10837 $1 etc of the match-so-far can be seen;
10838 * save the old values of subbeg etc of the current regex, and set then
10839 to the current string (again, this is normally only done at the end
10844 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10847 regexp *const rex = ReANY(reginfo->prog);
10848 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10850 eval_state->rex = rex;
10851 eval_state->sv = reginfo->sv;
10854 /* Make $_ available to executed code. */
10855 if (reginfo->sv != DEFSV) {
10857 DEFSV_set(reginfo->sv);
10859 /* will be dec'd by S_cleanup_regmatch_info_aux */
10860 SvREFCNT_inc_NN(reginfo->sv);
10862 if (!(mg = mg_find_mglob(reginfo->sv))) {
10863 /* prepare for quick setting of pos */
10864 mg = sv_magicext_mglob(reginfo->sv);
10867 eval_state->pos_magic = mg;
10868 eval_state->pos = mg->mg_len;
10869 eval_state->pos_flags = mg->mg_flags;
10872 eval_state->pos_magic = NULL;
10874 if (!PL_reg_curpm) {
10875 /* PL_reg_curpm is a fake PMOP that we can attach the current
10876 * regex to and point PL_curpm at, so that $1 et al are visible
10877 * within a /(?{})/. It's just allocated once per interpreter the
10878 * first time its needed */
10879 Newxz(PL_reg_curpm, 1, PMOP);
10880 #ifdef USE_ITHREADS
10882 SV* const repointer = &PL_sv_undef;
10883 /* this regexp is also owned by the new PL_reg_curpm, which
10884 will try to free it. */
10885 av_push(PL_regex_padav, repointer);
10886 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
10887 PL_regex_pad = AvARRAY(PL_regex_padav);
10891 SET_reg_curpm(reginfo->prog);
10892 eval_state->curpm = PL_curpm;
10893 PL_curpm_under = PL_curpm;
10894 PL_curpm = PL_reg_curpm;
10895 if (RXp_MATCH_COPIED(rex)) {
10896 /* Here is a serious problem: we cannot rewrite subbeg,
10897 since it may be needed if this match fails. Thus
10898 $` inside (?{}) could fail... */
10899 eval_state->subbeg = rex->subbeg;
10900 eval_state->sublen = rex->sublen;
10901 eval_state->suboffset = rex->suboffset;
10902 eval_state->subcoffset = rex->subcoffset;
10903 #ifdef PERL_ANY_COW
10904 eval_state->saved_copy = rex->saved_copy;
10906 RXp_MATCH_COPIED_off(rex);
10909 eval_state->subbeg = NULL;
10910 rex->subbeg = (char *)reginfo->strbeg;
10911 rex->suboffset = 0;
10912 rex->subcoffset = 0;
10913 rex->sublen = reginfo->strend - reginfo->strbeg;
10917 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10920 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10922 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10923 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
10926 Safefree(aux->poscache);
10930 /* undo the effects of S_setup_eval_state() */
10932 if (eval_state->subbeg) {
10933 regexp * const rex = eval_state->rex;
10934 rex->subbeg = eval_state->subbeg;
10935 rex->sublen = eval_state->sublen;
10936 rex->suboffset = eval_state->suboffset;
10937 rex->subcoffset = eval_state->subcoffset;
10938 #ifdef PERL_ANY_COW
10939 rex->saved_copy = eval_state->saved_copy;
10941 RXp_MATCH_COPIED_on(rex);
10943 if (eval_state->pos_magic)
10945 eval_state->pos_magic->mg_len = eval_state->pos;
10946 eval_state->pos_magic->mg_flags =
10947 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10948 | (eval_state->pos_flags & MGf_BYTES);
10951 PL_curpm = eval_state->curpm;
10952 SvREFCNT_dec(eval_state->sv);
10955 PL_regmatch_state = aux->old_regmatch_state;
10956 PL_regmatch_slab = aux->old_regmatch_slab;
10958 /* free all slabs above current one - this must be the last action
10959 * of this function, as aux and eval_state are allocated within
10960 * slabs and may be freed here */
10962 s = PL_regmatch_slab->next;
10964 PL_regmatch_slab->next = NULL;
10966 regmatch_slab * const osl = s;
10975 S_to_utf8_substr(pTHX_ regexp *prog)
10977 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10978 * on the converted value */
10982 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10985 if (prog->substrs->data[i].substr
10986 && !prog->substrs->data[i].utf8_substr) {
10987 SV* const sv = newSVsv(prog->substrs->data[i].substr);
10988 prog->substrs->data[i].utf8_substr = sv;
10989 sv_utf8_upgrade(sv);
10990 if (SvVALID(prog->substrs->data[i].substr)) {
10991 if (SvTAIL(prog->substrs->data[i].substr)) {
10992 /* Trim the trailing \n that fbm_compile added last
10994 SvCUR_set(sv, SvCUR(sv) - 1);
10995 /* Whilst this makes the SV technically "invalid" (as its
10996 buffer is no longer followed by "\0") when fbm_compile()
10997 adds the "\n" back, a "\0" is restored. */
10998 fbm_compile(sv, FBMcf_TAIL);
11000 fbm_compile(sv, 0);
11002 if (prog->substrs->data[i].substr == prog->check_substr)
11003 prog->check_utf8 = sv;
11009 S_to_byte_substr(pTHX_ regexp *prog)
11011 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11012 * on the converted value; returns FALSE if can't be converted. */
11016 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11019 if (prog->substrs->data[i].utf8_substr
11020 && !prog->substrs->data[i].substr) {
11021 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11022 if (! sv_utf8_downgrade(sv, TRUE)) {
11023 SvREFCNT_dec_NN(sv);
11026 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11027 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11028 /* Trim the trailing \n that fbm_compile added last
11030 SvCUR_set(sv, SvCUR(sv) - 1);
11031 fbm_compile(sv, FBMcf_TAIL);
11033 fbm_compile(sv, 0);
11035 prog->substrs->data[i].substr = sv;
11036 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11037 prog->check_substr = sv;
11044 #ifndef PERL_IN_XSUB_RE
11047 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11049 /* Temporary helper function for toke.c. Verify that the code point 'cp'
11050 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
11051 * the larger string bounded by 'strbeg' and 'strend'.
11053 * 'cp' needs to be assigned (if not, a future version of the Unicode
11054 * Standard could make it something that combines with adjacent characters,
11055 * so code using it would then break), and there has to be a GCB break
11056 * before and after the character. */
11059 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11060 const U8 * prev_cp_start;
11062 PERL_ARGS_ASSERT_IS_GRAPHEME;
11064 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
11065 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11067 /* These are considered graphemes */
11071 /* Otherwise, unassigned code points are forbidden */
11072 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11073 _invlist_search(PL_Assigned_invlist, cp))))
11078 cp_gcb_val = getGCB_VAL_CP(cp);
11080 /* Find the GCB value of the previous code point in the input */
11081 prev_cp_start = utf8_hop_back(s, -1, strbeg);
11082 if (UNLIKELY(prev_cp_start == s)) {
11083 prev_cp_gcb_val = GCB_EDGE;
11086 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11089 /* And check that is a grapheme boundary */
11090 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11091 TRUE /* is UTF-8 encoded */ ))
11096 /* Similarly verify there is a break between the current character and the
11100 next_cp_gcb_val = GCB_EDGE;
11103 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11106 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11110 =for apidoc_section $unicode
11112 =for apidoc isSCRIPT_RUN
11114 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11115 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
11116 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
11117 two degenerate cases given below, this function returns TRUE iff all code
11118 points in it come from any combination of three "scripts" given by the Unicode
11119 "Script Extensions" property: Common, Inherited, and possibly one other.
11120 Additionally all decimal digits must come from the same consecutive sequence of
11123 For example, if all the characters in the sequence are Greek, or Common, or
11124 Inherited, this function will return TRUE, provided any decimal digits in it
11125 are from the same block of digits in Common. (These are the ASCII digits
11126 "0".."9" and additionally a block for full width forms of these, and several
11127 others used in mathematical notation.) For scripts (unlike Greek) that have
11128 their own digits defined this will accept either digits from that set or from
11129 one of the Common digit sets, but not a combination of the two. Some scripts,
11130 such as Arabic, have more than one set of digits. All digits must come from
11131 the same set for this function to return TRUE.
11133 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11134 contain the script found, using the C<SCX_enum> typedef. Its value will be
11135 C<SCX_INVALID> if the function returns FALSE.
11137 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11138 will be C<SCX_INVALID>.
11140 If the sequence contains a single code point which is unassigned to a character
11141 in the version of Unicode being used, the function will return TRUE, and the
11142 script will be C<SCX_Unknown>. Any other combination of unassigned code points
11143 in the input sequence will result in the function treating the input as not
11144 being a script run.
11146 The returned script will be C<SCX_Inherited> iff all the code points in it are
11147 from the Inherited script.
11149 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11150 it are from the Inherited or Common scripts.
11157 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11159 /* Basically, it looks at each character in the sequence to see if the
11160 * above conditions are met; if not it fails. It uses an inversion map to
11161 * find the enum corresponding to the script of each character. But this
11162 * is complicated by the fact that a few code points can be in any of
11163 * several scripts. The data has been constructed so that there are
11164 * additional enum values (all negative) for these situations. The
11165 * absolute value of those is an index into another table which contains
11166 * pointers to auxiliary tables for each such situation. Each aux array
11167 * lists all the scripts for the given situation. There is another,
11168 * parallel, table that gives the number of entries in each aux table.
11169 * These are all defined in charclass_invlists.h */
11171 /* XXX Here are the additional things UTS 39 says could be done:
11173 * Forbid sequences of the same nonspacing mark
11175 * Check to see that all the characters are in the sets of exemplar
11176 * characters for at least one language in the Unicode Common Locale Data
11177 * Repository [CLDR]. */
11180 /* Things that match /\d/u */
11181 SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
11182 UV * decimals_array = invlist_array(decimals_invlist);
11184 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11185 * not currently known) */
11186 UV zero_of_run = 0;
11188 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
11189 SCX_enum script_of_char = SCX_INVALID;
11191 /* If the script remains not fully determined from iteration to iteration,
11192 * this is the current intersection of the possiblities. */
11193 SCX_enum * intersection = NULL;
11194 PERL_UINT_FAST8_T intersection_len = 0;
11196 bool retval = TRUE;
11197 SCX_enum * ret_script = NULL;
11201 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11203 /* All code points in 0..255 are either Common or Latin, so must be a
11204 * script run. We can return immediately unless we need to know which
11206 if (! utf8_target && LIKELY(send > s)) {
11207 if (ret_script == NULL) {
11211 /* If any character is Latin, the run is Latin */
11213 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11214 *ret_script = SCX_Latin;
11219 /* Here, all are Common */
11220 *ret_script = SCX_Common;
11224 /* Look at each character in the sequence */
11226 /* If the current character being examined is a digit, this is the code
11227 * point of the zero for its sequence of 10 */
11232 /* The code allows all scripts to use the ASCII digits. This is
11233 * because they are in the Common script. Hence any ASCII ones found
11234 * are ok, unless and until a digit from another set has already been
11235 * encountered. digit ranges in Common are not similarly blessed) */
11236 if (UNLIKELY(isDIGIT(*s))) {
11237 if (UNLIKELY(script_of_run == SCX_Unknown)) {
11242 if (zero_of_run != '0') {
11254 /* Here, isn't an ASCII digit. Find the code point of the character */
11255 if (! UTF8_IS_INVARIANT(*s)) {
11257 cp = valid_utf8_to_uvchr((U8 *) s, &len);
11264 /* If is within the range [+0 .. +9] of the script's zero, it also is a
11265 * digit in that script. We can skip the rest of this code for this
11267 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11271 /* Find the character's script. The correct values are hard-coded here
11272 * for small-enough code points. */
11273 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
11274 unlikely to change */
11276 || ( isALPHA_L1(cp)
11277 && LIKELY(cp != MICRO_SIGN_NATIVE)))
11279 script_of_char = SCX_Latin;
11282 script_of_char = SCX_Common;
11286 script_of_char = _Perl_SCX_invmap[
11287 _invlist_search(PL_SCX_invlist, cp)];
11290 /* We arbitrarily accept a single unassigned character, but not in
11291 * combination with anything else, and not a run of them. */
11292 if ( UNLIKELY(script_of_run == SCX_Unknown)
11293 || UNLIKELY( script_of_run != SCX_INVALID
11294 && script_of_char == SCX_Unknown))
11300 /* For the first character, or the run is inherited, the run's script
11301 * is set to the char's */
11302 if ( UNLIKELY(script_of_run == SCX_INVALID)
11303 || UNLIKELY(script_of_run == SCX_Inherited))
11305 script_of_run = script_of_char;
11308 /* For the character's script to be Unknown, it must be the first
11309 * character in the sequence (for otherwise a test above would have
11310 * prevented us from reaching here), and we have set the run's script
11311 * to it. Nothing further to be done for this character */
11312 if (UNLIKELY(script_of_char == SCX_Unknown)) {
11316 /* We accept 'inherited' script characters currently even at the
11317 * beginning. (We know that no characters in Inherited are digits, or
11318 * we'd have to check for that) */
11319 if (UNLIKELY(script_of_char == SCX_Inherited)) {
11323 /* If the run so far is Common, and the new character isn't, change the
11324 * run's script to that of this character */
11325 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11326 script_of_run = script_of_char;
11329 /* Now we can see if the script of the new character is the same as
11330 * that of the run */
11331 if (LIKELY(script_of_char == script_of_run)) {
11332 /* By far the most common case */
11333 goto scripts_match;
11336 /* Here, the script of the run isn't Common. But characters in Common
11337 * match any script */
11338 if (script_of_char == SCX_Common) {
11339 goto scripts_match;
11342 #ifndef HAS_SCX_AUX_TABLES
11344 /* Too early a Unicode version to have a code point belonging to more
11345 * than one script, so, if the scripts don't exactly match, fail */
11346 PERL_UNUSED_VAR(intersection_len);
11352 /* Here there is no exact match between the character's script and the
11353 * run's. And we've handled the special cases of scripts Unknown,
11354 * Inherited, and Common.
11356 * Negative script numbers signify that the value may be any of several
11357 * scripts, and we need to look at auxiliary information to make our
11358 * deterimination. But if both are non-negative, we can fail now */
11359 if (LIKELY(script_of_char >= 0)) {
11360 const SCX_enum * search_in;
11361 PERL_UINT_FAST8_T search_in_len;
11362 PERL_UINT_FAST8_T i;
11364 if (LIKELY(script_of_run >= 0)) {
11369 /* Use the previously constructed set of possible scripts, if any.
11371 if (intersection) {
11372 search_in = intersection;
11373 search_in_len = intersection_len;
11376 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11377 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11380 for (i = 0; i < search_in_len; i++) {
11381 if (search_in[i] == script_of_char) {
11382 script_of_run = script_of_char;
11383 goto scripts_match;
11390 else if (LIKELY(script_of_run >= 0)) {
11391 /* script of character could be one of several, but run is a single
11393 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11394 const PERL_UINT_FAST8_T search_in_len
11395 = SCX_AUX_TABLE_lengths[-script_of_char];
11396 PERL_UINT_FAST8_T i;
11398 for (i = 0; i < search_in_len; i++) {
11399 if (search_in[i] == script_of_run) {
11400 script_of_char = script_of_run;
11401 goto scripts_match;
11409 /* Both run and char could be in one of several scripts. If the
11410 * intersection is empty, then this character isn't in this script
11411 * run. Otherwise, we need to calculate the intersection to use
11412 * for future iterations of the loop, unless we are already at the
11413 * final character */
11414 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11415 const PERL_UINT_FAST8_T char_len
11416 = SCX_AUX_TABLE_lengths[-script_of_char];
11417 const SCX_enum * search_run;
11418 PERL_UINT_FAST8_T run_len;
11420 SCX_enum * new_overlap = NULL;
11421 PERL_UINT_FAST8_T i, j;
11423 if (intersection) {
11424 search_run = intersection;
11425 run_len = intersection_len;
11428 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11429 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11432 intersection_len = 0;
11434 for (i = 0; i < run_len; i++) {
11435 for (j = 0; j < char_len; j++) {
11436 if (search_run[i] == search_char[j]) {
11438 /* Here, the script at i,j matches. That means this
11439 * character is in the run. But continue on to find
11440 * the complete intersection, for the next loop
11441 * iteration, and for the digit check after it.
11443 * On the first found common script, we malloc space
11444 * for the intersection list for the worst case of the
11445 * intersection, which is the minimum of the number of
11446 * scripts remaining in each set. */
11447 if (intersection_len == 0) {
11449 MIN(run_len - i, char_len - j),
11452 new_overlap[intersection_len++] = search_run[i];
11457 /* Here we've looked through everything. If they have no scripts
11458 * in common, not a run */
11459 if (intersection_len == 0) {
11464 /* If there is only a single script in common, set to that.
11465 * Otherwise, use the intersection going forward */
11466 Safefree(intersection);
11467 intersection = NULL;
11468 if (intersection_len == 1) {
11469 script_of_run = script_of_char = new_overlap[0];
11470 Safefree(new_overlap);
11471 new_overlap = NULL;
11474 intersection = new_overlap;
11482 /* Here, the script of the character is compatible with that of the
11483 * run. That means that in most cases, it continues the script run.
11484 * Either it and the run match exactly, or one or both can be in any of
11485 * several scripts, and the intersection is not empty. However, if the
11486 * character is a decimal digit, it could still mean failure if it is
11487 * from the wrong sequence of 10. So, we need to look at if it's a
11488 * digit. We've already handled the 10 digits [0-9], and the next
11489 * lowest one is this one: */
11490 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11491 continue; /* Not a digit; this character is part of the run */
11494 /* If we have a definitive '0' for the script of this character, we
11495 * know that for this to be a digit, it must be in the range of +0..+9
11497 if ( script_of_char >= 0
11498 && (zero_of_char = script_zeros[script_of_char]))
11500 if (! withinCOUNT(cp, zero_of_char, 9)) {
11501 continue; /* Not a digit; this character is part of the run
11506 else { /* Need to look up if this character is a digit or not */
11507 SSize_t index_of_zero_of_char;
11508 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11509 if ( UNLIKELY(index_of_zero_of_char < 0)
11510 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11512 continue; /* Not a digit; this character is part of the run.
11516 zero_of_char = decimals_array[index_of_zero_of_char];
11519 /* Here, the character is a decimal digit, and the zero of its sequence
11520 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
11521 * they better be the same. */
11523 if (zero_of_run != zero_of_char) {
11528 else { /* Otherwise we now have a zero for this run */
11529 zero_of_run = zero_of_char;
11531 } /* end of looping through CLOSESR text */
11533 Safefree(intersection);
11535 if (ret_script != NULL) {
11537 *ret_script = script_of_run;
11540 *ret_script = SCX_INVALID;
11547 #endif /* ifndef PERL_IN_XSUB_RE */
11550 * ex: set ts=8 sts=4 sw=4 et: