5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #include "invlist_inline.h"
84 #include "unicode_constants.h"
86 static const char b_utf8_locale_required[] =
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong."
88 " Assuming a UTF-8 locale";
90 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \
92 if (! IN_UTF8_CTYPE_LOCALE) { \
93 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
94 b_utf8_locale_required); \
98 static const char sets_utf8_locale_required[] =
99 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
101 #define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \
103 if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \
104 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \
105 sets_utf8_locale_required); \
110 /* At least one required character in the target string is expressible only in
112 static const char non_utf8_target_but_utf8_required[]
113 = "Can't match, because target string needs to be in UTF-8\n";
116 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
117 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
122 #define STATIC static
129 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
131 #define HOPc(pos,off) \
132 (char *)(reginfo->is_utf8_target \
133 ? reghop3((U8*)pos, off, \
134 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
137 /* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */
138 #define HOPBACK3(pos, off, lim) \
139 (reginfo->is_utf8_target \
140 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \
141 : (pos - off >= lim) \
145 #define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg))
147 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
148 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
150 /* lim must be +ve. Returns NULL on overshoot */
151 #define HOPMAYBE3(pos,off,lim) \
152 (reginfo->is_utf8_target \
153 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
154 : ((U8*)pos + off <= lim) \
158 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
159 * off must be >=0; args should be vars rather than expressions */
160 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
161 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
162 : (U8*)((pos + off) > lim ? lim : (pos + off)))
163 #define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
165 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
166 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
168 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
170 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
171 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
173 /* for use after a quantifier and before an EXACT-like node -- japhy */
174 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
176 * NOTE that *nothing* that affects backtracking should be in here, specifically
177 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
178 * node that is in between two EXACT like nodes when ascertaining what the required
179 * "follow" character is. This should probably be moved to regex compile time
180 * although it may be done at run time beause of the REF possibility - more
181 * investigation required. -- demerphq
183 #define JUMPABLE(rn) ( \
185 (OP(rn) == CLOSE && \
186 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
188 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
189 OP(rn) == PLUS || OP(rn) == MINMOD || \
191 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
193 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
195 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
198 Search for mandatory following text node; for lookahead, the text must
199 follow but for lookbehind (rn->flags != 0) we skip to the next step.
201 #define FIND_NEXT_IMPT(rn) STMT_START { \
202 while (JUMPABLE(rn)) { \
203 const OPCODE type = OP(rn); \
204 if (type == SUSPEND || PL_regkind[type] == CURLY) \
205 rn = NEXTOPER(NEXTOPER(rn)); \
206 else if (type == PLUS) \
208 else if (type == IFMATCH) \
209 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
210 else rn += NEXT_OFF(rn); \
214 #define SLAB_FIRST(s) (&(s)->states[0])
215 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
217 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
218 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
219 static regmatch_state * S_push_slab(pTHX);
221 #define REGCP_PAREN_ELEMS 3
222 #define REGCP_OTHER_ELEMS 3
223 #define REGCP_FRAME_ELEMS 1
224 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
225 * are needed for the regexp context stack bookkeeping. */
228 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
230 const int retval = PL_savestack_ix;
231 const int paren_elems_to_push =
232 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
233 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
234 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
236 DECLARE_AND_GET_RE_DEBUG_FLAGS;
238 PERL_ARGS_ASSERT_REGCPPUSH;
240 if (paren_elems_to_push < 0)
241 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
242 (int)paren_elems_to_push, (int)maxopenparen,
243 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
245 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
246 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
247 " out of range (%lu-%ld)",
249 (unsigned long)maxopenparen,
252 SSGROW(total_elems + REGCP_FRAME_ELEMS);
255 if ((int)maxopenparen > (int)parenfloor)
256 Perl_re_exec_indentf( aTHX_
257 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
263 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
264 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
265 SSPUSHIV(rex->offs[p].end);
266 SSPUSHIV(rex->offs[p].start);
267 SSPUSHINT(rex->offs[p].start_tmp);
268 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
269 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
272 (IV)rex->offs[p].start,
273 (IV)rex->offs[p].start_tmp,
277 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
278 SSPUSHINT(maxopenparen);
279 SSPUSHINT(rex->lastparen);
280 SSPUSHINT(rex->lastcloseparen);
281 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
286 /* These are needed since we do not localize EVAL nodes: */
287 #define REGCP_SET(cp) \
289 Perl_re_exec_indentf( aTHX_ \
290 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
291 depth, (IV)PL_savestack_ix \
296 #define REGCP_UNWIND(cp) \
298 if (cp != PL_savestack_ix) \
299 Perl_re_exec_indentf( aTHX_ \
300 "Clearing an EVAL scope, savestack=%" \
301 IVdf "..%" IVdf "\n", \
302 depth, (IV)(cp), (IV)PL_savestack_ix \
307 /* set the start and end positions of capture ix */
308 #define CLOSE_CAPTURE(ix, s, e) \
309 rex->offs[ix].start = s; \
310 rex->offs[ix].end = e; \
311 if (ix > rex->lastparen) \
312 rex->lastparen = ix; \
313 rex->lastcloseparen = ix; \
314 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
315 "CLOSE: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf " max: %" UVuf "\n", \
320 (IV)rex->offs[ix].start, \
321 (IV)rex->offs[ix].end, \
325 #define UNWIND_PAREN(lp, lcp) \
326 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
327 "UNWIND_PAREN: rex=0x%" UVxf " offs=0x%" UVxf ": invalidate (%" UVuf "..%" UVuf "] set lcp: %" UVuf "\n", \
332 (UV)(rex->lastparen), \
335 for (n = rex->lastparen; n > lp; n--) \
336 rex->offs[n].end = -1; \
337 rex->lastparen = n; \
338 rex->lastcloseparen = lcp;
342 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
346 DECLARE_AND_GET_RE_DEBUG_FLAGS;
348 PERL_ARGS_ASSERT_REGCPPOP;
350 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
352 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
353 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
354 rex->lastcloseparen = SSPOPINT;
355 rex->lastparen = SSPOPINT;
356 *maxopenparen_p = SSPOPINT;
358 i -= REGCP_OTHER_ELEMS;
359 /* Now restore the parentheses context. */
361 if (i || rex->lastparen + 1 <= rex->nparens)
362 Perl_re_exec_indentf( aTHX_
363 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
369 paren = *maxopenparen_p;
370 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
372 rex->offs[paren].start_tmp = SSPOPINT;
373 rex->offs[paren].start = SSPOPIV;
375 if (paren <= rex->lastparen)
376 rex->offs[paren].end = tmps;
377 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
378 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
381 (IV)rex->offs[paren].start,
382 (IV)rex->offs[paren].start_tmp,
383 (IV)rex->offs[paren].end,
384 (paren > rex->lastparen ? "(skipped)" : ""));
389 /* It would seem that the similar code in regtry()
390 * already takes care of this, and in fact it is in
391 * a better location to since this code can #if 0-ed out
392 * but the code in regtry() is needed or otherwise tests
393 * requiring null fields (pat.t#187 and split.t#{13,14}
394 * (as of patchlevel 7877) will fail. Then again,
395 * this code seems to be necessary or otherwise
396 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
397 * --jhi updated by dapm */
398 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
399 if (i > *maxopenparen_p)
400 rex->offs[i].start = -1;
401 rex->offs[i].end = -1;
402 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
403 " \\%" UVuf ": %s ..-1 undeffing\n",
406 (i > *maxopenparen_p) ? "-1" : " "
412 /* restore the parens and associated vars at savestack position ix,
413 * but without popping the stack */
416 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
418 I32 tmpix = PL_savestack_ix;
419 PERL_ARGS_ASSERT_REGCP_RESTORE;
421 PL_savestack_ix = ix;
422 regcppop(rex, maxopenparen_p);
423 PL_savestack_ix = tmpix;
426 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
429 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
431 /* Returns a boolean as to whether or not 'character' is a member of the
432 * Posix character class given by 'classnum' that should be equivalent to a
433 * value in the typedef '_char_class_number'.
435 * Ideally this could be replaced by a just an array of function pointers
436 * to the C library functions that implement the macros this calls.
437 * However, to compile, the precise function signatures are required, and
438 * these may vary from platform to platform. To avoid having to figure
439 * out what those all are on each platform, I (khw) am using this method,
440 * which adds an extra layer of function call overhead (unless the C
441 * optimizer strips it away). But we don't particularly care about
442 * performance with locales anyway. */
444 switch ((_char_class_number) classnum) {
445 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
446 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
447 case _CC_ENUM_ASCII: return isASCII_LC(character);
448 case _CC_ENUM_BLANK: return isBLANK_LC(character);
449 case _CC_ENUM_CASED: return isLOWER_LC(character)
450 || isUPPER_LC(character);
451 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
452 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
453 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
454 case _CC_ENUM_LOWER: return isLOWER_LC(character);
455 case _CC_ENUM_PRINT: return isPRINT_LC(character);
456 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
457 case _CC_ENUM_SPACE: return isSPACE_LC(character);
458 case _CC_ENUM_UPPER: return isUPPER_LC(character);
459 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
460 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
461 default: /* VERTSPACE should never occur in locales */
462 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
465 NOT_REACHED; /* NOTREACHED */
469 PERL_STATIC_INLINE I32
470 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len)
472 /* Compare non-UTF-8 using Unicode (Latin1) semantics. s2 must already be
473 * folded. Works on all folds representable without UTF-8, except for
474 * LATIN_SMALL_LETTER_SHARP_S, and does not check for this. Nor does it
475 * check that the strings each have at least 'len' characters.
477 * There is almost an identical API function where s2 need not be folded:
478 * Perl_foldEQ_latin1() */
480 const U8 *a = (const U8 *)s1;
481 const U8 *b = (const U8 *)s2;
483 PERL_ARGS_ASSERT_FOLDEQ_LATIN1_S2_FOLDED;
488 assert(! isUPPER_L1(*b));
489 if (toLOWER_L1(*a) != *b) {
498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e)
500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501 * 'character' is a member of the Posix character class given by 'classnum'
502 * that should be equivalent to a value in the typedef
503 * '_char_class_number'.
505 * This just calls isFOO_lc on the code point for the character if it is in
506 * the range 0-255. Outside that range, all characters use Unicode
507 * rules, ignoring any locale. So use the Unicode function if this class
508 * requires an inversion list, and use the Unicode macro otherwise. */
511 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
513 if (UTF8_IS_INVARIANT(*character)) {
514 return isFOO_lc(classnum, *character);
516 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
517 return isFOO_lc(classnum,
518 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
521 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, e);
523 switch ((_char_class_number) classnum) {
524 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
525 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
526 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
527 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
529 return _invlist_contains_cp(PL_XPosix_ptrs[classnum],
530 utf8_to_uvchr_buf(character, e, NULL));
533 return FALSE; /* Things like CNTRL are always below 256 */
537 S_find_span_end(U8 * s, const U8 * send, const U8 span_byte)
539 /* Returns the position of the first byte in the sequence between 's' and
540 * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found.
543 PERL_ARGS_ASSERT_FIND_SPAN_END;
547 if ((STRLEN) (send - s) >= PERL_WORDSIZE
548 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
549 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
551 PERL_UINTMAX_T span_word;
553 /* Process per-byte until reach word boundary. XXX This loop could be
554 * eliminated if we knew that this platform had fast unaligned reads */
555 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
556 if (*s != span_byte) {
562 /* Create a word filled with the bytes we are spanning */
563 span_word = PERL_COUNT_MULTIPLIER * span_byte;
565 /* Process per-word as long as we have at least a full word left */
568 /* Keep going if the whole word is composed of 'span_byte's */
569 if ((* (PERL_UINTMAX_T *) s) == span_word) {
574 /* Here, at least one byte in the word isn't 'span_byte'. */
582 /* This xor leaves 1 bits only in those non-matching bytes */
583 span_word ^= * (PERL_UINTMAX_T *) s;
585 /* Make sure the upper bit of each non-matching byte is set. This
586 * makes each such byte look like an ASCII platform variant byte */
587 span_word |= span_word << 1;
588 span_word |= span_word << 2;
589 span_word |= span_word << 4;
591 /* That reduces the problem to what this function solves */
592 return s + variant_byte_number(span_word);
596 } while (s + PERL_WORDSIZE <= send);
599 /* Process the straggler bytes beyond the final word boundary */
601 if (*s != span_byte) {
611 S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask)
613 /* Returns the position of the first byte in the sequence between 's'
614 * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte';
615 * returns 'send' if none found. It uses word-level operations instead of
616 * byte to speed up the process */
618 PERL_ARGS_ASSERT_FIND_NEXT_MASKED;
621 assert((byte & mask) == byte);
625 if ((STRLEN) (send - s) >= PERL_WORDSIZE
626 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
627 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
629 PERL_UINTMAX_T word, mask_word;
631 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
632 if (((*s) & mask) == byte) {
638 word = PERL_COUNT_MULTIPLIER * byte;
639 mask_word = PERL_COUNT_MULTIPLIER * mask;
642 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
644 /* If 'masked' contains bytes with the bit pattern of 'byte' within
645 * it, xoring with 'word' will leave each of the 8 bits in such
646 * bytes be 0, and no byte containing any other bit pattern will be
650 /* This causes the most significant bit to be set to 1 for any
651 * bytes in the word that aren't completely 0 */
652 masked |= masked << 1;
653 masked |= masked << 2;
654 masked |= masked << 4;
656 /* The msbits are the same as what marks a byte as variant, so we
657 * can use this mask. If all msbits are 1, the word doesn't
659 if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) {
664 /* Here, the msbit of bytes in the word that aren't 'byte' are 1,
665 * and any that are, are 0. Complement and re-AND to swap that */
667 masked &= PERL_VARIANTS_WORD_MASK;
669 /* This reduces the problem to that solved by this function */
670 s += variant_byte_number(masked);
673 } while (s + PERL_WORDSIZE <= send);
679 if (((*s) & mask) == byte) {
689 S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask)
691 /* Returns the position of the first byte in the sequence between 's' and
692 * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'.
693 * 'span_byte' should have been ANDed with 'mask' in the call of this
694 * function. Returns 'send' if none found. Works like find_span_end(),
695 * except for the AND */
697 PERL_ARGS_ASSERT_FIND_SPAN_END_MASK;
700 assert((span_byte & mask) == span_byte);
702 if ((STRLEN) (send - s) >= PERL_WORDSIZE
703 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
704 - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
706 PERL_UINTMAX_T span_word, mask_word;
708 while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
709 if (((*s) & mask) != span_byte) {
715 span_word = PERL_COUNT_MULTIPLIER * span_byte;
716 mask_word = PERL_COUNT_MULTIPLIER * mask;
719 PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word;
721 if (masked == span_word) {
733 masked |= masked << 1;
734 masked |= masked << 2;
735 masked |= masked << 4;
736 return s + variant_byte_number(masked);
740 } while (s + PERL_WORDSIZE <= send);
744 if (((*s) & mask) != span_byte) {
754 * pregexec and friends
757 #ifndef PERL_IN_XSUB_RE
759 - pregexec - match a regexp against a string
762 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
763 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
764 /* stringarg: the point in the string at which to begin matching */
765 /* strend: pointer to null at end of string */
766 /* strbeg: real beginning of string */
767 /* minend: end of match must be >= minend bytes after stringarg. */
768 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
769 * itself is accessed via the pointers above */
770 /* nosave: For optimizations. */
772 PERL_ARGS_ASSERT_PREGEXEC;
775 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
776 nosave ? 0 : REXEC_COPY_STR);
782 /* re_intuit_start():
784 * Based on some optimiser hints, try to find the earliest position in the
785 * string where the regex could match.
787 * rx: the regex to match against
788 * sv: the SV being matched: only used for utf8 flag; the string
789 * itself is accessed via the pointers below. Note that on
790 * something like an overloaded SV, SvPOK(sv) may be false
791 * and the string pointers may point to something unrelated to
793 * strbeg: real beginning of string
794 * strpos: the point in the string at which to begin matching
795 * strend: pointer to the byte following the last char of the string
796 * flags currently unused; set to 0
797 * data: currently unused; set to NULL
799 * The basic idea of re_intuit_start() is to use some known information
800 * about the pattern, namely:
802 * a) the longest known anchored substring (i.e. one that's at a
803 * constant offset from the beginning of the pattern; but not
804 * necessarily at a fixed offset from the beginning of the
806 * b) the longest floating substring (i.e. one that's not at a constant
807 * offset from the beginning of the pattern);
808 * c) Whether the pattern is anchored to the string; either
809 * an absolute anchor: /^../, or anchored to \n: /^.../m,
810 * or anchored to pos(): /\G/;
811 * d) A start class: a real or synthetic character class which
812 * represents which characters are legal at the start of the pattern;
814 * to either quickly reject the match, or to find the earliest position
815 * within the string at which the pattern might match, thus avoiding
816 * running the full NFA engine at those earlier locations, only to
817 * eventually fail and retry further along.
819 * Returns NULL if the pattern can't match, or returns the address within
820 * the string which is the earliest place the match could occur.
822 * The longest of the anchored and floating substrings is called 'check'
823 * and is checked first. The other is called 'other' and is checked
824 * second. The 'other' substring may not be present. For example,
826 * /(abc|xyz)ABC\d{0,3}DEFG/
830 * check substr (float) = "DEFG", offset 6..9 chars
831 * other substr (anchored) = "ABC", offset 3..3 chars
834 * Be aware that during the course of this function, sometimes 'anchored'
835 * refers to a substring being anchored relative to the start of the
836 * pattern, and sometimes to the pattern itself being anchored relative to
837 * the string. For example:
839 * /\dabc/: "abc" is anchored to the pattern;
840 * /^\dabc/: "abc" is anchored to the pattern and the string;
841 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
842 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
843 * but the pattern is anchored to the string.
847 Perl_re_intuit_start(pTHX_
850 const char * const strbeg,
854 re_scream_pos_data *data)
856 struct regexp *const prog = ReANY(rx);
857 SSize_t start_shift = prog->check_offset_min;
858 /* Should be nonnegative! */
859 SSize_t end_shift = 0;
860 /* current lowest pos in string where the regex can start matching */
861 char *rx_origin = strpos;
863 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
864 U8 other_ix = 1 - prog->substrs->check_ix;
866 char *other_last = strpos;/* latest pos 'other' substr already checked to */
867 char *check_at = NULL; /* check substr found at this pos */
868 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
869 RXi_GET_DECL(prog,progi);
870 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
871 regmatch_info *const reginfo = ®info_buf;
872 DECLARE_AND_GET_RE_DEBUG_FLAGS;
874 PERL_ARGS_ASSERT_RE_INTUIT_START;
875 PERL_UNUSED_ARG(flags);
876 PERL_UNUSED_ARG(data);
878 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
879 "Intuit: trying to determine minimum start position...\n"));
881 /* for now, assume that all substr offsets are positive. If at some point
882 * in the future someone wants to do clever things with lookbehind and
883 * -ve offsets, they'll need to fix up any code in this function
884 * which uses these offsets. See the thread beginning
885 * <20140113145929.GF27210@iabyn.com>
887 assert(prog->substrs->data[0].min_offset >= 0);
888 assert(prog->substrs->data[0].max_offset >= 0);
889 assert(prog->substrs->data[1].min_offset >= 0);
890 assert(prog->substrs->data[1].max_offset >= 0);
891 assert(prog->substrs->data[2].min_offset >= 0);
892 assert(prog->substrs->data[2].max_offset >= 0);
894 /* for now, assume that if both present, that the floating substring
895 * doesn't start before the anchored substring.
896 * If you break this assumption (e.g. doing better optimisations
897 * with lookahead/behind), then you'll need to audit the code in this
898 * function carefully first
901 ! ( (prog->anchored_utf8 || prog->anchored_substr)
902 && (prog->float_utf8 || prog->float_substr))
903 || (prog->float_min_offset >= prog->anchored_offset));
905 /* byte rather than char calculation for efficiency. It fails
906 * to quickly reject some cases that can't match, but will reject
907 * them later after doing full char arithmetic */
908 if (prog->minlen > strend - strpos) {
909 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
910 " String too short...\n"));
914 RXp_MATCH_UTF8_set(prog, utf8_target);
915 reginfo->is_utf8_target = cBOOL(utf8_target);
916 reginfo->info_aux = NULL;
917 reginfo->strbeg = strbeg;
918 reginfo->strend = strend;
919 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
921 /* not actually used within intuit, but zero for safety anyway */
922 reginfo->poscache_maxiter = 0;
925 if ((!prog->anchored_utf8 && prog->anchored_substr)
926 || (!prog->float_utf8 && prog->float_substr))
927 to_utf8_substr(prog);
928 check = prog->check_utf8;
930 if (!prog->check_substr && prog->check_utf8) {
931 if (! to_byte_substr(prog)) {
932 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
935 check = prog->check_substr;
938 /* dump the various substring data */
939 DEBUG_OPTIMISE_MORE_r({
941 for (i=0; i<=2; i++) {
942 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
943 : prog->substrs->data[i].substr);
947 Perl_re_printf( aTHX_
948 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
949 " useful=%" IVdf " utf8=%d [%s]\n",
951 (IV)prog->substrs->data[i].min_offset,
952 (IV)prog->substrs->data[i].max_offset,
953 (IV)prog->substrs->data[i].end_shift,
960 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
962 /* ml_anch: check after \n?
964 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
965 * with /.*.../, these flags will have been added by the
967 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
968 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
970 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
971 && !(prog->intflags & PREGf_IMPLICIT);
973 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
974 /* we are only allowed to match at BOS or \G */
976 /* trivially reject if there's a BOS anchor and we're not at BOS.
978 * Note that we don't try to do a similar quick reject for
979 * \G, since generally the caller will have calculated strpos
980 * based on pos() and gofs, so the string is already correctly
981 * anchored by definition; and handling the exceptions would
982 * be too fiddly (e.g. REXEC_IGNOREPOS).
984 if ( strpos != strbeg
985 && (prog->intflags & PREGf_ANCH_SBOL))
987 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
988 " Not at start...\n"));
992 /* in the presence of an anchor, the anchored (relative to the
993 * start of the regex) substr must also be anchored relative
994 * to strpos. So quickly reject if substr isn't found there.
995 * This works for \G too, because the caller will already have
996 * subtracted gofs from pos, and gofs is the offset from the
997 * \G to the start of the regex. For example, in /.abc\Gdef/,
998 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
999 * caller will have set strpos=pos()-4; we look for the substr
1000 * at position pos()-4+1, which lines up with the "a" */
1002 if (prog->check_offset_min == prog->check_offset_max) {
1003 /* Substring at constant offset from beg-of-str... */
1004 SSize_t slen = SvCUR(check);
1005 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1007 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1008 " Looking for check substr at fixed offset %" IVdf "...\n",
1009 (IV)prog->check_offset_min));
1011 if (SvTAIL(check)) {
1012 /* In this case, the regex is anchored at the end too.
1013 * Unless it's a multiline match, the lengths must match
1014 * exactly, give or take a \n. NB: slen >= 1 since
1015 * the last char of check is \n */
1017 && ( strend - s > slen
1018 || strend - s < slen - 1
1019 || (strend - s == slen && strend[-1] != '\n')))
1021 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1022 " String too long...\n"));
1025 /* Now should match s[0..slen-2] */
1028 if (slen && (strend - s < slen
1029 || *SvPVX_const(check) != *s
1030 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
1032 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1033 " String not equal...\n"));
1038 goto success_at_start;
1043 end_shift = prog->check_end_shift;
1045 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
1047 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
1048 (IV)end_shift, RX_PRECOMP(rx));
1053 /* This is the (re)entry point of the main loop in this function.
1054 * The goal of this loop is to:
1055 * 1) find the "check" substring in the region rx_origin..strend
1056 * (adjusted by start_shift / end_shift). If not found, reject
1058 * 2) If it exists, look for the "other" substr too if defined; for
1059 * example, if the check substr maps to the anchored substr, then
1060 * check the floating substr, and vice-versa. If not found, go
1061 * back to (1) with rx_origin suitably incremented.
1062 * 3) If we find an rx_origin position that doesn't contradict
1063 * either of the substrings, then check the possible additional
1064 * constraints on rx_origin of /^.../m or a known start class.
1065 * If these fail, then depending on which constraints fail, jump
1066 * back to here, or to various other re-entry points further along
1067 * that skip some of the first steps.
1068 * 4) If we pass all those tests, update the BmUSEFUL() count on the
1069 * substring. If the start position was determined to be at the
1070 * beginning of the string - so, not rejected, but not optimised,
1071 * since we have to run regmatch from position 0 - decrement the
1072 * BmUSEFUL() count. Otherwise increment it.
1076 /* first, look for the 'check' substring */
1082 DEBUG_OPTIMISE_MORE_r({
1083 Perl_re_printf( aTHX_
1084 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
1085 " Start shift: %" IVdf " End shift %" IVdf
1086 " Real end Shift: %" IVdf "\n",
1087 (IV)(rx_origin - strbeg),
1088 (IV)prog->check_offset_min,
1091 (IV)prog->check_end_shift);
1094 end_point = HOPBACK3(strend, end_shift, rx_origin);
1097 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
1102 /* If the regex is absolutely anchored to either the start of the
1103 * string (SBOL) or to pos() (ANCH_GPOS), then
1104 * check_offset_max represents an upper bound on the string where
1105 * the substr could start. For the ANCH_GPOS case, we assume that
1106 * the caller of intuit will have already set strpos to
1107 * pos()-gofs, so in this case strpos + offset_max will still be
1108 * an upper bound on the substr.
1111 && prog->intflags & PREGf_ANCH
1112 && prog->check_offset_max != SSize_t_MAX)
1114 SSize_t check_len = SvCUR(check) - !!SvTAIL(check);
1115 const char * const anchor =
1116 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
1117 SSize_t targ_len = (char*)end_point - anchor;
1119 if (check_len > targ_len) {
1120 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1121 "Target string too short to match required substring...\n"));
1125 /* do a bytes rather than chars comparison. It's conservative;
1126 * so it skips doing the HOP if the result can't possibly end
1127 * up earlier than the old value of end_point.
1129 assert(anchor + check_len <= (char *)end_point);
1130 if (prog->check_offset_max + check_len < targ_len) {
1131 end_point = HOP3lim((U8*)anchor,
1132 prog->check_offset_max,
1133 end_point - check_len
1136 if (end_point < start_point)
1141 check_at = fbm_instr( start_point, end_point,
1142 check, multiline ? FBMrf_MULTILINE : 0);
1144 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1145 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1146 (IV)((char*)start_point - strbeg),
1147 (IV)((char*)end_point - strbeg),
1148 (IV)(check_at ? check_at - strbeg : -1)
1151 /* Update the count-of-usability, remove useless subpatterns,
1155 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1156 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
1157 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
1158 (check_at ? "Found" : "Did not find"),
1159 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
1160 ? "anchored" : "floating"),
1163 (check_at ? " at offset " : "...\n") );
1168 /* set rx_origin to the minimum position where the regex could start
1169 * matching, given the constraint of the just-matched check substring.
1170 * But don't set it lower than previously.
1173 if (check_at - rx_origin > prog->check_offset_max)
1174 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
1175 /* Finish the diagnostic message */
1176 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1177 "%ld (rx_origin now %" IVdf ")...\n",
1178 (long)(check_at - strbeg),
1179 (IV)(rx_origin - strbeg)
1184 /* now look for the 'other' substring if defined */
1186 if (prog->substrs->data[other_ix].utf8_substr
1187 || prog->substrs->data[other_ix].substr)
1189 /* Take into account the "other" substring. */
1193 struct reg_substr_datum *other;
1196 other = &prog->substrs->data[other_ix];
1197 if (!utf8_target && !other->substr) {
1198 if (!to_byte_substr(prog)) {
1199 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
1203 /* if "other" is anchored:
1204 * we've previously found a floating substr starting at check_at.
1205 * This means that the regex origin must lie somewhere
1206 * between min (rx_origin): HOP3(check_at, -check_offset_max)
1207 * and max: HOP3(check_at, -check_offset_min)
1208 * (except that min will be >= strpos)
1209 * So the fixed substr must lie somewhere between
1210 * HOP3(min, anchored_offset)
1211 * HOP3(max, anchored_offset) + SvCUR(substr)
1214 /* if "other" is floating
1215 * Calculate last1, the absolute latest point where the
1216 * floating substr could start in the string, ignoring any
1217 * constraints from the earlier fixed match. It is calculated
1220 * strend - prog->minlen (in chars) is the absolute latest
1221 * position within the string where the origin of the regex
1222 * could appear. The latest start point for the floating
1223 * substr is float_min_offset(*) on from the start of the
1224 * regex. last1 simply combines thee two offsets.
1226 * (*) You might think the latest start point should be
1227 * float_max_offset from the regex origin, and technically
1228 * you'd be correct. However, consider
1230 * Here, float min, max are 3,5 and minlen is 7.
1231 * This can match either
1235 * In the first case, the regex matches minlen chars; in the
1236 * second, minlen+1, in the third, minlen+2.
1237 * In the first case, the floating offset is 3 (which equals
1238 * float_min), in the second, 4, and in the third, 5 (which
1239 * equals float_max). In all cases, the floating string bcd
1240 * can never start more than 4 chars from the end of the
1241 * string, which equals minlen - float_min. As the substring
1242 * starts to match more than float_min from the start of the
1243 * regex, it makes the regex match more than minlen chars,
1244 * and the two cancel each other out. So we can always use
1245 * float_min - minlen, rather than float_max - minlen for the
1246 * latest position in the string.
1248 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1249 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1252 assert(prog->minlen >= other->min_offset);
1253 last1 = HOP3c(strend,
1254 other->min_offset - prog->minlen, strbeg);
1256 if (other_ix) {/* i.e. if (other-is-float) */
1257 /* last is the latest point where the floating substr could
1258 * start, *given* any constraints from the earlier fixed
1259 * match. This constraint is that the floating string starts
1260 * <= float_max_offset chars from the regex origin (rx_origin).
1261 * If this value is less than last1, use it instead.
1263 assert(rx_origin <= last1);
1265 /* this condition handles the offset==infinity case, and
1266 * is a short-cut otherwise. Although it's comparing a
1267 * byte offset to a char length, it does so in a safe way,
1268 * since 1 char always occupies 1 or more bytes,
1269 * so if a string range is (last1 - rx_origin) bytes,
1270 * it will be less than or equal to (last1 - rx_origin)
1271 * chars; meaning it errs towards doing the accurate HOP3
1272 * rather than just using last1 as a short-cut */
1273 (last1 - rx_origin) < other->max_offset
1275 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1278 assert(strpos + start_shift <= check_at);
1279 last = HOP4c(check_at, other->min_offset - start_shift,
1283 s = HOP3c(rx_origin, other->min_offset, strend);
1284 if (s < other_last) /* These positions already checked */
1287 must = utf8_target ? other->utf8_substr : other->substr;
1288 assert(SvPOK(must));
1291 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1297 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1298 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
1299 (IV)(from - strbeg),
1305 (unsigned char*)from,
1308 multiline ? FBMrf_MULTILINE : 0
1310 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1311 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
1312 (IV)(from - strbeg),
1314 (IV)(s ? s - strbeg : -1)
1320 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1321 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1322 Perl_re_printf( aTHX_ " %s %s substr %s%s",
1323 s ? "Found" : "Contradicts",
1324 other_ix ? "floating" : "anchored",
1325 quoted, RE_SV_TAIL(must));
1330 /* last1 is latest possible substr location. If we didn't
1331 * find it before there, we never will */
1332 if (last >= last1) {
1333 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1334 "; giving up...\n"));
1338 /* try to find the check substr again at a later
1339 * position. Maybe next time we'll find the "other" substr
1341 other_last = HOP3c(last, 1, strend) /* highest failure */;
1343 other_ix /* i.e. if other-is-float */
1344 ? HOP3c(rx_origin, 1, strend)
1345 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1346 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1347 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
1348 (other_ix ? "floating" : "anchored"),
1349 (long)(HOP3c(check_at, 1, strend) - strbeg),
1350 (IV)(rx_origin - strbeg)
1355 if (other_ix) { /* if (other-is-float) */
1356 /* other_last is set to s, not s+1, since its possible for
1357 * a floating substr to fail first time, then succeed
1358 * second time at the same floating position; e.g.:
1359 * "-AB--AABZ" =~ /\wAB\d*Z/
1360 * The first time round, anchored and float match at
1361 * "-(AB)--AAB(Z)" then fail on the initial \w character
1362 * class. Second time round, they match at "-AB--A(AB)(Z)".
1367 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1368 other_last = HOP3c(s, 1, strend);
1370 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1371 " at offset %ld (rx_origin now %" IVdf ")...\n",
1373 (IV)(rx_origin - strbeg)
1379 DEBUG_OPTIMISE_MORE_r(
1380 Perl_re_printf( aTHX_
1381 " Check-only match: offset min:%" IVdf " max:%" IVdf
1382 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1383 " strend:%" IVdf "\n",
1384 (IV)prog->check_offset_min,
1385 (IV)prog->check_offset_max,
1386 (IV)(check_at-strbeg),
1387 (IV)(rx_origin-strbeg),
1388 (IV)(rx_origin-check_at),
1394 postprocess_substr_matches:
1396 /* handle the extra constraint of /^.../m if present */
1398 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1401 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1402 " looking for /^/m anchor"));
1404 /* we have failed the constraint of a \n before rx_origin.
1405 * Find the next \n, if any, even if it's beyond the current
1406 * anchored and/or floating substrings. Whether we should be
1407 * scanning ahead for the next \n or the next substr is debatable.
1408 * On the one hand you'd expect rare substrings to appear less
1409 * often than \n's. On the other hand, searching for \n means
1410 * we're effectively flipping between check_substr and "\n" on each
1411 * iteration as the current "rarest" candidate string, which
1412 * means for example that we'll quickly reject the whole string if
1413 * hasn't got a \n, rather than trying every substr position
1417 s = HOP3c(strend, - prog->minlen, strpos);
1418 if (s <= rx_origin ||
1419 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1421 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1422 " Did not find /%s^%s/m...\n",
1423 PL_colors[0], PL_colors[1]));
1427 /* earliest possible origin is 1 char after the \n.
1428 * (since *rx_origin == '\n', it's safe to ++ here rather than
1429 * HOP(rx_origin, 1)) */
1432 if (prog->substrs->check_ix == 0 /* check is anchored */
1433 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1435 /* Position contradicts check-string; either because
1436 * check was anchored (and thus has no wiggle room),
1437 * or check was float and rx_origin is above the float range */
1438 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1439 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1440 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1444 /* if we get here, the check substr must have been float,
1445 * is in range, and we may or may not have had an anchored
1446 * "other" substr which still contradicts */
1447 assert(prog->substrs->check_ix); /* check is float */
1449 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1450 /* whoops, the anchored "other" substr exists, so we still
1451 * contradict. On the other hand, the float "check" substr
1452 * didn't contradict, so just retry the anchored "other"
1454 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1455 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
1456 PL_colors[0], PL_colors[1],
1457 (IV)(rx_origin - strbeg + prog->anchored_offset),
1458 (IV)(rx_origin - strbeg)
1460 goto do_other_substr;
1463 /* success: we don't contradict the found floating substring
1464 * (and there's no anchored substr). */
1465 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1466 " Found /%s^%s/m with rx_origin %ld...\n",
1467 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1470 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1471 " (multiline anchor test skipped)\n"));
1477 /* if we have a starting character class, then test that extra constraint.
1478 * (trie stclasses are too expensive to use here, we are better off to
1479 * leave it to regmatch itself) */
1481 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1482 const U8* const str = (U8*)STRING(progi->regstclass);
1484 /* XXX this value could be pre-computed */
1485 const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1486 ? (reginfo->is_utf8_pat
1487 ? (SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str)
1488 : (SSize_t)STR_LEN(progi->regstclass))
1492 /* latest pos that a matching float substr constrains rx start to */
1493 char *rx_max_float = NULL;
1495 /* if the current rx_origin is anchored, either by satisfying an
1496 * anchored substring constraint, or a /^.../m constraint, then we
1497 * can reject the current origin if the start class isn't found
1498 * at the current position. If we have a float-only match, then
1499 * rx_origin is constrained to a range; so look for the start class
1500 * in that range. if neither, then look for the start class in the
1501 * whole rest of the string */
1503 /* XXX DAPM it's not clear what the minlen test is for, and why
1504 * it's not used in the floating case. Nothing in the test suite
1505 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1506 * Here are some old comments, which may or may not be correct:
1508 * minlen == 0 is possible if regstclass is \b or \B,
1509 * and the fixed substr is ''$.
1510 * Since minlen is already taken into account, rx_origin+1 is
1511 * before strend; accidentally, minlen >= 1 guaranties no false
1512 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1513 * 0) below assumes that regstclass does not come from lookahead...
1514 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1515 * This leaves EXACTF-ish only, which are dealt with in
1519 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1520 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
1521 else if (prog->float_substr || prog->float_utf8) {
1522 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1523 endpos = HOP3clim(rx_max_float, cl_l, strend);
1528 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1529 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1530 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1531 (IV)start_shift, (IV)(check_at - strbeg),
1532 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1534 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1537 if (endpos == strend) {
1538 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1539 " Could not match STCLASS...\n") );
1542 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1543 " This position contradicts STCLASS...\n") );
1544 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1545 && !(prog->intflags & PREGf_IMPLICIT))
1548 /* Contradict one of substrings */
1549 if (prog->anchored_substr || prog->anchored_utf8) {
1550 if (prog->substrs->check_ix == 1) { /* check is float */
1551 /* Have both, check_string is floating */
1552 assert(rx_origin + start_shift <= check_at);
1553 if (rx_origin + start_shift != check_at) {
1554 /* not at latest position float substr could match:
1555 * Recheck anchored substring, but not floating.
1556 * The condition above is in bytes rather than
1557 * chars for efficiency. It's conservative, in
1558 * that it errs on the side of doing 'goto
1559 * do_other_substr'. In this case, at worst,
1560 * an extra anchored search may get done, but in
1561 * practice the extra fbm_instr() is likely to
1562 * get skipped anyway. */
1563 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1564 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
1565 (long)(other_last - strbeg),
1566 (IV)(rx_origin - strbeg)
1568 goto do_other_substr;
1576 /* In the presence of ml_anch, we might be able to
1577 * find another \n without breaking the current float
1580 /* strictly speaking this should be HOP3c(..., 1, ...),
1581 * but since we goto a block of code that's going to
1582 * search for the next \n if any, its safe here */
1584 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1585 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1586 PL_colors[0], PL_colors[1],
1587 (long)(rx_origin - strbeg)) );
1588 goto postprocess_substr_matches;
1591 /* strictly speaking this can never be true; but might
1592 * be if we ever allow intuit without substrings */
1593 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1596 rx_origin = rx_max_float;
1599 /* at this point, any matching substrings have been
1600 * contradicted. Start again... */
1602 rx_origin = HOP3c(rx_origin, 1, strend);
1604 /* uses bytes rather than char calculations for efficiency.
1605 * It's conservative: it errs on the side of doing 'goto restart',
1606 * where there is code that does a proper char-based test */
1607 if (rx_origin + start_shift + end_shift > strend) {
1608 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1609 " Could not match STCLASS...\n") );
1612 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1613 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
1614 (prog->substrs->check_ix ? "floating" : "anchored"),
1615 (long)(rx_origin + start_shift - strbeg),
1616 (IV)(rx_origin - strbeg)
1623 if (rx_origin != s) {
1624 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1625 " By STCLASS: moving %ld --> %ld\n",
1626 (long)(rx_origin - strbeg), (long)(s - strbeg))
1630 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1631 " Does not contradict STCLASS...\n");
1636 /* Decide whether using the substrings helped */
1638 if (rx_origin != strpos) {
1639 /* Fixed substring is found far enough so that the match
1640 cannot start at strpos. */
1642 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
1643 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1646 /* The found rx_origin position does not prohibit matching at
1647 * strpos, so calling intuit didn't gain us anything. Decrement
1648 * the BmUSEFUL() count on the check substring, and if we reach
1650 if (!(prog->intflags & PREGf_NAUGHTY)
1652 prog->check_utf8 /* Could be deleted already */
1653 && --BmUSEFUL(prog->check_utf8) < 0
1654 && (prog->check_utf8 == prog->float_utf8)
1656 prog->check_substr /* Could be deleted already */
1657 && --BmUSEFUL(prog->check_substr) < 0
1658 && (prog->check_substr == prog->float_substr)
1661 /* If flags & SOMETHING - do not do it many times on the same match */
1662 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
1663 /* XXX Does the destruction order has to change with utf8_target? */
1664 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1665 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1666 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1667 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1668 check = NULL; /* abort */
1669 /* XXXX This is a remnant of the old implementation. It
1670 looks wasteful, since now INTUIT can use many
1671 other heuristics. */
1672 prog->extflags &= ~RXf_USE_INTUIT;
1676 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1677 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1678 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1682 fail_finish: /* Substring not found */
1683 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1684 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1686 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
1687 PL_colors[4], PL_colors[5]));
1692 #define DECL_TRIE_TYPE(scan) \
1693 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1694 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1695 trie_utf8l, trie_flu8, trie_flu8_latin } \
1696 trie_type = ((scan->flags == EXACT) \
1697 ? (utf8_target ? trie_utf8 : trie_plain) \
1698 : (scan->flags == EXACTL) \
1699 ? (utf8_target ? trie_utf8l : trie_plain) \
1700 : (scan->flags == EXACTFAA) \
1702 ? trie_utf8_exactfa_fold \
1703 : trie_latin_utf8_exactfa_fold) \
1704 : (scan->flags == EXACTFLU8 \
1707 : trie_flu8_latin) \
1710 : trie_latin_utf8_fold)))
1712 /* 'uscan' is set to foldbuf, and incremented, so below the end of uscan is
1713 * 'foldbuf+sizeof(foldbuf)' */
1714 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uc_end, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1717 U8 flags = FOLD_FLAGS_FULL; \
1718 switch (trie_type) { \
1720 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1721 if (UTF8_IS_ABOVE_LATIN1(*uc)) { \
1722 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1724 goto do_trie_utf8_fold; \
1725 case trie_utf8_exactfa_fold: \
1726 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1728 case trie_utf8_fold: \
1729 do_trie_utf8_fold: \
1730 if ( foldlen>0 ) { \
1731 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1736 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc_end, foldbuf, &foldlen, \
1738 len = UTF8_SAFE_SKIP(uc, uc_end); \
1739 skiplen = UVCHR_SKIP( uvc ); \
1740 foldlen -= skiplen; \
1741 uscan = foldbuf + skiplen; \
1744 case trie_flu8_latin: \
1745 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1746 goto do_trie_latin_utf8_fold; \
1747 case trie_latin_utf8_exactfa_fold: \
1748 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1750 case trie_latin_utf8_fold: \
1751 do_trie_latin_utf8_fold: \
1752 if ( foldlen>0 ) { \
1753 uvc = utf8n_to_uvchr( (const U8*) uscan, foldlen, &len, uniflags ); \
1759 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1760 skiplen = UVCHR_SKIP( uvc ); \
1761 foldlen -= skiplen; \
1762 uscan = foldbuf + skiplen; \
1766 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1767 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1768 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc_end); \
1772 uvc = utf8n_to_uvchr( (const U8*) uc, uc_end - uc, &len, uniflags ); \
1779 charid = trie->charmap[ uvc ]; \
1783 if (widecharmap) { \
1784 SV** const svpp = hv_fetch(widecharmap, \
1785 (char*)&uvc, sizeof(UV), 0); \
1787 charid = (U16)SvIV(*svpp); \
1792 #define DUMP_EXEC_POS(li,s,doutf8,depth) \
1793 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1794 startpos, doutf8, depth)
1796 #define REXEC_FBC_UTF8_SCAN(CODE) \
1798 while (s < strend) { \
1800 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1804 #define REXEC_FBC_NON_UTF8_SCAN(CODE) \
1806 while (s < strend) { \
1812 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1814 while (s < strend) { \
1815 REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1819 #define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \
1821 while (s < strend) { \
1822 REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1826 #define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \
1829 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1830 previous_occurrence_end = s; \
1836 #define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \
1840 previous_occurrence_end = s; \
1846 /* We keep track of where the next character should start after an occurrence
1847 * of the one we're looking for. Knowing that, we can see right away if the
1848 * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we
1849 * don't accept the 2nd and succeeding adjacent occurrences */
1850 #define FBC_CHECK_AND_TRY \
1852 || s != previous_occurrence_end) \
1853 && ( reginfo->intuit \
1854 || (s <= reginfo->strend && regtry(reginfo, &s)))) \
1860 /* These differ from the above macros in that they call a function which
1861 * returns the next occurrence of the thing being looked for in 's'; and
1862 * 'strend' if there is no such occurrence. */
1863 #define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \
1864 while (s < strend) { \
1866 if (s >= strend) { \
1872 previous_occurrence_end = s; \
1875 #define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \
1876 while (s < strend) { \
1878 if (s >= strend) { \
1884 previous_occurrence_end = s; \
1887 /* This differs from the above macros in that it is passed a single byte that
1888 * is known to begin the next occurrence of the thing being looked for in 's'.
1889 * It does a memchr to find the next occurrence of 'byte', before trying 'COND'
1890 * at that position. */
1891 #define REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(byte, COND) \
1892 while (s < strend) { \
1893 s = (char *) memchr(s, byte, strend -s); \
1895 s = (char *) strend; \
1901 s += UTF8_SAFE_SKIP(s, reginfo->strend); \
1902 previous_occurrence_end = s; \
1909 /* The four macros below are slightly different versions of the same logic.
1911 * The first is for /a and /aa when the target string is UTF-8. This can only
1912 * match ascii, but it must advance based on UTF-8. The other three handle
1913 * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are
1914 * looking for the boundary (or non-boundary) between a word and non-word
1915 * character. The utf8 and non-utf8 cases have the same logic, but the details
1916 * must be different. Find the "wordness" of the character just prior to this
1917 * one, and compare it with the wordness of this one. If they differ, we have
1918 * a boundary. At the beginning of the string, pretend that the previous
1919 * character was a new-line.
1921 * All these macros uncleanly have side-effects with each other and outside
1922 * variables. So far it's been too much trouble to clean-up
1924 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1925 * a word character or not.
1926 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1928 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1930 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1931 * are looking for a boundary or for a non-boundary. If we are looking for a
1932 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1933 * see if this tentative match actually works, and if so, to quit the loop
1934 * here. And vice-versa if we are looking for a non-boundary.
1936 * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and
1937 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1938 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1939 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1940 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1941 * complement. But in that branch we complement tmp, meaning that at the
1942 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1943 * which means at the top of the loop in the next iteration, it is
1944 * TEST_NON_UTF8(s-1) */
1945 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1946 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1947 tmp = TEST_NON_UTF8(tmp); \
1948 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1949 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1951 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1958 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1959 * TEST_UTF8 is a macro that for the same input code points returns identically
1960 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an
1961 * end pointer as well) */
1962 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1963 if (s == reginfo->strbeg) { \
1966 else { /* Back-up to the start of the previous character */ \
1967 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1968 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1969 0, UTF8_ALLOW_DEFAULT); \
1971 tmp = TEST_UV(tmp); \
1972 REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \
1973 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
1982 /* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the
1983 * complete code for handling UTF-8. Common to the BOUND and NBOUND cases,
1984 * set-up by the FBC_BOUND, etc macros below */
1985 #define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1987 /* Here, things have been set up by the previous code so that tmp is the \
1988 * return of TEST_NON_UTF8(s-1). We also have to check if this matches \
1989 * against the EOS, which we treat as a \n */ \
1990 if (tmp == ! TEST_NON_UTF8('\n')) { \
1997 /* Same as the macro above, but the target isn't UTF-8 */
1998 #define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1999 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
2000 tmp = TEST_NON_UTF8(tmp); \
2001 REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \
2002 if (tmp == ! TEST_NON_UTF8(UCHARAT(s))) { \
2010 /* Here, things have been set up by the previous code so that tmp is \
2011 * the return of TEST_NON_UTF8(s-1). We also have to check if this \
2012 * matches against the EOS, which we treat as a \n */ \
2013 if (tmp == ! TEST_NON_UTF8('\n')) { \
2020 /* This is the macro to use when we want to see if something that looks like it
2021 * could match, actually does, and if so exits the loop. It needs to be used
2022 * only for bounds checking macros, as it allows for matching beyond the end of
2023 * string (which should be zero length without having to look at the string
2025 #define REXEC_FBC_TRYIT \
2026 if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \
2029 /* The only difference between the BOUND and NBOUND cases is that
2030 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
2031 * NBOUND. This is accomplished by passing it as either the if or else clause,
2032 * with the other one being empty (PLACEHOLDER is defined as empty).
2034 * The TEST_FOO parameters are for operating on different forms of input, but
2035 * all should be ones that return identically for the same underlying code
2038 #define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2039 FBC_BOUND_COMMON_UTF8( \
2040 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
2041 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2043 #define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \
2044 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2046 #define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \
2047 FBC_BOUND_COMMON_UTF8( \
2048 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER),\
2049 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2051 #define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \
2052 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
2054 #define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
2055 FBC_BOUND_COMMON_UTF8( \
2056 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2057 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2059 #define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \
2060 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2062 #define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \
2063 FBC_BOUND_COMMON_UTF8( \
2064 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
2065 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2067 #define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \
2068 FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
2072 S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
2073 IV cp_out = _invlist_search(invlist, cp_in);
2074 assert(cp_out >= 0);
2077 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2078 invmap[S_get_break_val_cp_checked(invlist, cp)]
2080 # define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
2081 invmap[_invlist_search(invlist, cp)]
2084 /* Takes a pointer to an inversion list, a pointer to its corresponding
2085 * inversion map, and a code point, and returns the code point's value
2086 * according to the two arrays. It assumes that all code points have a value.
2087 * This is used as the base macro for macros for particular properties */
2088 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
2089 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
2091 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
2092 * of a code point, returning the value for the first code point in the string.
2093 * And it takes the particular macro name that finds the desired value given a
2094 * code point. Merely convert the UTF-8 to code point and call the cp macro */
2095 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
2096 (__ASSERT_(pos < strend) \
2097 /* Note assumes is valid UTF-8 */ \
2098 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
2100 /* Returns the GCB value for the input code point */
2101 #define getGCB_VAL_CP(cp) \
2102 _generic_GET_BREAK_VAL_CP( \
2107 /* Returns the GCB value for the first code point in the UTF-8 encoded string
2108 * bounded by pos and strend */
2109 #define getGCB_VAL_UTF8(pos, strend) \
2110 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
2112 /* Returns the LB value for the input code point */
2113 #define getLB_VAL_CP(cp) \
2114 _generic_GET_BREAK_VAL_CP( \
2119 /* Returns the LB value for the first code point in the UTF-8 encoded string
2120 * bounded by pos and strend */
2121 #define getLB_VAL_UTF8(pos, strend) \
2122 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
2125 /* Returns the SB value for the input code point */
2126 #define getSB_VAL_CP(cp) \
2127 _generic_GET_BREAK_VAL_CP( \
2132 /* Returns the SB value for the first code point in the UTF-8 encoded string
2133 * bounded by pos and strend */
2134 #define getSB_VAL_UTF8(pos, strend) \
2135 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
2137 /* Returns the WB value for the input code point */
2138 #define getWB_VAL_CP(cp) \
2139 _generic_GET_BREAK_VAL_CP( \
2144 /* Returns the WB value for the first code point in the UTF-8 encoded string
2145 * bounded by pos and strend */
2146 #define getWB_VAL_UTF8(pos, strend) \
2147 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
2149 /* We know what class REx starts with. Try to find this position... */
2150 /* if reginfo->intuit, its a dryrun */
2151 /* annoyingly all the vars in this routine have different names from their counterparts
2152 in regmatch. /grrr */
2154 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
2155 const char *strend, regmatch_info *reginfo)
2158 /* TRUE if x+ need not match at just the 1st pos of run of x's */
2159 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
2161 char *pat_string; /* The pattern's exactish string */
2162 char *pat_end; /* ptr to end char of pat_string */
2163 re_fold_t folder; /* Function for computing non-utf8 folds */
2164 const U8 *fold_array; /* array for folding ords < 256 */
2171 /* In some cases we accept only the first occurence of 'x' in a sequence of
2172 * them. This variable points to just beyond the end of the previous
2173 * occurrence of 'x', hence we can tell if we are in a sequence. (Having
2174 * it point to beyond the 'x' allows us to work for UTF-8 without having to
2176 char * previous_occurrence_end = 0;
2178 I32 tmp; /* Scratch variable */
2179 const bool utf8_target = reginfo->is_utf8_target;
2180 UV utf8_fold_flags = 0;
2181 const bool is_utf8_pat = reginfo->is_utf8_pat;
2182 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
2183 with a result inverts that result, as 0^1 =
2185 _char_class_number classnum;
2187 RXi_GET_DECL(prog,progi);
2189 PERL_ARGS_ASSERT_FIND_BYCLASS;
2191 /* We know what class it must start with. The case statements below have
2192 * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb'
2193 * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern
2194 * ('p8' and 'pb'. */
2195 switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) {
2197 case ANYOFPOSIXL_t8_pb:
2198 case ANYOFPOSIXL_t8_p8:
2201 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2202 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2210 REXEC_FBC_UTF8_CLASS_SCAN(
2211 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2214 case ANYOFPOSIXL_tb_pb:
2215 case ANYOFPOSIXL_tb_p8:
2218 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2219 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c);
2227 if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
2228 /* We know that s is in the bitmap range since the target isn't
2229 * UTF-8, so what happens for out-of-range values is not relevant,
2230 * so exclude that from the flags */
2231 REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1,
2235 REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
2239 case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */
2241 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2242 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2243 (U8) ARG(c), FLAGS(c)));
2248 /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But
2249 * we do anyway for performance reasons, as otherwise we would have to
2250 * examine all the continuation characters */
2251 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2252 (char *) find_next_masked((U8 *) s, (U8 *) strend,
2253 (U8) ARG(c), FLAGS(c)));
2258 REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(
2259 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2260 (U8) ARG(c), FLAGS(c)));
2264 case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8
2266 REXEC_FBC_UTF8_FIND_NEXT_SCAN(
2267 (char *) find_span_end_mask((U8 *) s, (U8 *) strend,
2268 (U8) ARG(c), FLAGS(c)));
2271 /* These nodes all require at least one code point to be in UTF-8 to
2281 case EXACTFLU8_tb_pb:
2282 case EXACTFLU8_tb_p8:
2283 case EXACTFU_REQ8_tb_pb:
2284 case EXACTFU_REQ8_tb_p8:
2289 REXEC_FBC_UTF8_CLASS_SCAN(
2290 ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2291 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2297 /* We know what the first byte of any matched string should be. */
2298 U8 first_byte = FLAGS(c);
2300 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2301 reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */));
2307 REXEC_FBC_UTF8_CLASS_SCAN(
2308 ( inRANGE(NATIVE_UTF8_TO_I8(*s),
2309 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)),
2310 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)))
2311 && reginclass(prog, c, (U8*)s, (U8*) strend,
2317 REXEC_FBC_UTF8_CLASS_SCAN(
2318 ( strend -s >= FLAGS(c)
2319 && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c))
2320 && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)));
2325 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2326 ANYOFRbase(c), ANYOFRdelta(c)));
2331 REXEC_FBC_UTF8_CLASS_SCAN(
2332 ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c)
2333 && withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2336 ANYOFRbase(c), ANYOFRdelta(c))));
2341 REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s,
2342 ANYOFRbase(c), ANYOFRdelta(c)));
2347 { /* We know what the first byte of any matched string should be */
2348 U8 first_byte = FLAGS(c);
2350 REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte,
2351 withinCOUNT(utf8_to_uvchr_buf((U8 *) s,
2354 ANYOFRbase(c), ANYOFRdelta(c)));
2358 case EXACTFAA_tb_pb:
2360 /* Latin1 folds are not affected by /a, except it excludes the sharp s,
2361 * which these functions don't handle anyway */
2362 fold_array = PL_fold_latin1;
2363 folder = foldEQ_latin1_s2_folded;
2364 goto do_exactf_non_utf8;
2367 fold_array = PL_fold;
2369 goto do_exactf_non_utf8;
2372 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2374 if (IN_UTF8_CTYPE_LOCALE) {
2375 utf8_fold_flags = FOLDEQ_LOCALE;
2376 goto do_exactf_utf8;
2379 fold_array = PL_fold_locale;
2380 folder = foldEQ_locale;
2381 goto do_exactf_non_utf8;
2384 /* Any 'ss' in the pattern should have been replaced by regcomp, so we
2385 * don't have to worry here about this single special case in the
2387 fold_array = PL_fold_latin1;
2388 folder = foldEQ_latin1_s2_folded;
2392 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
2393 are no glitches with fold-length differences
2394 between the target string and pattern */
2396 /* The idea in the non-utf8 EXACTF* cases is to first find the first
2397 * character of the EXACTF* node and then, if necessary,
2398 * case-insensitively compare the full text of the node. c1 is the
2399 * first character. c2 is its fold. This logic will not work for
2400 * Unicode semantics and the german sharp ss, which hence should not be
2401 * compiled into a node that gets here. */
2402 pat_string = STRINGs(c);
2403 ln = STR_LENs(c); /* length to match in octets/bytes */
2405 /* We know that we have to match at least 'ln' bytes (which is the same
2406 * as characters, since not utf8). If we have to match 3 characters,
2407 * and there are only 2 availabe, we know without trying that it will
2408 * fail; so don't start a match past the required minimum number from
2410 e = HOP3c(strend, -((SSize_t)ln), s);
2415 c2 = fold_array[c1];
2416 if (c1 == c2) { /* If char and fold are the same */
2418 s = (char *) memchr(s, c1, e + 1 - s);
2423 /* Check that the rest of the node matches */
2424 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2425 && (reginfo->intuit || regtry(reginfo, &s)) )
2433 U8 bits_differing = c1 ^ c2;
2435 /* If the folds differ in one bit position only, we can mask to
2436 * match either of them, and can use this faster find method. Both
2437 * ASCII and EBCDIC tend to have their case folds differ in only
2438 * one position, so this is very likely */
2439 if (LIKELY(PL_bitcount[bits_differing] == 1)) {
2440 bits_differing = ~ bits_differing;
2442 s = (char *) find_next_masked((U8 *) s, (U8 *) e + 1,
2443 (c1 & bits_differing), bits_differing);
2448 if ( (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2449 && (reginfo->intuit || regtry(reginfo, &s)) )
2456 else { /* Otherwise, stuck with looking byte-at-a-time. This
2457 should actually happen only in EXACTFL nodes */
2459 if ( (*(U8*)s == c1 || *(U8*)s == c2)
2460 && (ln == 1 || folder(s + 1, pat_string + 1, ln - 1))
2461 && (reginfo->intuit || regtry(reginfo, &s)) )
2471 case EXACTFAA_tb_p8:
2472 case EXACTFAA_t8_p8:
2473 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII
2474 |FOLDEQ_S2_ALREADY_FOLDED
2475 |FOLDEQ_S2_FOLDS_SANE;
2476 goto do_exactf_utf8;
2478 case EXACTFAA_NO_TRIE_tb_pb:
2479 case EXACTFAA_NO_TRIE_t8_pb:
2480 case EXACTFAA_t8_pb:
2482 /* Here, and elsewhere in this file, the reason we can't consider a
2483 * non-UTF-8 pattern already folded in the presence of a UTF-8 target
2484 * is because any MICRO SIGN in the pattern won't be folded. Since the
2485 * fold of the MICRO SIGN requires UTF-8 to represent, we can consider
2486 * a non-UTF-8 pattern folded when matching a non-UTF-8 target */
2487 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2488 goto do_exactf_utf8;
2493 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2494 utf8_fold_flags = FOLDEQ_LOCALE;
2495 goto do_exactf_utf8;
2497 case EXACTFLU8_t8_pb:
2498 case EXACTFLU8_t8_p8:
2499 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
2500 | FOLDEQ_S2_FOLDS_SANE;
2501 goto do_exactf_utf8;
2503 case EXACTFU_REQ8_t8_p8:
2504 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2505 goto do_exactf_utf8;
2510 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
2511 goto do_exactf_utf8;
2513 /* The following are problematic even though pattern isn't UTF-8. Use
2514 * full functionality normally not done except for UTF-8. */
2516 case EXACTFUP_tb_pb:
2517 case EXACTFUP_t8_pb:
2523 /* If one of the operands is in utf8, we can't use the simpler
2524 * folding above, due to the fact that many different characters
2525 * can have the same fold, or portion of a fold, or different-
2527 pat_string = STRINGs(c);
2528 ln = STR_LENs(c); /* length to match in octets/bytes */
2529 pat_end = pat_string + ln;
2530 lnc = is_utf8_pat /* length to match in characters */
2531 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2534 /* We have 'lnc' characters to match in the pattern, but because of
2535 * multi-character folding, each character in the target can match
2536 * up to 3 characters (Unicode guarantees it will never exceed
2537 * this) if it is utf8-encoded; and up to 2 if not (based on the
2538 * fact that the Latin 1 folds are already determined, and the only
2539 * multi-char fold in that range is the sharp-s folding to 'ss'.
2540 * Thus, a pattern character can match as little as 1/3 of a string
2541 * character. Adjust lnc accordingly, rounding up, so that if we
2542 * need to match at least 4+1/3 chars, that really is 5. */
2543 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2544 lnc = (lnc + expansion - 1) / expansion;
2546 /* As in the non-UTF8 case, if we have to match 3 characters, and
2547 * only 2 are left, it's guaranteed to fail, so don't start a match
2548 * that would require us to go beyond the end of the string */
2549 e = HOP3c(strend, -((SSize_t)lnc), s);
2551 /* XXX Note that we could recalculate e to stop the loop earlier,
2552 * as the worst case expansion above will rarely be met, and as we
2553 * go along we would usually find that e moves further to the left.
2554 * This would happen only after we reached the point in the loop
2555 * where if there were no expansion we should fail. Unclear if
2556 * worth the expense */
2559 char *my_strend= (char *)strend;
2560 if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
2561 pat_string, NULL, ln, is_utf8_pat,
2563 && (reginfo->intuit || regtry(reginfo, &s)) )
2567 s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1;
2574 case BOUND_tb_pb: /* /d without utf8 target is /a */
2576 /* regcomp.c makes sure that these only have the traditional \b
2578 assert(FLAGS(c) == TRADITIONAL_BOUND);
2580 FBC_BOUND_A_NON_UTF8(isWORDCHAR_A);
2583 case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2585 /* regcomp.c makes sure that these only have the traditional \b
2587 assert(FLAGS(c) == TRADITIONAL_BOUND);
2589 FBC_BOUND_A_UTF8(isWORDCHAR_A);
2594 case NBOUND_tb_pb: /* /d without utf8 target is /a */
2596 /* regcomp.c makes sure that these only have the traditional \b
2598 assert(FLAGS(c) == TRADITIONAL_BOUND);
2600 FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A);
2603 case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */
2605 /* regcomp.c makes sure that these only have the traditional \b
2607 assert(FLAGS(c) == TRADITIONAL_BOUND);
2609 FBC_NBOUND_A_UTF8(isWORDCHAR_A);
2614 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2615 FBC_NBOUND_NON_UTF8(isWORDCHAR_L1);
2620 goto do_boundu_non_utf8;
2624 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2625 if (FLAGS(c) == TRADITIONAL_BOUND) {
2626 FBC_NBOUND_NON_UTF8(isWORDCHAR_LC);
2630 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2633 goto do_boundu_non_utf8;
2637 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2638 if (FLAGS(c) == TRADITIONAL_BOUND) {
2639 FBC_BOUND_NON_UTF8(isWORDCHAR_LC);
2643 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2645 goto do_boundu_non_utf8;
2649 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2650 FBC_BOUND_NON_UTF8(isWORDCHAR_L1);
2655 if (s == reginfo->strbeg) {
2656 if (reginfo->intuit || regtry(reginfo, &s))
2661 /* Didn't match. Try at the next position (if there is one) */
2663 if (UNLIKELY(s >= reginfo->strend)) {
2668 switch((bound_type) FLAGS(c)) {
2669 case TRADITIONAL_BOUND: /* Should have already been handled */
2674 /* Not utf8. Everything is a GCB except between CR and LF */
2675 while (s < strend) {
2676 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2677 || UCHARAT(s) != '\n'))
2678 && (reginfo->intuit || regtry(reginfo, &s)))
2689 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2690 while (s < strend) {
2691 LB_enum after = getLB_VAL_CP((U8) *s);
2692 if (to_complement ^ isLB(before,
2694 (U8*) reginfo->strbeg,
2696 (U8*) reginfo->strend,
2697 0 /* target not utf8 */ )
2698 && (reginfo->intuit || regtry(reginfo, &s)))
2711 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2712 while (s < strend) {
2713 SB_enum after = getSB_VAL_CP((U8) *s);
2714 if ((to_complement ^ isSB(before,
2716 (U8*) reginfo->strbeg,
2718 (U8*) reginfo->strend,
2719 0 /* target not utf8 */ ))
2720 && (reginfo->intuit || regtry(reginfo, &s)))
2733 WB_enum previous = WB_UNKNOWN;
2734 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2735 while (s < strend) {
2736 WB_enum after = getWB_VAL_CP((U8) *s);
2737 if ((to_complement ^ isWB(previous,
2740 (U8*) reginfo->strbeg,
2742 (U8*) reginfo->strend,
2743 0 /* target not utf8 */ ))
2744 && (reginfo->intuit || regtry(reginfo, &s)))
2755 /* Here are at the final position in the target string, which is a
2756 * boundary by definition, so matches, depending on other constraints.
2758 if ( reginfo->intuit
2759 || (s <= reginfo->strend && regtry(reginfo, &s)))
2768 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2769 if (FLAGS(c) == TRADITIONAL_BOUND) {
2770 FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2771 isWORDCHAR_LC_utf8_safe);
2775 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2778 goto do_boundu_utf8;
2782 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2783 if (FLAGS(c) == TRADITIONAL_BOUND) {
2784 FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr,
2785 isWORDCHAR_LC_utf8_safe);
2789 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
2792 goto do_boundu_utf8;
2796 /* regcomp.c makes sure that these only have the traditional \b
2798 assert(FLAGS(c) == TRADITIONAL_BOUND);
2804 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2805 FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni,
2806 isWORDCHAR_utf8_safe);
2811 goto do_boundu_utf8;
2815 /* regcomp.c makes sure that these only have the traditional \b
2817 assert(FLAGS(c) == TRADITIONAL_BOUND);
2823 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2824 FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
2829 if (s == reginfo->strbeg) {
2830 if (reginfo->intuit || regtry(reginfo, &s))
2835 /* Didn't match. Try at the next position (if there is one) */
2836 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2837 if (UNLIKELY(s >= reginfo->strend)) {
2842 switch((bound_type) FLAGS(c)) {
2843 case TRADITIONAL_BOUND: /* Should have already been handled */
2849 GCB_enum before = getGCB_VAL_UTF8(
2851 (U8*)(reginfo->strbeg)),
2852 (U8*) reginfo->strend);
2853 while (s < strend) {
2854 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2855 (U8*) reginfo->strend);
2856 if ( (to_complement ^ isGCB(before,
2858 (U8*) reginfo->strbeg,
2860 1 /* target is utf8 */ ))
2861 && (reginfo->intuit || regtry(reginfo, &s)))
2866 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2873 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2875 (U8*)(reginfo->strbeg)),
2876 (U8*) reginfo->strend);
2877 while (s < strend) {
2878 LB_enum after = getLB_VAL_UTF8((U8*) s,
2879 (U8*) reginfo->strend);
2880 if (to_complement ^ isLB(before,
2882 (U8*) reginfo->strbeg,
2884 (U8*) reginfo->strend,
2885 1 /* target is utf8 */ )
2886 && (reginfo->intuit || regtry(reginfo, &s)))
2891 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2899 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2901 (U8*)(reginfo->strbeg)),
2902 (U8*) reginfo->strend);
2903 while (s < strend) {
2904 SB_enum after = getSB_VAL_UTF8((U8*) s,
2905 (U8*) reginfo->strend);
2906 if ((to_complement ^ isSB(before,
2908 (U8*) reginfo->strbeg,
2910 (U8*) reginfo->strend,
2911 1 /* target is utf8 */ ))
2912 && (reginfo->intuit || regtry(reginfo, &s)))
2917 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2925 /* We are at a boundary between char_sub_0 and char_sub_1.
2926 * We also keep track of the value for char_sub_-1 as we
2927 * loop through the line. Context may be needed to make a
2928 * determination, and if so, this can save having to
2930 WB_enum previous = WB_UNKNOWN;
2931 WB_enum before = getWB_VAL_UTF8(
2934 (U8*)(reginfo->strbeg)),
2935 (U8*) reginfo->strend);
2936 while (s < strend) {
2937 WB_enum after = getWB_VAL_UTF8((U8*) s,
2938 (U8*) reginfo->strend);
2939 if ((to_complement ^ isWB(previous,
2942 (U8*) reginfo->strbeg,
2944 (U8*) reginfo->strend,
2945 1 /* target is utf8 */ ))
2946 && (reginfo->intuit || regtry(reginfo, &s)))
2952 s += UTF8_SAFE_SKIP(s, reginfo->strend);
2957 /* Here are at the final position in the target string, which is a
2958 * boundary by definition, so matches, depending on other constraints.
2961 if ( reginfo->intuit
2962 || (s <= reginfo->strend && regtry(reginfo, &s)))
2970 REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend));
2975 REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend));
2978 /* The argument to all the POSIX node types is the class number to pass
2979 * to _generic_isCC() to build a mask for searching in PL_charclass[] */
2988 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2989 REXEC_FBC_UTF8_CLASS_SCAN(
2990 to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s,
3001 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3002 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3003 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
3008 /* The complement of something that matches only ASCII matches all
3009 * non-ASCII, plus everything in ASCII that isn't in the class. */
3010 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3011 || ! _generic_isCC_A(*s, FLAGS(c)));
3016 /* Don't need to worry about utf8, as it can match only a single
3017 * byte invariant character. But we do anyway for performance reasons,
3018 * as otherwise we would have to examine all the continuation
3020 REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
3034 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3035 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
3045 REXEC_FBC_NON_UTF8_CLASS_SCAN(
3046 to_complement ^ cBOOL(_generic_isCC(*s,
3061 classnum = (_char_class_number) FLAGS(c);
3064 REXEC_FBC_UTF8_CLASS_SCAN(
3065 to_complement ^ cBOOL(_invlist_contains_cp(
3066 PL_XPosix_ptrs[classnum],
3067 utf8_to_uvchr_buf((U8 *) s,
3072 case _CC_ENUM_SPACE:
3073 REXEC_FBC_UTF8_CLASS_SCAN(
3074 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3077 case _CC_ENUM_BLANK:
3078 REXEC_FBC_UTF8_CLASS_SCAN(
3079 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3082 case _CC_ENUM_XDIGIT:
3083 REXEC_FBC_UTF8_CLASS_SCAN(
3084 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3087 case _CC_ENUM_VERTSPACE:
3088 REXEC_FBC_UTF8_CLASS_SCAN(
3089 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3092 case _CC_ENUM_CNTRL:
3093 REXEC_FBC_UTF8_CLASS_SCAN(
3094 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3099 case AHOCORASICKC_tb_pb:
3100 case AHOCORASICKC_tb_p8:
3101 case AHOCORASICKC_t8_pb:
3102 case AHOCORASICKC_t8_p8:
3103 case AHOCORASICK_tb_pb:
3104 case AHOCORASICK_tb_p8:
3105 case AHOCORASICK_t8_pb:
3106 case AHOCORASICK_t8_p8:
3109 /* what trie are we using right now */
3110 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
3111 reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie];
3112 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
3114 const char *last_start = strend - trie->minlen;
3116 const char *real_start = s;
3118 STRLEN maxlen = trie->maxlen;
3120 U8 **points; /* map of where we were in the input string
3121 when reading a given char. For ASCII this
3122 is unnecessary overhead as the relationship
3123 is always 1:1, but for Unicode, especially
3124 case folded Unicode this is not true. */
3125 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3129 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3131 /* We can't just allocate points here. We need to wrap it in
3132 * an SV so it gets freed properly if there is a croak while
3133 * running the match */
3136 sv_points=newSV(maxlen * sizeof(U8 *));
3137 SvCUR_set(sv_points,
3138 maxlen * sizeof(U8 *));
3139 SvPOK_on(sv_points);
3140 sv_2mortal(sv_points);
3141 points=(U8**)SvPV_nolen(sv_points );
3142 if ( trie_type != trie_utf8_fold
3143 && (trie->bitmap || OP(c)==AHOCORASICKC) )
3146 bitmap=(U8*)trie->bitmap;
3148 bitmap=(U8*)ANYOF_BITMAP(c);
3150 /* this is the Aho-Corasick algorithm modified a touch
3151 to include special handling for long "unknown char" sequences.
3152 The basic idea being that we use AC as long as we are dealing
3153 with a possible matching char, when we encounter an unknown char
3154 (and we have not encountered an accepting state) we scan forward
3155 until we find a legal starting char.
3156 AC matching is basically that of trie matching, except that when
3157 we encounter a failing transition, we fall back to the current
3158 states "fail state", and try the current char again, a process
3159 we repeat until we reach the root state, state 1, or a legal
3160 transition. If we fail on the root state then we can either
3161 terminate if we have reached an accepting state previously, or
3162 restart the entire process from the beginning if we have not.
3165 while (s <= last_start) {
3166 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3174 U8 *uscan = (U8*)NULL;
3175 U8 *leftmost = NULL;
3177 U32 accepted_word= 0;
3181 while ( state && uc <= (U8*)strend ) {
3183 U32 word = aho->states[ state ].wordnum;
3187 DEBUG_TRIE_EXECUTE_r(
3188 if ( uc <= (U8*)last_start
3189 && !BITMAP_TEST(bitmap,*uc) )
3191 dump_exec_pos( (char *)uc, c, strend,
3193 (char *)uc, utf8_target, 0 );
3194 Perl_re_printf( aTHX_
3195 " Scanning for legal start char...\n");
3199 while ( uc <= (U8*)last_start
3200 && !BITMAP_TEST(bitmap,*uc) )
3205 while ( uc <= (U8*)last_start
3206 && ! BITMAP_TEST(bitmap,*uc) )
3213 if (uc >(U8*)last_start) break;
3217 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len)
3219 if (!leftmost || lpos < leftmost) {
3220 DEBUG_r(accepted_word=word);
3226 points[pointpos++ % maxlen]= uc;
3227 if (foldlen || uc < (U8*)strend) {
3228 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3229 (U8 *) strend, uscan, len, uvc,
3230 charid, foldlen, foldbuf,
3232 DEBUG_TRIE_EXECUTE_r({
3233 dump_exec_pos( (char *)uc, c, strend,
3234 real_start, s, utf8_target, 0);
3235 Perl_re_printf( aTHX_
3236 " Charid:%3u CP:%4" UVxf " ",
3248 word = aho->states[ state ].wordnum;
3250 base = aho->states[ state ].trans.base;
3252 DEBUG_TRIE_EXECUTE_r({
3254 dump_exec_pos((char *)uc, c, strend, real_start,
3255 s, utf8_target, 0 );
3256 Perl_re_printf( aTHX_
3257 "%sState: %4" UVxf ", word=%" UVxf,
3258 failed ? " Fail transition to " : "",
3259 (UV)state, (UV)word);
3265 ( ((offset = base + charid
3266 - 1 - trie->uniquecharcount)) >= 0)
3267 && ((U32)offset < trie->lasttrans)
3268 && trie->trans[offset].check == state
3269 && (tmp=trie->trans[offset].next))
3271 DEBUG_TRIE_EXECUTE_r(
3272 Perl_re_printf( aTHX_ " - legal\n"));
3277 DEBUG_TRIE_EXECUTE_r(
3278 Perl_re_printf( aTHX_ " - fail\n"));
3280 state = aho->fail[state];
3284 /* we must be accepting here */
3285 DEBUG_TRIE_EXECUTE_r(
3286 Perl_re_printf( aTHX_ " - accepting\n"));
3295 if (!state) state = 1;
3298 if ( aho->states[ state ].wordnum ) {
3299 U8 *lpos = points[ (pointpos
3300 - trie->wordinfo[aho->states[ state ]
3301 .wordnum].len) % maxlen ];
3302 if (!leftmost || lpos < leftmost) {
3303 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
3308 s = (char*)leftmost;
3309 DEBUG_TRIE_EXECUTE_r({
3310 Perl_re_printf( aTHX_ "Matches word #%" UVxf
3311 " at position %" IVdf ". Trying full"
3313 (UV)accepted_word, (IV)(s - real_start)
3316 if (reginfo->intuit || regtry(reginfo, &s)) {
3321 if (s < reginfo->strend) {
3324 DEBUG_TRIE_EXECUTE_r({
3325 Perl_re_printf( aTHX_
3326 "Pattern failed. Looking for new start"
3330 DEBUG_TRIE_EXECUTE_r(
3331 Perl_re_printf( aTHX_ "No match.\n"));
3340 case EXACTFU_REQ8_t8_pb:
3341 case EXACTFUP_tb_p8:
3342 case EXACTFUP_t8_p8:
3344 case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */
3345 case EXACTFAA_NO_TRIE_tb_p8:
3346 case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8
3351 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
3352 } /* End of switch on node type */
3360 /* set RX_SAVED_COPY, RX_SUBBEG etc.
3361 * flags have same meanings as with regexec_flags() */
3364 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
3371 struct regexp *const prog = ReANY(rx);
3373 if (flags & REXEC_COPY_STR) {
3376 DEBUG_C(Perl_re_printf( aTHX_
3377 "Copy on write: regexp capture, type %d\n",
3379 /* Create a new COW SV to share the match string and store
3380 * in saved_copy, unless the current COW SV in saved_copy
3381 * is valid and suitable for our purpose */
3382 if (( prog->saved_copy
3383 && SvIsCOW(prog->saved_copy)
3384 && SvPOKp(prog->saved_copy)
3387 && SvPVX(sv) == SvPVX(prog->saved_copy)))
3389 /* just reuse saved_copy SV */
3390 if (RXp_MATCH_COPIED(prog)) {
3391 Safefree(prog->subbeg);
3392 RXp_MATCH_COPIED_off(prog);
3396 /* create new COW SV to share string */
3397 RXp_MATCH_COPY_FREE(prog);
3398 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
3400 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
3401 assert (SvPOKp(prog->saved_copy));
3402 prog->sublen = strend - strbeg;
3403 prog->suboffset = 0;
3404 prog->subcoffset = 0;
3409 SSize_t max = strend - strbeg;
3412 if ( (flags & REXEC_COPY_SKIP_POST)
3413 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3414 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
3415 ) { /* don't copy $' part of string */
3418 /* calculate the right-most part of the string covered
3419 * by a capture. Due to lookahead, this may be to
3420 * the right of $&, so we have to scan all captures */
3421 while (n <= prog->lastparen) {
3422 if (prog->offs[n].end > max)
3423 max = prog->offs[n].end;
3427 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
3428 ? prog->offs[0].start
3430 assert(max >= 0 && max <= strend - strbeg);
3433 if ( (flags & REXEC_COPY_SKIP_PRE)
3434 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
3435 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
3436 ) { /* don't copy $` part of string */
3439 /* calculate the left-most part of the string covered
3440 * by a capture. Due to lookbehind, this may be to
3441 * the left of $&, so we have to scan all captures */
3442 while (min && n <= prog->lastparen) {
3443 if ( prog->offs[n].start != -1
3444 && prog->offs[n].start < min)
3446 min = prog->offs[n].start;
3450 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
3451 && min > prog->offs[0].end
3453 min = prog->offs[0].end;
3457 assert(min >= 0 && min <= max && min <= strend - strbeg);
3460 if (RXp_MATCH_COPIED(prog)) {
3461 if (sublen > prog->sublen)
3463 (char*)saferealloc(prog->subbeg, sublen+1);
3466 prog->subbeg = (char*)safemalloc(sublen+1);
3467 Copy(strbeg + min, prog->subbeg, sublen, char);
3468 prog->subbeg[sublen] = '\0';
3469 prog->suboffset = min;
3470 prog->sublen = sublen;
3471 RXp_MATCH_COPIED_on(prog);
3473 prog->subcoffset = prog->suboffset;
3474 if (prog->suboffset && utf8_target) {
3475 /* Convert byte offset to chars.
3476 * XXX ideally should only compute this if @-/@+
3477 * has been seen, a la PL_sawampersand ??? */
3479 /* If there's a direct correspondence between the
3480 * string which we're matching and the original SV,
3481 * then we can use the utf8 len cache associated with
3482 * the SV. In particular, it means that under //g,
3483 * sv_pos_b2u() will use the previously cached
3484 * position to speed up working out the new length of
3485 * subcoffset, rather than counting from the start of
3486 * the string each time. This stops
3487 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
3488 * from going quadratic */
3489 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
3490 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
3491 SV_GMAGIC|SV_CONST_RETURN);
3493 prog->subcoffset = utf8_length((U8*)strbeg,
3494 (U8*)(strbeg+prog->suboffset));
3498 RXp_MATCH_COPY_FREE(prog);
3499 prog->subbeg = strbeg;
3500 prog->suboffset = 0;
3501 prog->subcoffset = 0;
3502 prog->sublen = strend - strbeg;
3510 - regexec_flags - match a regexp against a string
3513 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
3514 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
3515 /* stringarg: the point in the string at which to begin matching */
3516 /* strend: pointer to null at end of string */
3517 /* strbeg: real beginning of string */
3518 /* minend: end of match must be >= minend bytes after stringarg. */
3519 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
3520 * itself is accessed via the pointers above */
3521 /* data: May be used for some additional optimizations.
3522 Currently unused. */
3523 /* flags: For optimizations. See REXEC_* in regexp.h */
3526 struct regexp *const prog = ReANY(rx);
3530 SSize_t minlen; /* must match at least this many chars */
3531 SSize_t dontbother = 0; /* how many characters not to try at end */
3532 const bool utf8_target = cBOOL(DO_UTF8(sv));
3534 RXi_GET_DECL(prog,progi);
3535 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
3536 regmatch_info *const reginfo = ®info_buf;
3537 regexp_paren_pair *swap = NULL;
3539 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3541 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
3542 PERL_UNUSED_ARG(data);
3544 /* Be paranoid... */
3546 Perl_croak(aTHX_ "NULL regexp parameter");
3550 debug_start_match(rx, utf8_target, stringarg, strend,
3554 startpos = stringarg;
3556 /* set these early as they may be used by the HOP macros below */
3557 reginfo->strbeg = strbeg;
3558 reginfo->strend = strend;
3559 reginfo->is_utf8_target = cBOOL(utf8_target);
3561 if (prog->intflags & PREGf_GPOS_SEEN) {
3564 /* set reginfo->ganch, the position where \G can match */
3567 (flags & REXEC_IGNOREPOS)
3568 ? stringarg /* use start pos rather than pos() */
3569 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
3570 /* Defined pos(): */
3571 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
3572 : strbeg; /* pos() not defined; use start of string */
3574 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3575 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
3577 /* in the presence of \G, we may need to start looking earlier in
3578 * the string than the suggested start point of stringarg:
3579 * if prog->gofs is set, then that's a known, fixed minimum
3582 * /ab|c\G/: gofs = 1
3583 * or if the minimum offset isn't known, then we have to go back
3584 * to the start of the string, e.g. /w+\G/
3587 if (prog->intflags & PREGf_ANCH_GPOS) {
3589 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
3591 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
3593 DEBUG_GPOS_r(Perl_re_printf( aTHX_
3594 "fail: ganch-gofs before earliest possible start\n"));
3599 startpos = reginfo->ganch;
3601 else if (prog->gofs) {
3602 startpos = HOPBACKc(startpos, prog->gofs);
3606 else if (prog->intflags & PREGf_GPOS_FLOAT)
3610 minlen = prog->minlen;
3611 if ((startpos + minlen) > strend || startpos < strbeg) {
3612 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3613 "Regex match can't succeed, so not even tried\n"));
3617 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
3618 * which will call destuctors to reset PL_regmatch_state, free higher
3619 * PL_regmatch_slabs, and clean up regmatch_info_aux and
3620 * regmatch_info_aux_eval */
3622 oldsave = PL_savestack_ix;
3626 if ((prog->extflags & RXf_USE_INTUIT)
3627 && !(flags & REXEC_CHECKED))
3629 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
3634 if (prog->extflags & RXf_CHECK_ALL) {
3635 /* we can match based purely on the result of INTUIT.
3636 * Set up captures etc just for $& and $-[0]
3637 * (an intuit-only match wont have $1,$2,..) */
3638 assert(!prog->nparens);
3640 /* s/// doesn't like it if $& is earlier than where we asked it to
3641 * start searching (which can happen on something like /.\G/) */
3642 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3645 /* this should only be possible under \G */
3646 assert(prog->intflags & PREGf_GPOS_SEEN);
3647 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3648 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3652 /* match via INTUIT shouldn't have any captures.
3653 * Let @-, @+, $^N know */
3654 prog->lastparen = prog->lastcloseparen = 0;
3655 RXp_MATCH_UTF8_set(prog, utf8_target);
3656 prog->offs[0].start = s - strbeg;
3657 prog->offs[0].end = utf8_target
3658 ? (char*)utf8_hop_forward((U8*)s, prog->minlenret, (U8 *) strend) - strbeg
3659 : s - strbeg + prog->minlenret;
3660 if ( !(flags & REXEC_NOT_FIRST) )
3661 S_reg_set_capture_string(aTHX_ rx,
3663 sv, flags, utf8_target);
3669 multiline = prog->extflags & RXf_PMf_MULTILINE;
3671 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
3672 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
3673 "String too short [regexec_flags]...\n"));
3677 /* Check validity of program. */
3678 if (UCHARAT(progi->program) != REG_MAGIC) {
3679 Perl_croak(aTHX_ "corrupted regexp program");
3682 RXp_MATCH_TAINTED_off(prog);
3683 RXp_MATCH_UTF8_set(prog, utf8_target);
3685 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3686 reginfo->intuit = 0;
3687 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3688 reginfo->warned = FALSE;
3690 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
3691 /* see how far we have to get to not match where we matched before */
3692 reginfo->till = stringarg + minend;
3694 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
3695 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3696 S_cleanup_regmatch_info_aux has executed (registered by
3697 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3698 magic belonging to this SV.
3699 Not newSVsv, either, as it does not COW.
3701 reginfo->sv = newSV(0);
3702 SvSetSV_nosteal(reginfo->sv, sv);
3703 SAVEFREESV(reginfo->sv);
3706 /* reserve next 2 or 3 slots in PL_regmatch_state:
3707 * slot N+0: may currently be in use: skip it
3708 * slot N+1: use for regmatch_info_aux struct
3709 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3710 * slot N+3: ready for use by regmatch()
3714 regmatch_state *old_regmatch_state;
3715 regmatch_slab *old_regmatch_slab;
3716 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3718 /* on first ever match, allocate first slab */
3719 if (!PL_regmatch_slab) {
3720 Newx(PL_regmatch_slab, 1, regmatch_slab);
3721 PL_regmatch_slab->prev = NULL;
3722 PL_regmatch_slab->next = NULL;
3723 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3726 old_regmatch_state = PL_regmatch_state;
3727 old_regmatch_slab = PL_regmatch_slab;
3729 for (i=0; i <= max; i++) {
3731 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3733 reginfo->info_aux_eval =
3734 reginfo->info_aux->info_aux_eval =
3735 &(PL_regmatch_state->u.info_aux_eval);
3737 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3738 PL_regmatch_state = S_push_slab(aTHX);
3741 /* note initial PL_regmatch_state position; at end of match we'll
3742 * pop back to there and free any higher slabs */
3744 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3745 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3746 reginfo->info_aux->poscache = NULL;
3748 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3750 if ((prog->extflags & RXf_EVAL_SEEN))
3751 S_setup_eval_state(aTHX_ reginfo);
3753 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3756 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3757 /* We have to be careful. If the previous successful match
3758 was from this regex we don't want a subsequent partially
3759 successful match to clobber the old results.
3760 So when we detect this possibility we add a swap buffer
3761 to the re, and switch the buffer each match. If we fail,
3762 we switch it back; otherwise we leave it swapped.
3765 /* avoid leak if we die, or clean up anyway if match completes */
3767 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3768 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
3769 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
3777 if (prog->recurse_locinput)
3778 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3780 /* Simplest case: anchored match (but not \G) need be tried only once,
3781 * or with MBOL, only at the beginning of each line.
3783 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3784 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3785 * match at the start of the string then it won't match anywhere else
3786 * either; while with /.*.../, if it doesn't match at the beginning,
3787 * the earliest it could match is at the start of the next line */
3789 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3792 if (regtry(reginfo, &s))
3795 if (!(prog->intflags & PREGf_ANCH_MBOL))
3798 /* didn't match at start, try at other newline positions */
3801 dontbother = minlen - 1;
3802 end = HOP3c(strend, -dontbother, strbeg) - 1;
3804 /* skip to next newline */
3806 while (s <= end) { /* note it could be possible to match at the end of the string */
3807 /* NB: newlines are the same in unicode as they are in latin */
3810 if (prog->check_substr || prog->check_utf8) {
3811 /* note that with PREGf_IMPLICIT, intuit can only fail
3812 * or return the start position, so it's of limited utility.
3813 * Nevertheless, I made the decision that the potential for
3814 * quick fail was still worth it - DAPM */
3815 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3819 if (regtry(reginfo, &s))
3823 } /* end anchored search */
3825 /* anchored \G match */
3826 if (prog->intflags & PREGf_ANCH_GPOS)
3828 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3829 assert(prog->intflags & PREGf_GPOS_SEEN);
3830 /* For anchored \G, the only position it can match from is
3831 * (ganch-gofs); we already set startpos to this above; if intuit
3832 * moved us on from there, we can't possibly succeed */
3833 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
3834 if (s == startpos && regtry(reginfo, &s))
3839 /* Messy cases: unanchored match. */
3841 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3842 /* we have /x+whatever/ */
3843 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3849 if (! prog->anchored_utf8) {
3850 to_utf8_substr(prog);
3852 ch = SvPVX_const(prog->anchored_utf8)[0];
3853 REXEC_FBC_UTF8_SCAN(
3855 DEBUG_EXECUTE_r( did_match = 1 );
3856 if (regtry(reginfo, &s)) goto got_it;
3857 s += UTF8_SAFE_SKIP(s, strend);
3858 while (s < strend && *s == ch)
3865 if (! prog->anchored_substr) {
3866 if (! to_byte_substr(prog)) {
3867 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3870 ch = SvPVX_const(prog->anchored_substr)[0];
3871 REXEC_FBC_NON_UTF8_SCAN(
3873 DEBUG_EXECUTE_r( did_match = 1 );
3874 if (regtry(reginfo, &s)) goto got_it;
3876 while (s < strend && *s == ch)
3881 DEBUG_EXECUTE_r(if (!did_match)
3882 Perl_re_printf( aTHX_
3883 "Did not find anchored character...\n")
3886 else if (prog->anchored_substr != NULL
3887 || prog->anchored_utf8 != NULL
3888 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3889 && prog->float_max_offset < strend - s)) {
3894 char *last1; /* Last position checked before */
3898 if (prog->anchored_substr || prog->anchored_utf8) {
3900 if (! prog->anchored_utf8) {
3901 to_utf8_substr(prog);
3903 must = prog->anchored_utf8;
3906 if (! prog->anchored_substr) {
3907 if (! to_byte_substr(prog)) {
3908 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3911 must = prog->anchored_substr;
3913 back_max = back_min = prog->anchored_offset;
3916 if (! prog->float_utf8) {
3917 to_utf8_substr(prog);
3919 must = prog->float_utf8;
3922 if (! prog->float_substr) {
3923 if (! to_byte_substr(prog)) {
3924 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3927 must = prog->float_substr;
3929 back_max = prog->float_max_offset;
3930 back_min = prog->float_min_offset;
3936 last = HOP3c(strend, /* Cannot start after this */
3937 -(SSize_t)(CHR_SVLEN(must)
3938 - (SvTAIL(must) != 0) + back_min), strbeg);
3940 if (s > reginfo->strbeg)
3941 last1 = HOPc(s, -1);
3943 last1 = s - 1; /* bogus */
3945 /* XXXX check_substr already used to find "s", can optimize if
3946 check_substr==must. */
3948 strend = HOPc(strend, -dontbother);
3949 while ( (s <= last) &&
3950 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3951 (unsigned char*)strend, must,
3952 multiline ? FBMrf_MULTILINE : 0)) ) {
3953 DEBUG_EXECUTE_r( did_match = 1 );
3954 if (HOPc(s, -back_max) > last1) {
3955 last1 = HOPc(s, -back_min);
3956 s = HOPc(s, -back_max);
3959 char * const t = (last1 >= reginfo->strbeg)
3960 ? HOPc(last1, 1) : last1 + 1;
3962 last1 = HOPc(s, -back_min);
3966 while (s <= last1) {
3967 if (regtry(reginfo, &s))
3970 s++; /* to break out of outer loop */
3977 while (s <= last1) {
3978 if (regtry(reginfo, &s))
3984 DEBUG_EXECUTE_r(if (!did_match) {
3985 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3986 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3987 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
3988 ((must == prog->anchored_substr || must == prog->anchored_utf8)
3989 ? "anchored" : "floating"),
3990 quoted, RE_SV_TAIL(must));
3994 else if ( (c = progi->regstclass) ) {
3996 const OPCODE op = OP(progi->regstclass);
3997 /* don't bother with what can't match */
3998 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
3999 strend = HOPc(strend, -(minlen - 1));
4002 SV * const prop = sv_newmortal();
4003 regprop(prog, prop, c, reginfo, NULL);
4005 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
4006 s,strend-s,PL_dump_re_max_len);
4007 Perl_re_printf( aTHX_
4008 "Matching stclass %.*s against %s (%d bytes)\n",
4009 (int)SvCUR(prop), SvPVX_const(prop),
4010 quoted, (int)(strend - s));
4013 if (find_byclass(prog, c, s, strend, reginfo))
4015 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
4019 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
4027 if (! prog->float_utf8) {
4028 to_utf8_substr(prog);
4030 float_real = prog->float_utf8;
4033 if (! prog->float_substr) {
4034 if (! to_byte_substr(prog)) {
4035 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
4038 float_real = prog->float_substr;
4041 little = SvPV_const(float_real, len);
4042 if (SvTAIL(float_real)) {
4043 /* This means that float_real contains an artificial \n on
4044 * the end due to the presence of something like this:
4045 * /foo$/ where we can match both "foo" and "foo\n" at the
4046 * end of the string. So we have to compare the end of the
4047 * string first against the float_real without the \n and
4048 * then against the full float_real with the string. We
4049 * have to watch out for cases where the string might be
4050 * smaller than the float_real or the float_real without
4052 char *checkpos= strend - len;
4054 Perl_re_printf( aTHX_
4055 "%sChecking for float_real.%s\n",
4056 PL_colors[4], PL_colors[5]));
4057 if (checkpos + 1 < strbeg) {
4058 /* can't match, even if we remove the trailing \n
4059 * string is too short to match */
4061 Perl_re_printf( aTHX_
4062 "%sString shorter than required trailing substring, cannot match.%s\n",
4063 PL_colors[4], PL_colors[5]));
4065 } else if (memEQ(checkpos + 1, little, len - 1)) {
4066 /* can match, the end of the string matches without the
4068 last = checkpos + 1;
4069 } else if (checkpos < strbeg) {
4070 /* cant match, string is too short when the "\n" is
4073 Perl_re_printf( aTHX_
4074 "%sString does not contain required trailing substring, cannot match.%s\n",
4075 PL_colors[4], PL_colors[5]));
4077 } else if (!multiline) {
4078 /* non multiline match, so compare with the "\n" at the
4079 * end of the string */
4080 if (memEQ(checkpos, little, len)) {
4084 Perl_re_printf( aTHX_
4085 "%sString does not contain required trailing substring, cannot match.%s\n",
4086 PL_colors[4], PL_colors[5]));
4090 /* multiline match, so we have to search for a place
4091 * where the full string is located */
4097 last = rninstr(s, strend, little, little + len);
4099 last = strend; /* matching "$" */
4102 /* at one point this block contained a comment which was
4103 * probably incorrect, which said that this was a "should not
4104 * happen" case. Even if it was true when it was written I am
4105 * pretty sure it is not anymore, so I have removed the comment
4106 * and replaced it with this one. Yves */
4108 Perl_re_printf( aTHX_
4109 "%sString does not contain required substring, cannot match.%s\n",
4110 PL_colors[4], PL_colors[5]
4114 dontbother = strend - last + prog->float_min_offset;
4116 if (minlen && (dontbother < minlen))
4117 dontbother = minlen - 1;
4118 strend -= dontbother; /* this one's always in bytes! */
4119 /* We don't know much -- general case. */
4122 if (regtry(reginfo, &s))
4131 if (regtry(reginfo, &s))
4133 } while (s++ < strend);
4141 /* s/// doesn't like it if $& is earlier than where we asked it to
4142 * start searching (which can happen on something like /.\G/) */
4143 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
4144 && (prog->offs[0].start < stringarg - strbeg))
4146 /* this should only be possible under \G */
4147 assert(prog->intflags & PREGf_GPOS_SEEN);
4148 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
4149 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
4153 /* clean up; this will trigger destructors that will free all slabs
4154 * above the current one, and cleanup the regmatch_info_aux
4155 * and regmatch_info_aux_eval sructs */
4157 LEAVE_SCOPE(oldsave);
4159 if (RXp_PAREN_NAMES(prog))
4160 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
4162 /* make sure $`, $&, $', and $digit will work later */
4163 if ( !(flags & REXEC_NOT_FIRST) )
4164 S_reg_set_capture_string(aTHX_ rx,
4165 strbeg, reginfo->strend,
4166 sv, flags, utf8_target);
4171 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
4172 PL_colors[4], PL_colors[5]));
4175 /* we failed :-( roll it back.
4176 * Since the swap buffer will be freed on scope exit which follows
4177 * shortly, restore the old captures by copying 'swap's original
4178 * data to the new offs buffer
4180 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
4181 "rex=0x%" UVxf " rolling back offs: 0x%" UVxf " will be freed; restoring data to =0x%" UVxf "\n",
4188 Copy(swap, prog->offs, prog->nparens + 1, regexp_paren_pair);
4191 /* clean up; this will trigger destructors that will free all slabs
4192 * above the current one, and cleanup the regmatch_info_aux
4193 * and regmatch_info_aux_eval sructs */
4195 LEAVE_SCOPE(oldsave);
4201 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
4202 * Do inc before dec, in case old and new rex are the same */
4203 #define SET_reg_curpm(Re2) \
4204 if (reginfo->info_aux_eval) { \
4205 (void)ReREFCNT_inc(Re2); \
4206 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
4207 PM_SETRE((PL_reg_curpm), (Re2)); \
4212 - regtry - try match at specific point
4214 STATIC bool /* 0 failure, 1 success */
4215 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
4218 REGEXP *const rx = reginfo->prog;
4219 regexp *const prog = ReANY(rx);
4222 U32 depth = 0; /* used by REGCP_SET */
4224 RXi_GET_DECL(prog,progi);
4225 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4227 PERL_ARGS_ASSERT_REGTRY;
4229 reginfo->cutpoint=NULL;
4231 prog->offs[0].start = *startposp - reginfo->strbeg;
4232 prog->lastparen = 0;
4233 prog->lastcloseparen = 0;
4235 /* XXXX What this code is doing here?!!! There should be no need
4236 to do this again and again, prog->lastparen should take care of
4239 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
4240 * Actually, the code in regcppop() (which Ilya may be meaning by
4241 * prog->lastparen), is not needed at all by the test suite
4242 * (op/regexp, op/pat, op/split), but that code is needed otherwise
4243 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
4244 * Meanwhile, this code *is* needed for the
4245 * above-mentioned test suite tests to succeed. The common theme
4246 * on those tests seems to be returning null fields from matches.
4247 * --jhi updated by dapm */
4249 /* After encountering a variant of the issue mentioned above I think
4250 * the point Ilya was making is that if we properly unwind whenever
4251 * we set lastparen to a smaller value then we should not need to do
4252 * this every time, only when needed. So if we have tests that fail if
4253 * we remove this, then it suggests somewhere else we are improperly
4254 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
4255 * places it is called, and related regcp() routines. - Yves */
4257 if (prog->nparens) {
4258 regexp_paren_pair *pp = prog->offs;
4260 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
4268 result = regmatch(reginfo, *startposp, progi->program + 1);
4270 prog->offs[0].end = result;
4273 if (reginfo->cutpoint)
4274 *startposp= reginfo->cutpoint;
4275 REGCP_UNWIND(lastcp);
4279 /* this is used to determine how far from the left messages like
4280 'failed...' are printed in regexec.c. It should be set such that
4281 messages are inline with the regop output that created them.
4283 #define REPORT_CODE_OFF 29
4284 #define INDENT_CHARS(depth) ((int)(depth) % 20)
4287 Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
4291 PerlIO *f= Perl_debug_log;
4292 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
4293 va_start(ap, depth);
4294 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
4295 result = PerlIO_vprintf(f, fmt, ap);
4299 #endif /* DEBUGGING */
4301 /* grab a new slab and return the first slot in it */
4303 STATIC regmatch_state *
4306 regmatch_slab *s = PL_regmatch_slab->next;
4308 Newx(s, 1, regmatch_slab);
4309 s->prev = PL_regmatch_slab;
4311 PL_regmatch_slab->next = s;
4313 PL_regmatch_slab = s;
4314 return SLAB_FIRST(s);
4320 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
4321 const char *start, const char *end, const char *blurb)
4323 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
4325 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
4330 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
4331 RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len);
4333 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
4334 start, end - start, PL_dump_re_max_len);
4336 Perl_re_printf( aTHX_
4337 "%s%s REx%s %s against %s\n",
4338 PL_colors[4], blurb, PL_colors[5], s0, s1);
4340 if (utf8_target||utf8_pat)
4341 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
4342 utf8_pat ? "pattern" : "",
4343 utf8_pat && utf8_target ? " and " : "",
4344 utf8_target ? "string" : ""
4350 S_dump_exec_pos(pTHX_ const char *locinput,
4351 const regnode *scan,
4352 const char *loc_regeol,
4353 const char *loc_bostr,
4354 const char *loc_reg_starttry,
4355 const bool utf8_target,
4359 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
4360 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
4361 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
4362 /* The part of the string before starttry has one color
4363 (pref0_len chars), between starttry and current
4364 position another one (pref_len - pref0_len chars),
4365 after the current position the third one.
4366 We assume that pref0_len <= pref_len, otherwise we
4367 decrease pref0_len. */
4368 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
4369 ? (5 + taill) - l : locinput - loc_bostr;
4372 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
4374 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
4376 pref0_len = pref_len - (locinput - loc_reg_starttry);
4377 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
4378 l = ( loc_regeol - locinput > (5 + taill) - pref_len
4379 ? (5 + taill) - pref_len : loc_regeol - locinput);
4380 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
4384 if (pref0_len > pref_len)
4385 pref0_len = pref_len;
4387 const int is_uni = utf8_target ? 1 : 0;
4389 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
4390 (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5);
4392 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
4393 (locinput - pref_len + pref0_len),
4394 pref_len - pref0_len, PL_dump_re_max_len, 2, 3);
4396 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
4397 locinput, loc_regeol - locinput, 10, 0, 1);
4399 const STRLEN tlen=len0+len1+len2;
4400 Perl_re_printf( aTHX_
4401 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
4402 (IV)(locinput - loc_bostr),
4405 (docolor ? "" : "> <"),
4407 (int)(tlen > 19 ? 0 : 19 - tlen),
4415 /* reg_check_named_buff_matched()
4416 * Checks to see if a named buffer has matched. The data array of
4417 * buffer numbers corresponding to the buffer is expected to reside
4418 * in the regexp->data->data array in the slot stored in the ARG() of
4419 * node involved. Note that this routine doesn't actually care about the
4420 * name, that information is not preserved from compilation to execution.
4421 * Returns the index of the leftmost defined buffer with the given name
4422 * or 0 if non of the buffers matched.
4425 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
4428 RXi_GET_DECL(rex,rexi);
4429 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4430 I32 *nums=(I32*)SvPVX(sv_dat);
4432 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
4434 for ( n=0; n<SvIVX(sv_dat); n++ ) {
4435 if ((I32)rex->lastparen >= nums[n] &&
4436 rex->offs[nums[n]].end != -1)
4445 S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node,
4446 struct next_matchable_info * m,
4447 regmatch_info *reginfo)
4449 /* This function determines various characteristics about every possible
4450 * initial match of the passed-in EXACTish <text_node>, and stores them in
4453 * That includes a match string and a parallel mask, such that if you AND
4454 * the target string with the mask and compare with the match string,
4455 * you'll have a pretty good idea, perhaps even perfect, if that portion of
4456 * the target matches or not.
4458 * The motivation behind this function is to allow the caller to set up
4459 * tight loops for matching. Consider patterns like '.*B' or '.*?B' where
4460 * B is an arbitrary EXACTish node. To find the end of .*, we look for the
4461 * beginning oF B, which is the passed in <text_node> That's where this
4462 * function comes in. The values it returns can quickly be used to rule
4463 * out many, or all, cases of possible matches not actually being the
4464 * beginning of B, <text_node>. It is also used in regrepeat() where we
4465 * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently
4466 * determine where the span of 'A's stop.
4468 * If <text_node> is of type EXACT, there is only one possible character
4469 * that can match its first character, and so the situation is quite
4470 * simple. But things can get much more complicated if folding is
4471 * involved. It may be that the first character of an EXACTFish node
4472 * doesn't participate in any possible fold, e.g., punctuation, so it can
4473 * be matched only by itself. The vast majority of characters that are in
4474 * folds match just two things, their lower and upper-case equivalents.
4475 * But not all are like that; some have multiple possible matches, or match
4476 * sequences of more than one character. This function sorts all that out.
4478 * It returns information about all possibilities of what the first
4479 * character(s) of <text_node> could look like. Again, if <text_node> is a
4480 * plain EXACT node, that's just the actual first bytes of the first
4481 * character; but otherwise it is the bytes, that when masked, match all
4482 * possible combinations of all the initial bytes of all the characters
4483 * that could match, folded. (Actually, this is a slight over promise. It
4484 * handles only up to the initial 5 bytes, which is enough for all Unicode
4485 * characters, but not for all non-Unicode ones.)
4487 * Here's an example to clarify. Suppose the first character of
4488 * <text_node> is the letter 'C', and we are under /i matching. That means
4489 * 'c' also matches. The representations of these two characters differ in
4490 * just one bit, so the mask would be a zero in that position and ones in
4491 * the other 7. And the returned string would be the AND of these two
4492 * characters, and would be one byte long, since these characters are each
4493 * a single byte. ANDing the target <text_node> with this mask will yield
4494 * the returned string if and only if <text_node> begins with one of these
4495 * two characters. So, the function would also return that the definitive
4496 * length matched is 1 byte.
4498 * Now, suppose instead of the letter 'C', <text_node> begins with the
4499 * letter 'F'. The situation is much more complicated because there are
4500 * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also
4501 * begins with 'f', and hence could match. We add these into the returned
4502 * string and mask, but the result isn't definitive; the caller has to
4503 * check further if its AND and compare pass. But the failure of that
4504 * compare will quickly rule out most possible inputs.
4506 * Much of this could be done in regcomp.c at compile time, except for
4507 * locale-dependent, and UTF-8 target dependent data. Extra data fields
4508 * could be used for one or the other eventualities.
4510 * If this function determines that no possible character in the target
4511 * string can match, it returns FALSE; otherwise TRUE. (The FALSE
4512 * situation occurs if the first character in <text_node> requires UTF-8 to
4513 * represent, and the target string isn't in UTF-8.)
4515 * Some analysis is in GH #18414, located at the time of this writing at:
4516 * https://github.com/Perl/perl5/issues/18414
4519 const bool utf8_target = reginfo->is_utf8_target;
4520 bool utf8_pat = reginfo->is_utf8_pat;
4522 PERL_UINT_FAST8_T i;
4524 /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e
4526 U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } };
4527 U8 lengths[MAX_MATCHES] = { 0 };
4529 U8 index_of_longest = 0;
4531 U8 *pat = (U8*)STRING(text_node);
4532 Size_t pat_len = STR_LEN(text_node);
4533 U8 op = OP(text_node);
4535 U8 byte_mask[5] = {0};
4536 U8 byte_anded[5] = {0};
4538 /* There are some folds in Unicode to multiple characters. This will hold
4539 * such characters that could fold to the beginning of 'text_node' */
4540 UV multi_fold_from = 0;
4542 /* We may have to create a modified copy of the pattern */
4543 U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' };
4546 m->min_length = 255;
4549 /* Even if the first character in the node can match something in Latin1,
4550 * if there is anything in the node that can't, the match must fail */
4551 if (! utf8_target && isEXACT_REQ8(op)) {
4555 /* Define a temporary op for use in this function, using an existing one that
4556 * should never be a real op during execution */
4557 #define TURKISH PSEUDO
4559 /* What to do about these two nodes had to be deferred to runtime (which is
4560 * now). If the extra information we now have so indicates, turn them into
4562 if ( (op == EXACTF && utf8_target)
4563 || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4565 if (op == EXACTFL && PL_in_utf8_turkic_locale) {
4572 /* And certain situations are better handled if we create a modified
4573 * version of the pattern */
4574 if (utf8_pat) { /* Here, must have been EXACTFL, so look at the
4575 specific problematic characters */
4576 if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) {
4578 /* The node could start with characters that are the first ones
4579 * of a multi-character fold. */
4581 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4582 if (multi_fold_from) {
4584 /* Here, they do form a sequence that matches the fold of a
4585 * single character. That single character then is a
4586 * possible match. Below we will look again at this, but
4587 * the code below is expecting every character in the
4588 * pattern to be folded, which the input isn't required to
4589 * be in this case. So, just fold the single character,
4590 * and the result will be in the expected form. */
4591 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4595 /* Turkish has a couple extra possibilities. */
4596 else if ( UNLIKELY(op == TURKISH)
4598 && isALPHA_FOLD_EQ(pat[0], 'f')
4599 && ( memBEGINs(pat + 1, pat_len - 1,
4600 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4602 && isALPHA_FOLD_EQ(pat[1], 'f')
4603 && memBEGINs(pat + 2, pat_len - 2,
4604 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)
4606 /* The macros for finding a multi-char fold don't include
4607 * the Turkish possibilities, in which U+130 folds to 'i'.
4608 * Hard-code these. It's very unlikely that Unicode will
4609 * ever add any others. */
4610 if (pat[1] == 'f') {
4612 Copy("ffi", mod_pat, pat_len, U8);
4616 Copy("fi", mod_pat, pat_len, U8);
4620 else if ( UTF8_IS_DOWNGRADEABLE_START(*pat)
4621 && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8))
4622 && LIKELY(memNEs(pat, pat_len,
4623 LATIN_SMALL_LETTER_SHARP_S_UTF8))
4624 && (LIKELY(op != TURKISH || *pat != 'I')))
4626 /* For all cases of things between 0-255, except the ones
4627 * in the conditional above, the fold is just the lower
4628 * case, which is faster than the more general case. */
4629 mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0],
4635 else { /* Code point above 255, or needs special handling */
4636 _to_utf8_fold_flags(pat, pat + pat_len,
4638 FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE);
4643 else if /* Below is not a UTF-8 pattern; there's a somewhat different
4644 set of problematic characters */
4646 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len)))
4648 /* We may have to canonicalize a multi-char fold, as in the UTF-8
4650 _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len,
4654 else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) {
4655 mod_pat[0] = mod_pat[1] = 's';
4657 utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant
4658 chars, and speeds copying */
4661 else if (LIKELY(op != TURKISH || *pat != 'I')) {
4662 mod_pat[0] = toLOWER_L1(*pat);
4667 else if /* Below isn't a node that we convert to UTF-8 */
4670 && op == EXACTFAA_NO_TRIE
4671 && *pat == LATIN_SMALL_LETTER_SHARP_S)
4673 /* A very special case. Folding U+DF goes to U+17F under /iaa. We
4674 * did this at compile time when the pattern was UTF-8 , but otherwise
4675 * we couldn't do it earlier, because it requires a UTF-8 target for
4676 * this match to be legal. */
4677 pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1);
4678 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8
4679 LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8);
4684 /* Here, we have taken care of the initial work for a few very problematic
4685 * situations, possibly creating a modified pattern.
4687 * Now ready for the general case. We build up all the possible things
4688 * that could match the first character of the pattern into the elements of
4691 * Everything generally matches at least itself. But if there is a
4692 * UTF8ness mismatch, we have to convert to that of the target string. */
4693 if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */
4694 matches[0][0] = pat[0];
4698 else if (utf8_target) {
4700 lengths[0] = UTF8SKIP(pat);
4701 Copy(pat, matches[0], lengths[0], U8);
4704 else { /* target is UTF-8, pattern isn't */
4705 matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]);
4706 matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]);
4711 else if (! utf8_pat) { /* Neither is UTF-8 */
4712 matches[0][0] = pat[0];
4716 else /* target isn't UTF-8; pattern is. No match possible unless the
4717 pattern's first character can fit in a byte */
4718 if (UTF8_IS_DOWNGRADEABLE_START(*pat))
4720 matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]);
4725 /* Here we have taken care of any necessary node-type changes */
4728 m->max_length = lengths[0];
4729 m->min_length = lengths[0];
4732 /* For non-folding nodes, there are no other possible candidate matches,
4733 * but for foldable ones, we have to look further. */
4734 if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */
4735 UV folded; /* The first character in the pattern, folded */
4736 U32 first_fold_from; /* A character that folds to it */
4737 const U32 * remaining_fold_froms; /* The remaining characters that
4738 fold to it, if any */
4739 Size_t folds_to_count; /* The total number of characters that fold to
4742 /* If the node begins with a sequence of more than one character that
4743 * together form the fold of a single character, it is called a
4744 * 'multi-character fold', and the normal functions don't handle this
4745 * case. We set 'multi_fold_from' to the single folded-from character,
4746 * which is handled in an extra iteration below */
4748 folded = valid_utf8_to_uvchr(pat, NULL);
4750 = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len);
4755 /* This may generate illegal combinations for things like EXACTF,
4756 * but rather than repeat the logic and exclude them here, all such
4757 * illegalities are checked for and skipped below in the loop */
4759 = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len);
4762 /* Everything matches at least itself; initialize to that because the
4763 * only the branches below that set it are the ones where the number
4767 /* There are a few special cases for locale-dependent nodes, where the
4768 * run-time context was needed before we could know what matched */
4769 if (UNLIKELY(op == EXACTFL) && folded < 256) {
4770 first_fold_from = PL_fold_locale[folded];
4772 else if ( op == EXACTFL && utf8_target && utf8_pat
4773 && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8
4774 LATIN_SMALL_LETTER_LONG_S_UTF8))
4776 first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S;
4778 else if (UNLIKELY( op == TURKISH
4779 && ( isALPHA_FOLD_EQ(folded, 'i')
4781 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE,
4782 LATIN_SMALL_LETTER_DOTLESS_I))))
4783 { /* Turkish folding requires special handling */
4785 first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
4786 else if (folded == 'I')
4787 first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I;
4788 else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
4789 first_fold_from = 'i';
4790 else first_fold_from = 'I';
4793 /* Here, isn't a special case: use the generic function to
4794 * calculate what folds to this */
4796 /* Look up what code points (besides itself) fold to 'folded';
4797 * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */
4798 folds_to_count = _inverse_folds(folded, &first_fold_from,
4799 &remaining_fold_froms);
4802 /* Add each character that folds to 'folded' to the list of them,
4803 * subject to limitations based on the node type and target UTF8ness.
4804 * If there was a character that folded to multiple characters, do an
4805 * extra iteration for it. (Note the extra iteration if there is a
4806 * multi-character fold) */
4807 for (i = 0; i < folds_to_count
4808 + UNLIKELY(multi_fold_from != 0); i++)
4812 if (i >= folds_to_count) { /* Final iteration: handle the
4814 fold_from = multi_fold_from;
4817 fold_from = first_fold_from;
4819 else if (i < folds_to_count) {
4820 fold_from = remaining_fold_froms[i-1];
4823 if (folded == fold_from) { /* We already added the character
4828 /* EXACTF doesn't have any non-ascii folds */
4829 if (op == EXACTF && (! isASCII(folded) || ! isASCII(fold_from))) {
4833 /* In /iaa nodes, neither or both must be ASCII to be a legal fold
4835 if ( isASCII(folded) != isASCII(fold_from)
4836 && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE))
4842 /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8
4843 * locale, but those have been converted to EXACTFU above) */
4845 && (folded < 256) != (fold_from < 256))
4850 /* If this triggers, it likely is because of the unlikely case
4851 * where a new Unicode standard has changed what MAX_MATCHES should
4853 assert(m->count < MAX_MATCHES);
4855 /* Add this character to the list of possible matches */
4857 uvchr_to_utf8(matches[m->count], fold_from);
4858 lengths[m->count] = UVCHR_SKIP(fold_from);
4861 else { /* Non-UTF8 target: no code point above 255 can appear in it
4863 if (fold_from > 255) {
4867 matches[m->count][0] = fold_from;
4868 lengths[m->count] = 1;
4872 /* Update min and mlengths */
4873 if (m->min_length > lengths[m->count-1]) {
4874 m->min_length = lengths[m->count-1];
4877 if (m->max_length < lengths[m->count-1]) {
4878 index_of_longest = m->count - 1;
4879 m->max_length = lengths[index_of_longest];
4881 } /* looped through each potential fold */
4883 /* If there is something that folded to an initial multi-character
4884 * fold, repeat, using it. This catches some edge cases. An example
4885 * of one is /ss/i when UTF-8 encoded. The function
4886 * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF
4887 * (LATIN SMALL SHARP S). If it returned a list of characters, this
4888 * code wouldn't be needed. But since it doesn't, we have to look what
4889 * folds to the U+DF. In this case, U+1E9E does, and has to be added.
4891 if (multi_fold_from) {
4892 folded = multi_fold_from;
4893 multi_fold_from = 0;
4896 } /* End of finding things that participate in this fold */
4898 if (m->count == 0) { /* If nothing found, can't match */
4903 /* Have calculated all possible matches. Now calculate the mask and AND
4905 m->initial_exact = 0;
4906 m->initial_definitive = 0;
4909 unsigned int mask_ones = 0;
4910 unsigned int possible_ones = 0;
4913 /* For each byte that is in all possible matches ... */
4914 for (j = 0; j < MIN(m->min_length, 5); j++) {
4916 /* Initialize the accumulator for this byte */
4917 byte_mask[j] = 0xFF;
4918 byte_anded[j] = matches[0][j];
4920 /* Then the rest of the rows (folds). The mask is based on, like,
4921 * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0
4922 * where they differ. */
4923 for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) {
4924 byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]);
4925 byte_anded[j] &= matches[i][j];
4928 /* Keep track of the number of initial mask bytes that are all one
4929 * bits. The code calling this can use this number to know that
4930 * a string that matches this number of bytes in the pattern is an
4931 * exact match of that pattern for this number of bytes. But also
4932 * counted are the number of initial bytes that in total have a
4933 * single zero bit. If a string matches those, masked, it must be
4934 * one of two possibilites, both of which this function has
4935 * determined are legal. (But if that single 0 is one of the
4936 * initial bits for masking a UTF-8 start byte, that could
4937 * incorrectly lead to different length strings appearing to be
4938 * equivalent, so only do this optimization when the matchables are
4939 * all the same length. This was uncovered by testing
4941 if (m->min_length == m->max_length) {
4942 mask_ones += PL_bitcount[byte_mask[j]];
4944 if (mask_ones + 1 >= possible_ones) {
4945 m->initial_definitive++;
4946 if (mask_ones >= possible_ones) {
4954 /* The first byte is separate for speed */
4955 m->first_byte_mask = byte_mask[0];
4956 m->first_byte_anded = byte_anded[0];
4958 /* Then pack up to the next 4 bytes into a word */
4959 m->mask32 = m->anded32 = 0;
4960 for (i = 1; i < MIN(m->min_length, 5); i++) {
4962 U8 shift = (which - 1) * 8;
4963 m->mask32 |= (U32) byte_mask[i] << shift;
4964 m->anded32 |= (U32) byte_anded[i] << shift;
4967 /* Finally, take the match strings and place them sequentially into a
4968 * one-dimensional array. (This is done to save significant space in the
4969 * structure.) Sort so the longest (presumably the least likely) is last.
4970 * XXX When this gets moved to regcomp, may want to fully sort shortest
4971 * first, but above we generally used the folded code point first, and
4972 * those tend to be no longer than their upper case values, so this is
4973 * already pretty well sorted by size.
4975 * If the asserts fail, it's most likely because a new version of the
4976 * Unicode standard requires more space; simply increase the declaration
4980 U8 output_index = 0;
4982 if (m->count > 1) { /* No need to sort a single entry */
4983 for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) {
4985 /* Keep the same order for all but the longest. (If the
4986 * asserts fail, it could be because m->matches is declared too
4987 * short, either because of a new Unicode release, or an
4988 * overlooked test case, or it could be a bug.) */
4989 if (i != index_of_longest) {
4990 assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches));
4991 Copy(matches[i], m->matches + cur_pos, lengths[i], U8);
4992 cur_pos += lengths[i];
4993 m->lengths[output_index++] = lengths[i];
4998 assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches));
4999 Copy(matches[index_of_longest], m->matches + cur_pos,
5000 lengths[index_of_longest], U8);
5002 /* Place the longest match last */
5003 m->lengths[output_index] = lengths[index_of_longest];
5010 PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */
5012 S_test_EXACTISH_ST(const char * loc,
5013 struct next_matchable_info info)
5015 /* This function uses the data set up in setup_EXACTISH_ST() to see if the
5016 * bytes starting at 'loc' can match based on 'next_matchable_info' */
5020 /* Check the first byte */
5021 if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded)
5024 /* Pack the next up-to-4 bytes into a 32 bit word */
5025 switch (info.min_length) {
5027 input32 |= (U32) ((U8) loc[4]) << 3 * 8;
5030 input32 |= (U8) loc[3] << 2 * 8;
5033 input32 |= (U8) loc[2] << 1 * 8;
5036 input32 |= (U8) loc[1];
5039 return TRUE; /* We already tested and passed the 0th byte */
5044 /* And AND that with the mask and compare that with the assembled ANDED
5046 return (input32 & info.mask32) == info.anded32;
5050 S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
5052 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
5053 * between the inputs. See https://www.unicode.org/reports/tr29/. */
5055 PERL_ARGS_ASSERT_ISGCB;
5057 switch (GCB_table[before][after]) {
5064 case GCB_RI_then_RI:
5067 U8 * temp_pos = (U8 *) curpos;
5069 /* Do not break within emoji flag sequences. That is, do not
5070 * break between regional indicator (RI) symbols if there is an
5071 * odd number of RI characters before the break point.
5072 * GB12 sot (RI RI)* RI × RI
5073 * GB13 [^RI] (RI RI)* RI × RI */
5075 while (backup_one_GCB(strbeg,
5077 utf8_target) == GCB_Regional_Indicator)
5082 return RI_count % 2 != 1;
5085 case GCB_EX_then_EM:
5087 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
5089 U8 * temp_pos = (U8 *) curpos;
5093 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5095 while (prev == GCB_Extend);
5097 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
5100 case GCB_Maybe_Emoji_NonBreak:
5104 /* Do not break within emoji modifier sequences or emoji zwj sequences.
5105 GB11 \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
5107 U8 * temp_pos = (U8 *) curpos;
5111 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
5113 while (prev == GCB_Extend);
5115 return prev != GCB_ExtPict_XX;
5123 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
5124 before, after, GCB_table[before][after]);
5131 S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5135 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
5137 if (*curpos < strbeg) {
5142 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5143 U8 * prev_prev_char_pos;
5145 if (! prev_char_pos) {
5149 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5150 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5151 *curpos = prev_char_pos;
5152 prev_char_pos = prev_prev_char_pos;
5155 *curpos = (U8 *) strbeg;
5160 if (*curpos - 2 < strbeg) {
5161 *curpos = (U8 *) strbeg;
5165 gcb = getGCB_VAL_CP(*(*curpos - 1));
5171 /* Combining marks attach to most classes that precede them, but this defines
5172 * the exceptions (from TR14) */
5173 #define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
5174 || prev == LB_Mandatory_Break \
5175 || prev == LB_Carriage_Return \
5176 || prev == LB_Line_Feed \
5177 || prev == LB_Next_Line \
5178 || prev == LB_Space \
5179 || prev == LB_ZWSpace))
5182 S_isLB(pTHX_ LB_enum before,
5184 const U8 * const strbeg,
5185 const U8 * const curpos,
5186 const U8 * const strend,
5187 const bool utf8_target)
5189 U8 * temp_pos = (U8 *) curpos;
5190 LB_enum prev = before;
5192 /* Is the boundary between 'before' and 'after' line-breakable?
5193 * Most of this is just a table lookup of a generated table from Unicode
5194 * rules. But some rules require context to decide, and so have to be
5195 * implemented in code */
5197 PERL_ARGS_ASSERT_ISLB;
5199 /* Rule numbers in the comments below are as of Unicode 9.0 */
5203 switch (LB_table[before][after]) {
5208 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5211 case LB_SP_foo + LB_BREAKABLE:
5212 case LB_SP_foo + LB_NOBREAK:
5213 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
5215 /* When we have something following a SP, we have to look at the
5216 * context in order to know what to do.
5218 * SP SP should not reach here because LB7: Do not break before
5219 * spaces. (For two spaces in a row there is nothing that
5220 * overrides that) */
5221 assert(after != LB_Space);
5223 /* Here we have a space followed by a non-space. Mostly this is a
5224 * case of LB18: "Break after spaces". But there are complications
5225 * as the handling of spaces is somewhat tricky. They are in a
5226 * number of rules, which have to be applied in priority order, but
5227 * something earlier in the string can cause a rule to be skipped
5228 * and a lower priority rule invoked. A prime example is LB7 which
5229 * says don't break before a space. But rule LB8 (lower priority)
5230 * says that the first break opportunity after a ZW is after any
5231 * span of spaces immediately after it. If a ZW comes before a SP
5232 * in the input, rule LB8 applies, and not LB7. Other such rules
5233 * involve combining marks which are rules 9 and 10, but they may
5234 * override higher priority rules if they come earlier in the
5235 * string. Since we're doing random access into the middle of the
5236 * string, we have to look for rules that should get applied based
5237 * on both string position and priority. Combining marks do not
5238 * attach to either ZW nor SP, so we don't have to consider them
5241 * To check for LB8, we have to find the first non-space character
5242 * before this span of spaces */
5244 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5246 while (prev == LB_Space);
5248 /* LB8 Break before any character following a zero-width space,
5249 * even if one or more spaces intervene.
5251 * So if we have a ZW just before this span, and to get here this
5252 * is the final space in the span. */
5253 if (prev == LB_ZWSpace) {
5257 /* Here, not ZW SP+. There are several rules that have higher
5258 * priority than LB18 and can be resolved now, as they don't depend
5259 * on anything earlier in the string (except ZW, which we have
5260 * already handled). One of these rules is LB11 Do not break
5261 * before Word joiner, but we have specially encoded that in the
5262 * lookup table so it is caught by the single test below which
5263 * catches the other ones. */
5264 if (LB_table[LB_Space][after] - LB_SP_foo
5265 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
5270 /* If we get here, we have to XXX consider combining marks. */
5271 if (prev == LB_Combining_Mark) {
5273 /* What happens with these depends on the character they
5276 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5278 while (prev == LB_Combining_Mark);
5280 /* Most times these attach to and inherit the characteristics
5281 * of that character, but not always, and when not, they are to
5282 * be treated as AL by rule LB10. */
5283 if (! LB_CM_ATTACHES_TO(prev)) {
5284 prev = LB_Alphabetic;
5288 /* Here, we have the character preceding the span of spaces all set
5289 * up. We follow LB18: "Break after spaces" unless the table shows
5290 * that is overriden */
5291 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
5295 /* We don't know how to treat the CM except by looking at the first
5296 * non-CM character preceding it. ZWJ is treated as CM */
5298 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
5300 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
5302 /* Here, 'prev' is that first earlier non-CM character. If the CM
5303 * attatches to it, then it inherits the behavior of 'prev'. If it
5304 * doesn't attach, it is to be treated as an AL */
5305 if (! LB_CM_ATTACHES_TO(prev)) {
5306 prev = LB_Alphabetic;
5311 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
5312 case LB_HY_or_BA_then_foo + LB_NOBREAK:
5314 /* LB21a Don't break after Hebrew + Hyphen.
5315 * HL (HY | BA) × */
5317 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
5318 == LB_Hebrew_Letter)
5323 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
5325 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
5326 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
5328 /* LB25a (PR | PO) × ( OP | HY )? NU */
5329 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
5333 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
5336 case LB_SY_or_IS_then_various + LB_BREAKABLE:
5337 case LB_SY_or_IS_then_various + LB_NOBREAK:
5339 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
5341 LB_enum temp = prev;
5343 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5345 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
5346 if (temp == LB_Numeric) {
5350 return LB_table[prev][after] - LB_SY_or_IS_then_various
5354 case LB_various_then_PO_or_PR + LB_BREAKABLE:
5355 case LB_various_then_PO_or_PR + LB_NOBREAK:
5357 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
5359 LB_enum temp = prev;
5360 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
5362 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5364 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
5365 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
5367 if (temp == LB_Numeric) {
5370 return LB_various_then_PO_or_PR;
5373 case LB_RI_then_RI + LB_NOBREAK:
5374 case LB_RI_then_RI + LB_BREAKABLE:
5378 /* LB30a Break between two regional indicator symbols if and
5379 * only if there are an even number of regional indicators
5380 * preceding the position of the break.
5382 * sot (RI RI)* RI × RI
5383 * [^RI] (RI RI)* RI × RI */
5385 while (backup_one_LB(strbeg,
5387 utf8_target) == LB_Regional_Indicator)
5392 return RI_count % 2 == 0;
5400 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
5401 before, after, LB_table[before][after]);
5408 S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5413 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
5415 if (*curpos >= strend) {
5420 *curpos += UTF8SKIP(*curpos);
5421 if (*curpos >= strend) {
5424 lb = getLB_VAL_UTF8(*curpos, strend);
5428 if (*curpos >= strend) {
5431 lb = getLB_VAL_CP(**curpos);
5438 S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5442 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
5444 if (*curpos < strbeg) {
5449 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5450 U8 * prev_prev_char_pos;
5452 if (! prev_char_pos) {
5456 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
5457 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5458 *curpos = prev_char_pos;
5459 prev_char_pos = prev_prev_char_pos;
5462 *curpos = (U8 *) strbeg;
5467 if (*curpos - 2 < strbeg) {
5468 *curpos = (U8 *) strbeg;
5472 lb = getLB_VAL_CP(*(*curpos - 1));
5479 S_isSB(pTHX_ SB_enum before,
5481 const U8 * const strbeg,
5482 const U8 * const curpos,
5483 const U8 * const strend,
5484 const bool utf8_target)
5486 /* returns a boolean indicating if there is a Sentence Boundary Break
5487 * between the inputs. See https://www.unicode.org/reports/tr29/ */
5489 U8 * lpos = (U8 *) curpos;
5490 bool has_para_sep = FALSE;
5491 bool has_sp = FALSE;
5493 PERL_ARGS_ASSERT_ISSB;
5495 /* Break at the start and end of text.
5498 But unstated in Unicode is don't break if the text is empty */
5499 if (before == SB_EDGE || after == SB_EDGE) {
5500 return before != after;
5503 /* SB 3: Do not break within CRLF. */
5504 if (before == SB_CR && after == SB_LF) {
5508 /* Break after paragraph separators. CR and LF are considered
5509 * so because Unicode views text as like word processing text where there
5510 * are no newlines except between paragraphs, and the word processor takes
5511 * care of wrapping without there being hard line-breaks in the text *./
5512 SB4. Sep | CR | LF ÷ */
5513 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5517 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
5518 * (See Section 6.2, Replacing Ignore Rules.)
5519 SB5. X (Extend | Format)* → X */
5520 if (after == SB_Extend || after == SB_Format) {
5522 /* Implied is that the these characters attach to everything
5523 * immediately prior to them except for those separator-type
5524 * characters. And the rules earlier have already handled the case
5525 * when one of those immediately precedes the extend char */
5529 if (before == SB_Extend || before == SB_Format) {
5530 U8 * temp_pos = lpos;
5531 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5532 if ( backup != SB_EDGE
5541 /* Here, both 'before' and 'backup' are these types; implied is that we
5542 * don't break between them */
5543 if (backup == SB_Extend || backup == SB_Format) {
5548 /* Do not break after ambiguous terminators like period, if they are
5549 * immediately followed by a number or lowercase letter, if they are
5550 * between uppercase letters, if the first following letter (optionally
5551 * after certain punctuation) is lowercase, or if they are followed by
5552 * "continuation" punctuation such as comma, colon, or semicolon. For
5553 * example, a period may be an abbreviation or numeric period, and thus may
5554 * not mark the end of a sentence.
5556 * SB6. ATerm × Numeric */
5557 if (before == SB_ATerm && after == SB_Numeric) {
5561 /* SB7. (Upper | Lower) ATerm × Upper */
5562 if (before == SB_ATerm && after == SB_Upper) {
5563 U8 * temp_pos = lpos;
5564 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
5565 if (backup == SB_Upper || backup == SB_Lower) {
5570 /* The remaining rules that aren't the final one, all require an STerm or
5571 * an ATerm after having backed up over some Close* Sp*, and in one case an
5572 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
5573 * So do that backup now, setting flags if either Sp or a paragraph
5574 * separator are found */
5576 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
5577 has_para_sep = TRUE;
5578 before = backup_one_SB(strbeg, &lpos, utf8_target);
5581 if (before == SB_Sp) {
5584 before = backup_one_SB(strbeg, &lpos, utf8_target);
5586 while (before == SB_Sp);
5589 while (before == SB_Close) {
5590 before = backup_one_SB(strbeg, &lpos, utf8_target);
5593 /* The next few rules apply only when the backed-up-to is an ATerm, and in
5594 * most cases an STerm */
5595 if (before == SB_STerm || before == SB_ATerm) {
5597 /* So, here the lhs matches
5598 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
5599 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
5600 * The rules that apply here are:
5602 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
5603 | LF | STerm | ATerm) )* Lower
5604 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
5605 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
5606 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
5607 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
5610 /* And all but SB11 forbid having seen a paragraph separator */
5611 if (! has_para_sep) {
5612 if (before == SB_ATerm) { /* SB8 */
5613 U8 * rpos = (U8 *) curpos;
5614 SB_enum later = after;
5616 while ( later != SB_OLetter
5617 && later != SB_Upper
5618 && later != SB_Lower
5622 && later != SB_STerm
5623 && later != SB_ATerm
5624 && later != SB_EDGE)
5626 later = advance_one_SB(&rpos, strend, utf8_target);
5628 if (later == SB_Lower) {
5633 if ( after == SB_SContinue /* SB8a */
5634 || after == SB_STerm
5635 || after == SB_ATerm)
5640 if (! has_sp) { /* SB9 applies only if there was no Sp* */
5641 if ( after == SB_Close
5651 /* SB10. This and SB9 could probably be combined some way, but khw
5652 * has decided to follow the Unicode rule book precisely for
5653 * simplified maintenance */
5667 /* Otherwise, do not break.
5674 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
5678 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
5680 if (*curpos >= strend) {
5686 *curpos += UTF8SKIP(*curpos);
5687 if (*curpos >= strend) {
5690 sb = getSB_VAL_UTF8(*curpos, strend);
5691 } while (sb == SB_Extend || sb == SB_Format);
5696 if (*curpos >= strend) {
5699 sb = getSB_VAL_CP(**curpos);
5700 } while (sb == SB_Extend || sb == SB_Format);
5707 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5711 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
5713 if (*curpos < strbeg) {
5718 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5719 if (! prev_char_pos) {
5723 /* Back up over Extend and Format. curpos is always just to the right
5724 * of the characater whose value we are getting */
5726 U8 * prev_prev_char_pos;
5727 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
5730 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5731 *curpos = prev_char_pos;
5732 prev_char_pos = prev_prev_char_pos;
5735 *curpos = (U8 *) strbeg;
5738 } while (sb == SB_Extend || sb == SB_Format);
5742 if (*curpos - 2 < strbeg) {
5743 *curpos = (U8 *) strbeg;
5747 sb = getSB_VAL_CP(*(*curpos - 1));
5748 } while (sb == SB_Extend || sb == SB_Format);
5755 S_isWB(pTHX_ WB_enum previous,
5758 const U8 * const strbeg,
5759 const U8 * const curpos,
5760 const U8 * const strend,
5761 const bool utf8_target)
5763 /* Return a boolean as to if the boundary between 'before' and 'after' is
5764 * a Unicode word break, using their published algorithm, but tailored for
5765 * Perl by treating spans of white space as one unit. Context may be
5766 * needed to make this determination. If the value for the character
5767 * before 'before' is known, it is passed as 'previous'; otherwise that
5768 * should be set to WB_UNKNOWN. The other input parameters give the
5769 * boundaries and current position in the matching of the string. That
5770 * is, 'curpos' marks the position where the character whose wb value is
5771 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5773 U8 * before_pos = (U8 *) curpos;
5774 U8 * after_pos = (U8 *) curpos;
5775 WB_enum prev = before;
5778 PERL_ARGS_ASSERT_ISWB;
5780 /* Rule numbers in the comments below are as of Unicode 9.0 */
5784 switch (WB_table[before][after]) {
5791 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5792 next = advance_one_WB(&after_pos, strend, utf8_target,
5793 FALSE /* Don't skip Extend nor Format */ );
5794 /* A space immediately preceeding an Extend or Format is attached
5795 * to by them, and hence gets separated from previous spaces.
5796 * Otherwise don't break between horizontal white space */
5797 return next == WB_Extend || next == WB_Format;
5799 /* WB4 Ignore Format and Extend characters, except when they appear at
5800 * the beginning of a region of text. This code currently isn't
5801 * general purpose, but it works as the rules are currently and likely
5802 * to be laid out. The reason it works is that when 'they appear at
5803 * the beginning of a region of text', the rule is to break before
5804 * them, just like any other character. Therefore, the default rule
5805 * applies and we don't have to look in more depth. Should this ever
5806 * change, we would have to have 2 'case' statements, like in the rules
5807 * below, and backup a single character (not spacing over the extend
5808 * ones) and then see if that is one of the region-end characters and
5810 case WB_Ex_or_FO_or_ZWJ_then_foo:
5811 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5814 case WB_DQ_then_HL + WB_BREAKABLE:
5815 case WB_DQ_then_HL + WB_NOBREAK:
5817 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
5819 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5820 == WB_Hebrew_Letter)
5825 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
5827 case WB_HL_then_DQ + WB_BREAKABLE:
5828 case WB_HL_then_DQ + WB_NOBREAK:
5830 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5832 if (advance_one_WB(&after_pos, strend, utf8_target,
5833 TRUE /* Do skip Extend and Format */ )
5834 == WB_Hebrew_Letter)
5839 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
5841 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5842 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5844 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5845 * | Single_Quote) (ALetter | Hebrew_Letter) */
5847 next = advance_one_WB(&after_pos, strend, utf8_target,
5848 TRUE /* Do skip Extend and Format */ );
5850 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5855 return WB_table[before][after]
5856 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
5858 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5859 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
5861 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5862 * | Single_Quote) × (ALetter | Hebrew_Letter) */
5864 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5865 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5870 return WB_table[before][after]
5871 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
5873 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5874 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
5876 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5879 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5885 return WB_table[before][after]
5886 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5888 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5889 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5891 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5893 if (advance_one_WB(&after_pos, strend, utf8_target,
5894 TRUE /* Do skip Extend and Format */ )
5900 return WB_table[before][after]
5901 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
5903 case WB_RI_then_RI + WB_NOBREAK:
5904 case WB_RI_then_RI + WB_BREAKABLE:
5908 /* Do not break within emoji flag sequences. That is, do not
5909 * break between regional indicator (RI) symbols if there is an
5910 * odd number of RI characters before the potential break
5913 * WB15 sot (RI RI)* RI × RI
5914 * WB16 [^RI] (RI RI)* RI × RI */
5916 while (backup_one_WB(&previous,
5919 utf8_target) == WB_Regional_Indicator)
5924 return RI_count % 2 != 1;
5932 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
5933 before, after, WB_table[before][after]);
5940 S_advance_one_WB(pTHX_ U8 ** curpos,
5941 const U8 * const strend,
5942 const bool utf8_target,
5943 const bool skip_Extend_Format)
5947 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5949 if (*curpos >= strend) {
5955 /* Advance over Extend and Format */
5957 *curpos += UTF8SKIP(*curpos);
5958 if (*curpos >= strend) {
5961 wb = getWB_VAL_UTF8(*curpos, strend);
5962 } while ( skip_Extend_Format
5963 && (wb == WB_Extend || wb == WB_Format));
5968 if (*curpos >= strend) {
5971 wb = getWB_VAL_CP(**curpos);
5972 } while ( skip_Extend_Format
5973 && (wb == WB_Extend || wb == WB_Format));
5980 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
5984 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5986 /* If we know what the previous character's break value is, don't have
5988 if (*previous != WB_UNKNOWN) {
5991 /* But we need to move backwards by one */
5993 *curpos = reghopmaybe3(*curpos, -1, strbeg);
5995 *previous = WB_EDGE;
5996 *curpos = (U8 *) strbeg;
5999 *previous = WB_UNKNOWN;
6004 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
6007 /* And we always back up over these three types */
6008 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
6013 if (*curpos < strbeg) {
6018 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
6019 if (! prev_char_pos) {
6023 /* Back up over Extend and Format. curpos is always just to the right
6024 * of the characater whose value we are getting */
6026 U8 * prev_prev_char_pos;
6027 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
6031 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
6032 *curpos = prev_char_pos;
6033 prev_char_pos = prev_prev_char_pos;
6036 *curpos = (U8 *) strbeg;
6039 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
6043 if (*curpos - 2 < strbeg) {
6044 *curpos = (U8 *) strbeg;
6048 wb = getWB_VAL_CP(*(*curpos - 1));
6049 } while (wb == WB_Extend || wb == WB_Format);
6055 /* Macros for regmatch(), using its internal variables */
6056 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
6057 #define NEXTCHR_IS_EOS (nextbyte < 0)
6059 #define SET_nextchr \
6060 nextbyte = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
6062 #define SET_locinput(p) \
6066 #define sayYES goto yes
6067 #define sayNO goto no
6068 #define sayNO_SILENT goto no_silent
6070 /* we dont use STMT_START/END here because it leads to
6071 "unreachable code" warnings, which are bogus, but distracting. */
6072 #define CACHEsayNO \
6073 if (ST.cache_mask) \
6074 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
6077 #define EVAL_CLOSE_PAREN_IS(st,expr) \
6080 ( ( st )->u.eval.close_paren ) && \
6081 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6084 #define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
6087 ( ( st )->u.eval.close_paren ) && \
6089 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
6093 #define EVAL_CLOSE_PAREN_SET(st,expr) \
6094 (st)->u.eval.close_paren = ( (expr) + 1 )
6096 #define EVAL_CLOSE_PAREN_CLEAR(st) \
6097 (st)->u.eval.close_paren = 0
6099 /* push a new state then goto it */
6101 #define PUSH_STATE_GOTO(state, node, input, eol, sr0) \
6102 pushinput = input; \
6106 st->resume_state = state; \
6109 /* push a new state with success backtracking, then goto it */
6111 #define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \
6112 pushinput = input; \
6116 st->resume_state = state; \
6117 goto push_yes_state;
6119 #define DEBUG_STATE_pp(pp) \
6121 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6122 Perl_re_printf( aTHX_ \
6123 "%*s" pp " %s%s%s%s%s\n", \
6124 INDENT_CHARS(depth), "", \
6125 PL_reg_name[st->resume_state], \
6126 ((st==yes_state||st==mark_state) ? "[" : ""), \
6127 ((st==yes_state) ? "Y" : ""), \
6128 ((st==mark_state) ? "M" : ""), \
6129 ((st==yes_state||st==mark_state) ? "]" : "") \
6135 regmatch() - main matching routine
6137 This is basically one big switch statement in a loop. We execute an op,
6138 set 'next' to point the next op, and continue. If we come to a point which
6139 we may need to backtrack to on failure such as (A|B|C), we push a
6140 backtrack state onto the backtrack stack. On failure, we pop the top
6141 state, and re-enter the loop at the state indicated. If there are no more
6142 states to pop, we return failure.
6144 Sometimes we also need to backtrack on success; for example /A+/, where
6145 after successfully matching one A, we need to go back and try to
6146 match another one; similarly for lookahead assertions: if the assertion
6147 completes successfully, we backtrack to the state just before the assertion
6148 and then carry on. In these cases, the pushed state is marked as
6149 'backtrack on success too'. This marking is in fact done by a chain of
6150 pointers, each pointing to the previous 'yes' state. On success, we pop to
6151 the nearest yes state, discarding any intermediate failure-only states.
6152 Sometimes a yes state is pushed just to force some cleanup code to be
6153 called at the end of a successful match or submatch; e.g. (??{$re}) uses
6154 it to free the inner regex.
6156 Note that failure backtracking rewinds the cursor position, while
6157 success backtracking leaves it alone.
6159 A pattern is complete when the END op is executed, while a subpattern
6160 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
6161 ops trigger the "pop to last yes state if any, otherwise return true"
6164 A common convention in this function is to use A and B to refer to the two
6165 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
6166 the subpattern to be matched possibly multiple times, while B is the entire
6167 rest of the pattern. Variable and state names reflect this convention.
6169 The states in the main switch are the union of ops and failure/success of
6170 substates associated with that op. For example, IFMATCH is the op
6171 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
6172 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
6173 successfully matched A and IFMATCH_A_fail is a state saying that we have
6174 just failed to match A. Resume states always come in pairs. The backtrack
6175 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
6176 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
6177 on success or failure.
6179 The struct that holds a backtracking state is actually a big union, with
6180 one variant for each major type of op. The variable st points to the
6181 top-most backtrack struct. To make the code clearer, within each
6182 block of code we #define ST to alias the relevant union.
6184 Here's a concrete example of a (vastly oversimplified) IFMATCH
6190 #define ST st->u.ifmatch
6192 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6193 ST.foo = ...; // some state we wish to save
6195 // push a yes backtrack state with a resume value of
6196 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
6198 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
6201 case IFMATCH_A: // we have successfully executed A; now continue with B
6203 bar = ST.foo; // do something with the preserved value
6206 case IFMATCH_A_fail: // A failed, so the assertion failed
6207 ...; // do some housekeeping, then ...
6208 sayNO; // propagate the failure
6215 For any old-timers reading this who are familiar with the old recursive
6216 approach, the code above is equivalent to:
6218 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
6227 ...; // do some housekeeping, then ...
6228 sayNO; // propagate the failure
6231 The topmost backtrack state, pointed to by st, is usually free. If you
6232 want to claim it, populate any ST.foo fields in it with values you wish to
6233 save, then do one of
6235 PUSH_STATE_GOTO(resume_state, node, newinput, new_eol);
6236 PUSH_YES_STATE_GOTO(resume_state, node, newinput, new_eol);
6238 which sets that backtrack state's resume value to 'resume_state', pushes a
6239 new free entry to the top of the backtrack stack, then goes to 'node'.
6240 On backtracking, the free slot is popped, and the saved state becomes the
6241 new free state. An ST.foo field in this new top state can be temporarily
6242 accessed to retrieve values, but once the main loop is re-entered, it
6243 becomes available for reuse.
6245 Note that the depth of the backtrack stack constantly increases during the
6246 left-to-right execution of the pattern, rather than going up and down with
6247 the pattern nesting. For example the stack is at its maximum at Z at the
6248 end of the pattern, rather than at X in the following:
6250 /(((X)+)+)+....(Y)+....Z/
6252 The only exceptions to this are lookahead/behind assertions and the cut,
6253 (?>A), which pop all the backtrack states associated with A before
6256 Backtrack state structs are allocated in slabs of about 4K in size.
6257 PL_regmatch_state and st always point to the currently active state,
6258 and PL_regmatch_slab points to the slab currently containing
6259 PL_regmatch_state. The first time regmatch() is called, the first slab is
6260 allocated, and is never freed until interpreter destruction. When the slab
6261 is full, a new one is allocated and chained to the end. At exit from
6262 regmatch(), slabs allocated since entry are freed.
6264 In order to work with variable length lookbehinds, an upper limit is placed on
6265 lookbehinds which is set to where the match position is at the end of where the
6266 lookbehind would get to. Nothing in the lookbehind should match above that,
6267 except we should be able to look beyond if for things like \b, which need the
6268 next character in the string to be able to determine if this is a boundary or
6269 not. We also can't match the end of string/line unless we are also at the end
6270 of the entire string, so NEXTCHR_IS_EOS remains the same, and for those OPs
6271 that match a width, we have to add a condition that they are within the legal
6272 bounds of our window into the string.
6276 /* returns -1 on failure, $+[0] on success */
6278 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
6280 const bool utf8_target = reginfo->is_utf8_target;
6281 const U32 uniflags = UTF8_ALLOW_DEFAULT;
6282 REGEXP *rex_sv = reginfo->prog;
6283 regexp *rex = ReANY(rex_sv);
6284 RXi_GET_DECL(rex,rexi);
6285 /* the current state. This is a cached copy of PL_regmatch_state */
6287 /* cache heavy used fields of st in registers */
6290 U32 n = 0; /* general value; init to avoid compiler warning */
6291 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
6292 SSize_t endref = 0; /* offset of end of backref when ln is start */
6293 char *locinput = startpos;
6294 char *loceol = reginfo->strend;
6295 char *pushinput; /* where to continue after a PUSH */
6296 char *pusheol; /* where to stop matching (loceol) after a PUSH */
6297 U8 *pushsr0; /* save starting pos of script run */
6298 PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1
6301 bool result = 0; /* return value of S_regmatch */
6302 U32 depth = 0; /* depth of backtrack stack */
6303 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
6304 const U32 max_nochange_depth =
6305 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
6306 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
6307 regmatch_state *yes_state = NULL; /* state to pop to on success of
6309 /* mark_state piggy backs on the yes_state logic so that when we unwind
6310 the stack on success we can update the mark_state as we go */
6311 regmatch_state *mark_state = NULL; /* last mark state we have seen */
6312 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
6313 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
6315 bool no_final = 0; /* prevent failure from backtracking? */
6316 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
6317 char *startpoint = locinput;
6318 SV *popmark = NULL; /* are we looking for a mark? */
6319 SV *sv_commit = NULL; /* last mark name seen in failure */
6320 SV *sv_yes_mark = NULL; /* last mark name we have seen
6321 during a successful match */
6322 U32 lastopen = 0; /* last open we saw */
6323 bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
6324 SV* const oreplsv = GvSVn(PL_replgv);
6325 /* these three flags are set by various ops to signal information to
6326 * the very next op. They have a useful lifetime of exactly one loop
6327 * iteration, and are not preserved or restored by state pushes/pops
6329 bool sw = 0; /* the condition value in (?(cond)a|b) */
6330 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
6331 int logical = 0; /* the following EVAL is:
6335 or the following IFMATCH/UNLESSM is:
6336 false: plain (?=foo)
6337 true: used as a condition: (?(?=foo))
6339 PAD* last_pad = NULL;
6341 U8 gimme = G_SCALAR;
6342 CV *caller_cv = NULL; /* who called us */
6343 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
6344 U32 maxopenparen = 0; /* max '(' index seen so far */
6345 int to_complement; /* Invert the result? */
6346 _char_class_number classnum;
6347 bool is_utf8_pat = reginfo->is_utf8_pat;
6349 I32 orig_savestack_ix = PL_savestack_ix;
6350 U8 * script_run_begin = NULL;
6352 /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
6353 #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
6354 # define SOLARIS_BAD_OPTIMIZER
6355 const U32 *pl_charclass_dup = PL_charclass;
6356 # define PL_charclass pl_charclass_dup
6360 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6363 /* protect against undef(*^R) */
6364 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
6366 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
6367 multicall_oldcatch = 0;
6368 PERL_UNUSED_VAR(multicall_cop);
6370 PERL_ARGS_ASSERT_REGMATCH;
6372 st = PL_regmatch_state;
6374 /* Note that nextbyte is a byte even in UTF */
6378 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
6379 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6380 Perl_re_printf( aTHX_ "regmatch start\n" );
6383 while (scan != NULL) {
6384 next = scan + NEXT_OFF(scan);
6387 state_num = OP(scan);
6391 if (state_num <= REGNODE_MAX) {
6392 SV * const prop = sv_newmortal();
6393 regnode *rnext = regnext(scan);
6395 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
6396 regprop(rex, prop, scan, reginfo, NULL);
6397 Perl_re_printf( aTHX_
6398 "%*s%" IVdf ":%s(%" IVdf ")\n",
6399 INDENT_CHARS(depth), "",
6400 (IV)(scan - rexi->program),
6402 (PL_regkind[OP(scan)] == END || !rnext) ?
6403 0 : (IV)(rnext - rexi->program));
6410 assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS));
6412 switch (state_num) {
6413 case SBOL: /* /^../ and /\A../ */
6414 if (locinput == reginfo->strbeg)
6418 case MBOL: /* /^../m */
6419 if (locinput == reginfo->strbeg ||
6420 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
6427 if (locinput == reginfo->ganch)
6431 case KEEPS: /* \K */
6432 /* update the startpoint */
6433 st->u.keeper.val = rex->offs[0].start;
6434 rex->offs[0].start = locinput - reginfo->strbeg;
6435 PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol,
6437 NOT_REACHED; /* NOTREACHED */
6439 case KEEPS_next_fail:
6440 /* rollback the start point change */
6441 rex->offs[0].start = st->u.keeper.val;
6443 NOT_REACHED; /* NOTREACHED */
6445 case MEOL: /* /..$/m */
6446 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6450 case SEOL: /* /..$/ */
6451 if (!NEXTCHR_IS_EOS && nextbyte != '\n')
6453 if (reginfo->strend - locinput > 1)
6458 if (!NEXTCHR_IS_EOS)
6462 case SANY: /* /./s */
6463 if (NEXTCHR_IS_EOS || locinput >= loceol)
6465 goto increment_locinput;
6467 case REG_ANY: /* /./ */
6469 || locinput >= loceol
6470 || nextbyte == '\n')
6474 goto increment_locinput;
6478 #define ST st->u.trie
6479 case TRIEC: /* (ab|cd) with known charclass */
6480 /* In this case the charclass data is available inline so
6481 we can fail fast without a lot of extra overhead.
6483 if ( ! NEXTCHR_IS_EOS
6484 && locinput < loceol
6485 && ! ANYOF_BITMAP_TEST(scan, nextbyte))
6488 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6489 depth, PL_colors[4], PL_colors[5])
6492 NOT_REACHED; /* NOTREACHED */
6495 case TRIE: /* (ab|cd) */
6496 /* the basic plan of execution of the trie is:
6497 * At the beginning, run though all the states, and
6498 * find the longest-matching word. Also remember the position
6499 * of the shortest matching word. For example, this pattern:
6502 * when matched against the string "abcde", will generate
6503 * accept states for all words except 3, with the longest
6504 * matching word being 4, and the shortest being 2 (with
6505 * the position being after char 1 of the string).
6507 * Then for each matching word, in word order (i.e. 1,2,4,5),
6508 * we run the remainder of the pattern; on each try setting
6509 * the current position to the character following the word,
6510 * returning to try the next word on failure.
6512 * We avoid having to build a list of words at runtime by
6513 * using a compile-time structure, wordinfo[].prev, which
6514 * gives, for each word, the previous accepting word (if any).
6515 * In the case above it would contain the mappings 1->2, 2->0,
6516 * 3->0, 4->5, 5->1. We can use this table to generate, from
6517 * the longest word (4 above), a list of all words, by
6518 * following the list of prev pointers; this gives us the
6519 * unordered list 4,5,1,2. Then given the current word we have
6520 * just tried, we can go through the list and find the
6521 * next-biggest word to try (so if we just failed on word 2,
6522 * the next in the list is 4).
6524 * Since at runtime we don't record the matching position in
6525 * the string for each word, we have to work that out for
6526 * each word we're about to process. The wordinfo table holds
6527 * the character length of each word; given that we recorded
6528 * at the start: the position of the shortest word and its
6529 * length in chars, we just need to move the pointer the
6530 * difference between the two char lengths. Depending on
6531 * Unicode status and folding, that's cheap or expensive.
6533 * This algorithm is optimised for the case where are only a
6534 * small number of accept states, i.e. 0,1, or maybe 2.
6535 * With lots of accepts states, and having to try all of them,
6536 * it becomes quadratic on number of accept states to find all
6541 /* what type of TRIE am I? (utf8 makes this contextual) */
6542 DECL_TRIE_TYPE(scan);
6544 /* what trie are we using right now */
6545 reg_trie_data * const trie
6546 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
6547 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
6548 U32 state = trie->startstate;
6550 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
6551 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6554 && UTF8_IS_ABOVE_LATIN1(nextbyte)
6555 && scan->flags == EXACTL)
6557 /* We only output for EXACTL, as we let the folder
6558 * output this message for EXACTFLU8 to avoid
6560 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6566 || locinput >= loceol
6567 || ! TRIE_BITMAP_TEST(trie, nextbyte)))
6569 if (trie->states[ state ].wordnum) {
6571 Perl_re_exec_indentf( aTHX_ "%sTRIE: matched empty string...%s\n",
6572 depth, PL_colors[4], PL_colors[5])
6578 Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n",
6579 depth, PL_colors[4], PL_colors[5])
6586 U8 *uc = ( U8* )locinput;
6590 U8 *uscan = (U8*)NULL;
6591 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
6592 U32 charcount = 0; /* how many input chars we have matched */
6593 U32 accepted = 0; /* have we seen any accepting states? */
6595 ST.jump = trie->jump;
6598 ST.longfold = FALSE; /* char longer if folded => it's harder */
6601 /* fully traverse the TRIE; note the position of the
6602 shortest accept state and the wordnum of the longest
6605 while ( state && uc <= (U8*)(loceol) ) {
6606 U32 base = trie->states[ state ].trans.base;
6610 wordnum = trie->states[ state ].wordnum;
6612 if (wordnum) { /* it's an accept state */
6615 /* record first match position */
6617 ST.firstpos = (U8*)locinput;
6622 ST.firstchars = charcount;
6625 if (!ST.nextword || wordnum < ST.nextword)
6626 ST.nextword = wordnum;
6627 ST.topword = wordnum;
6630 DEBUG_TRIE_EXECUTE_r({
6631 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
6633 PerlIO_printf( Perl_debug_log,
6634 "%*s%sTRIE: State: %4" UVxf " Accepted: %c ",
6635 INDENT_CHARS(depth), "", PL_colors[4],
6636 (UV)state, (accepted ? 'Y' : 'N'));
6639 /* read a char and goto next state */
6640 if ( base && (foldlen || uc < (U8*)(loceol))) {
6642 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
6643 (U8 *) loceol, uscan,
6644 len, uvc, charid, foldlen,
6651 base + charid - 1 - trie->uniquecharcount)) >= 0)
6653 && ((U32)offset < trie->lasttrans)
6654 && trie->trans[offset].check == state)
6656 state = trie->trans[offset].next;
6667 DEBUG_TRIE_EXECUTE_r(
6668 Perl_re_printf( aTHX_
6669 "TRIE: Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
6670 charid, uvc, (UV)state, PL_colors[5] );
6676 /* calculate total number of accept states */
6681 w = trie->wordinfo[w].prev;
6684 ST.accepted = accepted;
6688 Perl_re_exec_indentf( aTHX_ "%sTRIE: got %" IVdf " possible matches%s\n",
6690 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
6692 goto trie_first_try; /* jump into the fail handler */
6694 NOT_REACHED; /* NOTREACHED */
6696 case TRIE_next_fail: /* we failed - try next alternative */
6700 /* undo any captures done in the tail part of a branch,
6702 * /(?:X(.)(.)|Y(.)).../
6703 * where the trie just matches X then calls out to do the
6704 * rest of the branch */
6705 REGCP_UNWIND(ST.cp);
6706 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6708 if (!--ST.accepted) {
6710 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
6718 /* Find next-highest word to process. Note that this code
6719 * is O(N^2) per trie run (O(N) per branch), so keep tight */
6722 U16 const nextword = ST.nextword;
6723 reg_trie_wordinfo * const wordinfo
6724 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
6725 for (word=ST.topword; word; word=wordinfo[word].prev) {
6726 if (word > nextword && (!min || word < min))
6739 ST.lastparen = rex->lastparen;
6740 ST.lastcloseparen = rex->lastcloseparen;
6744 /* find start char of end of current word */
6746 U32 chars; /* how many chars to skip */
6747 reg_trie_data * const trie
6748 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
6750 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
6752 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
6757 /* the hard option - fold each char in turn and find
6758 * its folded length (which may be different */
6759 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
6767 /* XXX This assumes the length is well-formed, as
6768 * does the UTF8SKIP below */
6769 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
6777 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
6782 uvc = utf8n_to_uvchr(uscan, foldlen, &len,
6798 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
6799 ? ST.jump[ST.nextword]
6803 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
6811 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
6812 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
6814 NOT_REACHED; /* NOTREACHED */
6816 /* only one choice left - just continue */
6818 AV *const trie_words
6819 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
6820 SV ** const tmp = trie_words
6821 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
6822 SV *sv= tmp ? sv_newmortal() : NULL;
6824 Perl_re_exec_indentf( aTHX_ "%sTRIE: only one match left, short-circuiting: #%d <%s>%s\n",
6825 depth, PL_colors[4],
6827 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
6828 PL_colors[0], PL_colors[1],
6829 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
6831 : "not compiled under -Dr",
6835 locinput = (char*)uc;
6836 continue; /* execute rest of RE */
6842 if (! utf8_target) {
6852 ln = STR_LENl(scan);
6853 goto join_short_long_exact;
6855 case EXACTL: /* /abc/l */
6856 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6858 /* Complete checking would involve going through every character
6859 * matched by the string to see if any is above latin1. But the
6860 * comparision otherwise might very well be a fast assembly
6861 * language routine, and I (khw) don't think slowing things down
6862 * just to check for this warning is worth it. So this just checks
6863 * the first character */
6864 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
6865 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
6869 if (! utf8_target) {
6874 case EXACT: /* /abc/ */
6877 ln = STR_LENs(scan);
6879 join_short_long_exact:
6880 if (utf8_target != is_utf8_pat) {
6881 /* The target and the pattern have differing utf8ness. */
6883 const char * const e = s + ln;
6886 /* The target is utf8, the pattern is not utf8.
6887 * Above-Latin1 code points can't match the pattern;
6888 * invariants match exactly, and the other Latin1 ones need
6889 * to be downgraded to a single byte in order to do the
6890 * comparison. (If we could be confident that the target
6891 * is not malformed, this could be refactored to have fewer
6892 * tests by just assuming that if the first bytes match, it
6893 * is an invariant, but there are tests in the test suite
6894 * dealing with (??{...}) which violate this) */
6897 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
6901 if (UTF8_IS_INVARIANT(*(U8*)l)) {
6908 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
6918 /* The target is not utf8, the pattern is utf8. */
6921 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
6925 if (UTF8_IS_INVARIANT(*(U8*)s)) {
6932 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
6944 /* The target and the pattern have the same utf8ness. */
6945 /* Inline the first character, for speed. */
6946 if ( loceol - locinput < ln
6947 || UCHARAT(s) != nextbyte
6948 || (ln > 1 && memNE(s, locinput, ln)))
6957 case EXACTFL: /* /abc/il */
6960 const U8 * fold_array;
6962 U32 fold_utf8_flags;
6964 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6965 folder = foldEQ_locale;
6966 fold_array = PL_fold_locale;
6967 fold_utf8_flags = FOLDEQ_LOCALE;
6970 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
6971 is effectively /u; hence to match, target
6973 if (! utf8_target) {
6976 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
6977 | FOLDEQ_S2_FOLDS_SANE;
6978 folder = foldEQ_latin1_s2_folded;
6979 fold_array = PL_fold_latin1;
6982 case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */
6983 if (! utf8_target) {
6986 assert(is_utf8_pat);
6987 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
6990 case EXACTFUP: /* /foo/iu, and something is problematic in
6991 'foo' so can't take shortcuts. */
6992 assert(! is_utf8_pat);
6993 folder = foldEQ_latin1;
6994 fold_array = PL_fold_latin1;
6995 fold_utf8_flags = 0;
6998 case EXACTFU: /* /abc/iu */
6999 folder = foldEQ_latin1_s2_folded;
7000 fold_array = PL_fold_latin1;
7001 fold_utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
7004 case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8
7006 assert(! is_utf8_pat);
7008 case EXACTFAA: /* /abc/iaa */
7009 folder = foldEQ_latin1_s2_folded;
7010 fold_array = PL_fold_latin1;
7011 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7012 if (is_utf8_pat || ! utf8_target) {
7014 /* The possible presence of a MICRO SIGN in the pattern forbids
7015 * us to view a non-UTF-8 pattern as folded when there is a
7017 fold_utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED
7018 |FOLDEQ_S2_FOLDS_SANE;
7023 case EXACTF: /* /abc/i This node only generated for
7024 non-utf8 patterns */
7025 assert(! is_utf8_pat);
7027 fold_array = PL_fold;
7028 fold_utf8_flags = 0;
7032 ln = STR_LENs(scan);
7036 || state_num == EXACTFUP
7037 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
7039 /* Either target or the pattern are utf8, or has the issue where
7040 * the fold lengths may differ. */
7041 const char * const l = locinput;
7044 if (! foldEQ_utf8_flags(l, &e, 0, utf8_target,
7045 s, 0, ln, is_utf8_pat,fold_utf8_flags))
7053 /* Neither the target nor the pattern are utf8 */
7054 if (UCHARAT(s) != nextbyte
7056 && UCHARAT(s) != fold_array[nextbyte])
7060 if (loceol - locinput < ln)
7062 if (ln > 1 && ! folder(locinput, s, ln))
7068 case NBOUNDL: /* /\B/l */
7072 case BOUNDL: /* /\b/l */
7075 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7077 if (FLAGS(scan) != TRADITIONAL_BOUND) {
7078 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND;
7083 if (locinput == reginfo->strbeg)
7084 b1 = isWORDCHAR_LC('\n');
7086 U8 *p = reghop3((U8*)locinput, -1,
7087 (U8*)(reginfo->strbeg));
7088 b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend));
7090 b2 = (NEXTCHR_IS_EOS)
7091 ? isWORDCHAR_LC('\n')
7092 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
7093 (U8*) reginfo->strend);
7095 else { /* Here the string isn't utf8 */
7096 b1 = (locinput == reginfo->strbeg)
7097 ? isWORDCHAR_LC('\n')
7098 : isWORDCHAR_LC(UCHARAT(locinput - 1));
7099 b2 = (NEXTCHR_IS_EOS)
7100 ? isWORDCHAR_LC('\n')
7101 : isWORDCHAR_LC(nextbyte);
7103 if (to_complement ^ (b1 == b2)) {
7109 case NBOUND: /* /\B/ */
7113 case BOUND: /* /\b/ */
7117 goto bound_ascii_match_only;
7119 case NBOUNDA: /* /\B/a */
7123 case BOUNDA: /* /\b/a */
7127 bound_ascii_match_only:
7128 /* Here the string isn't utf8, or is utf8 and only ascii characters
7129 * are to match \w. In the latter case looking at the byte just
7130 * prior to the current one may be just the final byte of a
7131 * multi-byte character. This is ok. There are two cases:
7132 * 1) it is a single byte character, and then the test is doing
7133 * just what it's supposed to.
7134 * 2) it is a multi-byte character, in which case the final byte is
7135 * never mistakable for ASCII, and so the test will say it is
7136 * not a word character, which is the correct answer. */
7137 b1 = (locinput == reginfo->strbeg)
7138 ? isWORDCHAR_A('\n')
7139 : isWORDCHAR_A(UCHARAT(locinput - 1));
7140 b2 = (NEXTCHR_IS_EOS)
7141 ? isWORDCHAR_A('\n')
7142 : isWORDCHAR_A(nextbyte);
7143 if (to_complement ^ (b1 == b2)) {
7149 case NBOUNDU: /* /\B/u */
7153 case BOUNDU: /* /\b/u */
7156 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
7159 else if (utf8_target) {
7161 switch((bound_type) FLAGS(scan)) {
7162 case TRADITIONAL_BOUND:
7165 if (locinput == reginfo->strbeg) {
7166 b1 = 0 /* isWORDCHAR_L1('\n') */;
7169 U8 *p = reghop3((U8*)locinput, -1,
7170 (U8*)(reginfo->strbeg));
7172 b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend);
7174 b2 = (NEXTCHR_IS_EOS)
7175 ? 0 /* isWORDCHAR_L1('\n') */
7176 : isWORDCHAR_utf8_safe((U8*)locinput,
7177 (U8*) reginfo->strend);
7178 match = cBOOL(b1 != b2);
7182 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7183 match = TRUE; /* GCB always matches at begin and
7187 /* Find the gcb values of previous and current
7188 * chars, then see if is a break point */
7189 match = isGCB(getGCB_VAL_UTF8(
7190 reghop3((U8*)locinput,
7192 (U8*)(reginfo->strbeg)),
7193 (U8*) reginfo->strend),
7194 getGCB_VAL_UTF8((U8*) locinput,
7195 (U8*) reginfo->strend),
7196 (U8*) reginfo->strbeg,
7203 if (locinput == reginfo->strbeg) {
7206 else if (NEXTCHR_IS_EOS) {
7210 match = isLB(getLB_VAL_UTF8(
7211 reghop3((U8*)locinput,
7213 (U8*)(reginfo->strbeg)),
7214 (U8*) reginfo->strend),
7215 getLB_VAL_UTF8((U8*) locinput,
7216 (U8*) reginfo->strend),
7217 (U8*) reginfo->strbeg,
7219 (U8*) reginfo->strend,
7224 case SB_BOUND: /* Always matches at begin and end */
7225 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7229 match = isSB(getSB_VAL_UTF8(
7230 reghop3((U8*)locinput,
7232 (U8*)(reginfo->strbeg)),
7233 (U8*) reginfo->strend),
7234 getSB_VAL_UTF8((U8*) locinput,
7235 (U8*) reginfo->strend),
7236 (U8*) reginfo->strbeg,
7238 (U8*) reginfo->strend,
7244 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7248 match = isWB(WB_UNKNOWN,
7250 reghop3((U8*)locinput,
7252 (U8*)(reginfo->strbeg)),
7253 (U8*) reginfo->strend),
7254 getWB_VAL_UTF8((U8*) locinput,
7255 (U8*) reginfo->strend),
7256 (U8*) reginfo->strbeg,
7258 (U8*) reginfo->strend,
7264 else { /* Not utf8 target */
7265 switch((bound_type) FLAGS(scan)) {
7266 case TRADITIONAL_BOUND:
7269 b1 = (locinput == reginfo->strbeg)
7270 ? 0 /* isWORDCHAR_L1('\n') */
7271 : isWORDCHAR_L1(UCHARAT(locinput - 1));
7272 b2 = (NEXTCHR_IS_EOS)
7273 ? 0 /* isWORDCHAR_L1('\n') */
7274 : isWORDCHAR_L1(nextbyte);
7275 match = cBOOL(b1 != b2);
7280 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7281 match = TRUE; /* GCB always matches at begin and
7284 else { /* Only CR-LF combo isn't a GCB in 0-255
7286 match = UCHARAT(locinput - 1) != '\r'
7287 || UCHARAT(locinput) != '\n';
7292 if (locinput == reginfo->strbeg) {
7295 else if (NEXTCHR_IS_EOS) {
7299 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
7300 getLB_VAL_CP(UCHARAT(locinput)),
7301 (U8*) reginfo->strbeg,
7303 (U8*) reginfo->strend,
7308 case SB_BOUND: /* Always matches at begin and end */
7309 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7313 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
7314 getSB_VAL_CP(UCHARAT(locinput)),
7315 (U8*) reginfo->strbeg,
7317 (U8*) reginfo->strend,
7323 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
7327 match = isWB(WB_UNKNOWN,
7328 getWB_VAL_CP(UCHARAT(locinput -1)),
7329 getWB_VAL_CP(UCHARAT(locinput)),
7330 (U8*) reginfo->strbeg,
7332 (U8*) reginfo->strend,
7339 if (to_complement ^ ! match) {
7345 case ANYOFL: /* /[abc]/l */
7346 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7347 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan);
7350 case ANYOFD: /* /[abc]/d */
7351 case ANYOF: /* /[abc]/ */
7352 if (NEXTCHR_IS_EOS || locinput >= loceol)
7354 if ( (! utf8_target || UTF8_IS_INVARIANT(*locinput))
7355 && ! (ANYOF_FLAGS(scan) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP))
7357 if (! ANYOF_BITMAP_TEST(scan, * (U8 *) (locinput))) {
7363 if (!reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7368 goto increment_locinput;
7374 || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)
7375 || locinput >= loceol)
7379 locinput++; /* ANYOFM is always single byte */
7384 || (UCHARAT(locinput) & FLAGS(scan)) == ARG(scan)
7385 || locinput >= loceol)
7389 goto increment_locinput;
7395 || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7396 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7401 goto increment_locinput;
7407 || ANYOF_FLAGS(scan) != (U8) *locinput
7408 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7413 goto increment_locinput;
7419 || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput),
7420 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)),
7421 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)))
7422 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7427 goto increment_locinput;
7433 || loceol - locinput < FLAGS(scan)
7434 || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan))
7435 || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol,
7440 goto increment_locinput;
7444 if (NEXTCHR_IS_EOS) {
7449 if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput)
7450 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7451 (U8 *) reginfo->strend,
7453 ANYOFRbase(scan), ANYOFRdelta(scan)))
7459 if (! withinCOUNT((U8) *locinput,
7460 ANYOFRbase(scan), ANYOFRdelta(scan)))
7465 goto increment_locinput;
7469 if (NEXTCHR_IS_EOS) {
7474 if ( ANYOF_FLAGS(scan) != (U8) *locinput
7475 || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput,
7476 (U8 *) reginfo->strend,
7478 ANYOFRbase(scan), ANYOFRdelta(scan)))
7484 if (! withinCOUNT((U8) *locinput,
7485 ANYOFRbase(scan), ANYOFRdelta(scan)))
7490 goto increment_locinput;
7493 /* The argument (FLAGS) to all the POSIX node types is the class number
7496 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
7500 case POSIXL: /* \w or [:punct:] etc. under /l */
7501 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7502 if (NEXTCHR_IS_EOS || locinput >= loceol)
7505 /* Use isFOO_lc() for characters within Latin1. (Note that
7506 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7507 * wouldn't be invariant) */
7508 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7509 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) {
7517 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7518 /* An above Latin-1 code point, or malformed */
7519 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
7521 goto utf8_posix_above_latin1;
7524 /* Here is a UTF-8 variant code point below 256 and the target is
7526 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
7527 EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7528 *(locinput + 1))))))
7533 goto increment_locinput;
7535 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
7539 case POSIXD: /* \w or [:punct:] etc. under /d */
7545 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
7547 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7551 /* All UTF-8 variants match */
7552 if (! UTF8_IS_INVARIANT(nextbyte)) {
7553 goto increment_locinput;
7559 case POSIXA: /* \w or [:punct:] etc. under /a */
7562 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
7563 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
7564 * character is a single byte */
7566 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7572 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte,
7578 /* Here we are either not in utf8, or we matched a utf8-invariant,
7579 * so the next char is the next byte */
7583 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
7587 case POSIXU: /* \w or [:punct:] etc. under /u */
7589 if (NEXTCHR_IS_EOS || locinput >= loceol) {
7593 /* Use _generic_isCC() for characters within Latin1. (Note that
7594 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
7595 * wouldn't be invariant) */
7596 if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) {
7597 if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte,
7604 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
7605 if (! (to_complement
7606 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte,
7614 else { /* Handle above Latin-1 code points */
7615 utf8_posix_above_latin1:
7616 classnum = (_char_class_number) FLAGS(scan);
7619 if (! (to_complement
7620 ^ cBOOL(_invlist_contains_cp(
7621 PL_XPosix_ptrs[classnum],
7622 utf8_to_uvchr_buf((U8 *) locinput,
7623 (U8 *) reginfo->strend,
7629 case _CC_ENUM_SPACE:
7630 if (! (to_complement
7631 ^ cBOOL(is_XPERLSPACE_high(locinput))))
7636 case _CC_ENUM_BLANK:
7637 if (! (to_complement
7638 ^ cBOOL(is_HORIZWS_high(locinput))))
7643 case _CC_ENUM_XDIGIT:
7644 if (! (to_complement
7645 ^ cBOOL(is_XDIGIT_high(locinput))))
7650 case _CC_ENUM_VERTSPACE:
7651 if (! (to_complement
7652 ^ cBOOL(is_VERTWS_high(locinput))))
7657 case _CC_ENUM_CNTRL: /* These can't match above Latin1 */
7658 case _CC_ENUM_ASCII:
7659 if (! to_complement) {
7664 locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend);
7668 case CLUMP: /* Match \X: logical Unicode character. This is defined as
7669 a Unicode extended Grapheme Cluster */
7670 if (NEXTCHR_IS_EOS || locinput >= loceol)
7672 if (! utf8_target) {
7674 /* Match either CR LF or '.', as all the other possibilities
7676 locinput++; /* Match the . or CR */
7677 if (nextbyte == '\r' /* And if it was CR, and the next is LF,
7679 && locinput < loceol
7680 && UCHARAT(locinput) == '\n')
7687 /* Get the gcb type for the current character */
7688 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
7689 (U8*) reginfo->strend);
7691 /* Then scan through the input until we get to the first
7692 * character whose type is supposed to be a gcb with the
7693 * current character. (There is always a break at the
7695 locinput += UTF8SKIP(locinput);
7696 while (locinput < loceol) {
7697 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
7698 (U8*) reginfo->strend);
7699 if (isGCB(prev_gcb, cur_gcb,
7700 (U8*) reginfo->strbeg, (U8*) locinput,
7707 locinput += UTF8SKIP(locinput);
7714 case REFFLN: /* /\g{name}/il */
7715 { /* The capture buffer cases. The ones beginning with N for the
7716 named buffers just convert to the equivalent numbered and
7717 pretend they were called as the corresponding numbered buffer
7719 /* don't initialize these in the declaration, it makes C++
7724 const U8 *fold_array;
7727 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7728 folder = foldEQ_locale;
7729 fold_array = PL_fold_locale;
7731 utf8_fold_flags = FOLDEQ_LOCALE;
7734 case REFFAN: /* /\g{name}/iaa */
7735 folder = foldEQ_latin1;
7736 fold_array = PL_fold_latin1;
7738 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7741 case REFFUN: /* /\g{name}/iu */
7742 folder = foldEQ_latin1;
7743 fold_array = PL_fold_latin1;
7745 utf8_fold_flags = 0;
7748 case REFFN: /* /\g{name}/i */
7750 fold_array = PL_fold;
7752 utf8_fold_flags = 0;
7755 case REFN: /* /\g{name}/ */
7759 utf8_fold_flags = 0;
7762 /* For the named back references, find the corresponding buffer
7764 n = reg_check_named_buff_matched(rex,scan);
7769 goto do_nref_ref_common;
7771 case REFFL: /* /\1/il */
7772 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
7773 folder = foldEQ_locale;
7774 fold_array = PL_fold_locale;
7775 utf8_fold_flags = FOLDEQ_LOCALE;
7778 case REFFA: /* /\1/iaa */
7779 folder = foldEQ_latin1;
7780 fold_array = PL_fold_latin1;
7781 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7784 case REFFU: /* /\1/iu */
7785 folder = foldEQ_latin1;
7786 fold_array = PL_fold_latin1;
7787 utf8_fold_flags = 0;
7790 case REFF: /* /\1/i */
7792 fold_array = PL_fold;
7793 utf8_fold_flags = 0;
7796 case REF: /* /\1/ */
7799 utf8_fold_flags = 0;
7803 n = ARG(scan); /* which paren pair */
7806 ln = rex->offs[n].start;
7807 endref = rex->offs[n].end;
7808 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
7809 if (rex->lastparen < n || ln == -1 || endref == -1)
7810 sayNO; /* Do not match unless seen CLOSEn. */
7814 s = reginfo->strbeg + ln;
7815 if (type != REF /* REF can do byte comparison */
7816 && (utf8_target || type == REFFU || type == REFFL))
7818 char * limit = loceol;
7820 /* This call case insensitively compares the entire buffer
7821 * at s, with the current input starting at locinput, but
7822 * not going off the end given by loceol, and
7823 * returns in <limit> upon success, how much of the
7824 * current input was matched */
7825 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
7826 locinput, &limit, 0, utf8_target, utf8_fold_flags))
7834 /* Not utf8: Inline the first character, for speed. */
7835 if ( ! NEXTCHR_IS_EOS
7836 && locinput < loceol
7837 && UCHARAT(s) != nextbyte
7839 || UCHARAT(s) != fold_array[nextbyte]))
7844 if (locinput + ln > loceol)
7846 if (ln > 1 && (type == REF
7847 ? memNE(s, locinput, ln)
7848 : ! folder(locinput, s, ln)))
7854 case NOTHING: /* null op; e.g. the 'nothing' following
7855 * the '*' in m{(a+|b)*}' */
7857 case TAIL: /* placeholder while compiling (A|B|C) */
7861 #define ST st->u.eval
7862 #define CUR_EVAL cur_eval->u.eval
7868 regexp_internal *rei;
7869 regnode *startpoint;
7872 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
7873 arg= (U32)ARG(scan);
7874 if (cur_eval && cur_eval->locinput == locinput) {
7875 if ( ++nochange_depth > max_nochange_depth )
7877 "Pattern subroutine nesting without pos change"
7878 " exceeded limit in regex");
7885 startpoint = scan + ARG2L(scan);
7886 EVAL_CLOSE_PAREN_SET( st, arg );
7887 /* Detect infinite recursion
7889 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
7890 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
7891 * So we track the position in the string we are at each time
7892 * we recurse and if we try to enter the same routine twice from
7893 * the same position we throw an error.
7895 if ( rex->recurse_locinput[arg] == locinput ) {
7896 /* FIXME: we should show the regop that is failing as part
7897 * of the error message. */
7898 Perl_croak(aTHX_ "Infinite recursion in regex");
7900 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
7901 rex->recurse_locinput[arg]= locinput;
7904 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7906 Perl_re_exec_indentf( aTHX_
7907 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
7908 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
7914 /* Save all the positions seen so far. */
7915 ST.cp = regcppush(rex, 0, maxopenparen);
7916 REGCP_SET(ST.lastcp);
7918 /* and then jump to the code we share with EVAL */
7919 goto eval_recurse_doit;
7922 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
7923 if (logical == 2 && cur_eval && cur_eval->locinput==locinput) {
7924 if ( ++nochange_depth > max_nochange_depth )
7925 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
7930 /* execute the code in the {...} */
7934 OP * const oop = PL_op;
7935 COP * const ocurcop = PL_curcop;
7939 /* save *all* paren positions */
7940 regcppush(rex, 0, maxopenparen);
7941 REGCP_SET(ST.lastcp);
7944 caller_cv = find_runcv(NULL);
7948 if (rexi->data->what[n] == 'r') { /* code from an external qr */
7950 (REGEXP*)(rexi->data->data[n])
7952 nop = (OP*)rexi->data->data[n+1];
7954 else if (rexi->data->what[n] == 'l') { /* literal code */
7956 nop = (OP*)rexi->data->data[n];
7957 assert(CvDEPTH(newcv));
7960 /* literal with own CV */
7961 assert(rexi->data->what[n] == 'L');
7962 newcv = rex->qr_anoncv;
7963 nop = (OP*)rexi->data->data[n];
7966 /* Some notes about MULTICALL and the context and save stacks.
7969 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
7970 * since codeblocks don't introduce a new scope (so that
7971 * local() etc accumulate), at the end of a successful
7972 * match there will be a SAVEt_CLEARSV on the savestack
7973 * for each of $x, $y, $z. If the three code blocks above
7974 * happen to have come from different CVs (e.g. via
7975 * embedded qr//s), then we must ensure that during any
7976 * savestack unwinding, PL_comppad always points to the
7977 * right pad at each moment. We achieve this by
7978 * interleaving SAVEt_COMPPAD's on the savestack whenever
7979 * there is a change of pad.
7980 * In theory whenever we call a code block, we should
7981 * push a CXt_SUB context, then pop it on return from
7982 * that code block. This causes a bit of an issue in that
7983 * normally popping a context also clears the savestack
7984 * back to cx->blk_oldsaveix, but here we specifically
7985 * don't want to clear the save stack on exit from the
7987 * Also for efficiency we don't want to keep pushing and
7988 * popping the single SUB context as we backtrack etc.
7989 * So instead, we push a single context the first time
7990 * we need, it, then hang onto it until the end of this
7991 * function. Whenever we encounter a new code block, we
7992 * update the CV etc if that's changed. During the times
7993 * in this function where we're not executing a code
7994 * block, having the SUB context still there is a bit
7995 * naughty - but we hope that no-one notices.
7996 * When the SUB context is initially pushed, we fake up
7997 * cx->blk_oldsaveix to be as if we'd pushed this context
7998 * on first entry to S_regmatch rather than at some random
7999 * point during the regexe execution. That way if we
8000 * croak, popping the context stack will ensure that
8001 * *everything* SAVEd by this function is undone and then
8002 * the context popped, rather than e.g., popping the
8003 * context (and restoring the original PL_comppad) then
8004 * popping more of the savestack and restoring a bad
8008 /* If this is the first EVAL, push a MULTICALL. On
8009 * subsequent calls, if we're executing a different CV, or
8010 * if PL_comppad has got messed up from backtracking
8011 * through SAVECOMPPADs, then refresh the context.
8013 if (newcv != last_pushed_cv || PL_comppad != last_pad)
8015 U8 flags = (CXp_SUB_RE |
8016 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
8018 if (last_pushed_cv) {
8019 CHANGE_MULTICALL_FLAGS(newcv, flags);
8022 PUSH_MULTICALL_FLAGS(newcv, flags);
8024 /* see notes above */
8025 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
8027 last_pushed_cv = newcv;
8030 /* these assignments are just to silence compiler
8032 multicall_cop = NULL;
8034 last_pad = PL_comppad;
8036 /* the initial nextstate you would normally execute
8037 * at the start of an eval (which would cause error
8038 * messages to come from the eval), may be optimised
8039 * away from the execution path in the regex code blocks;
8040 * so manually set PL_curcop to it initially */
8042 OP *o = cUNOPx(nop)->op_first;
8043 assert(o->op_type == OP_NULL);
8044 if (o->op_targ == OP_SCOPE) {
8045 o = cUNOPo->op_first;
8048 assert(o->op_targ == OP_LEAVE);
8049 o = cUNOPo->op_first;
8050 assert(o->op_type == OP_ENTER);
8054 if (o->op_type != OP_STUB) {
8055 assert( o->op_type == OP_NEXTSTATE
8056 || o->op_type == OP_DBSTATE
8057 || (o->op_type == OP_NULL
8058 && ( o->op_targ == OP_NEXTSTATE
8059 || o->op_targ == OP_DBSTATE
8063 PL_curcop = (COP*)o;
8068 DEBUG_STATE_r( Perl_re_printf( aTHX_
8069 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
8071 rex->offs[0].end = locinput - reginfo->strbeg;
8072 if (reginfo->info_aux_eval->pos_magic)
8073 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
8074 reginfo->sv, reginfo->strbeg,
8075 locinput - reginfo->strbeg);
8078 SV *sv_mrk = get_sv("REGMARK", 1);
8079 sv_setsv(sv_mrk, sv_yes_mark);
8082 /* we don't use MULTICALL here as we want to call the
8083 * first op of the block of interest, rather than the
8084 * first op of the sub. Also, we don't want to free
8085 * the savestack frame */
8086 before = (IV)(SP-PL_stack_base);
8088 CALLRUNOPS(aTHX); /* Scalar context. */
8090 if ((IV)(SP-PL_stack_base) == before)
8091 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8097 /* before restoring everything, evaluate the returned
8098 * value, so that 'uninit' warnings don't use the wrong
8099 * PL_op or pad. Also need to process any magic vars
8100 * (e.g. $1) *before* parentheses are restored */
8105 if (logical == 0) { /* (?{})/ */
8106 SV *replsv = save_scalar(PL_replgv);
8107 sv_setsv(replsv, ret); /* $^R */
8110 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
8111 sw = cBOOL(SvTRUE_NN(ret));
8114 else { /* /(??{}) */
8115 /* if its overloaded, let the regex compiler handle
8116 * it; otherwise extract regex, or stringify */
8117 if (SvGMAGICAL(ret))
8118 ret = sv_mortalcopy(ret);
8119 if (!SvAMAGIC(ret)) {
8123 if (SvTYPE(sv) == SVt_REGEXP)
8124 re_sv = (REGEXP*) sv;
8125 else if (SvSMAGICAL(ret)) {
8126 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
8128 re_sv = (REGEXP *) mg->mg_obj;
8131 /* force any undef warnings here */
8132 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
8133 ret = sv_mortalcopy(ret);
8134 (void) SvPV_force_nolen(ret);
8140 /* *** Note that at this point we don't restore
8141 * PL_comppad, (or pop the CxSUB) on the assumption it may
8142 * be used again soon. This is safe as long as nothing
8143 * in the regexp code uses the pad ! */
8145 PL_curcop = ocurcop;
8146 regcp_restore(rex, ST.lastcp, &maxopenparen);
8147 PL_curpm_under = PL_curpm;
8148 PL_curpm = PL_reg_curpm;
8151 PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol,
8157 /* only /(??{})/ from now on */
8160 /* extract RE object from returned value; compiling if
8164 re_sv = reg_temp_copy(NULL, re_sv);
8169 if (SvUTF8(ret) && IN_BYTES) {
8170 /* In use 'bytes': make a copy of the octet
8171 * sequence, but without the flag on */
8173 const char *const p = SvPV(ret, len);
8174 ret = newSVpvn_flags(p, len, SVs_TEMP);
8176 if (rex->intflags & PREGf_USE_RE_EVAL)
8177 pm_flags |= PMf_USE_RE_EVAL;
8179 /* if we got here, it should be an engine which
8180 * supports compiling code blocks and stuff */
8181 assert(rex->engine && rex->engine->op_comp);
8182 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
8183 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
8184 rex->engine, NULL, NULL,
8185 /* copy /msixn etc to inner pattern */
8190 & (SVs_TEMP | SVs_GMG | SVf_ROK))
8191 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
8192 /* This isn't a first class regexp. Instead, it's
8193 caching a regexp onto an existing, Perl visible
8195 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
8201 RXp_MATCH_COPIED_off(re);
8202 re->subbeg = rex->subbeg;
8203 re->sublen = rex->sublen;
8204 re->suboffset = rex->suboffset;
8205 re->subcoffset = rex->subcoffset;
8207 re->lastcloseparen = 0;
8210 debug_start_match(re_sv, utf8_target, locinput,
8211 reginfo->strend, "EVAL/GOSUB: Matching embedded");
8213 startpoint = rei->program + 1;
8214 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
8215 * close_paren only for GOSUB */
8216 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
8217 /* Save all the seen positions so far. */
8218 ST.cp = regcppush(rex, 0, maxopenparen);
8219 REGCP_SET(ST.lastcp);
8220 /* and set maxopenparen to 0, since we are starting a "fresh" match */
8222 /* run the pattern returned from (??{...}) */
8224 eval_recurse_doit: /* Share code with GOSUB below this line
8225 * At this point we expect the stack context to be
8226 * set up correctly */
8228 /* invalidate the S-L poscache. We're now executing a
8229 * different set of WHILEM ops (and their associated
8230 * indexes) against the same string, so the bits in the
8231 * cache are meaningless. Setting maxiter to zero forces
8232 * the cache to be invalidated and zeroed before reuse.
8233 * XXX This is too dramatic a measure. Ideally we should
8234 * save the old cache and restore when running the outer
8236 reginfo->poscache_maxiter = 0;
8238 /* the new regexp might have a different is_utf8_pat than we do */
8239 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
8241 ST.prev_rex = rex_sv;
8242 ST.prev_curlyx = cur_curlyx;
8244 SET_reg_curpm(rex_sv);
8249 ST.prev_eval = cur_eval;
8251 /* now continue from first node in postoned RE */
8252 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput,
8253 loceol, script_run_begin);
8254 NOT_REACHED; /* NOTREACHED */
8257 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
8258 /* note: this is called twice; first after popping B, then A */
8260 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
8261 depth, cur_eval, ST.prev_eval);
8264 #define SET_RECURSE_LOCINPUT(STR,VAL)\
8265 if ( cur_eval && CUR_EVAL.close_paren ) {\
8267 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
8269 CUR_EVAL.close_paren - 1,\
8273 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
8276 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
8278 rex_sv = ST.prev_rex;
8279 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8280 SET_reg_curpm(rex_sv);
8281 rex = ReANY(rex_sv);
8282 rexi = RXi_GET(rex);
8284 /* preserve $^R across LEAVE's. See Bug 121070. */
8285 SV *save_sv= GvSV(PL_replgv);
8287 SvREFCNT_inc(save_sv);
8288 regcpblow(ST.cp); /* LEAVE in disguise */
8289 /* don't move this initialization up */
8290 replsv = GvSV(PL_replgv);
8291 sv_setsv(replsv, save_sv);
8293 SvREFCNT_dec(save_sv);
8295 cur_eval = ST.prev_eval;
8296 cur_curlyx = ST.prev_curlyx;
8298 /* Invalidate cache. See "invalidate" comment above. */
8299 reginfo->poscache_maxiter = 0;
8300 if ( nochange_depth )
8303 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
8307 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
8308 REGCP_UNWIND(ST.lastcp);
8311 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
8312 /* note: this is called twice; first after popping B, then A */
8314 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
8315 depth, cur_eval, ST.prev_eval);
8318 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
8320 rex_sv = ST.prev_rex;
8321 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
8322 SET_reg_curpm(rex_sv);
8323 rex = ReANY(rex_sv);
8324 rexi = RXi_GET(rex);
8326 REGCP_UNWIND(ST.lastcp);
8327 regcppop(rex, &maxopenparen);
8328 cur_eval = ST.prev_eval;
8329 cur_curlyx = ST.prev_curlyx;
8331 /* Invalidate cache. See "invalidate" comment above. */
8332 reginfo->poscache_maxiter = 0;
8333 if ( nochange_depth )
8336 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
8341 n = ARG(scan); /* which paren pair */
8342 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
8343 if (n > maxopenparen)
8345 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
8346 "OPEN: rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
8351 (IV)rex->offs[n].start_tmp,
8357 case SROPEN: /* (*SCRIPT_RUN: */
8358 script_run_begin = (U8 *) locinput;
8363 n = ARG(scan); /* which paren pair */
8364 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8365 locinput - reginfo->strbeg);
8366 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
8371 case SRCLOSE: /* (*SCRIPT_RUN: ... ) */
8373 if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target))
8381 case ACCEPT: /* (*ACCEPT) */
8383 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
8387 cursor && OP(cursor)!=END;
8388 cursor=regnext(cursor))
8390 if ( OP(cursor)==CLOSE ){
8392 if ( n <= lastopen ) {
8393 CLOSE_CAPTURE(n, rex->offs[n].start_tmp,
8394 locinput - reginfo->strbeg);
8395 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
8404 case GROUPP: /* (?(1)) */
8405 n = ARG(scan); /* which paren pair */
8406 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
8409 case GROUPPN: /* (?(<name>)) */
8410 /* reg_check_named_buff_matched returns 0 for no match */
8411 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
8414 case INSUBP: /* (?(R)) */
8416 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
8417 * of SCAN is already set up as matches a eval.close_paren */
8418 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
8421 case DEFINEP: /* (?(DEFINE)) */
8425 case IFTHEN: /* (?(cond)A|B) */
8426 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
8428 next = NEXTOPER(NEXTOPER(scan));
8430 next = scan + ARG(scan);
8431 if (OP(next) == IFTHEN) /* Fake one. */
8432 next = NEXTOPER(NEXTOPER(next));
8436 case LOGICAL: /* modifier for EVAL and IFMATCH */
8437 logical = scan->flags;
8440 /*******************************************************************
8442 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
8443 pattern, where A and B are subpatterns. (For simple A, CURLYM or
8444 STAR/PLUS/CURLY/CURLYN are used instead.)
8446 A*B is compiled as <CURLYX><A><WHILEM><B>
8448 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
8449 state, which contains the current count, initialised to -1. It also sets
8450 cur_curlyx to point to this state, with any previous value saved in the
8453 CURLYX then jumps straight to the WHILEM op, rather than executing A,
8454 since the pattern may possibly match zero times (i.e. it's a while {} loop
8455 rather than a do {} while loop).
8457 Each entry to WHILEM represents a successful match of A. The count in the
8458 CURLYX block is incremented, another WHILEM state is pushed, and execution
8459 passes to A or B depending on greediness and the current count.
8461 For example, if matching against the string a1a2a3b (where the aN are
8462 substrings that match /A/), then the match progresses as follows: (the
8463 pushed states are interspersed with the bits of strings matched so far):
8466 <CURLYX cnt=0><WHILEM>
8467 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
8468 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
8469 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
8470 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
8472 (Contrast this with something like CURLYM, which maintains only a single
8476 a1 <CURLYM cnt=1> a2
8477 a1 a2 <CURLYM cnt=2> a3
8478 a1 a2 a3 <CURLYM cnt=3> b
8481 Each WHILEM state block marks a point to backtrack to upon partial failure
8482 of A or B, and also contains some minor state data related to that
8483 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
8484 overall state, such as the count, and pointers to the A and B ops.
8486 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
8487 must always point to the *current* CURLYX block, the rules are:
8489 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
8490 and set cur_curlyx to point the new block.
8492 When popping the CURLYX block after a successful or unsuccessful match,
8493 restore the previous cur_curlyx.
8495 When WHILEM is about to execute B, save the current cur_curlyx, and set it
8496 to the outer one saved in the CURLYX block.
8498 When popping the WHILEM block after a successful or unsuccessful B match,
8499 restore the previous cur_curlyx.
8501 Here's an example for the pattern (AI* BI)*BO
8502 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
8505 curlyx backtrack stack
8506 ------ ---------------
8508 CO <CO prev=NULL> <WO>
8509 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8510 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8511 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
8513 At this point the pattern succeeds, and we work back down the stack to
8514 clean up, restoring as we go:
8516 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
8517 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
8518 CO <CO prev=NULL> <WO>
8521 *******************************************************************/
8523 #define ST st->u.curlyx
8525 case CURLYX: /* start of /A*B/ (for complex A) */
8527 /* No need to save/restore up to this paren */
8528 I32 parenfloor = scan->flags;
8530 assert(next); /* keep Coverity happy */
8531 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
8534 /* XXXX Probably it is better to teach regpush to support
8535 parenfloor > maxopenparen ... */
8536 if (parenfloor > (I32)rex->lastparen)
8537 parenfloor = rex->lastparen; /* Pessimization... */
8539 ST.prev_curlyx= cur_curlyx;
8541 ST.cp = PL_savestack_ix;
8543 /* these fields contain the state of the current curly.
8544 * they are accessed by subsequent WHILEMs */
8545 ST.parenfloor = parenfloor;
8550 ST.count = -1; /* this will be updated by WHILEM */
8551 ST.lastloc = NULL; /* this will be updated by WHILEM */
8553 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol,
8555 NOT_REACHED; /* NOTREACHED */
8558 case CURLYX_end: /* just finished matching all of A*B */
8559 cur_curlyx = ST.prev_curlyx;
8561 NOT_REACHED; /* NOTREACHED */
8563 case CURLYX_end_fail: /* just failed to match all of A*B */
8565 cur_curlyx = ST.prev_curlyx;
8567 NOT_REACHED; /* NOTREACHED */
8571 #define ST st->u.whilem
8573 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
8575 /* see the discussion above about CURLYX/WHILEM */
8580 assert(cur_curlyx); /* keep Coverity happy */
8582 min = ARG1(cur_curlyx->u.curlyx.me);
8583 max = ARG2(cur_curlyx->u.curlyx.me);
8584 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
8585 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
8586 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
8587 ST.cache_offset = 0;
8591 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n",
8592 depth, (long)n, min, max)
8595 /* First just match a string of min A's. */
8598 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
8599 cur_curlyx->u.curlyx.lastloc = locinput;
8600 REGCP_SET(ST.lastcp);
8602 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol,
8604 NOT_REACHED; /* NOTREACHED */
8607 /* If degenerate A matches "", assume A done. */
8609 if (locinput == cur_curlyx->u.curlyx.lastloc) {
8610 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: empty match detected, trying continuation...\n",
8613 goto do_whilem_B_max;
8616 /* super-linear cache processing.
8618 * The idea here is that for certain types of CURLYX/WHILEM -
8619 * principally those whose upper bound is infinity (and
8620 * excluding regexes that have things like \1 and other very
8621 * non-regular expresssiony things), then if a pattern like
8622 * /....A*.../ fails and we backtrack to the WHILEM, then we
8623 * make a note that this particular WHILEM op was at string
8624 * position 47 (say) when the rest of pattern failed. Then, if
8625 * we ever find ourselves back at that WHILEM, and at string
8626 * position 47 again, we can just fail immediately rather than
8627 * running the rest of the pattern again.
8629 * This is very handy when patterns start to go
8630 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
8631 * with a combinatorial explosion of backtracking.
8633 * The cache is implemented as a bit array, with one bit per
8634 * string byte position per WHILEM op (up to 16) - so its
8635 * between 0.25 and 2x the string size.
8637 * To avoid allocating a poscache buffer every time, we do an
8638 * initially countdown; only after we have executed a WHILEM
8639 * op (string-length x #WHILEMs) times do we allocate the
8642 * The top 4 bits of scan->flags byte say how many different
8643 * relevant CURLLYX/WHILEM op pairs there are, while the
8644 * bottom 4-bits is the identifying index number of this
8650 if (!reginfo->poscache_maxiter) {
8651 /* start the countdown: Postpone detection until we
8652 * know the match is not *that* much linear. */
8653 reginfo->poscache_maxiter
8654 = (reginfo->strend - reginfo->strbeg + 1)
8656 /* possible overflow for long strings and many CURLYX's */
8657 if (reginfo->poscache_maxiter < 0)
8658 reginfo->poscache_maxiter = I32_MAX;
8659 reginfo->poscache_iter = reginfo->poscache_maxiter;
8662 if (reginfo->poscache_iter-- == 0) {
8663 /* initialise cache */
8664 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
8665 regmatch_info_aux *const aux = reginfo->info_aux;
8666 if (aux->poscache) {
8667 if ((SSize_t)reginfo->poscache_size < size) {
8668 Renew(aux->poscache, size, char);
8669 reginfo->poscache_size = size;
8671 Zero(aux->poscache, size, char);
8674 reginfo->poscache_size = size;
8675 Newxz(aux->poscache, size, char);
8677 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
8678 "%sWHILEM: Detected a super-linear match, switching on caching%s...\n",
8679 PL_colors[4], PL_colors[5])
8683 if (reginfo->poscache_iter < 0) {
8684 /* have we already failed at this position? */
8685 SSize_t offset, mask;
8687 reginfo->poscache_iter = -1; /* stop eventual underflow */
8688 offset = (scan->flags & 0xf) - 1
8689 + (locinput - reginfo->strbeg)
8691 mask = 1 << (offset % 8);
8693 if (reginfo->info_aux->poscache[offset] & mask) {
8694 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: (cache) already tried at this position...\n",
8697 cur_curlyx->u.curlyx.count--;
8698 sayNO; /* cache records failure */
8700 ST.cache_offset = offset;
8701 ST.cache_mask = mask;
8705 /* Prefer B over A for minimal matching. */
8707 if (cur_curlyx->u.curlyx.minmod) {
8708 ST.save_curlyx = cur_curlyx;
8709 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8710 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
8711 locinput, loceol, script_run_begin);
8712 NOT_REACHED; /* NOTREACHED */
8715 /* Prefer A over B for maximal matching. */
8717 if (n < max) { /* More greed allowed? */
8718 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8720 cur_curlyx->u.curlyx.lastloc = locinput;
8721 REGCP_SET(ST.lastcp);
8722 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol,
8724 NOT_REACHED; /* NOTREACHED */
8726 goto do_whilem_B_max;
8728 NOT_REACHED; /* NOTREACHED */
8730 case WHILEM_B_min: /* just matched B in a minimal match */
8731 case WHILEM_B_max: /* just matched B in a maximal match */
8732 cur_curlyx = ST.save_curlyx;
8734 NOT_REACHED; /* NOTREACHED */
8736 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
8737 cur_curlyx = ST.save_curlyx;
8738 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8739 cur_curlyx->u.curlyx.count--;
8741 NOT_REACHED; /* NOTREACHED */
8743 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
8745 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
8746 REGCP_UNWIND(ST.lastcp);
8747 regcppop(rex, &maxopenparen);
8748 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
8749 cur_curlyx->u.curlyx.count--;
8751 NOT_REACHED; /* NOTREACHED */
8753 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
8754 REGCP_UNWIND(ST.lastcp);
8755 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
8756 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: failed, trying continuation...\n",
8760 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8761 && ckWARN(WARN_REGEXP)
8762 && !reginfo->warned)
8764 reginfo->warned = TRUE;
8765 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8766 "Complex regular subexpression recursion limit (%d) "
8772 ST.save_curlyx = cur_curlyx;
8773 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
8774 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
8775 locinput, loceol, script_run_begin);
8776 NOT_REACHED; /* NOTREACHED */
8778 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
8779 cur_curlyx = ST.save_curlyx;
8781 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
8782 /* Maximum greed exceeded */
8783 if (cur_curlyx->u.curlyx.count >= REG_INFTY
8784 && ckWARN(WARN_REGEXP)
8785 && !reginfo->warned)
8787 reginfo->warned = TRUE;
8788 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
8789 "Complex regular subexpression recursion "
8790 "limit (%d) exceeded",
8793 cur_curlyx->u.curlyx.count--;
8797 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "WHILEM: B min fail: trying longer...\n", depth)
8799 /* Try grabbing another A and see if it helps. */
8800 cur_curlyx->u.curlyx.lastloc = locinput;
8801 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
8803 REGCP_SET(ST.lastcp);
8804 PUSH_STATE_GOTO(WHILEM_A_min,
8805 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
8806 locinput, loceol, script_run_begin);
8807 NOT_REACHED; /* NOTREACHED */
8810 #define ST st->u.branch
8812 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
8813 next = scan + ARG(scan);
8816 scan = NEXTOPER(scan);
8819 case BRANCH: /* /(...|A|...)/ */
8820 scan = NEXTOPER(scan); /* scan now points to inner node */
8821 ST.lastparen = rex->lastparen;
8822 ST.lastcloseparen = rex->lastcloseparen;
8823 ST.next_branch = next;
8826 /* Now go into the branch */
8828 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8831 PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol,
8834 NOT_REACHED; /* NOTREACHED */
8836 case CUTGROUP: /* /(*THEN)/ */
8837 sv_yes_mark = st->u.mark.mark_name = scan->flags
8838 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
8840 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol,
8842 NOT_REACHED; /* NOTREACHED */
8844 case CUTGROUP_next_fail:
8847 if (st->u.mark.mark_name)
8848 sv_commit = st->u.mark.mark_name;
8850 NOT_REACHED; /* NOTREACHED */
8854 NOT_REACHED; /* NOTREACHED */
8856 case BRANCH_next_fail: /* that branch failed; try the next, if any */
8861 REGCP_UNWIND(ST.cp);
8862 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8863 scan = ST.next_branch;
8864 /* no more branches? */
8865 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
8867 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
8874 continue; /* execute next BRANCH[J] op */
8877 case MINMOD: /* next op will be non-greedy, e.g. A*? */
8882 #define ST st->u.curlym
8884 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
8886 /* This is an optimisation of CURLYX that enables us to push
8887 * only a single backtracking state, no matter how many matches
8888 * there are in {m,n}. It relies on the pattern being constant
8889 * length, with no parens to influence future backrefs
8893 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
8895 ST.lastparen = rex->lastparen;
8896 ST.lastcloseparen = rex->lastcloseparen;
8898 /* if paren positive, emulate an OPEN/CLOSE around A */
8900 U32 paren = ST.me->flags;
8901 if (paren > maxopenparen)
8902 maxopenparen = paren;
8903 scan += NEXT_OFF(scan); /* Skip former OPEN. */
8911 ST.Binfo.count = -1;
8914 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
8917 curlym_do_A: /* execute the A in /A{m,n}B/ */
8918 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */
8920 NOT_REACHED; /* NOTREACHED */
8922 case CURLYM_A: /* we've just matched an A */
8924 /* after first match, determine A's length: u.curlym.alen */
8925 if (ST.count == 1) {
8926 if (reginfo->is_utf8_target) {
8927 char *s = st->locinput;
8928 while (s < locinput) {
8934 ST.alen = locinput - st->locinput;
8937 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
8940 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
8941 depth, (IV) ST.count, (IV)ST.alen)
8944 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8948 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
8949 if ( max == REG_INFTY || ST.count < max )
8950 goto curlym_do_A; /* try to match another A */
8952 goto curlym_do_B; /* try to match B */
8954 case CURLYM_A_fail: /* just failed to match an A */
8955 REGCP_UNWIND(ST.cp);
8958 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
8959 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
8962 curlym_do_B: /* execute the B in /A{m,n}B/ */
8963 if (ST.Binfo.count < 0) {
8964 /* calculate possible match of 1st char following curly */
8966 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
8967 regnode *text_node = ST.B;
8968 if (! HAS_TEXT(text_node))
8969 FIND_NEXT_IMPT(text_node);
8970 if (PL_regkind[OP(text_node)] == EXACT) {
8971 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
8972 &ST.Binfo, reginfo))
8981 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
8982 depth, (IV)ST.count)
8984 if (! NEXTCHR_IS_EOS && ST.Binfo.count >= 0) {
8985 assert(ST.Binfo.count > 0);
8987 /* Do a quick test to hopefully rule out most non-matches */
8988 if ( locinput + ST.Binfo.min_length > loceol
8989 || ! S_test_EXACTISH_ST(locinput, ST.Binfo))
8992 Perl_re_exec_indentf( aTHX_
8993 "CURLYM Fast bail next target=0x%X anded==0x%X"
8996 (int) nextbyte, ST.Binfo.first_byte_anded,
8997 ST.Binfo.first_byte_mask)
8999 state_num = CURLYM_B_fail;
9000 goto reenter_switch;
9005 /* emulate CLOSE: mark current A as captured */
9006 U32 paren = (U32)ST.me->flags;
9008 CLOSE_CAPTURE(paren,
9009 HOPc(locinput, -ST.alen) - reginfo->strbeg,
9010 locinput - reginfo->strbeg);
9013 rex->offs[paren].end = -1;
9015 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
9024 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */
9026 NOT_REACHED; /* NOTREACHED */
9028 case CURLYM_B_fail: /* just failed to match a B */
9029 REGCP_UNWIND(ST.cp);
9030 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9032 I32 max = ARG2(ST.me);
9033 if (max != REG_INFTY && ST.count == max)
9035 goto curlym_do_A; /* try to match a further A */
9037 /* backtrack one A */
9038 if (ST.count == ARG1(ST.me) /* min */)
9041 SET_locinput(HOPc(locinput, -ST.alen));
9042 goto curlym_do_B; /* try to match B */
9045 #define ST st->u.curly
9047 #define CURLY_SETPAREN(paren, success) \
9050 CLOSE_CAPTURE(paren, HOPc(locinput, -1) - reginfo->strbeg, \
9051 locinput - reginfo->strbeg); \
9054 rex->offs[paren].end = -1; \
9055 rex->lastparen = ST.lastparen; \
9056 rex->lastcloseparen = ST.lastcloseparen; \
9060 case STAR: /* /A*B/ where A is width 1 char */
9064 scan = NEXTOPER(scan);
9067 case PLUS: /* /A+B/ where A is width 1 char */
9071 scan = NEXTOPER(scan);
9074 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
9075 ST.paren = scan->flags; /* Which paren to set */
9076 ST.lastparen = rex->lastparen;
9077 ST.lastcloseparen = rex->lastcloseparen;
9078 if (ST.paren > maxopenparen)
9079 maxopenparen = ST.paren;
9080 ST.min = ARG1(scan); /* min to match */
9081 ST.max = ARG2(scan); /* max to match */
9082 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
9084 /* handle the single-char capture called as a GOSUB etc */
9085 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
9087 char *li = locinput;
9088 if (!regrepeat(rex, &li, scan, loceol, reginfo, 1))
9096 case CURLY: /* /A{m,n}B/ where A is width 1 char */
9098 ST.min = ARG1(scan); /* min to match */
9099 ST.max = ARG2(scan); /* max to match */
9100 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
9103 * Lookahead to avoid useless match attempts
9104 * when we know what character comes next.
9106 * Used to only do .*x and .*?x, but now it allows
9107 * for )'s, ('s and (?{ ... })'s to be in the way
9108 * of the quantifier and the EXACT-like node. -- japhy
9111 assert(ST.min <= ST.max);
9112 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
9116 regnode *text_node = next;
9118 if (! HAS_TEXT(text_node))
9119 FIND_NEXT_IMPT(text_node);
9121 if (! HAS_TEXT(text_node))
9124 if ( PL_regkind[OP(text_node)] != EXACT ) {
9128 if (! S_setup_EXACTISH_ST(aTHX_ text_node,
9129 &ST.Binfo, reginfo))
9140 char *li = locinput;
9143 regrepeat(rex, &li, ST.A, loceol, reginfo, ST.min)
9150 if (ST.Binfo.count <= 0)
9151 goto curly_try_B_min;
9153 ST.oldloc = locinput;
9155 /* set ST.maxpos to the furthest point along the
9156 * string that could possibly match, i.e., that a match could
9158 if (ST.max == REG_INFTY) {
9159 ST.maxpos = loceol - 1;
9161 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
9164 else if (utf8_target) {
9165 int m = ST.max - ST.min;
9166 for (ST.maxpos = locinput;
9167 m >0 && ST.maxpos < loceol; m--)
9168 ST.maxpos += UTF8SKIP(ST.maxpos);
9171 ST.maxpos = locinput + ST.max - ST.min;
9172 if (ST.maxpos >= loceol)
9173 ST.maxpos = loceol - 1;
9175 goto curly_try_B_min_known;
9179 /* avoid taking address of locinput, so it can remain
9181 char *li = locinput;
9182 ST.count = regrepeat(rex, &li, ST.A, loceol, reginfo, ST.max);
9183 if (ST.count < ST.min)
9186 if ((ST.count > ST.min)
9187 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
9189 /* A{m,n} must come at the end of the string, there's
9190 * no point in backing off ... */
9192 /* ...except that $ and \Z can match before *and* after
9193 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
9194 We may back off by one in this case. */
9195 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
9199 goto curly_try_B_max;
9201 NOT_REACHED; /* NOTREACHED */
9203 case CURLY_B_min_fail:
9204 /* failed to find B in a non-greedy match. */
9206 REGCP_UNWIND(ST.cp);
9208 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9211 if (ST.Binfo.count == 0) {
9212 /* failed -- move forward one */
9213 char *li = locinput;
9214 if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) {
9219 if (!( ST.count <= ST.max
9220 /* count overflow ? */
9221 || (ST.max == REG_INFTY && ST.count > 0))
9227 /* Couldn't or didn't -- move forward. */
9228 ST.oldloc = locinput;
9230 locinput += UTF8SKIP(locinput);
9235 curly_try_B_min_known:
9236 /* find the next place where 'B' could work, then call B */
9237 if (locinput + ST.Binfo.initial_exact < loceol) {
9238 if (ST.Binfo.initial_exact >= ST.Binfo.max_length) {
9240 /* Here, the mask is all 1's for the entire length of
9241 * any possible match. (That actually means that there
9242 * is only one possible match.) Look for the next
9244 locinput = ninstr(locinput, loceol,
9245 (char *) ST.Binfo.matches,
9246 (char *) ST.Binfo.matches
9247 + ST.Binfo.initial_exact);
9248 if (locinput == NULL) {
9253 /* If the first byte(s) of the mask are all ones, it
9254 * means those bytes must match identically, so can use
9255 * ninstr() to find the next possible matchpoint */
9256 if (ST.Binfo.initial_exact > 0) {
9257 locinput = ninstr(locinput, loceol,
9258 (char *) ST.Binfo.matches,
9259 (char *) ST.Binfo.matches
9260 + ST.Binfo.initial_exact);
9262 else { /* Otherwise find the next byte that matches,
9264 locinput = (char *) find_next_masked(
9265 (U8 *) locinput, (U8 *) loceol,
9266 ST.Binfo.first_byte_anded,
9267 ST.Binfo.first_byte_mask);
9268 /* Advance to the end of a multi-byte character */
9270 while ( locinput < loceol
9271 && UTF8_IS_CONTINUATION(*locinput))
9277 if ( locinput == NULL
9278 || locinput + ST.Binfo.min_length > loceol)
9283 /* Here, we have found a possible match point; if can't
9284 * rule it out, quit the loop so can check fully */
9285 if (S_test_EXACTISH_ST(locinput, ST.Binfo)) {
9289 locinput += (utf8_target) ? UTF8SKIP(locinput) : 1;
9291 } while (locinput <= ST.maxpos);
9294 if (locinput > ST.maxpos)
9298 ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput)
9299 : (STRLEN) (locinput - ST.oldloc);
9302 /* Here is at the beginning of a character that meets the mask
9303 * criteria. Need to make sure that some real possibility */
9306 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
9307 * at what may be the beginning of b; check that everything
9308 * between oldloc and locinput matches */
9309 char *li = ST.oldloc;
9311 if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n)
9313 assert(n == REG_INFTY || locinput == li);
9318 CURLY_SETPAREN(ST.paren, ST.count);
9319 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol,
9321 NOT_REACHED; /* NOTREACHED */
9325 /* a successful greedy match: now try to match B */
9326 if ( ST.Binfo.count <= 0
9327 || ( ST.Binfo.count > 0
9328 && locinput + ST.Binfo.min_length <= loceol
9329 && S_test_EXACTISH_ST(locinput, ST.Binfo)))
9331 CURLY_SETPAREN(ST.paren, ST.count);
9332 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol,
9334 NOT_REACHED; /* NOTREACHED */
9338 case CURLY_B_max_fail:
9339 /* failed to find B in a greedy match */
9341 REGCP_UNWIND(ST.cp);
9343 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
9346 if (--ST.count < ST.min)
9348 locinput = HOPc(locinput, -1);
9349 goto curly_try_B_max;
9353 case END: /* last op of main pattern */
9356 /* we've just finished A in /(??{A})B/; now continue with B */
9357 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
9358 st->u.eval.prev_rex = rex_sv; /* inner */
9360 /* Save *all* the positions. */
9361 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
9362 rex_sv = CUR_EVAL.prev_rex;
9363 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
9364 SET_reg_curpm(rex_sv);
9365 rex = ReANY(rex_sv);
9366 rexi = RXi_GET(rex);
9368 st->u.eval.prev_curlyx = cur_curlyx;
9369 cur_curlyx = CUR_EVAL.prev_curlyx;
9371 REGCP_SET(st->u.eval.lastcp);
9373 /* Restore parens of the outer rex without popping the
9375 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
9377 st->u.eval.prev_eval = cur_eval;
9378 cur_eval = CUR_EVAL.prev_eval;
9380 Perl_re_exec_indentf( aTHX_ "END: EVAL trying tail ... (cur_eval=%p)\n",
9382 if ( nochange_depth )
9385 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
9387 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */
9388 st->u.eval.prev_eval->u.eval.B,
9389 locinput, loceol, script_run_begin);
9392 if (locinput < reginfo->till) {
9393 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
9394 "%sEND: Match possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
9396 (long)(locinput - startpos),
9397 (long)(reginfo->till - startpos),
9400 sayNO_SILENT; /* Cannot match: too short. */
9402 sayYES; /* Success! */
9404 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
9406 Perl_re_exec_indentf( aTHX_ "%sSUCCEED: subpattern success...%s\n",
9407 depth, PL_colors[4], PL_colors[5]));
9408 sayYES; /* Success! */
9411 #define ST st->u.ifmatch
9413 case SUSPEND: /* (?>A) */
9415 ST.start = locinput;
9420 case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?<!A) */
9422 goto ifmatch_trivial_fail_test;
9424 case IFMATCH: /* +ve lookaround: (?=A), or with 'flags', (?<=A) */
9426 ifmatch_trivial_fail_test:
9427 ST.count = scan->next_off + 1; /* next_off repurposed to be
9428 lookbehind count, requires
9430 if (! scan->flags) { /* 'flags' zero means lookahed */
9432 /* Lookahead starts here and ends at the normal place */
9433 ST.start = locinput;
9437 PERL_UINT_FAST8_T back_count = scan->flags;
9440 /* Lookbehind can look beyond the current position */
9443 /* ... and starts at the first place in the input that is in
9444 * the range of the possible start positions */
9445 for (; ST.count > 0; ST.count--, back_count--) {
9446 s = HOPBACKc(locinput, back_count);
9453 /* If the lookbehind doesn't start in the actual string, is a
9454 * trivial match failure */
9457 sw = 1 - cBOOL(ST.wanted);
9462 /* Here, we didn't want it to match, so is actually success */
9463 next = scan + ARG(scan);
9471 ST.logical = logical;
9472 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
9474 /* execute body of (?...A) */
9475 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), ST.start,
9476 ST.end, script_run_begin);
9477 NOT_REACHED; /* NOTREACHED */
9482 case IFMATCH_A_fail: /* body of (?...A) failed */
9483 if (! ST.logical && ST.count > 1) {
9485 /* It isn't a real failure until we've tried all starting
9486 * positions. Move to the next starting position and retry */
9488 ST.start = HOPc(ST.start, 1);
9490 logical = ST.logical;
9494 /* Here, all starting positions have been tried. */
9498 case IFMATCH_A: /* body of (?...A) succeeded */
9501 sw = matched == ST.wanted;
9502 if (! ST.logical && !sw) {
9506 if (OP(ST.me) != SUSPEND) {
9507 /* restore old position except for (?>...) */
9508 locinput = st->locinput;
9509 loceol = st->loceol;
9510 script_run_begin = st->sr0;
9512 scan = ST.me + ARG(ST.me);
9515 continue; /* execute B */
9520 case LONGJMP: /* alternative with many branches compiles to
9521 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
9522 next = scan + ARG(scan);
9527 case COMMIT: /* (*COMMIT) */
9528 reginfo->cutpoint = loceol;
9531 case PRUNE: /* (*PRUNE) */
9533 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9534 PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol,
9536 NOT_REACHED; /* NOTREACHED */
9538 case COMMIT_next_fail:
9542 NOT_REACHED; /* NOTREACHED */
9544 case OPFAIL: /* (*FAIL) */
9546 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9548 /* deal with (?(?!)X|Y) properly,
9549 * make sure we trigger the no branch
9550 * of the trailing IFTHEN structure*/
9556 NOT_REACHED; /* NOTREACHED */
9558 #define ST st->u.mark
9559 case MARKPOINT: /* (*MARK:foo) */
9560 ST.prev_mark = mark_state;
9561 ST.mark_name = sv_commit = sv_yes_mark
9562 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9564 ST.mark_loc = locinput;
9565 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol,
9567 NOT_REACHED; /* NOTREACHED */
9569 case MARKPOINT_next:
9570 mark_state = ST.prev_mark;
9572 NOT_REACHED; /* NOTREACHED */
9574 case MARKPOINT_next_fail:
9575 if (popmark && sv_eq(ST.mark_name,popmark))
9577 if (ST.mark_loc > startpoint)
9578 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9579 popmark = NULL; /* we found our mark */
9580 sv_commit = ST.mark_name;
9583 Perl_re_exec_indentf( aTHX_ "%sMARKPOINT: next fail: setting cutpoint to mark:%" SVf "...%s\n",
9585 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
9588 mark_state = ST.prev_mark;
9589 sv_yes_mark = mark_state ?
9590 mark_state->u.mark.mark_name : NULL;
9592 NOT_REACHED; /* NOTREACHED */
9594 case SKIP: /* (*SKIP) */
9596 /* (*SKIP) : if we fail we cut here*/
9597 ST.mark_name = NULL;
9598 ST.mark_loc = locinput;
9599 PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol,
9602 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
9603 otherwise do nothing. Meaning we need to scan
9605 regmatch_state *cur = mark_state;
9606 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
9609 if ( sv_eq( cur->u.mark.mark_name,
9612 ST.mark_name = find;
9613 PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol,
9616 cur = cur->u.mark.prev_mark;
9619 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
9622 case SKIP_next_fail:
9624 /* (*CUT:NAME) - Set up to search for the name as we
9625 collapse the stack*/
9626 popmark = ST.mark_name;
9628 /* (*CUT) - No name, we cut here.*/
9629 if (ST.mark_loc > startpoint)
9630 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
9631 /* but we set sv_commit to latest mark_name if there
9632 is one so they can test to see how things lead to this
9635 sv_commit=mark_state->u.mark.mark_name;
9639 NOT_REACHED; /* NOTREACHED */
9642 case LNBREAK: /* \R */
9643 if ((n=is_LNBREAK_safe(locinput, loceol, utf8_target))) {
9650 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
9651 PTR2UV(scan), OP(scan));
9652 Perl_croak(aTHX_ "regexp memory corruption");
9654 /* this is a point to jump to in order to increment
9655 * locinput by one character */
9657 assert(!NEXTCHR_IS_EOS);
9659 locinput += PL_utf8skip[nextbyte];
9660 /* locinput is allowed to go 1 char off the end (signifying
9661 * EOS), but not 2+ */
9662 if (locinput > loceol)
9671 /* switch break jumps here */
9672 scan = next; /* prepare to execute the next op and ... */
9673 continue; /* ... jump back to the top, reusing st */
9677 /* push a state that backtracks on success */
9678 st->u.yes.prev_yes_state = yes_state;
9682 /* push a new regex state, then continue at scan */
9684 regmatch_state *newst;
9685 DECLARE_AND_GET_RE_DEBUG_FLAGS;
9687 DEBUG_r( /* DEBUG_STACK_r */
9688 if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) {
9689 regmatch_state *cur = st;
9690 regmatch_state *curyes = yes_state;
9692 regmatch_slab *slab = PL_regmatch_slab;
9693 for (i = 0; i < 3 && i <= depth; cur--,i++) {
9694 if (cur < SLAB_FIRST(slab)) {
9696 cur = SLAB_LAST(slab);
9698 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
9701 depth - i, PL_reg_name[cur->resume_state],
9702 (curyes == cur) ? "yes" : ""
9705 curyes = cur->u.yes.prev_yes_state;
9708 DEBUG_STATE_pp("push")
9711 st->locinput = locinput;
9712 st->loceol = loceol;
9713 st->sr0 = script_run_begin;
9715 if (newst > SLAB_LAST(PL_regmatch_slab))
9716 newst = S_push_slab(aTHX);
9717 PL_regmatch_state = newst;
9719 locinput = pushinput;
9721 script_run_begin = pushsr0;
9727 #ifdef SOLARIS_BAD_OPTIMIZER
9728 # undef PL_charclass
9732 * We get here only if there's trouble -- normally "case END" is
9733 * the terminating point.
9735 Perl_croak(aTHX_ "corrupted regexp pointers");
9736 NOT_REACHED; /* NOTREACHED */
9740 /* we have successfully completed a subexpression, but we must now
9741 * pop to the state marked by yes_state and continue from there */
9742 assert(st != yes_state);
9744 while (st != yes_state) {
9746 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9747 PL_regmatch_slab = PL_regmatch_slab->prev;
9748 st = SLAB_LAST(PL_regmatch_slab);
9752 DEBUG_STATE_pp("pop (no final)");
9754 DEBUG_STATE_pp("pop (yes)");
9760 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
9761 || yes_state > SLAB_LAST(PL_regmatch_slab))
9763 /* not in this slab, pop slab */
9764 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
9765 PL_regmatch_slab = PL_regmatch_slab->prev;
9766 st = SLAB_LAST(PL_regmatch_slab);
9768 depth -= (st - yes_state);
9771 yes_state = st->u.yes.prev_yes_state;
9772 PL_regmatch_state = st;
9775 locinput= st->locinput;
9777 script_run_begin = st->sr0;
9779 state_num = st->resume_state + no_final;
9780 goto reenter_switch;
9783 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
9784 PL_colors[4], PL_colors[5]));
9786 if (reginfo->info_aux_eval) {
9787 /* each successfully executed (?{...}) block does the equivalent of
9788 * local $^R = do {...}
9789 * When popping the save stack, all these locals would be undone;
9790 * bypass this by setting the outermost saved $^R to the latest
9792 /* I dont know if this is needed or works properly now.
9793 * see code related to PL_replgv elsewhere in this file.
9796 if (oreplsv != GvSV(PL_replgv)) {
9797 sv_setsv(oreplsv, GvSV(PL_replgv));
9798 SvSETMAGIC(oreplsv);
9806 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
9808 PL_colors[4], PL_colors[5])
9820 /* there's a previous state to backtrack to */
9822 if (st < SLAB_FIRST(PL_regmatch_slab)) {
9823 PL_regmatch_slab = PL_regmatch_slab->prev;
9824 st = SLAB_LAST(PL_regmatch_slab);
9826 PL_regmatch_state = st;
9827 locinput= st->locinput;
9829 script_run_begin = st->sr0;
9831 DEBUG_STATE_pp("pop");
9833 if (yes_state == st)
9834 yes_state = st->u.yes.prev_yes_state;
9836 state_num = st->resume_state + 1; /* failure = success + 1 */
9838 goto reenter_switch;
9843 if (rex->intflags & PREGf_VERBARG_SEEN) {
9844 SV *sv_err = get_sv("REGERROR", 1);
9845 SV *sv_mrk = get_sv("REGMARK", 1);
9847 sv_commit = &PL_sv_no;
9849 sv_yes_mark = &PL_sv_yes;
9852 sv_commit = &PL_sv_yes;
9853 sv_yes_mark = &PL_sv_no;
9857 sv_setsv(sv_err, sv_commit);
9858 sv_setsv(sv_mrk, sv_yes_mark);
9862 if (last_pushed_cv) {
9864 /* see "Some notes about MULTICALL" above */
9866 PERL_UNUSED_VAR(SP);
9869 LEAVE_SCOPE(orig_savestack_ix);
9871 assert(!result || locinput - reginfo->strbeg >= 0);
9872 return result ? locinput - reginfo->strbeg : -1;
9876 - regrepeat - repeatedly match something simple, report how many
9878 * What 'simple' means is a node which can be the operand of a quantifier like
9881 * startposp - pointer to a pointer to the start position. This is updated
9882 * to point to the byte following the highest successful
9884 * p - the regnode to be repeatedly matched against.
9885 * loceol - pointer to the end position beyond which we aren't supposed to
9887 * reginfo - struct holding match state, such as utf8_target
9888 * max - maximum number of things to match.
9889 * depth - (for debugging) backtracking depth.
9892 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
9893 char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH)
9895 char *scan; /* Pointer to current position in target string */
9897 char *this_eol = loceol; /* potentially adjusted version. */
9898 I32 hardcount = 0; /* How many matches so far */
9899 bool utf8_target = reginfo->is_utf8_target;
9900 unsigned int to_complement = 0; /* Invert the result? */
9901 _char_class_number classnum;
9903 PERL_ARGS_ASSERT_REGREPEAT;
9905 /* This routine is structured so that we switch on the input OP. Each OP
9906 * case: statement contains a loop to repeatedly apply the OP, advancing
9907 * the input until it fails, or reaches the end of the input, or until it
9908 * reaches the upper limit of matches. */
9911 if (max == REG_INFTY) /* This is a special marker to go to the platform's
9914 else if (! utf8_target && this_eol - scan > max)
9915 this_eol = scan + max;
9917 /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down
9918 * to the maximum of how far we should go in it (but leaving it set to the
9919 * real end if the maximum permissible would take us beyond that). This
9920 * allows us to make the loop exit condition that we haven't gone past
9921 * <this_eol> to also mean that we haven't exceeded the max permissible
9922 * count, saving a test each time through the loop. But it assumes that
9923 * the OP matches a single byte, which is true for most of the OPs below
9924 * when applied to a non-UTF-8 target. Those relatively few OPs that don't
9925 * have this characteristic have to compensate.
9927 * There is no such adjustment for UTF-8 targets, sinc the number of bytes
9928 * per character can vary. OPs will have to test both that the count is
9929 * less than the max permissible (using <hardcount> to keep track), and
9930 * that we are still within the bounds of the string (using <this_eol>. A
9931 * few OPs match a single byte no matter what the encoding. They can omit
9932 * the max test if, for the UTF-8 case, they do the adjustment that was
9935 * Thus, the code above sets things up for the common case; and exceptional
9936 * cases need extra work; the common case is to make sure <scan> doesn't
9937 * go past <this_eol>, and for UTF-8 to also use <hardcount> to make sure the
9938 * count doesn't exceed the maximum permissible */
9943 while (scan < this_eol && hardcount < max && *scan != '\n') {
9944 scan += UTF8SKIP(scan);
9948 scan = (char *) memchr(scan, '\n', this_eol - scan);
9956 while (scan < this_eol && hardcount < max) {
9957 scan += UTF8SKIP(scan);
9966 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
9967 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
9973 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9979 if (! utf8_target) {
9987 case EXACTFAA_NO_TRIE:
9993 struct next_matchable_info Binfo;
9994 PERL_UINT_FAST8_T definitive_len;
9996 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
9998 /* Set up termination info, and quit if we can rule out that we've
9999 * gotten a match of the termination criteria */
10000 if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo)
10001 || scan + Binfo.min_length > this_eol
10002 || ! S_test_EXACTISH_ST(scan, Binfo))
10007 definitive_len = Binfo.initial_definitive;
10009 /* Here there are potential matches, and the first byte(s) matched our
10012 * If we got a definitive match of some initial bytes, there is no
10013 * possibility of false positives as far as it got */
10014 if (definitive_len > 0) {
10016 /* If as far as it got is the maximum possible, there were no false
10017 * positives at all. Since we have everything set up, see how many
10018 * repeats there are. */
10019 if (definitive_len >= Binfo.max_length) {
10021 /* We've already found one match */
10022 scan += definitive_len;
10025 /* If want more than the one match, and there is room for more,
10026 * see if there are any */
10027 if (hardcount < max && scan + definitive_len <= this_eol) {
10029 /* If the character is only a single byte long, just span
10030 * all such bytes. */
10031 if (definitive_len == 1) {
10032 const char * orig_scan = scan;
10034 if (this_eol - (scan - hardcount) > max) {
10035 this_eol = scan - hardcount + max;
10038 /* Use different routines depending on whether it's an
10039 * exact match or matches with a mask */
10040 if (Binfo.initial_exact == 1) {
10041 scan = (char *) find_span_end((U8 *) scan,
10046 scan = (char *) find_span_end_mask(
10049 Binfo.first_byte_anded,
10050 Binfo.first_byte_mask);
10053 hardcount += scan - orig_scan;
10055 else { /* Here, the full character definitive match is more
10057 while ( hardcount < max
10058 && scan + definitive_len <= this_eol
10059 && S_test_EXACTISH_ST(scan, Binfo))
10061 scan += definitive_len;
10068 } /* End of a full character is definitively matched */
10070 /* Here, an initial portion of the character matched definitively,
10071 * and the rest matched as well, but could have false positives */
10074 PERL_INT_FAST8_T i;
10075 U8 * matches = Binfo.matches;
10077 /* The first bytes were definitive. Look at the remaining */
10078 for (i = 0; i < Binfo.count; i++) {
10079 if (memEQ(scan + definitive_len,
10080 matches + definitive_len,
10081 Binfo.lengths[i] - definitive_len))
10083 goto found_a_completion;
10086 matches += Binfo.lengths[i];
10089 /* Didn't find anything to complete our initial match. Stop
10093 found_a_completion:
10095 /* Here, matched a full character, Include it in the result,
10096 * and then look to see if the next char matches */
10098 scan += Binfo.lengths[i];
10100 } while ( hardcount < max
10101 && scan + definitive_len < this_eol
10102 && S_test_EXACTISH_ST(scan, Binfo));
10104 /* Here, have advanced as far as possible */
10106 } /* End of found some initial bytes that definitively matched */
10108 /* Here, we can't rule out that we have found the beginning of 'B', but
10109 * there were no initial bytes that could rule out anything
10110 * definitively. Use brute force to examine all the possibilities */
10111 while (scan < this_eol && hardcount < max) {
10112 PERL_INT_FAST8_T i;
10113 U8 * matches = Binfo.matches;
10115 for (i = 0; i < Binfo.count; i++) {
10116 if (memEQ(scan, matches, Binfo.lengths[i])) {
10120 matches += Binfo.lengths[i];
10127 scan += Binfo.lengths[i];
10134 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10135 CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p);
10141 while (hardcount < max
10143 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, utf8_target))
10145 scan += UTF8SKIP(scan);
10149 else if (ANYOF_FLAGS(p) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
10150 while (scan < this_eol
10151 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
10155 while (scan < this_eol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
10161 if (utf8_target && this_eol - scan > max) {
10163 /* We didn't adjust <this_eol> at the beginning of this routine
10164 * because is UTF-8, but it is actually ok to do so, since here, to
10165 * match, 1 char == 1 byte. */
10166 this_eol = scan + max;
10169 scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
10174 while ( hardcount < max
10176 && (*scan & FLAGS(p)) != ARG(p))
10178 scan += UTF8SKIP(scan);
10183 scan = (char *) find_next_masked((U8 *) scan, (U8 *) this_eol, (U8) ARG(p), FLAGS(p));
10188 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
10189 while ( hardcount < max
10191 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10192 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10194 scan += UTF8SKIP(scan);
10201 if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */
10203 /* we know the first byte must be the FLAGS field */
10204 while ( hardcount < max
10206 && (U8) *scan == ANYOF_FLAGS(p)
10207 && reginclass(prog, p, (U8*)scan, (U8*) this_eol,
10210 scan += UTF8SKIP(scan);
10217 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
10218 while ( hardcount < max
10220 && inRANGE(NATIVE_UTF8_TO_I8(*scan),
10221 LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)),
10222 HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)))
10223 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10224 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10226 scan += UTF8SKIP(scan);
10233 if (utf8_target) { /* ANYOFH only can match UTF-8 targets */
10234 while ( hardcount < max
10235 && scan + FLAGS(p) < this_eol
10236 && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p))
10237 && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE))
10239 scan += UTF8SKIP(scan);
10247 while ( hardcount < max
10249 && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p)
10250 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10253 ANYOFRbase(p), ANYOFRdelta(p)))
10255 scan += UTF8SKIP(scan);
10260 while ( hardcount < max
10262 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10272 while ( hardcount < max
10274 && (U8) *scan == ANYOF_FLAGS(p)
10275 && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan,
10278 ANYOFRbase(p), ANYOFRdelta(p)))
10280 scan += UTF8SKIP(scan);
10285 while ( hardcount < max
10287 && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p)))
10295 /* The argument (FLAGS) to all the POSIX node types is the class number */
10302 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
10303 if (! utf8_target) {
10304 while (scan < this_eol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
10310 while (hardcount < max && scan < this_eol
10311 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
10315 scan += UTF8SKIP(scan);
10328 if (utf8_target && this_eol - scan > max) {
10330 /* We didn't adjust <this_eol> at the beginning of this routine
10331 * because is UTF-8, but it is actually ok to do so, since here, to
10332 * match, 1 char == 1 byte. */
10333 this_eol = scan + max;
10335 while (scan < this_eol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
10348 if (! utf8_target) {
10349 while (scan < this_eol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
10355 /* The complement of something that matches only ASCII matches all
10356 * non-ASCII, plus everything in ASCII that isn't in the class. */
10357 while (hardcount < max && scan < this_eol
10358 && ( ! isASCII_utf8_safe(scan, loceol)
10359 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
10361 scan += UTF8SKIP(scan);
10372 if (! utf8_target) {
10373 while (scan < this_eol && to_complement
10374 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
10381 classnum = (_char_class_number) FLAGS(p);
10382 switch (classnum) {
10384 while ( hardcount < max && scan < this_eol
10385 && to_complement ^ cBOOL(_invlist_contains_cp(
10386 PL_XPosix_ptrs[classnum],
10387 utf8_to_uvchr_buf((U8 *) scan,
10391 scan += UTF8SKIP(scan);
10396 /* For the classes below, the knowledge of how to handle
10397 * every code point is compiled in to Perl via a macro.
10398 * This code is written for making the loops as tight as
10399 * possible. It could be refactored to save space instead.
10402 case _CC_ENUM_SPACE:
10403 while (hardcount < max
10406 ^ cBOOL(isSPACE_utf8_safe(scan, this_eol))))
10408 scan += UTF8SKIP(scan);
10412 case _CC_ENUM_BLANK:
10413 while (hardcount < max
10416 ^ cBOOL(isBLANK_utf8_safe(scan, this_eol))))
10418 scan += UTF8SKIP(scan);
10422 case _CC_ENUM_XDIGIT:
10423 while (hardcount < max
10426 ^ cBOOL(isXDIGIT_utf8_safe(scan, this_eol))))
10428 scan += UTF8SKIP(scan);
10432 case _CC_ENUM_VERTSPACE:
10433 while (hardcount < max
10436 ^ cBOOL(isVERTWS_utf8_safe(scan, this_eol))))
10438 scan += UTF8SKIP(scan);
10442 case _CC_ENUM_CNTRL:
10443 while (hardcount < max
10446 ^ cBOOL(isCNTRL_utf8_safe(scan, this_eol))))
10448 scan += UTF8SKIP(scan);
10458 while (hardcount < max && scan < this_eol &&
10459 (c=is_LNBREAK_utf8_safe(scan, this_eol))) {
10464 /* LNBREAK can match one or two latin chars, which is ok, but we
10465 * have to use hardcount in this situation, and throw away the
10466 * adjustment to <this_eol> done before the switch statement */
10467 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
10475 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
10476 NOT_REACHED; /* NOTREACHED */
10483 c = scan - *startposp;
10487 DECLARE_AND_GET_RE_DEBUG_FLAGS;
10489 SV * const prop = sv_newmortal();
10490 regprop(prog, prop, p, reginfo, NULL);
10491 Perl_re_exec_indentf( aTHX_ "%s can match %" IVdf " times out of %" IVdf "...\n",
10492 depth, SvPVX_const(prop),(IV)c,(IV)max);
10500 - reginclass - determine if a character falls into a character class
10502 n is the ANYOF-type regnode
10503 p is the target string
10504 p_end points to one byte beyond the end of the target string
10505 utf8_target tells whether p is in UTF-8.
10507 Returns true if matched; false otherwise.
10509 Note that this can be a synthetic start class, a combination of various
10510 nodes, so things you think might be mutually exclusive, such as locale,
10511 aren't. It can match both locale and non-locale
10516 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
10518 const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs))
10521 bool match = FALSE;
10524 PERL_ARGS_ASSERT_REGINCLASS;
10526 /* If c is not already the code point, get it. Note that
10527 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
10528 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
10530 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
10531 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
10532 if (c_len == (STRLEN)-1) {
10533 _force_out_malformed_utf8_message(p, p_end,
10535 1 /* 1 means die */ );
10536 NOT_REACHED; /* NOTREACHED */
10539 && (OP(n) == ANYOFL || OP(n) == ANYOFPOSIXL)
10540 && ! ANYOFL_UTF8_LOCALE_REQD(flags))
10542 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
10546 /* If this character is potentially in the bitmap, check it */
10547 if (c < NUM_ANYOF_CODE_POINTS && ! inRANGE(OP(n), ANYOFH, ANYOFHb)) {
10548 if (ANYOF_BITMAP_TEST(n, c))
10551 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10558 else if (flags & ANYOF_LOCALE_FLAGS) {
10559 if ( (flags & ANYOFL_FOLD)
10561 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
10565 else if ( ANYOF_POSIXL_TEST_ANY_SET(n)
10566 && c <= U8_MAX /* param to isFOO_lc() */
10569 /* The data structure is arranged so bits 0, 2, 4, ... are set
10570 * if the class includes the Posix character class given by
10571 * bit/2; and 1, 3, 5, ... are set if the class includes the
10572 * complemented Posix class given by int(bit/2). So we loop
10573 * through the bits, each time changing whether we complement
10574 * the result or not. Suppose for the sake of illustration
10575 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
10576 * is set, it means there is a match for this ANYOF node if the
10577 * character is in the class given by the expression (0 / 2 = 0
10578 * = \w). If it is in that class, isFOO_lc() will return 1,
10579 * and since 'to_complement' is 0, the result will stay TRUE,
10580 * and we exit the loop. Suppose instead that bit 0 is 0, but
10581 * bit 1 is 1. That means there is a match if the character
10582 * matches \W. We won't bother to call isFOO_lc() on bit 0,
10583 * but will on bit 1. On the second iteration 'to_complement'
10584 * will be 1, so the exclusive or will reverse things, so we
10585 * are testing for \W. On the third iteration, 'to_complement'
10586 * will be 0, and we would be testing for \s; the fourth
10587 * iteration would test for \S, etc.
10589 * Note that this code assumes that all the classes are closed
10590 * under folding. For example, if a character matches \w, then
10591 * its fold does too; and vice versa. This should be true for
10592 * any well-behaved locale for all the currently defined Posix
10593 * classes, except for :lower: and :upper:, which are handled
10594 * by the pseudo-class :cased: which matches if either of the
10595 * other two does. To get rid of this assumption, an outer
10596 * loop could be used below to iterate over both the source
10597 * character, and its fold (if different) */
10600 int to_complement = 0;
10602 while (count < ANYOF_MAX) {
10603 if (ANYOF_POSIXL_TEST(n, count)
10604 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
10610 to_complement ^= 1;
10617 /* If the bitmap didn't (or couldn't) match, and something outside the
10618 * bitmap could match, try that. */
10620 if (c >= NUM_ANYOF_CODE_POINTS
10621 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
10623 match = TRUE; /* Everything above the bitmap matches */
10625 /* Here doesn't match everything above the bitmap. If there is
10626 * some information available beyond the bitmap, we may find a
10627 * match in it. If so, this is most likely because the code point
10628 * is outside the bitmap range. But rarely, it could be because of
10629 * some other reason. If so, various flags are set to indicate
10630 * this possibility. On ANYOFD nodes, there may be matches that
10631 * happen only when the target string is UTF-8; or for other node
10632 * types, because runtime lookup is needed, regardless of the
10633 * UTF-8ness of the target string. Finally, under /il, there may
10634 * be some matches only possible if the locale is a UTF-8 one. */
10635 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
10636 && ( c >= NUM_ANYOF_CODE_POINTS
10637 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
10638 && ( UNLIKELY(OP(n) != ANYOFD)
10639 || (utf8_target && ! isASCII_uni(c)
10640 # if NUM_ANYOF_CODE_POINTS > 256
10644 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
10645 && IN_UTF8_CTYPE_LOCALE)))
10647 SV* only_utf8_locale = NULL;
10648 SV * const definition =
10649 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
10650 get_regclass_nonbitmap_data(prog, n, TRUE, 0,
10651 &only_utf8_locale, NULL);
10653 get_re_gclass_nonbitmap_data(prog, n, TRUE, 0,
10654 &only_utf8_locale, NULL);
10661 } else { /* Convert to utf8 */
10662 utf8_p = utf8_buffer;
10663 append_utf8_from_native_byte(*p, &utf8_p);
10664 utf8_p = utf8_buffer;
10667 /* Turkish locales have these hard-coded rules overriding
10669 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10670 && isALPHA_FOLD_EQ(*p, 'i'))
10673 if (_invlist_contains_cp(definition,
10674 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
10679 else if (*p == 'I') {
10680 if (_invlist_contains_cp(definition,
10681 LATIN_SMALL_LETTER_DOTLESS_I))
10687 else if (_invlist_contains_cp(definition, c)) {
10691 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
10692 match = _invlist_contains_cp(only_utf8_locale, c);
10696 /* In a Turkic locale under folding, hard-code the I i case pair
10698 if ( UNLIKELY(PL_in_utf8_turkic_locale)
10700 && (flags & ANYOFL_FOLD)
10703 if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10704 if (ANYOF_BITMAP_TEST(n, 'i')) {
10708 else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
10709 if (ANYOF_BITMAP_TEST(n, 'I')) {
10715 if (UNICODE_IS_SUPER(c)
10717 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
10719 && ckWARN_d(WARN_NON_UNICODE))
10721 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
10722 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
10726 #if ANYOF_INVERT != 1
10727 /* Depending on compiler optimization cBOOL takes time, so if don't have to
10729 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
10732 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
10733 return (flags & ANYOF_INVERT) ^ match;
10737 S_reghop3(U8 *s, SSize_t off, const U8* lim)
10739 /* return the position 'off' UTF-8 characters away from 's', forward if
10740 * 'off' >= 0, backwards if negative. But don't go outside of position
10741 * 'lim', which better be < s if off < 0 */
10743 PERL_ARGS_ASSERT_REGHOP3;
10746 while (off-- && s < lim) {
10747 /* XXX could check well-formedness here */
10748 U8 *new_s = s + UTF8SKIP(s);
10749 if (new_s > lim) /* lim may be in the middle of a long character */
10755 while (off++ && s > lim) {
10757 if (UTF8_IS_CONTINUED(*s)) {
10758 while (s > lim && UTF8_IS_CONTINUATION(*s))
10760 if (! UTF8_IS_START(*s)) {
10761 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10764 /* XXX could check well-formedness here */
10771 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
10773 PERL_ARGS_ASSERT_REGHOP4;
10776 while (off-- && s < rlim) {
10777 /* XXX could check well-formedness here */
10782 while (off++ && s > llim) {
10784 if (UTF8_IS_CONTINUED(*s)) {
10785 while (s > llim && UTF8_IS_CONTINUATION(*s))
10787 if (! UTF8_IS_START(*s)) {
10788 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10791 /* XXX could check well-formedness here */
10797 /* like reghop3, but returns NULL on overrun, rather than returning last
10801 S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
10803 PERL_ARGS_ASSERT_REGHOPMAYBE3;
10806 while (off-- && s < lim) {
10807 /* XXX could check well-formedness here */
10814 while (off++ && s > lim) {
10816 if (UTF8_IS_CONTINUED(*s)) {
10817 while (s > lim && UTF8_IS_CONTINUATION(*s))
10819 if (! UTF8_IS_START(*s)) {
10820 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
10823 /* XXX could check well-formedness here */
10832 /* when executing a regex that may have (?{}), extra stuff needs setting
10833 up that will be visible to the called code, even before the current
10834 match has finished. In particular:
10836 * $_ is localised to the SV currently being matched;
10837 * pos($_) is created if necessary, ready to be updated on each call-out
10839 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
10840 isn't set until the current pattern is successfully finished), so that
10841 $1 etc of the match-so-far can be seen;
10842 * save the old values of subbeg etc of the current regex, and set then
10843 to the current string (again, this is normally only done at the end
10848 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
10851 regexp *const rex = ReANY(reginfo->prog);
10852 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
10854 eval_state->rex = rex;
10855 eval_state->sv = reginfo->sv;
10858 /* Make $_ available to executed code. */
10859 if (reginfo->sv != DEFSV) {
10861 DEFSV_set(reginfo->sv);
10863 /* will be dec'd by S_cleanup_regmatch_info_aux */
10864 SvREFCNT_inc_NN(reginfo->sv);
10866 if (!(mg = mg_find_mglob(reginfo->sv))) {
10867 /* prepare for quick setting of pos */
10868 mg = sv_magicext_mglob(reginfo->sv);
10871 eval_state->pos_magic = mg;
10872 eval_state->pos = mg->mg_len;
10873 eval_state->pos_flags = mg->mg_flags;
10876 eval_state->pos_magic = NULL;
10878 if (!PL_reg_curpm) {
10879 /* PL_reg_curpm is a fake PMOP that we can attach the current
10880 * regex to and point PL_curpm at, so that $1 et al are visible
10881 * within a /(?{})/. It's just allocated once per interpreter the
10882 * first time its needed */
10883 Newxz(PL_reg_curpm, 1, PMOP);
10884 #ifdef USE_ITHREADS
10886 SV* const repointer = &PL_sv_undef;
10887 /* this regexp is also owned by the new PL_reg_curpm, which
10888 will try to free it. */
10889 av_push(PL_regex_padav, repointer);
10890 PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav);
10891 PL_regex_pad = AvARRAY(PL_regex_padav);
10895 SET_reg_curpm(reginfo->prog);
10896 eval_state->curpm = PL_curpm;
10897 PL_curpm_under = PL_curpm;
10898 PL_curpm = PL_reg_curpm;
10899 if (RXp_MATCH_COPIED(rex)) {
10900 /* Here is a serious problem: we cannot rewrite subbeg,
10901 since it may be needed if this match fails. Thus
10902 $` inside (?{}) could fail... */
10903 eval_state->subbeg = rex->subbeg;
10904 eval_state->sublen = rex->sublen;
10905 eval_state->suboffset = rex->suboffset;
10906 eval_state->subcoffset = rex->subcoffset;
10907 #ifdef PERL_ANY_COW
10908 eval_state->saved_copy = rex->saved_copy;
10910 RXp_MATCH_COPIED_off(rex);
10913 eval_state->subbeg = NULL;
10914 rex->subbeg = (char *)reginfo->strbeg;
10915 rex->suboffset = 0;
10916 rex->subcoffset = 0;
10917 rex->sublen = reginfo->strend - reginfo->strbeg;
10921 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
10924 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
10926 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
10927 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
10930 Safefree(aux->poscache);
10934 /* undo the effects of S_setup_eval_state() */
10936 if (eval_state->subbeg) {
10937 regexp * const rex = eval_state->rex;
10938 rex->subbeg = eval_state->subbeg;
10939 rex->sublen = eval_state->sublen;
10940 rex->suboffset = eval_state->suboffset;
10941 rex->subcoffset = eval_state->subcoffset;
10942 #ifdef PERL_ANY_COW
10943 rex->saved_copy = eval_state->saved_copy;
10945 RXp_MATCH_COPIED_on(rex);
10947 if (eval_state->pos_magic)
10949 eval_state->pos_magic->mg_len = eval_state->pos;
10950 eval_state->pos_magic->mg_flags =
10951 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
10952 | (eval_state->pos_flags & MGf_BYTES);
10955 PL_curpm = eval_state->curpm;
10956 SvREFCNT_dec(eval_state->sv);
10959 PL_regmatch_state = aux->old_regmatch_state;
10960 PL_regmatch_slab = aux->old_regmatch_slab;
10962 /* free all slabs above current one - this must be the last action
10963 * of this function, as aux and eval_state are allocated within
10964 * slabs and may be freed here */
10966 s = PL_regmatch_slab->next;
10968 PL_regmatch_slab->next = NULL;
10970 regmatch_slab * const osl = s;
10979 S_to_utf8_substr(pTHX_ regexp *prog)
10981 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
10982 * on the converted value */
10986 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
10989 if (prog->substrs->data[i].substr
10990 && !prog->substrs->data[i].utf8_substr) {
10991 SV* const sv = newSVsv(prog->substrs->data[i].substr);
10992 prog->substrs->data[i].utf8_substr = sv;
10993 sv_utf8_upgrade(sv);
10994 if (SvVALID(prog->substrs->data[i].substr)) {
10995 if (SvTAIL(prog->substrs->data[i].substr)) {
10996 /* Trim the trailing \n that fbm_compile added last
10998 SvCUR_set(sv, SvCUR(sv) - 1);
10999 /* Whilst this makes the SV technically "invalid" (as its
11000 buffer is no longer followed by "\0") when fbm_compile()
11001 adds the "\n" back, a "\0" is restored. */
11002 fbm_compile(sv, FBMcf_TAIL);
11004 fbm_compile(sv, 0);
11006 if (prog->substrs->data[i].substr == prog->check_substr)
11007 prog->check_utf8 = sv;
11013 S_to_byte_substr(pTHX_ regexp *prog)
11015 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
11016 * on the converted value; returns FALSE if can't be converted. */
11020 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
11023 if (prog->substrs->data[i].utf8_substr
11024 && !prog->substrs->data[i].substr) {
11025 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
11026 if (! sv_utf8_downgrade(sv, TRUE)) {
11027 SvREFCNT_dec_NN(sv);
11030 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
11031 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
11032 /* Trim the trailing \n that fbm_compile added last
11034 SvCUR_set(sv, SvCUR(sv) - 1);
11035 fbm_compile(sv, FBMcf_TAIL);
11037 fbm_compile(sv, 0);
11039 prog->substrs->data[i].substr = sv;
11040 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
11041 prog->check_substr = sv;
11048 #ifndef PERL_IN_XSUB_RE
11051 Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
11053 /* Temporary helper function for toke.c. Verify that the code point 'cp'
11054 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
11055 * the larger string bounded by 'strbeg' and 'strend'.
11057 * 'cp' needs to be assigned (if not, a future version of the Unicode
11058 * Standard could make it something that combines with adjacent characters,
11059 * so code using it would then break), and there has to be a GCB break
11060 * before and after the character. */
11063 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
11064 const U8 * prev_cp_start;
11066 PERL_ARGS_ASSERT_IS_GRAPHEME;
11068 if ( UNLIKELY(UNICODE_IS_SUPER(cp))
11069 || UNLIKELY(UNICODE_IS_NONCHAR(cp)))
11071 /* These are considered graphemes */
11075 /* Otherwise, unassigned code points are forbidden */
11076 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
11077 _invlist_search(PL_Assigned_invlist, cp))))
11082 cp_gcb_val = getGCB_VAL_CP(cp);
11084 /* Find the GCB value of the previous code point in the input */
11085 prev_cp_start = utf8_hop_back(s, -1, strbeg);
11086 if (UNLIKELY(prev_cp_start == s)) {
11087 prev_cp_gcb_val = GCB_EDGE;
11090 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
11093 /* And check that is a grapheme boundary */
11094 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
11095 TRUE /* is UTF-8 encoded */ ))
11100 /* Similarly verify there is a break between the current character and the
11104 next_cp_gcb_val = GCB_EDGE;
11107 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
11110 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
11114 =for apidoc_section $unicode
11116 =for apidoc isSCRIPT_RUN
11118 Returns a bool as to whether or not the sequence of bytes from C<s> up to but
11119 not including C<send> form a "script run". C<utf8_target> is TRUE iff the
11120 sequence starting at C<s> is to be treated as UTF-8. To be precise, except for
11121 two degenerate cases given below, this function returns TRUE iff all code
11122 points in it come from any combination of three "scripts" given by the Unicode
11123 "Script Extensions" property: Common, Inherited, and possibly one other.
11124 Additionally all decimal digits must come from the same consecutive sequence of
11127 For example, if all the characters in the sequence are Greek, or Common, or
11128 Inherited, this function will return TRUE, provided any decimal digits in it
11129 are from the same block of digits in Common. (These are the ASCII digits
11130 "0".."9" and additionally a block for full width forms of these, and several
11131 others used in mathematical notation.) For scripts (unlike Greek) that have
11132 their own digits defined this will accept either digits from that set or from
11133 one of the Common digit sets, but not a combination of the two. Some scripts,
11134 such as Arabic, have more than one set of digits. All digits must come from
11135 the same set for this function to return TRUE.
11137 C<*ret_script>, if C<ret_script> is not NULL, will on return of TRUE
11138 contain the script found, using the C<SCX_enum> typedef. Its value will be
11139 C<SCX_INVALID> if the function returns FALSE.
11141 If the sequence is empty, TRUE is returned, but C<*ret_script> (if asked for)
11142 will be C<SCX_INVALID>.
11144 If the sequence contains a single code point which is unassigned to a character
11145 in the version of Unicode being used, the function will return TRUE, and the
11146 script will be C<SCX_Unknown>. Any other combination of unassigned code points
11147 in the input sequence will result in the function treating the input as not
11148 being a script run.
11150 The returned script will be C<SCX_Inherited> iff all the code points in it are
11151 from the Inherited script.
11153 Otherwise, the returned script will be C<SCX_Common> iff all the code points in
11154 it are from the Inherited or Common scripts.
11161 Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target)
11163 /* Basically, it looks at each character in the sequence to see if the
11164 * above conditions are met; if not it fails. It uses an inversion map to
11165 * find the enum corresponding to the script of each character. But this
11166 * is complicated by the fact that a few code points can be in any of
11167 * several scripts. The data has been constructed so that there are
11168 * additional enum values (all negative) for these situations. The
11169 * absolute value of those is an index into another table which contains
11170 * pointers to auxiliary tables for each such situation. Each aux array
11171 * lists all the scripts for the given situation. There is another,
11172 * parallel, table that gives the number of entries in each aux table.
11173 * These are all defined in charclass_invlists.h */
11175 /* XXX Here are the additional things UTS 39 says could be done:
11177 * Forbid sequences of the same nonspacing mark
11179 * Check to see that all the characters are in the sets of exemplar
11180 * characters for at least one language in the Unicode Common Locale Data
11181 * Repository [CLDR]. */
11184 /* Things that match /\d/u */
11185 SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT];
11186 UV * decimals_array = invlist_array(decimals_invlist);
11188 /* What code point is the digit '0' of the script run? (0 meaning FALSE if
11189 * not currently known) */
11190 UV zero_of_run = 0;
11192 SCX_enum script_of_run = SCX_INVALID; /* Illegal value */
11193 SCX_enum script_of_char = SCX_INVALID;
11195 /* If the script remains not fully determined from iteration to iteration,
11196 * this is the current intersection of the possiblities. */
11197 SCX_enum * intersection = NULL;
11198 PERL_UINT_FAST8_T intersection_len = 0;
11200 bool retval = TRUE;
11201 SCX_enum * ret_script = NULL;
11205 PERL_ARGS_ASSERT_ISSCRIPT_RUN;
11207 /* All code points in 0..255 are either Common or Latin, so must be a
11208 * script run. We can return immediately unless we need to know which
11210 if (! utf8_target && LIKELY(send > s)) {
11211 if (ret_script == NULL) {
11215 /* If any character is Latin, the run is Latin */
11217 if (isALPHA_L1(*s) && LIKELY(*s != MICRO_SIGN_NATIVE)) {
11218 *ret_script = SCX_Latin;
11223 /* Here, all are Common */
11224 *ret_script = SCX_Common;
11228 /* Look at each character in the sequence */
11230 /* If the current character being examined is a digit, this is the code
11231 * point of the zero for its sequence of 10 */
11236 /* The code allows all scripts to use the ASCII digits. This is
11237 * because they are in the Common script. Hence any ASCII ones found
11238 * are ok, unless and until a digit from another set has already been
11239 * encountered. digit ranges in Common are not similarly blessed) */
11240 if (UNLIKELY(isDIGIT(*s))) {
11241 if (UNLIKELY(script_of_run == SCX_Unknown)) {
11246 if (zero_of_run != '0') {
11258 /* Here, isn't an ASCII digit. Find the code point of the character */
11259 if (! UTF8_IS_INVARIANT(*s)) {
11261 cp = valid_utf8_to_uvchr((U8 *) s, &len);
11268 /* If is within the range [+0 .. +9] of the script's zero, it also is a
11269 * digit in that script. We can skip the rest of this code for this
11271 if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) {
11275 /* Find the character's script. The correct values are hard-coded here
11276 * for small-enough code points. */
11277 if (cp < 0x2B9) { /* From inspection of Unicode db; extremely
11278 unlikely to change */
11280 || ( isALPHA_L1(cp)
11281 && LIKELY(cp != MICRO_SIGN_NATIVE)))
11283 script_of_char = SCX_Latin;
11286 script_of_char = SCX_Common;
11290 script_of_char = _Perl_SCX_invmap[
11291 _invlist_search(PL_SCX_invlist, cp)];
11294 /* We arbitrarily accept a single unassigned character, but not in
11295 * combination with anything else, and not a run of them. */
11296 if ( UNLIKELY(script_of_run == SCX_Unknown)
11297 || UNLIKELY( script_of_run != SCX_INVALID
11298 && script_of_char == SCX_Unknown))
11304 /* For the first character, or the run is inherited, the run's script
11305 * is set to the char's */
11306 if ( UNLIKELY(script_of_run == SCX_INVALID)
11307 || UNLIKELY(script_of_run == SCX_Inherited))
11309 script_of_run = script_of_char;
11312 /* For the character's script to be Unknown, it must be the first
11313 * character in the sequence (for otherwise a test above would have
11314 * prevented us from reaching here), and we have set the run's script
11315 * to it. Nothing further to be done for this character */
11316 if (UNLIKELY(script_of_char == SCX_Unknown)) {
11320 /* We accept 'inherited' script characters currently even at the
11321 * beginning. (We know that no characters in Inherited are digits, or
11322 * we'd have to check for that) */
11323 if (UNLIKELY(script_of_char == SCX_Inherited)) {
11327 /* If the run so far is Common, and the new character isn't, change the
11328 * run's script to that of this character */
11329 if (script_of_run == SCX_Common && script_of_char != SCX_Common) {
11330 script_of_run = script_of_char;
11333 /* Now we can see if the script of the new character is the same as
11334 * that of the run */
11335 if (LIKELY(script_of_char == script_of_run)) {
11336 /* By far the most common case */
11337 goto scripts_match;
11340 /* Here, the script of the run isn't Common. But characters in Common
11341 * match any script */
11342 if (script_of_char == SCX_Common) {
11343 goto scripts_match;
11346 #ifndef HAS_SCX_AUX_TABLES
11348 /* Too early a Unicode version to have a code point belonging to more
11349 * than one script, so, if the scripts don't exactly match, fail */
11350 PERL_UNUSED_VAR(intersection_len);
11356 /* Here there is no exact match between the character's script and the
11357 * run's. And we've handled the special cases of scripts Unknown,
11358 * Inherited, and Common.
11360 * Negative script numbers signify that the value may be any of several
11361 * scripts, and we need to look at auxiliary information to make our
11362 * deterimination. But if both are non-negative, we can fail now */
11363 if (LIKELY(script_of_char >= 0)) {
11364 const SCX_enum * search_in;
11365 PERL_UINT_FAST8_T search_in_len;
11366 PERL_UINT_FAST8_T i;
11368 if (LIKELY(script_of_run >= 0)) {
11373 /* Use the previously constructed set of possible scripts, if any.
11375 if (intersection) {
11376 search_in = intersection;
11377 search_in_len = intersection_len;
11380 search_in = SCX_AUX_TABLE_ptrs[-script_of_run];
11381 search_in_len = SCX_AUX_TABLE_lengths[-script_of_run];
11384 for (i = 0; i < search_in_len; i++) {
11385 if (search_in[i] == script_of_char) {
11386 script_of_run = script_of_char;
11387 goto scripts_match;
11394 else if (LIKELY(script_of_run >= 0)) {
11395 /* script of character could be one of several, but run is a single
11397 const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char];
11398 const PERL_UINT_FAST8_T search_in_len
11399 = SCX_AUX_TABLE_lengths[-script_of_char];
11400 PERL_UINT_FAST8_T i;
11402 for (i = 0; i < search_in_len; i++) {
11403 if (search_in[i] == script_of_run) {
11404 script_of_char = script_of_run;
11405 goto scripts_match;
11413 /* Both run and char could be in one of several scripts. If the
11414 * intersection is empty, then this character isn't in this script
11415 * run. Otherwise, we need to calculate the intersection to use
11416 * for future iterations of the loop, unless we are already at the
11417 * final character */
11418 const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char];
11419 const PERL_UINT_FAST8_T char_len
11420 = SCX_AUX_TABLE_lengths[-script_of_char];
11421 const SCX_enum * search_run;
11422 PERL_UINT_FAST8_T run_len;
11424 SCX_enum * new_overlap = NULL;
11425 PERL_UINT_FAST8_T i, j;
11427 if (intersection) {
11428 search_run = intersection;
11429 run_len = intersection_len;
11432 search_run = SCX_AUX_TABLE_ptrs[-script_of_run];
11433 run_len = SCX_AUX_TABLE_lengths[-script_of_run];
11436 intersection_len = 0;
11438 for (i = 0; i < run_len; i++) {
11439 for (j = 0; j < char_len; j++) {
11440 if (search_run[i] == search_char[j]) {
11442 /* Here, the script at i,j matches. That means this
11443 * character is in the run. But continue on to find
11444 * the complete intersection, for the next loop
11445 * iteration, and for the digit check after it.
11447 * On the first found common script, we malloc space
11448 * for the intersection list for the worst case of the
11449 * intersection, which is the minimum of the number of
11450 * scripts remaining in each set. */
11451 if (intersection_len == 0) {
11453 MIN(run_len - i, char_len - j),
11456 new_overlap[intersection_len++] = search_run[i];
11461 /* Here we've looked through everything. If they have no scripts
11462 * in common, not a run */
11463 if (intersection_len == 0) {
11468 /* If there is only a single script in common, set to that.
11469 * Otherwise, use the intersection going forward */
11470 Safefree(intersection);
11471 intersection = NULL;
11472 if (intersection_len == 1) {
11473 script_of_run = script_of_char = new_overlap[0];
11474 Safefree(new_overlap);
11475 new_overlap = NULL;
11478 intersection = new_overlap;
11486 /* Here, the script of the character is compatible with that of the
11487 * run. That means that in most cases, it continues the script run.
11488 * Either it and the run match exactly, or one or both can be in any of
11489 * several scripts, and the intersection is not empty. However, if the
11490 * character is a decimal digit, it could still mean failure if it is
11491 * from the wrong sequence of 10. So, we need to look at if it's a
11492 * digit. We've already handled the 10 digits [0-9], and the next
11493 * lowest one is this one: */
11494 if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) {
11495 continue; /* Not a digit; this character is part of the run */
11498 /* If we have a definitive '0' for the script of this character, we
11499 * know that for this to be a digit, it must be in the range of +0..+9
11501 if ( script_of_char >= 0
11502 && (zero_of_char = script_zeros[script_of_char]))
11504 if (! withinCOUNT(cp, zero_of_char, 9)) {
11505 continue; /* Not a digit; this character is part of the run
11510 else { /* Need to look up if this character is a digit or not */
11511 SSize_t index_of_zero_of_char;
11512 index_of_zero_of_char = _invlist_search(decimals_invlist, cp);
11513 if ( UNLIKELY(index_of_zero_of_char < 0)
11514 || ! ELEMENT_RANGE_MATCHES_INVLIST(index_of_zero_of_char))
11516 continue; /* Not a digit; this character is part of the run.
11520 zero_of_char = decimals_array[index_of_zero_of_char];
11523 /* Here, the character is a decimal digit, and the zero of its sequence
11524 * of 10 is in 'zero_of_char'. If we already have a zero for this run,
11525 * they better be the same. */
11527 if (zero_of_run != zero_of_char) {
11532 else { /* Otherwise we now have a zero for this run */
11533 zero_of_run = zero_of_char;
11535 } /* end of looping through CLOSESR text */
11537 Safefree(intersection);
11539 if (ret_script != NULL) {
11541 *ret_script = script_of_run;
11544 *ret_script = SCX_INVALID;
11551 #endif /* ifndef PERL_IN_XSUB_RE */
11554 * ex: set ts=8 sts=4 sw=4 et: