This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Perl_sv_vcatpvfn_flags: handle %.NNNg case earlier
[perl5.git] / regexec.c
CommitLineData
a0d0e21e
LW
1/* regexec.c
2 */
3
4/*
f65819ce
CO
5 * One Ring to rule them all, One Ring to find them
6 *
4ac71550
TC
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
a0d0e21e
LW
10 */
11
61296642
DM
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
166f8a29 14 * a regular expression.
e4a054ea
DM
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
20 */
21
a687059c
LW
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
e50aee73
AD
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
b9d5759e 36#ifdef PERL_EXT_RE_BUILD
54df2634 37#include "re_top.h"
9041c2e3 38#endif
56953603 39
a687059c 40/*
e50aee73 41 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
42 *
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
45 *
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
49 *
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
52 * from defects in it.
53 *
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
56 *
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
59 *
60 **** Alterations to Henry's code are...
61 ****
4bb101f2 62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
a687059c 65 ****
9ef589d8
LW
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGEXEC_C
a687059c 75#include "perl.h"
0f5d15d6 76
54df2634
NC
77#ifdef PERL_IN_XSUB_RE
78# include "re_comp.h"
79#else
80# include "regcomp.h"
81#endif
a687059c 82
b992490d 83#include "invlist_inline.h"
1b0f46bf 84#include "unicode_constants.h"
81e983c1 85
bbac6b20
KW
86#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
87 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
88
a0bd1a30
KW
89static const char utf8_locale_required[] =
90 "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
91
e1cf74e3
CB
92#ifdef DEBUGGING
93/* At least one required character in the target string is expressible only in
94 * UTF-8. */
95static const char* const non_utf8_target_but_utf8_required
96 = "Can't match, because target string needs to be in UTF-8\n";
97#endif
98
7b031478 99#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
6ad9a8ab 100 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
7b031478 101 goto target; \
e1cf74e3
CB
102} STMT_END
103
c74f6de9
KW
104#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105
a687059c
LW
106#ifndef STATIC
107#define STATIC static
108#endif
109
451c6e0b
KW
110/* Valid only if 'c', the character being looke-up, is an invariant under
111 * UTF-8: it avoids the reginclass call if there are no complications: i.e., if
112 * everything matchable is straight forward in the bitmap */
113#define REGINCLASS(prog,p,c,u) (ANYOF_FLAGS(p) \
114 ? reginclass(prog,p,c,c+1,u) \
115 : ANYOF_BITMAP_TEST(p,*(c)))
7d3e948e 116
c277df42
IZ
117/*
118 * Forwards.
119 */
120
f2ed9b32 121#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 122
3dab1dad 123#define HOPc(pos,off) \
ba44c216 124 (char *)(reginfo->is_utf8_target \
220db18a 125 ? reghop3((U8*)pos, off, \
9d9163fb 126 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
3dab1dad 127 : (U8*)(pos + off))
557f47af 128
3dab1dad 129#define HOPBACKc(pos, off) \
ba44c216 130 (char*)(reginfo->is_utf8_target \
c708944d 131 ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \
9d9163fb 132 : (pos - off >= reginfo->strbeg) \
8e11feef 133 ? (U8*)pos - off \
3dab1dad 134 : NULL)
efb30f32 135
ba44c216 136#define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
1aa99e6b 137#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
1aa99e6b 138
557f47af
DM
139/* lim must be +ve. Returns NULL on overshoot */
140#define HOPMAYBE3(pos,off,lim) \
141 (reginfo->is_utf8_target \
142 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
143 : ((U8*)pos + off <= lim) \
144 ? (U8*)pos + off \
145 : NULL)
146
8e9f2289
DM
147/* like HOP3, but limits the result to <= lim even for the non-utf8 case.
148 * off must be >=0; args should be vars rather than expressions */
149#define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
150 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
151 : (U8*)((pos + off) > lim ? lim : (pos + off)))
67853908 152#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim))
8e9f2289 153
2974eaec
DM
154#define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
155 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
156 : (U8*)(pos + off))
157#define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
7016d6eb
DM
158
159#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
160#define NEXTCHR_IS_EOS (nextchr < 0)
161
162#define SET_nextchr \
220db18a 163 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
7016d6eb
DM
164
165#define SET_locinput(p) \
166 locinput = (p); \
167 SET_nextchr
168
169
2a16ac92 170#define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
c7304fe2
KW
171 if (!swash_ptr) { \
172 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
c7304fe2 173 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
2a16ac92 174 1, 0, invlist, &flags); \
c7304fe2
KW
175 assert(swash_ptr); \
176 } \
177 } STMT_END
178
179/* If in debug mode, we test that a known character properly matches */
180#ifdef DEBUGGING
181# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
182 property_name, \
2a16ac92 183 invlist, \
c7304fe2 184 utf8_char_in_property) \
2a16ac92 185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
c7304fe2
KW
186 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
187#else
188# define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
189 property_name, \
2a16ac92 190 invlist, \
c7304fe2 191 utf8_char_in_property) \
2a16ac92 192 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
c7304fe2 193#endif
d1eb3177 194
c7304fe2
KW
195#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
196 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
2a16ac92
KW
197 "", \
198 PL_XPosix_ptrs[_CC_WORDCHAR], \
0766489e 199 LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
c7304fe2 200
c7304fe2 201#define PLACEHOLDER /* Something for the preprocessor to grab onto */
3dab1dad
YO
202/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
203
5f80c4cf 204/* for use after a quantifier and before an EXACT-like node -- japhy */
c35dcbe2
YO
205/* it would be nice to rework regcomp.sym to generate this stuff. sigh
206 *
207 * NOTE that *nothing* that affects backtracking should be in here, specifically
208 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
209 * node that is in between two EXACT like nodes when ascertaining what the required
210 * "follow" character is. This should probably be moved to regex compile time
211 * although it may be done at run time beause of the REF possibility - more
212 * investigation required. -- demerphq
213*/
baa60164
KW
214#define JUMPABLE(rn) ( \
215 OP(rn) == OPEN || \
24be3102
YO
216 (OP(rn) == CLOSE && \
217 !EVAL_CLOSE_PAREN_IS(cur_eval,ARG(rn)) ) || \
baa60164
KW
218 OP(rn) == EVAL || \
219 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
220 OP(rn) == PLUS || OP(rn) == MINMOD || \
221 OP(rn) == KEEPS || \
222 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
e2d8ce26 223)
ee9b8eae 224#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
e2d8ce26 225
ee9b8eae
YO
226#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
227
228#if 0
229/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
a4525e78 230 we don't need this definition. XXX These are now out-of-sync*/
ee9b8eae 231#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
098b07d5 232#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
ee9b8eae
YO
233#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
234
235#else
236/* ... so we use this as its faster. */
a4525e78
KW
237#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
238#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
ee9b8eae
YO
239#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
240#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
241
242#endif
e2d8ce26 243
a84d97b6
HS
244/*
245 Search for mandatory following text node; for lookahead, the text must
246 follow but for lookbehind (rn->flags != 0) we skip to the next step.
247*/
baa60164 248#define FIND_NEXT_IMPT(rn) STMT_START { \
3dab1dad
YO
249 while (JUMPABLE(rn)) { \
250 const OPCODE type = OP(rn); \
251 if (type == SUSPEND || PL_regkind[type] == CURLY) \
e2d8ce26 252 rn = NEXTOPER(NEXTOPER(rn)); \
3dab1dad 253 else if (type == PLUS) \
cca55fe3 254 rn = NEXTOPER(rn); \
3dab1dad 255 else if (type == IFMATCH) \
a84d97b6 256 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
e2d8ce26 257 else rn += NEXT_OFF(rn); \
3dab1dad 258 } \
5f80c4cf 259} STMT_END
74750237 260
006f26b2
DM
261#define SLAB_FIRST(s) (&(s)->states[0])
262#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
263
a75351a1 264static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
bf2039a9 265static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
bf2039a9 266static regmatch_state * S_push_slab(pTHX);
51371543 267
87c0511b 268#define REGCP_PAREN_ELEMS 3
f067efbf 269#define REGCP_OTHER_ELEMS 3
e0fa7e2b 270#define REGCP_FRAME_ELEMS 1
620d5b66
NC
271/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
272 * are needed for the regexp context stack bookkeeping. */
273
76e3520e 274STATIC CHECKPOINT
21553840 275S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH)
a0d0e21e 276{
a3b680e6 277 const int retval = PL_savestack_ix;
92da3157
DM
278 const int paren_elems_to_push =
279 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
e0fa7e2b
NC
280 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
281 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
87c0511b 282 I32 p;
40a82448 283 GET_RE_DEBUG_FLAGS_DECL;
a0d0e21e 284
b93070ed
DM
285 PERL_ARGS_ASSERT_REGCPPUSH;
286
e49a9654 287 if (paren_elems_to_push < 0)
e8a85d26
JH
288 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
289 (int)paren_elems_to_push, (int)maxopenparen,
290 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
e49a9654 291
e0fa7e2b 292 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
147e3846 293 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %" UVuf
5df417d0 294 " out of range (%lu-%ld)",
92da3157
DM
295 total_elems,
296 (unsigned long)maxopenparen,
297 (long)parenfloor);
e0fa7e2b 298
620d5b66 299 SSGROW(total_elems + REGCP_FRAME_ELEMS);
7f69552c 300
495f47a5 301 DEBUG_BUFFERS_r(
92da3157 302 if ((int)maxopenparen > (int)parenfloor)
2b1a3689 303 Perl_re_exec_indentf( aTHX_
147e3846 304 "rex=0x%" UVxf " offs=0x%" UVxf ": saving capture indices:\n",
2b1a3689
YO
305 depth,
306 PTR2UV(rex),
495f47a5
DM
307 PTR2UV(rex->offs)
308 );
309 );
92da3157 310 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
b1ce53c5 311/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
99a90e59
FC
312 SSPUSHIV(rex->offs[p].end);
313 SSPUSHIV(rex->offs[p].start);
1ca2007e 314 SSPUSHINT(rex->offs[p].start_tmp);
2b1a3689 315 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 316 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "\n",
2b1a3689
YO
317 depth,
318 (UV)p,
495f47a5
DM
319 (IV)rex->offs[p].start,
320 (IV)rex->offs[p].start_tmp,
321 (IV)rex->offs[p].end
40a82448 322 ));
a0d0e21e 323 }
b1ce53c5 324/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
92da3157 325 SSPUSHINT(maxopenparen);
b93070ed
DM
326 SSPUSHINT(rex->lastparen);
327 SSPUSHINT(rex->lastcloseparen);
e0fa7e2b 328 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
41123dfd 329
a0d0e21e
LW
330 return retval;
331}
332
c277df42 333/* These are needed since we do not localize EVAL nodes: */
ab3bbdeb
YO
334#define REGCP_SET(cp) \
335 DEBUG_STATE_r( \
147e3846
KW
336 Perl_re_exec_indentf( aTHX_ \
337 "Setting an EVAL scope, savestack=%" IVdf ",\n", \
cb41e5d6
YO
338 depth, (IV)PL_savestack_ix \
339 ) \
340 ); \
ab3bbdeb 341 cp = PL_savestack_ix
c3464db5 342
ab3bbdeb 343#define REGCP_UNWIND(cp) \
e4f74956 344 DEBUG_STATE_r( \
cb41e5d6 345 if (cp != PL_savestack_ix) \
147e3846
KW
346 Perl_re_exec_indentf( aTHX_ \
347 "Clearing an EVAL scope, savestack=%" \
348 IVdf "..%" IVdf "\n", \
cb41e5d6
YO
349 depth, (IV)(cp), (IV)PL_savestack_ix \
350 ) \
351 ); \
ab3bbdeb 352 regcpblow(cp)
c277df42 353
a8d1f4b4
DM
354#define UNWIND_PAREN(lp, lcp) \
355 for (n = rex->lastparen; n > lp; n--) \
356 rex->offs[n].end = -1; \
357 rex->lastparen = n; \
358 rex->lastcloseparen = lcp;
359
360
f067efbf 361STATIC void
21553840 362S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH)
a0d0e21e 363{
e0fa7e2b 364 UV i;
87c0511b 365 U32 paren;
a3621e74
YO
366 GET_RE_DEBUG_FLAGS_DECL;
367
7918f24d
NC
368 PERL_ARGS_ASSERT_REGCPPOP;
369
b1ce53c5 370 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
c6bf6a65 371 i = SSPOPUV;
e0fa7e2b
NC
372 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
373 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
b93070ed
DM
374 rex->lastcloseparen = SSPOPINT;
375 rex->lastparen = SSPOPINT;
92da3157 376 *maxopenparen_p = SSPOPINT;
b1ce53c5 377
620d5b66 378 i -= REGCP_OTHER_ELEMS;
b1ce53c5 379 /* Now restore the parentheses context. */
495f47a5
DM
380 DEBUG_BUFFERS_r(
381 if (i || rex->lastparen + 1 <= rex->nparens)
2b1a3689 382 Perl_re_exec_indentf( aTHX_
147e3846 383 "rex=0x%" UVxf " offs=0x%" UVxf ": restoring capture indices to:\n",
2b1a3689
YO
384 depth,
385 PTR2UV(rex),
495f47a5
DM
386 PTR2UV(rex->offs)
387 );
388 );
92da3157 389 paren = *maxopenparen_p;
620d5b66 390 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
ea3daa5d 391 SSize_t tmps;
1ca2007e 392 rex->offs[paren].start_tmp = SSPOPINT;
99a90e59
FC
393 rex->offs[paren].start = SSPOPIV;
394 tmps = SSPOPIV;
b93070ed
DM
395 if (paren <= rex->lastparen)
396 rex->offs[paren].end = tmps;
2b1a3689 397 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 398 " \\%" UVuf ": %" IVdf "(%" IVdf ")..%" IVdf "%s\n",
2b1a3689
YO
399 depth,
400 (UV)paren,
495f47a5
DM
401 (IV)rex->offs[paren].start,
402 (IV)rex->offs[paren].start_tmp,
403 (IV)rex->offs[paren].end,
404 (paren > rex->lastparen ? "(skipped)" : ""));
c277df42 405 );
87c0511b 406 paren--;
a0d0e21e 407 }
daf18116 408#if 1
dafc8851
JH
409 /* It would seem that the similar code in regtry()
410 * already takes care of this, and in fact it is in
411 * a better location to since this code can #if 0-ed out
412 * but the code in regtry() is needed or otherwise tests
413 * requiring null fields (pat.t#187 and split.t#{13,14}
daf18116
JH
414 * (as of patchlevel 7877) will fail. Then again,
415 * this code seems to be necessary or otherwise
225593e1
DM
416 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
417 * --jhi updated by dapm */
b93070ed 418 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
92da3157 419 if (i > *maxopenparen_p)
b93070ed
DM
420 rex->offs[i].start = -1;
421 rex->offs[i].end = -1;
2b1a3689 422 DEBUG_BUFFERS_r( Perl_re_exec_indentf( aTHX_
147e3846 423 " \\%" UVuf ": %s ..-1 undeffing\n",
2b1a3689
YO
424 depth,
425 (UV)i,
92da3157 426 (i > *maxopenparen_p) ? "-1" : " "
495f47a5 427 ));
a0d0e21e 428 }
dafc8851 429#endif
a0d0e21e
LW
430}
431
74088413
DM
432/* restore the parens and associated vars at savestack position ix,
433 * but without popping the stack */
434
435STATIC void
21553840 436S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH)
74088413
DM
437{
438 I32 tmpix = PL_savestack_ix;
85882954
YO
439 PERL_ARGS_ASSERT_REGCP_RESTORE;
440
74088413 441 PL_savestack_ix = ix;
21553840 442 regcppop(rex, maxopenparen_p);
74088413
DM
443 PL_savestack_ix = tmpix;
444}
445
02db2b7b 446#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
a0d0e21e 447
9637d2a5
CB
448#ifndef PERL_IN_XSUB_RE
449
24e16d7b
KW
450bool
451Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
31c7f561
KW
452{
453 /* Returns a boolean as to whether or not 'character' is a member of the
454 * Posix character class given by 'classnum' that should be equivalent to a
455 * value in the typedef '_char_class_number'.
456 *
457 * Ideally this could be replaced by a just an array of function pointers
458 * to the C library functions that implement the macros this calls.
459 * However, to compile, the precise function signatures are required, and
460 * these may vary from platform to to platform. To avoid having to figure
461 * out what those all are on each platform, I (khw) am using this method,
7aee35ff
KW
462 * which adds an extra layer of function call overhead (unless the C
463 * optimizer strips it away). But we don't particularly care about
464 * performance with locales anyway. */
31c7f561
KW
465
466 switch ((_char_class_number) classnum) {
15861f94 467 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
31c7f561 468 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
e8d596e0
KW
469 case _CC_ENUM_ASCII: return isASCII_LC(character);
470 case _CC_ENUM_BLANK: return isBLANK_LC(character);
cee69f79 471 case _CC_ENUM_CASED: return isLOWER_LC(character)
b0d691b2 472 || isUPPER_LC(character);
e8d596e0 473 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
31c7f561
KW
474 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
475 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
476 case _CC_ENUM_LOWER: return isLOWER_LC(character);
477 case _CC_ENUM_PRINT: return isPRINT_LC(character);
478 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
e8d596e0 479 case _CC_ENUM_SPACE: return isSPACE_LC(character);
31c7f561
KW
480 case _CC_ENUM_UPPER: return isUPPER_LC(character);
481 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
31c7f561 482 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
31c7f561
KW
483 default: /* VERTSPACE should never occur in locales */
484 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
485 }
486
e5964223 487 NOT_REACHED; /* NOTREACHED */
31c7f561
KW
488 return FALSE;
489}
490
9637d2a5
CB
491#endif
492
3018b823
KW
493STATIC bool
494S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
495{
496 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
497 * 'character' is a member of the Posix character class given by 'classnum'
498 * that should be equivalent to a value in the typedef
499 * '_char_class_number'.
500 *
501 * This just calls isFOO_lc on the code point for the character if it is in
2f306ab9 502 * the range 0-255. Outside that range, all characters use Unicode
3018b823
KW
503 * rules, ignoring any locale. So use the Unicode function if this class
504 * requires a swash, and use the Unicode macro otherwise. */
505
506 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
507
508 if (UTF8_IS_INVARIANT(*character)) {
509 return isFOO_lc(classnum, *character);
510 }
511 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
512 return isFOO_lc(classnum,
a62b247b 513 EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
3018b823
KW
514 }
515
613abc6d
KW
516 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
517
3018b823
KW
518 if (classnum < _FIRST_NON_SWASH_CC) {
519
520 /* Initialize the swash unless done already */
521 if (! PL_utf8_swash_ptrs[classnum]) {
522 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2a16ac92
KW
523 PL_utf8_swash_ptrs[classnum] =
524 _core_swash_init("utf8",
525 "",
526 &PL_sv_undef, 1, 0,
527 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
528 }
529
92a2046b
KW
530 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
531 character,
532 TRUE /* is UTF */ ));
3018b823
KW
533 }
534
535 switch ((_char_class_number) classnum) {
779cf272 536 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
3018b823
KW
537 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
538 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
539 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
e1ee3960 540 default: break;
3018b823
KW
541 }
542
e1ee3960 543 return FALSE; /* Things like CNTRL are always below 256 */
3018b823
KW
544}
545
a687059c 546/*
e50aee73 547 * pregexec and friends
a687059c
LW
548 */
549
76234dfb 550#ifndef PERL_IN_XSUB_RE
a687059c 551/*
c277df42 552 - pregexec - match a regexp against a string
a687059c 553 */
c277df42 554I32
5aaab254 555Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
ea3daa5d 556 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
8fd1a950
DM
557/* stringarg: the point in the string at which to begin matching */
558/* strend: pointer to null at end of string */
559/* strbeg: real beginning of string */
560/* minend: end of match must be >= minend bytes after stringarg. */
561/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
562 * itself is accessed via the pointers above */
563/* nosave: For optimizations. */
c277df42 564{
7918f24d
NC
565 PERL_ARGS_ASSERT_PREGEXEC;
566
c277df42 567 return
9041c2e3 568 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
c277df42
IZ
569 nosave ? 0 : REXEC_COPY_STR);
570}
76234dfb 571#endif
22e551b9 572
cad2e5aa 573
6eb5f6b9 574
1a4edc3c
DM
575/* re_intuit_start():
576 *
577 * Based on some optimiser hints, try to find the earliest position in the
578 * string where the regex could match.
579 *
580 * rx: the regex to match against
581 * sv: the SV being matched: only used for utf8 flag; the string
582 * itself is accessed via the pointers below. Note that on
583 * something like an overloaded SV, SvPOK(sv) may be false
584 * and the string pointers may point to something unrelated to
585 * the SV itself.
586 * strbeg: real beginning of string
587 * strpos: the point in the string at which to begin matching
588 * strend: pointer to the byte following the last char of the string
589 * flags currently unused; set to 0
590 * data: currently unused; set to NULL
591 *
592 * The basic idea of re_intuit_start() is to use some known information
593 * about the pattern, namely:
594 *
595 * a) the longest known anchored substring (i.e. one that's at a
596 * constant offset from the beginning of the pattern; but not
597 * necessarily at a fixed offset from the beginning of the
598 * string);
599 * b) the longest floating substring (i.e. one that's not at a constant
600 * offset from the beginning of the pattern);
601 * c) Whether the pattern is anchored to the string; either
602 * an absolute anchor: /^../, or anchored to \n: /^.../m,
603 * or anchored to pos(): /\G/;
604 * d) A start class: a real or synthetic character class which
605 * represents which characters are legal at the start of the pattern;
606 *
607 * to either quickly reject the match, or to find the earliest position
608 * within the string at which the pattern might match, thus avoiding
609 * running the full NFA engine at those earlier locations, only to
610 * eventually fail and retry further along.
611 *
612 * Returns NULL if the pattern can't match, or returns the address within
613 * the string which is the earliest place the match could occur.
614 *
615 * The longest of the anchored and floating substrings is called 'check'
616 * and is checked first. The other is called 'other' and is checked
617 * second. The 'other' substring may not be present. For example,
618 *
619 * /(abc|xyz)ABC\d{0,3}DEFG/
620 *
621 * will have
622 *
623 * check substr (float) = "DEFG", offset 6..9 chars
624 * other substr (anchored) = "ABC", offset 3..3 chars
625 * stclass = [ax]
626 *
627 * Be aware that during the course of this function, sometimes 'anchored'
628 * refers to a substring being anchored relative to the start of the
629 * pattern, and sometimes to the pattern itself being anchored relative to
630 * the string. For example:
631 *
632 * /\dabc/: "abc" is anchored to the pattern;
633 * /^\dabc/: "abc" is anchored to the pattern and the string;
634 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
635 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
636 * but the pattern is anchored to the string.
52a21eb3
DM
637 */
638
cad2e5aa 639char *
52a21eb3
DM
640Perl_re_intuit_start(pTHX_
641 REGEXP * const rx,
642 SV *sv,
643 const char * const strbeg,
644 char *strpos,
645 char *strend,
646 const U32 flags,
647 re_scream_pos_data *data)
cad2e5aa 648{
8d919b0a 649 struct regexp *const prog = ReANY(rx);
6b071d16 650 SSize_t start_shift = prog->check_offset_min;
cad2e5aa 651 /* Should be nonnegative! */
ea3daa5d 652 SSize_t end_shift = 0;
0fc004dd
DM
653 /* current lowest pos in string where the regex can start matching */
654 char *rx_origin = strpos;
eb578fdb 655 SV *check;
f2ed9b32 656 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
6480a6c4 657 U8 other_ix = 1 - prog->substrs->check_ix;
6ad5ffb3 658 bool ml_anch = 0;
8f4bf5fc 659 char *other_last = strpos;/* latest pos 'other' substr already checked to */
bd61b366 660 char *check_at = NULL; /* check substr found at this pos */
bbe252da 661 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
f8fc2ecf 662 RXi_GET_DECL(prog,progi);
02d5137b
DM
663 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
664 regmatch_info *const reginfo = &reginfo_buf;
a3621e74
YO
665 GET_RE_DEBUG_FLAGS_DECL;
666
7918f24d 667 PERL_ARGS_ASSERT_RE_INTUIT_START;
c33e64f0
FC
668 PERL_UNUSED_ARG(flags);
669 PERL_UNUSED_ARG(data);
7918f24d 670
6ad9a8ab 671 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0
DM
672 "Intuit: trying to determine minimum start position...\n"));
673
fb9bbddb 674 /* for now, assume that all substr offsets are positive. If at some point
f67a5002 675 * in the future someone wants to do clever things with lookbehind and
fb9bbddb
DM
676 * -ve offsets, they'll need to fix up any code in this function
677 * which uses these offsets. See the thread beginning
678 * <20140113145929.GF27210@iabyn.com>
679 */
680 assert(prog->substrs->data[0].min_offset >= 0);
681 assert(prog->substrs->data[0].max_offset >= 0);
682 assert(prog->substrs->data[1].min_offset >= 0);
683 assert(prog->substrs->data[1].max_offset >= 0);
684 assert(prog->substrs->data[2].min_offset >= 0);
685 assert(prog->substrs->data[2].max_offset >= 0);
686
f7022b5a 687 /* for now, assume that if both present, that the floating substring
83f2232d 688 * doesn't start before the anchored substring.
f7022b5a
DM
689 * If you break this assumption (e.g. doing better optimisations
690 * with lookahead/behind), then you'll need to audit the code in this
691 * function carefully first
692 */
693 assert(
694 ! ( (prog->anchored_utf8 || prog->anchored_substr)
695 && (prog->float_utf8 || prog->float_substr))
696 || (prog->float_min_offset >= prog->anchored_offset));
697
1a4edc3c
DM
698 /* byte rather than char calculation for efficiency. It fails
699 * to quickly reject some cases that can't match, but will reject
700 * them later after doing full char arithmetic */
c344f387 701 if (prog->minlen > strend - strpos) {
6ad9a8ab 702 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 703 " String too short...\n"));
cad2e5aa 704 goto fail;
2c2d71f5 705 }
d8da0584 706
ab4e48c1 707 RX_MATCH_UTF8_set(rx,utf8_target);
6c3fea77 708 reginfo->is_utf8_target = cBOOL(utf8_target);
bf2039a9 709 reginfo->info_aux = NULL;
9d9163fb 710 reginfo->strbeg = strbeg;
220db18a 711 reginfo->strend = strend;
aed7b151 712 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
02d5137b 713 reginfo->intuit = 1;
1cb48e53
DM
714 /* not actually used within intuit, but zero for safety anyway */
715 reginfo->poscache_maxiter = 0;
02d5137b 716
f2ed9b32 717 if (utf8_target) {
2814f4b3
HS
718 if ((!prog->anchored_utf8 && prog->anchored_substr)
719 || (!prog->float_utf8 && prog->float_substr))
33b8afdf
JH
720 to_utf8_substr(prog);
721 check = prog->check_utf8;
722 } else {
7e0d5ad7
KW
723 if (!prog->check_substr && prog->check_utf8) {
724 if (! to_byte_substr(prog)) {
6b54ddc5 725 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
7e0d5ad7
KW
726 }
727 }
33b8afdf
JH
728 check = prog->check_substr;
729 }
274cd312 730
1dc475d0
DM
731 /* dump the various substring data */
732 DEBUG_OPTIMISE_MORE_r({
733 int i;
734 for (i=0; i<=2; i++) {
735 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
736 : prog->substrs->data[i].substr);
737 if (!sv)
738 continue;
739
6ad9a8ab 740 Perl_re_printf( aTHX_
147e3846
KW
741 " substrs[%d]: min=%" IVdf " max=%" IVdf " end shift=%" IVdf
742 " useful=%" IVdf " utf8=%d [%s]\n",
1dc475d0
DM
743 i,
744 (IV)prog->substrs->data[i].min_offset,
745 (IV)prog->substrs->data[i].max_offset,
746 (IV)prog->substrs->data[i].end_shift,
747 BmUSEFUL(sv),
748 utf8_target ? 1 : 0,
749 SvPEEK(sv));
750 }
751 });
752
8e1490ee 753 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
9fc7410e
DM
754
755 /* ml_anch: check after \n?
756 *
0fa70a06 757 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
9fc7410e
DM
758 * with /.*.../, these flags will have been added by the
759 * compiler:
760 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
761 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
762 */
7d2d37f5
DM
763 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
764 && !(prog->intflags & PREGf_IMPLICIT);
cad2e5aa 765
343c8a29 766 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
c889ccc8
DM
767 /* we are only allowed to match at BOS or \G */
768
57fcbfa7 769 /* trivially reject if there's a BOS anchor and we're not at BOS.
7bb3b9eb
DM
770 *
771 * Note that we don't try to do a similar quick reject for
772 * \G, since generally the caller will have calculated strpos
773 * based on pos() and gofs, so the string is already correctly
774 * anchored by definition; and handling the exceptions would
775 * be too fiddly (e.g. REXEC_IGNOREPOS).
57fcbfa7 776 */
7bb3b9eb 777 if ( strpos != strbeg
d3d47aac 778 && (prog->intflags & PREGf_ANCH_SBOL))
c889ccc8 779 {
6ad9a8ab 780 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 781 " Not at start...\n"));
c889ccc8
DM
782 goto fail;
783 }
784
a5d12a4b
DM
785 /* in the presence of an anchor, the anchored (relative to the
786 * start of the regex) substr must also be anchored relative
66b7ec5c
DM
787 * to strpos. So quickly reject if substr isn't found there.
788 * This works for \G too, because the caller will already have
789 * subtracted gofs from pos, and gofs is the offset from the
790 * \G to the start of the regex. For example, in /.abc\Gdef/,
791 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
792 * caller will have set strpos=pos()-4; we look for the substr
793 * at position pos()-4+1, which lines up with the "a" */
a5d12a4b 794
33c28ab2 795 if (prog->check_offset_min == prog->check_offset_max) {
c889ccc8 796 /* Substring at constant offset from beg-of-str... */
d307bf57 797 SSize_t slen = SvCUR(check);
343c8a29 798 char *s = HOP3c(strpos, prog->check_offset_min, strend);
1de06328 799
6ad9a8ab 800 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 801 " Looking for check substr at fixed offset %" IVdf "...\n",
1dc475d0
DM
802 (IV)prog->check_offset_min));
803
7742aa66
DM
804 if (SvTAIL(check)) {
805 /* In this case, the regex is anchored at the end too.
806 * Unless it's a multiline match, the lengths must match
807 * exactly, give or take a \n. NB: slen >= 1 since
808 * the last char of check is \n */
809 if (!multiline
810 && ( strend - s > slen
811 || strend - s < slen - 1
812 || (strend - s == slen && strend[-1] != '\n')))
c889ccc8 813 {
6ad9a8ab 814 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 815 " String too long...\n"));
c889ccc8
DM
816 goto fail_finish;
817 }
818 /* Now should match s[0..slen-2] */
819 slen--;
c889ccc8 820 }
26fb2318
TC
821 if (slen && (strend - s < slen
822 || *SvPVX_const(check) != *s
823 || (slen > 1 && (memNE(SvPVX_const(check), s, slen)))))
d307bf57 824 {
6ad9a8ab 825 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 826 " String not equal...\n"));
d307bf57
DM
827 goto fail_finish;
828 }
c889ccc8
DM
829
830 check_at = s;
831 goto success_at_start;
cad2e5aa 832 }
cad2e5aa 833 }
cad2e5aa 834 }
0fc004dd 835
c0e0ec46 836 end_shift = prog->check_end_shift;
cad2e5aa 837
19188028 838#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
0033605d 839 if (end_shift < 0)
147e3846 840 Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ",
220fc49f 841 (IV)end_shift, RX_PRECOMP(prog));
2c2d71f5
JH
842#endif
843
2c2d71f5 844 restart:
1de06328 845
66b7ec5c
DM
846 /* This is the (re)entry point of the main loop in this function.
847 * The goal of this loop is to:
848 * 1) find the "check" substring in the region rx_origin..strend
849 * (adjusted by start_shift / end_shift). If not found, reject
850 * immediately.
851 * 2) If it exists, look for the "other" substr too if defined; for
852 * example, if the check substr maps to the anchored substr, then
853 * check the floating substr, and vice-versa. If not found, go
854 * back to (1) with rx_origin suitably incremented.
855 * 3) If we find an rx_origin position that doesn't contradict
856 * either of the substrings, then check the possible additional
857 * constraints on rx_origin of /^.../m or a known start class.
858 * If these fail, then depending on which constraints fail, jump
859 * back to here, or to various other re-entry points further along
860 * that skip some of the first steps.
861 * 4) If we pass all those tests, update the BmUSEFUL() count on the
862 * substring. If the start position was determined to be at the
863 * beginning of the string - so, not rejected, but not optimised,
864 * since we have to run regmatch from position 0 - decrement the
865 * BmUSEFUL() count. Otherwise increment it.
866 */
867
1a4edc3c
DM
868
869 /* first, look for the 'check' substring */
870
1de06328 871 {
c33e64f0
FC
872 U8* start_point;
873 U8* end_point;
c889ccc8 874
c889ccc8 875 DEBUG_OPTIMISE_MORE_r({
6ad9a8ab 876 Perl_re_printf( aTHX_
147e3846
KW
877 " At restart: rx_origin=%" IVdf " Check offset min: %" IVdf
878 " Start shift: %" IVdf " End shift %" IVdf
879 " Real end Shift: %" IVdf "\n",
675e93ee 880 (IV)(rx_origin - strbeg),
c889ccc8 881 (IV)prog->check_offset_min,
12fbc530
DM
882 (IV)start_shift,
883 (IV)end_shift,
c889ccc8
DM
884 (IV)prog->check_end_shift);
885 });
1de06328 886
33c28ab2
DM
887 end_point = HOP3(strend, -end_shift, strbeg);
888 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
889 if (!start_point)
890 goto fail_finish;
c889ccc8 891
557f47af 892
e0362b86 893 /* If the regex is absolutely anchored to either the start of the
d3d47aac 894 * string (SBOL) or to pos() (ANCH_GPOS), then
e0362b86
DM
895 * check_offset_max represents an upper bound on the string where
896 * the substr could start. For the ANCH_GPOS case, we assume that
897 * the caller of intuit will have already set strpos to
898 * pos()-gofs, so in this case strpos + offset_max will still be
899 * an upper bound on the substr.
900 */
c19c836a
DM
901 if (!ml_anch
902 && prog->intflags & PREGf_ANCH
e0362b86 903 && prog->check_offset_max != SSize_t_MAX)
c19c836a 904 {
1a08ba3a 905 SSize_t len = SvCUR(check) - !!SvTAIL(check);
e0362b86
DM
906 const char * const anchor =
907 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
908
909 /* do a bytes rather than chars comparison. It's conservative;
910 * so it skips doing the HOP if the result can't possibly end
911 * up earlier than the old value of end_point.
912 */
913 if ((char*)end_point - anchor > prog->check_offset_max) {
914 end_point = HOP3lim((U8*)anchor,
915 prog->check_offset_max,
916 end_point -len)
917 + len;
918 }
d6ef1678
DM
919 }
920
ae5d4331 921 check_at = fbm_instr( start_point, end_point,
7fba1cd6 922 check, multiline ? FBMrf_MULTILINE : 0);
c889ccc8 923
6ad9a8ab 924 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 925 " doing 'check' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
675e93ee
DM
926 (IV)((char*)start_point - strbeg),
927 (IV)((char*)end_point - strbeg),
928 (IV)(check_at ? check_at - strbeg : -1)
929 ));
930
8fd34720
DM
931 /* Update the count-of-usability, remove useless subpatterns,
932 unshift s. */
933
934 DEBUG_EXECUTE_r({
935 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
936 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
6ad9a8ab 937 Perl_re_printf( aTHX_ " %s %s substr %s%s%s",
8fd34720
DM
938 (check_at ? "Found" : "Did not find"),
939 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
940 ? "anchored" : "floating"),
941 quoted,
942 RE_SV_TAIL(check),
943 (check_at ? " at offset " : "...\n") );
944 });
2c2d71f5 945
8fd34720
DM
946 if (!check_at)
947 goto fail_finish;
8fd34720
DM
948 /* set rx_origin to the minimum position where the regex could start
949 * matching, given the constraint of the just-matched check substring.
950 * But don't set it lower than previously.
951 */
fdc003fd 952
8fd34720
DM
953 if (check_at - rx_origin > prog->check_offset_max)
954 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
675e93ee 955 /* Finish the diagnostic message */
6ad9a8ab 956 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 957 "%ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
958 (long)(check_at - strbeg),
959 (IV)(rx_origin - strbeg)
960 ));
8fd34720 961 }
fdc003fd
DM
962
963
1a4edc3c 964 /* now look for the 'other' substring if defined */
2c2d71f5 965
6480a6c4
DM
966 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
967 : prog->substrs->data[other_ix].substr)
1de06328 968 {
30944b6d 969 /* Take into account the "other" substring. */
6c3343a6
DM
970 char *last, *last1;
971 char *s;
972 SV* must;
973 struct reg_substr_datum *other;
974
975 do_other_substr:
976 other = &prog->substrs->data[other_ix];
977
978 /* if "other" is anchored:
979 * we've previously found a floating substr starting at check_at.
980 * This means that the regex origin must lie somewhere
981 * between min (rx_origin): HOP3(check_at, -check_offset_max)
982 * and max: HOP3(check_at, -check_offset_min)
983 * (except that min will be >= strpos)
984 * So the fixed substr must lie somewhere between
985 * HOP3(min, anchored_offset)
986 * HOP3(max, anchored_offset) + SvCUR(substr)
987 */
988
989 /* if "other" is floating
990 * Calculate last1, the absolute latest point where the
991 * floating substr could start in the string, ignoring any
992 * constraints from the earlier fixed match. It is calculated
993 * as follows:
994 *
995 * strend - prog->minlen (in chars) is the absolute latest
996 * position within the string where the origin of the regex
997 * could appear. The latest start point for the floating
998 * substr is float_min_offset(*) on from the start of the
999 * regex. last1 simply combines thee two offsets.
1000 *
1001 * (*) You might think the latest start point should be
1002 * float_max_offset from the regex origin, and technically
1003 * you'd be correct. However, consider
1004 * /a\d{2,4}bcd\w/
1005 * Here, float min, max are 3,5 and minlen is 7.
1006 * This can match either
1007 * /a\d\dbcd\w/
1008 * /a\d\d\dbcd\w/
1009 * /a\d\d\d\dbcd\w/
1010 * In the first case, the regex matches minlen chars; in the
1011 * second, minlen+1, in the third, minlen+2.
1012 * In the first case, the floating offset is 3 (which equals
1013 * float_min), in the second, 4, and in the third, 5 (which
1014 * equals float_max). In all cases, the floating string bcd
1015 * can never start more than 4 chars from the end of the
1016 * string, which equals minlen - float_min. As the substring
1017 * starts to match more than float_min from the start of the
1018 * regex, it makes the regex match more than minlen chars,
1019 * and the two cancel each other out. So we can always use
1020 * float_min - minlen, rather than float_max - minlen for the
1021 * latest position in the string.
1022 *
1023 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1024 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1025 */
1026
e7a14a9c 1027 assert(prog->minlen >= other->min_offset);
6c3343a6
DM
1028 last1 = HOP3c(strend,
1029 other->min_offset - prog->minlen, strbeg);
1030
4d006249 1031 if (other_ix) {/* i.e. if (other-is-float) */
6c3343a6
DM
1032 /* last is the latest point where the floating substr could
1033 * start, *given* any constraints from the earlier fixed
1034 * match. This constraint is that the floating string starts
1035 * <= float_max_offset chars from the regex origin (rx_origin).
1036 * If this value is less than last1, use it instead.
eb3831ce 1037 */
6c3343a6
DM
1038 assert(rx_origin <= last1);
1039 last =
1040 /* this condition handles the offset==infinity case, and
1041 * is a short-cut otherwise. Although it's comparing a
1042 * byte offset to a char length, it does so in a safe way,
1043 * since 1 char always occupies 1 or more bytes,
1044 * so if a string range is (last1 - rx_origin) bytes,
1045 * it will be less than or equal to (last1 - rx_origin)
1046 * chars; meaning it errs towards doing the accurate HOP3
1047 * rather than just using last1 as a short-cut */
1048 (last1 - rx_origin) < other->max_offset
1049 ? last1
1050 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1051 }
1052 else {
1053 assert(strpos + start_shift <= check_at);
1054 last = HOP4c(check_at, other->min_offset - start_shift,
1055 strbeg, strend);
1056 }
ead917d0 1057
6c3343a6
DM
1058 s = HOP3c(rx_origin, other->min_offset, strend);
1059 if (s < other_last) /* These positions already checked */
1060 s = other_last;
1061
1062 must = utf8_target ? other->utf8_substr : other->substr;
1063 assert(SvPOK(must));
675e93ee
DM
1064 {
1065 char *from = s;
1066 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1067
71a9d105
DM
1068 if (to > strend)
1069 to = strend;
88203927
DM
1070 if (from > to) {
1071 s = NULL;
6ad9a8ab 1072 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1073 " skipping 'other' fbm scan: %" IVdf " > %" IVdf "\n",
88203927
DM
1074 (IV)(from - strbeg),
1075 (IV)(to - strbeg)
1076 ));
1077 }
1078 else {
1079 s = fbm_instr(
1080 (unsigned char*)from,
1081 (unsigned char*)to,
1082 must,
1083 multiline ? FBMrf_MULTILINE : 0
1084 );
6ad9a8ab 1085 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1086 " doing 'other' fbm scan, [%" IVdf "..%" IVdf "] gave %" IVdf "\n",
88203927
DM
1087 (IV)(from - strbeg),
1088 (IV)(to - strbeg),
1089 (IV)(s ? s - strbeg : -1)
1090 ));
1091 }
675e93ee
DM
1092 }
1093
6c3343a6
DM
1094 DEBUG_EXECUTE_r({
1095 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1096 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 1097 Perl_re_printf( aTHX_ " %s %s substr %s%s",
6c3343a6
DM
1098 s ? "Found" : "Contradicts",
1099 other_ix ? "floating" : "anchored",
1100 quoted, RE_SV_TAIL(must));
1101 });
ead917d0 1102
ead917d0 1103
6c3343a6
DM
1104 if (!s) {
1105 /* last1 is latest possible substr location. If we didn't
1106 * find it before there, we never will */
1107 if (last >= last1) {
6ad9a8ab 1108 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee 1109 "; giving up...\n"));
6c3343a6 1110 goto fail_finish;
ead917d0
DM
1111 }
1112
6c3343a6
DM
1113 /* try to find the check substr again at a later
1114 * position. Maybe next time we'll find the "other" substr
1115 * in range too */
6c3343a6
DM
1116 other_last = HOP3c(last, 1, strend) /* highest failure */;
1117 rx_origin =
4d006249 1118 other_ix /* i.e. if other-is-float */
6c3343a6
DM
1119 ? HOP3c(rx_origin, 1, strend)
1120 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
6ad9a8ab 1121 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1122 "; about to retry %s at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1123 (other_ix ? "floating" : "anchored"),
1124 (long)(HOP3c(check_at, 1, strend) - strbeg),
1125 (IV)(rx_origin - strbeg)
1126 ));
6c3343a6
DM
1127 goto restart;
1128 }
1129 else {
4d006249 1130 if (other_ix) { /* if (other-is-float) */
6c3343a6
DM
1131 /* other_last is set to s, not s+1, since its possible for
1132 * a floating substr to fail first time, then succeed
1133 * second time at the same floating position; e.g.:
1134 * "-AB--AABZ" =~ /\wAB\d*Z/
1135 * The first time round, anchored and float match at
1136 * "-(AB)--AAB(Z)" then fail on the initial \w character
1137 * class. Second time round, they match at "-AB--A(AB)(Z)".
1138 */
1139 other_last = s;
ead917d0
DM
1140 }
1141 else {
6c3343a6
DM
1142 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1143 other_last = HOP3c(s, 1, strend);
ead917d0 1144 }
6ad9a8ab 1145 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1146 " at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1147 (long)(s - strbeg),
1148 (IV)(rx_origin - strbeg)
1149 ));
1150
6c3343a6 1151 }
cad2e5aa 1152 }
acba93e8
DM
1153 else {
1154 DEBUG_OPTIMISE_MORE_r(
6ad9a8ab 1155 Perl_re_printf( aTHX_
147e3846
KW
1156 " Check-only match: offset min:%" IVdf " max:%" IVdf
1157 " check_at:%" IVdf " rx_origin:%" IVdf " rx_origin-check_at:%" IVdf
1158 " strend:%" IVdf "\n",
acba93e8
DM
1159 (IV)prog->check_offset_min,
1160 (IV)prog->check_offset_max,
675e93ee
DM
1161 (IV)(check_at-strbeg),
1162 (IV)(rx_origin-strbeg),
1c1c599d 1163 (IV)(rx_origin-check_at),
675e93ee 1164 (IV)(strend-strbeg)
acba93e8
DM
1165 )
1166 );
1167 }
2c2d71f5 1168
acba93e8 1169 postprocess_substr_matches:
0991020e 1170
1a4edc3c 1171 /* handle the extra constraint of /^.../m if present */
e3c6feb0 1172
7d2d37f5 1173 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
4620cb61
DM
1174 char *s;
1175
6ad9a8ab 1176 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a62659bd 1177 " looking for /^/m anchor"));
d0880ea7
DM
1178
1179 /* we have failed the constraint of a \n before rx_origin.
2e759faa
DM
1180 * Find the next \n, if any, even if it's beyond the current
1181 * anchored and/or floating substrings. Whether we should be
1182 * scanning ahead for the next \n or the next substr is debatable.
1183 * On the one hand you'd expect rare substrings to appear less
1184 * often than \n's. On the other hand, searching for \n means
675e93ee 1185 * we're effectively flipping between check_substr and "\n" on each
2e759faa
DM
1186 * iteration as the current "rarest" string candidate, which
1187 * means for example that we'll quickly reject the whole string if
1188 * hasn't got a \n, rather than trying every substr position
1189 * first
1190 */
d0880ea7 1191
4620cb61
DM
1192 s = HOP3c(strend, - prog->minlen, strpos);
1193 if (s <= rx_origin ||
1194 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1195 {
6ad9a8ab 1196 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d0880ea7
DM
1197 " Did not find /%s^%s/m...\n",
1198 PL_colors[0], PL_colors[1]));
a62659bd
DM
1199 goto fail_finish;
1200 }
d0880ea7 1201
4ada1233
DM
1202 /* earliest possible origin is 1 char after the \n.
1203 * (since *rx_origin == '\n', it's safe to ++ here rather than
1204 * HOP(rx_origin, 1)) */
1205 rx_origin++;
d0880ea7 1206
f4f115de 1207 if (prog->substrs->check_ix == 0 /* check is anchored */
4ada1233 1208 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
f4f115de 1209 {
d0880ea7
DM
1210 /* Position contradicts check-string; either because
1211 * check was anchored (and thus has no wiggle room),
4ada1233 1212 * or check was float and rx_origin is above the float range */
6ad9a8ab 1213 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1214 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1215 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
d0880ea7
DM
1216 goto restart;
1217 }
1218
1219 /* if we get here, the check substr must have been float,
2e759faa 1220 * is in range, and we may or may not have had an anchored
d0880ea7
DM
1221 * "other" substr which still contradicts */
1222 assert(prog->substrs->check_ix); /* check is float */
1223
1224 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1225 /* whoops, the anchored "other" substr exists, so we still
1226 * contradict. On the other hand, the float "check" substr
1227 * didn't contradict, so just retry the anchored "other"
1228 * substr */
6ad9a8ab 1229 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846 1230 " Found /%s^%s/m, rescanning for anchored from offset %" IVdf " (rx_origin now %" IVdf ")...\n",
d0880ea7 1231 PL_colors[0], PL_colors[1],
73e8ff00
DM
1232 (IV)(rx_origin - strbeg + prog->anchored_offset),
1233 (IV)(rx_origin - strbeg)
675e93ee 1234 ));
d0880ea7
DM
1235 goto do_other_substr;
1236 }
1237
1238 /* success: we don't contradict the found floating substring
1239 * (and there's no anchored substr). */
6ad9a8ab 1240 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
675e93ee
DM
1241 " Found /%s^%s/m with rx_origin %ld...\n",
1242 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
e3c6feb0
DM
1243 }
1244 else {
6ad9a8ab 1245 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
fe4f3442 1246 " (multiline anchor test skipped)\n"));
e3c6feb0
DM
1247 }
1248
ffad1e6a 1249 success_at_start:
e3c6feb0 1250
cad2e5aa 1251
dd170ff5
DM
1252 /* if we have a starting character class, then test that extra constraint.
1253 * (trie stclasses are too expensive to use here, we are better off to
1254 * leave it to regmatch itself) */
1255
f8fc2ecf 1256 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
f8fc2ecf 1257 const U8* const str = (U8*)STRING(progi->regstclass);
0991020e 1258
2c75e362 1259 /* XXX this value could be pre-computed */
f8fc2ecf 1260 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
2c75e362
DM
1261 ? (reginfo->is_utf8_pat
1262 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1263 : STR_LEN(progi->regstclass))
66e933ab 1264 : 1);
1de06328 1265 char * endpos;
fa3bb21d 1266 char *s;
000dfd2d
DM
1267 /* latest pos that a matching float substr constrains rx start to */
1268 char *rx_max_float = NULL;
1269
c75a3985
DM
1270 /* if the current rx_origin is anchored, either by satisfying an
1271 * anchored substring constraint, or a /^.../m constraint, then we
1272 * can reject the current origin if the start class isn't found
1273 * at the current position. If we have a float-only match, then
1274 * rx_origin is constrained to a range; so look for the start class
1275 * in that range. if neither, then look for the start class in the
1276 * whole rest of the string */
1277
dd170ff5
DM
1278 /* XXX DAPM it's not clear what the minlen test is for, and why
1279 * it's not used in the floating case. Nothing in the test suite
1280 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1281 * Here are some old comments, which may or may not be correct:
1282 *
1283 * minlen == 0 is possible if regstclass is \b or \B,
1284 * and the fixed substr is ''$.
1285 * Since minlen is already taken into account, rx_origin+1 is
1286 * before strend; accidentally, minlen >= 1 guaranties no false
1287 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1288 * 0) below assumes that regstclass does not come from lookahead...
1289 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1290 * This leaves EXACTF-ish only, which are dealt with in
1291 * find_byclass().
1292 */
1293
7d2d37f5 1294 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
67853908 1295 endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend);
000dfd2d
DM
1296 else if (prog->float_substr || prog->float_utf8) {
1297 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
67853908 1298 endpos = HOP3clim(rx_max_float, cl_l, strend);
000dfd2d 1299 }
1de06328
YO
1300 else
1301 endpos= strend;
1302
6ad9a8ab 1303 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
147e3846
KW
1304 " looking for class: start_shift: %" IVdf " check_at: %" IVdf
1305 " rx_origin: %" IVdf " endpos: %" IVdf "\n",
1dc475d0 1306 (IV)start_shift, (IV)(check_at - strbeg),
c43b5520 1307 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
d8080198 1308
c43b5520 1309 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
f9176b44 1310 reginfo);
be778b1a 1311 if (!s) {
6eb5f6b9 1312 if (endpos == strend) {
6ad9a8ab 1313 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1314 " Could not match STCLASS...\n") );
6eb5f6b9
JH
1315 goto fail;
1316 }
6ad9a8ab 1317 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
1dc475d0 1318 " This position contradicts STCLASS...\n") );
e0eb31e7
DM
1319 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1320 && !(prog->intflags & PREGf_IMPLICIT))
653099ff 1321 goto fail;
9fed8d02 1322
6eb5f6b9 1323 /* Contradict one of substrings */
97136c8a
DM
1324 if (prog->anchored_substr || prog->anchored_utf8) {
1325 if (prog->substrs->check_ix == 1) { /* check is float */
1326 /* Have both, check_string is floating */
1327 assert(rx_origin + start_shift <= check_at);
1328 if (rx_origin + start_shift != check_at) {
1329 /* not at latest position float substr could match:
c75a3985
DM
1330 * Recheck anchored substring, but not floating.
1331 * The condition above is in bytes rather than
1332 * chars for efficiency. It's conservative, in
1333 * that it errs on the side of doing 'goto
88203927
DM
1334 * do_other_substr'. In this case, at worst,
1335 * an extra anchored search may get done, but in
1336 * practice the extra fbm_instr() is likely to
1337 * get skipped anyway. */
6ad9a8ab 1338 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1339 " about to retry anchored at offset %ld (rx_origin now %" IVdf ")...\n",
675e93ee
DM
1340 (long)(other_last - strbeg),
1341 (IV)(rx_origin - strbeg)
1342 ));
97136c8a 1343 goto do_other_substr;
3369914b 1344 }
3369914b
DM
1345 }
1346 }
97136c8a 1347 else {
9fed8d02
DM
1348 /* float-only */
1349
7d2d37f5 1350 if (ml_anch) {
c75a3985
DM
1351 /* In the presence of ml_anch, we might be able to
1352 * find another \n without breaking the current float
1353 * constraint. */
1354
1355 /* strictly speaking this should be HOP3c(..., 1, ...),
1356 * but since we goto a block of code that's going to
1357 * search for the next \n if any, its safe here */
9fed8d02 1358 rx_origin++;
6ad9a8ab 1359 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
675e93ee 1360 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
9fed8d02 1361 PL_colors[0], PL_colors[1],
675e93ee 1362 (long)(rx_origin - strbeg)) );
9fed8d02 1363 goto postprocess_substr_matches;
ab60c45a 1364 }
c75a3985
DM
1365
1366 /* strictly speaking this can never be true; but might
1367 * be if we ever allow intuit without substrings */
1368 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
9fed8d02 1369 goto fail;
c75a3985 1370
000dfd2d 1371 rx_origin = rx_max_float;
9fed8d02
DM
1372 }
1373
c75a3985
DM
1374 /* at this point, any matching substrings have been
1375 * contradicted. Start again... */
1376
9fed8d02 1377 rx_origin = HOP3c(rx_origin, 1, strend);
557f47af
DM
1378
1379 /* uses bytes rather than char calculations for efficiency.
1380 * It's conservative: it errs on the side of doing 'goto restart',
1381 * where there is code that does a proper char-based test */
9fed8d02 1382 if (rx_origin + start_shift + end_shift > strend) {
6ad9a8ab 1383 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
9fed8d02
DM
1384 " Could not match STCLASS...\n") );
1385 goto fail;
1386 }
6ad9a8ab 1387 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
147e3846 1388 " about to look for %s substr starting at offset %ld (rx_origin now %" IVdf ")...\n",
9fed8d02 1389 (prog->substrs->check_ix ? "floating" : "anchored"),
675e93ee
DM
1390 (long)(rx_origin + start_shift - strbeg),
1391 (IV)(rx_origin - strbeg)
1392 ));
9fed8d02 1393 goto restart;
6eb5f6b9 1394 }
9fed8d02 1395
c75a3985
DM
1396 /* Success !!! */
1397
5f9c6575 1398 if (rx_origin != s) {
6ad9a8ab 1399 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1400 " By STCLASS: moving %ld --> %ld\n",
675e93ee 1401 (long)(rx_origin - strbeg), (long)(s - strbeg))
b7953727
JH
1402 );
1403 }
1404 else {
6ad9a8ab 1405 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
1dc475d0 1406 " Does not contradict STCLASS...\n");
b7953727
JH
1407 );
1408 }
6eb5f6b9 1409 }
ffad1e6a
DM
1410
1411 /* Decide whether using the substrings helped */
1412
1413 if (rx_origin != strpos) {
1414 /* Fixed substring is found far enough so that the match
1415 cannot start at strpos. */
1416
6ad9a8ab 1417 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " try at offset...\n"));
ffad1e6a
DM
1418 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1419 }
1420 else {
70563e16
DM
1421 /* The found rx_origin position does not prohibit matching at
1422 * strpos, so calling intuit didn't gain us anything. Decrement
1423 * the BmUSEFUL() count on the check substring, and if we reach
1424 * zero, free it. */
1425 if (!(prog->intflags & PREGf_NAUGHTY)
ffad1e6a
DM
1426 && (utf8_target ? (
1427 prog->check_utf8 /* Could be deleted already */
1428 && --BmUSEFUL(prog->check_utf8) < 0
1429 && (prog->check_utf8 == prog->float_utf8)
1430 ) : (
1431 prog->check_substr /* Could be deleted already */
1432 && --BmUSEFUL(prog->check_substr) < 0
1433 && (prog->check_substr == prog->float_substr)
1434 )))
1435 {
1436 /* If flags & SOMETHING - do not do it many times on the same match */
6ad9a8ab 1437 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " ... Disabling check substring...\n"));
ffad1e6a
DM
1438 /* XXX Does the destruction order has to change with utf8_target? */
1439 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1440 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1441 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1442 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1443 check = NULL; /* abort */
ffad1e6a
DM
1444 /* XXXX This is a remnant of the old implementation. It
1445 looks wasteful, since now INTUIT can use many
1446 other heuristics. */
1447 prog->extflags &= ~RXf_USE_INTUIT;
ffad1e6a
DM
1448 }
1449 }
1450
6ad9a8ab 1451 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
ffad1e6a 1452 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
675e93ee 1453 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
ffad1e6a 1454
c765d6e0 1455 return rx_origin;
2c2d71f5
JH
1456
1457 fail_finish: /* Substring not found */
33b8afdf 1458 if (prog->check_substr || prog->check_utf8) /* could be removed already */
f2ed9b32 1459 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
cad2e5aa 1460 fail:
6ad9a8ab 1461 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch rejected by optimizer%s\n",
e4584336 1462 PL_colors[4], PL_colors[5]));
bd61b366 1463 return NULL;
cad2e5aa 1464}
9661b544 1465
70563e16 1466
a0a388a1 1467#define DECL_TRIE_TYPE(scan) \
e7fd4aa1 1468 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
a4525e78
KW
1469 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1470 trie_utf8l, trie_flu8 } \
e7fd4aa1
KW
1471 trie_type = ((scan->flags == EXACT) \
1472 ? (utf8_target ? trie_utf8 : trie_plain) \
a4525e78
KW
1473 : (scan->flags == EXACTL) \
1474 ? (utf8_target ? trie_utf8l : trie_plain) \
1475 : (scan->flags == EXACTFA) \
1476 ? (utf8_target \
1477 ? trie_utf8_exactfa_fold \
1478 : trie_latin_utf8_exactfa_fold) \
1479 : (scan->flags == EXACTFLU8 \
1480 ? trie_flu8 \
1481 : (utf8_target \
1482 ? trie_utf8_fold \
1483 : trie_latin_utf8_fold)))
fab2782b 1484
fd3249ee 1485#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
baa60164 1486STMT_START { \
fab2782b 1487 STRLEN skiplen; \
baa60164 1488 U8 flags = FOLD_FLAGS_FULL; \
fab2782b 1489 switch (trie_type) { \
a4525e78 1490 case trie_flu8: \
780fcc9f 1491 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1492 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1493 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1494 } \
a4525e78 1495 goto do_trie_utf8_fold; \
31f05a37 1496 case trie_utf8_exactfa_fold: \
baa60164 1497 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1498 /* FALLTHROUGH */ \
fab2782b 1499 case trie_utf8_fold: \
a4525e78 1500 do_trie_utf8_fold: \
fab2782b 1501 if ( foldlen>0 ) { \
c80e42f3 1502 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1503 foldlen -= len; \
1504 uscan += len; \
1505 len=0; \
1506 } else { \
fab2782b 1507 len = UTF8SKIP(uc); \
a1a5ec35
KW
1508 uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen, \
1509 flags); \
5f560d8a 1510 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1511 foldlen -= skiplen; \
1512 uscan = foldbuf + skiplen; \
1513 } \
1514 break; \
baa60164
KW
1515 case trie_latin_utf8_exactfa_fold: \
1516 flags |= FOLD_FLAGS_NOMIX_ASCII; \
8e57b935 1517 /* FALLTHROUGH */ \
fab2782b
YO
1518 case trie_latin_utf8_fold: \
1519 if ( foldlen>0 ) { \
c80e42f3 1520 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1521 foldlen -= len; \
1522 uscan += len; \
1523 len=0; \
1524 } else { \
1525 len = 1; \
31f05a37 1526 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
5f560d8a 1527 skiplen = UVCHR_SKIP( uvc ); \
fab2782b
YO
1528 foldlen -= skiplen; \
1529 uscan = foldbuf + skiplen; \
1530 } \
1531 break; \
a4525e78 1532 case trie_utf8l: \
780fcc9f 1533 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
613abc6d
KW
1534 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1535 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1536 } \
780fcc9f 1537 /* FALLTHROUGH */ \
fab2782b 1538 case trie_utf8: \
c80e42f3 1539 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
fab2782b
YO
1540 break; \
1541 case trie_plain: \
1542 uvc = (UV)*uc; \
1543 len = 1; \
1544 } \
1545 if (uvc < 256) { \
1546 charid = trie->charmap[ uvc ]; \
1547 } \
1548 else { \
1549 charid = 0; \
1550 if (widecharmap) { \
1551 SV** const svpp = hv_fetch(widecharmap, \
1552 (char*)&uvc, sizeof(UV), 0); \
1553 if (svpp) \
1554 charid = (U16)SvIV(*svpp); \
1555 } \
1556 } \
4cadc6a9
YO
1557} STMT_END
1558
cb41e5d6 1559#define DUMP_EXEC_POS(li,s,doutf8,depth) \
ae7c5b9b 1560 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
cb41e5d6 1561 startpos, doutf8, depth)
ae7c5b9b 1562
c84a03c5 1563#define REXEC_FBC_EXACTISH_SCAN(COND) \
4cadc6a9
YO
1564STMT_START { \
1565 while (s <= e) { \
c84a03c5 1566 if ( (COND) \
fac1af77 1567 && (ln == 1 || folder(s, pat_string, ln)) \
02d5137b 1568 && (reginfo->intuit || regtry(reginfo, &s)) )\
4cadc6a9
YO
1569 goto got_it; \
1570 s++; \
1571 } \
1572} STMT_END
1573
c84a03c5 1574#define REXEC_FBC_UTF8_SCAN(CODE) \
4cadc6a9 1575STMT_START { \
9a902117 1576 while (s < strend) { \
c84a03c5 1577 CODE \
9a902117 1578 s += UTF8SKIP(s); \
4cadc6a9
YO
1579 } \
1580} STMT_END
1581
c84a03c5 1582#define REXEC_FBC_SCAN(CODE) \
4cadc6a9
YO
1583STMT_START { \
1584 while (s < strend) { \
c84a03c5 1585 CODE \
4cadc6a9
YO
1586 s++; \
1587 } \
1588} STMT_END
1589
05bd126c
KW
1590#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1591REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1592 if (COND) { \
1593 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1594 goto got_it; \
1595 else \
1596 tmp = doevery; \
1597 } \
1598 else \
1599 tmp = 1; \
4cadc6a9
YO
1600)
1601
05bd126c
KW
1602#define REXEC_FBC_CLASS_SCAN(COND) \
1603REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1604 if (COND) { \
1605 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1606 goto got_it; \
1607 else \
1608 tmp = doevery; \
1609 } \
1610 else \
1611 tmp = 1; \
4cadc6a9
YO
1612)
1613
c84a03c5 1614#define REXEC_FBC_CSCAN(CONDUTF8,COND) \
baa60164 1615 if (utf8_target) { \
c84a03c5 1616 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
e1d1eefb
YO
1617 } \
1618 else { \
c84a03c5 1619 REXEC_FBC_CLASS_SCAN(COND); \
d981ef24 1620 }
05bd126c 1621
05bd126c
KW
1622/* The three macros below are slightly different versions of the same logic.
1623 *
1624 * The first is for /a and /aa when the target string is UTF-8. This can only
1625 * match ascii, but it must advance based on UTF-8. The other two handle the
1626 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1627 * for the boundary (or non-boundary) between a word and non-word character.
1628 * The utf8 and non-utf8 cases have the same logic, but the details must be
1629 * different. Find the "wordness" of the character just prior to this one, and
1630 * compare it with the wordness of this one. If they differ, we have a
1631 * boundary. At the beginning of the string, pretend that the previous
1632 * character was a new-line.
1633 *
1634 * All these macros uncleanly have side-effects with each other and outside
1635 * variables. So far it's been too much trouble to clean-up
1636 *
1637 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1638 * a word character or not.
1639 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1640 * word/non-word
1641 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1642 *
1643 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1644 * are looking for a boundary or for a non-boundary. If we are looking for a
1645 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1646 * see if this tentative match actually works, and if so, to quit the loop
1647 * here. And vice-versa if we are looking for a non-boundary.
1648 *
1649 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1650 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1651 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1652 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1653 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1654 * complement. But in that branch we complement tmp, meaning that at the
1655 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1656 * which means at the top of the loop in the next iteration, it is
1657 * TEST_NON_UTF8(s-1) */
b2f4e957 1658#define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1659 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1660 tmp = TEST_NON_UTF8(tmp); \
1661 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1662 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1663 tmp = !tmp; \
1664 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1665 } \
1666 else { \
1667 IF_FAIL; \
1668 } \
1669 ); \
1670
1671/* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1672 * TEST_UTF8 is a macro that for the same input code points returns identically
1673 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
236d82fd 1674#define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
05bd126c
KW
1675 if (s == reginfo->strbeg) { \
1676 tmp = '\n'; \
1677 } \
1678 else { /* Back-up to the start of the previous character */ \
1679 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1680 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
3db24e1e 1681 0, UTF8_ALLOW_DEFAULT); \
05bd126c
KW
1682 } \
1683 tmp = TEST_UV(tmp); \
1684 LOAD_UTF8_CHARCLASS_ALNUM(); \
1685 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
7a207065 1686 if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \
05bd126c
KW
1687 tmp = !tmp; \
1688 IF_SUCCESS; \
1689 } \
1690 else { \
1691 IF_FAIL; \
1692 } \
1693 );
cfaf538b 1694
05bd126c
KW
1695/* Like the above two macros. UTF8_CODE is the complete code for handling
1696 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1697 * macros below */
baa60164 1698#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
63ac0dad 1699 if (utf8_target) { \
05bd126c 1700 UTF8_CODE \
63ac0dad
KW
1701 } \
1702 else { /* Not utf8 */ \
9d9163fb 1703 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
63ac0dad 1704 tmp = TEST_NON_UTF8(tmp); \
05bd126c 1705 REXEC_FBC_SCAN( /* advances s while s < strend */ \
63ac0dad 1706 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
63ac0dad 1707 IF_SUCCESS; \
760cfa8e 1708 tmp = !tmp; \
63ac0dad
KW
1709 } \
1710 else { \
1711 IF_FAIL; \
1712 } \
1713 ); \
1714 } \
c8519dc7
KW
1715 /* Here, things have been set up by the previous code so that tmp is the \
1716 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1717 * utf8ness of the target). We also have to check if this matches against \
1718 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1719 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1720 * string */ \
1721 if (tmp == ! TEST_NON_UTF8('\n')) { \
1722 IF_SUCCESS; \
1723 } \
1724 else { \
1725 IF_FAIL; \
1726 }
63ac0dad 1727
ae7c5b9b
KW
1728/* This is the macro to use when we want to see if something that looks like it
1729 * could match, actually does, and if so exits the loop */
1730#define REXEC_FBC_TRYIT \
1731 if ((reginfo->intuit || regtry(reginfo, &s))) \
1732 goto got_it
1733
1734/* The only difference between the BOUND and NBOUND cases is that
1735 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1736 * NBOUND. This is accomplished by passing it as either the if or else clause,
1737 * with the other one being empty (PLACEHOLDER is defined as empty).
1738 *
1739 * The TEST_FOO parameters are for operating on different forms of input, but
1740 * all should be ones that return identically for the same underlying code
1741 * points */
1742#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1743 FBC_BOUND_COMMON( \
1744 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1745 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1746
44129e46 1747#define FBC_BOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1748 FBC_BOUND_COMMON( \
1749 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1750 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1751
1752#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1753 FBC_BOUND_COMMON( \
1754 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1755 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1756
44129e46 1757#define FBC_NBOUND_A(TEST_NON_UTF8) \
ae7c5b9b
KW
1758 FBC_BOUND_COMMON( \
1759 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1760 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1761
8bde5eaf
JH
1762#ifdef DEBUGGING
1763static IV
1764S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) {
1765 IV cp_out = Perl__invlist_search(invlist, cp_in);
1766 assert(cp_out >= 0);
1767 return cp_out;
1768}
1769# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1770 invmap[S_get_break_val_cp_checked(invlist, cp)]
1771#else
1772# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \
1773 invmap[_invlist_search(invlist, cp)]
1774#endif
1775
64935bc6
KW
1776/* Takes a pointer to an inversion list, a pointer to its corresponding
1777 * inversion map, and a code point, and returns the code point's value
1778 * according to the two arrays. It assumes that all code points have a value.
1779 * This is used as the base macro for macros for particular properties */
1780#define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
8bde5eaf 1781 _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp)
64935bc6
KW
1782
1783/* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1784 * of a code point, returning the value for the first code point in the string.
1785 * And it takes the particular macro name that finds the desired value given a
1786 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1787#define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1788 (__ASSERT_(pos < strend) \
1789 /* Note assumes is valid UTF-8 */ \
1790 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1791
1792/* Returns the GCB value for the input code point */
1793#define getGCB_VAL_CP(cp) \
1794 _generic_GET_BREAK_VAL_CP( \
1795 PL_GCB_invlist, \
02f811dd 1796 _Perl_GCB_invmap, \
64935bc6
KW
1797 (cp))
1798
1799/* Returns the GCB value for the first code point in the UTF-8 encoded string
1800 * bounded by pos and strend */
1801#define getGCB_VAL_UTF8(pos, strend) \
1802 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
05bd126c 1803
6b659339
KW
1804/* Returns the LB value for the input code point */
1805#define getLB_VAL_CP(cp) \
1806 _generic_GET_BREAK_VAL_CP( \
1807 PL_LB_invlist, \
1808 _Perl_LB_invmap, \
1809 (cp))
1810
1811/* Returns the LB value for the first code point in the UTF-8 encoded string
1812 * bounded by pos and strend */
1813#define getLB_VAL_UTF8(pos, strend) \
1814 _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
1815
06ae2722
KW
1816
1817/* Returns the SB value for the input code point */
1818#define getSB_VAL_CP(cp) \
1819 _generic_GET_BREAK_VAL_CP( \
1820 PL_SB_invlist, \
bf4268fa 1821 _Perl_SB_invmap, \
06ae2722
KW
1822 (cp))
1823
1824/* Returns the SB value for the first code point in the UTF-8 encoded string
1825 * bounded by pos and strend */
1826#define getSB_VAL_UTF8(pos, strend) \
1827 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1828
ae3bb8ea
KW
1829/* Returns the WB value for the input code point */
1830#define getWB_VAL_CP(cp) \
1831 _generic_GET_BREAK_VAL_CP( \
1832 PL_WB_invlist, \
bf4268fa 1833 _Perl_WB_invmap, \
ae3bb8ea
KW
1834 (cp))
1835
1836/* Returns the WB value for the first code point in the UTF-8 encoded string
1837 * bounded by pos and strend */
1838#define getWB_VAL_UTF8(pos, strend) \
1839 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1840
786e8c11 1841/* We know what class REx starts with. Try to find this position... */
02d5137b 1842/* if reginfo->intuit, its a dryrun */
786e8c11
YO
1843/* annoyingly all the vars in this routine have different names from their counterparts
1844 in regmatch. /grrr */
3c3eec57 1845STATIC char *
07be1b83 1846S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
f9176b44 1847 const char *strend, regmatch_info *reginfo)
a687059c 1848{
73104a1b
KW
1849 dVAR;
1850 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1851 char *pat_string; /* The pattern's exactish string */
1852 char *pat_end; /* ptr to end char of pat_string */
1853 re_fold_t folder; /* Function for computing non-utf8 folds */
1854 const U8 *fold_array; /* array for folding ords < 256 */
1855 STRLEN ln;
1856 STRLEN lnc;
73104a1b
KW
1857 U8 c1;
1858 U8 c2;
1859 char *e;
1860 I32 tmp = 1; /* Scratch variable? */
ba44c216 1861 const bool utf8_target = reginfo->is_utf8_target;
73104a1b 1862 UV utf8_fold_flags = 0;
f9176b44 1863 const bool is_utf8_pat = reginfo->is_utf8_pat;
3018b823
KW
1864 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1865 with a result inverts that result, as 0^1 =
1866 1 and 1^1 = 0 */
1867 _char_class_number classnum;
1868
73104a1b 1869 RXi_GET_DECL(prog,progi);
2f7f8cb1 1870
73104a1b 1871 PERL_ARGS_ASSERT_FIND_BYCLASS;
2f7f8cb1 1872
73104a1b
KW
1873 /* We know what class it must start with. */
1874 switch (OP(c)) {
a4525e78 1875 case ANYOFL:
780fcc9f 1876 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 1877
d1c40ef5 1878 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
1879 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
1880 }
1881
780fcc9f 1882 /* FALLTHROUGH */
ac44c12e 1883 case ANYOFD:
73104a1b
KW
1884 case ANYOF:
1885 if (utf8_target) {
1886 REXEC_FBC_UTF8_CLASS_SCAN(
3db24e1e 1887 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
73104a1b 1888 }
1451f692
DM
1889 else if (ANYOF_FLAGS(c)) {
1890 REXEC_FBC_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, 0));
1891 }
73104a1b 1892 else {
1451f692 1893 REXEC_FBC_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s)));
73104a1b
KW
1894 }
1895 break;
73104a1b 1896
098b07d5
KW
1897 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1898 assert(! is_utf8_pat);
924ba076 1899 /* FALLTHROUGH */
73104a1b 1900 case EXACTFA:
984e6dd1 1901 if (is_utf8_pat || utf8_target) {
73104a1b
KW
1902 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1903 goto do_exactf_utf8;
1904 }
1905 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1906 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1907 goto do_exactf_non_utf8; /* isn't dealt with by these */
77a6d856 1908
2fdb7295
KW
1909 case EXACTF: /* This node only generated for non-utf8 patterns */
1910 assert(! is_utf8_pat);
73104a1b 1911 if (utf8_target) {
73104a1b
KW
1912 utf8_fold_flags = 0;
1913 goto do_exactf_utf8;
1914 }
1915 fold_array = PL_fold;
1916 folder = foldEQ;
1917 goto do_exactf_non_utf8;
1918
1919 case EXACTFL:
780fcc9f 1920 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
31f05a37 1921 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
cea315b6 1922 utf8_fold_flags = FOLDEQ_LOCALE;
73104a1b
KW
1923 goto do_exactf_utf8;
1924 }
1925 fold_array = PL_fold_locale;
1926 folder = foldEQ_locale;
1927 goto do_exactf_non_utf8;
3c760661 1928
73104a1b 1929 case EXACTFU_SS:
984e6dd1 1930 if (is_utf8_pat) {
73104a1b
KW
1931 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1932 }
1933 goto do_exactf_utf8;
16d951b7 1934
a4525e78
KW
1935 case EXACTFLU8:
1936 if (! utf8_target) { /* All code points in this node require
1937 UTF-8 to express. */
1938 break;
1939 }
613abc6d
KW
1940 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1941 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
1942 goto do_exactf_utf8;
1943
73104a1b 1944 case EXACTFU:
984e6dd1
DM
1945 if (is_utf8_pat || utf8_target) {
1946 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
73104a1b
KW
1947 goto do_exactf_utf8;
1948 }
fac1af77 1949
73104a1b
KW
1950 /* Any 'ss' in the pattern should have been replaced by regcomp,
1951 * so we don't have to worry here about this single special case
1952 * in the Latin1 range */
1953 fold_array = PL_fold_latin1;
1954 folder = foldEQ_latin1;
1955
924ba076 1956 /* FALLTHROUGH */
73104a1b 1957
c52b8b12 1958 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
73104a1b
KW
1959 are no glitches with fold-length differences
1960 between the target string and pattern */
1961
1962 /* The idea in the non-utf8 EXACTF* cases is to first find the
1963 * first character of the EXACTF* node and then, if necessary,
1964 * case-insensitively compare the full text of the node. c1 is the
1965 * first character. c2 is its fold. This logic will not work for
1966 * Unicode semantics and the german sharp ss, which hence should
1967 * not be compiled into a node that gets here. */
1968 pat_string = STRING(c);
1969 ln = STR_LEN(c); /* length to match in octets/bytes */
1970
1971 /* We know that we have to match at least 'ln' bytes (which is the
1972 * same as characters, since not utf8). If we have to match 3
1973 * characters, and there are only 2 availabe, we know without
1974 * trying that it will fail; so don't start a match past the
1975 * required minimum number from the far end */
ea3daa5d 1976 e = HOP3c(strend, -((SSize_t)ln), s);
dda01918
HS
1977 if (e < s)
1978 break;
fac1af77 1979
73104a1b
KW
1980 c1 = *pat_string;
1981 c2 = fold_array[c1];
1982 if (c1 == c2) { /* If char and fold are the same */
1983 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1984 }
1985 else {
1986 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1987 }
1988 break;
fac1af77 1989
c52b8b12
KW
1990 do_exactf_utf8:
1991 {
73104a1b
KW
1992 unsigned expansion;
1993
1994 /* If one of the operands is in utf8, we can't use the simpler folding
1995 * above, due to the fact that many different characters can have the
1996 * same fold, or portion of a fold, or different- length fold */
1997 pat_string = STRING(c);
1998 ln = STR_LEN(c); /* length to match in octets/bytes */
1999 pat_end = pat_string + ln;
984e6dd1 2000 lnc = is_utf8_pat /* length to match in characters */
73104a1b
KW
2001 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
2002 : ln;
2003
2004 /* We have 'lnc' characters to match in the pattern, but because of
2005 * multi-character folding, each character in the target can match
2006 * up to 3 characters (Unicode guarantees it will never exceed
2007 * this) if it is utf8-encoded; and up to 2 if not (based on the
2008 * fact that the Latin 1 folds are already determined, and the
2009 * only multi-char fold in that range is the sharp-s folding to
2010 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
2011 * string character. Adjust lnc accordingly, rounding up, so that
2012 * if we need to match at least 4+1/3 chars, that really is 5. */
2013 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
2014 lnc = (lnc + expansion - 1) / expansion;
2015
2016 /* As in the non-UTF8 case, if we have to match 3 characters, and
2017 * only 2 are left, it's guaranteed to fail, so don't start a
2018 * match that would require us to go beyond the end of the string
2019 */
ea3daa5d 2020 e = HOP3c(strend, -((SSize_t)lnc), s);
73104a1b 2021
73104a1b
KW
2022 /* XXX Note that we could recalculate e to stop the loop earlier,
2023 * as the worst case expansion above will rarely be met, and as we
2024 * go along we would usually find that e moves further to the left.
2025 * This would happen only after we reached the point in the loop
2026 * where if there were no expansion we should fail. Unclear if
2027 * worth the expense */
2028
2029 while (s <= e) {
2030 char *my_strend= (char *)strend;
2031 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
984e6dd1 2032 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
02d5137b 2033 && (reginfo->intuit || regtry(reginfo, &s)) )
73104a1b
KW
2034 {
2035 goto got_it;
2036 }
2037 s += (utf8_target) ? UTF8SKIP(s) : 1;
2038 }
2039 break;
2040 }
236d82fd 2041
73104a1b 2042 case BOUNDL:
780fcc9f 2043 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2044 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2045 if (! IN_UTF8_CTYPE_LOCALE) {
2046 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2047 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2048 }
64935bc6
KW
2049 goto do_boundu;
2050 }
2051
7a207065 2052 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2053 break;
64935bc6 2054
73104a1b 2055 case NBOUNDL:
780fcc9f 2056 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6 2057 if (FLAGS(c) != TRADITIONAL_BOUND) {
89ad707a
KW
2058 if (! IN_UTF8_CTYPE_LOCALE) {
2059 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
64935bc6 2060 B_ON_NON_UTF8_LOCALE_IS_WRONG);
89ad707a 2061 }
64935bc6
KW
2062 goto do_nboundu;
2063 }
2064
7a207065 2065 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
73104a1b 2066 break;
64935bc6
KW
2067
2068 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2069 meaning */
2070 assert(FLAGS(c) == TRADITIONAL_BOUND);
2071
7a207065 2072 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2073 break;
64935bc6
KW
2074
2075 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2076 meaning */
2077 assert(FLAGS(c) == TRADITIONAL_BOUND);
2078
44129e46 2079 FBC_BOUND_A(isWORDCHAR_A);
73104a1b 2080 break;
64935bc6
KW
2081
2082 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2083 meaning */
2084 assert(FLAGS(c) == TRADITIONAL_BOUND);
2085
7a207065 2086 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
73104a1b 2087 break;
64935bc6
KW
2088
2089 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2090 meaning */
2091 assert(FLAGS(c) == TRADITIONAL_BOUND);
2092
44129e46 2093 FBC_NBOUND_A(isWORDCHAR_A);
73104a1b 2094 break;
64935bc6 2095
73104a1b 2096 case NBOUNDU:
64935bc6 2097 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
7a207065 2098 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2099 break;
2100 }
2101
2102 do_nboundu:
2103
2104 to_complement = 1;
2105 /* FALLTHROUGH */
2106
2107 case BOUNDU:
2108 do_boundu:
2109 switch((bound_type) FLAGS(c)) {
2110 case TRADITIONAL_BOUND:
7a207065 2111 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
64935bc6
KW
2112 break;
2113 case GCB_BOUND:
a7a8bd1e 2114 if (s == reginfo->strbeg) {
67481c39 2115 if (reginfo->intuit || regtry(reginfo, &s))
64935bc6
KW
2116 {
2117 goto got_it;
2118 }
a7a8bd1e
KW
2119
2120 /* Didn't match. Try at the next position (if there is one) */
64935bc6 2121 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2122 if (UNLIKELY(s >= reginfo->strend)) {
2123 break;
2124 }
64935bc6
KW
2125 }
2126
2127 if (utf8_target) {
85e5f08b 2128 GCB_enum before = getGCB_VAL_UTF8(
64935bc6
KW
2129 reghop3((U8*)s, -1,
2130 (U8*)(reginfo->strbeg)),
2131 (U8*) reginfo->strend);
2132 while (s < strend) {
85e5f08b 2133 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
64935bc6 2134 (U8*) reginfo->strend);
b0e24409
KW
2135 if ( (to_complement ^ isGCB(before,
2136 after,
2137 (U8*) reginfo->strbeg,
2138 (U8*) s,
2139 utf8_target))
00e3344b
KW
2140 && (reginfo->intuit || regtry(reginfo, &s)))
2141 {
2142 goto got_it;
64935bc6 2143 }
43a7bd62 2144 before = after;
64935bc6
KW
2145 s += UTF8SKIP(s);
2146 }
2147 }
2148 else { /* Not utf8. Everything is a GCB except between CR and
2149 LF */
2150 while (s < strend) {
00e3344b
KW
2151 if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
2152 || UCHARAT(s) != '\n'))
2153 && (reginfo->intuit || regtry(reginfo, &s)))
64935bc6 2154 {
00e3344b 2155 goto got_it;
64935bc6 2156 }
43a7bd62 2157 s++;
64935bc6
KW
2158 }
2159 }
2160
6de80efc
KW
2161 /* And, since this is a bound, it can match after the final
2162 * character in the string */
67481c39 2163 if ((reginfo->intuit || regtry(reginfo, &s))) {
64935bc6
KW
2164 goto got_it;
2165 }
2166 break;
ae3bb8ea 2167
6b659339
KW
2168 case LB_BOUND:
2169 if (s == reginfo->strbeg) {
2170 if (reginfo->intuit || regtry(reginfo, &s)) {
2171 goto got_it;
2172 }
2173 s += (utf8_target) ? UTF8SKIP(s) : 1;
2174 if (UNLIKELY(s >= reginfo->strend)) {
2175 break;
2176 }
2177 }
2178
2179 if (utf8_target) {
2180 LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
2181 -1,
2182 (U8*)(reginfo->strbeg)),
2183 (U8*) reginfo->strend);
2184 while (s < strend) {
2185 LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
2186 if (to_complement ^ isLB(before,
2187 after,
2188 (U8*) reginfo->strbeg,
2189 (U8*) s,
2190 (U8*) reginfo->strend,
2191 utf8_target)
2192 && (reginfo->intuit || regtry(reginfo, &s)))
2193 {
2194 goto got_it;
2195 }
2196 before = after;
2197 s += UTF8SKIP(s);
2198 }
2199 }
2200 else { /* Not utf8. */
2201 LB_enum before = getLB_VAL_CP((U8) *(s -1));
2202 while (s < strend) {
2203 LB_enum after = getLB_VAL_CP((U8) *s);
2204 if (to_complement ^ isLB(before,
2205 after,
2206 (U8*) reginfo->strbeg,
2207 (U8*) s,
2208 (U8*) reginfo->strend,
2209 utf8_target)
2210 && (reginfo->intuit || regtry(reginfo, &s)))
2211 {
2212 goto got_it;
2213 }
2214 before = after;
2215 s++;
2216 }
2217 }
2218
2219 if (reginfo->intuit || regtry(reginfo, &s)) {
2220 goto got_it;
2221 }
2222
2223 break;
2224
06ae2722 2225 case SB_BOUND:
a7a8bd1e 2226 if (s == reginfo->strbeg) {
67481c39 2227 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2228 goto got_it;
2229 }
06ae2722 2230 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2231 if (UNLIKELY(s >= reginfo->strend)) {
2232 break;
2233 }
06ae2722
KW
2234 }
2235
2236 if (utf8_target) {
85e5f08b 2237 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
06ae2722
KW
2238 -1,
2239 (U8*)(reginfo->strbeg)),
2240 (U8*) reginfo->strend);
2241 while (s < strend) {
85e5f08b 2242 SB_enum after = getSB_VAL_UTF8((U8*) s,
06ae2722 2243 (U8*) reginfo->strend);
00e3344b
KW
2244 if ((to_complement ^ isSB(before,
2245 after,
2246 (U8*) reginfo->strbeg,
2247 (U8*) s,
2248 (U8*) reginfo->strend,
2249 utf8_target))
2250 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2251 {
00e3344b 2252 goto got_it;
06ae2722 2253 }
43a7bd62 2254 before = after;
06ae2722
KW
2255 s += UTF8SKIP(s);
2256 }
2257 }
2258 else { /* Not utf8. */
85e5f08b 2259 SB_enum before = getSB_VAL_CP((U8) *(s -1));
06ae2722 2260 while (s < strend) {
85e5f08b 2261 SB_enum after = getSB_VAL_CP((U8) *s);
00e3344b
KW
2262 if ((to_complement ^ isSB(before,
2263 after,
2264 (U8*) reginfo->strbeg,
2265 (U8*) s,
2266 (U8*) reginfo->strend,
2267 utf8_target))
2268 && (reginfo->intuit || regtry(reginfo, &s)))
06ae2722 2269 {
00e3344b 2270 goto got_it;
06ae2722 2271 }
43a7bd62 2272 before = after;
06ae2722
KW
2273 s++;
2274 }
2275 }
2276
2277 /* Here are at the final position in the target string. The SB
2278 * value is always true here, so matches, depending on other
2279 * constraints */
67481c39 2280 if (reginfo->intuit || regtry(reginfo, &s)) {
06ae2722
KW
2281 goto got_it;
2282 }
2283
2284 break;
2285
ae3bb8ea
KW
2286 case WB_BOUND:
2287 if (s == reginfo->strbeg) {
67481c39 2288 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2289 goto got_it;
2290 }
2291 s += (utf8_target) ? UTF8SKIP(s) : 1;
a7a8bd1e
KW
2292 if (UNLIKELY(s >= reginfo->strend)) {
2293 break;
2294 }
ae3bb8ea
KW
2295 }
2296
2297 if (utf8_target) {
2298 /* We are at a boundary between char_sub_0 and char_sub_1.
2299 * We also keep track of the value for char_sub_-1 as we
2300 * loop through the line. Context may be needed to make a
2301 * determination, and if so, this can save having to
2302 * recalculate it */
85e5f08b
KW
2303 WB_enum previous = WB_UNKNOWN;
2304 WB_enum before = getWB_VAL_UTF8(
ae3bb8ea
KW
2305 reghop3((U8*)s,
2306 -1,
2307 (U8*)(reginfo->strbeg)),
2308 (U8*) reginfo->strend);
2309 while (s < strend) {
85e5f08b 2310 WB_enum after = getWB_VAL_UTF8((U8*) s,
ae3bb8ea 2311 (U8*) reginfo->strend);
00e3344b
KW
2312 if ((to_complement ^ isWB(previous,
2313 before,
2314 after,
2315 (U8*) reginfo->strbeg,
2316 (U8*) s,
2317 (U8*) reginfo->strend,
2318 utf8_target))
2319 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2320 {
00e3344b 2321 goto got_it;
ae3bb8ea 2322 }
43a7bd62
KW
2323 previous = before;
2324 before = after;
ae3bb8ea
KW
2325 s += UTF8SKIP(s);
2326 }
2327 }
2328 else { /* Not utf8. */
85e5f08b
KW
2329 WB_enum previous = WB_UNKNOWN;
2330 WB_enum before = getWB_VAL_CP((U8) *(s -1));
ae3bb8ea 2331 while (s < strend) {
85e5f08b 2332 WB_enum after = getWB_VAL_CP((U8) *s);
00e3344b
KW
2333 if ((to_complement ^ isWB(previous,
2334 before,
2335 after,
2336 (U8*) reginfo->strbeg,
2337 (U8*) s,
2338 (U8*) reginfo->strend,
2339 utf8_target))
2340 && (reginfo->intuit || regtry(reginfo, &s)))
ae3bb8ea 2341 {
00e3344b 2342 goto got_it;
ae3bb8ea 2343 }
43a7bd62
KW
2344 previous = before;
2345 before = after;
ae3bb8ea
KW
2346 s++;
2347 }
2348 }
2349
67481c39 2350 if (reginfo->intuit || regtry(reginfo, &s)) {
ae3bb8ea
KW
2351 goto got_it;
2352 }
64935bc6 2353 }
73104a1b 2354 break;
64935bc6 2355
73104a1b
KW
2356 case LNBREAK:
2357 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2358 is_LNBREAK_latin1_safe(s, strend)
2359 );
2360 break;
3018b823
KW
2361
2362 /* The argument to all the POSIX node types is the class number to pass to
2363 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2364
2365 case NPOSIXL:
2366 to_complement = 1;
2367 /* FALLTHROUGH */
2368
2369 case POSIXL:
780fcc9f 2370 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
2371 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2372 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
73104a1b 2373 break;
3018b823
KW
2374
2375 case NPOSIXD:
2376 to_complement = 1;
2377 /* FALLTHROUGH */
2378
2379 case POSIXD:
2380 if (utf8_target) {
2381 goto posix_utf8;
2382 }
2383 goto posixa;
2384
2385 case NPOSIXA:
2386 if (utf8_target) {
2387 /* The complement of something that matches only ASCII matches all
837226c8 2388 * non-ASCII, plus everything in ASCII that isn't in the class. */
7a207065 2389 REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend)
3018b823
KW
2390 || ! _generic_isCC_A(*s, FLAGS(c)));
2391 break;
2392 }
2393
2394 to_complement = 1;
2395 /* FALLTHROUGH */
2396
73104a1b 2397 case POSIXA:
3018b823 2398 posixa:
73104a1b 2399 /* Don't need to worry about utf8, as it can match only a single
3018b823
KW
2400 * byte invariant character. */
2401 REXEC_FBC_CLASS_SCAN(
2402 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
73104a1b 2403 break;
3018b823
KW
2404
2405 case NPOSIXU:
2406 to_complement = 1;
2407 /* FALLTHROUGH */
2408
2409 case POSIXU:
2410 if (! utf8_target) {
2411 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2412 FLAGS(c))));
2413 }
2414 else {
2415
c52b8b12 2416 posix_utf8:
3018b823
KW
2417 classnum = (_char_class_number) FLAGS(c);
2418 if (classnum < _FIRST_NON_SWASH_CC) {
2419 while (s < strend) {
2420
2421 /* We avoid loading in the swash as long as possible, but
2422 * should we have to, we jump to a separate loop. This
2423 * extra 'if' statement is what keeps this code from being
2424 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2425 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2426 goto found_above_latin1;
2427 }
2428 if ((UTF8_IS_INVARIANT(*s)
2429 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2430 classnum)))
042d9e50 2431 || ( UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend)
3018b823 2432 && to_complement ^ cBOOL(
a62b247b 2433 _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
94bb8c36 2434 *(s + 1)),
3018b823
KW
2435 classnum))))
2436 {
02d5137b 2437 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
3018b823
KW
2438 goto got_it;
2439 else {
2440 tmp = doevery;
2441 }
2442 }
2443 else {
2444 tmp = 1;
2445 }
2446 s += UTF8SKIP(s);
2447 }
2448 }
2449 else switch (classnum) { /* These classes are implemented as
2450 macros */
779cf272 2451 case _CC_ENUM_SPACE:
3018b823 2452 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2453 to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
3018b823
KW
2454 break;
2455
2456 case _CC_ENUM_BLANK:
2457 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2458 to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
3018b823
KW
2459 break;
2460
2461 case _CC_ENUM_XDIGIT:
2462 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2463 to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
3018b823
KW
2464 break;
2465
2466 case _CC_ENUM_VERTSPACE:
2467 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2468 to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
3018b823
KW
2469 break;
2470
2471 case _CC_ENUM_CNTRL:
2472 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2473 to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
3018b823
KW
2474 break;
2475
2476 default:
2477 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
e5964223 2478 NOT_REACHED; /* NOTREACHED */
3018b823
KW
2479 }
2480 }
2481 break;
2482
2483 found_above_latin1: /* Here we have to load a swash to get the result
2484 for the current code point */
2485 if (! PL_utf8_swash_ptrs[classnum]) {
2486 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2487 PL_utf8_swash_ptrs[classnum] =
2a16ac92
KW
2488 _core_swash_init("utf8",
2489 "",
2490 &PL_sv_undef, 1, 0,
2491 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
2492 }
2493
2494 /* This is a copy of the loop above for swash classes, though using the
2495 * FBC macro instead of being expanded out. Since we've loaded the
2496 * swash, we don't have to check for that each time through the loop */
2497 REXEC_FBC_UTF8_CLASS_SCAN(
7a207065 2498 to_complement ^ cBOOL(_generic_utf8_safe(
3018b823
KW
2499 classnum,
2500 s,
7a207065 2501 strend,
3018b823
KW
2502 swash_fetch(PL_utf8_swash_ptrs[classnum],
2503 (U8 *) s, TRUE))));
73104a1b
KW
2504 break;
2505
2506 case AHOCORASICKC:
2507 case AHOCORASICK:
2508 {
2509 DECL_TRIE_TYPE(c);
2510 /* what trie are we using right now */
2511 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2512 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2513 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2514
2515 const char *last_start = strend - trie->minlen;
6148ee25 2516#ifdef DEBUGGING
73104a1b 2517 const char *real_start = s;
6148ee25 2518#endif
73104a1b
KW
2519 STRLEN maxlen = trie->maxlen;
2520 SV *sv_points;
2521 U8 **points; /* map of where we were in the input string
2522 when reading a given char. For ASCII this
2523 is unnecessary overhead as the relationship
2524 is always 1:1, but for Unicode, especially
2525 case folded Unicode this is not true. */
2526 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2527 U8 *bitmap=NULL;
2528
2529
2530 GET_RE_DEBUG_FLAGS_DECL;
2531
2532 /* We can't just allocate points here. We need to wrap it in
2533 * an SV so it gets freed properly if there is a croak while
2534 * running the match */
2535 ENTER;
2536 SAVETMPS;
2537 sv_points=newSV(maxlen * sizeof(U8 *));
2538 SvCUR_set(sv_points,
2539 maxlen * sizeof(U8 *));
2540 SvPOK_on(sv_points);
2541 sv_2mortal(sv_points);
2542 points=(U8**)SvPV_nolen(sv_points );
2543 if ( trie_type != trie_utf8_fold
2544 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2545 {
2546 if (trie->bitmap)
2547 bitmap=(U8*)trie->bitmap;
2548 else
2549 bitmap=(U8*)ANYOF_BITMAP(c);
2550 }
2551 /* this is the Aho-Corasick algorithm modified a touch
2552 to include special handling for long "unknown char" sequences.
2553 The basic idea being that we use AC as long as we are dealing
2554 with a possible matching char, when we encounter an unknown char
2555 (and we have not encountered an accepting state) we scan forward
2556 until we find a legal starting char.
2557 AC matching is basically that of trie matching, except that when
2558 we encounter a failing transition, we fall back to the current
2559 states "fail state", and try the current char again, a process
2560 we repeat until we reach the root state, state 1, or a legal
2561 transition. If we fail on the root state then we can either
2562 terminate if we have reached an accepting state previously, or
2563 restart the entire process from the beginning if we have not.
2564
2565 */
2566 while (s <= last_start) {
2567 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2568 U8 *uc = (U8*)s;
2569 U16 charid = 0;
2570 U32 base = 1;
2571 U32 state = 1;
2572 UV uvc = 0;
2573 STRLEN len = 0;
2574 STRLEN foldlen = 0;
2575 U8 *uscan = (U8*)NULL;
2576 U8 *leftmost = NULL;
2577#ifdef DEBUGGING
2578 U32 accepted_word= 0;
786e8c11 2579#endif
73104a1b
KW
2580 U32 pointpos = 0;
2581
2582 while ( state && uc <= (U8*)strend ) {
2583 int failed=0;
2584 U32 word = aho->states[ state ].wordnum;
2585
2586 if( state==1 ) {
2587 if ( bitmap ) {
2588 DEBUG_TRIE_EXECUTE_r(
2589 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2590 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2591 (char *)uc, utf8_target, 0 );
6ad9a8ab 2592 Perl_re_printf( aTHX_
73104a1b
KW
2593 " Scanning for legal start char...\n");
2594 }
2595 );
2596 if (utf8_target) {
2597 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2598 uc += UTF8SKIP(uc);
2599 }
2600 } else {
2601 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2602 uc++;
2603 }
786e8c11 2604 }
73104a1b 2605 s= (char *)uc;
07be1b83 2606 }
73104a1b
KW
2607 if (uc >(U8*)last_start) break;
2608 }
2609
2610 if ( word ) {
2611 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2612 if (!leftmost || lpos < leftmost) {
2613 DEBUG_r(accepted_word=word);
2614 leftmost= lpos;
7016d6eb 2615 }
73104a1b 2616 if (base==0) break;
7016d6eb 2617
73104a1b
KW
2618 }
2619 points[pointpos++ % maxlen]= uc;
2620 if (foldlen || uc < (U8*)strend) {
2621 REXEC_TRIE_READ_CHAR(trie_type, trie,
2622 widecharmap, uc,
2623 uscan, len, uvc, charid, foldlen,
2624 foldbuf, uniflags);
2625 DEBUG_TRIE_EXECUTE_r({
2626 dump_exec_pos( (char *)uc, c, strend,
cb41e5d6 2627 real_start, s, utf8_target, 0);
6ad9a8ab 2628 Perl_re_printf( aTHX_
147e3846 2629 " Charid:%3u CP:%4" UVxf " ",
73104a1b
KW
2630 charid, uvc);
2631 });
2632 }
2633 else {
2634 len = 0;
2635 charid = 0;
2636 }
07be1b83 2637
73104a1b
KW
2638
2639 do {
6148ee25 2640#ifdef DEBUGGING
73104a1b 2641 word = aho->states[ state ].wordnum;
6148ee25 2642#endif
73104a1b
KW
2643 base = aho->states[ state ].trans.base;
2644
2645 DEBUG_TRIE_EXECUTE_r({
2646 if (failed)
2647 dump_exec_pos( (char *)uc, c, strend, real_start,
cb41e5d6 2648 s, utf8_target, 0 );
6ad9a8ab 2649 Perl_re_printf( aTHX_
147e3846 2650 "%sState: %4" UVxf ", word=%" UVxf,
73104a1b
KW
2651 failed ? " Fail transition to " : "",
2652 (UV)state, (UV)word);
2653 });
2654 if ( base ) {
2655 U32 tmp;
2656 I32 offset;
2657 if (charid &&
2658 ( ((offset = base + charid
2659 - 1 - trie->uniquecharcount)) >= 0)
2660 && ((U32)offset < trie->lasttrans)
2661 && trie->trans[offset].check == state
2662 && (tmp=trie->trans[offset].next))
2663 {
2664 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2665 Perl_re_printf( aTHX_ " - legal\n"));
73104a1b
KW
2666 state = tmp;
2667 break;
07be1b83
YO
2668 }
2669 else {
786e8c11 2670 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2671 Perl_re_printf( aTHX_ " - fail\n"));
786e8c11 2672 failed = 1;
73104a1b 2673 state = aho->fail[state];
07be1b83 2674 }
07be1b83 2675 }
73104a1b
KW
2676 else {
2677 /* we must be accepting here */
2678 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2679 Perl_re_printf( aTHX_ " - accepting\n"));
73104a1b
KW
2680 failed = 1;
2681 break;
786e8c11 2682 }
73104a1b
KW
2683 } while(state);
2684 uc += len;
2685 if (failed) {
2686 if (leftmost)
2687 break;
2688 if (!state) state = 1;
07be1b83 2689 }
73104a1b
KW
2690 }
2691 if ( aho->states[ state ].wordnum ) {
2692 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2693 if (!leftmost || lpos < leftmost) {
2694 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2695 leftmost = lpos;
07be1b83
YO
2696 }
2697 }
73104a1b
KW
2698 if (leftmost) {
2699 s = (char*)leftmost;
2700 DEBUG_TRIE_EXECUTE_r({
147e3846 2701 Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n",
73104a1b
KW
2702 (UV)accepted_word, (IV)(s - real_start)
2703 );
2704 });
02d5137b 2705 if (reginfo->intuit || regtry(reginfo, &s)) {
73104a1b
KW
2706 FREETMPS;
2707 LEAVE;
2708 goto got_it;
2709 }
2710 s = HOPc(s,1);
2711 DEBUG_TRIE_EXECUTE_r({
6ad9a8ab 2712 Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n");
73104a1b
KW
2713 });
2714 } else {
2715 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 2716 Perl_re_printf( aTHX_ "No match.\n"));
73104a1b
KW
2717 break;
2718 }
2719 }
2720 FREETMPS;
2721 LEAVE;
2722 }
2723 break;
2724 default:
2725 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
73104a1b
KW
2726 }
2727 return 0;
2728 got_it:
2729 return s;
6eb5f6b9
JH
2730}
2731
60165aa4
DM
2732/* set RX_SAVED_COPY, RX_SUBBEG etc.
2733 * flags have same meanings as with regexec_flags() */
2734
749f4950
DM
2735static void
2736S_reg_set_capture_string(pTHX_ REGEXP * const rx,
60165aa4
DM
2737 char *strbeg,
2738 char *strend,
2739 SV *sv,
2740 U32 flags,
2741 bool utf8_target)
2742{
2743 struct regexp *const prog = ReANY(rx);
2744
60165aa4
DM
2745 if (flags & REXEC_COPY_STR) {
2746#ifdef PERL_ANY_COW
2747 if (SvCANCOW(sv)) {
eb8fc9fe 2748 DEBUG_C(Perl_re_printf( aTHX_
60165aa4 2749 "Copy on write: regexp capture, type %d\n",
eb8fc9fe 2750 (int) SvTYPE(sv)));
5411a0e5
DM
2751 /* Create a new COW SV to share the match string and store
2752 * in saved_copy, unless the current COW SV in saved_copy
2753 * is valid and suitable for our purpose */
2754 if (( prog->saved_copy
2755 && SvIsCOW(prog->saved_copy)
2756 && SvPOKp(prog->saved_copy)
2757 && SvIsCOW(sv)
2758 && SvPOKp(sv)
2759 && SvPVX(sv) == SvPVX(prog->saved_copy)))
a76b0e90 2760 {
5411a0e5
DM
2761 /* just reuse saved_copy SV */
2762 if (RXp_MATCH_COPIED(prog)) {
2763 Safefree(prog->subbeg);
2764 RXp_MATCH_COPIED_off(prog);
2765 }
2766 }
2767 else {
2768 /* create new COW SV to share string */
a76b0e90
DM
2769 RX_MATCH_COPY_FREE(rx);
2770 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
a76b0e90 2771 }
5411a0e5
DM
2772 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2773 assert (SvPOKp(prog->saved_copy));
60165aa4
DM
2774 prog->sublen = strend - strbeg;
2775 prog->suboffset = 0;
2776 prog->subcoffset = 0;
2777 } else
2778#endif
2779 {
99a90e59
FC
2780 SSize_t min = 0;
2781 SSize_t max = strend - strbeg;
ea3daa5d 2782 SSize_t sublen;
60165aa4
DM
2783
2784 if ( (flags & REXEC_COPY_SKIP_POST)
e322109a 2785 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2786 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2787 ) { /* don't copy $' part of string */
2788 U32 n = 0;
2789 max = -1;
2790 /* calculate the right-most part of the string covered
f67a5002 2791 * by a capture. Due to lookahead, this may be to
60165aa4
DM
2792 * the right of $&, so we have to scan all captures */
2793 while (n <= prog->lastparen) {
2794 if (prog->offs[n].end > max)
2795 max = prog->offs[n].end;
2796 n++;
2797 }
2798 if (max == -1)
2799 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2800 ? prog->offs[0].start
2801 : 0;
2802 assert(max >= 0 && max <= strend - strbeg);
2803 }
2804
2805 if ( (flags & REXEC_COPY_SKIP_PRE)
e322109a 2806 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
60165aa4
DM
2807 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2808 ) { /* don't copy $` part of string */
2809 U32 n = 0;
2810 min = max;
2811 /* calculate the left-most part of the string covered
f67a5002 2812 * by a capture. Due to lookbehind, this may be to
60165aa4
DM
2813 * the left of $&, so we have to scan all captures */
2814 while (min && n <= prog->lastparen) {
2815 if ( prog->offs[n].start != -1
2816 && prog->offs[n].start < min)
2817 {
2818 min = prog->offs[n].start;
2819 }
2820 n++;
2821 }
2822 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2823 && min > prog->offs[0].end
2824 )
2825 min = prog->offs[0].end;
2826
2827 }
2828
2829 assert(min >= 0 && min <= max && min <= strend - strbeg);
2830 sublen = max - min;
2831
2832 if (RX_MATCH_COPIED(rx)) {
2833 if (sublen > prog->sublen)
2834 prog->subbeg =
2835 (char*)saferealloc(prog->subbeg, sublen+1);
2836 }
2837 else
2838 prog->subbeg = (char*)safemalloc(sublen+1);
2839 Copy(strbeg + min, prog->subbeg, sublen, char);
2840 prog->subbeg[sublen] = '\0';
2841 prog->suboffset = min;
2842 prog->sublen = sublen;
2843 RX_MATCH_COPIED_on(rx);
2844 }
2845 prog->subcoffset = prog->suboffset;
2846 if (prog->suboffset && utf8_target) {
2847 /* Convert byte offset to chars.
2848 * XXX ideally should only compute this if @-/@+
2849 * has been seen, a la PL_sawampersand ??? */
2850
2851 /* If there's a direct correspondence between the
2852 * string which we're matching and the original SV,
2853 * then we can use the utf8 len cache associated with
2854 * the SV. In particular, it means that under //g,
2855 * sv_pos_b2u() will use the previously cached
2856 * position to speed up working out the new length of
2857 * subcoffset, rather than counting from the start of
2858 * the string each time. This stops
2859 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2860 * from going quadratic */
2861 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
ea3daa5d
FC
2862 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2863 SV_GMAGIC|SV_CONST_RETURN);
60165aa4
DM
2864 else
2865 prog->subcoffset = utf8_length((U8*)strbeg,
2866 (U8*)(strbeg+prog->suboffset));
2867 }
2868 }
2869 else {
2870 RX_MATCH_COPY_FREE(rx);
2871 prog->subbeg = strbeg;
2872 prog->suboffset = 0;
2873 prog->subcoffset = 0;
2874 prog->sublen = strend - strbeg;
2875 }
2876}
2877
2878
2879
fae667d5 2880
6eb5f6b9
JH
2881/*
2882 - regexec_flags - match a regexp against a string
2883 */
2884I32
5aaab254 2885Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
ea3daa5d 2886 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
8fd1a950
DM
2887/* stringarg: the point in the string at which to begin matching */
2888/* strend: pointer to null at end of string */
2889/* strbeg: real beginning of string */
2890/* minend: end of match must be >= minend bytes after stringarg. */
2891/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2892 * itself is accessed via the pointers above */
2893/* data: May be used for some additional optimizations.
d058ec57 2894 Currently unused. */
a340edde 2895/* flags: For optimizations. See REXEC_* in regexp.h */
8fd1a950 2896
6eb5f6b9 2897{
8d919b0a 2898 struct regexp *const prog = ReANY(rx);
5aaab254 2899 char *s;
eb578fdb 2900 regnode *c;
03c83e26 2901 char *startpos;
ea3daa5d
FC
2902 SSize_t minlen; /* must match at least this many chars */
2903 SSize_t dontbother = 0; /* how many characters not to try at end */
f2ed9b32 2904 const bool utf8_target = cBOOL(DO_UTF8(sv));
2757e526 2905 I32 multiline;
f8fc2ecf 2906 RXi_GET_DECL(prog,progi);
02d5137b
DM
2907 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2908 regmatch_info *const reginfo = &reginfo_buf;
e9105d30 2909 regexp_paren_pair *swap = NULL;
006f26b2 2910 I32 oldsave;
a3621e74
YO
2911 GET_RE_DEBUG_FLAGS_DECL;
2912
7918f24d 2913 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
9d4ba2ae 2914 PERL_UNUSED_ARG(data);
6eb5f6b9
JH
2915
2916 /* Be paranoid... */
3dc78631 2917 if (prog == NULL) {
6eb5f6b9 2918 Perl_croak(aTHX_ "NULL regexp parameter");
6eb5f6b9
JH
2919 }
2920
6c3fea77 2921 DEBUG_EXECUTE_r(
03c83e26 2922 debug_start_match(rx, utf8_target, stringarg, strend,
6c3fea77
DM
2923 "Matching");
2924 );
8adc0f72 2925
b342a604
DM
2926 startpos = stringarg;
2927
4cf1a867
DM
2928 /* set these early as they may be used by the HOP macros below */
2929 reginfo->strbeg = strbeg;
2930 reginfo->strend = strend;
2931 reginfo->is_utf8_target = cBOOL(utf8_target);
2932
58430ea8 2933 if (prog->intflags & PREGf_GPOS_SEEN) {
d307c076
DM
2934 MAGIC *mg;
2935
fef7148b
DM
2936 /* set reginfo->ganch, the position where \G can match */
2937
2938 reginfo->ganch =
2939 (flags & REXEC_IGNOREPOS)
2940 ? stringarg /* use start pos rather than pos() */
3dc78631 2941 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
25fdce4a
FC
2942 /* Defined pos(): */
2943 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
fef7148b
DM
2944 : strbeg; /* pos() not defined; use start of string */
2945
6ad9a8ab 2946 DEBUG_GPOS_r(Perl_re_printf( aTHX_
147e3846 2947 "GPOS ganch set to strbeg[%" IVdf "]\n", (IV)(reginfo->ganch - strbeg)));
fef7148b 2948
03c83e26
DM
2949 /* in the presence of \G, we may need to start looking earlier in
2950 * the string than the suggested start point of stringarg:
0b2c2a84 2951 * if prog->gofs is set, then that's a known, fixed minimum
03c83e26
DM
2952 * offset, such as
2953 * /..\G/: gofs = 2
2954 * /ab|c\G/: gofs = 1
2955 * or if the minimum offset isn't known, then we have to go back
2956 * to the start of the string, e.g. /w+\G/
2957 */
2bfbe302 2958
8e1490ee 2959 if (prog->intflags & PREGf_ANCH_GPOS) {
4cf1a867
DM
2960 if (prog->gofs) {
2961 startpos = HOPBACKc(reginfo->ganch, prog->gofs);
2962 if (!startpos ||
2963 ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
2964 {
6ad9a8ab 2965 DEBUG_r(Perl_re_printf( aTHX_
4cf1a867
DM
2966 "fail: ganch-gofs before earliest possible start\n"));
2967 return 0;
2968 }
2bfbe302 2969 }
4cf1a867
DM
2970 else
2971 startpos = reginfo->ganch;
2bfbe302
DM
2972 }
2973 else if (prog->gofs) {
4cf1a867
DM
2974 startpos = HOPBACKc(startpos, prog->gofs);
2975 if (!startpos)
b342a604 2976 startpos = strbeg;
03c83e26 2977 }
58430ea8 2978 else if (prog->intflags & PREGf_GPOS_FLOAT)
b342a604 2979 startpos = strbeg;
03c83e26
DM
2980 }
2981
2982 minlen = prog->minlen;
b342a604 2983 if ((startpos + minlen) > strend || startpos < strbeg) {
6ad9a8ab 2984 DEBUG_r(Perl_re_printf( aTHX_
03c83e26
DM
2985 "Regex match can't succeed, so not even tried\n"));
2986 return 0;
2987 }
2988
63a3746a
DM
2989 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2990 * which will call destuctors to reset PL_regmatch_state, free higher
2991 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2992 * regmatch_info_aux_eval */
2993
2994 oldsave = PL_savestack_ix;
2995
dfa77d06
DM
2996 s = startpos;
2997
e322109a 2998 if ((prog->extflags & RXf_USE_INTUIT)
7fadf4a7
DM
2999 && !(flags & REXEC_CHECKED))
3000 {
dfa77d06 3001 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
7fadf4a7 3002 flags, NULL);
dfa77d06 3003 if (!s)
7fadf4a7
DM
3004 return 0;
3005
e322109a 3006 if (prog->extflags & RXf_CHECK_ALL) {
7fadf4a7
DM
3007 /* we can match based purely on the result of INTUIT.
3008 * Set up captures etc just for $& and $-[0]
3009 * (an intuit-only match wont have $1,$2,..) */
3010 assert(!prog->nparens);
d5e7783a
DM
3011
3012 /* s/// doesn't like it if $& is earlier than where we asked it to
3013 * start searching (which can happen on something like /.\G/) */
3014 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3015 && (s < stringarg))
3016 {
3017 /* this should only be possible under \G */
58430ea8 3018 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3019 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3020 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3021 goto phooey;
3022 }
3023
7fadf4a7
DM
3024 /* match via INTUIT shouldn't have any captures.
3025 * Let @-, @+, $^N know */
3026 prog->lastparen = prog->lastcloseparen = 0;
3027 RX_MATCH_UTF8_set(rx, utf8_target);
3ff69bd6
DM
3028 prog->offs[0].start = s - strbeg;
3029 prog->offs[0].end = utf8_target
3030 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
3031 : s - strbeg + prog->minlenret;
7fadf4a7 3032 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3033 S_reg_set_capture_string(aTHX_ rx,
7fadf4a7
DM
3034 strbeg, strend,
3035 sv, flags, utf8_target);
3036
7fadf4a7
DM
3037 return 1;
3038 }
3039 }
3040
6c3fea77 3041 multiline = prog->extflags & RXf_PMf_MULTILINE;
1de06328 3042
dfa77d06 3043 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
6ad9a8ab 3044 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
a72c7584
JH
3045 "String too short [regexec_flags]...\n"));
3046 goto phooey;
1aa99e6b 3047 }
1de06328 3048
6eb5f6b9 3049 /* Check validity of program. */
f8fc2ecf 3050 if (UCHARAT(progi->program) != REG_MAGIC) {
6eb5f6b9
JH
3051 Perl_croak(aTHX_ "corrupted regexp program");
3052 }
3053
1738e041 3054 RX_MATCH_TAINTED_off(rx);
ab4e48c1 3055 RX_MATCH_UTF8_set(rx, utf8_target);
1738e041 3056
6c3fea77
DM
3057 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
3058 reginfo->intuit = 0;
02d5137b
DM
3059 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
3060 reginfo->warned = FALSE;
02d5137b 3061 reginfo->sv = sv;
1cb48e53 3062 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
6eb5f6b9 3063 /* see how far we have to get to not match where we matched before */
fe3974be 3064 reginfo->till = stringarg + minend;
6eb5f6b9 3065
60779a30 3066 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
82c23608
FC
3067 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
3068 S_cleanup_regmatch_info_aux has executed (registered by
3069 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
3070 magic belonging to this SV.
3071 Not newSVsv, either, as it does not COW.
3072 */
3073 reginfo->sv = newSV(0);
4cba5ac0 3074 SvSetSV_nosteal(reginfo->sv, sv);
82c23608
FC
3075 SAVEFREESV(reginfo->sv);
3076 }
3077
331b2dcc
DM
3078 /* reserve next 2 or 3 slots in PL_regmatch_state:
3079 * slot N+0: may currently be in use: skip it
3080 * slot N+1: use for regmatch_info_aux struct
3081 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
3082 * slot N+3: ready for use by regmatch()
3083 */
bf2039a9 3084
331b2dcc
DM
3085 {
3086 regmatch_state *old_regmatch_state;
3087 regmatch_slab *old_regmatch_slab;
3088 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
3089
3090 /* on first ever match, allocate first slab */
3091 if (!PL_regmatch_slab) {
3092 Newx(PL_regmatch_slab, 1, regmatch_slab);
3093 PL_regmatch_slab->prev = NULL;
3094 PL_regmatch_slab->next = NULL;
3095 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3096 }
bf2039a9 3097
331b2dcc
DM
3098 old_regmatch_state = PL_regmatch_state;
3099 old_regmatch_slab = PL_regmatch_slab;
bf2039a9 3100
331b2dcc
DM
3101 for (i=0; i <= max; i++) {
3102 if (i == 1)
3103 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3104 else if (i ==2)
3105 reginfo->info_aux_eval =
3106 reginfo->info_aux->info_aux_eval =
3107 &(PL_regmatch_state->u.info_aux_eval);
bf2039a9 3108
331b2dcc
DM
3109 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3110 PL_regmatch_state = S_push_slab(aTHX);
3111 }
bf2039a9 3112
331b2dcc
DM
3113 /* note initial PL_regmatch_state position; at end of match we'll
3114 * pop back to there and free any higher slabs */
bf2039a9 3115
331b2dcc
DM
3116 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3117 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2ac8ff4b 3118 reginfo->info_aux->poscache = NULL;
bf2039a9 3119
331b2dcc 3120 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
bf2039a9 3121
331b2dcc
DM
3122 if ((prog->extflags & RXf_EVAL_SEEN))
3123 S_setup_eval_state(aTHX_ reginfo);
3124 else
3125 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
bf2039a9 3126 }
d3aa529c 3127
6eb5f6b9 3128 /* If there is a "must appear" string, look for it. */
6eb5f6b9 3129
288b8c02 3130 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
e9105d30
GG
3131 /* We have to be careful. If the previous successful match
3132 was from this regex we don't want a subsequent partially
3133 successful match to clobber the old results.
3134 So when we detect this possibility we add a swap buffer
d8da0584
KW
3135 to the re, and switch the buffer each match. If we fail,
3136 we switch it back; otherwise we leave it swapped.
e9105d30
GG
3137 */
3138 swap = prog->offs;
3139 /* do we need a save destructor here for eval dies? */
3140 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2b1a3689 3141 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 3142 "rex=0x%" UVxf " saving offs: orig=0x%" UVxf " new=0x%" UVxf "\n",
2b1a3689
YO
3143 0,
3144 PTR2UV(prog),
495f47a5
DM
3145 PTR2UV(swap),
3146 PTR2UV(prog->offs)
3147 ));
c74340f9 3148 }
6eb5f6b9 3149
ba6840fb
YO
3150 if (prog->recurse_locinput)
3151 Zero(prog->recurse_locinput,prog->nparens + 1, char *);
3152
0fa70a06
DM
3153 /* Simplest case: anchored match need be tried only once, or with
3154 * MBOL, only at the beginning of each line.
3155 *
3156 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3157 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3158 * match at the start of the string then it won't match anywhere else
3159 * either; while with /.*.../, if it doesn't match at the beginning,
3160 * the earliest it could match is at the start of the next line */
3161
8e1490ee 3162 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
0fa70a06
DM
3163 char *end;
3164
3165 if (regtry(reginfo, &s))
6eb5f6b9 3166 goto got_it;
0fa70a06
DM
3167
3168 if (!(prog->intflags & PREGf_ANCH_MBOL))
3169 goto phooey;
3170
3171 /* didn't match at start, try at other newline positions */
3172
3173 if (minlen)
3174 dontbother = minlen - 1;
3175 end = HOP3c(strend, -dontbother, strbeg) - 1;
3176
3177 /* skip to next newline */
3178
3179 while (s <= end) { /* note it could be possible to match at the end of the string */
3180 /* NB: newlines are the same in unicode as they are in latin */
3181 if (*s++ != '\n')
3182 continue;
3183 if (prog->check_substr || prog->check_utf8) {
3184 /* note that with PREGf_IMPLICIT, intuit can only fail
3185 * or return the start position, so it's of limited utility.
3186 * Nevertheless, I made the decision that the potential for
3187 * quick fail was still worth it - DAPM */
3188 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3189 if (!s)
3190 goto phooey;
3191 }
3192 if (regtry(reginfo, &s))
3193 goto got_it;
3194 }
3195 goto phooey;
3196 } /* end anchored search */
3197
3198 if (prog->intflags & PREGf_ANCH_GPOS)
f9f4320a 3199 {
a8430a8b
YO
3200 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3201 assert(prog->intflags & PREGf_GPOS_SEEN);
2bfbe302
DM
3202 /* For anchored \G, the only position it can match from is
3203 * (ganch-gofs); we already set startpos to this above; if intuit
3204 * moved us on from there, we can't possibly succeed */
4cf1a867 3205 assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
2bfbe302 3206 if (s == startpos && regtry(reginfo, &s))
6eb5f6b9
JH
3207 goto got_it;
3208 goto phooey;
3209 }
3210
3211 /* Messy cases: unanchored match. */
bbe252da 3212 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
6eb5f6b9 3213 /* we have /x+whatever/ */
984e6dd1 3214 /* it must be a one character string (XXXX Except is_utf8_pat?) */
33b8afdf 3215 char ch;
bf93d4cc
GS
3216#ifdef DEBUGGING
3217 int did_match = 0;
3218#endif
f2ed9b32 3219 if (utf8_target) {
7e0d5ad7
KW
3220 if (! prog->anchored_utf8) {
3221 to_utf8_substr(prog);
3222 }
3223 ch = SvPVX_const(prog->anchored_utf8)[0];
4cadc6a9 3224 REXEC_FBC_SCAN(
6eb5f6b9 3225 if (*s == ch) {
a3621e74 3226 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3227 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3228 s += UTF8SKIP(s);
3229 while (s < strend && *s == ch)
3230 s += UTF8SKIP(s);
3231 }
4cadc6a9 3232 );
7e0d5ad7 3233
6eb5f6b9
JH
3234 }
3235 else {
7e0d5ad7
KW
3236 if (! prog->anchored_substr) {
3237 if (! to_byte_substr(prog)) {
6b54ddc5 3238 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3239 }
3240 }
3241 ch = SvPVX_const(prog->anchored_substr)[0];
4cadc6a9 3242 REXEC_FBC_SCAN(
6eb5f6b9 3243 if (*s == ch) {
a3621e74 3244 DEBUG_EXECUTE_r( did_match = 1 );
02d5137b 3245 if (regtry(reginfo, &s)) goto got_it;
6eb5f6b9
JH
3246 s++;
3247 while (s < strend && *s == ch)
3248 s++;
3249 }
4cadc6a9 3250 );
6eb5f6b9 3251 }
a3621e74 3252 DEBUG_EXECUTE_r(if (!did_match)
6ad9a8ab 3253 Perl_re_printf( aTHX_
b7953727
JH
3254 "Did not find anchored character...\n")
3255 );
6eb5f6b9 3256 }
a0714e2c
SS
3257 else if (prog->anchored_substr != NULL
3258 || prog->anchored_utf8 != NULL
3259 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
33b8afdf
JH
3260 && prog->float_max_offset < strend - s)) {
3261 SV *must;
ea3daa5d
FC
3262 SSize_t back_max;
3263 SSize_t back_min;
33b8afdf 3264 char *last;
6eb5f6b9 3265 char *last1; /* Last position checked before */
bf93d4cc
GS
3266#ifdef DEBUGGING
3267 int did_match = 0;
3268#endif
33b8afdf 3269 if (prog->anchored_substr || prog->anchored_utf8) {
7e0d5ad7
KW
3270 if (utf8_target) {
3271 if (! prog->anchored_utf8) {
3272 to_utf8_substr(prog);
3273 }
3274 must = prog->anchored_utf8;
3275 }
3276 else {
3277 if (! prog->anchored_substr) {
3278 if (! to_byte_substr(prog)) {
6b54ddc5 3279 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3280 }
3281 }
3282 must = prog->anchored_substr;
3283 }
33b8afdf
JH
3284 back_max = back_min = prog->anchored_offset;
3285 } else {
7e0d5ad7
KW
3286 if (utf8_target) {
3287 if (! prog->float_utf8) {
3288 to_utf8_substr(prog);
3289 }
3290 must = prog->float_utf8;
3291 }
3292 else {
3293 if (! prog->float_substr) {
3294 if (! to_byte_substr(prog)) {
6b54ddc5 3295 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3296 }
3297 }
3298 must = prog->float_substr;
3299 }
33b8afdf
JH
3300 back_max = prog->float_max_offset;
3301 back_min = prog->float_min_offset;
3302 }
1de06328 3303
1de06328
YO
3304 if (back_min<0) {
3305 last = strend;
3306 } else {
3307 last = HOP3c(strend, /* Cannot start after this */
ea3daa5d 3308 -(SSize_t)(CHR_SVLEN(must)
1de06328
YO
3309 - (SvTAIL(must) != 0) + back_min), strbeg);
3310 }
9d9163fb 3311 if (s > reginfo->strbeg)
6eb5f6b9
JH
3312 last1 = HOPc(s, -1);
3313 else
3314 last1 = s - 1; /* bogus */
3315
a0288114 3316 /* XXXX check_substr already used to find "s", can optimize if
6eb5f6b9 3317 check_substr==must. */
bf05793b 3318 dontbother = 0;
6eb5f6b9
JH
3319 strend = HOPc(strend, -dontbother);
3320 while ( (s <= last) &&
e50d57d4 3321 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
9041c2e3 3322 (unsigned char*)strend, must,
c33e64f0 3323 multiline ? FBMrf_MULTILINE : 0)) ) {
a3621e74 3324 DEBUG_EXECUTE_r( did_match = 1 );
6eb5f6b9
JH
3325 if (HOPc(s, -back_max) > last1) {
3326 last1 = HOPc(s, -back_min);
3327 s = HOPc(s, -back_max);
3328 }
3329 else {
9d9163fb
DM
3330 char * const t = (last1 >= reginfo->strbeg)
3331 ? HOPc(last1, 1) : last1 + 1;
6eb5f6b9
JH
3332
3333 last1 = HOPc(s, -back_min);
52657f30 3334 s = t;
6eb5f6b9 3335 }
f2ed9b32 3336 if (utf8_target) {
6eb5f6b9 3337 while (s <= last1) {
02d5137b 3338 if (regtry(reginfo, &s))
6eb5f6b9 3339 goto got_it;
7016d6eb
DM
3340 if (s >= last1) {
3341 s++; /* to break out of outer loop */
3342 break;
3343 }
3344 s += UTF8SKIP(s);
6eb5f6b9
JH
3345 }
3346 }
3347 else {
3348 while (s <= last1) {
02d5137b 3349 if (regtry(reginfo, &s))
6eb5f6b9
JH
3350 goto got_it;
3351 s++;
3352 }
3353 }
3354 }
ab3bbdeb 3355 DEBUG_EXECUTE_r(if (!did_match) {
f2ed9b32 3356 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
ab3bbdeb 3357 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
6ad9a8ab 3358 Perl_re_printf( aTHX_ "Did not find %s substr %s%s...\n",
33b8afdf 3359 ((must == prog->anchored_substr || must == prog->anchored_utf8)
bf93d4cc 3360 ? "anchored" : "floating"),
ab3bbdeb
YO
3361 quoted, RE_SV_TAIL(must));
3362 });
6eb5f6b9
JH
3363 goto phooey;
3364 }
f8fc2ecf 3365 else if ( (c = progi->regstclass) ) {
f14c76ed 3366 if (minlen) {
f8fc2ecf 3367 const OPCODE op = OP(progi->regstclass);
66e933ab 3368 /* don't bother with what can't match */
33c28ab2 3369 if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
f14c76ed
RGS
3370 strend = HOPc(strend, -(minlen - 1));
3371 }
a3621e74 3372 DEBUG_EXECUTE_r({
be8e71aa 3373 SV * const prop = sv_newmortal();
8b9781c9 3374 regprop(prog, prop, c, reginfo, NULL);
0df25f3d 3375 {
f2ed9b32 3376 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb 3377 s,strend-s,60);
6ad9a8ab 3378 Perl_re_printf( aTHX_
1c8f8eb1 3379 "Matching stclass %.*s against %s (%d bytes)\n",
e4f74956 3380 (int)SvCUR(prop), SvPVX_const(prop),
ab3bbdeb 3381 quoted, (int)(strend - s));
0df25f3d 3382 }
ffc61ed2 3383 });
f9176b44 3384 if (find_byclass(prog, c, s, strend, reginfo))
6eb5f6b9 3385 goto got_it;
6ad9a8ab 3386 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "Contradicts stclass... [regexec_flags]\n"));
d6a28714
JH
3387 }
3388 else {
3389 dontbother = 0;
a0714e2c 3390 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
33b8afdf 3391 /* Trim the end. */
6af40bd7 3392 char *last= NULL;
33b8afdf 3393 SV* float_real;
c33e64f0
FC
3394 STRLEN len;
3395 const char *little;
33b8afdf 3396
7e0d5ad7
KW
3397 if (utf8_target) {
3398 if (! prog->float_utf8) {
3399 to_utf8_substr(prog);
3400 }
3401 float_real = prog->float_utf8;
3402 }
3403 else {
3404 if (! prog->float_substr) {
3405 if (! to_byte_substr(prog)) {
6b54ddc5 3406 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
7e0d5ad7
KW
3407 }
3408 }
3409 float_real = prog->float_substr;
3410 }
d6a28714 3411
c33e64f0
FC
3412 little = SvPV_const(float_real, len);
3413 if (SvTAIL(float_real)) {
7f18ad16
KW
3414 /* This means that float_real contains an artificial \n on
3415 * the end due to the presence of something like this:
3416 * /foo$/ where we can match both "foo" and "foo\n" at the
3417 * end of the string. So we have to compare the end of the
3418 * string first against the float_real without the \n and
3419 * then against the full float_real with the string. We
3420 * have to watch out for cases where the string might be
3421 * smaller than the float_real or the float_real without
3422 * the \n. */
1a13b075
YO
3423 char *checkpos= strend - len;
3424 DEBUG_OPTIMISE_r(
6ad9a8ab 3425 Perl_re_printf( aTHX_
1a13b075
YO
3426 "%sChecking for float_real.%s\n",
3427 PL_colors[4], PL_colors[5]));
3428 if (checkpos + 1 < strbeg) {
7f18ad16
KW
3429 /* can't match, even if we remove the trailing \n
3430 * string is too short to match */
1a13b075 3431 DEBUG_EXECUTE_r(
6ad9a8ab 3432 Perl_re_printf( aTHX_
1a13b075
YO
3433 "%sString shorter than required trailing substring, cannot match.%s\n",
3434 PL_colors[4], PL_colors[5]));
3435 goto phooey;
3436 } else if (memEQ(checkpos + 1, little, len - 1)) {
7f18ad16
KW
3437 /* can match, the end of the string matches without the
3438 * "\n" */
1a13b075
YO
3439 last = checkpos + 1;
3440 } else if (checkpos < strbeg) {
7f18ad16
KW
3441 /* cant match, string is too short when the "\n" is
3442 * included */
1a13b075 3443 DEBUG_EXECUTE_r(
6ad9a8ab 3444 Perl_re_printf( aTHX_
1a13b075
YO
3445 "%sString does not contain required trailing substring, cannot match.%s\n",
3446 PL_colors[4], PL_colors[5]));
3447 goto phooey;
3448 } else if (!multiline) {
7f18ad16
KW
3449 /* non multiline match, so compare with the "\n" at the
3450 * end of the string */
1a13b075
YO
3451 if (memEQ(checkpos, little, len)) {
3452 last= checkpos;
3453 } else {
3454 DEBUG_EXECUTE_r(
6ad9a8ab 3455 Perl_re_printf( aTHX_
1a13b075
YO
3456 "%sString does not contain required trailing substring, cannot match.%s\n",
3457 PL_colors[4], PL_colors[5]));
3458 goto phooey;
3459 }
3460 } else {
7f18ad16
KW
3461 /* multiline match, so we have to search for a place
3462 * where the full string is located */
d6a28714 3463 goto find_last;
1a13b075 3464 }
c33e64f0 3465 } else {
d6a28714 3466 find_last:
9041c2e3 3467 if (len)
d6a28714 3468 last = rninstr(s, strend, little, little + len);
b8c5462f 3469 else
a0288114 3470 last = strend; /* matching "$" */
b8c5462f 3471 }
6af40bd7 3472 if (!last) {
7f18ad16
KW
3473 /* at one point this block contained a comment which was
3474 * probably incorrect, which said that this was a "should not
3475 * happen" case. Even if it was true when it was written I am
3476 * pretty sure it is not anymore, so I have removed the comment
3477 * and replaced it with this one. Yves */
6bda09f9 3478 DEBUG_EXECUTE_r(
6ad9a8ab 3479 Perl_re_printf( aTHX_
b729e729
YO
3480 "%sString does not contain required substring, cannot match.%s\n",
3481 PL_colors[4], PL_colors[5]
6af40bd7
YO
3482 ));
3483 goto phooey;
bf93d4cc 3484 }
d6a28714
JH
3485 dontbother = strend - last + prog->float_min_offset;
3486 }
3487 if (minlen && (dontbother < minlen))
3488 dontbother = minlen - 1;
3489 strend -= dontbother; /* this one's always in bytes! */
3490 /* We don't know much -- general case. */
f2ed9b32 3491 if (utf8_target) {
d6a28714 3492 for (;;) {
02d5137b 3493 if (regtry(reginfo, &s))
d6a28714
JH
3494 goto got_it;
3495 if (s >= strend)
3496 break;
b8c5462f 3497 s += UTF8SKIP(s);
d6a28714
JH
3498 };
3499 }
3500 else {
3501 do {
02d5137b 3502 if (regtry(reginfo, &s))
d6a28714
JH
3503 goto got_it;
3504 } while (s++ < strend);
3505 }
3506 }
3507
3508 /* Failure. */
3509 goto phooey;
3510
7b52d656 3511 got_it:
d5e7783a
DM
3512 /* s/// doesn't like it if $& is earlier than where we asked it to
3513 * start searching (which can happen on something like /.\G/) */
3514 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3515 && (prog->offs[0].start < stringarg - strbeg))
3516 {
3517 /* this should only be possible under \G */
58430ea8 3518 assert(prog->intflags & PREGf_GPOS_SEEN);
6ad9a8ab 3519 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
d5e7783a
DM
3520 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3521 goto phooey;
3522 }
3523
495f47a5
DM
3524 DEBUG_BUFFERS_r(
3525 if (swap)
2b1a3689 3526 Perl_re_exec_indentf( aTHX_
147e3846 3527 "rex=0x%" UVxf " freeing offs: 0x%" UVxf "\n",
2b1a3689
YO
3528 0,
3529 PTR2UV(prog),
495f47a5
DM
3530 PTR2UV(swap)
3531 );
3532 );
e9105d30 3533 Safefree(swap);
d6a28714 3534
bf2039a9
DM
3535 /* clean up; this will trigger destructors that will free all slabs
3536 * above the current one, and cleanup the regmatch_info_aux
3537 * and regmatch_info_aux_eval sructs */
8adc0f72 3538
006f26b2
DM
3539 LEAVE_SCOPE(oldsave);
3540
5daac39c
NC
3541 if (RXp_PAREN_NAMES(prog))
3542 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
d6a28714
JH
3543
3544 /* make sure $`, $&, $', and $digit will work later */
60165aa4 3545 if ( !(flags & REXEC_NOT_FIRST) )
749f4950 3546 S_reg_set_capture_string(aTHX_ rx,
60165aa4
DM
3547 strbeg, reginfo->strend,
3548 sv, flags, utf8_target);
9041c2e3 3549
d6a28714
JH
3550 return 1;
3551
7b52d656 3552 phooey:
6ad9a8ab 3553 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch failed%s\n",
e4584336 3554 PL_colors[4], PL_colors[5]));
8adc0f72 3555
bf2039a9
DM
3556 /* clean up; this will trigger destructors that will free all slabs
3557 * above the current one, and cleanup the regmatch_info_aux
3558 * and regmatch_info_aux_eval sructs */
8adc0f72 3559
006f26b2
DM
3560 LEAVE_SCOPE(oldsave);
3561
e9105d30 3562 if (swap) {
c74340f9 3563 /* we failed :-( roll it back */
2b1a3689 3564 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 3565 "rex=0x%" UVxf " rolling back offs: freeing=0x%" UVxf " restoring=0x%" UVxf "\n",
2b1a3689
YO
3566 0,
3567 PTR2UV(prog),
495f47a5
DM
3568 PTR2UV(prog->offs),
3569 PTR2UV(swap)
3570 ));
e9105d30
GG
3571 Safefree(prog->offs);
3572 prog->offs = swap;
3573 }
d6a28714
JH
3574 return 0;
3575}
3576
6bda09f9 3577
b3d298be 3578/* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
ec43f78b 3579 * Do inc before dec, in case old and new rex are the same */
baa60164 3580#define SET_reg_curpm(Re2) \
bf2039a9 3581 if (reginfo->info_aux_eval) { \
ec43f78b
DM
3582 (void)ReREFCNT_inc(Re2); \
3583 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3584 PM_SETRE((PL_reg_curpm), (Re2)); \
3585 }
3586
3587
d6a28714
JH
3588/*
3589 - regtry - try match at specific point
3590 */
63f46dab 3591STATIC bool /* 0 failure, 1 success */
f73aaa43 3592S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
d6a28714 3593{
d6a28714 3594 CHECKPOINT lastcp;
288b8c02 3595 REGEXP *const rx = reginfo->prog;
8d919b0a 3596 regexp *const prog = ReANY(rx);
99a90e59 3597 SSize_t result;
cb41e5d6
YO
3598#ifdef DEBUGGING
3599 U32 depth = 0; /* used by REGCP_SET */
3600#endif
f8fc2ecf 3601 RXi_GET_DECL(prog,progi);
a3621e74 3602 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
3603
3604 PERL_ARGS_ASSERT_REGTRY;
3605
24b23f37 3606 reginfo->cutpoint=NULL;
d6a28714 3607
9d9163fb 3608 prog->offs[0].start = *startposp - reginfo->strbeg;
d6a28714 3609 prog->lastparen = 0;
03994de8 3610 prog->lastcloseparen = 0;
d6a28714
JH
3611
3612 /* XXXX What this code is doing here?!!! There should be no need
b93070ed 3613 to do this again and again, prog->lastparen should take care of
3dd2943c 3614 this! --ilya*/
dafc8851
JH
3615
3616 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3617 * Actually, the code in regcppop() (which Ilya may be meaning by
b93070ed 3618 * prog->lastparen), is not needed at all by the test suite
225593e1
DM
3619 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3620 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3621 * Meanwhile, this code *is* needed for the
daf18116
JH
3622 * above-mentioned test suite tests to succeed. The common theme
3623 * on those tests seems to be returning null fields from matches.
225593e1 3624 * --jhi updated by dapm */
acfafe8c
YO
3625
3626 /* After encountering a variant of the issue mentioned above I think
3627 * the point Ilya was making is that if we properly unwind whenever
3628 * we set lastparen to a smaller value then we should not need to do
3629 * this every time, only when needed. So if we have tests that fail if
3630 * we remove this, then it suggests somewhere else we are improperly
3631 * unwinding the lastparen/paren buffers. See UNWIND_PARENS() and
3632 * places it is called, and related regcp() routines. - Yves */
dafc8851 3633#if 1
d6a28714 3634 if (prog->nparens) {
b93070ed 3635 regexp_paren_pair *pp = prog->offs;
eb578fdb 3636 I32 i;
b93070ed 3637 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
f0ab9afb
NC
3638 ++pp;
3639 pp->start = -1;
3640 pp->end = -1;
d6a28714
JH
3641 }
3642 }
dafc8851 3643#endif
02db2b7b 3644 REGCP_SET(lastcp);
f73aaa43
DM
3645 result = regmatch(reginfo, *startposp, progi->program + 1);
3646 if (result != -1) {
3647 prog->offs[0].end = result;
d6a28714
JH
3648 return 1;
3649 }
24b23f37 3650 if (reginfo->cutpoint)
f73aaa43 3651 *startposp= reginfo->cutpoint;
02db2b7b 3652 REGCP_UNWIND(lastcp);
d6a28714
JH
3653 return 0;
3654}
3655
02db2b7b 3656
8ba1375e
MJD
3657#define sayYES goto yes
3658#define sayNO goto no
262b90c4 3659#define sayNO_SILENT goto no_silent
8ba1375e 3660
f9f4320a
YO
3661/* we dont use STMT_START/END here because it leads to
3662 "unreachable code" warnings, which are bogus, but distracting. */
3663#define CACHEsayNO \
c476f425 3664 if (ST.cache_mask) \
2ac8ff4b 3665 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
f9f4320a 3666 sayNO
3298f257 3667
a3621e74 3668/* this is used to determine how far from the left messages like
cb41e5d6
YO
3669 'failed...' are printed in regexec.c. It should be set such that
3670 messages are inline with the regop output that created them.
a3621e74 3671*/
cb41e5d6 3672#define REPORT_CODE_OFF 29
daeb874b 3673#define INDENT_CHARS(depth) ((int)(depth) % 20)
cb41e5d6
YO
3674#ifdef DEBUGGING
3675int
7b031478 3676Perl_re_exec_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
3677{
3678 va_list ap;
3679 int result;
3680 PerlIO *f= Perl_debug_log;
7b031478 3681 PERL_ARGS_ASSERT_RE_EXEC_INDENTF;
cb41e5d6 3682 va_start(ap, depth);
147e3846 3683 PerlIO_printf(f, "%*s|%4" UVuf "| %*s", REPORT_CODE_OFF, "", (UV)depth, INDENT_CHARS(depth), "" );
cb41e5d6
YO
3684 result = PerlIO_vprintf(f, fmt, ap);
3685 va_end(ap);
3686 return result;
3687}
3688#endif /* DEBUGGING */
a3621e74
YO
3689
3690
40a82448
DM
3691#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3692#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
79a2a0e8
KW
3693#define CHRTEST_NOT_A_CP_1 -999
3694#define CHRTEST_NOT_A_CP_2 -998
9e137952 3695
5d9a96ca
DM
3696/* grab a new slab and return the first slot in it */
3697
3698STATIC regmatch_state *
3699S_push_slab(pTHX)
3700{
3701 regmatch_slab *s = PL_regmatch_slab->next;
3702 if (!s) {
3703 Newx(s, 1, regmatch_slab);
3704 s->prev = PL_regmatch_slab;
3705 s->next = NULL;
3706 PL_regmatch_slab->next = s;
3707 }
3708 PL_regmatch_slab = s;
86545054 3709 return SLAB_FIRST(s);
5d9a96ca 3710}
5b47454d 3711
95b24440 3712
40a82448
DM
3713/* push a new state then goto it */
3714
4d5016e5
DM
3715#define PUSH_STATE_GOTO(state, node, input) \
3716 pushinput = input; \
40a82448
DM
3717 scan = node; \
3718 st->resume_state = state; \
3719 goto push_state;
3720
3721/* push a new state with success backtracking, then goto it */
3722
4d5016e5
DM
3723#define PUSH_YES_STATE_GOTO(state, node, input) \
3724 pushinput = input; \
40a82448
DM
3725 scan = node; \
3726 st->resume_state = state; \
3727 goto push_yes_state;
3728
aa283a38 3729
aa283a38 3730
4d5016e5 3731
d6a28714 3732/*
95b24440 3733
bf1f174e
DM
3734regmatch() - main matching routine
3735
3736This is basically one big switch statement in a loop. We execute an op,
3737set 'next' to point the next op, and continue. If we come to a point which
3738we may need to backtrack to on failure such as (A|B|C), we push a
3739backtrack state onto the backtrack stack. On failure, we pop the top
3740state, and re-enter the loop at the state indicated. If there are no more
3741states to pop, we return failure.
3742
3743Sometimes we also need to backtrack on success; for example /A+/, where
3744after successfully matching one A, we need to go back and try to
3745match another one; similarly for lookahead assertions: if the assertion
3746completes successfully, we backtrack to the state just before the assertion
3747and then carry on. In these cases, the pushed state is marked as
3748'backtrack on success too'. This marking is in fact done by a chain of
3749pointers, each pointing to the previous 'yes' state. On success, we pop to
3750the nearest yes state, discarding any intermediate failure-only states.
3751Sometimes a yes state is pushed just to force some cleanup code to be
3752called at the end of a successful match or submatch; e.g. (??{$re}) uses
3753it to free the inner regex.
3754
3755Note that failure backtracking rewinds the cursor position, while
3756success backtracking leaves it alone.
3757
3758A pattern is complete when the END op is executed, while a subpattern
3759such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3760ops trigger the "pop to last yes state if any, otherwise return true"
3761behaviour.
3762
3763A common convention in this function is to use A and B to refer to the two
3764subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3765the subpattern to be matched possibly multiple times, while B is the entire
3766rest of the pattern. Variable and state names reflect this convention.
3767
3768The states in the main switch are the union of ops and failure/success of
3769substates associated with with that op. For example, IFMATCH is the op
3770that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3771'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3772successfully matched A and IFMATCH_A_fail is a state saying that we have
3773just failed to match A. Resume states always come in pairs. The backtrack
3774state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3775at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3776on success or failure.
3777
3778The struct that holds a backtracking state is actually a big union, with
3779one variant for each major type of op. The variable st points to the
3780top-most backtrack struct. To make the code clearer, within each
3781block of code we #define ST to alias the relevant union.
3782
3783Here's a concrete example of a (vastly oversimplified) IFMATCH
3784implementation:
3785
3786 switch (state) {
3787 ....
3788
3789#define ST st->u.ifmatch
3790
3791 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3792 ST.foo = ...; // some state we wish to save
95b24440 3793 ...
bf1f174e
DM
3794 // push a yes backtrack state with a resume value of
3795 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3796 // first node of A:
4d5016e5 3797 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
bf1f174e
DM
3798 // NOTREACHED
3799
3800 case IFMATCH_A: // we have successfully executed A; now continue with B
3801 next = B;
3802 bar = ST.foo; // do something with the preserved value
3803 break;
3804
3805 case IFMATCH_A_fail: // A failed, so the assertion failed
3806 ...; // do some housekeeping, then ...
3807 sayNO; // propagate the failure
3808
3809#undef ST
95b24440 3810
bf1f174e
DM
3811 ...
3812 }
95b24440 3813
bf1f174e
DM
3814For any old-timers reading this who are familiar with the old recursive
3815approach, the code above is equivalent to:
95b24440 3816
bf1f174e
DM
3817 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3818 {
3819 int foo = ...
95b24440 3820 ...
bf1f174e
DM
3821 if (regmatch(A)) {
3822 next = B;
3823 bar = foo;
3824 break;
95b24440 3825 }
bf1f174e
DM
3826 ...; // do some housekeeping, then ...
3827 sayNO; // propagate the failure
95b24440 3828 }
bf1f174e
DM
3829
3830The topmost backtrack state, pointed to by st, is usually free. If you
3831want to claim it, populate any ST.foo fields in it with values you wish to
3832save, then do one of
3833
4d5016e5
DM
3834 PUSH_STATE_GOTO(resume_state, node, newinput);
3835 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
bf1f174e
DM
3836
3837which sets that backtrack state's resume value to 'resume_state', pushes a
3838new free entry to the top of the backtrack stack, then goes to 'node'.
3839On backtracking, the free slot is popped, and the saved state becomes the
3840new free state. An ST.foo field in this new top state can be temporarily
3841accessed to retrieve values, but once the main loop is re-entered, it
3842becomes available for reuse.
3843
3844Note that the depth of the backtrack stack constantly increases during the
3845left-to-right execution of the pattern, rather than going up and down with
3846the pattern nesting. For example the stack is at its maximum at Z at the
3847end of the pattern, rather than at X in the following:
3848
3849 /(((X)+)+)+....(Y)+....Z/
3850
3851The only exceptions to this are lookahead/behind assertions and the cut,
3852(?>A), which pop all the backtrack states associated with A before
3853continuing.
3854
486ec47a 3855Backtrack state structs are allocated in slabs of about 4K in size.
bf1f174e
DM
3856PL_regmatch_state and st always point to the currently active state,
3857and PL_regmatch_slab points to the slab currently containing
3858PL_regmatch_state. The first time regmatch() is called, the first slab is
3859allocated, and is never freed until interpreter destruction. When the slab
3860is full, a new one is allocated and chained to the end. At exit from
3861regmatch(), slabs allocated since entry are freed.
3862
3863*/
95b24440 3864
40a82448 3865
cb41e5d6
YO
3866#define DEBUG_STATE_pp(pp) \
3867 DEBUG_STATE_r({ \
3868 DUMP_EXEC_POS(locinput, scan, utf8_target,depth); \
6ad9a8ab 3869 Perl_re_printf( aTHX_ \
cb41e5d6
YO
3870 "%*s" pp " %s%s%s%s%s\n", \
3871 INDENT_CHARS(depth), "", \
3872 PL_reg_name[st->resume_state], \
3873 ((st==yes_state||st==mark_state) ? "[" : ""), \
3874 ((st==yes_state) ? "Y" : ""), \
3875 ((st==mark_state) ? "M" : ""), \
3876 ((st==yes_state||st==mark_state) ? "]" : "") \
3877 ); \
265c4333 3878 });
5bc10b2c 3879
40a82448 3880
3dab1dad 3881#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
95b24440 3882
3df15adc 3883#ifdef DEBUGGING
5bc10b2c 3884
ab3bbdeb 3885STATIC void
f2ed9b32 3886S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
ab3bbdeb
YO
3887 const char *start, const char *end, const char *blurb)
3888{
efd26800 3889 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
7918f24d
NC
3890
3891 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3892
ab3bbdeb
YO
3893 if (!PL_colorset)
3894 reginitcolors();
3895 {
3896 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
d2c6dc5e 3897 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
ab3bbdeb 3898
f2ed9b32 3899 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
ab3bbdeb
YO
3900 start, end - start, 60);
3901
6ad9a8ab 3902 Perl_re_printf( aTHX_
ab3bbdeb
YO
3903 "%s%s REx%s %s against %s\n",
3904 PL_colors[4], blurb, PL_colors[5], s0, s1);
3905
f2ed9b32 3906 if (utf8_target||utf8_pat)
6ad9a8ab 3907 Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n",
1de06328 3908 utf8_pat ? "pattern" : "",
f2ed9b32
KW
3909 utf8_pat && utf8_target ? " and " : "",
3910 utf8_target ? "string" : ""
ab3bbdeb
YO
3911 );
3912 }
3913}
3df15adc
YO
3914
3915STATIC void
786e8c11
YO
3916S_dump_exec_pos(pTHX_ const char *locinput,
3917 const regnode *scan,
3918 const char *loc_regeol,
3919 const char *loc_bostr,
3920 const char *loc_reg_starttry,
cb41e5d6
YO
3921 const bool utf8_target,
3922 const U32 depth
3923 )
07be1b83 3924{
786e8c11 3925 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
07be1b83 3926 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
786e8c11 3927 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
07be1b83
YO
3928 /* The part of the string before starttry has one color
3929 (pref0_len chars), between starttry and current
3930 position another one (pref_len - pref0_len chars),
3931 after the current position the third one.
3932 We assume that pref0_len <= pref_len, otherwise we
3933 decrease pref0_len. */
786e8c11
YO
3934 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3935 ? (5 + taill) - l : locinput - loc_bostr;
07be1b83
YO
3936 int pref0_len;
3937
7918f24d
NC
3938 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3939
f2ed9b32 3940 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
07be1b83 3941 pref_len++;
786e8c11
YO
3942 pref0_len = pref_len - (locinput - loc_reg_starttry);
3943 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3944 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3945 ? (5 + taill) - pref_len : loc_regeol - locinput);
f2ed9b32 3946 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
07be1b83
YO
3947 l--;
3948 if (pref0_len < 0)
3949 pref0_len = 0;
3950 if (pref0_len > pref_len)
3951 pref0_len = pref_len;
3952 {
33c28ab2 3953 const int is_uni = utf8_target ? 1 : 0;
0df25f3d 3954
ab3bbdeb 3955 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
1de06328 3956 (locinput - pref_len),pref0_len, 60, 4, 5);
0df25f3d 3957
ab3bbdeb 3958 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3df15adc 3959 (locinput - pref_len + pref0_len),
1de06328 3960 pref_len - pref0_len, 60, 2, 3);
0df25f3d 3961
ab3bbdeb 3962 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
1de06328 3963 locinput, loc_regeol - locinput, 10, 0, 1);
0df25f3d 3964
1de06328 3965 const STRLEN tlen=len0+len1+len2;
6ad9a8ab 3966 Perl_re_printf( aTHX_
147e3846 3967 "%4" IVdf " <%.*s%.*s%s%.*s>%*s|%4u| ",
786e8c11 3968 (IV)(locinput - loc_bostr),
07be1b83 3969 len0, s0,
07be1b83 3970 len1, s1,
07be1b83 3971 (docolor ? "" : "> <"),
07be1b83 3972 len2, s2,
f9f4320a 3973 (int)(tlen > 19 ? 0 : 19 - tlen),
cb41e5d6
YO
3974 "",
3975 depth);
07be1b83
YO
3976 }
3977}
3df15adc 3978
07be1b83
YO
3979#endif
3980
0a4db386
YO
3981/* reg_check_named_buff_matched()
3982 * Checks to see if a named buffer has matched. The data array of
3983 * buffer numbers corresponding to the buffer is expected to reside
3984 * in the regexp->data->data array in the slot stored in the ARG() of
3985 * node involved. Note that this routine doesn't actually care about the
3986 * name, that information is not preserved from compilation to execution.
3987 * Returns the index of the leftmost defined buffer with the given name
3988 * or 0 if non of the buffers matched.
3989 */
3990STATIC I32
dc3bf405 3991S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
7918f24d 3992{
0a4db386 3993 I32 n;
f8fc2ecf 3994 RXi_GET_DECL(rex,rexi);
ad64d0ec 3995 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
0a4db386 3996 I32 *nums=(I32*)SvPVX(sv_dat);
7918f24d
NC
3997
3998 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3999
0a4db386 4000 for ( n=0; n<SvIVX(sv_dat); n++ ) {
b93070ed
DM
4001 if ((I32)rex->lastparen >= nums[n] &&
4002 rex->offs[nums[n]].end != -1)
0a4db386
YO
4003 {
4004 return nums[n];
4005 }
4006 }
4007 return 0;
4008}
4009
2f554ef7 4010
c74f6de9 4011static bool
984e6dd1 4012S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
aed7b151 4013 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
c74f6de9 4014{
79a2a0e8
KW
4015 /* This function determines if there are one or two characters that match
4016 * the first character of the passed-in EXACTish node <text_node>, and if
4017 * so, returns them in the passed-in pointers.
c74f6de9 4018 *
79a2a0e8
KW
4019 * If it determines that no possible character in the target string can
4020 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
4021 * the first character in <text_node> requires UTF-8 to represent, and the
4022 * target string isn't in UTF-8.)
c74f6de9 4023 *
79a2a0e8
KW
4024 * If there are more than two characters that could match the beginning of
4025 * <text_node>, or if more context is required to determine a match or not,
4026 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
4027 *
4028 * The motiviation behind this function is to allow the caller to set up
4029 * tight loops for matching. If <text_node> is of type EXACT, there is
4030 * only one possible character that can match its first character, and so
4031 * the situation is quite simple. But things get much more complicated if
4032 * folding is involved. It may be that the first character of an EXACTFish
4033 * node doesn't participate in any possible fold, e.g., punctuation, so it
4034 * can be matched only by itself. The vast majority of characters that are
4035 * in folds match just two things, their lower and upper-case equivalents.
4036 * But not all are like that; some have multiple possible matches, or match
4037 * sequences of more than one character. This function sorts all that out.
4038 *
4039 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
4040 * loop of trying to match A*, we know we can't exit where the thing
4041 * following it isn't a B. And something can't be a B unless it is the
4042 * beginning of B. By putting a quick test for that beginning in a tight
4043 * loop, we can rule out things that can't possibly be B without having to
4044 * break out of the loop, thus avoiding work. Similarly, if A is a single
4045 * character, we can make a tight loop matching A*, using the outputs of
4046 * this function.
4047 *
4048 * If the target string to match isn't in UTF-8, and there aren't
4049 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
4050 * the one or two possible octets (which are characters in this situation)
4051 * that can match. In all cases, if there is only one character that can
4052 * match, *<c1p> and *<c2p> will be identical.
4053 *
4054 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
4055 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
4056 * can match the beginning of <text_node>. They should be declared with at
4057 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
4058 * undefined what these contain.) If one or both of the buffers are
4059 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
4060 * corresponding invariant. If variant, the corresponding *<c1p> and/or
4061 * *<c2p> will be set to a negative number(s) that shouldn't match any code
4062 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
4063 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
c74f6de9 4064
ba44c216 4065 const bool utf8_target = reginfo->is_utf8_target;
79a2a0e8 4066
9cd990bf
JH
4067 UV c1 = (UV)CHRTEST_NOT_A_CP_1;
4068 UV c2 = (UV)CHRTEST_NOT_A_CP_2;
79a2a0e8 4069 bool use_chrtest_void = FALSE;
aed7b151 4070 const bool is_utf8_pat = reginfo->is_utf8_pat;
79a2a0e8
KW
4071
4072 /* Used when we have both utf8 input and utf8 output, to avoid converting
4073 * to/from code points */
4074 bool utf8_has_been_setup = FALSE;
4075
c74f6de9
KW
4076 dVAR;
4077
b4291290 4078 U8 *pat = (U8*)STRING(text_node);
a6715020 4079 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
c74f6de9 4080
a4525e78 4081 if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
79a2a0e8
KW
4082
4083 /* In an exact node, only one thing can be matched, that first
4084 * character. If both the pat and the target are UTF-8, we can just
4085 * copy the input to the output, avoiding finding the code point of
4086 * that character */
984e6dd1 4087 if (!is_utf8_pat) {
79a2a0e8
KW
4088 c2 = c1 = *pat;
4089 }
4090 else if (utf8_target) {
4091 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
4092 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
4093 utf8_has_been_setup = TRUE;
4094 }
4095 else {
4096 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
c74f6de9 4097 }
79a2a0e8 4098 }
31f05a37
KW
4099 else { /* an EXACTFish node */
4100 U8 *pat_end = pat + STR_LEN(text_node);
4101
4102 /* An EXACTFL node has at least some characters unfolded, because what
4103 * they match is not known until now. So, now is the time to fold
4104 * the first few of them, as many as are needed to determine 'c1' and
4105 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
4106 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
4107 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
4108 * need to fold as many characters as a single character can fold to,
4109 * so that later we can check if the first ones are such a multi-char
4110 * fold. But, in such a pattern only locale-problematic characters
4111 * aren't folded, so we can skip this completely if the first character
4112 * in the node isn't one of the tricky ones */
4113 if (OP(text_node) == EXACTFL) {
4114
4115 if (! is_utf8_pat) {
4116 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
4117 {
4118 folded[0] = folded[1] = 's';
4119 pat = folded;
4120 pat_end = folded + 2;
4121 }
4122 }
4123 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4124 U8 *s = pat;
4125 U8 *d = folded;
4126 int i;
4127
4128 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4129 if (isASCII(*s)) {
4130 *(d++) = (U8) toFOLD_LC(*s);
4131 s++;
4132 }
4133 else {
4134 STRLEN len;
567b353c 4135 _toFOLD_utf8_flags(s,
a1a5ec35 4136 pat_end,
567b353c
KW
4137 d,
4138 &len,
4139 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
31f05a37
KW
4140 d += len;
4141 s += UTF8SKIP(s);
4142 }
4143 }
4144
4145 pat = folded;
4146 pat_end = d;
4147 }
4148 }
4149
251b239f
KW
4150 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4151 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
baa60164
KW
4152 {
4153 /* Multi-character folds require more context to sort out. Also
4154 * PL_utf8_foldclosures used below doesn't handle them, so have to
4155 * be handled outside this routine */
4156 use_chrtest_void = TRUE;
4157 }
4158 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4159 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
e8ea8356 4160 if (c1 > 255) {
baa60164
KW
4161 /* Load the folds hash, if not already done */
4162 SV** listp;
4163 if (! PL_utf8_foldclosures) {
31667e6b 4164 _load_PL_utf8_foldclosures();
79a2a0e8 4165 }
79a2a0e8 4166
baa60164
KW
4167 /* The fold closures data structure is a hash with the keys
4168 * being the UTF-8 of every character that is folded to, like
4169 * 'k', and the values each an array of all code points that
4170 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
4171 * Multi-character folds are not included */
4172 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4173 (char *) pat,
4174 UTF8SKIP(pat),
4175 FALSE))))
4176 {
4177 /* Not found in the hash, therefore there are no folds
4178 * containing it, so there is only a single character that
4179 * could match */
4180 c2 = c1;
79a2a0e8 4181 }
baa60164
KW
4182 else { /* Does participate in folds */
4183 AV* list = (AV*) *listp;
9506e945 4184 if (av_tindex_skip_len_mg(list) != 1) {
79a2a0e8 4185
baa60164
KW
4186 /* If there aren't exactly two folds to this, it is
4187 * outside the scope of this function */
4188 use_chrtest_void = TRUE;
79a2a0e8 4189 }
baa60164
KW
4190 else { /* There are two. Get them */
4191 SV** c_p = av_fetch(list, 0, FALSE);
4192 if (c_p == NULL) {
4193 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4194 }
4195 c1 = SvUV(*c_p);
4196
4197 c_p = av_fetch(list, 1, FALSE);
4198 if (c_p == NULL) {
4199 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4200 }
4201 c2 = SvUV(*c_p);
4202
4203 /* Folds that cross the 255/256 boundary are forbidden
4204 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4205 * one is ASCIII. Since the pattern character is above
e8ea8356 4206 * 255, and its only other match is below 256, the only
baa60164
KW
4207 * legal match will be to itself. We have thrown away
4208 * the original, so have to compute which is the one
e8ea8356 4209 * above 255. */
baa60164
KW
4210 if ((c1 < 256) != (c2 < 256)) {
4211 if ((OP(text_node) == EXACTFL
4212 && ! IN_UTF8_CTYPE_LOCALE)
4213 || ((OP(text_node) == EXACTFA
4214 || OP(text_node) == EXACTFA_NO_TRIE)
4215 && (isASCII(c1) || isASCII(c2))))
4216 {
4217 if (c1 < 256) {
4218 c1 = c2;
4219 }
4220 else {
4221 c2 = c1;
4222 }
79a2a0e8
KW
4223 }
4224 }
4225 }
4226 }
4227 }
e8ea8356 4228 else /* Here, c1 is <= 255 */
baa60164
KW
4229 if (utf8_target
4230 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4231 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4232 && ((OP(text_node) != EXACTFA
4233 && OP(text_node) != EXACTFA_NO_TRIE)
4234 || ! isASCII(c1)))
4235 {
4236 /* Here, there could be something above Latin1 in the target
4237 * which folds to this character in the pattern. All such
4238 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4239 * than two characters involved in their folds, so are outside
4240 * the scope of this function */
4241 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4242 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4243 }
4244 else {
4245 use_chrtest_void = TRUE;
4246 }
79a2a0e8 4247 }
baa60164
KW
4248 else { /* Here nothing above Latin1 can fold to the pattern
4249 character */
4250 switch (OP(text_node)) {
c74f6de9 4251
baa60164
KW
4252 case EXACTFL: /* /l rules */
4253 c2 = PL_fold_locale[c1];
4254 break;
c74f6de9 4255
baa60164
KW
4256 case EXACTF: /* This node only generated for non-utf8
4257 patterns */
4258 assert(! is_utf8_pat);
4259 if (! utf8_target) { /* /d rules */
4260 c2 = PL_fold[c1];
4261 break;
4262 }
4263 /* FALLTHROUGH */
4264 /* /u rules for all these. This happens to work for
4265 * EXACTFA as nothing in Latin1 folds to ASCII */
4266 case EXACTFA_NO_TRIE: /* This node only generated for
4267 non-utf8 patterns */
4268 assert(! is_utf8_pat);
924ba076 4269 /* FALLTHROUGH */
baa60164
KW
4270 case EXACTFA:
4271 case EXACTFU_SS:
4272 case EXACTFU:
4273 c2 = PL_fold_latin1[c1];
c74f6de9 4274 break;
c74f6de9 4275
baa60164
KW
4276 default:
4277 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
e5964223 4278 NOT_REACHED; /* NOTREACHED */
baa60164 4279 }
c74f6de9
KW
4280 }
4281 }
4282 }
79a2a0e8
KW
4283
4284 /* Here have figured things out. Set up the returns */
4285 if (use_chrtest_void) {
4286 *c2p = *c1p = CHRTEST_VOID;
4287 }
4288 else if (utf8_target) {
4289 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
4290 uvchr_to_utf8(c1_utf8, c1);
4291 uvchr_to_utf8(c2_utf8, c2);
c74f6de9 4292 }
c74f6de9 4293
79a2a0e8
KW
4294 /* Invariants are stored in both the utf8 and byte outputs; Use
4295 * negative numbers otherwise for the byte ones. Make sure that the
4296 * byte ones are the same iff the utf8 ones are the same */
4297 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4298 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4299 ? *c2_utf8
4300 : (c1 == c2)
4301 ? CHRTEST_NOT_A_CP_1
4302 : CHRTEST_NOT_A_CP_2;
4303 }
4304 else if (c1 > 255) {
4305 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
4306 can represent */
4307 return FALSE;
4308 }
c74f6de9 4309
79a2a0e8
KW
4310 *c1p = *c2p = c2; /* c2 is the only representable value */
4311 }
4312 else { /* c1 is representable; see about c2 */
4313 *c1p = c1;
4314 *c2p = (c2 < 256) ? c2 : c1;
c74f6de9 4315 }
2f554ef7 4316
c74f6de9
KW
4317 return TRUE;
4318}
2f554ef7 4319
b0e24409 4320STATIC bool
5cd8b225 4321S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target)
64935bc6
KW
4322{
4323 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
b0e24409
KW
4324 * between the inputs. See http://www.unicode.org/reports/tr29/. */
4325
4326 PERL_ARGS_ASSERT_ISGCB;
4327
4328 switch (GCB_table[before][after]) {
4329 case GCB_BREAKABLE:
4330 return TRUE;
4331
4332 case GCB_NOBREAK:
4333 return FALSE;
4334
4335 case GCB_RI_then_RI:
4336 {
4337 int RI_count = 1;
4338 U8 * temp_pos = (U8 *) curpos;
4339
4340 /* Do not break within emoji flag sequences. That is, do not
4341 * break between regional indicator (RI) symbols if there is an
4342 * odd number of RI characters before the break point.
4343 * GB12 ^ (RI RI)* RI × RI
4344 * GB13 [^RI] (RI RI)* RI × RI */
4345
4346 while (backup_one_GCB(strbeg,
4347 &temp_pos,
4348 utf8_target) == GCB_Regional_Indicator)
4349 {
4350 RI_count++;
4351 }
4352
4353 return RI_count % 2 != 1;
4354 }
4355
4356 case GCB_EX_then_EM:
4357
4358 /* GB10 ( E_Base | E_Base_GAZ ) Extend* × E_Modifier */
4359 {
4360 U8 * temp_pos = (U8 *) curpos;
4361 GCB_enum prev;
64935bc6 4362
b0e24409
KW
4363 do {
4364 prev = backup_one_GCB(strbeg, &temp_pos, utf8_target);
4365 }
4366 while (prev == GCB_Extend);
4367
4368 return prev != GCB_E_Base && prev != GCB_E_Base_GAZ;
4369 }
4370
4371 default:
4372 break;
4373 }
4374
4375#ifdef DEBUGGING
4376 Perl_re_printf( aTHX_ "Unhandled GCB pair: GCB_table[%d, %d] = %d\n",
4377 before, after, GCB_table[before][after]);
4378 assert(0);
4379#endif
4380 return TRUE;
4381}
4382
4383STATIC GCB_enum
4384S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4385{
4386 GCB_enum gcb;
4387
4388 PERL_ARGS_ASSERT_BACKUP_ONE_GCB;
4389
4390 if (*curpos < strbeg) {
4391 return GCB_EDGE;
4392 }
4393
4394 if (utf8_target) {
4395 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4396 U8 * prev_prev_char_pos;
4397
4398 if (! prev_char_pos) {
4399 return GCB_EDGE;
4400 }
4401
4402 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4403 gcb = getGCB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4404 *curpos = prev_char_pos;
4405 prev_char_pos = prev_prev_char_pos;
4406 }
4407 else {
4408 *curpos = (U8 *) strbeg;
4409 return GCB_EDGE;
4410 }
4411 }
4412 else {
4413 if (*curpos - 2 < strbeg) {
4414 *curpos = (U8 *) strbeg;
4415 return GCB_EDGE;
4416 }
4417 (*curpos)--;
4418 gcb = getGCB_VAL_CP(*(*curpos - 1));
4419 }
4420
4421 return gcb;
64935bc6
KW
4422}
4423
6b659339
KW
4424/* Combining marks attach to most classes that precede them, but this defines
4425 * the exceptions (from TR14) */
4426#define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
4427 || prev == LB_Mandatory_Break \
4428 || prev == LB_Carriage_Return \
4429 || prev == LB_Line_Feed \
4430 || prev == LB_Next_Line \
4431 || prev == LB_Space \
4432 || prev == LB_ZWSpace))
4433
4434STATIC bool
4435S_isLB(pTHX_ LB_enum before,
4436 LB_enum after,
4437 const U8 * const strbeg,
4438 const U8 * const curpos,
4439 const U8 * const strend,
4440 const bool utf8_target)
4441{
4442 U8 * temp_pos = (U8 *) curpos;
4443 LB_enum prev = before;
4444
4445 /* Is the boundary between 'before' and 'after' line-breakable?
4446 * Most of this is just a table lookup of a generated table from Unicode
4447 * rules. But some rules require context to decide, and so have to be
4448 * implemented in code */
4449
4450 PERL_ARGS_ASSERT_ISLB;
4451
b0e24409 4452 /* Rule numbers in the comments below are as of Unicode 9.0 */
6b659339
KW
4453
4454 redo:
4455 before = prev;
4456 switch (LB_table[before][after]) {
4457 case LB_BREAKABLE:
4458 return TRUE;
4459
4460 case LB_NOBREAK:
4461 case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4462 return FALSE;
4463
4464 case LB_SP_foo + LB_BREAKABLE:
4465 case LB_SP_foo + LB_NOBREAK:
4466 case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
4467
4468 /* When we have something following a SP, we have to look at the
4469 * context in order to know what to do.
4470 *
4471 * SP SP should not reach here because LB7: Do not break before
4472 * spaces. (For two spaces in a row there is nothing that
4473 * overrides that) */
4474 assert(after != LB_Space);
4475
4476 /* Here we have a space followed by a non-space. Mostly this is a
4477 * case of LB18: "Break after spaces". But there are complications
4478 * as the handling of spaces is somewhat tricky. They are in a
4479 * number of rules, which have to be applied in priority order, but
4480 * something earlier in the string can cause a rule to be skipped
4481 * and a lower priority rule invoked. A prime example is LB7 which
4482 * says don't break before a space. But rule LB8 (lower priority)
4483 * says that the first break opportunity after a ZW is after any
4484 * span of spaces immediately after it. If a ZW comes before a SP
4485 * in the input, rule LB8 applies, and not LB7. Other such rules
4486 * involve combining marks which are rules 9 and 10, but they may
4487 * override higher priority rules if they come earlier in the
4488 * string. Since we're doing random access into the middle of the
4489 * string, we have to look for rules that should get applied based
4490 * on both string position and priority. Combining marks do not
4491 * attach to either ZW nor SP, so we don't have to consider them
4492 * until later.
4493 *
4494 * To check for LB8, we have to find the first non-space character
4495 * before this span of spaces */
4496 do {
4497 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4498 }
4499 while (prev == LB_Space);
4500
4501 /* LB8 Break before any character following a zero-width space,
4502 * even if one or more spaces intervene.
4503 * ZW SP* ÷
4504 * So if we have a ZW just before this span, and to get here this
4505 * is the final space in the span. */
4506 if (prev == LB_ZWSpace) {
4507 return TRUE;
4508 }
4509
4510 /* Here, not ZW SP+. There are several rules that have higher
4511 * priority than LB18 and can be resolved now, as they don't depend
4512 * on anything earlier in the string (except ZW, which we have
4513 * already handled). One of these rules is LB11 Do not break
4514 * before Word joiner, but we have specially encoded that in the
4515 * lookup table so it is caught by the single test below which
4516 * catches the other ones. */
4517 if (LB_table[LB_Space][after] - LB_SP_foo
4518 == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
4519 {
4520 return FALSE;
4521 }
4522
4523 /* If we get here, we have to XXX consider combining marks. */
4524 if (prev == LB_Combining_Mark) {
4525
4526 /* What happens with these depends on the character they
4527 * follow. */
4528 do {
4529 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4530 }
4531 while (prev == LB_Combining_Mark);
4532
4533 /* Most times these attach to and inherit the characteristics
4534 * of that character, but not always, and when not, they are to
4535 * be treated as AL by rule LB10. */
4536 if (! LB_CM_ATTACHES_TO(prev)) {
4537 prev = LB_Alphabetic;
4538 }
4539 }
4540
4541 /* Here, we have the character preceding the span of spaces all set
4542 * up. We follow LB18: "Break after spaces" unless the table shows
4543 * that is overriden */
4544 return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
4545
b0e24409 4546 case LB_CM_ZWJ_foo:
6b659339
KW
4547
4548 /* We don't know how to treat the CM except by looking at the first
b0e24409 4549 * non-CM character preceding it. ZWJ is treated as CM */
6b659339
KW
4550 do {
4551 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
4552 }
b0e24409 4553 while (prev == LB_Combining_Mark || prev == LB_ZWJ);
6b659339
KW
4554
4555 /* Here, 'prev' is that first earlier non-CM character. If the CM
4556 * attatches to it, then it inherits the behavior of 'prev'. If it
4557 * doesn't attach, it is to be treated as an AL */
4558 if (! LB_CM_ATTACHES_TO(prev)) {
4559 prev = LB_Alphabetic;
4560 }
4561
4562 goto redo;
4563
4564 case LB_HY_or_BA_then_foo + LB_BREAKABLE:
4565 case LB_HY_or_BA_then_foo + LB_NOBREAK:
4566
4567 /* LB21a Don't break after Hebrew + Hyphen.
4568 * HL (HY | BA) × */
4569
4570 if (backup_one_LB(strbeg, &temp_pos, utf8_target)
4571 == LB_Hebrew_Letter)
4572 {
4573 return FALSE;
4574 }
4575
4576 return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
4577
4578 case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
4579 case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
4580
4581 /* LB25a (PR | PO) × ( OP | HY )? NU */
4582 if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
4583 return FALSE;
4584 }
4585
4586 return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
4587 == LB_BREAKABLE;
4588
4589 case LB_SY_or_IS_then_various + LB_BREAKABLE:
4590 case LB_SY_or_IS_then_various + LB_NOBREAK:
4591 {
4592 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
4593
4594 LB_enum temp = prev;
4595 do {
4596 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4597 }
4598 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
4599 if (temp == LB_Numeric) {
4600 return FALSE;
4601 }
4602
4603 return LB_table[prev][after] - LB_SY_or_IS_then_various
4604 == LB_BREAKABLE;
4605 }
4606
4607 case LB_various_then_PO_or_PR + LB_BREAKABLE:
4608 case LB_various_then_PO_or_PR + LB_NOBREAK:
4609 {
4610 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
4611
4612 LB_enum temp = prev;
4613 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
4614 {
4615 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4616 }
4617 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
4618 temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
4619 }
4620 if (temp == LB_Numeric) {
4621 return FALSE;
4622 }
4623 return LB_various_then_PO_or_PR;
4624 }
4625
b0e24409
KW
4626 case LB_RI_then_RI + LB_NOBREAK:
4627 case LB_RI_then_RI + LB_BREAKABLE:
4628 {
4629 int RI_count = 1;
4630
4631 /* LB30a Break between two regional indicator symbols if and
4632 * only if there are an even number of regional indicators
4633 * preceding the position of the break.
4634 *
4635 * sot (RI RI)* RI × RI
4636 * [^RI] (RI RI)* RI × RI */
4637
4638 while (backup_one_LB(strbeg,
4639 &temp_pos,
4640 utf8_target) == LB_Regional_Indicator)
4641 {
4642 RI_count++;
4643 }
4644
4645 return RI_count % 2 == 0;
4646 }
4647
6b659339
KW
4648 default:
4649 break;
4650 }
4651
4652#ifdef DEBUGGING
6ad9a8ab 4653 Perl_re_printf( aTHX_ "Unhandled LB pair: LB_table[%d, %d] = %d\n",
6b659339
KW
4654 before, after, LB_table[before][after]);
4655 assert(0);
4656#endif
4657 return TRUE;
4658}
4659
4660STATIC LB_enum
4661S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4662{
4663 LB_enum lb;
4664
4665 PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
4666
4667 if (*curpos >= strend) {
4668 return LB_EDGE;
4669 }
4670
4671 if (utf8_target) {
4672 *curpos += UTF8SKIP(*curpos);
4673 if (*curpos >= strend) {
4674 return LB_EDGE;
4675 }
4676 lb = getLB_VAL_UTF8(*curpos, strend);
4677 }
4678 else {
4679 (*curpos)++;
4680 if (*curpos >= strend) {
4681 return LB_EDGE;
4682 }
4683 lb = getLB_VAL_CP(**curpos);
4684 }
4685
4686 return lb;
4687}
4688
4689STATIC LB_enum
4690S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4691{
4692 LB_enum lb;
4693
4694 PERL_ARGS_ASSERT_BACKUP_ONE_LB;
4695
4696 if (*curpos < strbeg) {
4697 return LB_EDGE;
4698 }
4699
4700 if (utf8_target) {
4701 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4702 U8 * prev_prev_char_pos;
4703
4704 if (! prev_char_pos) {
4705 return LB_EDGE;
4706 }
4707
4708 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
4709 lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4710 *curpos = prev_char_pos;
4711 prev_char_pos = prev_prev_char_pos;
4712 }
4713 else {
4714 *curpos = (U8 *) strbeg;
4715 return LB_EDGE;
4716 }
4717 }
4718 else {
4719 if (*curpos - 2 < strbeg) {
4720 *curpos = (U8 *) strbeg;
4721 return LB_EDGE;
4722 }
4723 (*curpos)--;
4724 lb = getLB_VAL_CP(*(*curpos - 1));
4725 }
4726
4727 return lb;
4728}
4729
06ae2722 4730STATIC bool
85e5f08b
KW
4731S_isSB(pTHX_ SB_enum before,
4732 SB_enum after,
06ae2722
KW
4733 const U8 * const strbeg,
4734 const U8 * const curpos,
4735 const U8 * const strend,
4736 const bool utf8_target)
4737{
4738 /* returns a boolean indicating if there is a Sentence Boundary Break
4739 * between the inputs. See http://www.unicode.org/reports/tr29/ */
4740
4741 U8 * lpos = (U8 *) curpos;
943e3497
KW
4742 bool has_para_sep = FALSE;
4743 bool has_sp = FALSE;
06ae2722
KW
4744
4745 PERL_ARGS_ASSERT_ISSB;
4746
4747 /* Break at the start and end of text.
4748 SB1. sot ÷
943e3497
KW
4749 SB2. ÷ eot
4750 But unstated in Unicode is don't break if the text is empty */
85e5f08b 4751 if (before == SB_EDGE || after == SB_EDGE) {
943e3497 4752 return before != after;
06ae2722
KW
4753 }
4754
4755 /* SB 3: Do not break within CRLF. */
85e5f08b 4756 if (before == SB_CR && after == SB_LF) {
06ae2722
KW
4757 return FALSE;
4758 }
4759
7cc3f64a
KW
4760 /* Break after paragraph separators. CR and LF are considered
4761 * so because Unicode views text as like word processing text where there
4762 * are no newlines except between paragraphs, and the word processor takes
4763 * care of wrapping without there being hard line-breaks in the text *./
06ae2722 4764 SB4. Sep | CR | LF ÷ */
85e5f08b 4765 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
06ae2722
KW
4766 return TRUE;
4767 }
4768
4769 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4770 * (See Section 6.2, Replacing Ignore Rules.)
4771 SB5. X (Extend | Format)* → X */
85e5f08b 4772 if (after == SB_Extend || after == SB_Format) {
943e3497
KW
4773
4774 /* Implied is that the these characters attach to everything
4775 * immediately prior to them except for those separator-type
4776 * characters. And the rules earlier have already handled the case
4777 * when one of those immediately precedes the extend char */
06ae2722
KW
4778 return FALSE;
4779 }
4780
85e5f08b 4781 if (before == SB_Extend || before == SB_Format) {
943e3497
KW
4782 U8 * temp_pos = lpos;
4783 const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4784 if ( backup != SB_EDGE
4785 && backup != SB_Sep
4786 && backup != SB_CR
4787 && backup != SB_LF)
4788 {
4789 before = backup;
4790 lpos = temp_pos;
4791 }
4792
4793 /* Here, both 'before' and 'backup' are these types; implied is that we
4794 * don't break between them */
4795 if (backup == SB_Extend || backup == SB_Format) {
4796 return FALSE;
4797 }
06ae2722
KW
4798 }
4799
4800 /* Do not break after ambiguous terminators like period, if they are
4801 * immediately followed by a number or lowercase letter, if they are
4802 * between uppercase letters, if the first following letter (optionally
4803 * after certain punctuation) is lowercase, or if they are followed by
4804 * "continuation" punctuation such as comma, colon, or semicolon. For
4805 * example, a period may be an abbreviation or numeric period, and thus may
4806 * not mark the end of a sentence.
4807
4808 * SB6. ATerm × Numeric */
85e5f08b 4809 if (before == SB_ATerm && after == SB_Numeric) {
06ae2722
KW
4810 return FALSE;
4811 }
4812
d4005659 4813 /* SB7. (Upper | Lower) ATerm × Upper */
85e5f08b 4814 if (before == SB_ATerm && after == SB_Upper) {
943e3497
KW
4815 U8 * temp_pos = lpos;
4816 SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
d4005659 4817 if (backup == SB_Upper || backup == SB_Lower) {
06ae2722
KW
4818 return FALSE;
4819 }
4820 }
4821
943e3497
KW
4822 /* The remaining rules that aren't the final one, all require an STerm or
4823 * an ATerm after having backed up over some Close* Sp*, and in one case an
4824 * optional Paragraph separator, although one rule doesn't have any Sp's in it.
4825 * So do that backup now, setting flags if either Sp or a paragraph
4826 * separator are found */
4827
4828 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4829 has_para_sep = TRUE;
4830 before = backup_one_SB(strbeg, &lpos, utf8_target);
06ae2722
KW
4831 }
4832
943e3497
KW
4833 if (before == SB_Sp) {
4834 has_sp = TRUE;
4835 do {
4836 before = backup_one_SB(strbeg, &lpos, utf8_target);
06ae2722 4837 }
943e3497 4838 while (before == SB_Sp);
06ae2722
KW
4839 }
4840
943e3497
KW
4841 while (before == SB_Close) {
4842 before = backup_one_SB(strbeg, &lpos, utf8_target);
06ae2722
KW
4843 }
4844
943e3497
KW
4845 /* The next few rules apply only when the backed-up-to is an ATerm, and in
4846 * most cases an STerm */
4847 if (before == SB_STerm || before == SB_ATerm) {
06ae2722 4848
943e3497
KW
4849 /* So, here the lhs matches
4850 * (STerm | ATerm) Close* Sp* (Sep | CR | LF)?
4851 * and we have set flags if we found an Sp, or the optional Sep,CR,LF.
4852 * The rules that apply here are:
4853 *
4854 * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR
4855 | LF | STerm | ATerm) )* Lower
4856 SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
4857 SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF)
4858 SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF)
4859 SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷
4860 */
4861
4862 /* And all but SB11 forbid having seen a paragraph separator */
4863 if (! has_para_sep) {
4864 if (before == SB_ATerm) { /* SB8 */
4865 U8 * rpos = (U8 *) curpos;
4866 SB_enum later = after;
4867
4868 while ( later != SB_OLetter
4869 && later != SB_Upper
4870 && later != SB_Lower
4871 && later != SB_Sep
4872 && later != SB_CR
4873 && later != SB_LF
4874 && later != SB_STerm
4875 && later != SB_ATerm
4876 && later != SB_EDGE)
4877 {
4878 later = advance_one_SB(&rpos, strend, utf8_target);
4879 }
4880 if (later == SB_Lower) {
4881 return FALSE;
4882 }
4883 }
4884
4885 if ( after == SB_SContinue /* SB8a */
4886 || after == SB_STerm
4887 || after == SB_ATerm)
4888 {
4889 return FALSE;
4890 }
4891
4892 if (! has_sp) { /* SB9 applies only if there was no Sp* */
4893 if ( after == SB_Close
4894 || after == SB_Sp
4895 || after == SB_Sep
4896 || after == SB_CR
4897 || after == SB_LF)
4898 {
4899 return FALSE;
4900 }
4901 }
4902
4903 /* SB10. This and SB9 could probably be combined some way, but khw
4904 * has decided to follow the Unicode rule book precisely for
4905 * simplified maintenance */
4906 if ( after == SB_Sp
4907 || after == SB_Sep
4908 || after == SB_CR
4909 || after == SB_LF)
4910 {
4911 return FALSE;
4912 }
4913 }
4914
4915 /* SB11. */
06ae2722
KW
4916 return TRUE;
4917 }
4918
4919 /* Otherwise, do not break.
4920 SB12. Any × Any */
4921
4922 return FALSE;
4923}
4924
85e5f08b 4925STATIC SB_enum
06ae2722
KW
4926S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4927{
85e5f08b 4928 SB_enum sb;
06ae2722
KW
4929
4930 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4931
4932 if (*curpos >= strend) {
85e5f08b 4933 return SB_EDGE;
06ae2722
KW
4934 }
4935
4936 if (utf8_target) {
4937 do {
4938 *curpos += UTF8SKIP(*curpos);
4939 if (*curpos >= strend) {
85e5f08b 4940 return SB_EDGE;
06ae2722
KW
4941 }
4942 sb = getSB_VAL_UTF8(*curpos, strend);
85e5f08b 4943 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
4944 }
4945 else {
4946 do {
4947 (*curpos)++;
4948 if (*curpos >= strend) {
85e5f08b 4949 return SB_EDGE;
06ae2722
KW
4950 }
4951 sb = getSB_VAL_CP(**curpos);
85e5f08b 4952 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
4953 }
4954
4955 return sb;
4956}
4957
85e5f08b 4958STATIC SB_enum
06ae2722
KW
4959S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4960{
85e5f08b 4961 SB_enum sb;
06ae2722
KW
4962
4963 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4964
4965 if (*curpos < strbeg) {
85e5f08b 4966 return SB_EDGE;
06ae2722
KW
4967 }
4968
4969 if (utf8_target) {
4970 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4971 if (! prev_char_pos) {
85e5f08b 4972 return SB_EDGE;
06ae2722
KW
4973 }
4974
4975 /* Back up over Extend and Format. curpos is always just to the right
4976 * of the characater whose value we are getting */
4977 do {
4978 U8 * prev_prev_char_pos;
4979 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4980 strbeg)))
4981 {
4982 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4983 *curpos = prev_char_pos;
4984 prev_char_pos = prev_prev_char_pos;
4985 }
4986 else {
4987 *curpos = (U8 *) strbeg;
85e5f08b 4988 return SB_EDGE;
06ae2722 4989 }
85e5f08b 4990 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
4991 }
4992 else {
4993 do {
4994 if (*curpos - 2 < strbeg) {
4995 *curpos = (U8 *) strbeg;
85e5f08b 4996 return SB_EDGE;
06ae2722
KW
4997 }
4998 (*curpos)--;
4999 sb = getSB_VAL_CP(*(*curpos - 1));
85e5f08b 5000 } while (sb == SB_Extend || sb == SB_Format);
06ae2722
KW
5001 }
5002
5003 return sb;
5004}
5005
ae3bb8ea 5006STATIC bool
85e5f08b
KW
5007S_isWB(pTHX_ WB_enum previous,
5008 WB_enum before,
5009 WB_enum after,
ae3bb8ea
KW
5010 const U8 * const strbeg,
5011 const U8 * const curpos,
5012 const U8 * const strend,
5013 const bool utf8_target)
5014{
5015 /* Return a boolean as to if the boundary between 'before' and 'after' is
f1f6961f
KW
5016 * a Unicode word break, using their published algorithm, but tailored for
5017 * Perl by treating spans of white space as one unit. Context may be
ae3bb8ea
KW
5018 * needed to make this determination. If the value for the character
5019 * before 'before' is known, it is passed as 'previous'; otherwise that
85e5f08b 5020 * should be set to WB_UNKNOWN. The other input parameters give the
ae3bb8ea
KW
5021 * boundaries and current position in the matching of the string. That
5022 * is, 'curpos' marks the position where the character whose wb value is
5023 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
5024
5025 U8 * before_pos = (U8 *) curpos;
5026 U8 * after_pos = (U8 *) curpos;
7e54b87f
KW
5027 WB_enum prev = before;
5028 WB_enum next;
ae3bb8ea
KW
5029
5030 PERL_ARGS_ASSERT_ISWB;
5031
b0e24409 5032 /* Rule numbers in the comments below are as of Unicode 9.0 */
ae3bb8ea 5033
7e54b87f
KW
5034 redo:
5035 before = prev;
5036 switch (WB_table[before][after]) {
5037 case WB_BREAKABLE:
5038 return TRUE;
f1f6961f 5039
7e54b87f 5040 case WB_NOBREAK:
f1f6961f 5041 return FALSE;
ae3bb8ea 5042
7e54b87f
KW
5043 case WB_hs_then_hs: /* 2 horizontal spaces in a row */
5044 next = advance_one_WB(&after_pos, strend, utf8_target,
f1f6961f 5045 FALSE /* Don't skip Extend nor Format */ );
7e54b87f
KW
5046 /* A space immediately preceeding an Extend or Format is attached
5047 * to by them, and hence gets separated from previous spaces.
5048 * Otherwise don't break between horizontal white space */
f1f6961f 5049 return next == WB_Extend || next == WB_Format;
f1f6961f 5050
7e54b87f
KW
5051 /* WB4 Ignore Format and Extend characters, except when they appear at
5052 * the beginning of a region of text. This code currently isn't
5053 * general purpose, but it works as the rules are currently and likely
5054 * to be laid out. The reason it works is that when 'they appear at
5055 * the beginning of a region of text', the rule is to break before
5056 * them, just like any other character. Therefore, the default rule
5057 * applies and we don't have to look in more depth. Should this ever
b0e24409
KW
5058 * change, we would have to have 2 'case' statements, like in the rules
5059 * below, and backup a single character (not spacing over the extend
5060 * ones) and then see if that is one of the region-end characters and
5061 * go from there */
5062 case WB_Ex_or_FO_or_ZWJ_then_foo:
7e54b87f
KW
5063 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5064 goto redo;
f1f6961f 5065
7e54b87f
KW
5066 case WB_DQ_then_HL + WB_BREAKABLE:
5067 case WB_DQ_then_HL + WB_NOBREAK:
ae3bb8ea 5068
7e54b87f 5069 /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */
ae3bb8ea 5070
7e54b87f
KW
5071 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5072 == WB_Hebrew_Letter)
5073 {
5074 return FALSE;
5075 }
ae3bb8ea 5076
7e54b87f 5077 return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE;
ae3bb8ea 5078
7e54b87f
KW
5079 case WB_HL_then_DQ + WB_BREAKABLE:
5080 case WB_HL_then_DQ + WB_NOBREAK:
ae3bb8ea 5081
7e54b87f
KW
5082 /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */
5083
5084 if (advance_one_WB(&after_pos, strend, utf8_target,
5085 TRUE /* Do skip Extend and Format */ )
5086 == WB_Hebrew_Letter)
5087 {
ae3bb8ea 5088 return FALSE;
7e54b87f
KW
5089 }
5090
5091 return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE;
ae3bb8ea 5092
7e54b87f
KW
5093 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK:
5094 case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE:
5095
5096 /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet
5097 * | Single_Quote) (ALetter | Hebrew_Letter) */
5098
5099 next = advance_one_WB(&after_pos, strend, utf8_target,
4370fbb5 5100 TRUE /* Do skip Extend and Format */ );
7e54b87f
KW
5101
5102 if (next == WB_ALetter || next == WB_Hebrew_Letter)
5103 {
ae3bb8ea 5104 return FALSE;
7e54b87f 5105 }
ae3bb8ea 5106
7e54b87f
KW
5107 return WB_table[before][after]
5108 - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE;
ae3bb8ea 5109
7e54b87f
KW
5110 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK:
5111 case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE:
ae3bb8ea 5112
7e54b87f
KW
5113 /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet
5114 * | Single_Quote) × (ALetter | Hebrew_Letter) */
ae3bb8ea 5115
7e54b87f
KW
5116 prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
5117 if (prev == WB_ALetter || prev == WB_Hebrew_Letter)
5118 {
ae3bb8ea 5119 return FALSE;
7e54b87f 5120 }
ae3bb8ea 5121
7e54b87f
KW
5122 return WB_table[before][after]
5123 - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE;
ae3bb8ea 5124
7e54b87f
KW
5125 case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK:
5126 case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE:
ae3bb8ea 5127
7e54b87f
KW
5128 /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric
5129 * */
ae3bb8ea 5130
7e54b87f
KW
5131 if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
5132 == WB_Numeric)
5133 {
ae3bb8ea 5134 return FALSE;
7e54b87f 5135 }
ae3bb8ea 5136
7e54b87f
KW
5137 return WB_table[before][after]
5138 - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE;
5139
5140 case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK:
5141 case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE:
5142
5143 /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */
5144
5145 if (advance_one_WB(&after_pos, strend, utf8_target,
5146 TRUE /* Do skip Extend and Format */ )
5147 == WB_Numeric)
5148 {
ae3bb8ea 5149 return FALSE;
7e54b87f
KW
5150 }
5151
5152 return WB_table[before][after]
5153 - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE;
ae3bb8ea 5154
b0e24409
KW
5155 case WB_RI_then_RI + WB_NOBREAK:
5156 case WB_RI_then_RI + WB_BREAKABLE:
5157 {
5158 int RI_count = 1;
5159
5160 /* Do not break within emoji flag sequences. That is, do not
5161 * break between regional indicator (RI) symbols if there is an
5162 * odd number of RI characters before the potential break
5163 * point.
5164 *
5165 * WB15 ^ (RI RI)* RI × RI
5166 * WB16 [^RI] (RI RI)* RI × RI */
5167
5168 while (backup_one_WB(&previous,
5169 strbeg,
5170 &before_pos,
5171 utf8_target) == WB_Regional_Indicator)
5172 {
5173 RI_count++;
5174 }
5175
5176 return RI_count % 2 != 1;
5177 }
5178
7e54b87f
KW
5179 default:
5180 break;
ae3bb8ea
KW
5181 }
5182
7e54b87f 5183#ifdef DEBUGGING
6ad9a8ab 5184 Perl_re_printf( aTHX_ "Unhandled WB pair: WB_table[%d, %d] = %d\n",
7e54b87f
KW
5185 before, after, WB_table[before][after]);
5186 assert(0);
5187#endif
5188 return TRUE;
ae3bb8ea
KW
5189}
5190
85e5f08b 5191STATIC WB_enum
4370fbb5
KW
5192S_advance_one_WB(pTHX_ U8 ** curpos,
5193 const U8 * const strend,
5194 const bool utf8_target,
5195 const bool skip_Extend_Format)
ae3bb8ea 5196{
85e5f08b 5197 WB_enum wb;
ae3bb8ea
KW
5198
5199 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
5200
5201 if (*curpos >= strend) {
85e5f08b 5202 return WB_EDGE;
ae3bb8ea
KW
5203 }
5204
5205 if (utf8_target) {
5206
5207 /* Advance over Extend and Format */
5208 do {
5209 *curpos += UTF8SKIP(*curpos);
5210 if (*curpos >= strend) {
85e5f08b 5211 return WB_EDGE;
ae3bb8ea
KW
5212 }
5213 wb = getWB_VAL_UTF8(*curpos, strend);
4370fbb5
KW
5214 } while ( skip_Extend_Format
5215 && (wb == WB_Extend || wb == WB_Format));
ae3bb8ea
KW
5216 }
5217 else {
5218 do {
5219 (*curpos)++;
5220 if (*curpos >= strend) {
85e5f08b 5221 return WB_EDGE;
ae3bb8ea
KW
5222 }
5223 wb = getWB_VAL_CP(**curpos);
4370fbb5
KW
5224 } while ( skip_Extend_Format
5225 && (wb == WB_Extend || wb == WB_Format));
ae3bb8ea
KW
5226 }
5227
5228 return wb;
5229}
5230
85e5f08b
KW
5231STATIC WB_enum
5232S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
ae3bb8ea 5233{
85e5f08b 5234 WB_enum wb;
ae3bb8ea
KW
5235
5236 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
5237
5238 /* If we know what the previous character's break value is, don't have
5239 * to look it up */
85e5f08b 5240 if (*previous != WB_UNKNOWN) {
ae3bb8ea 5241 wb = *previous;
ae3bb8ea 5242
b4b2ec55
KW
5243 /* But we need to move backwards by one */
5244 if (utf8_target) {
5245 *curpos = reghopmaybe3(*curpos, -1, strbeg);
5246 if (! *curpos) {
5247 *previous = WB_EDGE;
5248 *curpos = (U8 *) strbeg;
5249 }
5250 else {
5251 *previous = WB_UNKNOWN;
5252 }
5253 }
5254 else {
5255 (*curpos)--;
5256 *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
5257 }
5258
b0e24409
KW
5259 /* And we always back up over these three types */
5260 if (wb != WB_Extend && wb != WB_Format && wb != WB_ZWJ) {
ae3bb8ea
KW
5261 return wb;
5262 }
5263 }
5264
5265 if (*curpos < strbeg) {
85e5f08b 5266 return WB_EDGE;
ae3bb8ea
KW
5267 }
5268
5269 if (utf8_target) {
5270 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
5271 if (! prev_char_pos) {
85e5f08b 5272 return WB_EDGE;
ae3bb8ea
KW
5273 }
5274
5275 /* Back up over Extend and Format. curpos is always just to the right
5276 * of the characater whose value we are getting */
5277 do {
5278 U8 * prev_prev_char_pos;
5279 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
5280 -1,
5281 strbeg)))
5282 {
5283 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
5284 *curpos = prev_char_pos;
5285 prev_char_pos = prev_prev_char_pos;
5286 }
5287 else {
5288 *curpos = (U8 *) strbeg;
85e5f08b 5289 return WB_EDGE;
ae3bb8ea 5290 }
b0e24409 5291 } while (wb == WB_Extend || wb == WB_Format || wb == WB_ZWJ);
ae3bb8ea
KW
5292 }
5293 else {
5294 do {
5295 if (*curpos - 2 < strbeg) {
5296 *curpos = (U8 *) strbeg;
85e5f08b 5297 return WB_EDGE;
ae3bb8ea
KW
5298 }
5299 (*curpos)--;
5300 wb = getWB_VAL_CP(*(*curpos - 1));
85e5f08b 5301 } while (wb == WB_Extend || wb == WB_Format);
ae3bb8ea
KW
5302 }
5303
5304 return wb;
5305}
5306
ce12e254
YO
5307#define EVAL_CLOSE_PAREN_IS(st,expr) \
5308( \
5309 ( ( st ) ) && \
5310 ( ( st )->u.eval.close_paren ) && \
5311 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5312)
5313
5314#define EVAL_CLOSE_PAREN_IS_TRUE(st,expr) \
5315( \
5316 ( ( st ) ) && \
5317 ( ( st )->u.eval.close_paren ) && \
5318 ( ( expr ) ) && \
5319 ( ( ( st )->u.eval.close_paren ) == ( (expr) + 1 ) ) \
5320)
5321
5322
5323#define EVAL_CLOSE_PAREN_SET(st,expr) \
5324 (st)->u.eval.close_paren = ( (expr) + 1 )
5325
5326#define EVAL_CLOSE_PAREN_CLEAR(st) \
5327 (st)->u.eval.close_paren = 0
5328
f73aaa43 5329/* returns -1 on failure, $+[0] on success */
99a90e59 5330STATIC SSize_t
f73aaa43 5331S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
d6a28714 5332{
27da23d5 5333 dVAR;
ba44c216 5334 const bool utf8_target = reginfo->is_utf8_target;
4ad0818d 5335 const U32 uniflags = UTF8_ALLOW_DEFAULT;
288b8c02 5336 REGEXP *rex_sv = reginfo->prog;
8d919b0a 5337 regexp *rex = ReANY(rex_sv);
f8fc2ecf 5338 RXi_GET_DECL(rex,rexi);
5d9a96ca 5339 /* the current state. This is a cached copy of PL_regmatch_state */
eb578fdb 5340 regmatch_state *st;
5d9a96ca 5341 /* cache heavy used fields of st in registers */
eb578fdb
KW
5342 regnode *scan;
5343 regnode *next;
5344 U32 n = 0; /* general value; init to avoid compiler warning */
8b2312d5 5345 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
2dfc11ec 5346 SSize_t endref = 0; /* offset of end of backref when ln is start */
d60de1d1 5347 char *locinput = startpos;
4d5016e5 5348 char *pushinput; /* where to continue after a PUSH */
0c6c7932 5349 I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */
24d3c4a9 5350
b69b0499 5351 bool result = 0; /* return value of S_regmatch */
21553840 5352 U32 depth = 0; /* depth of backtrack stack */
4b196cd4
YO
5353 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
5354 const U32 max_nochange_depth =
5355 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
5356 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
77cb431f
DM
5357 regmatch_state *yes_state = NULL; /* state to pop to on success of
5358 subpattern */
e2e6a0f1
YO
5359 /* mark_state piggy backs on the yes_state logic so that when we unwind
5360 the stack on success we can update the mark_state as we go */
5361 regmatch_state *mark_state = NULL; /* last mark state we have seen */
faec1544 5362 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
b8591aee 5363 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
40a82448 5364 U32 state_num;
5d458dd8
YO
5365 bool no_final = 0; /* prevent failure from backtracking? */
5366 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
d60de1d1 5367 char *startpoint = locinput;
5d458dd8
YO
5368 SV *popmark = NULL; /* are we looking for a mark? */
5369 SV *sv_commit = NULL; /* last mark name seen in failure */
5370 SV *sv_yes_mark = NULL; /* last mark name we have seen
486ec47a 5371 during a successful match */
5d458dd8
YO
5372 U32 lastopen = 0; /* last open we saw */
5373 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4a2b275c 5374 SV* const oreplsv = GvSVn(PL_replgv);
24d3c4a9
DM
5375 /* these three flags are set by various ops to signal information to
5376 * the very next op. They have a useful lifetime of exactly one loop
5377 * iteration, and are not preserved or restored by state pushes/pops
5378 */
5379 bool sw = 0; /* the condition value in (?(cond)a|b) */
5380 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
5381 int logical = 0; /* the following EVAL is:
5382 0: (?{...})
5383 1: (?(?{...})X|Y)
5384 2: (??{...})
5385 or the following IFMATCH/UNLESSM is:
5386 false: plain (?=foo)
5387 true: used as a condition: (?(?=foo))
5388 */
81ed78b2
DM
5389 PAD* last_pad = NULL;
5390 dMULTICALL;
1c23e2bd 5391 U8 gimme = G_SCALAR;
81ed78b2
DM
5392 CV *caller_cv = NULL; /* who called us */
5393 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
92da3157 5394 U32 maxopenparen = 0; /* max '(' index seen so far */
3018b823
KW
5395 int to_complement; /* Invert the result? */
5396 _char_class_number classnum;
984e6dd1 5397 bool is_utf8_pat = reginfo->is_utf8_pat;
64935bc6 5398 bool match = FALSE;
4b9c7cae 5399 I32 orig_savestack_ix = PL_savestack_ix;
64935bc6 5400
c5d7841e
DM
5401/* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */
5402#if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL))
5403# define SOLARIS_BAD_OPTIMIZER
5404 const U32 *pl_charclass_dup = PL_charclass;
5405# define PL_charclass pl_charclass_dup
5406#endif
5407
95b24440 5408#ifdef DEBUGGING
e68ec53f 5409 GET_RE_DEBUG_FLAGS_DECL;
d6a28714
JH
5410#endif
5411
7e68f152
FC
5412 /* protect against undef(*^R) */
5413 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
5414
81ed78b2
DM
5415 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
5416 multicall_oldcatch = 0;
4f8dbb2d 5417 PERL_UNUSED_VAR(multicall_cop);
81ed78b2 5418
7918f24d
NC
5419 PERL_ARGS_ASSERT_REGMATCH;
5420
331b2dcc 5421 st = PL_regmatch_state;
5d9a96ca 5422
d6a28714 5423 /* Note that nextchr is a byte even in UTF */
7016d6eb 5424 SET_nextchr;
d6a28714 5425 scan = prog;
2b1a3689
YO
5426
5427 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
5428 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5429 Perl_re_printf( aTHX_ "regmatch start\n" );
5430 }));
5431
d6a28714 5432 while (scan != NULL) {
8ba1375e 5433
d6a28714
JH
5434
5435 next = scan + NEXT_OFF(scan);
5436 if (next == scan)
5437 next = NULL;
40a82448 5438 state_num = OP(scan);
d6a28714 5439
40a82448 5440 reenter_switch:
cb41e5d6
YO
5441 DEBUG_EXECUTE_r(
5442 if (state_num <= REGNODE_MAX) {
5443 SV * const prop = sv_newmortal();
5444 regnode *rnext = regnext(scan);
5445
5446 DUMP_EXEC_POS( locinput, scan, utf8_target, depth );
5447 regprop(rex, prop, scan, reginfo, NULL);
6ad9a8ab 5448 Perl_re_printf( aTHX_
147e3846 5449 "%*s%" IVdf ":%s(%" IVdf ")\n",
cb41e5d6
YO
5450 INDENT_CHARS(depth), "",
5451 (IV)(scan - rexi->program),
5452 SvPVX_const(prop),
5453 (PL_regkind[OP(scan)] == END || !rnext) ?
5454 0 : (IV)(rnext - rexi->program));
5455 }
5456 );
5457
3018b823 5458 to_complement = 0;
34a81e2b 5459
7016d6eb 5460 SET_nextchr;
e6ca698c 5461 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
bf798dc4 5462
40a82448 5463 switch (state_num) {
d3d47aac 5464 case SBOL: /* /^../ and /\A../ */
9d9163fb 5465 if (locinput == reginfo->strbeg)
b8c5462f 5466 break;
d6a28714 5467 sayNO;
3c0563b9
DM
5468
5469 case MBOL: /* /^../m */
9d9163fb 5470 if (locinput == reginfo->strbeg ||
7016d6eb 5471 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
d6a28714 5472 {
b8c5462f
JH
5473 break;
5474 }
d6a28714 5475 sayNO;
3c0563b9 5476
3c0563b9 5477 case GPOS: /* \G */
3b0527fe 5478 if (locinput == reginfo->ganch)
d6a28714
JH
5479 break;
5480 sayNO;
ee9b8eae 5481
3c0563b9 5482 case KEEPS: /* \K */
ee9b8eae 5483 /* update the startpoint */
b93070ed 5484 st->u.keeper.val = rex->offs[0].start;
9d9163fb 5485 rex->offs[0].start = locinput - reginfo->strbeg;
4d5016e5 5486 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
661d43c4 5487 NOT_REACHED; /* NOTREACHED */
a74ff37d 5488
ee9b8eae
YO
5489 case KEEPS_next_fail:
5490 /* rollback the start point change */
b93070ed 5491 rex->offs[0].start = st->u.keeper.val;
ee9b8eae 5492 sayNO_SILENT;
661d43c4 5493 NOT_REACHED; /* NOTREACHED */
3c0563b9 5494
3c0563b9 5495 case MEOL: /* /..$/m */
7016d6eb 5496 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 5497 sayNO;
b8c5462f 5498 break;
3c0563b9 5499
d3d47aac 5500 case SEOL: /* /..$/ */
7016d6eb 5501 if (!NEXTCHR_IS_EOS && nextchr != '\n')
b8c5462f 5502 sayNO;
220db18a 5503 if (reginfo->strend - locinput > 1)
b8c5462f 5504 sayNO;
b8c5462f 5505 break;
3c0563b9
DM
5506
5507 case EOS: /* \z */
7016d6eb 5508 if (!NEXTCHR_IS_EOS)
b8c5462f 5509 sayNO;
d6a28714 5510 break;
3c0563b9
DM
5511
5512 case SANY: /* /./s */
7016d6eb 5513 if (NEXTCHR_IS_EOS)
4633a7c4 5514 sayNO;
28b98f76 5515 goto increment_locinput;
3c0563b9 5516
3c0563b9 5517 case REG_ANY: /* /./ */
7016d6eb 5518 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
1aa99e6b 5519 sayNO;
28b98f76
DM
5520 goto increment_locinput;
5521
166ba7cd
DM
5522
5523#undef ST
5524#define ST st->u.trie
3c0563b9 5525 case TRIEC: /* (ab|cd) with known charclass */
786e8c11
YO
5526 /* In this case the charclass data is available inline so
5527 we can fail fast without a lot of extra overhead.
5528 */
7016d6eb 5529 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
fab2782b 5530 DEBUG_EXECUTE_r(
6ad9a8ab 5531 Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
cb41e5d6 5532 depth, PL_colors[4], PL_colors[5])
fab2782b
YO
5533 );
5534 sayNO_SILENT;
661d43c4 5535 NOT_REACHED; /* NOTREACHED */
786e8c11 5536 }
924ba076 5537 /* FALLTHROUGH */
3c0563b9 5538 case TRIE: /* (ab|cd) */
2e64971a
DM
5539 /* the basic plan of execution of the trie is:
5540 * At the beginning, run though all the states, and
5541 * find the longest-matching word. Also remember the position
5542 * of the shortest matching word. For example, this pattern:
5543 * 1 2 3 4 5
5544 * ab|a|x|abcd|abc
5545 * when matched against the string "abcde", will generate
5546 * accept states for all words except 3, with the longest
895cc420 5547 * matching word being 4, and the shortest being 2 (with
2e64971a
DM
5548 * the position being after char 1 of the string).
5549 *
5550 * Then for each matching word, in word order (i.e. 1,2,4,5),
5551 * we run the remainder of the pattern; on each try setting
5552 * the current position to the character following the word,
5553 * returning to try the next word on failure.
5554 *
5555 * We avoid having to build a list of words at runtime by
5556 * using a compile-time structure, wordinfo[].prev, which
5557 * gives, for each word, the previous accepting word (if any).
5558 * In the case above it would contain the mappings 1->2, 2->0,
5559 * 3->0, 4->5, 5->1. We can use this table to generate, from
5560 * the longest word (4 above), a list of all words, by
5561 * following the list of prev pointers; this gives us the
5562 * unordered list 4,5,1,2. Then given the current word we have
5563 * just tried, we can go through the list and find the
5564 * next-biggest word to try (so if we just failed on word 2,
5565 * the next in the list is 4).
5566 *
5567 * Since at runtime we don't record the matching position in
5568 * the string for each word, we have to work that out for
5569 * each word we're about to process. The wordinfo table holds
5570 * the character length of each word; given that we recorded
5571 * at the start: the position of the shortest word and its
5572 * length in chars, we just need to move the pointer the
5573 * difference between the two char lengths. Depending on
5574 * Unicode status and folding, that's cheap or expensive.
5575 *
5576 * This algorithm is optimised for the case where are only a
5577 * small number of accept states, i.e. 0,1, or maybe 2.
5578 * With lots of accepts states, and having to try all of them,
5579 * it becomes quadratic on number of accept states to find all
5580 * the next words.
5581 */
5582
3dab1dad 5583 {
07be1b83 5584 /* what type of TRIE am I? (utf8 makes this contextual) */
a0a388a1 5585 DECL_TRIE_TYPE(scan);
3dab1dad
YO
5586
5587 /* what trie are we using right now */
be8e71aa 5588 reg_trie_data * const trie
f8fc2ecf 5589 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
85fbaab2 5590 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3dab1dad 5591 U32 state = trie->startstate;
166ba7cd 5592
780fcc9f
KW
5593 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5594 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
613abc6d 5595 if (utf8_target
2c2da8e7 5596 && nextchr >= 0 /* guard against negative EOS value in nextchr */
ca7eb79a 5597 && UTF8_IS_ABOVE_LATIN1(nextchr)
613abc6d
KW
5598 && scan->flags == EXACTL)
5599 {
5600 /* We only output for EXACTL, as we let the folder
5601 * output this message for EXACTFLU8 to avoid
5602 * duplication */
5603 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5604 reginfo->strend);
5605 }
780fcc9f 5606 }
7016d6eb
DM
5607 if ( trie->bitmap
5608 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5609 {
3dab1dad
YO
5610 if (trie->states[ state ].wordnum) {
5611 DEBUG_EXECUTE_r(
6ad9a8ab 5612 Perl_re_exec_indentf( aTHX_ "%smatched empty string...%s\n",
cb41e5d6 5613 depth, PL_colors[4], PL_colors[5])
3dab1dad 5614 );
20dbff7c
YO
5615 if (!trie->jump)
5616 break;
3dab1dad
YO
5617 } else {
5618 DEBUG_EXECUTE_r(
6ad9a8ab 5619 Perl_re_exec_indentf( aTHX_ "%sfailed to match trie start class...%s\n",
cb41e5d6 5620 depth, PL_colors[4], PL_colors[5])
3dab1dad
YO
5621 );
5622 sayNO_SILENT;
5623 }
5624 }
166ba7cd 5625
786e8c11
YO
5626 {
5627 U8 *uc = ( U8* )locinput;
5628
5629 STRLEN len = 0;
5630 STRLEN foldlen = 0;
5631 U8 *uscan = (U8*)NULL;
786e8c11 5632 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2e64971a
DM
5633 U32 charcount = 0; /* how many input chars we have matched */
5634 U32 accepted = 0; /* have we seen any accepting states? */
786e8c11 5635
786e8c11 5636 ST.jump = trie->jump;
786e8c11 5637 ST.me = scan;
2e64971a
DM
5638 ST.firstpos = NULL;
5639 ST.longfold = FALSE; /* char longer if folded => it's harder */
5640 ST.nextword = 0;
5641
5642 /* fully traverse the TRIE; note the position of the
5643 shortest accept state and the wordnum of the longest
5644 accept state */
07be1b83 5645
220db18a 5646 while ( state && uc <= (U8*)(reginfo->strend) ) {
786e8c11 5647 U32 base = trie->states[ state ].trans.base;
f9f4320a 5648 UV uvc = 0;
acb909b4 5649 U16 charid = 0;
2e64971a
DM
5650 U16 wordnum;
5651 wordnum = trie->states[ state ].wordnum;
5652
5653 if (wordnum) { /* it's an accept state */
5654 if (!accepted) {
5655 accepted = 1;
5656 /* record first match position */
5657 if (ST.longfold) {
5658 ST.firstpos = (U8*)locinput;
5659 ST.firstchars = 0;
5b47454d 5660 }
2e64971a
DM
5661 else {
5662 ST.firstpos = uc;
5663 ST.firstchars = charcount;
5664 }
5665 }
5666 if (!ST.nextword || wordnum < ST.nextword)
5667 ST.nextword = wordnum;
5668 ST.topword = wordnum;
786e8c11 5669 }
a3621e74 5670
07be1b83 5671 DEBUG_TRIE_EXECUTE_r({
cb41e5d6 5672 DUMP_EXEC_POS( (char *)uc, scan, utf8_target, depth );
2b1a3689 5673 /* HERE */
889458f1 5674 PerlIO_printf( Perl_debug_log,
147e3846 5675 "%*s%sState: %4" UVxf " Accepted: %c ",
2b1a3689 5676 INDENT_CHARS(depth), "", PL_colors[4],
2e64971a 5677 (UV)state, (accepted ? 'Y' : 'N'));
07be1b83 5678 });
a3621e74 5679
2e64971a 5680 /* read a char and goto next state */
220db18a 5681 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
6dd2be57 5682 I32 offset;
55eed653
NC
5683 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5684 uscan, len, uvc, charid, foldlen,
5685 foldbuf, uniflags);
2e64971a
DM
5686 charcount++;
5687 if (foldlen>0)
5688 ST.longfold = TRUE;
5b47454d 5689 if (charid &&
6dd2be57
DM
5690 ( ((offset =
5691 base + charid - 1 - trie->uniquecharcount)) >= 0)
5692
5693 && ((U32)offset < trie->lasttrans)
5694 && trie->trans[offset].check == state)
5b47454d 5695 {
6dd2be57 5696 state = trie->trans[offset].next;
5b47454d
DM
5697 }
5698 else {
5699 state = 0;
5700 }
5701 uc += len;
5702
5703 }
5704 else {
a3621e74
YO
5705 state = 0;
5706 }
5707 DEBUG_TRIE_EXECUTE_r(
6ad9a8ab 5708 Perl_re_printf( aTHX_
147e3846 5709 "Charid:%3x CP:%4" UVxf " After State: %4" UVxf "%s\n",
e4584336 5710 charid, uvc, (UV)state, PL_colors[5] );
a3621e74
YO
5711 );
5712 }
2e64971a 5713 if (!accepted)
a3621e74 5714 sayNO;
a3621e74 5715
2e64971a
DM
5716 /* calculate total number of accept states */
5717 {
5718 U16 w = ST.topword;
5719 accepted = 0;
5720 while (w) {
5721 w = trie->wordinfo[w].prev;
5722 accepted++;
5723 }
5724 ST.accepted = accepted;
5725 }
5726
166ba7cd 5727 DEBUG_EXECUTE_r(
147e3846 5728 Perl_re_exec_indentf( aTHX_ "%sgot %" IVdf " possible matches%s\n",
cb41e5d6 5729 depth,
166ba7cd
DM
5730 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5731 );
2e64971a 5732 goto trie_first_try; /* jump into the fail handler */
786e8c11 5733 }}
661d43c4 5734 NOT_REACHED; /* NOTREACHED */
2e64971a
DM
5735
5736 case TRIE_next_fail: /* we failed - try next alternative */
a059a757
DM
5737 {
5738 U8 *uc;
2c27f131 5739 if ( ST.jump ) {
cbb658a1
DM
5740 /* undo any captures done in the tail part of a branch,
5741 * e.g.
5742 * /(?:X(.)(.)|Y(.)).../
5743 * where the trie just matches X then calls out to do the
5744 * rest of the branch */
fae667d5 5745 REGCP_UNWIND(ST.cp);
a8d1f4b4 5746 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
fae667d5 5747 }
2e64971a
DM
5748 if (!--ST.accepted) {
5749 DEBUG_EXECUTE_r({
6ad9a8ab 5750 Perl_re_exec_indentf( aTHX_ "%sTRIE failed...%s\n",
cb41e5d6 5751 depth,
2e64971a
DM
5752 PL_colors[4],
5753 PL_colors[5] );
5754 });
5755 sayNO_SILENT;
5756 }
5757 {
5758 /* Find next-highest word to process. Note that this code
5759 * is O(N^2) per trie run (O(N) per branch), so keep tight */
eb578fdb
KW
5760 U16 min = 0;
5761 U16 word;
5762 U16 const nextword = ST.nextword;
5763 reg_trie_wordinfo * const wordinfo
2e64971a
DM
5764 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5765 for (word=ST.topword; word; word=wordinfo[word].prev) {
5766 if (word > nextword && (!min || word < min))
5767 min = word;
5768 }
5769 ST.nextword = min;
5770 }
5771
fae667d5 5772 trie_first_try:
5d458dd8
YO
5773 if (do_cutgroup) {
5774 do_cutgroup = 0;
5775 no_final = 0;
5776 }
fae667d5 5777
2c27f131 5778 if ( ST.jump ) {
b93070ed 5779 ST.lastparen = rex->lastparen;
f6033a9d 5780 ST.lastcloseparen = rex->lastcloseparen;
fae667d5 5781 REGCP_SET(ST.cp);
2e64971a 5782 }
a3621e74 5783
2e64971a 5784 /* find start char of end of current word */
166ba7cd 5785 {
2e64971a 5786 U32 chars; /* how many chars to skip */
2e64971a
DM
5787 reg_trie_data * const trie
5788 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5789
5790 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5791 >= ST.firstchars);
5792 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5793 - ST.firstchars;
a059a757 5794 uc = ST.firstpos;
2e64971a
DM
5795
5796 if (ST.longfold) {
5797 /* the hard option - fold each char in turn and find
5798 * its folded length (which may be different */
5799 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5800 STRLEN foldlen;
5801 STRLEN len;
d9a396a3 5802 UV uvc;
2e64971a
DM
5803 U8 *uscan;
5804
5805 while (chars) {
f2ed9b32 5806 if (utf8_target) {
c80e42f3 5807 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
2e64971a
DM
5808 uniflags);
5809 uc += len;
5810 }
5811 else {
5812 uvc = *uc;
5813 uc++;
5814 }
5815 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5816 uscan = foldbuf;
5817 while (foldlen) {
5818 if (!--chars)
5819 break;
c80e42f3 5820 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
2e64971a
DM
5821 uniflags);
5822 uscan += len;
5823 foldlen -= len;
5824 }
5825 }
a3621e74 5826 }
2e64971a 5827 else {
f2ed9b32 5828 if (utf8_target)
2e64971a
DM
5829 while (chars--)
5830 uc += UTF8SKIP(uc);
5831 else
5832 uc += chars;
5833 }
2e64971a 5834 }
166ba7cd 5835
6603fe3e
DM
5836 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5837 ? ST.jump[ST.nextword]
5838 : NEXT_OFF(ST.me));
166ba7cd 5839
2e64971a 5840 DEBUG_EXECUTE_r({
6ad9a8ab 5841 Perl_re_exec_indentf( aTHX_ "%sTRIE matched word #%d, continuing%s\n",
cb41e5d6 5842 depth,
2e64971a
DM
5843 PL_colors[4],
5844 ST.nextword,
5845 PL_colors[5]
5846 );
5847 });
5848
cfe04db5 5849 if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
a059a757 5850 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
661d43c4 5851 NOT_REACHED; /* NOTREACHED */
166ba7cd 5852 }
2e64971a
DM
5853 /* only one choice left - just continue */
5854 DEBUG_EXECUTE_r({
5855 AV *const trie_words
5856 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
d0bec203
HS
5857 SV ** const tmp = trie_words
5858 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
2e64971a
DM
5859 SV *sv= tmp ? sv_newmortal() : NULL;
5860
6ad9a8ab 5861 Perl_re_exec_indentf( aTHX_ "%sonly one match left, short-circuiting: #%d <%s>%s\n",
cb41e5d6 5862 depth, PL_colors[4],
2e64971a
DM
5863 ST.nextword,
5864 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5865 PL_colors[0], PL_colors[1],
c89df6cf 5866 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
2e64971a
DM
5867 )
5868 : "not compiled under -Dr",
5869 PL_colors[5] );
5870 });
5871
a059a757 5872 locinput = (char*)uc;
2e64971a 5873 continue; /* execute rest of RE */
a74ff37d 5874 /* NOTREACHED */
a059a757 5875 }
166ba7cd
DM
5876#undef ST
5877
a4525e78 5878 case EXACTL: /* /abc/l */
780fcc9f 5879 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
613abc6d
KW
5880
5881 /* Complete checking would involve going through every character
5882 * matched by the string to see if any is above latin1. But the
5883 * comparision otherwise might very well be a fast assembly
5884 * language routine, and I (khw) don't think slowing things down
5885 * just to check for this warning is worth it. So this just checks
5886 * the first character */
5887 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5888 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5889 }
780fcc9f 5890 /* FALLTHROUGH */
3c0563b9 5891 case EXACT: { /* /abc/ */
95b24440 5892 char *s = STRING(scan);
24d3c4a9 5893 ln = STR_LEN(scan);
984e6dd1 5894 if (utf8_target != is_utf8_pat) {
bc517b45 5895 /* The target and the pattern have differing utf8ness. */
1aa99e6b 5896 char *l = locinput;
24d3c4a9 5897 const char * const e = s + ln;
a72c7584 5898
f2ed9b32 5899 if (utf8_target) {
e6a3850e
KW
5900 /* The target is utf8, the pattern is not utf8.
5901 * Above-Latin1 code points can't match the pattern;
5902 * invariants match exactly, and the other Latin1 ones need
5903 * to be downgraded to a single byte in order to do the
5904 * comparison. (If we could be confident that the target
5905 * is not malformed, this could be refactored to have fewer
5906 * tests by just assuming that if the first bytes match, it
5907 * is an invariant, but there are tests in the test suite
5908 * dealing with (??{...}) which violate this) */
1aa99e6b 5909 while (s < e) {
220db18a
DM
5910 if (l >= reginfo->strend
5911 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5912 {
e6a3850e
KW
5913 sayNO;
5914 }
5915 if (UTF8_IS_INVARIANT(*(U8*)l)) {
5916 if (*l != *s) {
5917 sayNO;
5918 }
5919 l++;
5920 }
5921 else {
a62b247b 5922 if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
94bb8c36 5923 {
e6a3850e
KW
5924 sayNO;
5925 }
5926 l += 2;
5927 }
5928 s++;
1aa99e6b 5929 }
5ff6fc6d
JH
5930 }
5931 else {
5932 /* The target is not utf8, the pattern is utf8. */
1aa99e6b 5933 while (s < e) {
220db18a
DM
5934 if (l >= reginfo->strend
5935 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
e6a3850e
KW
5936 {
5937 sayNO;
5938 }
5939 if (UTF8_IS_INVARIANT(*(U8*)s)) {
5940 if (*s != *l) {
5941 sayNO;
5942 }
5943 s++;
5944 }
5945 else {
a62b247b 5946 if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
94bb8c36 5947 {
e6a3850e
KW
5948 sayNO;
5949 }
5950 s += 2;
5951 }
5952 l++;
1aa99e6b 5953 }
5ff6fc6d 5954 }
1aa99e6b 5955 locinput = l;
1aa99e6b 5956 }
5ac65bff
KW
5957 else {
5958 /* The target and the pattern have the same utf8ness. */
5959 /* Inline the first character, for speed. */
220db18a 5960 if (reginfo->strend - locinput < ln
5ac65bff
KW
5961 || UCHARAT(s) != nextchr
5962 || (ln > 1 && memNE(s, locinput, ln)))
5963 {
5964 sayNO;
5965 }
5966 locinput += ln;
5967 }
d6a28714 5968 break;
95b24440 5969 }
7016d6eb 5970
3c0563b9 5971 case EXACTFL: { /* /abc/il */
a932d541 5972 re_fold_t folder;
9a5a5549
KW
5973 const U8 * fold_array;
5974 const char * s;
d513472c 5975 U32 fold_utf8_flags;
9a5a5549 5976
780fcc9f 5977 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
f67f9e53
KW
5978 folder = foldEQ_locale;
5979 fold_array = PL_fold_locale;
cea315b6 5980 fold_utf8_flags = FOLDEQ_LOCALE;
9a5a5549
KW
5981 goto do_exactf;
5982
a4525e78
KW
5983 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
5984 is effectively /u; hence to match, target
5985 must be UTF-8. */
5986 if (! utf8_target) {
5987 sayNO;
5988 }
613abc6d
KW
5989 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5990 | FOLDEQ_S1_FOLDS_SANE;
ec5e0e1c
KW
5991 folder = foldEQ_latin1;
5992 fold_array = PL_fold_latin1;
a4525e78
KW
5993 goto do_exactf;
5994
3c0563b9 5995 case EXACTFU_SS: /* /\x{df}/iu */
3c0563b9 5996 case EXACTFU: /* /abc/iu */
9a5a5549
KW
5997 folder = foldEQ_latin1;
5998 fold_array = PL_fold_latin1;
984e6dd1 5999 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
9a5a5549
KW
6000 goto do_exactf;
6001
098b07d5
KW
6002 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
6003 patterns */
6004 assert(! is_utf8_pat);
924ba076 6005 /* FALLTHROUGH */
3c0563b9 6006 case EXACTFA: /* /abc/iaa */
2f7f8cb1
KW
6007 folder = foldEQ_latin1;
6008 fold_array = PL_fold_latin1;
57014d77 6009 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
6010 goto do_exactf;
6011
2fdb7295
KW
6012 case EXACTF: /* /abc/i This node only generated for
6013 non-utf8 patterns */
6014 assert(! is_utf8_pat);
9a5a5549
KW
6015 folder = foldEQ;
6016 fold_array = PL_fold;
62bf7766 6017 fold_utf8_flags = 0;
9a5a5549
KW
6018
6019 do_exactf:
6020 s = STRING(scan);
24d3c4a9 6021 ln = STR_LEN(scan);
d6a28714 6022
31f05a37
KW
6023 if (utf8_target
6024 || is_utf8_pat
6025 || state_num == EXACTFU_SS
6026 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
6027 {
3c760661
KW
6028 /* Either target or the pattern are utf8, or has the issue where
6029 * the fold lengths may differ. */
be8e71aa 6030 const char * const l = locinput;
220db18a 6031 char *e = reginfo->strend;
bc517b45 6032
984e6dd1 6033 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
fa5b1667 6034 l, &e, 0, utf8_target, fold_utf8_flags))
c3e1d013
KW
6035 {
6036 sayNO;
5486206c 6037 }
d07ddd77 6038 locinput = e;
d07ddd77 6039 break;
a0ed51b3 6040 }
d6a28714 6041
0a138b74 6042 /* Neither the target nor the pattern are utf8 */
1443c94c
DM
6043 if (UCHARAT(s) != nextchr
6044 && !NEXTCHR_IS_EOS
6045 && UCHARAT(s) != fold_array[nextchr])
9a5a5549 6046 {
a0ed51b3 6047 sayNO;
9a5a5549 6048 }
220db18a 6049 if (reginfo->strend - locinput < ln)
b8c5462f 6050 sayNO;
9a5a5549 6051 if (ln > 1 && ! folder(s, locinput, ln))
4633a7c4 6052 sayNO;
24d3c4a9 6053 locinput += ln;
a0d0e21e 6054 break;
9a5a5549 6055 }
63ac0dad 6056
3c0563b9 6057 case NBOUNDL: /* /\B/l */
5c388b33
KW
6058 to_complement = 1;
6059 /* FALLTHROUGH */
6060
6061 case BOUNDL: /* /\b/l */
73e8ff00
DM
6062 {
6063 bool b1, b2;
780fcc9f 6064 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
64935bc6
KW
6065
6066 if (FLAGS(scan) != TRADITIONAL_BOUND) {
6067 if (! IN_UTF8_CTYPE_LOCALE) {
6068 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
6069 B_ON_NON_UTF8_LOCALE_IS_WRONG);
6070 }
6071 goto boundu;
6072 }
6073
5c388b33
KW
6074 if (utf8_target) {
6075 if (locinput == reginfo->strbeg)
73e8ff00 6076 b1 = isWORDCHAR_LC('\n');
5c388b33 6077 else {
7a207065
KW
6078 b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
6079 (U8*)(reginfo->strbeg)),
6080 (U8*)(reginfo->strend));
5c388b33 6081 }
73e8ff00 6082 b2 = (NEXTCHR_IS_EOS)
5c388b33 6083 ? isWORDCHAR_LC('\n')
7a207065
KW
6084 : isWORDCHAR_LC_utf8_safe((U8*) locinput,
6085 (U8*) reginfo->strend);
5c388b33
KW
6086 }
6087 else { /* Here the string isn't utf8 */
73e8ff00 6088 b1 = (locinput == reginfo->strbeg)
5c388b33
KW
6089 ? isWORDCHAR_LC('\n')
6090 : isWORDCHAR_LC(UCHARAT(locinput - 1));
73e8ff00 6091 b2 = (NEXTCHR_IS_EOS)
5c388b33
KW
6092 ? isWORDCHAR_LC('\n')
6093 : isWORDCHAR_LC(nextchr);
6094 }
73e8ff00 6095 if (to_complement ^ (b1 == b2)) {
5c388b33
KW
6096 sayNO;
6097 }
6098 break;
73e8ff00 6099 }
5c388b33
KW
6100
6101 case NBOUND: /* /\B/ */
6102 to_complement = 1;
780fcc9f 6103 /* FALLTHROUGH */
5c388b33 6104
3c0563b9 6105 case BOUND: /* /\b/ */
5c388b33
KW
6106 if (utf8_target) {
6107 goto bound_utf8;
6108 }
6109 goto bound_ascii_match_only;
6110
6111 case NBOUNDA: /* /\B/a */
6112 to_complement = 1;
6113 /* FALLTHROUGH */
6114
3c0563b9 6115 case BOUNDA: /* /\b/a */
73e8ff00
DM
6116 {
6117 bool b1, b2;
5c388b33 6118
c52b8b12 6119 bound_ascii_match_only:
5c388b33
KW
6120 /* Here the string isn't utf8, or is utf8 and only ascii characters
6121 * are to match \w. In the latter case looking at the byte just
6122 * prior to the current one may be just the final byte of a
6123 * multi-byte character. This is ok. There are two cases:
6124 * 1) it is a single byte character, and then the test is doing
6125 * just what it's supposed to.
6126 * 2) it is a multi-byte character, in which case the final byte is
6127 * never mistakable for ASCII, and so the test will say it is
6128 * not a word character, which is the correct answer. */
73e8ff00 6129 b1 = (locinput == reginfo->strbeg)
5c388b33
KW
6130 ? isWORDCHAR_A('\n')
6131 : isWORDCHAR_A(UCHARAT(locinput - 1));
73e8ff00 6132 b2 = (NEXTCHR_IS_EOS)
5c388b33
KW
6133 ? isWORDCHAR_A('\n')
6134 : isWORDCHAR_A(nextchr);
73e8ff00 6135 if (to_complement ^ (b1 == b2)) {
5c388b33
KW
6136 sayNO;
6137 }
6138 break;
73e8ff00 6139 }
5c388b33 6140
3c0563b9 6141 case NBOUNDU: /* /\B/u */
5c388b33
KW
6142 to_complement = 1;
6143 /* FALLTHROUGH */
b2680017 6144
5c388b33 6145 case BOUNDU: /* /\b/u */
64935bc6
KW
6146
6147 boundu:
a7a8bd1e
KW
6148 if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
6149 match = FALSE;
6150 }
6151 else if (utf8_target) {
64935bc6
KW
6152 bound_utf8:
6153 switch((bound_type) FLAGS(scan)) {
6154 case TRADITIONAL_BOUND:
73e8ff00
DM
6155 {
6156 bool b1, b2;
6157 b1 = (locinput == reginfo->strbeg)
996de84d 6158 ? 0 /* isWORDCHAR_L1('\n') */
7a207065
KW
6159 : isWORDCHAR_utf8_safe(
6160 reghop3((U8*)locinput,
6161 -1,
6162 (U8*)(reginfo->strbeg)),
6163 (U8*) reginfo->strend);
73e8ff00 6164 b2 = (NEXTCHR_IS_EOS)
996de84d 6165 ? 0 /* isWORDCHAR_L1('\n') */
7a207065
KW
6166 : isWORDCHAR_utf8_safe((U8*)locinput,
6167 (U8*) reginfo->strend);
73e8ff00 6168 match = cBOOL(b1 != b2);
64935bc6 6169 break;
73e8ff00 6170 }
64935bc6
KW
6171 case GCB_BOUND:
6172 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6173 match = TRUE; /* GCB always matches at begin and
6174 end */
6175 }
6176 else {
6177 /* Find the gcb values of previous and current
6178 * chars, then see if is a break point */
6179 match = isGCB(getGCB_VAL_UTF8(
6180 reghop3((U8*)locinput,
6181 -1,
6182 (U8*)(reginfo->strbeg)),
6183 (U8*) reginfo->strend),
6184 getGCB_VAL_UTF8((U8*) locinput,
b0e24409
KW
6185 (U8*) reginfo->strend),
6186 (U8*) reginfo->strbeg,
6187 (U8*) locinput,
6188 utf8_target);
64935bc6
KW
6189 }
6190 break;
06ae2722 6191
6b659339
KW
6192 case LB_BOUND:
6193 if (locinput == reginfo->strbeg) {
6194 match = FALSE;
6195 }
6196 else if (NEXTCHR_IS_EOS) {
6197 match = TRUE;
6198 }
6199 else {
6200 match = isLB(getLB_VAL_UTF8(
6201 reghop3((U8*)locinput,
6202 -1,
6203 (U8*)(reginfo->strbeg)),
6204 (U8*) reginfo->strend),
6205 getLB_VAL_UTF8((U8*) locinput,
6206 (U8*) reginfo->strend),
6207 (U8*) reginfo->strbeg,
6208 (U8*) locinput,
6209 (U8*) reginfo->strend,
6210 utf8_target);
6211 }
6212 break;
6213
06ae2722
KW
6214 case SB_BOUND: /* Always matches at begin and end */
6215 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6216 match = TRUE;
6217 }
6218 else {
6219 match = isSB(getSB_VAL_UTF8(
6220 reghop3((U8*)locinput,
6221 -1,
6222 (U8*)(reginfo->strbeg)),
6223 (U8*) reginfo->strend),
6224 getSB_VAL_UTF8((U8*) locinput,
6225 (U8*) reginfo->strend),
6226 (U8*) reginfo->strbeg,
6227 (U8*) locinput,
6228 (U8*) reginfo->strend,
6229 utf8_target);
6230 }
6231 break;
6232
ae3bb8ea
KW
6233 case WB_BOUND:
6234 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6235 match = TRUE;
6236 }
6237 else {
85e5f08b 6238 match = isWB(WB_UNKNOWN,
ae3bb8ea
KW
6239 getWB_VAL_UTF8(
6240 reghop3((U8*)locinput,
6241 -1,
6242 (U8*)(reginfo->strbeg)),
6243 (U8*) reginfo->strend),
6244 getWB_VAL_UTF8((U8*) locinput,
6245 (U8*) reginfo->strend),
6246 (U8*) reginfo->strbeg,
6247 (U8*) locinput,
6248 (U8*) reginfo->strend,
6249 utf8_target);
6250 }
6251 break;
64935bc6 6252 }
b2680017 6253 }
64935bc6
KW
6254 else { /* Not utf8 target */
6255 switch((bound_type) FLAGS(scan)) {
6256 case TRADITIONAL_BOUND:
73e8ff00
DM
6257 {
6258 bool b1, b2;
6259 b1 = (locinput == reginfo->strbeg)
996de84d 6260 ? 0 /* isWORDCHAR_L1('\n') */
aa383448 6261 : isWORDCHAR_L1(UCHARAT(locinput - 1));
73e8ff00 6262 b2 = (NEXTCHR_IS_EOS)
996de84d 6263 ? 0 /* isWORDCHAR_L1('\n') */
aa383448 6264 : isWORDCHAR_L1(nextchr);
73e8ff00 6265 match = cBOOL(b1 != b2);
64935bc6 6266 break;
73e8ff00 6267 }
cfaf538b 6268
64935bc6
KW
6269 case GCB_BOUND:
6270 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6271 match = TRUE; /* GCB always matches at begin and
6272 end */
6273 }
6274 else { /* Only CR-LF combo isn't a GCB in 0-255
6275 range */
6276 match = UCHARAT(locinput - 1) != '\r'
6277 || UCHARAT(locinput) != '\n';
6278 }
6279 break;
06ae2722 6280
6b659339
KW
6281 case LB_BOUND:
6282 if (locinput == reginfo->strbeg) {
6283 match = FALSE;
6284 }
6285 else if (NEXTCHR_IS_EOS) {
6286 match = TRUE;
6287 }
6288 else {
6289 match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
6290 getLB_VAL_CP(UCHARAT(locinput)),
6291 (U8*) reginfo->strbeg,
6292 (U8*) locinput,
6293 (U8*) reginfo->strend,
6294 utf8_target);
6295 }
6296 break;
6297
06ae2722
KW
6298 case SB_BOUND: /* Always matches at begin and end */
6299 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6300 match = TRUE;
6301 }
6302 else {
6303 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
6304 getSB_VAL_CP(UCHARAT(locinput)),
6305 (U8*) reginfo->strbeg,
6306 (U8*) locinput,
6307 (U8*) reginfo->strend,
6308 utf8_target);
6309 }
6310 break;
6311
ae3bb8ea
KW
6312 case WB_BOUND:
6313 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
6314 match = TRUE;
6315 }
6316 else {
85e5f08b 6317 match = isWB(WB_UNKNOWN,
ae3bb8ea
KW
6318 getWB_VAL_CP(UCHARAT(locinput -1)),
6319 getWB_VAL_CP(UCHARAT(locinput)),
6320 (U8*) reginfo->strbeg,
6321 (U8*) locinput,
6322 (U8*) reginfo->strend,
6323 utf8_target);
6324 }
6325 break;
64935bc6 6326 }
b2680017 6327 }
5c388b33 6328
64935bc6 6329 if (to_complement ^ ! match) {
5c388b33
KW
6330 sayNO;
6331 }
b2680017 6332 break;
3c0563b9 6333
a4525e78 6334 case ANYOFL: /* /[abc]/l */
780fcc9f 6335 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 6336
d1c40ef5 6337 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
a0bd1a30
KW
6338 {
6339 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
6340 }
780fcc9f 6341 /* FALLTHROUGH */
ac44c12e 6342 case ANYOFD: /* /[abc]/d */
a4525e78 6343 case ANYOF: /* /[abc]/ */
7016d6eb
DM
6344 if (NEXTCHR_IS_EOS)
6345 sayNO;
dcf88e34 6346 if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
3db24e1e
KW
6347 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
6348 utf8_target))
09b08e9b 6349 sayNO;
635cd5d4 6350 locinput += UTF8SKIP(locinput);
ffc61ed2
JH
6351 }
6352 else {
451c6e0b 6353 if (!REGINCLASS(rex, scan, (U8*)locinput, utf8_target))
09b08e9b 6354 sayNO;
3640db6b 6355 locinput++;
e0f9d4a8 6356 }
b8c5462f 6357 break;
3c0563b9 6358
3018b823
KW
6359 /* The argument (FLAGS) to all the POSIX node types is the class number
6360 * */
ee9a90b8 6361
3018b823
KW
6362 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
6363 to_complement = 1;
6364 /* FALLTHROUGH */
6365
6366 case POSIXL: /* \w or [:punct:] etc. under /l */
780fcc9f 6367 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823 6368 if (NEXTCHR_IS_EOS)
bedac28b 6369 sayNO;
bedac28b 6370
3018b823
KW
6371 /* Use isFOO_lc() for characters within Latin1. (Note that
6372 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6373 * wouldn't be invariant) */
6374 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
eb4e9c04 6375 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
bedac28b
KW
6376 sayNO;
6377 }
109ac342
KW
6378
6379 locinput++;
6380 break;
bedac28b 6381 }
109ac342 6382
042d9e50
KW
6383 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
6384 /* An above Latin-1 code point, or malformed */
6385 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
6386 reginfo->strend);
01f55654 6387 goto utf8_posix_above_latin1;
bedac28b 6388 }
3018b823 6389
109ac342
KW
6390 /* Here is a UTF-8 variant code point below 256 and the target is
6391 * UTF-8 */
6392 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
6393 EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
6394 *(locinput + 1))))))
6395 {
6396 sayNO;
6397 }
6398
6399 goto increment_locinput;
bedac28b 6400
3018b823
KW
6401 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
6402 to_complement = 1;
6403 /* FALLTHROUGH */
6404
6405 case POSIXD: /* \w or [:punct:] etc. under /d */
bedac28b 6406 if (utf8_target) {
3018b823 6407 goto utf8_posix;
bedac28b 6408 }
3018b823
KW
6409 goto posixa;
6410
6411 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
bedac28b 6412
3018b823 6413 if (NEXTCHR_IS_EOS) {
bedac28b
KW
6414 sayNO;
6415 }
bedac28b 6416
3018b823
KW
6417 /* All UTF-8 variants match */
6418 if (! UTF8_IS_INVARIANT(nextchr)) {
6419 goto increment_locinput;
bedac28b 6420 }
ee9a90b8 6421
3018b823 6422 to_complement = 1;
e29d83e2 6423 goto join_nposixa;
3018b823
KW
6424
6425 case POSIXA: /* \w or [:punct:] etc. under /a */
6426
6427 posixa:
6428 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
6429 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
6430 * character is a single byte */
20d0b1e9 6431
e29d83e2
KW
6432 if (NEXTCHR_IS_EOS) {
6433 sayNO;
6434 }
6435
6436 join_nposixa:
6437
6438 if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
6439 FLAGS(scan)))))
3018b823 6440 {
0658cdde
KW
6441 sayNO;
6442 }
3018b823
KW
6443
6444 /* Here we are either not in utf8, or we matched a utf8-invariant,
6445 * so the next char is the next byte */
3640db6b 6446 locinput++;
0658cdde 6447 break;
3c0563b9 6448
3018b823
KW
6449 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
6450 to_complement = 1;
6451 /* FALLTHROUGH */
6452
6453 case POSIXU: /* \w or [:punct:] etc. under /u */
6454 utf8_posix:
6455 if (NEXTCHR_IS_EOS) {
0658cdde
KW
6456 sayNO;
6457 }
3018b823
KW
6458
6459 /* Use _generic_isCC() for characters within Latin1. (Note that
6460 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
6461 * wouldn't be invariant) */
6462 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
6463 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
6464 FLAGS(scan)))))
6465 {
6466 sayNO;
6467 }
6468 locinput++;
6469 }
042d9e50 6470 else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) {
3018b823 6471 if (! (to_complement
a62b247b 6472 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
3018b823 6473 *(locinput + 1)),
94bb8c36 6474 FLAGS(scan)))))
3018b823
KW
6475 {
6476 sayNO;
6477 }
6478 locinput += 2;
6479 }
6480 else { /* Handle above Latin-1 code points */
c52b8b12 6481 utf8_posix_above_latin1:
3018b823
KW
6482 classnum = (_char_class_number) FLAGS(scan);
6483 if (classnum < _FIRST_NON_SWASH_CC) {
6484
6485 /* Here, uses a swash to find such code points. Load if if
6486 * not done already */
6487 if (! PL_utf8_swash_ptrs[classnum]) {
6488 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6489 PL_utf8_swash_ptrs[classnum]
6490 = _core_swash_init("utf8",
2a16ac92
KW
6491 "",
6492 &PL_sv_undef, 1, 0,
6493 PL_XPosix_ptrs[classnum], &flags);
3018b823
KW
6494 }
6495 if (! (to_complement
6496 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
6497 (U8 *) locinput, TRUE))))
6498 {
6499 sayNO;
6500 }
6501 }
6502 else { /* Here, uses macros to find above Latin-1 code points */
6503 switch (classnum) {
779cf272 6504 case _CC_ENUM_SPACE:
3018b823
KW
6505 if (! (to_complement
6506 ^ cBOOL(is_XPERLSPACE_high(locinput))))
6507 {
6508 sayNO;
6509 }
6510 break;
6511 case _CC_ENUM_BLANK:
6512 if (! (to_complement
6513 ^ cBOOL(is_HORIZWS_high(locinput))))
6514 {
6515 sayNO;
6516 }
6517 break;
6518 case _CC_ENUM_XDIGIT:
6519 if (! (to_complement
6520 ^ cBOOL(is_XDIGIT_high(locinput))))
6521 {
6522 sayNO;
6523 }
6524 break;
6525 case _CC_ENUM_VERTSPACE:
6526 if (! (to_complement
6527 ^ cBOOL(is_VERTWS_high(locinput))))
6528 {
6529 sayNO;
6530 }
6531 break;
6532 default: /* The rest, e.g. [:cntrl:], can't match
6533 above Latin1 */
6534 if (! to_complement) {
6535 sayNO;
6536 }
6537 break;
6538 }
6539 }
6540 locinput += UTF8SKIP(locinput);
6541 }
6542 break;
0658cdde 6543
37e2e78e
KW
6544 case CLUMP: /* Match \X: logical Unicode character. This is defined as
6545 a Unicode extended Grapheme Cluster */
7016d6eb 6546 if (NEXTCHR_IS_EOS)
a0ed51b3 6547 sayNO;
f2ed9b32 6548 if (! utf8_target) {
37e2e78e
KW
6549
6550 /* Match either CR LF or '.', as all the other possibilities
6551 * require utf8 */
6552 locinput++; /* Match the . or CR */
cc3b396d
KW
6553 if (nextchr == '\r' /* And if it was CR, and the next is LF,
6554 match the LF */
220db18a 6555 && locinput < reginfo->strend
e699a1d5
KW
6556 && UCHARAT(locinput) == '\n')
6557 {
6558 locinput++;
6559 }
37e2e78e
KW
6560 }
6561 else {
6562
64935bc6 6563 /* Get the gcb type for the current character */
85e5f08b 6564 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
64935bc6 6565 (U8*) reginfo->strend);
37e2e78e 6566
64935bc6
KW
6567 /* Then scan through the input until we get to the first
6568 * character whose type is supposed to be a gcb with the
6569 * current character. (There is always a break at the
6570 * end-of-input) */
6571 locinput += UTF8SKIP(locinput);
6572 while (locinput < reginfo->strend) {
85e5f08b 6573 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
64935bc6 6574 (U8*) reginfo->strend);
b0e24409
KW
6575 if (isGCB(prev_gcb, cur_gcb,
6576 (U8*) reginfo->strbeg, (U8*) locinput,
6577 utf8_target))
6578 {
64935bc6 6579 break;
27d4fc33 6580 }
11dfcd49 6581
64935bc6
KW
6582 prev_gcb = cur_gcb;
6583 locinput += UTF8SKIP(locinput);
6584 }
37e2e78e 6585
37e2e78e 6586
37e2e78e 6587 }
a0ed51b3 6588 break;
81714fb9 6589
3c0563b9 6590 case NREFFL: /* /\g{name}/il */
d7ef4b73
KW
6591 { /* The capture buffer cases. The ones beginning with N for the
6592 named buffers just convert to the equivalent numbered and
6593 pretend they were called as the corresponding numbered buffer
6594 op. */
26ecd678
TC
6595 /* don't initialize these in the declaration, it makes C++
6596 unhappy */
9d9163fb 6597 const char *s;
ff1157ca 6598 char type;
8368298a
TC
6599 re_fold_t folder;
6600 const U8 *fold_array;
26ecd678 6601 UV utf8_fold_flags;
8368298a 6602
780fcc9f 6603 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
d7ef4b73
KW
6604 folder = foldEQ_locale;
6605 fold_array = PL_fold_locale;
6606 type = REFFL;
cea315b6 6607 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
6608 goto do_nref;
6609
3c0563b9 6610 case NREFFA: /* /\g{name}/iaa */
2f7f8cb1
KW
6611 folder = foldEQ_latin1;
6612 fold_array = PL_fold_latin1;
6613 type = REFFA;
6614 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6615 goto do_nref;
6616
3c0563b9 6617 case NREFFU: /* /\g{name}/iu */
d7ef4b73
KW
6618 folder = foldEQ_latin1;
6619 fold_array = PL_fold_latin1;
6620 type = REFFU;
d513472c 6621 utf8_fold_flags = 0;
d7ef4b73
KW
6622 goto do_nref;
6623
3c0563b9 6624 case NREFF: /* /\g{name}/i */
d7ef4b73
KW
6625 folder = foldEQ;
6626 fold_array = PL_fold;
6627 type = REFF;
d513472c 6628 utf8_fold_flags = 0;
d7ef4b73
KW
6629 goto do_nref;
6630
3c0563b9 6631 case NREF: /* /\g{name}/ */
d7ef4b73 6632 type = REF;
83d7b90b
KW
6633 folder = NULL;
6634 fold_array = NULL;
d513472c 6635 utf8_fold_flags = 0;
d7ef4b73
KW
6636 do_nref:
6637
6638 /* For the named back references, find the corresponding buffer
6639 * number */
0a4db386
YO
6640 n = reg_check_named_buff_matched(rex,scan);
6641
d7ef4b73 6642 if ( ! n ) {
81714fb9 6643 sayNO;
d7ef4b73
KW
6644 }
6645 goto do_nref_ref_common;
6646
3c0563b9 6647 case REFFL: /* /\1/il */
780fcc9f 6648 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
d7ef4b73
KW
6649 folder = foldEQ_locale;
6650 fold_array = PL_fold_locale;
cea315b6 6651 utf8_fold_flags = FOLDEQ_LOCALE;
d7ef4b73
KW
6652 goto do_ref;
6653
3c0563b9 6654 case REFFA: /* /\1/iaa */
2f7f8cb1
KW
6655 folder = foldEQ_latin1;
6656 fold_array = PL_fold_latin1;
6657 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6658 goto do_ref;
6659
3c0563b9 6660 case REFFU: /* /\1/iu */
d7ef4b73
KW
6661 folder = foldEQ_latin1;
6662 fold_array = PL_fold_latin1;
d513472c 6663 utf8_fold_flags = 0;
d7ef4b73
KW
6664 goto do_ref;
6665
3c0563b9 6666 case REFF: /* /\1/i */
d7ef4b73
KW
6667 folder = foldEQ;
6668 fold_array = PL_fold;
d513472c 6669 utf8_fold_flags = 0;
83d7b90b 6670 goto do_ref;
d7ef4b73 6671
3c0563b9 6672 case REF: /* /\1/ */
83d7b90b
KW
6673 folder = NULL;
6674 fold_array = NULL;
d513472c 6675 utf8_fold_flags = 0;
83d7b90b 6676
d7ef4b73 6677 do_ref:
81714fb9 6678 type = OP(scan);
d7ef4b73
KW
6679 n = ARG(scan); /* which paren pair */
6680
6681 do_nref_ref_common:
b93070ed 6682 ln = rex->offs[n].start;
2dfc11ec 6683 endref = rex->offs[n].end;
1cb48e53 6684 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
2dfc11ec 6685 if (rex->lastparen < n || ln == -1 || endref == -1)
af3f8c16 6686 sayNO; /* Do not match unless seen CLOSEn. */
2dfc11ec 6687 if (ln == endref)
a0d0e21e 6688 break;
a0ed51b3 6689
9d9163fb 6690 s = reginfo->strbeg + ln;
d7ef4b73 6691 if (type != REF /* REF can do byte comparison */
31f05a37
KW
6692 && (utf8_target || type == REFFU || type == REFFL))
6693 {
220db18a 6694 char * limit = reginfo->strend;
d7ef4b73
KW
6695
6696 /* This call case insensitively compares the entire buffer
6697 * at s, with the current input starting at locinput, but
220db18a
DM
6698 * not going off the end given by reginfo->strend, and
6699 * returns in <limit> upon success, how much of the
6700 * current input was matched */
2dfc11ec 6701 if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target,
d513472c 6702 locinput, &limit, 0, utf8_target, utf8_fold_flags))
d7ef4b73
KW
6703 {
6704 sayNO;
a0ed51b3 6705 }
d7ef4b73 6706 locinput = limit;
a0ed51b3
LW
6707 break;
6708 }
6709
d7ef4b73 6710 /* Not utf8: Inline the first character, for speed. */
7016d6eb
DM
6711 if (!NEXTCHR_IS_EOS &&
6712 UCHARAT(s) != nextchr &&
81714fb9 6713 (type == REF ||
d7ef4b73 6714 UCHARAT(s) != fold_array[nextchr]))
4633a7c4 6715 sayNO;
2dfc11ec 6716 ln = endref - ln;
220db18a 6717 if (locinput + ln > reginfo->strend)
4633a7c4 6718 sayNO;
81714fb9 6719 if (ln > 1 && (type == REF
24d3c4a9 6720 ? memNE(s, locinput, ln)
d7ef4b73 6721 : ! folder(s, locinput, ln)))
4633a7c4 6722 sayNO;
24d3c4a9 6723 locinput += ln;
a0d0e21e 6724 break;
81714fb9 6725 }
3c0563b9
DM
6726
6727 case NOTHING: /* null op; e.g. the 'nothing' following
6728 * the '*' in m{(a+|b)*}' */
6729 break;
6730 case TAIL: /* placeholder while compiling (A|B|C) */
a0d0e21e 6731 break;
3c0563b9 6732
40a82448
DM
6733#undef ST
6734#define ST st->u.eval
401a8022
YO
6735#define CUR_EVAL cur_eval->u.eval
6736
c277df42 6737 {
c277df42 6738 SV *ret;
d2f13c59 6739 REGEXP *re_sv;
6bda09f9 6740 regexp *re;
f8fc2ecf 6741 regexp_internal *rei;
1a147d38 6742 regnode *startpoint;
ba6840fb 6743 U32 arg;
1a147d38 6744
e7707071 6745 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
ba6840fb
YO
6746 arg= (U32)ARG(scan);
6747 if (cur_eval && cur_eval->locinput == locinput) {
4b196cd4 6748 if ( ++nochange_depth > max_nochange_depth )
1a147d38
YO
6749 Perl_croak(aTHX_
6750 "Pattern subroutine nesting without pos change"
6751 " exceeded limit in regex");
6bda09f9
YO
6752 } else {
6753 nochange_depth = 0;
1a147d38 6754 }
288b8c02 6755 re_sv = rex_sv;
6bda09f9 6756 re = rex;
f8fc2ecf 6757 rei = rexi;
d5a00e4a 6758 startpoint = scan + ARG2L(scan);
401a8022 6759 EVAL_CLOSE_PAREN_SET( st, arg );
ba6840fb
YO
6760 /* Detect infinite recursion
6761 *
6762 * A pattern like /(?R)foo/ or /(?<x>(?&y)foo)(?<y>(?&x)bar)/
6763 * or "a"=~/(.(?2))((?<=(?=(?1)).))/ could recurse forever.
6764 * So we track the position in the string we are at each time
6765 * we recurse and if we try to enter the same routine twice from
401a8022 6766 * the same position we throw an error.
ba6840fb
YO
6767 */
6768 if ( rex->recurse_locinput[arg] == locinput ) {
401a8022
YO
6769 /* FIXME: we should show the regop that is failing as part
6770 * of the error message. */
6771 Perl_croak(aTHX_ "Infinite recursion in regex");
6772 } else {
6773 ST.prev_recurse_locinput= rex->recurse_locinput[arg];
6774 rex->recurse_locinput[arg]= locinput;
6775
ba6840fb
YO
6776 DEBUG_r({
6777 GET_RE_DEBUG_FLAGS_DECL;
401a8022 6778 DEBUG_STACK_r({
6ad9a8ab 6779 Perl_re_exec_indentf( aTHX_
401a8022
YO
6780 "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n",
6781 depth, ST.prev_recurse_locinput, arg, rex->recurse_locinput[arg]
6782 );
ba6840fb
YO
6783 });
6784 });
ba6840fb 6785 }
d1b2014a
YO
6786
6787 /* Save all the positions seen so far. */
21553840 6788 ST.cp = regcppush(rex, 0, maxopenparen);
d1b2014a
YO
6789 REGCP_SET(ST.lastcp);
6790
6791 /* and then jump to the code we share with EVAL */
6bda09f9 6792 goto eval_recurse_doit;
a74ff37d 6793 /* NOTREACHED */
3c0563b9 6794
4ee16520 6795 case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */
6bda09f9 6796 if (cur_eval && cur_eval->locinput==locinput) {
4b196cd4 6797 if ( ++nochange_depth > max_nochange_depth )
1a147d38 6798 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6bda09f9
YO
6799 } else {
6800 nochange_depth = 0;
6801 }
8e5e9ebe 6802 {
4aabdb9b 6803 /* execute the code in the {...} */
81ed78b2 6804
4aabdb9b 6805 dSP;
a6dc34f1 6806 IV before;
1f4d1a1e 6807 OP * const oop = PL_op;
4aabdb9b 6808 COP * const ocurcop = PL_curcop;
81ed78b2 6809 OP *nop;
81ed78b2 6810 CV *newcv;
91332126 6811
74088413 6812 /* save *all* paren positions */
21553840 6813 regcppush(rex, 0, maxopenparen);
4ee16520 6814 REGCP_SET(ST.lastcp);
74088413 6815
81ed78b2
DM
6816 if (!caller_cv)
6817 caller_cv = find_runcv(NULL);
6818
4aabdb9b 6819 n = ARG(scan);
81ed78b2 6820
b30fcab9 6821 if (rexi->data->what[n] == 'r') { /* code from an external qr */
ba6840fb
YO
6822 newcv = (ReANY(
6823 (REGEXP*)(rexi->data->data[n])
6824 ))->qr_anoncv;
81ed78b2 6825 nop = (OP*)rexi->data->data[n+1];
b30fcab9
DM
6826 }
6827 else if (rexi->data->what[n] == 'l') { /* literal code */
81ed78b2
DM
6828 newcv = caller_cv;
6829 nop = (OP*)rexi->data->data[n];
6830 assert(CvDEPTH(newcv));
68e2671b
DM
6831 }
6832 else {
d24ca0c5
DM
6833 /* literal with own CV */
6834 assert(rexi->data->what[n] == 'L');
81ed78b2
DM
6835 newcv = rex->qr_anoncv;
6836 nop = (OP*)rexi->data->data[n];
68e2671b 6837 }
81ed78b2 6838
4b9c7cae
DM
6839 /* Some notes about MULTICALL and the context and save stacks.
6840 *
6841 * In something like
825fcd83 6842 * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../
4b9c7cae
DM
6843 * since codeblocks don't introduce a new scope (so that
6844 * local() etc accumulate), at the end of a successful
6845 * match there will be a SAVEt_CLEARSV on the savestack
6846 * for each of $x, $y, $z. If the three code blocks above
6847 * happen to have come from different CVs (e.g. via
6848 * embedded qr//s), then we must ensure that during any
6849 * savestack unwinding, PL_comppad always points to the
6850 * right pad at each moment. We achieve this by
6851 * interleaving SAVEt_COMPPAD's on the savestack whenever
6852 * there is a change of pad.
6853 * In theory whenever we call a code block, we should
6854 * push a CXt_SUB context, then pop it on return from
6855 * that code block. This causes a bit of an issue in that
6856 * normally popping a context also clears the savestack
6857 * back to cx->blk_oldsaveix, but here we specifically
6858 * don't want to clear the save stack on exit from the
6859 * code block.
6860 * Also for efficiency we don't want to keep pushing and
6861 * popping the single SUB context as we backtrack etc.
6862 * So instead, we push a single context the first time
6863 * we need, it, then hang onto it until the end of this
6864 * function. Whenever we encounter a new code block, we
6865 * update the CV etc if that's changed. During the times
6866 * in this function where we're not executing a code
6867 * block, having the SUB context still there is a bit
6868 * naughty - but we hope that no-one notices.
6869 * When the SUB context is initially pushed, we fake up
6870 * cx->blk_oldsaveix to be as if we'd pushed this context
6871 * on first entry to S_regmatch rather than at some random
6872 * point during the regexe execution. That way if we
6873 * croak, popping the context stack will ensure that
6874 * *everything* SAVEd by this function is undone and then
6875 * the context popped, rather than e.g., popping the
6876 * context (and restoring the original PL_comppad) then
825fcd83 6877 * popping more of the savestack and restoring a bad
4b9c7cae
DM
6878 * PL_comppad.
6879 */
6880
6881 /* If this is the first EVAL, push a MULTICALL. On
6882 * subsequent calls, if we're executing a different CV, or
6883 * if PL_comppad has got messed up from backtracking
6884 * through SAVECOMPPADs, then refresh the context.
6885 */
0e458318
DM
6886 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6887 {
b0065247
DM
6888 U8 flags = (CXp_SUB_RE |
6889 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
4b9c7cae 6890 SAVECOMPPAD();
0e458318 6891 if (last_pushed_cv) {
b0065247 6892 CHANGE_MULTICALL_FLAGS(newcv, flags);
0e458318
DM
6893 }
6894 else {
b0065247 6895 PUSH_MULTICALL_FLAGS(newcv, flags);
0e458318 6896 }
4b9c7cae
DM
6897 /* see notes above */
6898 CX_CUR()->blk_oldsaveix = orig_savestack_ix;
6899
0e458318
DM
6900 last_pushed_cv = newcv;
6901 }
c31ee3bb
DM
6902 else {
6903 /* these assignments are just to silence compiler
6904 * warnings */
6905 multicall_cop = NULL;
c31ee3bb 6906 }
0e458318
DM
6907 last_pad = PL_comppad;
6908
2e2e3f36
DM
6909 /* the initial nextstate you would normally execute
6910 * at the start of an eval (which would cause error
6911 * messages to come from the eval), may be optimised
6912 * away from the execution path in the regex code blocks;
6913 * so manually set PL_curcop to it initially */
6914 {
81ed78b2 6915 OP *o = cUNOPx(nop)->op_first;
2e2e3f36
DM
6916 assert(o->op_type == OP_NULL);
6917 if (o->op_targ == OP_SCOPE) {
6918 o = cUNOPo->op_first;
6919 }
6920 else {
6921 assert(o->op_targ == OP_LEAVE);
6922 o = cUNOPo->op_first;
6923 assert(o->op_type == OP_ENTER);
e6dae479 6924 o = OpSIBLING(o);
2e2e3f36
DM
6925 }
6926
6927 if (o->op_type != OP_STUB) {
6928 assert( o->op_type == OP_NEXTSTATE
6929 || o->op_type == OP_DBSTATE
6930 || (o->op_type == OP_NULL
6931 && ( o->op_targ == OP_NEXTSTATE
6932 || o->op_targ == OP_DBSTATE
6933 )
6934 )
6935 );
6936 PL_curcop = (COP*)o;
6937 }
6938 }
81ed78b2 6939 nop = nop->op_next;
2e2e3f36 6940
6ad9a8ab 6941 DEBUG_STATE_r( Perl_re_printf( aTHX_
147e3846 6942 " re EVAL PL_op=0x%" UVxf "\n", PTR2UV(nop)) );
81ed78b2 6943
8adc0f72 6944 rex->offs[0].end = locinput - reginfo->strbeg;
bf2039a9 6945 if (reginfo->info_aux_eval->pos_magic)
25fdce4a
FC
6946 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6947 reginfo->sv, reginfo->strbeg,
6948 locinput - reginfo->strbeg);
4aabdb9b 6949
2bf803e2
YO
6950 if (sv_yes_mark) {
6951 SV *sv_mrk = get_sv("REGMARK", 1);
6952 sv_setsv(sv_mrk, sv_yes_mark);
6953 }
6954
81ed78b2
DM
6955 /* we don't use MULTICALL here as we want to call the
6956 * first op of the block of interest, rather than the
79503099
DM
6957 * first op of the sub. Also, we don't want to free
6958 * the savestack frame */
a6dc34f1 6959 before = (IV)(SP-PL_stack_base);
81ed78b2 6960 PL_op = nop;
8e5e9ebe
RGS
6961 CALLRUNOPS(aTHX); /* Scalar context. */
6962 SPAGAIN;
a6dc34f1 6963 if ((IV)(SP-PL_stack_base) == before)
075aa684 6964 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
8e5e9ebe
RGS
6965 else {
6966 ret = POPs;
6967 PUTBACK;
6968 }
4aabdb9b 6969
e4bfbed3
DM
6970 /* before restoring everything, evaluate the returned
6971 * value, so that 'uninit' warnings don't use the wrong
497d0a96
DM
6972 * PL_op or pad. Also need to process any magic vars
6973 * (e.g. $1) *before* parentheses are restored */
e4bfbed3
DM
6974
6975 PL_op = NULL;
6976
5e98dac2 6977 re_sv = NULL;
e4bfbed3
DM
6978 if (logical == 0) /* (?{})/ */
6979 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6980 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
6981 sw = cBOOL(SvTRUE(ret));
6982 logical = 0;
6983 }
6984 else { /* /(??{}) */
497d0a96
DM
6985 /* if its overloaded, let the regex compiler handle
6986 * it; otherwise extract regex, or stringify */
237da807 6987 if (SvGMAGICAL(ret))
2685dc2d 6988 ret = sv_mortalcopy(ret);
497d0a96
DM
6989 if (!SvAMAGIC(ret)) {
6990 SV *sv = ret;
6991 if (SvROK(sv))
6992 sv = SvRV(sv);
6993 if (SvTYPE(sv) == SVt_REGEXP)
6994 re_sv = (REGEXP*) sv;
63620942
FC
6995 else if (SvSMAGICAL(ret)) {
6996 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
497d0a96
DM
6997 if (mg)
6998 re_sv = (REGEXP *) mg->mg_obj;
6999 }
e4bfbed3 7000
2685dc2d 7001 /* force any undef warnings here */
237da807
FC
7002 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
7003 ret = sv_mortalcopy(ret);
497d0a96
DM
7004 (void) SvPV_force_nolen(ret);
7005 }
e4bfbed3
DM
7006 }
7007
7008 }
7009
81ed78b2
DM
7010 /* *** Note that at this point we don't restore
7011 * PL_comppad, (or pop the CxSUB) on the assumption it may
7012 * be used again soon. This is safe as long as nothing
7013 * in the regexp code uses the pad ! */
4aabdb9b 7014 PL_op = oop;
4aabdb9b 7015 PL_curcop = ocurcop;
4ee16520 7016 regcp_restore(rex, ST.lastcp, &maxopenparen);
5585e758
YO
7017 PL_curpm_under = PL_curpm;
7018 PL_curpm = PL_reg_curpm;
e4bfbed3 7019
4ee16520
DM
7020 if (logical != 2) {
7021 PUSH_STATE_GOTO(EVAL_B, next, locinput);
7022 /* NOTREACHED */
7023 }
8e5e9ebe 7024 }
e4bfbed3
DM
7025
7026 /* only /(??{})/ from now on */
24d3c4a9 7027 logical = 0;
4aabdb9b 7028 {
4f639d21
DM
7029 /* extract RE object from returned value; compiling if
7030 * necessary */
5c35adbb 7031
575c37f6
DM
7032 if (re_sv) {
7033 re_sv = reg_temp_copy(NULL, re_sv);
288b8c02 7034 }
0f5d15d6 7035 else {
c737faaf 7036 U32 pm_flags = 0;
0f5d15d6 7037
9753d940
DM
7038 if (SvUTF8(ret) && IN_BYTES) {
7039 /* In use 'bytes': make a copy of the octet
7040 * sequence, but without the flag on */
b9ad30b4
NC
7041 STRLEN len;
7042 const char *const p = SvPV(ret, len);
7043 ret = newSVpvn_flags(p, len, SVs_TEMP);
7044 }
732caac7
DM
7045 if (rex->intflags & PREGf_USE_RE_EVAL)
7046 pm_flags |= PMf_USE_RE_EVAL;
7047
7048 /* if we got here, it should be an engine which
7049 * supports compiling code blocks and stuff */
7050 assert(rex->engine && rex->engine->op_comp);
ec841a27 7051 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
575c37f6 7052 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
ec841a27 7053 rex->engine, NULL, NULL,
33be4c61 7054 /* copy /msixn etc to inner pattern */
13f27704 7055 ARG2L(scan),
ec841a27 7056 pm_flags);
732caac7 7057
9041c2e3 7058 if (!(SvFLAGS(ret)
237da807
FC
7059 & (SVs_TEMP | SVs_GMG | SVf_ROK))
7060 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
a2794585
NC
7061 /* This isn't a first class regexp. Instead, it's
7062 caching a regexp onto an existing, Perl visible
7063 scalar. */
575c37f6 7064 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
3ce3ed55 7065 }
0f5d15d6 7066 }
e1ff3a88 7067 SAVEFREESV(re_sv);
8d919b0a 7068 re = ReANY(re_sv);
4aabdb9b 7069 }
07bc277f 7070 RXp_MATCH_COPIED_off(re);
28d8d7f4
YO
7071 re->subbeg = rex->subbeg;
7072 re->sublen = rex->sublen;
6502e081
DM
7073 re->suboffset = rex->suboffset;
7074 re->subcoffset = rex->subcoffset;
d1b2014a
YO
7075 re->lastparen = 0;
7076 re->lastcloseparen = 0;
f8fc2ecf 7077 rei = RXi_GET(re);
6bda09f9 7078 DEBUG_EXECUTE_r(
220db18a
DM
7079 debug_start_match(re_sv, utf8_target, locinput,
7080 reginfo->strend, "Matching embedded");
6bda09f9 7081 );
f8fc2ecf 7082 startpoint = rei->program + 1;
d5a00e4a
YO
7083 EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0;
7084 * close_paren only for GOSUB */
ba6840fb 7085 ST.prev_recurse_locinput= NULL; /* only used for GOSUB */
d1b2014a 7086 /* Save all the seen positions so far. */
21553840 7087 ST.cp = regcppush(rex, 0, maxopenparen);
d1b2014a
YO
7088 REGCP_SET(ST.lastcp);
7089 /* and set maxopenparen to 0, since we are starting a "fresh" match */
7090 maxopenparen = 0;
7091 /* run the pattern returned from (??{...}) */
aa283a38 7092
c52b8b12 7093 eval_recurse_doit: /* Share code with GOSUB below this line
d1b2014a
YO
7094 * At this point we expect the stack context to be
7095 * set up correctly */
4aabdb9b 7096
1cb95af7
DM
7097 /* invalidate the S-L poscache. We're now executing a
7098 * different set of WHILEM ops (and their associated
7099 * indexes) against the same string, so the bits in the
7100 * cache are meaningless. Setting maxiter to zero forces
7101 * the cache to be invalidated and zeroed before reuse.
7102 * XXX This is too dramatic a measure. Ideally we should
7103 * save the old cache and restore when running the outer
7104 * pattern again */
1cb48e53 7105 reginfo->poscache_maxiter = 0;
4aabdb9b 7106
d1b2014a 7107 /* the new regexp might have a different is_utf8_pat than we do */
aed7b151 7108 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
faec1544 7109
288b8c02 7110 ST.prev_rex = rex_sv;
faec1544 7111 ST.prev_curlyx = cur_curlyx;
ec43f78b
DM
7112 rex_sv = re_sv;
7113 SET_reg_curpm(rex_sv);
288b8c02 7114 rex = re;
f8fc2ecf 7115 rexi = rei;
faec1544 7116 cur_curlyx = NULL;
40a82448 7117 ST.B = next;
faec1544
DM
7118 ST.prev_eval = cur_eval;
7119 cur_eval = st;
faec1544 7120 /* now continue from first node in postoned RE */
4ee16520 7121 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput);
661d43c4 7122 NOT_REACHED; /* NOTREACHED */
c277df42 7123 }
40a82448 7124
4ee16520 7125 case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */
27fe105a 7126 /* note: this is called twice; first after popping B, then A */
401a8022 7127 DEBUG_STACK_r({
6ad9a8ab 7128 Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n",
401a8022
YO
7129 depth, cur_eval, ST.prev_eval);
7130 });
7131
7132#define SET_RECURSE_LOCINPUT(STR,VAL)\
7133 if ( cur_eval && CUR_EVAL.close_paren ) {\
6aeaca27
YO
7134 DEBUG_STACK_r({ \
7135 Perl_re_exec_indentf( aTHX_ STR " GOSUB%d ce=%p recurse_locinput=%p\n",\
401a8022
YO
7136 depth, \
7137 CUR_EVAL.close_paren - 1,\
7138 cur_eval, \
7139 VAL); \
7140 }); \
7141 rex->recurse_locinput[CUR_EVAL.close_paren - 1] = VAL;\
7142 }
7143
7144 SET_RECURSE_LOCINPUT("EVAL_AB[before]", CUR_EVAL.prev_recurse_locinput);
ba6840fb 7145
ec43f78b 7146 rex_sv = ST.prev_rex;
aed7b151 7147 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 7148 SET_reg_curpm(rex_sv);
8d919b0a 7149 rex = ReANY(rex_sv);
f8fc2ecf 7150 rexi = RXi_GET(rex);
4b22688e
YO
7151 {
7152 /* preserve $^R across LEAVE's. See Bug 121070. */
7153 SV *save_sv= GvSV(PL_replgv);
7154 SvREFCNT_inc(save_sv);
7155 regcpblow(ST.cp); /* LEAVE in disguise */
7156 sv_setsv(GvSV(PL_replgv), save_sv);
7157 SvREFCNT_dec(save_sv);
7158 }
faec1544
DM
7159 cur_eval = ST.prev_eval;
7160 cur_curlyx = ST.prev_curlyx;
34a81e2b 7161
1cb95af7 7162 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 7163 reginfo->poscache_maxiter = 0;
e7707071 7164 if ( nochange_depth )
4b196cd4 7165 nochange_depth--;
401a8022
YO
7166
7167 SET_RECURSE_LOCINPUT("EVAL_AB[after]", cur_eval->locinput);
262b90c4 7168 sayYES;
40a82448 7169
40a82448 7170
4ee16520
DM
7171 case EVAL_B_fail: /* unsuccessful B in (?{...})B */
7172 REGCP_UNWIND(ST.lastcp);
7173 sayNO;
7174
7175 case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
faec1544 7176 /* note: this is called twice; first after popping B, then A */
401a8022 7177 DEBUG_STACK_r({
6ad9a8ab 7178 Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n",
401a8022
YO
7179 depth, cur_eval, ST.prev_eval);
7180 });
7181
7182 SET_RECURSE_LOCINPUT("EVAL_AB_fail[before]", CUR_EVAL.prev_recurse_locinput);
ba6840fb 7183
ec43f78b 7184 rex_sv = ST.prev_rex;
aed7b151 7185 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 7186 SET_reg_curpm(rex_sv);
8d919b0a 7187 rex = ReANY(rex_sv);
f8fc2ecf 7188 rexi = RXi_GET(rex);
0357f1fd 7189
40a82448 7190 REGCP_UNWIND(ST.lastcp);
21553840 7191 regcppop(rex, &maxopenparen);
faec1544
DM
7192 cur_eval = ST.prev_eval;
7193 cur_curlyx = ST.prev_curlyx;
401a8022 7194
1cb95af7 7195 /* Invalidate cache. See "invalidate" comment above. */
1cb48e53 7196 reginfo->poscache_maxiter = 0;
e7707071 7197 if ( nochange_depth )
4b196cd4 7198 nochange_depth--;
401a8022
YO
7199
7200 SET_RECURSE_LOCINPUT("EVAL_AB_fail[after]", cur_eval->locinput);
7201 sayNO_SILENT;
40a82448
DM
7202#undef ST
7203
3c0563b9 7204 case OPEN: /* ( */
c277df42 7205 n = ARG(scan); /* which paren pair */
9d9163fb 7206 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
92da3157
DM
7207 if (n > maxopenparen)
7208 maxopenparen = n;
2b1a3689 7209 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_
147e3846 7210 "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf " tmp; maxopenparen=%" UVuf "\n",
2b1a3689 7211 depth,
495f47a5
DM
7212 PTR2UV(rex),
7213 PTR2UV(rex->offs),
7214 (UV)n,
7215 (IV)rex->offs[n].start_tmp,
92da3157 7216 (UV)maxopenparen
495f47a5 7217 ));
e2e6a0f1 7218 lastopen = n;
a0d0e21e 7219 break;
495f47a5
DM
7220
7221/* XXX really need to log other places start/end are set too */
7b031478
YO
7222#define CLOSE_CAPTURE \
7223 rex->offs[n].start = rex->offs[n].start_tmp; \
7224 rex->offs[n].end = locinput - reginfo->strbeg; \
2b1a3689 7225 DEBUG_BUFFERS_r(Perl_re_exec_indentf( aTHX_ \
147e3846 7226 "rex=0x%" UVxf " offs=0x%" UVxf ": \\%" UVuf ": set %" IVdf "..%" IVdf "\n", \
2b1a3689 7227 depth, \
7b031478
YO
7228 PTR2UV(rex), \
7229 PTR2UV(rex->offs), \
7230 (UV)n, \
7231 (IV)rex->offs[n].start, \
7232 (IV)rex->offs[n].end \
495f47a5
DM
7233 ))
7234
3c0563b9 7235 case CLOSE: /* ) */
c277df42 7236 n = ARG(scan); /* which paren pair */
495f47a5 7237 CLOSE_CAPTURE;
b93070ed
DM
7238 if (n > rex->lastparen)
7239 rex->lastparen = n;
7240 rex->lastcloseparen = n;
24be3102 7241 if ( EVAL_CLOSE_PAREN_IS( cur_eval, n ) )
6bda09f9 7242 goto fake_end;
24be3102 7243
a0d0e21e 7244 break;
3c0563b9
DM
7245
7246 case ACCEPT: /* (*ACCEPT) */
fee50582
YO
7247 if (scan->flags)
7248 sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7249 if (ARG2L(scan)){
e2e6a0f1
YO
7250 regnode *cursor;
7251 for (cursor=scan;
7252 cursor && OP(cursor)!=END;
7253 cursor=regnext(cursor))
7254 {
7255 if ( OP(cursor)==CLOSE ){
7256 n = ARG(cursor);
7257 if ( n <= lastopen ) {
495f47a5 7258 CLOSE_CAPTURE;
b93070ed
DM
7259 if (n > rex->lastparen)
7260 rex->lastparen = n;
7261 rex->lastcloseparen = n;
24be3102 7262 if ( n == ARG(scan) || EVAL_CLOSE_PAREN_IS(cur_eval, n) )
e2e6a0f1
YO
7263 break;
7264 }
7265 }
7266 }
7267 }
7268 goto fake_end;
a74ff37d 7269 /* NOTREACHED */
3c0563b9
DM
7270
7271 case GROUPP: /* (?(1)) */
c277df42 7272 n = ARG(scan); /* which paren pair */
b93070ed 7273 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
c277df42 7274 break;
3c0563b9
DM
7275
7276 case NGROUPP: /* (?(<name>)) */
0a4db386 7277 /* reg_check_named_buff_matched returns 0 for no match */
f2338a2e 7278 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
0a4db386 7279 break;
3c0563b9
DM
7280
7281 case INSUBP: /* (?(R)) */
0a4db386 7282 n = ARG(scan);
ce12e254
YO
7283 /* this does not need to use EVAL_CLOSE_PAREN macros, as the arg
7284 * of SCAN is already set up as matches a eval.close_paren */
401a8022 7285 sw = cur_eval && (n == 0 || CUR_EVAL.close_paren == n);
0a4db386 7286 break;
3c0563b9
DM
7287
7288 case DEFINEP: /* (?(DEFINE)) */
0a4db386
YO
7289 sw = 0;
7290 break;
3c0563b9
DM
7291
7292 case IFTHEN: /* (?(cond)A|B) */
1cb48e53 7293 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
24d3c4a9 7294 if (sw)
c277df42
IZ
7295 next = NEXTOPER(NEXTOPER(scan));
7296 else {
7297 next = scan + ARG(scan);
7298 if (OP(next) == IFTHEN) /* Fake one. */
7299 next = NEXTOPER(NEXTOPER(next));
7300 }
7301 break;
3c0563b9
DM
7302
7303 case LOGICAL: /* modifier for EVAL and IFMATCH */
24d3c4a9 7304 logical = scan->flags;
c277df42 7305 break;
c476f425 7306
2ab05381 7307/*******************************************************************
2ab05381 7308
c476f425
DM
7309The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
7310pattern, where A and B are subpatterns. (For simple A, CURLYM or
7311STAR/PLUS/CURLY/CURLYN are used instead.)
2ab05381 7312
c476f425 7313A*B is compiled as <CURLYX><A><WHILEM><B>
2ab05381 7314
c476f425
DM
7315On entry to the subpattern, CURLYX is called. This pushes a CURLYX
7316state, which contains the current count, initialised to -1. It also sets
7317cur_curlyx to point to this state, with any previous value saved in the
7318state block.
2ab05381 7319
c476f425
DM
7320CURLYX then jumps straight to the WHILEM op, rather than executing A,
7321since the pattern may possibly match zero times (i.e. it's a while {} loop
7322rather than a do {} while loop).
2ab05381 7323
c476f425
DM
7324Each entry to WHILEM represents a successful match of A. The count in the
7325CURLYX block is incremented, another WHILEM state is pushed, and execution
7326passes to A or B depending on greediness and the current count.
2ab05381 7327
c476f425
DM
7328For example, if matching against the string a1a2a3b (where the aN are
7329substrings that match /A/), then the match progresses as follows: (the
7330pushed states are interspersed with the bits of strings matched so far):
2ab05381 7331
c476f425
DM
7332 <CURLYX cnt=-1>
7333 <CURLYX cnt=0><WHILEM>
7334 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
7335 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
7336 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
7337 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
2ab05381 7338
c476f425
DM
7339(Contrast this with something like CURLYM, which maintains only a single
7340backtrack state:
2ab05381 7341
c476f425
DM
7342 <CURLYM cnt=0> a1
7343 a1 <CURLYM cnt=1> a2
7344 a1 a2 <CURLYM cnt=2> a3
7345 a1 a2 a3 <CURLYM cnt=3> b
7346)
2ab05381 7347
c476f425
DM
7348Each WHILEM state block marks a point to backtrack to upon partial failure
7349of A or B, and also contains some minor state data related to that
7350iteration. The CURLYX block, pointed to by cur_curlyx, contains the
7351overall state, such as the count, and pointers to the A and B ops.
2ab05381 7352
c476f425
DM
7353This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
7354must always point to the *current* CURLYX block, the rules are:
2ab05381 7355
c476f425
DM
7356When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
7357and set cur_curlyx to point the new block.
2ab05381 7358
c476f425
DM
7359When popping the CURLYX block after a successful or unsuccessful match,
7360restore the previous cur_curlyx.
2ab05381 7361
c476f425
DM
7362When WHILEM is about to execute B, save the current cur_curlyx, and set it
7363to the outer one saved in the CURLYX block.
2ab05381 7364
c476f425
DM
7365When popping the WHILEM block after a successful or unsuccessful B match,
7366restore the previous cur_curlyx.
2ab05381 7367
c476f425
DM
7368Here's an example for the pattern (AI* BI)*BO
7369I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
2ab05381 7370
c476f425
DM
7371cur_
7372curlyx backtrack stack
7373------ ---------------
7374NULL
7375CO <CO prev=NULL> <WO>
7376CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7377CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7378NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
2ab05381 7379
c476f425
DM
7380At this point the pattern succeeds, and we work back down the stack to
7381clean up, restoring as we go:
95b24440 7382
c476f425
DM
7383CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
7384CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
7385CO <CO prev=NULL> <WO>
7386NULL
a0374537 7387
c476f425
DM
7388*******************************************************************/
7389
7390#define ST st->u.curlyx
7391
7392 case CURLYX: /* start of /A*B/ (for complex A) */
7393 {
7394 /* No need to save/restore up to this paren */
7395 I32 parenfloor = scan->flags;
7396
7397 assert(next); /* keep Coverity happy */
7398 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
7399 next += ARG(next);
7400
7401 /* XXXX Probably it is better to teach regpush to support
92da3157 7402 parenfloor > maxopenparen ... */
b93070ed
DM
7403 if (parenfloor > (I32)rex->lastparen)
7404 parenfloor = rex->lastparen; /* Pessimization... */
c476f425
DM
7405
7406 ST.prev_curlyx= cur_curlyx;
7407 cur_curlyx = st;
7408 ST.cp = PL_savestack_ix;
7409
7410 /* these fields contain the state of the current curly.
7411 * they are accessed by subsequent WHILEMs */
7412 ST.parenfloor = parenfloor;
d02d6d97 7413 ST.me = scan;
c476f425 7414 ST.B = next;
24d3c4a9
DM
7415 ST.minmod = minmod;
7416 minmod = 0;
c476f425
DM
7417 ST.count = -1; /* this will be updated by WHILEM */
7418 ST.lastloc = NULL; /* this will be updated by WHILEM */
7419
4d5016e5 7420 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
661d43c4 7421 NOT_REACHED; /* NOTREACHED */
c476f425 7422 }
a0d0e21e 7423
c476f425 7424 case CURLYX_end: /* just finished matching all of A*B */
c476f425
DM
7425 cur_curlyx = ST.prev_curlyx;
7426 sayYES;
661d43c4 7427 NOT_REACHED; /* NOTREACHED */
a0d0e21e 7428
c476f425
DM
7429 case CURLYX_end_fail: /* just failed to match all of A*B */
7430 regcpblow(ST.cp);
7431 cur_curlyx = ST.prev_curlyx;
7432 sayNO;
661d43c4 7433 NOT_REACHED; /* NOTREACHED */
4633a7c4 7434
a0d0e21e 7435
c476f425
DM
7436#undef ST
7437#define ST st->u.whilem
7438
7439 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
7440 {
7441 /* see the discussion above about CURLYX/WHILEM */
c476f425 7442 I32 n;
1a522d5d
JH
7443 int min, max;
7444 regnode *A;
d02d6d97 7445
c476f425 7446 assert(cur_curlyx); /* keep Coverity happy */
1a522d5d
JH
7447
7448 min = ARG1(cur_curlyx->u.curlyx.me);
7449 max = ARG2(cur_curlyx->u.curlyx.me);
7450 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
c476f425
DM
7451 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
7452 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
7453 ST.cache_offset = 0;
7454 ST.cache_mask = 0;
7455
c476f425 7456
6ad9a8ab 7457 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: matched %ld out of %d..%d\n",
cb41e5d6 7458 depth, (long)n, min, max)
c476f425 7459 );
a0d0e21e 7460
c476f425 7461 /* First just match a string of min A's. */
a0d0e21e 7462
d02d6d97 7463 if (n < min) {
21553840 7464 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, maxopenparen);
c476f425 7465 cur_curlyx->u.curlyx.lastloc = locinput;
92e82afa
YO
7466 REGCP_SET(ST.lastcp);
7467
4d5016e5 7468 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
661d43c4 7469 NOT_REACHED; /* NOTREACHED */
c476f425
DM
7470 }
7471
7472 /* If degenerate A matches "", assume A done. */
7473
7474 if (locinput == cur_curlyx->u.curlyx.lastloc) {
6ad9a8ab 7475 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: empty match detected, trying continuation...\n",
cb41e5d6 7476 depth)
c476f425
DM
7477 );
7478 goto do_whilem_B_max;
7479 }
7480
1cb95af7
DM
7481 /* super-linear cache processing.
7482 *
7483 * The idea here is that for certain types of CURLYX/WHILEM -
7484 * principally those whose upper bound is infinity (and
7485 * excluding regexes that have things like \1 and other very
7486 * non-regular expresssiony things), then if a pattern like
7487 * /....A*.../ fails and we backtrack to the WHILEM, then we
7488 * make a note that this particular WHILEM op was at string
7489 * position 47 (say) when the rest of pattern failed. Then, if
7490 * we ever find ourselves back at that WHILEM, and at string
7491 * position 47 again, we can just fail immediately rather than
7492 * running the rest of the pattern again.
7493 *
7494 * This is very handy when patterns start to go
7495 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
7496 * with a combinatorial explosion of backtracking.
7497 *
7498 * The cache is implemented as a bit array, with one bit per
7499 * string byte position per WHILEM op (up to 16) - so its
7500 * between 0.25 and 2x the string size.
7501 *
7502 * To avoid allocating a poscache buffer every time, we do an
7503 * initially countdown; only after we have executed a WHILEM
7504 * op (string-length x #WHILEMs) times do we allocate the
7505 * cache.
7506 *
7507 * The top 4 bits of scan->flags byte say how many different
7508 * relevant CURLLYX/WHILEM op pairs there are, while the
7509 * bottom 4-bits is the identifying index number of this
7510 * WHILEM.
7511 */
c476f425
DM
7512
7513 if (scan->flags) {
a0d0e21e 7514
1cb48e53 7515 if (!reginfo->poscache_maxiter) {
c476f425
DM
7516 /* start the countdown: Postpone detection until we
7517 * know the match is not *that* much linear. */
1cb48e53 7518 reginfo->poscache_maxiter
9d9163fb
DM
7519 = (reginfo->strend - reginfo->strbeg + 1)
7520 * (scan->flags>>4);
66bf836d 7521 /* possible overflow for long strings and many CURLYX's */
1cb48e53
DM
7522 if (reginfo->poscache_maxiter < 0)
7523 reginfo->poscache_maxiter = I32_MAX;
7524 reginfo->poscache_iter = reginfo->poscache_maxiter;
2c2d71f5 7525 }
c476f425 7526
1cb48e53 7527 if (reginfo->poscache_iter-- == 0) {
c476f425 7528 /* initialise cache */
ea3daa5d 7529 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
2ac8ff4b
DM
7530 regmatch_info_aux *const aux = reginfo->info_aux;
7531 if (aux->poscache) {
ea3daa5d 7532 if ((SSize_t)reginfo->poscache_size < size) {
2ac8ff4b
DM
7533 Renew(aux->poscache, size, char);
7534 reginfo->poscache_size = size;
2c2d71f5 7535 }
2ac8ff4b 7536 Zero(aux->poscache, size, char);
2c2d71f5
JH
7537 }
7538 else {
2ac8ff4b
DM
7539 reginfo->poscache_size = size;
7540 Newxz(aux->poscache, size, char);
2c2d71f5 7541 }
6ad9a8ab 7542 DEBUG_EXECUTE_r( Perl_re_printf( aTHX_
c476f425
DM
7543 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
7544 PL_colors[4], PL_colors[5])
7545 );
2c2d71f5 7546 }
c476f425 7547
1cb48e53 7548 if (reginfo->poscache_iter < 0) {
c476f425 7549 /* have we already failed at this position? */
ea3daa5d 7550 SSize_t offset, mask;
338e600a
DM
7551
7552 reginfo->poscache_iter = -1; /* stop eventual underflow */
c476f425 7553 offset = (scan->flags & 0xf) - 1
9d9163fb
DM
7554 + (locinput - reginfo->strbeg)
7555 * (scan->flags>>4);
c476f425
DM
7556 mask = 1 << (offset % 8);
7557 offset /= 8;
2ac8ff4b 7558 if (reginfo->info_aux->poscache[offset] & mask) {
6ad9a8ab 7559 DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "whilem: (cache) already tried at this position...\n",
cb41e5d6 7560 depth)
2c2d71f5 7561 );
d3c48e81 7562 cur_curlyx->u.curlyx.count--;
3298f257 7563 sayNO; /* cache records failure */
2c2d71f5 7564 }
c476f425
DM
7565 ST.cache_offset = offset;
7566 ST.cache_mask = mask;
2c2d71f5 7567 }
c476f425 7568 }
2c2d71f5 7569
c476f425 7570 /* Prefer B over A for minimal matching. */
a687059c 7571
c476f425
DM
7572 if (cur_curlyx->u.curlyx.minmod) {
7573 ST.save_curlyx = cur_curlyx;
7574 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
7575 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
7576 locinput);
661d43c4 7577 NOT_REACHED; /* NOTREACHED */
c476f425 7578 }
a0d0e21e 7579
c476f425
DM
7580 /* Prefer A over B for maximal matching. */
7581
d02d6d97 7582 if (n < max) { /* More greed allowed? */
21553840
YO
7583 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
7584 maxopenparen);
c476f425
DM
7585 cur_curlyx->u.curlyx.lastloc = locinput;
7586 REGCP_SET(ST.lastcp);
4d5016e5 7587 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
661d43c4 7588 NOT_REACHED; /* NOTREACHED */
c476f425
DM
7589 }
7590 goto do_whilem_B_max;
7591 }
661d43c4 7592 NOT_REACHED; /* NOTREACHED */
c476f425
DM
7593
7594 case WHILEM_B_min: /* just matched B in a minimal match */
7595 case WHILEM_B_max: /* just matched B in a maximal match */
7596 cur_curlyx = ST.save_curlyx;
7597 sayYES;
661d43c4 7598 NOT_REACHED; /* NOTREACHED */
c476f425
DM
7599
7600 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
7601 cur_curlyx = ST.save_curlyx;
7602 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7603 cur_curlyx->u.curlyx.count--;
7604 CACHEsayNO;
661d43c4 7605 NOT_REACHED; /* NOTREACHED */
c476f425 7606
c476f425 7607 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
92e82afa 7608 REGCP_UNWIND(ST.lastcp);
21553840 7609 regcppop(rex, &maxopenparen);
77584140
DM
7610 /* FALLTHROUGH */
7611 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
c476f425
DM
7612 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
7613 cur_curlyx->u.curlyx.count--;
7614 CACHEsayNO;
661d43c4 7615 NOT_REACHED; /* NOTREACHED */
c476f425
DM
7616
7617 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
7618 REGCP_UNWIND(ST.lastcp);
21553840 7619 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
6ad9a8ab 7620 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "whilem: failed, trying continuation...\n",
cb41e5d6 7621 depth)
c476f425
DM
7622 );
7623 do_whilem_B_max:
7624 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7625 && ckWARN(WARN_REGEXP)
39819bd9 7626 && !reginfo->warned)
c476f425 7627 {
39819bd9 7628 reginfo->warned = TRUE;
dcbac5bb
FC
7629 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
7630 "Complex regular subexpression recursion limit (%d) "
7631 "exceeded",
c476f425
DM
7632 REG_INFTY - 1);
7633 }
7634
7635 /* now try B */
7636 ST.save_curlyx = cur_curlyx;
7637 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4d5016e5
DM
7638 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
7639 locinput);
661d43c4 7640 NOT_REACHED; /* NOTREACHED */
c476f425
DM
7641
7642 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
7643 cur_curlyx = ST.save_curlyx;
c476f425 7644
d02d6d97 7645 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
c476f425
DM
7646 /* Maximum greed exceeded */
7647 if (cur_curlyx->u.curlyx.count >= REG_INFTY
7648 && ckWARN(WARN_REGEXP)
39819bd9 7649 && !reginfo->warned)
c476f425 7650 {
39819bd9 7651 reginfo->warned = TRUE;
c476f425 7652 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
dcbac5bb
FC
7653 "Complex regular subexpression recursion "
7654 "limit (%d) exceeded",
c476f425 7655 REG_INFTY - 1);
a0d0e21e 7656 }
c476f425 7657 cur_curlyx->u.curlyx.count--;
3ab3c9b4 7658 CACHEsayNO;
a0d0e21e 7659 }
c476f425 7660
6ad9a8ab 7661 DEBUG_EXECUTE_r(Perl_re_exec_indentf( aTHX_ "trying longer...\n", depth)
c476f425
DM
7662 );
7663 /* Try grabbing another A and see if it helps. */
c476f425 7664 cur_curlyx->u.curlyx.lastloc = locinput;
d02d6d97 7665 PUSH_STATE_GOTO(WHILEM_A_min,
4d5016e5
DM
7666 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
7667 locinput);
661d43c4 7668 NOT_REACHED; /* NOTREACHED */
40a82448
DM
7669
7670#undef ST
7671#define ST st->u.branch
7672
7673 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
c277df42
IZ
7674 next = scan + ARG(scan);
7675 if (next == scan)
7676 next = NULL;
40a82448 7677 scan = NEXTOPER(scan);
924ba076 7678 /* FALLTHROUGH */
c277df42 7679
40a82448
DM
7680 case BRANCH: /* /(...|A|...)/ */
7681 scan = NEXTOPER(scan); /* scan now points to inner node */
b93070ed 7682 ST.lastparen = rex->lastparen;
f6033a9d 7683 ST.lastcloseparen = rex->lastcloseparen;
40a82448
DM
7684 ST.next_branch = next;
7685 REGCP_SET(ST.cp);
02db2b7b 7686
40a82448 7687 /* Now go into the branch */
5d458dd8 7688 if (has_cutgroup) {
4d5016e5 7689 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 7690 } else {
4d5016e5 7691 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5d458dd8 7692 }
661d43c4 7693 NOT_REACHED; /* NOTREACHED */
3c0563b9
DM
7694
7695 case CUTGROUP: /* /(*THEN)/ */
fee50582
YO
7696 sv_yes_mark = st->u.mark.mark_name = scan->flags
7697 ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
7698 : NULL;
4d5016e5 7699 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
661d43c4 7700 NOT_REACHED; /* NOTREACHED */
3c0563b9 7701
5d458dd8
YO
7702 case CUTGROUP_next_fail:
7703 do_cutgroup = 1;
7704 no_final = 1;
7705 if (st->u.mark.mark_name)
7706 sv_commit = st->u.mark.mark_name;
7707 sayNO;
661d43c4 7708 NOT_REACHED; /* NOTREACHED */
3c0563b9 7709
5d458dd8
YO
7710 case BRANCH_next:
7711 sayYES;
661d43c4 7712 NOT_REACHED; /* NOTREACHED */
3c0563b9 7713
40a82448 7714 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5d458dd8
YO
7715 if (do_cutgroup) {
7716 do_cutgroup = 0;
7717 no_final = 0;
7718 }
40a82448 7719 REGCP_UNWIND(ST.cp);
a8d1f4b4 7720 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448
DM
7721 scan = ST.next_branch;
7722 /* no more branches? */
5d458dd8
YO
7723 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7724 DEBUG_EXECUTE_r({
6ad9a8ab 7725 Perl_re_exec_indentf( aTHX_ "%sBRANCH failed...%s\n",
cb41e5d6 7726 depth,
5d458dd8
YO
7727 PL_colors[4],
7728 PL_colors[5] );
7729 });
7730 sayNO_SILENT;
7731 }
40a82448 7732 continue; /* execute next BRANCH[J] op */
a74ff37d 7733 /* NOTREACHED */
40a82448 7734
3c0563b9 7735 case MINMOD: /* next op will be non-greedy, e.g. A*? */
24d3c4a9 7736 minmod = 1;
a0d0e21e 7737 break;
40a82448
DM
7738
7739#undef ST
7740#define ST st->u.curlym
7741
7742 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
7743
7744 /* This is an optimisation of CURLYX that enables us to push
84d2fa14 7745 * only a single backtracking state, no matter how many matches
40a82448
DM
7746 * there are in {m,n}. It relies on the pattern being constant
7747 * length, with no parens to influence future backrefs
7748 */
7749
7750 ST.me = scan;
dc45a647 7751 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
40a82448 7752
f6033a9d
DM
7753 ST.lastparen = rex->lastparen;
7754 ST.lastcloseparen = rex->lastcloseparen;
7755
40a82448
DM
7756 /* if paren positive, emulate an OPEN/CLOSE around A */
7757 if (ST.me->flags) {
3b6647e0 7758 U32 paren = ST.me->flags;
92da3157
DM
7759 if (paren > maxopenparen)
7760 maxopenparen = paren;
c277df42 7761 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6407bf3b 7762 }
40a82448
DM
7763 ST.A = scan;
7764 ST.B = next;
7765 ST.alen = 0;
7766 ST.count = 0;
24d3c4a9
DM
7767 ST.minmod = minmod;
7768 minmod = 0;
40a82448
DM
7769 ST.c1 = CHRTEST_UNINIT;
7770 REGCP_SET(ST.cp);
6407bf3b 7771
40a82448
DM
7772 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7773 goto curlym_do_B;
7774
7775 curlym_do_A: /* execute the A in /A{m,n}B/ */
4d5016e5 7776 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
661d43c4 7777 NOT_REACHED; /* NOTREACHED */
5f80c4cf 7778
40a82448 7779 case CURLYM_A: /* we've just matched an A */
40a82448
DM
7780 ST.count++;
7781 /* after first match, determine A's length: u.curlym.alen */
7782 if (ST.count == 1) {
ba44c216 7783 if (reginfo->is_utf8_target) {
c07e9d7b
DM
7784 char *s = st->locinput;
7785 while (s < locinput) {
40a82448
DM
7786 ST.alen++;
7787 s += UTF8SKIP(s);
7788 }
7789 }
7790 else {
c07e9d7b 7791 ST.alen = locinput - st->locinput;
40a82448
DM
7792 }
7793 if (ST.alen == 0)
7794 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7795 }
0cadcf80 7796 DEBUG_EXECUTE_r(
147e3846 7797 Perl_re_exec_indentf( aTHX_ "CURLYM now matched %" IVdf " times, len=%" IVdf "...\n",
cb41e5d6 7798 depth, (IV) ST.count, (IV)ST.alen)
0cadcf80
DM
7799 );
7800
ce12e254 7801 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
0a4db386
YO
7802 goto fake_end;
7803
c966426a
DM
7804 {
7805 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7806 if ( max == REG_INFTY || ST.count < max )
7807 goto curlym_do_A; /* try to match another A */
7808 }
40a82448 7809 goto curlym_do_B; /* try to match B */
5f80c4cf 7810
40a82448
DM
7811 case CURLYM_A_fail: /* just failed to match an A */
7812 REGCP_UNWIND(ST.cp);
0a4db386 7813
24be3102 7814
0a4db386 7815 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
ce12e254 7816 || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
40a82448 7817 sayNO;
0cadcf80 7818
40a82448 7819 curlym_do_B: /* execute the B in /A{m,n}B/ */
40a82448
DM
7820 if (ST.c1 == CHRTEST_UNINIT) {
7821 /* calculate c1 and c2 for possible match of 1st char
7822 * following curly */
7823 ST.c1 = ST.c2 = CHRTEST_VOID;
d20a21f4 7824 assert(ST.B);
40a82448
DM
7825 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7826 regnode *text_node = ST.B;
7827 if (! HAS_TEXT(text_node))
7828 FIND_NEXT_IMPT(text_node);
ee9b8eae
YO
7829 /* this used to be
7830
7831 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7832
7833 But the former is redundant in light of the latter.
7834
7835 if this changes back then the macro for
7836 IS_TEXT and friends need to change.
7837 */
c74f6de9 7838 if (PL_regkind[OP(text_node)] == EXACT) {
79a2a0e8 7839 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 7840 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 7841 reginfo))
c74f6de9
KW
7842 {
7843 sayNO;
7844 }
c277df42 7845 }
c277df42 7846 }
40a82448
DM
7847 }
7848
7849 DEBUG_EXECUTE_r(
147e3846 7850 Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n",
cb41e5d6 7851 depth, (IV)ST.count)
40a82448 7852 );
c74f6de9 7853 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
79a2a0e8
KW
7854 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7855 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7856 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7857 {
7858 /* simulate B failing */
7859 DEBUG_OPTIMISE_r(
147e3846 7860 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n",
cb41e5d6 7861 depth,
79a2a0e8
KW
7862 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7863 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7864 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7865 );
7866 state_num = CURLYM_B_fail;
7867 goto reenter_switch;
7868 }
7869 }
7870 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5400f398
KW
7871 /* simulate B failing */
7872 DEBUG_OPTIMISE_r(
6ad9a8ab 7873 Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
cb41e5d6 7874 depth,
79a2a0e8
KW
7875 (int) nextchr, ST.c1, ST.c2)
7876 );
5400f398
KW
7877 state_num = CURLYM_B_fail;
7878 goto reenter_switch;
7879 }
c74f6de9 7880 }
40a82448
DM
7881
7882 if (ST.me->flags) {
f6033a9d 7883 /* emulate CLOSE: mark current A as captured */
40a82448
DM
7884 I32 paren = ST.me->flags;
7885 if (ST.count) {
b93070ed 7886 rex->offs[paren].start
9d9163fb
DM
7887 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7888 rex->offs[paren].end = locinput - reginfo->strbeg;
f6033a9d
DM
7889 if ((U32)paren > rex->lastparen)
7890 rex->lastparen = paren;
7891 rex->lastcloseparen = paren;
c277df42 7892 }
40a82448 7893 else
b93070ed 7894 rex->offs[paren].end = -1;
24be3102 7895
ce12e254 7896 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags))
0a4db386
YO
7897 {
7898 if (ST.count)
7899 goto fake_end;
7900 else
7901 sayNO;
7902 }
c277df42 7903 }
0a4db386 7904
4d5016e5 7905 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
661d43c4 7906 NOT_REACHED; /* NOTREACHED */
40a82448
DM
7907
7908 case CURLYM_B_fail: /* just failed to match a B */
7909 REGCP_UNWIND(ST.cp);
a8d1f4b4 7910 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
40a82448 7911 if (ST.minmod) {
84d2fa14
HS
7912 I32 max = ARG2(ST.me);
7913 if (max != REG_INFTY && ST.count == max)
40a82448
DM
7914 sayNO;
7915 goto curlym_do_A; /* try to match a further A */
7916 }
7917 /* backtrack one A */
7918 if (ST.count == ARG1(ST.me) /* min */)
7919 sayNO;
7920 ST.count--;
7016d6eb 7921 SET_locinput(HOPc(locinput, -ST.alen));
40a82448
DM
7922 goto curlym_do_B; /* try to match B */
7923
c255a977
DM
7924#undef ST
7925#define ST st->u.curly
40a82448 7926
c255a977
DM
7927#define CURLY_SETPAREN(paren, success) \
7928 if (paren) { \
7929 if (success) { \
9d9163fb
DM
7930 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7931 rex->offs[paren].end = locinput - reginfo->strbeg; \
f6033a9d
DM
7932 if (paren > rex->lastparen) \
7933 rex->lastparen = paren; \
b93070ed 7934 rex->lastcloseparen = paren; \
c255a977 7935 } \
f6033a9d 7936 else { \
b93070ed 7937 rex->offs[paren].end = -1; \
f6033a9d
DM
7938 rex->lastparen = ST.lastparen; \
7939 rex->lastcloseparen = ST.lastcloseparen; \
7940 } \
c255a977
DM
7941 }
7942
b40a2c17 7943 case STAR: /* /A*B/ where A is width 1 char */
c255a977
DM
7944 ST.paren = 0;
7945 ST.min = 0;
7946 ST.max = REG_INFTY;
a0d0e21e
LW
7947 scan = NEXTOPER(scan);
7948 goto repeat;
3c0563b9 7949
b40a2c17 7950 case PLUS: /* /A+B/ where A is width 1 char */
c255a977
DM
7951 ST.paren = 0;
7952 ST.min = 1;
7953 ST.max = REG_INFTY;
c277df42 7954 scan = NEXTOPER(scan);
c255a977 7955 goto repeat;
3c0563b9 7956
b40a2c17 7957 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5400f398
KW
7958 ST.paren = scan->flags; /* Which paren to set */
7959 ST.lastparen = rex->lastparen;
f6033a9d 7960 ST.lastcloseparen = rex->lastcloseparen;
92da3157
DM
7961 if (ST.paren > maxopenparen)
7962 maxopenparen = ST.paren;
c255a977
DM
7963 ST.min = ARG1(scan); /* min to match */
7964 ST.max = ARG2(scan); /* max to match */
ce12e254 7965 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
24be3102 7966 {
0a4db386
YO
7967 ST.min=1;
7968 ST.max=1;
7969 }
c255a977
DM
7970 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7971 goto repeat;
3c0563b9 7972
b40a2c17 7973 case CURLY: /* /A{m,n}B/ where A is width 1 char */
c255a977
DM
7974 ST.paren = 0;
7975 ST.min = ARG1(scan); /* min to match */
7976 ST.max = ARG2(scan); /* max to match */
7977 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
c277df42 7978 repeat:
a0d0e21e
LW
7979 /*
7980 * Lookahead to avoid useless match attempts
7981 * when we know what character comes next.
c255a977 7982 *
5f80c4cf
JP
7983 * Used to only do .*x and .*?x, but now it allows
7984 * for )'s, ('s and (?{ ... })'s to be in the way
7985 * of the quantifier and the EXACT-like node. -- japhy
7986 */
7987
eb5c1be8 7988 assert(ST.min <= ST.max);
3337dfe3
KW
7989 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7990 ST.c1 = ST.c2 = CHRTEST_VOID;
7991 }
7992 else {
5f80c4cf
JP
7993 regnode *text_node = next;
7994
3dab1dad
YO
7995 if (! HAS_TEXT(text_node))
7996 FIND_NEXT_IMPT(text_node);
5f80c4cf 7997
9e137952 7998 if (! HAS_TEXT(text_node))
c255a977 7999 ST.c1 = ST.c2 = CHRTEST_VOID;
5f80c4cf 8000 else {
ee9b8eae 8001 if ( PL_regkind[OP(text_node)] != EXACT ) {
c255a977 8002 ST.c1 = ST.c2 = CHRTEST_VOID;
cca55fe3 8003 }
c74f6de9 8004 else {
ee9b8eae
YO
8005
8006 /* Currently we only get here when
8007
8008 PL_rekind[OP(text_node)] == EXACT
8009
8010 if this changes back then the macro for IS_TEXT and
8011 friends need to change. */
79a2a0e8 8012 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
984e6dd1 8013 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
aed7b151 8014 reginfo))
c74f6de9
KW
8015 {
8016 sayNO;
8017 }
8018 }
1aa99e6b 8019 }
bbce6d69 8020 }
c255a977
DM
8021
8022 ST.A = scan;
8023 ST.B = next;
24d3c4a9 8024 if (minmod) {
eb72505d 8025 char *li = locinput;
24d3c4a9 8026 minmod = 0;
984e6dd1 8027 if (ST.min &&
21553840 8028 regrepeat(rex, &li, ST.A, reginfo, ST.min)
984e6dd1 8029 < ST.min)
4633a7c4 8030 sayNO;
7016d6eb 8031 SET_locinput(li);
c255a977 8032 ST.count = ST.min;
c255a977
DM
8033 REGCP_SET(ST.cp);
8034 if (ST.c1 == CHRTEST_VOID)
8035 goto curly_try_B_min;
8036
8037 ST.oldloc = locinput;
8038
8039 /* set ST.maxpos to the furthest point along the
8040 * string that could possibly match */
8041 if (ST.max == REG_INFTY) {
220db18a 8042 ST.maxpos = reginfo->strend - 1;
f2ed9b32 8043 if (utf8_target)
c255a977
DM
8044 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
8045 ST.maxpos--;
8046 }
f2ed9b32 8047 else if (utf8_target) {
c255a977
DM
8048 int m = ST.max - ST.min;
8049 for (ST.maxpos = locinput;
220db18a 8050 m >0 && ST.maxpos < reginfo->strend; m--)
c255a977
DM
8051 ST.maxpos += UTF8SKIP(ST.maxpos);
8052 }
8053 else {
8054 ST.maxpos = locinput + ST.max - ST.min;
220db18a
DM
8055 if (ST.maxpos >= reginfo->strend)
8056 ST.maxpos = reginfo->strend - 1;
c255a977
DM
8057 }
8058 goto curly_try_B_min_known;
8059
8060 }
8061 else {
eb72505d
DM
8062 /* avoid taking address of locinput, so it can remain
8063 * a register var */
8064 char *li = locinput;
21553840 8065 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max);
c255a977
DM
8066 if (ST.count < ST.min)
8067 sayNO;
7016d6eb 8068 SET_locinput(li);
c255a977
DM
8069 if ((ST.count > ST.min)
8070 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
8071 {
8072 /* A{m,n} must come at the end of the string, there's
8073 * no point in backing off ... */
8074 ST.min = ST.count;
8075 /* ...except that $ and \Z can match before *and* after
8076 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
8077 We may back off by one in this case. */
eb72505d 8078 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
c255a977
DM
8079 ST.min--;
8080 }
8081 REGCP_SET(ST.cp);
8082 goto curly_try_B_max;
8083 }
661d43c4 8084 NOT_REACHED; /* NOTREACHED */
c255a977
DM
8085
8086 case CURLY_B_min_known_fail:
8087 /* failed to find B in a non-greedy match where c1,c2 valid */
c255a977 8088
c255a977 8089 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
8090 if (ST.paren) {
8091 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8092 }
c255a977
DM
8093 /* Couldn't or didn't -- move forward. */
8094 ST.oldloc = locinput;
f2ed9b32 8095 if (utf8_target)
c255a977
DM
8096 locinput += UTF8SKIP(locinput);
8097 else
8098 locinput++;
8099 ST.count++;
8100 curly_try_B_min_known:
8101 /* find the next place where 'B' could work, then call B */
8102 {
8103 int n;
f2ed9b32 8104 if (utf8_target) {
c255a977
DM
8105 n = (ST.oldloc == locinput) ? 0 : 1;
8106 if (ST.c1 == ST.c2) {
c255a977 8107 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
8108 while (locinput <= ST.maxpos
8109 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
8110 {
8111 locinput += UTF8SKIP(locinput);
c255a977
DM
8112 n++;
8113 }
1aa99e6b
IH
8114 }
8115 else {
c255a977 8116 /* set n to utf8_distance(oldloc, locinput) */
79a2a0e8
KW
8117 while (locinput <= ST.maxpos
8118 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
8119 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
8120 {
8121 locinput += UTF8SKIP(locinput);
c255a977 8122 n++;
1aa99e6b 8123 }
0fe9bf95
IZ
8124 }
8125 }
5400f398 8126 else { /* Not utf8_target */
c255a977
DM
8127 if (ST.c1 == ST.c2) {
8128 while (locinput <= ST.maxpos &&
8129 UCHARAT(locinput) != ST.c1)
8130 locinput++;
bbce6d69 8131 }
c255a977
DM
8132 else {
8133 while (locinput <= ST.maxpos
8134 && UCHARAT(locinput) != ST.c1
8135 && UCHARAT(locinput) != ST.c2)
8136 locinput++;
a0ed51b3 8137 }
c255a977
DM
8138 n = locinput - ST.oldloc;
8139 }
8140 if (locinput > ST.maxpos)
8141 sayNO;
c255a977 8142 if (n) {
eb72505d
DM
8143 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
8144 * at b; check that everything between oldloc and
8145 * locinput matches */
8146 char *li = ST.oldloc;
c255a977 8147 ST.count += n;
21553840 8148 if (regrepeat(rex, &li, ST.A, reginfo, n) < n)
4633a7c4 8149 sayNO;
eb72505d 8150 assert(n == REG_INFTY || locinput == li);
a0d0e21e 8151 }
c255a977 8152 CURLY_SETPAREN(ST.paren, ST.count);
ce12e254 8153 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
0a4db386 8154 goto fake_end;
4d5016e5 8155 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
a0d0e21e 8156 }
661d43c4 8157 NOT_REACHED; /* NOTREACHED */
c255a977
DM
8158
8159 case CURLY_B_min_fail:
8160 /* failed to find B in a non-greedy match where c1,c2 invalid */
c255a977
DM
8161
8162 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
8163 if (ST.paren) {
8164 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8165 }
c255a977 8166 /* failed -- move forward one */
f73aaa43 8167 {
eb72505d 8168 char *li = locinput;
21553840 8169 if (!regrepeat(rex, &li, ST.A, reginfo, 1)) {
f73aaa43
DM
8170 sayNO;
8171 }
eb72505d 8172 locinput = li;
f73aaa43
DM
8173 }
8174 {
c255a977 8175 ST.count++;
c255a977
DM
8176 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
8177 ST.count > 0)) /* count overflow ? */
15272685 8178 {
c255a977
DM
8179 curly_try_B_min:
8180 CURLY_SETPAREN(ST.paren, ST.count);
ce12e254 8181 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
0a4db386 8182 goto fake_end;
4d5016e5 8183 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
a0d0e21e
LW
8184 }
8185 }
c74f6de9 8186 sayNO;
661d43c4 8187 NOT_REACHED; /* NOTREACHED */
c255a977 8188
c52b8b12 8189 curly_try_B_max:
c255a977 8190 /* a successful greedy match: now try to match B */
ce12e254 8191 if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.paren))
40d049e4 8192 goto fake_end;
c255a977 8193 {
220db18a 8194 bool could_match = locinput < reginfo->strend;
79a2a0e8 8195
c255a977 8196 /* If it could work, try it. */
79a2a0e8
KW
8197 if (ST.c1 != CHRTEST_VOID && could_match) {
8198 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
8199 {
8200 could_match = memEQ(locinput,
8201 ST.c1_utf8,
8202 UTF8SKIP(locinput))
8203 || memEQ(locinput,
8204 ST.c2_utf8,
8205 UTF8SKIP(locinput));
8206 }
8207 else {
8208 could_match = UCHARAT(locinput) == ST.c1
8209 || UCHARAT(locinput) == ST.c2;
8210 }
8211 }
8212 if (ST.c1 == CHRTEST_VOID || could_match) {
c255a977 8213 CURLY_SETPAREN(ST.paren, ST.count);
4d5016e5 8214 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
661d43c4 8215 NOT_REACHED; /* NOTREACHED */
c255a977
DM
8216 }
8217 }
924ba076 8218 /* FALLTHROUGH */
3c0563b9 8219
c255a977
DM
8220 case CURLY_B_max_fail:
8221 /* failed to find B in a greedy match */
c255a977
DM
8222
8223 REGCP_UNWIND(ST.cp);
a8d1f4b4
DM
8224 if (ST.paren) {
8225 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
8226 }
c255a977
DM
8227 /* back up. */
8228 if (--ST.count < ST.min)
8229 sayNO;
eb72505d 8230 locinput = HOPc(locinput, -1);
c255a977
DM
8231 goto curly_try_B_max;
8232
8233#undef ST
8234
3c0563b9 8235 case END: /* last op of main pattern */
c52b8b12 8236 fake_end:
faec1544
DM
8237 if (cur_eval) {
8238 /* we've just finished A in /(??{A})B/; now continue with B */
401a8022 8239 SET_RECURSE_LOCINPUT("FAKE-END[before]", CUR_EVAL.prev_recurse_locinput);
288b8c02 8240 st->u.eval.prev_rex = rex_sv; /* inner */
92da3157
DM
8241
8242 /* Save *all* the positions. */
21553840 8243 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
401a8022 8244 rex_sv = CUR_EVAL.prev_rex;
aed7b151 8245 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
ec43f78b 8246 SET_reg_curpm(rex_sv);
8d919b0a 8247 rex = ReANY(rex_sv);
f8fc2ecf 8248 rexi = RXi_GET(rex);
595de761
YO
8249
8250 st->u.eval.prev_curlyx = cur_curlyx;
401a8022 8251 cur_curlyx = CUR_EVAL.prev_curlyx;
34a81e2b 8252
faec1544 8253 REGCP_SET(st->u.eval.lastcp);
faec1544
DM
8254
8255 /* Restore parens of the outer rex without popping the
8256 * savestack */
21553840 8257 regcp_restore(rex, CUR_EVAL.lastcp, &maxopenparen);
faec1544
DM
8258
8259 st->u.eval.prev_eval = cur_eval;
401a8022 8260 cur_eval = CUR_EVAL.prev_eval;
faec1544 8261 DEBUG_EXECUTE_r(
6ad9a8ab 8262 Perl_re_exec_indentf( aTHX_ "EVAL trying tail ... (cur_eval=%p)\n",
401a8022 8263 depth, cur_eval););
e7707071
YO
8264 if ( nochange_depth )
8265 nochange_depth--;
8266
401a8022
YO
8267 SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput);
8268
4ee16520 8269 PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B,
4d5016e5 8270 locinput); /* match B */
faec1544
DM
8271 }
8272
3b0527fe 8273 if (locinput < reginfo->till) {
6ad9a8ab 8274 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_
cb41e5d6 8275 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
7821416a 8276 PL_colors[4],
6d59b646
DM
8277 (long)(locinput - startpos),
8278 (long)(reginfo->till - startpos),
7821416a 8279 PL_colors[5]));
58e23c8d 8280
262b90c4 8281 sayNO_SILENT; /* Cannot match: too short. */
7821416a 8282 }
262b90c4 8283 sayYES; /* Success! */
dad79028
DM
8284
8285 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
8286 DEBUG_EXECUTE_r(
6ad9a8ab 8287 Perl_re_exec_indentf( aTHX_ "%ssubpattern success...%s\n",
cb41e5d6 8288 depth, PL_colors[4], PL_colors[5]));
262b90c4 8289 sayYES; /* Success! */
dad79028 8290
40a82448
DM
8291#undef ST
8292#define ST st->u.ifmatch
8293
37f53970
DM
8294 {
8295 char *newstart;
8296
40a82448
DM
8297 case SUSPEND: /* (?>A) */
8298 ST.wanted = 1;
37f53970 8299 newstart = locinput;
9041c2e3 8300 goto do_ifmatch;
dad79028 8301
40a82448
DM
8302 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
8303 ST.wanted = 0;
dad79028
DM
8304 goto ifmatch_trivial_fail_test;
8305
40a82448
DM
8306 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
8307 ST.wanted = 1;
dad79028 8308 ifmatch_trivial_fail_test:
a0ed51b3 8309 if (scan->flags) {
52657f30 8310 char * const s = HOPBACKc(locinput, scan->flags);
dad79028
DM
8311 if (!s) {
8312 /* trivial fail */
24d3c4a9
DM
8313 if (logical) {
8314 logical = 0;
f2338a2e 8315 sw = 1 - cBOOL(ST.wanted);
dad79028 8316 }
40a82448 8317 else if (ST.wanted)
dad79028
DM
8318 sayNO;
8319 next = scan + ARG(scan);
8320 if (next == scan)
8321 next = NULL;
8322 break;
8323 }
37f53970 8324 newstart = s;
a0ed51b3
LW
8325 }
8326 else
37f53970 8327 newstart = locinput;
a0ed51b3 8328
c277df42 8329 do_ifmatch:
40a82448 8330 ST.me = scan;
24d3c4a9 8331 ST.logical = logical;
24d786f4
YO
8332 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
8333
40a82448 8334 /* execute body of (?...A) */
37f53970 8335 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
661d43c4 8336 NOT_REACHED; /* NOTREACHED */
37f53970 8337 }
40a82448
DM
8338
8339 case IFMATCH_A_fail: /* body of (?...A) failed */
8340 ST.wanted = !ST.wanted;
924ba076 8341 /* FALLTHROUGH */
40a82448
DM
8342
8343 case IFMATCH_A: /* body of (?...A) succeeded */
24d3c4a9 8344 if (ST.logical) {
f2338a2e 8345 sw = cBOOL(ST.wanted);
40a82448
DM
8346 }
8347 else if (!ST.wanted)
8348 sayNO;
8349
37f53970
DM
8350 if (OP(ST.me) != SUSPEND) {
8351 /* restore old position except for (?>...) */
8352 locinput = st->locinput;
40a82448
DM
8353 }
8354 scan = ST.me + ARG(ST.me);
8355 if (scan == ST.me)
8356 scan = NULL;
8357 continue; /* execute B */
8358
8359#undef ST
dad79028 8360
3c0563b9
DM
8361 case LONGJMP: /* alternative with many branches compiles to
8362 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
c277df42
IZ
8363 next = scan + ARG(scan);
8364 if (next == scan)
8365 next = NULL;
a0d0e21e 8366 break;
3c0563b9
DM
8367
8368 case COMMIT: /* (*COMMIT) */
220db18a 8369 reginfo->cutpoint = reginfo->strend;
e2e6a0f1 8370 /* FALLTHROUGH */
3c0563b9
DM
8371
8372 case PRUNE: /* (*PRUNE) */
fee50582 8373 if (scan->flags)
ad64d0ec 8374 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4d5016e5 8375 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
661d43c4 8376 NOT_REACHED; /* NOTREACHED */
3c0563b9 8377
54612592
YO
8378 case COMMIT_next_fail:
8379 no_final = 1;
8380 /* FALLTHROUGH */
fee50582
YO
8381 sayNO;
8382 NOT_REACHED; /* NOTREACHED */
3c0563b9
DM
8383
8384 case OPFAIL: /* (*FAIL) */
fee50582
YO
8385 if (scan->flags)
8386 sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6625d926
YO
8387 if (logical) {
8388 /* deal with (?(?!)X|Y) properly,
8389 * make sure we trigger the no branch
8390 * of the trailing IFTHEN structure*/
8391 sw= 0;
8392 break;
8393 } else {
8394 sayNO;
8395 }
661d43c4 8396 NOT_REACHED; /* NOTREACHED */
e2e6a0f1
YO
8397
8398#define ST st->u.mark
3c0563b9 8399 case MARKPOINT: /* (*MARK:foo) */
e2e6a0f1 8400 ST.prev_mark = mark_state;
5d458dd8 8401 ST.mark_name = sv_commit = sv_yes_mark
ad64d0ec 8402 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
e2e6a0f1 8403 mark_state = st;
4d5016e5
DM
8404 ST.mark_loc = locinput;
8405 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
661d43c4 8406 NOT_REACHED; /* NOTREACHED */
3c0563b9 8407
e2e6a0f1
YO
8408 case MARKPOINT_next:
8409 mark_state = ST.prev_mark;
8410 sayYES;
661d43c4 8411 NOT_REACHED; /* NOTREACHED */
3c0563b9 8412
e2e6a0f1 8413 case MARKPOINT_next_fail:
5d458dd8 8414 if (popmark && sv_eq(ST.mark_name,popmark))
e2e6a0f1
YO
8415 {
8416 if (ST.mark_loc > startpoint)
8417 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
8418 popmark = NULL; /* we found our mark */
8419 sv_commit = ST.mark_name;
8420
8421 DEBUG_EXECUTE_r({
147e3846 8422 Perl_re_exec_indentf( aTHX_ "%ssetting cutpoint to mark:%" SVf "...%s\n",
cb41e5d6 8423 depth,
be2597df 8424 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
e2e6a0f1
YO
8425 });
8426 }
8427 mark_state = ST.prev_mark;
5d458dd8
YO
8428 sv_yes_mark = mark_state ?
8429 mark_state->u.mark.mark_name : NULL;
e2e6a0f1 8430 sayNO;
661d43c4 8431 NOT_REACHED; /* NOTREACHED */
3c0563b9
DM
8432
8433 case SKIP: /* (*SKIP) */
fee50582 8434 if (!scan->flags) {
2bf803e2 8435 /* (*SKIP) : if we fail we cut here*/
5d458dd8 8436 ST.mark_name = NULL;
e2e6a0f1 8437 ST.mark_loc = locinput;
4d5016e5 8438 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5d458dd8 8439 } else {
2bf803e2 8440 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5d458dd8
YO
8441 otherwise do nothing. Meaning we need to scan
8442 */
8443 regmatch_state *cur = mark_state;
ad64d0ec 8444 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5d458dd8
YO
8445
8446 while (cur) {
8447 if ( sv_eq( cur->u.mark.mark_name,
8448 find ) )
8449 {
8450 ST.mark_name = find;
4d5016e5 8451 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5d458dd8
YO
8452 }
8453 cur = cur->u.mark.prev_mark;
8454 }
e2e6a0f1 8455 }
2bf803e2 8456 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5d458dd8 8457 break;
3c0563b9 8458
5d458dd8
YO
8459 case SKIP_next_fail:
8460 if (ST.mark_name) {
8461 /* (*CUT:NAME) - Set up to search for the name as we
8462 collapse the stack*/
8463 popmark = ST.mark_name;
8464 } else {
8465 /* (*CUT) - No name, we cut here.*/
e2e6a0f1
YO
8466 if (ST.mark_loc > startpoint)
8467 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5d458dd8
YO
8468 /* but we set sv_commit to latest mark_name if there
8469 is one so they can test to see how things lead to this
8470 cut */
8471 if (mark_state)
8472 sv_commit=mark_state->u.mark.mark_name;
8473 }
e2e6a0f1
YO
8474 no_final = 1;
8475 sayNO;
661d43c4 8476 NOT_REACHED; /* NOTREACHED */
e2e6a0f1 8477#undef ST
3c0563b9
DM
8478
8479 case LNBREAK: /* \R */
220db18a 8480 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
e1d1eefb 8481 locinput += n;
e1d1eefb
YO
8482 } else
8483 sayNO;
8484 break;
8485
a0d0e21e 8486 default:
147e3846 8487 PerlIO_printf(Perl_error_log, "%" UVxf " %d\n",
d7d93a81 8488 PTR2UV(scan), OP(scan));
cea2e8a9 8489 Perl_croak(aTHX_ "regexp memory corruption");
28b98f76
DM
8490
8491 /* this is a point to jump to in order to increment
8492 * locinput by one character */
c52b8b12 8493 increment_locinput:
e6ca698c 8494 assert(!NEXTCHR_IS_EOS);
28b98f76
DM
8495 if (utf8_target) {
8496 locinput += PL_utf8skip[nextchr];
8975657c
KW
8497 /* locinput is allowed to go 1 char off the end (signifying
8498 * EOS), but not 2+ */
220db18a 8499 if (locinput > reginfo->strend)
28b98f76 8500 sayNO;
28b98f76
DM
8501 }
8502 else
3640db6b 8503 locinput++;
28b98f76 8504 break;
5d458dd8
YO
8505
8506 } /* end switch */
95b24440 8507
5d458dd8
YO
8508 /* switch break jumps here */
8509 scan = next; /* prepare to execute the next op and ... */
8510 continue; /* ... jump back to the top, reusing st */
a74ff37d 8511 /* NOTREACHED */
95b24440 8512
40a82448
DM
8513 push_yes_state:
8514 /* push a state that backtracks on success */
8515 st->u.yes.prev_yes_state = yes_state;
8516 yes_state = st;
924ba076 8517 /* FALLTHROUGH */
40a82448
DM
8518 push_state:
8519 /* push a new regex state, then continue at scan */
8520 {
8521 regmatch_state *newst;
8522
24b23f37
YO
8523 DEBUG_STACK_r({
8524 regmatch_state *cur = st;
8525 regmatch_state *curyes = yes_state;
f4197b8e 8526 U32 i;
24b23f37 8527 regmatch_slab *slab = PL_regmatch_slab;
f4197b8e 8528 for (i = 0; i < 3 && i <= depth; cur--,i++) {
24b23f37
YO
8529 if (cur < SLAB_FIRST(slab)) {
8530 slab = slab->prev;
8531 cur = SLAB_LAST(slab);
8532 }
f4197b8e 8533 Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n",
cb41e5d6 8534 depth,
f4197b8e
DM
8535 i ? " " : "push",
8536 depth - i, PL_reg_name[cur->resume_state],
24b23f37
YO
8537 (curyes == cur) ? "yes" : ""
8538 );
8539 if (curyes == cur)
8540 curyes = cur->u.yes.prev_yes_state;
8541 }
8542 } else
8543 DEBUG_STATE_pp("push")
8544 );
40a82448 8545 depth++;
40a82448
DM
8546 st->locinput = locinput;
8547 newst = st+1;
8548 if (newst > SLAB_LAST(PL_regmatch_slab))
8549 newst = S_push_slab(aTHX);
8550 PL_regmatch_state = newst;
786e8c11 8551
4d5016e5 8552 locinput = pushinput;
40a82448
DM
8553 st = newst;
8554 continue;
a74ff37d 8555 /* NOTREACHED */
40a82448 8556 }
a0d0e21e 8557 }
c5d7841e 8558#ifdef SOLARIS_BAD_OPTIMIZER
19949523 8559# undef PL_charclass
c5d7841e 8560#endif
a687059c 8561
a0d0e21e
LW
8562 /*
8563 * We get here only if there's trouble -- normally "case END" is
8564 * the terminating point.
8565 */
cea2e8a9 8566 Perl_croak(aTHX_ "corrupted regexp pointers");
661d43c4 8567 NOT_REACHED; /* NOTREACHED */
4633a7c4 8568
7b52d656 8569 yes:
77cb431f
DM
8570 if (yes_state) {
8571 /* we have successfully completed a subexpression, but we must now
8572 * pop to the state marked by yes_state and continue from there */
77cb431f 8573 assert(st != yes_state);
5bc10b2c
DM
8574#ifdef DEBUGGING
8575 while (st != yes_state) {
8576 st--;
8577 if (st < SLAB_FIRST(PL_regmatch_slab)) {
8578 PL_regmatch_slab = PL_regmatch_slab->prev;
8579 st = SLAB_LAST(PL_regmatch_slab);
8580 }
e2e6a0f1 8581 DEBUG_STATE_r({
54612592
YO
8582 if (no_final) {
8583 DEBUG_STATE_pp("pop (no final)");
8584 } else {
8585 DEBUG_STATE_pp("pop (yes)");
8586 }
e2e6a0f1 8587 });
5bc10b2c
DM
8588 depth--;
8589 }
8590#else
77cb431f
DM
8591 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
8592 || yes_state > SLAB_LAST(PL_regmatch_slab))
8593 {
8594 /* not in this slab, pop slab */
8595 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
8596 PL_regmatch_slab = PL_regmatch_slab->prev;
8597 st = SLAB_LAST(PL_regmatch_slab);
8598 }
8599 depth -= (st - yes_state);
5bc10b2c 8600#endif
77cb431f
DM
8601 st = yes_state;
8602 yes_state = st->u.yes.prev_yes_state;
8603 PL_regmatch_state = st;
24b23f37 8604
3640db6b 8605 if (no_final)
5d458dd8 8606 locinput= st->locinput;
54612592 8607 state_num = st->resume_state + no_final;
24d3c4a9 8608 goto reenter_switch;
77cb431f
DM
8609 }
8610
6ad9a8ab 8611 DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%sMatch successful!%s\n",
e4584336 8612 PL_colors[4], PL_colors[5]));
02db2b7b 8613
bf2039a9 8614 if (reginfo->info_aux_eval) {
19b95bf0
DM
8615 /* each successfully executed (?{...}) block does the equivalent of
8616 * local $^R = do {...}
8617 * When popping the save stack, all these locals would be undone;
8618 * bypass this by setting the outermost saved $^R to the latest
8619 * value */
4b22688e
YO
8620 /* I dont know if this is needed or works properly now.
8621 * see code related to PL_replgv elsewhere in this file.
8622 * Yves
8623 */
19b95bf0
DM
8624 if (oreplsv != GvSV(PL_replgv))
8625 sv_setsv(oreplsv, GvSV(PL_replgv));
8626 }
95b24440 8627 result = 1;
aa283a38 8628 goto final_exit;
4633a7c4 8629
7b52d656 8630 no:
a3621e74 8631 DEBUG_EXECUTE_r(
6ad9a8ab 8632 Perl_re_exec_indentf( aTHX_ "%sfailed...%s\n",
cb41e5d6 8633 depth,
786e8c11 8634 PL_colors[4], PL_colors[5])
7821416a 8635 );
aa283a38 8636
7b52d656 8637 no_silent:
54612592
YO
8638 if (no_final) {
8639 if (yes_state) {
8640 goto yes;
8641 } else {
8642 goto final_exit;
8643 }
8644 }
aa283a38
DM
8645 if (depth) {
8646 /* there's a previous state to backtrack to */
40a82448
DM
8647 st--;
8648 if (st < SLAB_FIRST(PL_regmatch_slab)) {
8649 PL_regmatch_slab = PL_regmatch_slab->prev;
8650 st = SLAB_LAST(PL_regmatch_slab);
8651 }
8652 PL_regmatch_state = st;
40a82448 8653 locinput= st->locinput;
40a82448 8654
5bc10b2c
DM
8655 DEBUG_STATE_pp("pop");
8656 depth--;
262b90c4
DM
8657 if (yes_state == st)
8658 yes_state = st->u.yes.prev_yes_state;
5bc10b2c 8659
24d3c4a9 8660 state_num = st->resume_state + 1; /* failure = success + 1 */
8df928d2 8661 PERL_ASYNC_CHECK();
24d3c4a9 8662 goto reenter_switch;
95b24440 8663 }
24d3c4a9 8664 result = 0;
aa283a38 8665
262b90c4 8666 final_exit:
bbe252da 8667 if (rex->intflags & PREGf_VERBARG_SEEN) {
5d458dd8
YO
8668 SV *sv_err = get_sv("REGERROR", 1);
8669 SV *sv_mrk = get_sv("REGMARK", 1);
8670 if (result) {
e2e6a0f1 8671 sv_commit = &PL_sv_no;
5d458dd8
YO
8672 if (!sv_yes_mark)
8673 sv_yes_mark = &PL_sv_yes;
8674 } else {
8675 if (!sv_commit)
8676 sv_commit = &PL_sv_yes;
8677 sv_yes_mark = &PL_sv_no;
8678 }
316ebaf2
JH
8679 assert(sv_err);
8680 assert(sv_mrk);
5d458dd8
YO
8681 sv_setsv(sv_err, sv_commit);
8682 sv_setsv(sv_mrk, sv_yes_mark);
e2e6a0f1 8683 }
19b95bf0 8684
81ed78b2
DM
8685
8686 if (last_pushed_cv) {
8687 dSP;
4b9c7cae 8688 /* see "Some notes about MULTICALL" above */
81ed78b2 8689 POP_MULTICALL;
4f8dbb2d 8690 PERL_UNUSED_VAR(SP);
81ed78b2 8691 }
4b9c7cae
DM
8692 else
8693 LEAVE_SCOPE(orig_savestack_ix);
81ed78b2 8694
9d9163fb
DM
8695 assert(!result || locinput - reginfo->strbeg >= 0);
8696 return result ? locinput - reginfo->strbeg : -1;
a687059c
LW
8697}
8698
8699/*
8700 - regrepeat - repeatedly match something simple, report how many
d60de1d1 8701 *
e64f369d
KW
8702 * What 'simple' means is a node which can be the operand of a quantifier like
8703 * '+', or {1,3}
8704 *
d60de1d1
DM
8705 * startposp - pointer a pointer to the start position. This is updated
8706 * to point to the byte following the highest successful
8707 * match.
8708 * p - the regnode to be repeatedly matched against.
220db18a 8709 * reginfo - struct holding match state, such as strend
4063ade8 8710 * max - maximum number of things to match.
d60de1d1 8711 * depth - (for debugging) backtracking depth.
a687059c 8712 */
76e3520e 8713STATIC I32
272d35c9 8714S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
21553840 8715 regmatch_info *const reginfo, I32 max _pDEPTH)
a687059c 8716{
4063ade8 8717 char *scan; /* Pointer to current position in target string */
eb578fdb 8718 I32 c;
220db18a 8719 char *loceol = reginfo->strend; /* local version */
4063ade8 8720 I32 hardcount = 0; /* How many matches so far */
ba44c216 8721 bool utf8_target = reginfo->is_utf8_target;
b53eee5d 8722 unsigned int to_complement = 0; /* Invert the result? */
d513472c 8723 UV utf8_flags;
3018b823 8724 _char_class_number classnum;
a0d0e21e 8725
7918f24d
NC
8726 PERL_ARGS_ASSERT_REGREPEAT;
8727
f73aaa43 8728 scan = *startposp;
faf11cac
HS
8729 if (max == REG_INFTY)
8730 max = I32_MAX;
dfb8f192 8731 else if (! utf8_target && loceol - scan > max)
7f596f4c 8732 loceol = scan + max;
4063ade8
KW
8733
8734 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8735 * to the maximum of how far we should go in it (leaving it set to the real
8736 * end, if the maximum permissible would take us beyond that). This allows
8737 * us to make the loop exit condition that we haven't gone past <loceol> to
8738 * also mean that we haven't exceeded the max permissible count, saving a
8739 * test each time through the loop. But it assumes that the OP matches a
8740 * single byte, which is true for most of the OPs below when applied to a
8741 * non-UTF-8 target. Those relatively few OPs that don't have this
8742 * characteristic will have to compensate.
8743 *
8744 * There is no adjustment for UTF-8 targets, as the number of bytes per
8745 * character varies. OPs will have to test both that the count is less
8746 * than the max permissible (using <hardcount> to keep track), and that we
8747 * are still within the bounds of the string (using <loceol>. A few OPs
8748 * match a single byte no matter what the encoding. They can omit the max
8749 * test if, for the UTF-8 case, they do the adjustment that was skipped
8750 * above.
8751 *
8752 * Thus, the code above sets things up for the common case; and exceptional
8753 * cases need extra work; the common case is to make sure <scan> doesn't
8754 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8755 * count doesn't exceed the maximum permissible */
8756
a0d0e21e 8757 switch (OP(p)) {
22c35a8c 8758 case REG_ANY:
f2ed9b32 8759 if (utf8_target) {
1aa99e6b 8760 while (scan < loceol && hardcount < max && *scan != '\n') {
ffc61ed2
JH
8761 scan += UTF8SKIP(scan);
8762 hardcount++;
8763 }
8764 } else {
8765 while (scan < loceol && *scan != '\n')
8766 scan++;
a0ed51b3
LW
8767 }
8768 break;
ffc61ed2 8769 case SANY:
f2ed9b32 8770 if (utf8_target) {
a0804c9e 8771 while (scan < loceol && hardcount < max) {
def8e4ea
JH
8772 scan += UTF8SKIP(scan);
8773 hardcount++;
8774 }
8775 }
8776 else
8777 scan = loceol;
a0ed51b3 8778 break;
a4525e78 8779 case EXACTL:
780fcc9f 8780 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
613abc6d
KW
8781 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8782 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8783 }
780fcc9f 8784 /* FALLTHROUGH */
59d32103 8785 case EXACT:
f9176b44 8786 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 8787
59d32103 8788 c = (U8)*STRING(p);
59d32103 8789
5e4a1da1
KW
8790 /* Can use a simple loop if the pattern char to match on is invariant
8791 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
8792 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8793 * true iff it doesn't matter if the argument is in UTF-8 or not */
f9176b44 8794 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
e9369824 8795 if (utf8_target && loceol - scan > max) {
4063ade8
KW
8796 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8797 * since here, to match at all, 1 char == 1 byte */
8798 loceol = scan + max;
8799 }
59d32103
KW
8800 while (scan < loceol && UCHARAT(scan) == c) {
8801 scan++;
8802 }
8803 }
f9176b44 8804 else if (reginfo->is_utf8_pat) {
5e4a1da1
KW
8805 if (utf8_target) {
8806 STRLEN scan_char_len;
5e4a1da1 8807
4063ade8 8808 /* When both target and pattern are UTF-8, we have to do
5e4a1da1
KW
8809 * string EQ */
8810 while (hardcount < max
9a902117
KW
8811 && scan < loceol
8812 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
5e4a1da1
KW
8813 && memEQ(scan, STRING(p), scan_char_len))
8814 {
4200a00c 8815 scan += scan_char_len;
5e4a1da1
KW
8816 hardcount++;
8817 }
8818 }
8819 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
b40a2c17 8820
5e4a1da1
KW
8821 /* Target isn't utf8; convert the character in the UTF-8
8822 * pattern to non-UTF8, and do a simple loop */
a62b247b 8823 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
5e4a1da1
KW
8824 while (scan < loceol && UCHARAT(scan) == c) {
8825 scan++;
8826 }
8827 } /* else pattern char is above Latin1, can't possibly match the
8828 non-UTF-8 target */
b40a2c17 8829 }
5e4a1da1 8830 else {
59d32103 8831
5e4a1da1
KW
8832 /* Here, the string must be utf8; pattern isn't, and <c> is
8833 * different in utf8 than not, so can't compare them directly.
8834 * Outside the loop, find the two utf8 bytes that represent c, and
8835 * then look for those in sequence in the utf8 string */
59d32103
KW
8836 U8 high = UTF8_TWO_BYTE_HI(c);
8837 U8 low = UTF8_TWO_BYTE_LO(c);
59d32103
KW
8838
8839 while (hardcount < max
8840 && scan + 1 < loceol
8841 && UCHARAT(scan) == high
8842 && UCHARAT(scan + 1) == low)
8843 {
8844 scan += 2;
8845 hardcount++;
8846 }
8847 }
8848 break;
5e4a1da1 8849
098b07d5
KW
8850 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
8851 assert(! reginfo->is_utf8_pat);
924ba076 8852 /* FALLTHROUGH */
2f7f8cb1 8853 case EXACTFA:
098b07d5 8854 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
2f7f8cb1
KW
8855 goto do_exactf;
8856
d4e0b827 8857 case EXACTFL:
780fcc9f 8858 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
cea315b6 8859 utf8_flags = FOLDEQ_LOCALE;
17580e7a
KW
8860 goto do_exactf;
8861
2fdb7295
KW
8862 case EXACTF: /* This node only generated for non-utf8 patterns */
8863 assert(! reginfo->is_utf8_pat);
098b07d5
KW
8864 utf8_flags = 0;
8865 goto do_exactf;
62bf7766 8866
a4525e78
KW
8867 case EXACTFLU8:
8868 if (! utf8_target) {
8869 break;
8870 }
613abc6d
KW
8871 utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8872 | FOLDEQ_S2_FOLDS_SANE;
a4525e78
KW
8873 goto do_exactf;
8874
3c760661 8875 case EXACTFU_SS:
9a5a5549 8876 case EXACTFU:
f9176b44 8877 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
59d32103 8878
c52b8b12 8879 do_exactf: {
613a425d
KW
8880 int c1, c2;
8881 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
d4e0b827 8882
f9176b44 8883 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
613a425d 8884
984e6dd1 8885 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
aed7b151 8886 reginfo))
984e6dd1 8887 {
613a425d 8888 if (c1 == CHRTEST_VOID) {
49b95fad 8889 /* Use full Unicode fold matching */
220db18a 8890 char *tmpeol = reginfo->strend;
f9176b44 8891 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
49b95fad
KW
8892 while (hardcount < max
8893 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8894 STRING(p), NULL, pat_len,
f9176b44 8895 reginfo->is_utf8_pat, utf8_flags))
49b95fad
KW
8896 {
8897 scan = tmpeol;
220db18a 8898 tmpeol = reginfo->strend;
49b95fad
KW
8899 hardcount++;
8900 }
613a425d
KW
8901 }
8902 else if (utf8_target) {
8903 if (c1 == c2) {
4063ade8
KW
8904 while (scan < loceol
8905 && hardcount < max
613a425d
KW
8906 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8907 {
8908 scan += UTF8SKIP(scan);
8909 hardcount++;
8910 }
8911 }
8912 else {
4063ade8
KW
8913 while (scan < loceol
8914 && hardcount < max
613a425d
KW
8915 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8916 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8917 {
8918 scan += UTF8SKIP(scan);
8919 hardcount++;
8920 }
8921 }
8922 }
8923 else if (c1 == c2) {
8924 while (scan < loceol && UCHARAT(scan) == c1) {
8925 scan++;
8926 }
8927 }
8928 else {
8929 while (scan < loceol &&
8930 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8931 {
8932 scan++;
8933 }
8934 }
634c83a2 8935 }
bbce6d69 8936 break;
613a425d 8937 }
a4525e78 8938 case ANYOFL:
780fcc9f 8939 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
a0bd1a30 8940
d1c40ef5 8941 if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
a0bd1a30
KW
8942 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
8943 }
780fcc9f 8944 /* FALLTHROUGH */
ac44c12e 8945 case ANYOFD:
a0d0e21e 8946 case ANYOF:
e0193e47 8947 if (utf8_target) {
4e8910e0 8948 while (hardcount < max
9a902117 8949 && scan < loceol
3db24e1e 8950 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
4e8910e0 8951 {
9a902117 8952 scan += UTF8SKIP(scan);
ffc61ed2
JH
8953 hardcount++;
8954 }
1451f692
DM
8955 }
8956 else if (ANYOF_FLAGS(p)) {
8957 while (scan < loceol
8958 && reginclass(prog, p, (U8*)scan, (U8*)scan+1, 0))
8959 scan++;
8960 }
8961 else {
8962 while (scan < loceol && ANYOF_BITMAP_TEST(p, *((U8*)scan)))
ffc61ed2
JH
8963 scan++;
8964 }
a0d0e21e 8965 break;
4063ade8 8966
3018b823 8967 /* The argument (FLAGS) to all the POSIX node types is the class number */
980866de 8968
3018b823
KW
8969 case NPOSIXL:
8970 to_complement = 1;
8971 /* FALLTHROUGH */
980866de 8972
3018b823 8973 case POSIXL:
780fcc9f 8974 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
3018b823
KW
8975 if (! utf8_target) {
8976 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8977 *scan)))
a12cf05f 8978 {
3018b823
KW
8979 scan++;
8980 }
8981 } else {
8982 while (hardcount < max && scan < loceol
8983 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8984 (U8 *) scan)))
8985 {
8986 scan += UTF8SKIP(scan);
ffc61ed2
JH
8987 hardcount++;
8988 }
a0ed51b3
LW
8989 }
8990 break;
0658cdde 8991
3018b823
KW
8992 case POSIXD:
8993 if (utf8_target) {
8994 goto utf8_posix;
8995 }
8996 /* FALLTHROUGH */
8997
0658cdde 8998 case POSIXA:
0430522f 8999 if (utf8_target && loceol - scan > max) {
4063ade8 9000
7aee35ff
KW
9001 /* We didn't adjust <loceol> at the beginning of this routine
9002 * because is UTF-8, but it is actually ok to do so, since here, to
9003 * match, 1 char == 1 byte. */
4063ade8
KW
9004 loceol = scan + max;
9005 }
9006 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
0658cdde
KW
9007 scan++;
9008 }
9009 break;
980866de 9010
3018b823
KW
9011 case NPOSIXD:
9012 if (utf8_target) {
9013 to_complement = 1;
9014 goto utf8_posix;
9015 }
924ba076 9016 /* FALLTHROUGH */
980866de 9017
3018b823
KW
9018 case NPOSIXA:
9019 if (! utf8_target) {
9020 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
a12cf05f
KW
9021 scan++;
9022 }
4063ade8 9023 }
3018b823 9024 else {
980866de 9025
3018b823 9026 /* The complement of something that matches only ASCII matches all
837226c8 9027 * non-ASCII, plus everything in ASCII that isn't in the class. */
bedac28b 9028 while (hardcount < max && scan < loceol
7a207065 9029 && ( ! isASCII_utf8_safe(scan, reginfo->strend)
3018b823 9030 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
a12cf05f 9031 {
3018b823 9032 scan += UTF8SKIP(scan);
ffc61ed2
JH
9033 hardcount++;
9034 }
3018b823
KW
9035 }
9036 break;
980866de 9037
3018b823
KW
9038 case NPOSIXU:
9039 to_complement = 1;
9040 /* FALLTHROUGH */
9041
9042 case POSIXU:
9043 if (! utf8_target) {
9044 while (scan < loceol && to_complement
9045 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
4063ade8 9046 {
3018b823
KW
9047 scan++;
9048 }
cfaf538b
KW
9049 }
9050 else {
c52b8b12 9051 utf8_posix:
3018b823
KW
9052 classnum = (_char_class_number) FLAGS(p);
9053 if (classnum < _FIRST_NON_SWASH_CC) {
9054
9055 /* Here, a swash is needed for above-Latin1 code points.
9056 * Process as many Latin1 code points using the built-in rules.
9057 * Go to another loop to finish processing upon encountering
9058 * the first Latin1 code point. We could do that in this loop
9059 * as well, but the other way saves having to test if the swash
9060 * has been loaded every time through the loop: extra space to
9061 * save a test. */
9062 while (hardcount < max && scan < loceol) {
9063 if (UTF8_IS_INVARIANT(*scan)) {
9064 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
9065 classnum))))
9066 {
9067 break;
9068 }
9069 scan++;
9070 }
9071 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
9072 if (! (to_complement
a62b247b 9073 ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
94bb8c36 9074 *(scan + 1)),
3018b823
KW
9075 classnum))))
9076 {
9077 break;
9078 }
9079 scan += 2;
9080 }
9081 else {
9082 goto found_above_latin1;
9083 }
9084
9085 hardcount++;
9086 }
9087 }
9088 else {
9089 /* For these character classes, the knowledge of how to handle
9090 * every code point is compiled in to Perl via a macro. This
9091 * code is written for making the loops as tight as possible.
9092 * It could be refactored to save space instead */
9093 switch (classnum) {
779cf272 9094 case _CC_ENUM_SPACE:
3018b823
KW
9095 while (hardcount < max
9096 && scan < loceol
7a207065
KW
9097 && (to_complement
9098 ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
3018b823
KW
9099 {
9100 scan += UTF8SKIP(scan);
9101 hardcount++;
9102 }
9103 break;
9104 case _CC_ENUM_BLANK:
9105 while (hardcount < max
9106 && scan < loceol
7a207065
KW
9107 && (to_complement
9108 ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
3018b823
KW
9109 {
9110 scan += UTF8SKIP(scan);
9111 hardcount++;
9112 }
9113 break;
9114 case _CC_ENUM_XDIGIT:
9115 while (hardcount < max
9116 && scan < loceol
7a207065
KW
9117 && (to_complement
9118 ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
3018b823
KW
9119 {
9120 scan += UTF8SKIP(scan);
9121 hardcount++;
9122 }
9123 break;
9124 case _CC_ENUM_VERTSPACE:
9125 while (hardcount < max
9126 && scan < loceol
7a207065
KW
9127 && (to_complement
9128 ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
3018b823
KW
9129 {
9130 scan += UTF8SKIP(scan);
9131 hardcount++;
9132 }
9133 break;
9134 case _CC_ENUM_CNTRL:
9135 while (hardcount < max
9136 && scan < loceol
7a207065
KW
9137 && (to_complement
9138 ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
3018b823
KW
9139 {
9140 scan += UTF8SKIP(scan);
9141 hardcount++;
9142 }
9143 break;
9144 default:
9145 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
9146 }
9147 }
a0ed51b3 9148 }
3018b823 9149 break;
4063ade8 9150
3018b823
KW
9151 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
9152
9153 /* Load the swash if not already present */
9154 if (! PL_utf8_swash_ptrs[classnum]) {
9155 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
9156 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
2a16ac92
KW
9157 "utf8",
9158 "",
9159 &PL_sv_undef, 1, 0,
9160 PL_XPosix_ptrs[classnum], &flags);
4063ade8 9161 }
3018b823
KW
9162
9163 while (hardcount < max && scan < loceol
7a207065 9164 && to_complement ^ cBOOL(_generic_utf8_safe(
3018b823
KW
9165 classnum,
9166 scan,
7a207065 9167 loceol,
3018b823
KW
9168 swash_fetch(PL_utf8_swash_ptrs[classnum],
9169 (U8 *) scan,
9170 TRUE))))
9171 {
9172 scan += UTF8SKIP(scan);
9173 hardcount++;
9174 }
9175 break;
9176
e1d1eefb 9177 case LNBREAK:
e64f369d
KW
9178 if (utf8_target) {
9179 while (hardcount < max && scan < loceol &&
9180 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
9181 scan += c;
9182 hardcount++;
9183 }
9184 } else {
9185 /* LNBREAK can match one or two latin chars, which is ok, but we
9186 * have to use hardcount in this situation, and throw away the
9187 * adjustment to <loceol> done before the switch statement */
220db18a 9188 loceol = reginfo->strend;
e64f369d
KW
9189 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
9190 scan+=c;
9191 hardcount++;
9192 }
9193 }
9194 break;
e1d1eefb 9195
780fcc9f
KW
9196 case BOUNDL:
9197 case NBOUNDL:
9198 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
9199 /* FALLTHROUGH */
584b1f02
KW
9200 case BOUND:
9201 case BOUNDA:
584b1f02
KW
9202 case BOUNDU:
9203 case EOS:
9204 case GPOS:
9205 case KEEPS:
9206 case NBOUND:
9207 case NBOUNDA:
584b1f02
KW
9208 case NBOUNDU:
9209 case OPFAIL:
9210 case SBOL:
9211 case SEOL:
9212 /* These are all 0 width, so match right here or not at all. */
9213 break;
9214
9215 default:
9216 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
661d43c4 9217 NOT_REACHED; /* NOTREACHED */
584b1f02 9218
a0d0e21e 9219 }
a687059c 9220
a0ed51b3
LW
9221 if (hardcount)
9222 c = hardcount;
9223 else
f73aaa43
DM
9224 c = scan - *startposp;
9225 *startposp = scan;
a687059c 9226
a3621e74 9227 DEBUG_r({
e68ec53f 9228 GET_RE_DEBUG_FLAGS_DECL;
be8e71aa 9229 DEBUG_EXECUTE_r({
e68ec53f 9230 SV * const prop = sv_newmortal();
8b9781c9 9231 regprop(prog, prop, p, reginfo, NULL);
147e3846 9232 Perl_re_exec_indentf( aTHX_ "%s can match %" IVdf " times out of %" IVdf "...\n",
cb41e5d6 9233 depth, SvPVX_const(prop),(IV)c,(IV)max);
a3621e74 9234 });
be8e71aa 9235 });
9041c2e3 9236
a0d0e21e 9237 return(c);
a687059c
LW
9238}
9239
c277df42 9240
be8e71aa 9241#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
c277df42 9242/*
6c6525b8 9243- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
e0193e47
KW
9244create a copy so that changes the caller makes won't change the shared one.
9245If <altsvp> is non-null, will return NULL in it, for back-compat.
6c6525b8 9246 */
ffc61ed2 9247SV *
5aaab254 9248Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
ffc61ed2 9249{
6c6525b8 9250 PERL_ARGS_ASSERT_REGCLASS_SWASH;
e0193e47
KW
9251
9252 if (altsvp) {
9253 *altsvp = NULL;
9254 }
9255
ef9bc832 9256 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
6c6525b8 9257}
6c6525b8 9258
3e63bed3 9259#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
ffc61ed2
JH
9260
9261/*
ba7b4546 9262 - reginclass - determine if a character falls into a character class
832705d4 9263
a4525e78 9264 n is the ANYOF-type regnode
6698fab5 9265 p is the target string
3db24e1e 9266 p_end points to one byte beyond the end of the target string
6698fab5 9267 utf8_target tells whether p is in UTF-8.
832705d4 9268
635cd5d4 9269 Returns true if matched; false otherwise.
eba1359e 9270
d5788240
KW
9271 Note that this can be a synthetic start class, a combination of various
9272 nodes, so things you think might be mutually exclusive, such as locale,
9273 aren't. It can match both locale and non-locale
9274
bbce6d69 9275 */
9276
76e3520e 9277STATIC bool
3db24e1e 9278S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
bbce6d69 9279{
27da23d5 9280 dVAR;
a3b680e6 9281 const char flags = ANYOF_FLAGS(n);
bbce6d69 9282 bool match = FALSE;
cc07378b 9283 UV c = *p;
1aa99e6b 9284
7918f24d
NC
9285 PERL_ARGS_ASSERT_REGINCLASS;
9286
afd2eb18
KW
9287 /* If c is not already the code point, get it. Note that
9288 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
9289 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
635cd5d4 9290 STRLEN c_len = 0;
75219bac
KW
9291 const U32 utf8n_flags = UTF8_ALLOW_DEFAULT;
9292 c = utf8n_to_uvchr(p, p_end - p, &c_len, utf8n_flags | UTF8_CHECK_ONLY);
9293 if (c_len == (STRLEN)-1) {
9294 _force_out_malformed_utf8_message(p, p_end,
9295 utf8n_flags,
9296 1 /* 1 means die */ );
9297 NOT_REACHED; /* NOTREACHED */
9298 }
d1c40ef5 9299 if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
613abc6d
KW
9300 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
9301 }
19f67299 9302 }
4b3cda86 9303
7cdde544 9304 /* If this character is potentially in the bitmap, check it */
dcb20b36 9305 if (c < NUM_ANYOF_CODE_POINTS) {
ffc61ed2
JH
9306 if (ANYOF_BITMAP_TEST(n, c))
9307 match = TRUE;
f240c685
KW
9308 else if ((flags
9309 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9310 && OP(n) == ANYOFD
93e92956
KW
9311 && ! utf8_target
9312 && ! isASCII(c))
11454c59
KW
9313 {
9314 match = TRUE;
9315 }
1462525b 9316 else if (flags & ANYOF_LOCALE_FLAGS) {
037715a6 9317 if ((flags & ANYOFL_FOLD)
e0a1ff7a 9318 && c < 256
6942fd9a
KW
9319 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
9320 {
9321 match = TRUE;
b99851e1 9322 }
e0a1ff7a
KW
9323 else if (ANYOF_POSIXL_TEST_ANY_SET(n)
9324 && c < 256
9325 ) {
31c7f561
KW
9326
9327 /* The data structure is arranged so bits 0, 2, 4, ... are set
9328 * if the class includes the Posix character class given by
9329 * bit/2; and 1, 3, 5, ... are set if the class includes the
9330 * complemented Posix class given by int(bit/2). So we loop
9331 * through the bits, each time changing whether we complement
9332 * the result or not. Suppose for the sake of illustration
9333 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
9334 * is set, it means there is a match for this ANYOF node if the
9335 * character is in the class given by the expression (0 / 2 = 0
9336 * = \w). If it is in that class, isFOO_lc() will return 1,
9337 * and since 'to_complement' is 0, the result will stay TRUE,
9338 * and we exit the loop. Suppose instead that bit 0 is 0, but
9339 * bit 1 is 1. That means there is a match if the character
9340 * matches \W. We won't bother to call isFOO_lc() on bit 0,
9341 * but will on bit 1. On the second iteration 'to_complement'
9342 * will be 1, so the exclusive or will reverse things, so we
9343 * are testing for \W. On the third iteration, 'to_complement'
9344 * will be 0, and we would be testing for \s; the fourth
b0d691b2
KW
9345 * iteration would test for \S, etc.
9346 *
9347 * Note that this code assumes that all the classes are closed
9348 * under folding. For example, if a character matches \w, then
9349 * its fold does too; and vice versa. This should be true for
9350 * any well-behaved locale for all the currently defined Posix
9351 * classes, except for :lower: and :upper:, which are handled
9352 * by the pseudo-class :cased: which matches if either of the
9353 * other two does. To get rid of this assumption, an outer
9354 * loop could be used below to iterate over both the source
9355 * character, and its fold (if different) */
31c7f561
KW
9356
9357 int count = 0;
9358 int to_complement = 0;
522b3c1e 9359
31c7f561 9360 while (count < ANYOF_MAX) {
8efd3f97 9361 if (ANYOF_POSIXL_TEST(n, count)
31c7f561
KW
9362 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
9363 {
9364 match = TRUE;
9365 break;
9366 }
9367 count++;
9368 to_complement ^= 1;
9369 }
ffc61ed2 9370 }
a0ed51b3 9371 }
a0ed51b3
LW
9372 }
9373
31f05a37 9374
7cdde544 9375 /* If the bitmap didn't (or couldn't) match, and something outside the
3b04b210 9376 * bitmap could match, try that. */
ef87b810 9377 if (!match) {
93e92956
KW
9378 if (c >= NUM_ANYOF_CODE_POINTS
9379 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
9380 {
9381 match = TRUE; /* Everything above the bitmap matches */
e051a21d 9382 }
108316fb
KW
9383 /* Here doesn't match everything above the bitmap. If there is
9384 * some information available beyond the bitmap, we may find a
9385 * match in it. If so, this is most likely because the code point
9386 * is outside the bitmap range. But rarely, it could be because of
9387 * some other reason. If so, various flags are set to indicate
9388 * this possibility. On ANYOFD nodes, there may be matches that
9389 * happen only when the target string is UTF-8; or for other node
9390 * types, because runtime lookup is needed, regardless of the
9391 * UTF-8ness of the target string. Finally, under /il, there may
9392 * be some matches only possible if the locale is a UTF-8 one. */
9393 else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
9394 && ( c >= NUM_ANYOF_CODE_POINTS
9395 || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
9396 && ( UNLIKELY(OP(n) != ANYOFD)
9397 || (utf8_target && ! isASCII_uni(c)
9398# if NUM_ANYOF_CODE_POINTS > 256
9399 && c < 256
9400# endif
9401 )))
d1c40ef5
KW
9402 || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
9403 && IN_UTF8_CTYPE_LOCALE)))
3b04b210 9404 {
1ee208c4
KW
9405 SV* only_utf8_locale = NULL;
9406 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
ef9bc832 9407 &only_utf8_locale, NULL);
7cdde544 9408 if (sw) {
893ef8be 9409 U8 utf8_buffer[2];
7cdde544
KW
9410 U8 * utf8_p;
9411 if (utf8_target) {
9412 utf8_p = (U8 *) p;
e0193e47 9413 } else { /* Convert to utf8 */
893ef8be
KW
9414 utf8_p = utf8_buffer;
9415 append_utf8_from_native_byte(*p, &utf8_p);
9416 utf8_p = utf8_buffer;
7cdde544 9417 }
f56b6394 9418
e0193e47 9419 if (swash_fetch(sw, utf8_p, TRUE)) {
7cdde544 9420 match = TRUE;
e0193e47 9421 }
7cdde544 9422 }
1ee208c4
KW
9423 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
9424 match = _invlist_contains_cp(only_utf8_locale, c);
9425 }
7cdde544 9426 }
5073ffbd
KW
9427
9428 if (UNICODE_IS_SUPER(c)
f240c685
KW
9429 && (flags
9430 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
9431 && OP(n) != ANYOFD
5073ffbd
KW
9432 && ckWARN_d(WARN_NON_UNICODE))
9433 {
9434 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
147e3846 9435 "Matched non-Unicode code point 0x%04" UVXf " against Unicode property; may not be portable", c);
5073ffbd 9436 }
7cdde544
KW
9437 }
9438
5dbb0c08
KW
9439#if ANYOF_INVERT != 1
9440 /* Depending on compiler optimization cBOOL takes time, so if don't have to
9441 * use it, don't */
9442# error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
9443#endif
9444
f0fdc1c9 9445 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
5dbb0c08 9446 return (flags & ANYOF_INVERT) ^ match;
a0ed51b3 9447}
161b471a 9448
dfe13c55 9449STATIC U8 *
ea3daa5d 9450S_reghop3(U8 *s, SSize_t off, const U8* lim)
9041c2e3 9451{
6af86488
KW
9452 /* return the position 'off' UTF-8 characters away from 's', forward if
9453 * 'off' >= 0, backwards if negative. But don't go outside of position
9454 * 'lim', which better be < s if off < 0 */
9455
7918f24d
NC
9456 PERL_ARGS_ASSERT_REGHOP3;
9457
a0ed51b3 9458 if (off >= 0) {
1aa99e6b 9459 while (off-- && s < lim) {
ffc61ed2 9460 /* XXX could check well-formedness here */
a0ed51b3 9461 s += UTF8SKIP(s);
ffc61ed2 9462 }
a0ed51b3
LW
9463 }
9464 else {
1de06328
YO
9465 while (off++ && s > lim) {
9466 s--;
9467 if (UTF8_IS_CONTINUED(*s)) {
9468 while (s > lim && UTF8_IS_CONTINUATION(*s))
9469 s--;
22b433ef 9470 if (! UTF8_IS_START(*s)) {
d820a0ff 9471 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
22b433ef 9472 }
a0ed51b3 9473 }
1de06328 9474 /* XXX could check well-formedness here */
a0ed51b3
LW
9475 }
9476 }
9477 return s;
9478}
161b471a 9479
dfe13c55 9480STATIC U8 *
ea3daa5d 9481S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
1de06328 9482{
7918f24d
NC
9483 PERL_ARGS_ASSERT_REGHOP4;
9484
1de06328
YO
9485 if (off >= 0) {
9486 while (off-- && s < rlim) {
9487 /* XXX could check well-formedness here */
9488 s += UTF8SKIP(s);
9489 }
9490 }
9491 else {
9492 while (off++ && s > llim) {
9493 s--;
9494 if (UTF8_IS_CONTINUED(*s)) {
9495 while (s > llim && UTF8_IS_CONTINUATION(*s))
9496 s--;
22b433ef 9497 if (! UTF8_IS_START(*s)) {
d820a0ff 9498 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
22b433ef 9499 }
1de06328
YO
9500 }
9501 /* XXX could check well-formedness here */
9502 }
9503 }
9504 return s;
9505}
1de06328 9506
557f47af
DM
9507/* like reghop3, but returns NULL on overrun, rather than returning last
9508 * char pos */
9509
1de06328 9510STATIC U8 *
b6c0faaf 9511S_reghopmaybe3(U8* s, SSize_t off, const U8* const lim)
a0ed51b3 9512{
7918f24d
NC
9513 PERL_ARGS_ASSERT_REGHOPMAYBE3;
9514
a0ed51b3 9515 if (off >= 0) {
1aa99e6b 9516 while (off-- && s < lim) {
ffc61ed2 9517 /* XXX could check well-formedness here */
a0ed51b3 9518 s += UTF8SKIP(s);
ffc61ed2 9519 }
a0ed51b3 9520 if (off >= 0)
3dab1dad 9521 return NULL;
a0ed51b3
LW
9522 }
9523 else {
1de06328
YO
9524 while (off++ && s > lim) {
9525 s--;
9526 if (UTF8_IS_CONTINUED(*s)) {
9527 while (s > lim && UTF8_IS_CONTINUATION(*s))
9528 s--;
22b433ef 9529 if (! UTF8_IS_START(*s)) {
d820a0ff 9530 Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
22b433ef 9531 }
a0ed51b3 9532 }
1de06328 9533 /* XXX could check well-formedness here */
a0ed51b3
LW
9534 }
9535 if (off <= 0)
3dab1dad 9536 return NULL;
a0ed51b3
LW
9537 }
9538 return s;
9539}
51371543 9540
a75351a1
DM
9541
9542/* when executing a regex that may have (?{}), extra stuff needs setting
9543 up that will be visible to the called code, even before the current
9544 match has finished. In particular:
9545
9546 * $_ is localised to the SV currently being matched;
9547 * pos($_) is created if necessary, ready to be updated on each call-out
9548 to code;
9549 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
9550 isn't set until the current pattern is successfully finished), so that
9551 $1 etc of the match-so-far can be seen;
9552 * save the old values of subbeg etc of the current regex, and set then
9553 to the current string (again, this is normally only done at the end
9554 of execution)
a75351a1
DM
9555*/
9556
9557static void
9558S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
9559{
9560 MAGIC *mg;
9561 regexp *const rex = ReANY(reginfo->prog);
bf2039a9 9562 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8adc0f72 9563
8adc0f72 9564 eval_state->rex = rex;
a75351a1 9565
a75351a1
DM
9566 if (reginfo->sv) {
9567 /* Make $_ available to executed code. */
9568 if (reginfo->sv != DEFSV) {
9569 SAVE_DEFSV;
9570 DEFSV_set(reginfo->sv);
9571 }
9572
96c2a8ff 9573 if (!(mg = mg_find_mglob(reginfo->sv))) {
a75351a1 9574 /* prepare for quick setting of pos */
96c2a8ff 9575 mg = sv_magicext_mglob(reginfo->sv);
a75351a1
DM
9576 mg->mg_len = -1;
9577 }
8adc0f72
DM
9578 eval_state->pos_magic = mg;
9579 eval_state->pos = mg->mg_len;
25fdce4a 9580 eval_state->pos_flags = mg->mg_flags;
a75351a1 9581 }
8adc0f72
DM
9582 else
9583 eval_state->pos_magic = NULL;
9584
a75351a1 9585 if (!PL_reg_curpm) {
f65e70f5
DM
9586 /* PL_reg_curpm is a fake PMOP that we can attach the current
9587 * regex to and point PL_curpm at, so that $1 et al are visible
9588 * within a /(?{})/. It's just allocated once per interpreter the
9589 * first time its needed */
a75351a1
DM
9590 Newxz(PL_reg_curpm, 1, PMOP);
9591#ifdef USE_ITHREADS
9592 {
9593 SV* const repointer = &PL_sv_undef;
9594 /* this regexp is also owned by the new PL_reg_curpm, which
9595 will try to free it. */
9596 av_push(PL_regex_padav, repointer);
b9f2b683 9597 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
a75351a1
DM
9598 PL_regex_pad = AvARRAY(PL_regex_padav);
9599 }
9600#endif
9601 }
9602 SET_reg_curpm(reginfo->prog);
8adc0f72 9603 eval_state->curpm = PL_curpm;
5585e758 9604 PL_curpm_under = PL_curpm;
a75351a1
DM
9605 PL_curpm = PL_reg_curpm;
9606 if (RXp_MATCH_COPIED(rex)) {
9607 /* Here is a serious problem: we cannot rewrite subbeg,
9608 since it may be needed if this match fails. Thus
9609 $` inside (?{}) could fail... */
8adc0f72
DM
9610 eval_state->subbeg = rex->subbeg;
9611 eval_state->sublen = rex->sublen;
9612 eval_state->suboffset = rex->suboffset;
a8ee055f 9613 eval_state->subcoffset = rex->subcoffset;
a75351a1 9614#ifdef PERL_ANY_COW
8adc0f72 9615 eval_state->saved_copy = rex->saved_copy;
a75351a1
DM
9616#endif
9617 RXp_MATCH_COPIED_off(rex);
9618 }
9619 else
8adc0f72 9620 eval_state->subbeg = NULL;
a75351a1
DM
9621 rex->subbeg = (char *)reginfo->strbeg;
9622 rex->suboffset = 0;
9623 rex->subcoffset = 0;
9624 rex->sublen = reginfo->strend - reginfo->strbeg;
9625}
9626
bf2039a9
DM
9627
9628/* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
a75351a1 9629
51371543 9630static void
bf2039a9 9631S_cleanup_regmatch_info_aux(pTHX_ void *arg)
51371543 9632{
bf2039a9
DM
9633 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
9634 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
331b2dcc 9635 regmatch_slab *s;
bf2039a9 9636
2ac8ff4b
DM
9637 Safefree(aux->poscache);
9638
331b2dcc 9639 if (eval_state) {
bf2039a9 9640
331b2dcc 9641 /* undo the effects of S_setup_eval_state() */
bf2039a9 9642
331b2dcc
DM
9643 if (eval_state->subbeg) {
9644 regexp * const rex = eval_state->rex;
9645 rex->subbeg = eval_state->subbeg;
9646 rex->sublen = eval_state->sublen;
9647 rex->suboffset = eval_state->suboffset;
9648 rex->subcoffset = eval_state->subcoffset;
db2c6cb3 9649#ifdef PERL_ANY_COW
331b2dcc 9650 rex->saved_copy = eval_state->saved_copy;
ed252734 9651#endif
331b2dcc
DM
9652 RXp_MATCH_COPIED_on(rex);
9653 }
9654 if (eval_state->pos_magic)
25fdce4a 9655 {
331b2dcc 9656 eval_state->pos_magic->mg_len = eval_state->pos;
25fdce4a
FC
9657 eval_state->pos_magic->mg_flags =
9658 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
9659 | (eval_state->pos_flags & MGf_BYTES);
9660 }
331b2dcc
DM
9661
9662 PL_curpm = eval_state->curpm;
8adc0f72 9663 }
bf2039a9 9664
331b2dcc
DM
9665 PL_regmatch_state = aux->old_regmatch_state;
9666 PL_regmatch_slab = aux->old_regmatch_slab;
9667
9668 /* free all slabs above current one - this must be the last action
9669 * of this function, as aux and eval_state are allocated within
9670 * slabs and may be freed here */
9671
9672 s = PL_regmatch_slab->next;
9673 if (s) {
9674 PL_regmatch_slab->next = NULL;
9675 while (s) {
9676 regmatch_slab * const osl = s;
9677 s = s->next;
9678 Safefree(osl);
9679 }
9680 }
51371543 9681}
33b8afdf 9682
8adc0f72 9683
33b8afdf 9684STATIC void
5aaab254 9685S_to_utf8_substr(pTHX_ regexp *prog)
33b8afdf 9686{
7e0d5ad7
KW
9687 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
9688 * on the converted value */
9689
a1cac82e 9690 int i = 1;
7918f24d
NC
9691
9692 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
9693
a1cac82e
NC
9694 do {
9695 if (prog->substrs->data[i].substr
9696 && !prog->substrs->data[i].utf8_substr) {
9697 SV* const sv = newSVsv(prog->substrs->data[i].substr);
9698 prog->substrs->data[i].utf8_substr = sv;
9699 sv_utf8_upgrade(sv);
610460f9 9700 if (SvVALID(prog->substrs->data[i].substr)) {
cffe132d 9701 if (SvTAIL(prog->substrs->data[i].substr)) {
610460f9
NC
9702 /* Trim the trailing \n that fbm_compile added last
9703 time. */
9704 SvCUR_set(sv, SvCUR(sv) - 1);
9705 /* Whilst this makes the SV technically "invalid" (as its
9706 buffer is no longer followed by "\0") when fbm_compile()
9707 adds the "\n" back, a "\0" is restored. */
cffe132d
NC
9708 fbm_compile(sv, FBMcf_TAIL);
9709 } else
9710 fbm_compile(sv, 0);
610460f9 9711 }
a1cac82e
NC
9712 if (prog->substrs->data[i].substr == prog->check_substr)
9713 prog->check_utf8 = sv;
9714 }
9715 } while (i--);
33b8afdf
JH
9716}
9717
7e0d5ad7 9718STATIC bool
5aaab254 9719S_to_byte_substr(pTHX_ regexp *prog)
33b8afdf 9720{
7e0d5ad7
KW
9721 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9722 * on the converted value; returns FALSE if can't be converted. */
9723
a1cac82e 9724 int i = 1;
7918f24d
NC
9725
9726 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9727
a1cac82e
NC
9728 do {
9729 if (prog->substrs->data[i].utf8_substr
9730 && !prog->substrs->data[i].substr) {
9731 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7e0d5ad7
KW
9732 if (! sv_utf8_downgrade(sv, TRUE)) {
9733 return FALSE;
9734 }
5400f398
KW
9735 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9736 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9737 /* Trim the trailing \n that fbm_compile added last
9738 time. */
9739 SvCUR_set(sv, SvCUR(sv) - 1);
9740 fbm_compile(sv, FBMcf_TAIL);
9741 } else
9742 fbm_compile(sv, 0);
9743 }
a1cac82e
NC
9744 prog->substrs->data[i].substr = sv;
9745 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9746 prog->check_substr = sv;
33b8afdf 9747 }
a1cac82e 9748 } while (i--);
7e0d5ad7
KW
9749
9750 return TRUE;
33b8afdf 9751}
66610fdd 9752
66c5e3f2
JK
9753#ifndef PERL_IN_XSUB_RE
9754
94749a5e
KW
9755bool
9756Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp)
9757{
9758 /* Temporary helper function for toke.c. Verify that the code point 'cp'
9759 * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in
9760 * the larger string bounded by 'strbeg' and 'strend'.
9761 *
9762 * 'cp' needs to be assigned (if not a future version of the Unicode
9763 * Standard could make it something that combines with adjacent characters,
9764 * so code using it would then break), and there has to be a GCB break
9765 * before and after the character. */
9766
9767 GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val;
9768 const U8 * prev_cp_start;
9769
9770 PERL_ARGS_ASSERT__IS_GRAPHEME;
9771
9772 /* Unassigned code points are forbidden */
9773 if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST(
9774 _invlist_search(PL_Assigned_invlist, cp))))
9775 {
9776 return FALSE;
9777 }
9778
9779 cp_gcb_val = getGCB_VAL_CP(cp);
9780
9781 /* Find the GCB value of the previous code point in the input */
9782 prev_cp_start = utf8_hop_back(s, -1, strbeg);
9783 if (UNLIKELY(prev_cp_start == s)) {
9784 prev_cp_gcb_val = GCB_EDGE;
9785 }
9786 else {
9787 prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend);
9788 }
9789
9790 /* And check that is a grapheme boundary */
9791 if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s,
9792 TRUE /* is UTF-8 encoded */ ))
9793 {
9794 return FALSE;
9795 }
9796
9797 /* Similarly verify there is a break between the current character and the
9798 * following one */
9799 s += UTF8SKIP(s);
9800 if (s >= strend) {
9801 next_cp_gcb_val = GCB_EDGE;
9802 }
9803 else {
9804 next_cp_gcb_val = getGCB_VAL_UTF8(s, strend);
9805 }
9806
9807 return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE);
9808}
9809
66c5e3f2 9810#endif
94749a5e
KW
9811
9812
9813
66610fdd 9814/*
14d04a33 9815 * ex: set ts=8 sts=4 sw=4 et:
37442d52 9816 */