This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Replace macro by expansion in only place called
[perl5.git] / regexec.c
... / ...
CommitLineData
1/* regexec.c
2 */
3
4/*
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10 */
11
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
20 */
21
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
36#ifdef PERL_EXT_RE_BUILD
37#include "re_top.h"
38#endif
39
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
42static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
44
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
50/*
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
75 ****
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
84#define PERL_IN_REGEXEC_C
85#include "perl.h"
86
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
92
93#include "inline_invlist.c"
94#include "unicode_constants.h"
95
96#define RF_tainted 1 /* tainted information used? e.g. locale */
97#define RF_warned 2 /* warned about big count? */
98
99#define RF_utf8 8 /* Pattern contains multibyte chars? */
100
101#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
102
103#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
104
105#ifndef STATIC
106#define STATIC static
107#endif
108
109/* Valid for non-utf8 strings: avoids the reginclass
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
112#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
113 : ANYOF_BITMAP_TEST(p,*(c)))
114
115/*
116 * Forwards.
117 */
118
119#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
120#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
121
122#define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
125 : (U8*)(pos + off))
126#define HOPBACKc(pos, off) \
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)pos - off \
131 : NULL)
132
133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
134#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
135
136
137#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138#define NEXTCHR_IS_EOS (nextchr < 0)
139
140#define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
142
143#define SET_locinput(p) \
144 locinput = (p); \
145 SET_nextchr
146
147
148/* these are unrolled below in the CCC_TRY_XXX defined */
149#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
150 if (!CAT2(PL_utf8_,class)) { \
151 bool ok; \
152 ENTER; save_re_context(); \
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
156/* Doesn't do an assert to verify that is correct */
157#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
158 if (!CAT2(PL_utf8_,class)) { \
159 bool throw_away; \
160 PERL_UNUSED_VAR(throw_away); \
161 ENTER; save_re_context(); \
162 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
163 PERL_UNUSED_VAR(throw_away); \
164 LEAVE; } } STMT_END
165
166#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
167#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
168
169#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
170 /* No asserts are done for some of these, in case called on a */ \
171 /* Unicode version in which they map to nothing */ \
172 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
173 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
174
175#define PLACEHOLDER /* Something for the preprocessor to grab onto */
176
177/* The actual code for CCC_TRY, which uses several variables from the routine
178 * it's callable from. It is designed to be the bulk of a case statement.
179 * FUNC is the macro or function to call on non-utf8 targets that indicate if
180 * nextchr matches the class.
181 * UTF8_TEST is the whole test string to use for utf8 targets
182 * LOAD is what to use to test, and if not present to load in the swash for the
183 * class
184 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
185 * UTF8_TEST test.
186 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
187 * utf8 and a variant, load the swash if necessary and test using the utf8
188 * test. Advance to the next character if test is ok, otherwise fail; If not
189 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
190 * fails, or advance to the next character */
191
192#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
193 if (NEXTCHR_IS_EOS) { \
194 sayNO; \
195 } \
196 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
197 LOAD_UTF8_CHARCLASS(CLASS, STR); \
198 if (POS_OR_NEG (UTF8_TEST)) { \
199 sayNO; \
200 } \
201 } \
202 else if (POS_OR_NEG (FUNC(nextchr))) { \
203 sayNO; \
204 } \
205 goto increment_locinput;
206
207/* Handle the non-locale cases for a character class and its complement. It
208 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
209 * This is because that code fails when the test succeeds, so we want to have
210 * the test fail so that the code succeeds. The swash is stored in a
211 * predictable PL_ place */
212#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
213 CLASS, STR) \
214 case NAME: \
215 _CCC_TRY_CODE( !, FUNC, \
216 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
217 (U8*)locinput, TRUE)), \
218 CLASS, STR) \
219 case NNAME: \
220 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
221 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
222 (U8*)locinput, TRUE)), \
223 CLASS, STR)
224/* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
233 * irrelevant here */
234#define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
236 NAMEA, NNAMEA, FUNCA, \
237 CLASS, STR) \
238 case NAMEL: \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
241 case NNAMEL: \
242 PL_reg_flags |= RF_tainted; \
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
244 CLASS, STR) \
245 case NAMEA: \
246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
247 sayNO; \
248 } \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
250 locinput++; \
251 break; \
252 case NNAMEA: \
253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
254 sayNO; \
255 } \
256 goto increment_locinput; \
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
259
260/* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262#define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
265 NAMEA, NNAMEA, FUNCA, \
266 CLASS, STR) \
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
270 CLASS, STR) \
271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
272
273/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
274
275/* for use after a quantifier and before an EXACT-like node -- japhy */
276/* it would be nice to rework regcomp.sym to generate this stuff. sigh
277 *
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
284*/
285#define JUMPABLE(rn) ( \
286 OP(rn) == OPEN || \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
288 OP(rn) == EVAL || \
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
291 OP(rn) == KEEPS || \
292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
293)
294#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
295
296#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
297
298#if 0
299/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
302#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
304
305#else
306/* ... so we use this as its faster. */
307#define IS_TEXT(rn) ( OP(rn)==EXACT )
308#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
309#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
311
312#endif
313
314/*
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
317*/
318#define FIND_NEXT_IMPT(rn) STMT_START { \
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
322 rn = NEXTOPER(NEXTOPER(rn)); \
323 else if (type == PLUS) \
324 rn = NEXTOPER(rn); \
325 else if (type == IFMATCH) \
326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
327 else rn += NEXT_OFF(rn); \
328 } \
329} STMT_END
330
331/* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
332 * These are for the pre-composed Hangul syllables, which are all in a
333 * contiguous block and arranged there in such a way so as to facilitate
334 * alorithmic determination of their characteristics. As such, they don't need
335 * a swash, but can be determined by simple arithmetic. Almost all are
336 * GCB=LVT, but every 28th one is a GCB=LV */
337#define SBASE 0xAC00 /* Start of block */
338#define SCount 11172 /* Length of block */
339#define TCount 28
340
341static void restore_pos(pTHX_ void *arg);
342
343#define REGCP_PAREN_ELEMS 3
344#define REGCP_OTHER_ELEMS 3
345#define REGCP_FRAME_ELEMS 1
346/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
347 * are needed for the regexp context stack bookkeeping. */
348
349STATIC CHECKPOINT
350S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
351{
352 dVAR;
353 const int retval = PL_savestack_ix;
354 const int paren_elems_to_push =
355 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
356 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
357 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
358 I32 p;
359 GET_RE_DEBUG_FLAGS_DECL;
360
361 PERL_ARGS_ASSERT_REGCPPUSH;
362
363 if (paren_elems_to_push < 0)
364 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
365 paren_elems_to_push);
366
367 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
368 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
369 " out of range (%lu-%ld)",
370 total_elems,
371 (unsigned long)maxopenparen,
372 (long)parenfloor);
373
374 SSGROW(total_elems + REGCP_FRAME_ELEMS);
375
376 DEBUG_BUFFERS_r(
377 if ((int)maxopenparen > (int)parenfloor)
378 PerlIO_printf(Perl_debug_log,
379 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
380 PTR2UV(rex),
381 PTR2UV(rex->offs)
382 );
383 );
384 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
385/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
386 SSPUSHINT(rex->offs[p].end);
387 SSPUSHINT(rex->offs[p].start);
388 SSPUSHINT(rex->offs[p].start_tmp);
389 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
390 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
391 (UV)p,
392 (IV)rex->offs[p].start,
393 (IV)rex->offs[p].start_tmp,
394 (IV)rex->offs[p].end
395 ));
396 }
397/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
398 SSPUSHINT(maxopenparen);
399 SSPUSHINT(rex->lastparen);
400 SSPUSHINT(rex->lastcloseparen);
401 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
402
403 return retval;
404}
405
406/* These are needed since we do not localize EVAL nodes: */
407#define REGCP_SET(cp) \
408 DEBUG_STATE_r( \
409 PerlIO_printf(Perl_debug_log, \
410 " Setting an EVAL scope, savestack=%"IVdf"\n", \
411 (IV)PL_savestack_ix)); \
412 cp = PL_savestack_ix
413
414#define REGCP_UNWIND(cp) \
415 DEBUG_STATE_r( \
416 if (cp != PL_savestack_ix) \
417 PerlIO_printf(Perl_debug_log, \
418 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
419 (IV)(cp), (IV)PL_savestack_ix)); \
420 regcpblow(cp)
421
422#define UNWIND_PAREN(lp, lcp) \
423 for (n = rex->lastparen; n > lp; n--) \
424 rex->offs[n].end = -1; \
425 rex->lastparen = n; \
426 rex->lastcloseparen = lcp;
427
428
429STATIC void
430S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
431{
432 dVAR;
433 UV i;
434 U32 paren;
435 GET_RE_DEBUG_FLAGS_DECL;
436
437 PERL_ARGS_ASSERT_REGCPPOP;
438
439 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
440 i = SSPOPUV;
441 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
442 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
443 rex->lastcloseparen = SSPOPINT;
444 rex->lastparen = SSPOPINT;
445 *maxopenparen_p = SSPOPINT;
446
447 i -= REGCP_OTHER_ELEMS;
448 /* Now restore the parentheses context. */
449 DEBUG_BUFFERS_r(
450 if (i || rex->lastparen + 1 <= rex->nparens)
451 PerlIO_printf(Perl_debug_log,
452 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
453 PTR2UV(rex),
454 PTR2UV(rex->offs)
455 );
456 );
457 paren = *maxopenparen_p;
458 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
459 I32 tmps;
460 rex->offs[paren].start_tmp = SSPOPINT;
461 rex->offs[paren].start = SSPOPINT;
462 tmps = SSPOPINT;
463 if (paren <= rex->lastparen)
464 rex->offs[paren].end = tmps;
465 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
466 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
467 (UV)paren,
468 (IV)rex->offs[paren].start,
469 (IV)rex->offs[paren].start_tmp,
470 (IV)rex->offs[paren].end,
471 (paren > rex->lastparen ? "(skipped)" : ""));
472 );
473 paren--;
474 }
475#if 1
476 /* It would seem that the similar code in regtry()
477 * already takes care of this, and in fact it is in
478 * a better location to since this code can #if 0-ed out
479 * but the code in regtry() is needed or otherwise tests
480 * requiring null fields (pat.t#187 and split.t#{13,14}
481 * (as of patchlevel 7877) will fail. Then again,
482 * this code seems to be necessary or otherwise
483 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
484 * --jhi updated by dapm */
485 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
486 if (i > *maxopenparen_p)
487 rex->offs[i].start = -1;
488 rex->offs[i].end = -1;
489 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
490 " \\%"UVuf": %s ..-1 undeffing\n",
491 (UV)i,
492 (i > *maxopenparen_p) ? "-1" : " "
493 ));
494 }
495#endif
496}
497
498/* restore the parens and associated vars at savestack position ix,
499 * but without popping the stack */
500
501STATIC void
502S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
503{
504 I32 tmpix = PL_savestack_ix;
505 PL_savestack_ix = ix;
506 regcppop(rex, maxopenparen_p);
507 PL_savestack_ix = tmpix;
508}
509
510#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
511
512STATIC bool
513S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
514{
515 /* Returns a boolean as to whether or not 'character' is a member of the
516 * Posix character class given by 'classnum' that should be equivalent to a
517 * value in the typedef '_char_class_number'.
518 *
519 * Ideally this could be replaced by a just an array of function pointers
520 * to the C library functions that implement the macros this calls.
521 * However, to compile, the precise function signatures are required, and
522 * these may vary from platform to to platform. To avoid having to figure
523 * out what those all are on each platform, I (khw) am using this method,
524 * which adds an extra layer of function call overhead. But we don't
525 * particularly care about performance with locales anyway. */
526
527 switch ((_char_class_number) classnum) {
528 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
529 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
530 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
531 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
532 case _CC_ENUM_LOWER: return isLOWER_LC(character);
533 case _CC_ENUM_PRINT: return isPRINT_LC(character);
534 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
535 case _CC_ENUM_UPPER: return isUPPER_LC(character);
536 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
537 case _CC_ENUM_SPACE: return isSPACE_LC(character);
538 case _CC_ENUM_BLANK: return isBLANK_LC(character);
539 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
540 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
541 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
542 case _CC_ENUM_ASCII: return isASCII_LC(character);
543 default: /* VERTSPACE should never occur in locales */
544 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
545 }
546
547 assert(0); /* NOTREACHED */
548 return FALSE;
549}
550
551/*
552 * pregexec and friends
553 */
554
555#ifndef PERL_IN_XSUB_RE
556/*
557 - pregexec - match a regexp against a string
558 */
559I32
560Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
561 char *strbeg, I32 minend, SV *screamer, U32 nosave)
562/* stringarg: the point in the string at which to begin matching */
563/* strend: pointer to null at end of string */
564/* strbeg: real beginning of string */
565/* minend: end of match must be >= minend bytes after stringarg. */
566/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
567 * itself is accessed via the pointers above */
568/* nosave: For optimizations. */
569{
570 PERL_ARGS_ASSERT_PREGEXEC;
571
572 return
573 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
574 nosave ? 0 : REXEC_COPY_STR);
575}
576#endif
577
578/*
579 * Need to implement the following flags for reg_anch:
580 *
581 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
582 * USE_INTUIT_ML
583 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
584 * INTUIT_AUTORITATIVE_ML
585 * INTUIT_ONCE_NOML - Intuit can match in one location only.
586 * INTUIT_ONCE_ML
587 *
588 * Another flag for this function: SECOND_TIME (so that float substrs
589 * with giant delta may be not rechecked).
590 */
591
592/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
593
594/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
595 Otherwise, only SvCUR(sv) is used to get strbeg. */
596
597/* XXXX We assume that strpos is strbeg unless sv. */
598
599/* XXXX Some places assume that there is a fixed substring.
600 An update may be needed if optimizer marks as "INTUITable"
601 RExen without fixed substrings. Similarly, it is assumed that
602 lengths of all the strings are no more than minlen, thus they
603 cannot come from lookahead.
604 (Or minlen should take into account lookahead.)
605 NOTE: Some of this comment is not correct. minlen does now take account
606 of lookahead/behind. Further research is required. -- demerphq
607
608*/
609
610/* A failure to find a constant substring means that there is no need to make
611 an expensive call to REx engine, thus we celebrate a failure. Similarly,
612 finding a substring too deep into the string means that less calls to
613 regtry() should be needed.
614
615 REx compiler's optimizer found 4 possible hints:
616 a) Anchored substring;
617 b) Fixed substring;
618 c) Whether we are anchored (beginning-of-line or \G);
619 d) First node (of those at offset 0) which may distinguish positions;
620 We use a)b)d) and multiline-part of c), and try to find a position in the
621 string which does not contradict any of them.
622 */
623
624/* Most of decisions we do here should have been done at compile time.
625 The nodes of the REx which we used for the search should have been
626 deleted from the finite automaton. */
627
628char *
629Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
630 char *strend, const U32 flags, re_scream_pos_data *data)
631{
632 dVAR;
633 struct regexp *const prog = ReANY(rx);
634 I32 start_shift = 0;
635 /* Should be nonnegative! */
636 I32 end_shift = 0;
637 char *s;
638 SV *check;
639 char *strbeg;
640 char *t;
641 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
642 I32 ml_anch;
643 char *other_last = NULL; /* other substr checked before this */
644 char *check_at = NULL; /* check substr found at this pos */
645 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
646 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
647 RXi_GET_DECL(prog,progi);
648#ifdef DEBUGGING
649 const char * const i_strpos = strpos;
650#endif
651 GET_RE_DEBUG_FLAGS_DECL;
652
653 PERL_ARGS_ASSERT_RE_INTUIT_START;
654 PERL_UNUSED_ARG(flags);
655 PERL_UNUSED_ARG(data);
656
657 RX_MATCH_UTF8_set(rx,utf8_target);
658
659 if (RX_UTF8(rx)) {
660 PL_reg_flags |= RF_utf8;
661 }
662 DEBUG_EXECUTE_r(
663 debug_start_match(rx, utf8_target, strpos, strend,
664 sv ? "Guessing start of match in sv for"
665 : "Guessing start of match in string for");
666 );
667
668 /* CHR_DIST() would be more correct here but it makes things slow. */
669 if (prog->minlen > strend - strpos) {
670 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
671 "String too short... [re_intuit_start]\n"));
672 goto fail;
673 }
674
675 /* XXX we need to pass strbeg as a separate arg: the following is
676 * guesswork and can be wrong... */
677 if (sv && SvPOK(sv)) {
678 char * p = SvPVX(sv);
679 STRLEN cur = SvCUR(sv);
680 if (p <= strpos && strpos < p + cur) {
681 strbeg = p;
682 assert(p <= strend && strend <= p + cur);
683 }
684 else
685 strbeg = strend - cur;
686 }
687 else
688 strbeg = strpos;
689
690 PL_regeol = strend;
691 if (utf8_target) {
692 if (!prog->check_utf8 && prog->check_substr)
693 to_utf8_substr(prog);
694 check = prog->check_utf8;
695 } else {
696 if (!prog->check_substr && prog->check_utf8) {
697 if (! to_byte_substr(prog)) {
698 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
699 }
700 }
701 check = prog->check_substr;
702 }
703 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
704 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
705 || ( (prog->extflags & RXf_ANCH_BOL)
706 && !multiline ) ); /* Check after \n? */
707
708 if (!ml_anch) {
709 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
710 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
711 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
712 && sv && !SvROK(sv)
713 && (strpos != strbeg)) {
714 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
715 goto fail;
716 }
717 if (prog->check_offset_min == prog->check_offset_max
718 && !(prog->extflags & RXf_CANY_SEEN)
719 && ! multiline) /* /m can cause \n's to match that aren't
720 accounted for in the string max length.
721 See [perl #115242] */
722 {
723 /* Substring at constant offset from beg-of-str... */
724 I32 slen;
725
726 s = HOP3c(strpos, prog->check_offset_min, strend);
727
728 if (SvTAIL(check)) {
729 slen = SvCUR(check); /* >= 1 */
730
731 if ( strend - s > slen || strend - s < slen - 1
732 || (strend - s == slen && strend[-1] != '\n')) {
733 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
734 goto fail_finish;
735 }
736 /* Now should match s[0..slen-2] */
737 slen--;
738 if (slen && (*SvPVX_const(check) != *s
739 || (slen > 1
740 && memNE(SvPVX_const(check), s, slen)))) {
741 report_neq:
742 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
743 goto fail_finish;
744 }
745 }
746 else if (*SvPVX_const(check) != *s
747 || ((slen = SvCUR(check)) > 1
748 && memNE(SvPVX_const(check), s, slen)))
749 goto report_neq;
750 check_at = s;
751 goto success_at_start;
752 }
753 }
754 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
755 s = strpos;
756 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
757 end_shift = prog->check_end_shift;
758
759 if (!ml_anch) {
760 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
761 - (SvTAIL(check) != 0);
762 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
763
764 if (end_shift < eshift)
765 end_shift = eshift;
766 }
767 }
768 else { /* Can match at random position */
769 ml_anch = 0;
770 s = strpos;
771 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
772 end_shift = prog->check_end_shift;
773
774 /* end shift should be non negative here */
775 }
776
777#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
778 if (end_shift < 0)
779 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
780 (IV)end_shift, RX_PRECOMP(prog));
781#endif
782
783 restart:
784 /* Find a possible match in the region s..strend by looking for
785 the "check" substring in the region corrected by start/end_shift. */
786
787 {
788 I32 srch_start_shift = start_shift;
789 I32 srch_end_shift = end_shift;
790 U8* start_point;
791 U8* end_point;
792 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
793 srch_end_shift -= ((strbeg - s) - srch_start_shift);
794 srch_start_shift = strbeg - s;
795 }
796 DEBUG_OPTIMISE_MORE_r({
797 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
798 (IV)prog->check_offset_min,
799 (IV)srch_start_shift,
800 (IV)srch_end_shift,
801 (IV)prog->check_end_shift);
802 });
803
804 if (prog->extflags & RXf_CANY_SEEN) {
805 start_point= (U8*)(s + srch_start_shift);
806 end_point= (U8*)(strend - srch_end_shift);
807 } else {
808 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
809 end_point= HOP3(strend, -srch_end_shift, strbeg);
810 }
811 DEBUG_OPTIMISE_MORE_r({
812 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
813 (int)(end_point - start_point),
814 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
815 start_point);
816 });
817
818 s = fbm_instr( start_point, end_point,
819 check, multiline ? FBMrf_MULTILINE : 0);
820 }
821 /* Update the count-of-usability, remove useless subpatterns,
822 unshift s. */
823
824 DEBUG_EXECUTE_r({
825 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
826 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
827 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
828 (s ? "Found" : "Did not find"),
829 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
830 ? "anchored" : "floating"),
831 quoted,
832 RE_SV_TAIL(check),
833 (s ? " at offset " : "...\n") );
834 });
835
836 if (!s)
837 goto fail_finish;
838 /* Finish the diagnostic message */
839 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
840
841 /* XXX dmq: first branch is for positive lookbehind...
842 Our check string is offset from the beginning of the pattern.
843 So we need to do any stclass tests offset forward from that
844 point. I think. :-(
845 */
846
847
848
849 check_at=s;
850
851
852 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
853 Start with the other substr.
854 XXXX no SCREAM optimization yet - and a very coarse implementation
855 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
856 *always* match. Probably should be marked during compile...
857 Probably it is right to do no SCREAM here...
858 */
859
860 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
861 : (prog->float_substr && prog->anchored_substr))
862 {
863 /* Take into account the "other" substring. */
864 /* XXXX May be hopelessly wrong for UTF... */
865 if (!other_last)
866 other_last = strpos;
867 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
868 do_other_anchored:
869 {
870 char * const last = HOP3c(s, -start_shift, strbeg);
871 char *last1, *last2;
872 char * const saved_s = s;
873 SV* must;
874
875 t = s - prog->check_offset_max;
876 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
877 && (!utf8_target
878 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
879 && t > strpos)))
880 NOOP;
881 else
882 t = strpos;
883 t = HOP3c(t, prog->anchored_offset, strend);
884 if (t < other_last) /* These positions already checked */
885 t = other_last;
886 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
887 if (last < last1)
888 last1 = last;
889 /* XXXX It is not documented what units *_offsets are in.
890 We assume bytes, but this is clearly wrong.
891 Meaning this code needs to be carefully reviewed for errors.
892 dmq.
893 */
894
895 /* On end-of-str: see comment below. */
896 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
897 if (must == &PL_sv_undef) {
898 s = (char*)NULL;
899 DEBUG_r(must = prog->anchored_utf8); /* for debug */
900 }
901 else
902 s = fbm_instr(
903 (unsigned char*)t,
904 HOP3(HOP3(last1, prog->anchored_offset, strend)
905 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
906 must,
907 multiline ? FBMrf_MULTILINE : 0
908 );
909 DEBUG_EXECUTE_r({
910 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
911 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
912 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
913 (s ? "Found" : "Contradicts"),
914 quoted, RE_SV_TAIL(must));
915 });
916
917
918 if (!s) {
919 if (last1 >= last2) {
920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
921 ", giving up...\n"));
922 goto fail_finish;
923 }
924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
925 ", trying floating at offset %ld...\n",
926 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
927 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
928 s = HOP3c(last, 1, strend);
929 goto restart;
930 }
931 else {
932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
933 (long)(s - i_strpos)));
934 t = HOP3c(s, -prog->anchored_offset, strbeg);
935 other_last = HOP3c(s, 1, strend);
936 s = saved_s;
937 if (t == strpos)
938 goto try_at_start;
939 goto try_at_offset;
940 }
941 }
942 }
943 else { /* Take into account the floating substring. */
944 char *last, *last1;
945 char * const saved_s = s;
946 SV* must;
947
948 t = HOP3c(s, -start_shift, strbeg);
949 last1 = last =
950 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
951 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
952 last = HOP3c(t, prog->float_max_offset, strend);
953 s = HOP3c(t, prog->float_min_offset, strend);
954 if (s < other_last)
955 s = other_last;
956 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
957 must = utf8_target ? prog->float_utf8 : prog->float_substr;
958 /* fbm_instr() takes into account exact value of end-of-str
959 if the check is SvTAIL(ed). Since false positives are OK,
960 and end-of-str is not later than strend we are OK. */
961 if (must == &PL_sv_undef) {
962 s = (char*)NULL;
963 DEBUG_r(must = prog->float_utf8); /* for debug message */
964 }
965 else
966 s = fbm_instr((unsigned char*)s,
967 (unsigned char*)last + SvCUR(must)
968 - (SvTAIL(must)!=0),
969 must, multiline ? FBMrf_MULTILINE : 0);
970 DEBUG_EXECUTE_r({
971 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
972 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
973 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
974 (s ? "Found" : "Contradicts"),
975 quoted, RE_SV_TAIL(must));
976 });
977 if (!s) {
978 if (last1 == last) {
979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
980 ", giving up...\n"));
981 goto fail_finish;
982 }
983 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
984 ", trying anchored starting at offset %ld...\n",
985 (long)(saved_s + 1 - i_strpos)));
986 other_last = last;
987 s = HOP3c(t, 1, strend);
988 goto restart;
989 }
990 else {
991 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
992 (long)(s - i_strpos)));
993 other_last = s; /* Fix this later. --Hugo */
994 s = saved_s;
995 if (t == strpos)
996 goto try_at_start;
997 goto try_at_offset;
998 }
999 }
1000 }
1001
1002
1003 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
1004
1005 DEBUG_OPTIMISE_MORE_r(
1006 PerlIO_printf(Perl_debug_log,
1007 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
1008 (IV)prog->check_offset_min,
1009 (IV)prog->check_offset_max,
1010 (IV)(s-strpos),
1011 (IV)(t-strpos),
1012 (IV)(t-s),
1013 (IV)(strend-strpos)
1014 )
1015 );
1016
1017 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
1018 && (!utf8_target
1019 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
1020 && t > strpos)))
1021 {
1022 /* Fixed substring is found far enough so that the match
1023 cannot start at strpos. */
1024 try_at_offset:
1025 if (ml_anch && t[-1] != '\n') {
1026 /* Eventually fbm_*() should handle this, but often
1027 anchored_offset is not 0, so this check will not be wasted. */
1028 /* XXXX In the code below we prefer to look for "^" even in
1029 presence of anchored substrings. And we search even
1030 beyond the found float position. These pessimizations
1031 are historical artefacts only. */
1032 find_anchor:
1033 while (t < strend - prog->minlen) {
1034 if (*t == '\n') {
1035 if (t < check_at - prog->check_offset_min) {
1036 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1037 /* Since we moved from the found position,
1038 we definitely contradict the found anchored
1039 substr. Due to the above check we do not
1040 contradict "check" substr.
1041 Thus we can arrive here only if check substr
1042 is float. Redo checking for "other"=="fixed".
1043 */
1044 strpos = t + 1;
1045 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1046 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
1047 goto do_other_anchored;
1048 }
1049 /* We don't contradict the found floating substring. */
1050 /* XXXX Why not check for STCLASS? */
1051 s = t + 1;
1052 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1053 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1054 goto set_useful;
1055 }
1056 /* Position contradicts check-string */
1057 /* XXXX probably better to look for check-string
1058 than for "\n", so one should lower the limit for t? */
1059 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1060 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1061 other_last = strpos = s = t + 1;
1062 goto restart;
1063 }
1064 t++;
1065 }
1066 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1067 PL_colors[0], PL_colors[1]));
1068 goto fail_finish;
1069 }
1070 else {
1071 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1072 PL_colors[0], PL_colors[1]));
1073 }
1074 s = t;
1075 set_useful:
1076 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1077 }
1078 else {
1079 /* The found string does not prohibit matching at strpos,
1080 - no optimization of calling REx engine can be performed,
1081 unless it was an MBOL and we are not after MBOL,
1082 or a future STCLASS check will fail this. */
1083 try_at_start:
1084 /* Even in this situation we may use MBOL flag if strpos is offset
1085 wrt the start of the string. */
1086 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1087 && (strpos != strbeg) && strpos[-1] != '\n'
1088 /* May be due to an implicit anchor of m{.*foo} */
1089 && !(prog->intflags & PREGf_IMPLICIT))
1090 {
1091 t = strpos;
1092 goto find_anchor;
1093 }
1094 DEBUG_EXECUTE_r( if (ml_anch)
1095 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1096 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1097 );
1098 success_at_start:
1099 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1100 && (utf8_target ? (
1101 prog->check_utf8 /* Could be deleted already */
1102 && --BmUSEFUL(prog->check_utf8) < 0
1103 && (prog->check_utf8 == prog->float_utf8)
1104 ) : (
1105 prog->check_substr /* Could be deleted already */
1106 && --BmUSEFUL(prog->check_substr) < 0
1107 && (prog->check_substr == prog->float_substr)
1108 )))
1109 {
1110 /* If flags & SOMETHING - do not do it many times on the same match */
1111 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1112 /* XXX Does the destruction order has to change with utf8_target? */
1113 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1114 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1115 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1116 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1117 check = NULL; /* abort */
1118 s = strpos;
1119 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1120 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1121 if (prog->intflags & PREGf_IMPLICIT)
1122 prog->extflags &= ~RXf_ANCH_MBOL;
1123 /* XXXX This is a remnant of the old implementation. It
1124 looks wasteful, since now INTUIT can use many
1125 other heuristics. */
1126 prog->extflags &= ~RXf_USE_INTUIT;
1127 /* XXXX What other flags might need to be cleared in this branch? */
1128 }
1129 else
1130 s = strpos;
1131 }
1132
1133 /* Last resort... */
1134 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1135 /* trie stclasses are too expensive to use here, we are better off to
1136 leave it to regmatch itself */
1137 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1138 /* minlen == 0 is possible if regstclass is \b or \B,
1139 and the fixed substr is ''$.
1140 Since minlen is already taken into account, s+1 is before strend;
1141 accidentally, minlen >= 1 guaranties no false positives at s + 1
1142 even for \b or \B. But (minlen? 1 : 0) below assumes that
1143 regstclass does not come from lookahead... */
1144 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1145 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1146 const U8* const str = (U8*)STRING(progi->regstclass);
1147 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1148 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1149 : 1);
1150 char * endpos;
1151 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1152 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1153 else if (prog->float_substr || prog->float_utf8)
1154 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1155 else
1156 endpos= strend;
1157
1158 if (checked_upto < s)
1159 checked_upto = s;
1160 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1161 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1162
1163 t = s;
1164 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1165 if (s) {
1166 checked_upto = s;
1167 } else {
1168#ifdef DEBUGGING
1169 const char *what = NULL;
1170#endif
1171 if (endpos == strend) {
1172 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1173 "Could not match STCLASS...\n") );
1174 goto fail;
1175 }
1176 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1177 "This position contradicts STCLASS...\n") );
1178 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1179 goto fail;
1180 checked_upto = HOPBACKc(endpos, start_shift);
1181 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1182 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1183 /* Contradict one of substrings */
1184 if (prog->anchored_substr || prog->anchored_utf8) {
1185 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1186 DEBUG_EXECUTE_r( what = "anchored" );
1187 hop_and_restart:
1188 s = HOP3c(t, 1, strend);
1189 if (s + start_shift + end_shift > strend) {
1190 /* XXXX Should be taken into account earlier? */
1191 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1192 "Could not match STCLASS...\n") );
1193 goto fail;
1194 }
1195 if (!check)
1196 goto giveup;
1197 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1198 "Looking for %s substr starting at offset %ld...\n",
1199 what, (long)(s + start_shift - i_strpos)) );
1200 goto restart;
1201 }
1202 /* Have both, check_string is floating */
1203 if (t + start_shift >= check_at) /* Contradicts floating=check */
1204 goto retry_floating_check;
1205 /* Recheck anchored substring, but not floating... */
1206 s = check_at;
1207 if (!check)
1208 goto giveup;
1209 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1210 "Looking for anchored substr starting at offset %ld...\n",
1211 (long)(other_last - i_strpos)) );
1212 goto do_other_anchored;
1213 }
1214 /* Another way we could have checked stclass at the
1215 current position only: */
1216 if (ml_anch) {
1217 s = t = t + 1;
1218 if (!check)
1219 goto giveup;
1220 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1221 "Looking for /%s^%s/m starting at offset %ld...\n",
1222 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1223 goto try_at_offset;
1224 }
1225 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1226 goto fail;
1227 /* Check is floating substring. */
1228 retry_floating_check:
1229 t = check_at - start_shift;
1230 DEBUG_EXECUTE_r( what = "floating" );
1231 goto hop_and_restart;
1232 }
1233 if (t != s) {
1234 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1235 "By STCLASS: moving %ld --> %ld\n",
1236 (long)(t - i_strpos), (long)(s - i_strpos))
1237 );
1238 }
1239 else {
1240 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1241 "Does not contradict STCLASS...\n");
1242 );
1243 }
1244 }
1245 giveup:
1246 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1247 PL_colors[4], (check ? "Guessed" : "Giving up"),
1248 PL_colors[5], (long)(s - i_strpos)) );
1249 return s;
1250
1251 fail_finish: /* Substring not found */
1252 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1253 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1254 fail:
1255 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1256 PL_colors[4], PL_colors[5]));
1257 return NULL;
1258}
1259
1260#define DECL_TRIE_TYPE(scan) \
1261 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1262 trie_type = ((scan->flags == EXACT) \
1263 ? (utf8_target ? trie_utf8 : trie_plain) \
1264 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1265
1266#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1267uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1268 STRLEN skiplen; \
1269 switch (trie_type) { \
1270 case trie_utf8_fold: \
1271 if ( foldlen>0 ) { \
1272 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1273 foldlen -= len; \
1274 uscan += len; \
1275 len=0; \
1276 } else { \
1277 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1278 len = UTF8SKIP(uc); \
1279 skiplen = UNISKIP( uvc ); \
1280 foldlen -= skiplen; \
1281 uscan = foldbuf + skiplen; \
1282 } \
1283 break; \
1284 case trie_latin_utf8_fold: \
1285 if ( foldlen>0 ) { \
1286 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1287 foldlen -= len; \
1288 uscan += len; \
1289 len=0; \
1290 } else { \
1291 len = 1; \
1292 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1293 skiplen = UNISKIP( uvc ); \
1294 foldlen -= skiplen; \
1295 uscan = foldbuf + skiplen; \
1296 } \
1297 break; \
1298 case trie_utf8: \
1299 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1300 break; \
1301 case trie_plain: \
1302 uvc = (UV)*uc; \
1303 len = 1; \
1304 } \
1305 if (uvc < 256) { \
1306 charid = trie->charmap[ uvc ]; \
1307 } \
1308 else { \
1309 charid = 0; \
1310 if (widecharmap) { \
1311 SV** const svpp = hv_fetch(widecharmap, \
1312 (char*)&uvc, sizeof(UV), 0); \
1313 if (svpp) \
1314 charid = (U16)SvIV(*svpp); \
1315 } \
1316 } \
1317} STMT_END
1318
1319#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1320STMT_START { \
1321 while (s <= e) { \
1322 if ( (CoNd) \
1323 && (ln == 1 || folder(s, pat_string, ln)) \
1324 && (!reginfo || regtry(reginfo, &s)) ) \
1325 goto got_it; \
1326 s++; \
1327 } \
1328} STMT_END
1329
1330#define REXEC_FBC_UTF8_SCAN(CoDe) \
1331STMT_START { \
1332 while (s < strend) { \
1333 CoDe \
1334 s += UTF8SKIP(s); \
1335 } \
1336} STMT_END
1337
1338#define REXEC_FBC_SCAN(CoDe) \
1339STMT_START { \
1340 while (s < strend) { \
1341 CoDe \
1342 s++; \
1343 } \
1344} STMT_END
1345
1346#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1347REXEC_FBC_UTF8_SCAN( \
1348 if (CoNd) { \
1349 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1350 goto got_it; \
1351 else \
1352 tmp = doevery; \
1353 } \
1354 else \
1355 tmp = 1; \
1356)
1357
1358#define REXEC_FBC_CLASS_SCAN(CoNd) \
1359REXEC_FBC_SCAN( \
1360 if (CoNd) { \
1361 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1362 goto got_it; \
1363 else \
1364 tmp = doevery; \
1365 } \
1366 else \
1367 tmp = 1; \
1368)
1369
1370#define REXEC_FBC_TRYIT \
1371if ((!reginfo || regtry(reginfo, &s))) \
1372 goto got_it
1373
1374#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1375 if (utf8_target) { \
1376 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1377 } \
1378 else { \
1379 REXEC_FBC_CLASS_SCAN(CoNd); \
1380 }
1381
1382#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1383 if (utf8_target) { \
1384 UtFpReLoAd; \
1385 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1386 } \
1387 else { \
1388 REXEC_FBC_CLASS_SCAN(CoNd); \
1389 }
1390
1391#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1392 PL_reg_flags |= RF_tainted; \
1393 if (utf8_target) { \
1394 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1395 } \
1396 else { \
1397 REXEC_FBC_CLASS_SCAN(CoNd); \
1398 }
1399
1400#define DUMP_EXEC_POS(li,s,doutf8) \
1401 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1402
1403
1404#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1405 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1406 tmp = TEST_NON_UTF8(tmp); \
1407 REXEC_FBC_UTF8_SCAN( \
1408 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1409 tmp = !tmp; \
1410 IF_SUCCESS; \
1411 } \
1412 else { \
1413 IF_FAIL; \
1414 } \
1415 ); \
1416
1417#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1418 if (s == PL_bostr) { \
1419 tmp = '\n'; \
1420 } \
1421 else { \
1422 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1423 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1424 } \
1425 tmp = TeSt1_UtF8; \
1426 LOAD_UTF8_CHARCLASS_ALNUM(); \
1427 REXEC_FBC_UTF8_SCAN( \
1428 if (tmp == ! (TeSt2_UtF8)) { \
1429 tmp = !tmp; \
1430 IF_SUCCESS; \
1431 } \
1432 else { \
1433 IF_FAIL; \
1434 } \
1435 ); \
1436
1437/* The only difference between the BOUND and NBOUND cases is that
1438 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1439 * NBOUND. This is accomplished by passing it in either the if or else clause,
1440 * with the other one being empty */
1441#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1442 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1443
1444#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1445 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1446
1447#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1448 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1449
1450#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1451 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1452
1453
1454/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1455 * be passed in completely with the variable name being tested, which isn't
1456 * such a clean interface, but this is easier to read than it was before. We
1457 * are looking for the boundary (or non-boundary between a word and non-word
1458 * character. The utf8 and non-utf8 cases have the same logic, but the details
1459 * must be different. Find the "wordness" of the character just prior to this
1460 * one, and compare it with the wordness of this one. If they differ, we have
1461 * a boundary. At the beginning of the string, pretend that the previous
1462 * character was a new-line */
1463#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1464 if (utf8_target) { \
1465 UTF8_CODE \
1466 } \
1467 else { /* Not utf8 */ \
1468 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1469 tmp = TEST_NON_UTF8(tmp); \
1470 REXEC_FBC_SCAN( \
1471 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1472 tmp = !tmp; \
1473 IF_SUCCESS; \
1474 } \
1475 else { \
1476 IF_FAIL; \
1477 } \
1478 ); \
1479 } \
1480 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1481 goto got_it;
1482
1483/* We know what class REx starts with. Try to find this position... */
1484/* if reginfo is NULL, its a dryrun */
1485/* annoyingly all the vars in this routine have different names from their counterparts
1486 in regmatch. /grrr */
1487
1488STATIC char *
1489S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1490 const char *strend, regmatch_info *reginfo)
1491{
1492 dVAR;
1493 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1494 char *pat_string; /* The pattern's exactish string */
1495 char *pat_end; /* ptr to end char of pat_string */
1496 re_fold_t folder; /* Function for computing non-utf8 folds */
1497 const U8 *fold_array; /* array for folding ords < 256 */
1498 STRLEN ln;
1499 STRLEN lnc;
1500 STRLEN uskip;
1501 U8 c1;
1502 U8 c2;
1503 char *e;
1504 I32 tmp = 1; /* Scratch variable? */
1505 const bool utf8_target = PL_reg_match_utf8;
1506 UV utf8_fold_flags = 0;
1507 RXi_GET_DECL(prog,progi);
1508
1509 PERL_ARGS_ASSERT_FIND_BYCLASS;
1510
1511 /* We know what class it must start with. */
1512 switch (OP(c)) {
1513 case ANYOF:
1514 if (utf8_target) {
1515 REXEC_FBC_UTF8_CLASS_SCAN(
1516 reginclass(prog, c, (U8*)s, utf8_target));
1517 }
1518 else {
1519 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1520 }
1521 break;
1522 case CANY:
1523 REXEC_FBC_SCAN(
1524 if (tmp && (!reginfo || regtry(reginfo, &s)))
1525 goto got_it;
1526 else
1527 tmp = doevery;
1528 );
1529 break;
1530
1531 case EXACTFA:
1532 if (UTF_PATTERN || utf8_target) {
1533 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1534 goto do_exactf_utf8;
1535 }
1536 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1537 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1538 goto do_exactf_non_utf8; /* isn't dealt with by these */
1539
1540 case EXACTF:
1541 if (utf8_target) {
1542
1543 /* regcomp.c already folded this if pattern is in UTF-8 */
1544 utf8_fold_flags = 0;
1545 goto do_exactf_utf8;
1546 }
1547 fold_array = PL_fold;
1548 folder = foldEQ;
1549 goto do_exactf_non_utf8;
1550
1551 case EXACTFL:
1552 if (UTF_PATTERN || utf8_target) {
1553 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1554 goto do_exactf_utf8;
1555 }
1556 fold_array = PL_fold_locale;
1557 folder = foldEQ_locale;
1558 goto do_exactf_non_utf8;
1559
1560 case EXACTFU_SS:
1561 if (UTF_PATTERN) {
1562 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1563 }
1564 goto do_exactf_utf8;
1565
1566 case EXACTFU_TRICKYFOLD:
1567 case EXACTFU:
1568 if (UTF_PATTERN || utf8_target) {
1569 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1570 goto do_exactf_utf8;
1571 }
1572
1573 /* Any 'ss' in the pattern should have been replaced by regcomp,
1574 * so we don't have to worry here about this single special case
1575 * in the Latin1 range */
1576 fold_array = PL_fold_latin1;
1577 folder = foldEQ_latin1;
1578
1579 /* FALL THROUGH */
1580
1581 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1582 are no glitches with fold-length differences
1583 between the target string and pattern */
1584
1585 /* The idea in the non-utf8 EXACTF* cases is to first find the
1586 * first character of the EXACTF* node and then, if necessary,
1587 * case-insensitively compare the full text of the node. c1 is the
1588 * first character. c2 is its fold. This logic will not work for
1589 * Unicode semantics and the german sharp ss, which hence should
1590 * not be compiled into a node that gets here. */
1591 pat_string = STRING(c);
1592 ln = STR_LEN(c); /* length to match in octets/bytes */
1593
1594 /* We know that we have to match at least 'ln' bytes (which is the
1595 * same as characters, since not utf8). If we have to match 3
1596 * characters, and there are only 2 availabe, we know without
1597 * trying that it will fail; so don't start a match past the
1598 * required minimum number from the far end */
1599 e = HOP3c(strend, -((I32)ln), s);
1600
1601 if (!reginfo && e < s) {
1602 e = s; /* Due to minlen logic of intuit() */
1603 }
1604
1605 c1 = *pat_string;
1606 c2 = fold_array[c1];
1607 if (c1 == c2) { /* If char and fold are the same */
1608 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1609 }
1610 else {
1611 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1612 }
1613 break;
1614
1615 do_exactf_utf8:
1616 {
1617 unsigned expansion;
1618
1619 /* If one of the operands is in utf8, we can't use the simpler folding
1620 * above, due to the fact that many different characters can have the
1621 * same fold, or portion of a fold, or different- length fold */
1622 pat_string = STRING(c);
1623 ln = STR_LEN(c); /* length to match in octets/bytes */
1624 pat_end = pat_string + ln;
1625 lnc = (UTF_PATTERN) /* length to match in characters */
1626 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1627 : ln;
1628
1629 /* We have 'lnc' characters to match in the pattern, but because of
1630 * multi-character folding, each character in the target can match
1631 * up to 3 characters (Unicode guarantees it will never exceed
1632 * this) if it is utf8-encoded; and up to 2 if not (based on the
1633 * fact that the Latin 1 folds are already determined, and the
1634 * only multi-char fold in that range is the sharp-s folding to
1635 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1636 * string character. Adjust lnc accordingly, rounding up, so that
1637 * if we need to match at least 4+1/3 chars, that really is 5. */
1638 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1639 lnc = (lnc + expansion - 1) / expansion;
1640
1641 /* As in the non-UTF8 case, if we have to match 3 characters, and
1642 * only 2 are left, it's guaranteed to fail, so don't start a
1643 * match that would require us to go beyond the end of the string
1644 */
1645 e = HOP3c(strend, -((I32)lnc), s);
1646
1647 if (!reginfo && e < s) {
1648 e = s; /* Due to minlen logic of intuit() */
1649 }
1650
1651 /* XXX Note that we could recalculate e to stop the loop earlier,
1652 * as the worst case expansion above will rarely be met, and as we
1653 * go along we would usually find that e moves further to the left.
1654 * This would happen only after we reached the point in the loop
1655 * where if there were no expansion we should fail. Unclear if
1656 * worth the expense */
1657
1658 while (s <= e) {
1659 char *my_strend= (char *)strend;
1660 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1661 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1662 && (!reginfo || regtry(reginfo, &s)) )
1663 {
1664 goto got_it;
1665 }
1666 s += (utf8_target) ? UTF8SKIP(s) : 1;
1667 }
1668 break;
1669 }
1670 case BOUNDL:
1671 PL_reg_flags |= RF_tainted;
1672 FBC_BOUND(isALNUM_LC,
1673 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1674 isALNUM_LC_utf8((U8*)s));
1675 break;
1676 case NBOUNDL:
1677 PL_reg_flags |= RF_tainted;
1678 FBC_NBOUND(isALNUM_LC,
1679 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1680 isALNUM_LC_utf8((U8*)s));
1681 break;
1682 case BOUND:
1683 FBC_BOUND(isWORDCHAR,
1684 isALNUM_uni(tmp),
1685 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1686 break;
1687 case BOUNDA:
1688 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1689 isWORDCHAR_A(tmp),
1690 isWORDCHAR_A((U8*)s));
1691 break;
1692 case NBOUND:
1693 FBC_NBOUND(isWORDCHAR,
1694 isALNUM_uni(tmp),
1695 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1696 break;
1697 case NBOUNDA:
1698 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1699 isWORDCHAR_A(tmp),
1700 isWORDCHAR_A((U8*)s));
1701 break;
1702 case BOUNDU:
1703 FBC_BOUND(isWORDCHAR_L1,
1704 isALNUM_uni(tmp),
1705 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1706 break;
1707 case NBOUNDU:
1708 FBC_NBOUND(isWORDCHAR_L1,
1709 isALNUM_uni(tmp),
1710 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1711 break;
1712 case ALNUML:
1713 REXEC_FBC_CSCAN_TAINT(
1714 isALNUM_LC_utf8((U8*)s),
1715 isALNUM_LC(*s)
1716 );
1717 break;
1718 case ALNUMU:
1719 REXEC_FBC_CSCAN_PRELOAD(
1720 LOAD_UTF8_CHARCLASS_ALNUM(),
1721 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1722 isWORDCHAR_L1((U8) *s)
1723 );
1724 break;
1725 case ALNUM:
1726 REXEC_FBC_CSCAN_PRELOAD(
1727 LOAD_UTF8_CHARCLASS_ALNUM(),
1728 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1729 isWORDCHAR((U8) *s)
1730 );
1731 break;
1732 case ALNUMA:
1733 /* Don't need to worry about utf8, as it can match only a single
1734 * byte invariant character */
1735 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1736 break;
1737 case NALNUMU:
1738 REXEC_FBC_CSCAN_PRELOAD(
1739 LOAD_UTF8_CHARCLASS_ALNUM(),
1740 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1741 ! isWORDCHAR_L1((U8) *s)
1742 );
1743 break;
1744 case NALNUM:
1745 REXEC_FBC_CSCAN_PRELOAD(
1746 LOAD_UTF8_CHARCLASS_ALNUM(),
1747 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1748 ! isALNUM(*s)
1749 );
1750 break;
1751 case NALNUMA:
1752 REXEC_FBC_CSCAN(
1753 !isWORDCHAR_A(*s),
1754 !isWORDCHAR_A(*s)
1755 );
1756 break;
1757 case NALNUML:
1758 REXEC_FBC_CSCAN_TAINT(
1759 !isALNUM_LC_utf8((U8*)s),
1760 !isALNUM_LC(*s)
1761 );
1762 break;
1763 case SPACEU:
1764 REXEC_FBC_CSCAN(
1765 is_XPERLSPACE_utf8(s),
1766 isSPACE_L1((U8) *s)
1767 );
1768 break;
1769 case SPACE:
1770 REXEC_FBC_CSCAN(
1771 is_XPERLSPACE_utf8(s),
1772 isSPACE((U8) *s)
1773 );
1774 break;
1775 case SPACEA:
1776 /* Don't need to worry about utf8, as it can match only a single
1777 * byte invariant character */
1778 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1779 break;
1780 case SPACEL:
1781 REXEC_FBC_CSCAN_TAINT(
1782 isSPACE_LC_utf8((U8*)s),
1783 isSPACE_LC(*s)
1784 );
1785 break;
1786 case NSPACEU:
1787 REXEC_FBC_CSCAN(
1788 ! is_XPERLSPACE_utf8(s),
1789 ! isSPACE_L1((U8) *s)
1790 );
1791 break;
1792 case NSPACE:
1793 REXEC_FBC_CSCAN(
1794 ! is_XPERLSPACE_utf8(s),
1795 ! isSPACE((U8) *s)
1796 );
1797 break;
1798 case NSPACEA:
1799 REXEC_FBC_CSCAN(
1800 !isSPACE_A(*s),
1801 !isSPACE_A(*s)
1802 );
1803 break;
1804 case NSPACEL:
1805 REXEC_FBC_CSCAN_TAINT(
1806 !isSPACE_LC_utf8((U8*)s),
1807 !isSPACE_LC(*s)
1808 );
1809 break;
1810 case DIGIT:
1811 REXEC_FBC_CSCAN_PRELOAD(
1812 LOAD_UTF8_CHARCLASS_DIGIT(),
1813 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1814 isDIGIT(*s)
1815 );
1816 break;
1817 case DIGITA:
1818 /* Don't need to worry about utf8, as it can match only a single
1819 * byte invariant character */
1820 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1821 break;
1822 case DIGITL:
1823 REXEC_FBC_CSCAN_TAINT(
1824 isDIGIT_LC_utf8((U8*)s),
1825 isDIGIT_LC(*s)
1826 );
1827 break;
1828 case NDIGIT:
1829 REXEC_FBC_CSCAN_PRELOAD(
1830 LOAD_UTF8_CHARCLASS_DIGIT(),
1831 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1832 !isDIGIT(*s)
1833 );
1834 break;
1835 case NDIGITA:
1836 REXEC_FBC_CSCAN(
1837 !isDIGIT_A(*s),
1838 !isDIGIT_A(*s)
1839 );
1840 break;
1841 case NDIGITL:
1842 REXEC_FBC_CSCAN_TAINT(
1843 !isDIGIT_LC_utf8((U8*)s),
1844 !isDIGIT_LC(*s)
1845 );
1846 break;
1847 case LNBREAK:
1848 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1849 is_LNBREAK_latin1_safe(s, strend)
1850 );
1851 break;
1852 case VERTWS:
1853 REXEC_FBC_CSCAN(
1854 is_VERTWS_utf8_safe(s, strend),
1855 is_VERTWS_latin1_safe(s, strend)
1856 );
1857 break;
1858 case NVERTWS:
1859 REXEC_FBC_CSCAN(
1860 !is_VERTWS_utf8_safe(s, strend),
1861 !is_VERTWS_latin1_safe(s, strend)
1862 );
1863 break;
1864 case HORIZWS:
1865 REXEC_FBC_CSCAN(
1866 is_HORIZWS_utf8_safe(s, strend),
1867 is_HORIZWS_latin1_safe(s, strend)
1868 );
1869 break;
1870 case NHORIZWS:
1871 REXEC_FBC_CSCAN(
1872 !is_HORIZWS_utf8_safe(s, strend),
1873 !is_HORIZWS_latin1_safe(s, strend)
1874 );
1875 break;
1876 case POSIXA:
1877 /* Don't need to worry about utf8, as it can match only a single
1878 * byte invariant character. The flag in this node type is the
1879 * class number to pass to _generic_isCC() to build a mask for
1880 * searching in PL_charclass[] */
1881 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1882 break;
1883 case NPOSIXA:
1884 REXEC_FBC_CSCAN(
1885 !_generic_isCC_A(*s, FLAGS(c)),
1886 !_generic_isCC_A(*s, FLAGS(c))
1887 );
1888 break;
1889
1890 case AHOCORASICKC:
1891 case AHOCORASICK:
1892 {
1893 DECL_TRIE_TYPE(c);
1894 /* what trie are we using right now */
1895 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1896 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
1897 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1898
1899 const char *last_start = strend - trie->minlen;
1900#ifdef DEBUGGING
1901 const char *real_start = s;
1902#endif
1903 STRLEN maxlen = trie->maxlen;
1904 SV *sv_points;
1905 U8 **points; /* map of where we were in the input string
1906 when reading a given char. For ASCII this
1907 is unnecessary overhead as the relationship
1908 is always 1:1, but for Unicode, especially
1909 case folded Unicode this is not true. */
1910 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1911 U8 *bitmap=NULL;
1912
1913
1914 GET_RE_DEBUG_FLAGS_DECL;
1915
1916 /* We can't just allocate points here. We need to wrap it in
1917 * an SV so it gets freed properly if there is a croak while
1918 * running the match */
1919 ENTER;
1920 SAVETMPS;
1921 sv_points=newSV(maxlen * sizeof(U8 *));
1922 SvCUR_set(sv_points,
1923 maxlen * sizeof(U8 *));
1924 SvPOK_on(sv_points);
1925 sv_2mortal(sv_points);
1926 points=(U8**)SvPV_nolen(sv_points );
1927 if ( trie_type != trie_utf8_fold
1928 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1929 {
1930 if (trie->bitmap)
1931 bitmap=(U8*)trie->bitmap;
1932 else
1933 bitmap=(U8*)ANYOF_BITMAP(c);
1934 }
1935 /* this is the Aho-Corasick algorithm modified a touch
1936 to include special handling for long "unknown char" sequences.
1937 The basic idea being that we use AC as long as we are dealing
1938 with a possible matching char, when we encounter an unknown char
1939 (and we have not encountered an accepting state) we scan forward
1940 until we find a legal starting char.
1941 AC matching is basically that of trie matching, except that when
1942 we encounter a failing transition, we fall back to the current
1943 states "fail state", and try the current char again, a process
1944 we repeat until we reach the root state, state 1, or a legal
1945 transition. If we fail on the root state then we can either
1946 terminate if we have reached an accepting state previously, or
1947 restart the entire process from the beginning if we have not.
1948
1949 */
1950 while (s <= last_start) {
1951 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1952 U8 *uc = (U8*)s;
1953 U16 charid = 0;
1954 U32 base = 1;
1955 U32 state = 1;
1956 UV uvc = 0;
1957 STRLEN len = 0;
1958 STRLEN foldlen = 0;
1959 U8 *uscan = (U8*)NULL;
1960 U8 *leftmost = NULL;
1961#ifdef DEBUGGING
1962 U32 accepted_word= 0;
1963#endif
1964 U32 pointpos = 0;
1965
1966 while ( state && uc <= (U8*)strend ) {
1967 int failed=0;
1968 U32 word = aho->states[ state ].wordnum;
1969
1970 if( state==1 ) {
1971 if ( bitmap ) {
1972 DEBUG_TRIE_EXECUTE_r(
1973 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1974 dump_exec_pos( (char *)uc, c, strend, real_start,
1975 (char *)uc, utf8_target );
1976 PerlIO_printf( Perl_debug_log,
1977 " Scanning for legal start char...\n");
1978 }
1979 );
1980 if (utf8_target) {
1981 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1982 uc += UTF8SKIP(uc);
1983 }
1984 } else {
1985 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1986 uc++;
1987 }
1988 }
1989 s= (char *)uc;
1990 }
1991 if (uc >(U8*)last_start) break;
1992 }
1993
1994 if ( word ) {
1995 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1996 if (!leftmost || lpos < leftmost) {
1997 DEBUG_r(accepted_word=word);
1998 leftmost= lpos;
1999 }
2000 if (base==0) break;
2001
2002 }
2003 points[pointpos++ % maxlen]= uc;
2004 if (foldlen || uc < (U8*)strend) {
2005 REXEC_TRIE_READ_CHAR(trie_type, trie,
2006 widecharmap, uc,
2007 uscan, len, uvc, charid, foldlen,
2008 foldbuf, uniflags);
2009 DEBUG_TRIE_EXECUTE_r({
2010 dump_exec_pos( (char *)uc, c, strend,
2011 real_start, s, utf8_target);
2012 PerlIO_printf(Perl_debug_log,
2013 " Charid:%3u CP:%4"UVxf" ",
2014 charid, uvc);
2015 });
2016 }
2017 else {
2018 len = 0;
2019 charid = 0;
2020 }
2021
2022
2023 do {
2024#ifdef DEBUGGING
2025 word = aho->states[ state ].wordnum;
2026#endif
2027 base = aho->states[ state ].trans.base;
2028
2029 DEBUG_TRIE_EXECUTE_r({
2030 if (failed)
2031 dump_exec_pos( (char *)uc, c, strend, real_start,
2032 s, utf8_target );
2033 PerlIO_printf( Perl_debug_log,
2034 "%sState: %4"UVxf", word=%"UVxf,
2035 failed ? " Fail transition to " : "",
2036 (UV)state, (UV)word);
2037 });
2038 if ( base ) {
2039 U32 tmp;
2040 I32 offset;
2041 if (charid &&
2042 ( ((offset = base + charid
2043 - 1 - trie->uniquecharcount)) >= 0)
2044 && ((U32)offset < trie->lasttrans)
2045 && trie->trans[offset].check == state
2046 && (tmp=trie->trans[offset].next))
2047 {
2048 DEBUG_TRIE_EXECUTE_r(
2049 PerlIO_printf( Perl_debug_log," - legal\n"));
2050 state = tmp;
2051 break;
2052 }
2053 else {
2054 DEBUG_TRIE_EXECUTE_r(
2055 PerlIO_printf( Perl_debug_log," - fail\n"));
2056 failed = 1;
2057 state = aho->fail[state];
2058 }
2059 }
2060 else {
2061 /* we must be accepting here */
2062 DEBUG_TRIE_EXECUTE_r(
2063 PerlIO_printf( Perl_debug_log," - accepting\n"));
2064 failed = 1;
2065 break;
2066 }
2067 } while(state);
2068 uc += len;
2069 if (failed) {
2070 if (leftmost)
2071 break;
2072 if (!state) state = 1;
2073 }
2074 }
2075 if ( aho->states[ state ].wordnum ) {
2076 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2077 if (!leftmost || lpos < leftmost) {
2078 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2079 leftmost = lpos;
2080 }
2081 }
2082 if (leftmost) {
2083 s = (char*)leftmost;
2084 DEBUG_TRIE_EXECUTE_r({
2085 PerlIO_printf(
2086 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2087 (UV)accepted_word, (IV)(s - real_start)
2088 );
2089 });
2090 if (!reginfo || regtry(reginfo, &s)) {
2091 FREETMPS;
2092 LEAVE;
2093 goto got_it;
2094 }
2095 s = HOPc(s,1);
2096 DEBUG_TRIE_EXECUTE_r({
2097 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2098 });
2099 } else {
2100 DEBUG_TRIE_EXECUTE_r(
2101 PerlIO_printf( Perl_debug_log,"No match.\n"));
2102 break;
2103 }
2104 }
2105 FREETMPS;
2106 LEAVE;
2107 }
2108 break;
2109 default:
2110 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2111 break;
2112 }
2113 return 0;
2114 got_it:
2115 return s;
2116}
2117
2118
2119/*
2120 - regexec_flags - match a regexp against a string
2121 */
2122I32
2123Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2124 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2125/* stringarg: the point in the string at which to begin matching */
2126/* strend: pointer to null at end of string */
2127/* strbeg: real beginning of string */
2128/* minend: end of match must be >= minend bytes after stringarg. */
2129/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2130 * itself is accessed via the pointers above */
2131/* data: May be used for some additional optimizations.
2132 Currently its only used, with a U32 cast, for transmitting
2133 the ganch offset when doing a /g match. This will change */
2134/* nosave: For optimizations. */
2135
2136{
2137 dVAR;
2138 struct regexp *const prog = ReANY(rx);
2139 char *s;
2140 regnode *c;
2141 char *startpos = stringarg;
2142 I32 minlen; /* must match at least this many chars */
2143 I32 dontbother = 0; /* how many characters not to try at end */
2144 I32 end_shift = 0; /* Same for the end. */ /* CC */
2145 I32 scream_pos = -1; /* Internal iterator of scream. */
2146 char *scream_olds = NULL;
2147 const bool utf8_target = cBOOL(DO_UTF8(sv));
2148 I32 multiline;
2149 RXi_GET_DECL(prog,progi);
2150 regmatch_info reginfo; /* create some info to pass to regtry etc */
2151 regexp_paren_pair *swap = NULL;
2152 GET_RE_DEBUG_FLAGS_DECL;
2153
2154 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2155 PERL_UNUSED_ARG(data);
2156
2157 /* Be paranoid... */
2158 if (prog == NULL || startpos == NULL) {
2159 Perl_croak(aTHX_ "NULL regexp parameter");
2160 return 0;
2161 }
2162
2163 multiline = prog->extflags & RXf_PMf_MULTILINE;
2164 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2165
2166 RX_MATCH_UTF8_set(rx, utf8_target);
2167 DEBUG_EXECUTE_r(
2168 debug_start_match(rx, utf8_target, startpos, strend,
2169 "Matching");
2170 );
2171
2172 minlen = prog->minlen;
2173
2174 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2175 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2176 "String too short [regexec_flags]...\n"));
2177 goto phooey;
2178 }
2179
2180
2181 /* Check validity of program. */
2182 if (UCHARAT(progi->program) != REG_MAGIC) {
2183 Perl_croak(aTHX_ "corrupted regexp program");
2184 }
2185
2186 PL_reg_flags = 0;
2187 PL_reg_state.re_state_eval_setup_done = FALSE;
2188 PL_reg_maxiter = 0;
2189
2190 if (RX_UTF8(rx))
2191 PL_reg_flags |= RF_utf8;
2192
2193 /* Mark beginning of line for ^ and lookbehind. */
2194 reginfo.bol = startpos; /* XXX not used ??? */
2195 PL_bostr = strbeg;
2196 reginfo.sv = sv;
2197
2198 /* Mark end of line for $ (and such) */
2199 PL_regeol = strend;
2200
2201 /* see how far we have to get to not match where we matched before */
2202 reginfo.till = startpos+minend;
2203
2204 /* If there is a "must appear" string, look for it. */
2205 s = startpos;
2206
2207 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2208 MAGIC *mg;
2209 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2210 reginfo.ganch = startpos + prog->gofs;
2211 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2212 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2213 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2214 && SvMAGIC(sv)
2215 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2216 && mg->mg_len >= 0) {
2217 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2218 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2219 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2220
2221 if (prog->extflags & RXf_ANCH_GPOS) {
2222 if (s > reginfo.ganch)
2223 goto phooey;
2224 s = reginfo.ganch - prog->gofs;
2225 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2226 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2227 if (s < strbeg)
2228 goto phooey;
2229 }
2230 }
2231 else if (data) {
2232 reginfo.ganch = strbeg + PTR2UV(data);
2233 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2234 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2235
2236 } else { /* pos() not defined */
2237 reginfo.ganch = strbeg;
2238 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2239 "GPOS: reginfo.ganch = strbeg\n"));
2240 }
2241 }
2242 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2243 /* We have to be careful. If the previous successful match
2244 was from this regex we don't want a subsequent partially
2245 successful match to clobber the old results.
2246 So when we detect this possibility we add a swap buffer
2247 to the re, and switch the buffer each match. If we fail
2248 we switch it back, otherwise we leave it swapped.
2249 */
2250 swap = prog->offs;
2251 /* do we need a save destructor here for eval dies? */
2252 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2253 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2254 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2255 PTR2UV(prog),
2256 PTR2UV(swap),
2257 PTR2UV(prog->offs)
2258 ));
2259 }
2260 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2261 re_scream_pos_data d;
2262
2263 d.scream_olds = &scream_olds;
2264 d.scream_pos = &scream_pos;
2265 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2266 if (!s) {
2267 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2268 goto phooey; /* not present */
2269 }
2270 }
2271
2272
2273
2274 /* Simplest case: anchored match need be tried only once. */
2275 /* [unless only anchor is BOL and multiline is set] */
2276 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2277 if (s == startpos && regtry(&reginfo, &startpos))
2278 goto got_it;
2279 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2280 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2281 {
2282 char *end;
2283
2284 if (minlen)
2285 dontbother = minlen - 1;
2286 end = HOP3c(strend, -dontbother, strbeg) - 1;
2287 /* for multiline we only have to try after newlines */
2288 if (prog->check_substr || prog->check_utf8) {
2289 /* because of the goto we can not easily reuse the macros for bifurcating the
2290 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2291 if (utf8_target) {
2292 if (s == startpos)
2293 goto after_try_utf8;
2294 while (1) {
2295 if (regtry(&reginfo, &s)) {
2296 goto got_it;
2297 }
2298 after_try_utf8:
2299 if (s > end) {
2300 goto phooey;
2301 }
2302 if (prog->extflags & RXf_USE_INTUIT) {
2303 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2304 if (!s) {
2305 goto phooey;
2306 }
2307 }
2308 else {
2309 s += UTF8SKIP(s);
2310 }
2311 }
2312 } /* end search for check string in unicode */
2313 else {
2314 if (s == startpos) {
2315 goto after_try_latin;
2316 }
2317 while (1) {
2318 if (regtry(&reginfo, &s)) {
2319 goto got_it;
2320 }
2321 after_try_latin:
2322 if (s > end) {
2323 goto phooey;
2324 }
2325 if (prog->extflags & RXf_USE_INTUIT) {
2326 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2327 if (!s) {
2328 goto phooey;
2329 }
2330 }
2331 else {
2332 s++;
2333 }
2334 }
2335 } /* end search for check string in latin*/
2336 } /* end search for check string */
2337 else { /* search for newline */
2338 if (s > startpos) {
2339 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2340 s--;
2341 }
2342 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2343 while (s <= end) { /* note it could be possible to match at the end of the string */
2344 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2345 if (regtry(&reginfo, &s))
2346 goto got_it;
2347 }
2348 }
2349 } /* end search for newline */
2350 } /* end anchored/multiline check string search */
2351 goto phooey;
2352 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2353 {
2354 /* the warning about reginfo.ganch being used without initialization
2355 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2356 and we only enter this block when the same bit is set. */
2357 char *tmp_s = reginfo.ganch - prog->gofs;
2358
2359 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2360 goto got_it;
2361 goto phooey;
2362 }
2363
2364 /* Messy cases: unanchored match. */
2365 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2366 /* we have /x+whatever/ */
2367 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2368 char ch;
2369#ifdef DEBUGGING
2370 int did_match = 0;
2371#endif
2372 if (utf8_target) {
2373 if (! prog->anchored_utf8) {
2374 to_utf8_substr(prog);
2375 }
2376 ch = SvPVX_const(prog->anchored_utf8)[0];
2377 REXEC_FBC_SCAN(
2378 if (*s == ch) {
2379 DEBUG_EXECUTE_r( did_match = 1 );
2380 if (regtry(&reginfo, &s)) goto got_it;
2381 s += UTF8SKIP(s);
2382 while (s < strend && *s == ch)
2383 s += UTF8SKIP(s);
2384 }
2385 );
2386
2387 }
2388 else {
2389 if (! prog->anchored_substr) {
2390 if (! to_byte_substr(prog)) {
2391 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2392 }
2393 }
2394 ch = SvPVX_const(prog->anchored_substr)[0];
2395 REXEC_FBC_SCAN(
2396 if (*s == ch) {
2397 DEBUG_EXECUTE_r( did_match = 1 );
2398 if (regtry(&reginfo, &s)) goto got_it;
2399 s++;
2400 while (s < strend && *s == ch)
2401 s++;
2402 }
2403 );
2404 }
2405 DEBUG_EXECUTE_r(if (!did_match)
2406 PerlIO_printf(Perl_debug_log,
2407 "Did not find anchored character...\n")
2408 );
2409 }
2410 else if (prog->anchored_substr != NULL
2411 || prog->anchored_utf8 != NULL
2412 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2413 && prog->float_max_offset < strend - s)) {
2414 SV *must;
2415 I32 back_max;
2416 I32 back_min;
2417 char *last;
2418 char *last1; /* Last position checked before */
2419#ifdef DEBUGGING
2420 int did_match = 0;
2421#endif
2422 if (prog->anchored_substr || prog->anchored_utf8) {
2423 if (utf8_target) {
2424 if (! prog->anchored_utf8) {
2425 to_utf8_substr(prog);
2426 }
2427 must = prog->anchored_utf8;
2428 }
2429 else {
2430 if (! prog->anchored_substr) {
2431 if (! to_byte_substr(prog)) {
2432 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2433 }
2434 }
2435 must = prog->anchored_substr;
2436 }
2437 back_max = back_min = prog->anchored_offset;
2438 } else {
2439 if (utf8_target) {
2440 if (! prog->float_utf8) {
2441 to_utf8_substr(prog);
2442 }
2443 must = prog->float_utf8;
2444 }
2445 else {
2446 if (! prog->float_substr) {
2447 if (! to_byte_substr(prog)) {
2448 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2449 }
2450 }
2451 must = prog->float_substr;
2452 }
2453 back_max = prog->float_max_offset;
2454 back_min = prog->float_min_offset;
2455 }
2456
2457 if (back_min<0) {
2458 last = strend;
2459 } else {
2460 last = HOP3c(strend, /* Cannot start after this */
2461 -(I32)(CHR_SVLEN(must)
2462 - (SvTAIL(must) != 0) + back_min), strbeg);
2463 }
2464 if (s > PL_bostr)
2465 last1 = HOPc(s, -1);
2466 else
2467 last1 = s - 1; /* bogus */
2468
2469 /* XXXX check_substr already used to find "s", can optimize if
2470 check_substr==must. */
2471 scream_pos = -1;
2472 dontbother = end_shift;
2473 strend = HOPc(strend, -dontbother);
2474 while ( (s <= last) &&
2475 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2476 (unsigned char*)strend, must,
2477 multiline ? FBMrf_MULTILINE : 0)) ) {
2478 DEBUG_EXECUTE_r( did_match = 1 );
2479 if (HOPc(s, -back_max) > last1) {
2480 last1 = HOPc(s, -back_min);
2481 s = HOPc(s, -back_max);
2482 }
2483 else {
2484 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2485
2486 last1 = HOPc(s, -back_min);
2487 s = t;
2488 }
2489 if (utf8_target) {
2490 while (s <= last1) {
2491 if (regtry(&reginfo, &s))
2492 goto got_it;
2493 if (s >= last1) {
2494 s++; /* to break out of outer loop */
2495 break;
2496 }
2497 s += UTF8SKIP(s);
2498 }
2499 }
2500 else {
2501 while (s <= last1) {
2502 if (regtry(&reginfo, &s))
2503 goto got_it;
2504 s++;
2505 }
2506 }
2507 }
2508 DEBUG_EXECUTE_r(if (!did_match) {
2509 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2510 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2511 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2512 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2513 ? "anchored" : "floating"),
2514 quoted, RE_SV_TAIL(must));
2515 });
2516 goto phooey;
2517 }
2518 else if ( (c = progi->regstclass) ) {
2519 if (minlen) {
2520 const OPCODE op = OP(progi->regstclass);
2521 /* don't bother with what can't match */
2522 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2523 strend = HOPc(strend, -(minlen - 1));
2524 }
2525 DEBUG_EXECUTE_r({
2526 SV * const prop = sv_newmortal();
2527 regprop(prog, prop, c);
2528 {
2529 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2530 s,strend-s,60);
2531 PerlIO_printf(Perl_debug_log,
2532 "Matching stclass %.*s against %s (%d bytes)\n",
2533 (int)SvCUR(prop), SvPVX_const(prop),
2534 quoted, (int)(strend - s));
2535 }
2536 });
2537 if (find_byclass(prog, c, s, strend, &reginfo))
2538 goto got_it;
2539 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2540 }
2541 else {
2542 dontbother = 0;
2543 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2544 /* Trim the end. */
2545 char *last= NULL;
2546 SV* float_real;
2547 STRLEN len;
2548 const char *little;
2549
2550 if (utf8_target) {
2551 if (! prog->float_utf8) {
2552 to_utf8_substr(prog);
2553 }
2554 float_real = prog->float_utf8;
2555 }
2556 else {
2557 if (! prog->float_substr) {
2558 if (! to_byte_substr(prog)) {
2559 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2560 }
2561 }
2562 float_real = prog->float_substr;
2563 }
2564
2565 little = SvPV_const(float_real, len);
2566 if (SvTAIL(float_real)) {
2567 /* This means that float_real contains an artificial \n on
2568 * the end due to the presence of something like this:
2569 * /foo$/ where we can match both "foo" and "foo\n" at the
2570 * end of the string. So we have to compare the end of the
2571 * string first against the float_real without the \n and
2572 * then against the full float_real with the string. We
2573 * have to watch out for cases where the string might be
2574 * smaller than the float_real or the float_real without
2575 * the \n. */
2576 char *checkpos= strend - len;
2577 DEBUG_OPTIMISE_r(
2578 PerlIO_printf(Perl_debug_log,
2579 "%sChecking for float_real.%s\n",
2580 PL_colors[4], PL_colors[5]));
2581 if (checkpos + 1 < strbeg) {
2582 /* can't match, even if we remove the trailing \n
2583 * string is too short to match */
2584 DEBUG_EXECUTE_r(
2585 PerlIO_printf(Perl_debug_log,
2586 "%sString shorter than required trailing substring, cannot match.%s\n",
2587 PL_colors[4], PL_colors[5]));
2588 goto phooey;
2589 } else if (memEQ(checkpos + 1, little, len - 1)) {
2590 /* can match, the end of the string matches without the
2591 * "\n" */
2592 last = checkpos + 1;
2593 } else if (checkpos < strbeg) {
2594 /* cant match, string is too short when the "\n" is
2595 * included */
2596 DEBUG_EXECUTE_r(
2597 PerlIO_printf(Perl_debug_log,
2598 "%sString does not contain required trailing substring, cannot match.%s\n",
2599 PL_colors[4], PL_colors[5]));
2600 goto phooey;
2601 } else if (!multiline) {
2602 /* non multiline match, so compare with the "\n" at the
2603 * end of the string */
2604 if (memEQ(checkpos, little, len)) {
2605 last= checkpos;
2606 } else {
2607 DEBUG_EXECUTE_r(
2608 PerlIO_printf(Perl_debug_log,
2609 "%sString does not contain required trailing substring, cannot match.%s\n",
2610 PL_colors[4], PL_colors[5]));
2611 goto phooey;
2612 }
2613 } else {
2614 /* multiline match, so we have to search for a place
2615 * where the full string is located */
2616 goto find_last;
2617 }
2618 } else {
2619 find_last:
2620 if (len)
2621 last = rninstr(s, strend, little, little + len);
2622 else
2623 last = strend; /* matching "$" */
2624 }
2625 if (!last) {
2626 /* at one point this block contained a comment which was
2627 * probably incorrect, which said that this was a "should not
2628 * happen" case. Even if it was true when it was written I am
2629 * pretty sure it is not anymore, so I have removed the comment
2630 * and replaced it with this one. Yves */
2631 DEBUG_EXECUTE_r(
2632 PerlIO_printf(Perl_debug_log,
2633 "String does not contain required substring, cannot match.\n"
2634 ));
2635 goto phooey;
2636 }
2637 dontbother = strend - last + prog->float_min_offset;
2638 }
2639 if (minlen && (dontbother < minlen))
2640 dontbother = minlen - 1;
2641 strend -= dontbother; /* this one's always in bytes! */
2642 /* We don't know much -- general case. */
2643 if (utf8_target) {
2644 for (;;) {
2645 if (regtry(&reginfo, &s))
2646 goto got_it;
2647 if (s >= strend)
2648 break;
2649 s += UTF8SKIP(s);
2650 };
2651 }
2652 else {
2653 do {
2654 if (regtry(&reginfo, &s))
2655 goto got_it;
2656 } while (s++ < strend);
2657 }
2658 }
2659
2660 /* Failure. */
2661 goto phooey;
2662
2663got_it:
2664 DEBUG_BUFFERS_r(
2665 if (swap)
2666 PerlIO_printf(Perl_debug_log,
2667 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2668 PTR2UV(prog),
2669 PTR2UV(swap)
2670 );
2671 );
2672 Safefree(swap);
2673 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2674
2675 if (PL_reg_state.re_state_eval_setup_done)
2676 restore_pos(aTHX_ prog);
2677 if (RXp_PAREN_NAMES(prog))
2678 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2679
2680 /* make sure $`, $&, $', and $digit will work later */
2681 if ( !(flags & REXEC_NOT_FIRST) ) {
2682 if (flags & REXEC_COPY_STR) {
2683#ifdef PERL_ANY_COW
2684 if (SvCANCOW(sv)) {
2685 if (DEBUG_C_TEST) {
2686 PerlIO_printf(Perl_debug_log,
2687 "Copy on write: regexp capture, type %d\n",
2688 (int) SvTYPE(sv));
2689 }
2690 RX_MATCH_COPY_FREE(rx);
2691 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2692 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2693 assert (SvPOKp(prog->saved_copy));
2694 prog->sublen = PL_regeol - strbeg;
2695 prog->suboffset = 0;
2696 prog->subcoffset = 0;
2697 } else
2698#endif
2699 {
2700 I32 min = 0;
2701 I32 max = PL_regeol - strbeg;
2702 I32 sublen;
2703
2704 if ( (flags & REXEC_COPY_SKIP_POST)
2705 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2706 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2707 ) { /* don't copy $' part of string */
2708 U32 n = 0;
2709 max = -1;
2710 /* calculate the right-most part of the string covered
2711 * by a capture. Due to look-ahead, this may be to
2712 * the right of $&, so we have to scan all captures */
2713 while (n <= prog->lastparen) {
2714 if (prog->offs[n].end > max)
2715 max = prog->offs[n].end;
2716 n++;
2717 }
2718 if (max == -1)
2719 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2720 ? prog->offs[0].start
2721 : 0;
2722 assert(max >= 0 && max <= PL_regeol - strbeg);
2723 }
2724
2725 if ( (flags & REXEC_COPY_SKIP_PRE)
2726 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2727 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2728 ) { /* don't copy $` part of string */
2729 U32 n = 0;
2730 min = max;
2731 /* calculate the left-most part of the string covered
2732 * by a capture. Due to look-behind, this may be to
2733 * the left of $&, so we have to scan all captures */
2734 while (min && n <= prog->lastparen) {
2735 if ( prog->offs[n].start != -1
2736 && prog->offs[n].start < min)
2737 {
2738 min = prog->offs[n].start;
2739 }
2740 n++;
2741 }
2742 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2743 && min > prog->offs[0].end
2744 )
2745 min = prog->offs[0].end;
2746
2747 }
2748
2749 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2750 sublen = max - min;
2751
2752 if (RX_MATCH_COPIED(rx)) {
2753 if (sublen > prog->sublen)
2754 prog->subbeg =
2755 (char*)saferealloc(prog->subbeg, sublen+1);
2756 }
2757 else
2758 prog->subbeg = (char*)safemalloc(sublen+1);
2759 Copy(strbeg + min, prog->subbeg, sublen, char);
2760 prog->subbeg[sublen] = '\0';
2761 prog->suboffset = min;
2762 prog->sublen = sublen;
2763 RX_MATCH_COPIED_on(rx);
2764 }
2765 prog->subcoffset = prog->suboffset;
2766 if (prog->suboffset && utf8_target) {
2767 /* Convert byte offset to chars.
2768 * XXX ideally should only compute this if @-/@+
2769 * has been seen, a la PL_sawampersand ??? */
2770
2771 /* If there's a direct correspondence between the
2772 * string which we're matching and the original SV,
2773 * then we can use the utf8 len cache associated with
2774 * the SV. In particular, it means that under //g,
2775 * sv_pos_b2u() will use the previously cached
2776 * position to speed up working out the new length of
2777 * subcoffset, rather than counting from the start of
2778 * the string each time. This stops
2779 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2780 * from going quadratic */
2781 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2782 sv_pos_b2u(sv, &(prog->subcoffset));
2783 else
2784 prog->subcoffset = utf8_length((U8*)strbeg,
2785 (U8*)(strbeg+prog->suboffset));
2786 }
2787 }
2788 else {
2789 RX_MATCH_COPY_FREE(rx);
2790 prog->subbeg = strbeg;
2791 prog->suboffset = 0;
2792 prog->subcoffset = 0;
2793 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2794 }
2795 }
2796
2797 return 1;
2798
2799phooey:
2800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2801 PL_colors[4], PL_colors[5]));
2802 if (PL_reg_state.re_state_eval_setup_done)
2803 restore_pos(aTHX_ prog);
2804 if (swap) {
2805 /* we failed :-( roll it back */
2806 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2807 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2808 PTR2UV(prog),
2809 PTR2UV(prog->offs),
2810 PTR2UV(swap)
2811 ));
2812 Safefree(prog->offs);
2813 prog->offs = swap;
2814 }
2815 return 0;
2816}
2817
2818
2819/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2820 * Do inc before dec, in case old and new rex are the same */
2821#define SET_reg_curpm(Re2) \
2822 if (PL_reg_state.re_state_eval_setup_done) { \
2823 (void)ReREFCNT_inc(Re2); \
2824 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2825 PM_SETRE((PL_reg_curpm), (Re2)); \
2826 }
2827
2828
2829/*
2830 - regtry - try match at specific point
2831 */
2832STATIC I32 /* 0 failure, 1 success */
2833S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2834{
2835 dVAR;
2836 CHECKPOINT lastcp;
2837 REGEXP *const rx = reginfo->prog;
2838 regexp *const prog = ReANY(rx);
2839 I32 result;
2840 RXi_GET_DECL(prog,progi);
2841 GET_RE_DEBUG_FLAGS_DECL;
2842
2843 PERL_ARGS_ASSERT_REGTRY;
2844
2845 reginfo->cutpoint=NULL;
2846
2847 if ((prog->extflags & RXf_EVAL_SEEN)
2848 && !PL_reg_state.re_state_eval_setup_done)
2849 {
2850 MAGIC *mg;
2851
2852 PL_reg_state.re_state_eval_setup_done = TRUE;
2853 if (reginfo->sv) {
2854 /* Make $_ available to executed code. */
2855 if (reginfo->sv != DEFSV) {
2856 SAVE_DEFSV;
2857 DEFSV_set(reginfo->sv);
2858 }
2859
2860 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2861 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2862 /* prepare for quick setting of pos */
2863#ifdef PERL_OLD_COPY_ON_WRITE
2864 if (SvIsCOW(reginfo->sv))
2865 sv_force_normal_flags(reginfo->sv, 0);
2866#endif
2867 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2868 &PL_vtbl_mglob, NULL, 0);
2869 mg->mg_len = -1;
2870 }
2871 PL_reg_magic = mg;
2872 PL_reg_oldpos = mg->mg_len;
2873 SAVEDESTRUCTOR_X(restore_pos, prog);
2874 }
2875 if (!PL_reg_curpm) {
2876 Newxz(PL_reg_curpm, 1, PMOP);
2877#ifdef USE_ITHREADS
2878 {
2879 SV* const repointer = &PL_sv_undef;
2880 /* this regexp is also owned by the new PL_reg_curpm, which
2881 will try to free it. */
2882 av_push(PL_regex_padav, repointer);
2883 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2884 PL_regex_pad = AvARRAY(PL_regex_padav);
2885 }
2886#endif
2887 }
2888 SET_reg_curpm(rx);
2889 PL_reg_oldcurpm = PL_curpm;
2890 PL_curpm = PL_reg_curpm;
2891 if (RXp_MATCH_COPIED(prog)) {
2892 /* Here is a serious problem: we cannot rewrite subbeg,
2893 since it may be needed if this match fails. Thus
2894 $` inside (?{}) could fail... */
2895 PL_reg_oldsaved = prog->subbeg;
2896 PL_reg_oldsavedlen = prog->sublen;
2897 PL_reg_oldsavedoffset = prog->suboffset;
2898 PL_reg_oldsavedcoffset = prog->suboffset;
2899#ifdef PERL_ANY_COW
2900 PL_nrs = prog->saved_copy;
2901#endif
2902 RXp_MATCH_COPIED_off(prog);
2903 }
2904 else
2905 PL_reg_oldsaved = NULL;
2906 prog->subbeg = PL_bostr;
2907 prog->suboffset = 0;
2908 prog->subcoffset = 0;
2909 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2910 }
2911#ifdef DEBUGGING
2912 PL_reg_starttry = *startposp;
2913#endif
2914 prog->offs[0].start = *startposp - PL_bostr;
2915 prog->lastparen = 0;
2916 prog->lastcloseparen = 0;
2917
2918 /* XXXX What this code is doing here?!!! There should be no need
2919 to do this again and again, prog->lastparen should take care of
2920 this! --ilya*/
2921
2922 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2923 * Actually, the code in regcppop() (which Ilya may be meaning by
2924 * prog->lastparen), is not needed at all by the test suite
2925 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2926 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2927 * Meanwhile, this code *is* needed for the
2928 * above-mentioned test suite tests to succeed. The common theme
2929 * on those tests seems to be returning null fields from matches.
2930 * --jhi updated by dapm */
2931#if 1
2932 if (prog->nparens) {
2933 regexp_paren_pair *pp = prog->offs;
2934 I32 i;
2935 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2936 ++pp;
2937 pp->start = -1;
2938 pp->end = -1;
2939 }
2940 }
2941#endif
2942 REGCP_SET(lastcp);
2943 result = regmatch(reginfo, *startposp, progi->program + 1);
2944 if (result != -1) {
2945 prog->offs[0].end = result;
2946 return 1;
2947 }
2948 if (reginfo->cutpoint)
2949 *startposp= reginfo->cutpoint;
2950 REGCP_UNWIND(lastcp);
2951 return 0;
2952}
2953
2954
2955#define sayYES goto yes
2956#define sayNO goto no
2957#define sayNO_SILENT goto no_silent
2958
2959/* we dont use STMT_START/END here because it leads to
2960 "unreachable code" warnings, which are bogus, but distracting. */
2961#define CACHEsayNO \
2962 if (ST.cache_mask) \
2963 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2964 sayNO
2965
2966/* this is used to determine how far from the left messages like
2967 'failed...' are printed. It should be set such that messages
2968 are inline with the regop output that created them.
2969*/
2970#define REPORT_CODE_OFF 32
2971
2972
2973#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2974#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2975#define CHRTEST_NOT_A_CP_1 -999
2976#define CHRTEST_NOT_A_CP_2 -998
2977
2978#define SLAB_FIRST(s) (&(s)->states[0])
2979#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2980
2981/* grab a new slab and return the first slot in it */
2982
2983STATIC regmatch_state *
2984S_push_slab(pTHX)
2985{
2986#if PERL_VERSION < 9 && !defined(PERL_CORE)
2987 dMY_CXT;
2988#endif
2989 regmatch_slab *s = PL_regmatch_slab->next;
2990 if (!s) {
2991 Newx(s, 1, regmatch_slab);
2992 s->prev = PL_regmatch_slab;
2993 s->next = NULL;
2994 PL_regmatch_slab->next = s;
2995 }
2996 PL_regmatch_slab = s;
2997 return SLAB_FIRST(s);
2998}
2999
3000
3001/* push a new state then goto it */
3002
3003#define PUSH_STATE_GOTO(state, node, input) \
3004 pushinput = input; \
3005 scan = node; \
3006 st->resume_state = state; \
3007 goto push_state;
3008
3009/* push a new state with success backtracking, then goto it */
3010
3011#define PUSH_YES_STATE_GOTO(state, node, input) \
3012 pushinput = input; \
3013 scan = node; \
3014 st->resume_state = state; \
3015 goto push_yes_state;
3016
3017
3018
3019
3020/*
3021
3022regmatch() - main matching routine
3023
3024This is basically one big switch statement in a loop. We execute an op,
3025set 'next' to point the next op, and continue. If we come to a point which
3026we may need to backtrack to on failure such as (A|B|C), we push a
3027backtrack state onto the backtrack stack. On failure, we pop the top
3028state, and re-enter the loop at the state indicated. If there are no more
3029states to pop, we return failure.
3030
3031Sometimes we also need to backtrack on success; for example /A+/, where
3032after successfully matching one A, we need to go back and try to
3033match another one; similarly for lookahead assertions: if the assertion
3034completes successfully, we backtrack to the state just before the assertion
3035and then carry on. In these cases, the pushed state is marked as
3036'backtrack on success too'. This marking is in fact done by a chain of
3037pointers, each pointing to the previous 'yes' state. On success, we pop to
3038the nearest yes state, discarding any intermediate failure-only states.
3039Sometimes a yes state is pushed just to force some cleanup code to be
3040called at the end of a successful match or submatch; e.g. (??{$re}) uses
3041it to free the inner regex.
3042
3043Note that failure backtracking rewinds the cursor position, while
3044success backtracking leaves it alone.
3045
3046A pattern is complete when the END op is executed, while a subpattern
3047such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3048ops trigger the "pop to last yes state if any, otherwise return true"
3049behaviour.
3050
3051A common convention in this function is to use A and B to refer to the two
3052subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3053the subpattern to be matched possibly multiple times, while B is the entire
3054rest of the pattern. Variable and state names reflect this convention.
3055
3056The states in the main switch are the union of ops and failure/success of
3057substates associated with with that op. For example, IFMATCH is the op
3058that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3059'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3060successfully matched A and IFMATCH_A_fail is a state saying that we have
3061just failed to match A. Resume states always come in pairs. The backtrack
3062state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3063at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3064on success or failure.
3065
3066The struct that holds a backtracking state is actually a big union, with
3067one variant for each major type of op. The variable st points to the
3068top-most backtrack struct. To make the code clearer, within each
3069block of code we #define ST to alias the relevant union.
3070
3071Here's a concrete example of a (vastly oversimplified) IFMATCH
3072implementation:
3073
3074 switch (state) {
3075 ....
3076
3077#define ST st->u.ifmatch
3078
3079 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3080 ST.foo = ...; // some state we wish to save
3081 ...
3082 // push a yes backtrack state with a resume value of
3083 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3084 // first node of A:
3085 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3086 // NOTREACHED
3087
3088 case IFMATCH_A: // we have successfully executed A; now continue with B
3089 next = B;
3090 bar = ST.foo; // do something with the preserved value
3091 break;
3092
3093 case IFMATCH_A_fail: // A failed, so the assertion failed
3094 ...; // do some housekeeping, then ...
3095 sayNO; // propagate the failure
3096
3097#undef ST
3098
3099 ...
3100 }
3101
3102For any old-timers reading this who are familiar with the old recursive
3103approach, the code above is equivalent to:
3104
3105 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3106 {
3107 int foo = ...
3108 ...
3109 if (regmatch(A)) {
3110 next = B;
3111 bar = foo;
3112 break;
3113 }
3114 ...; // do some housekeeping, then ...
3115 sayNO; // propagate the failure
3116 }
3117
3118The topmost backtrack state, pointed to by st, is usually free. If you
3119want to claim it, populate any ST.foo fields in it with values you wish to
3120save, then do one of
3121
3122 PUSH_STATE_GOTO(resume_state, node, newinput);
3123 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3124
3125which sets that backtrack state's resume value to 'resume_state', pushes a
3126new free entry to the top of the backtrack stack, then goes to 'node'.
3127On backtracking, the free slot is popped, and the saved state becomes the
3128new free state. An ST.foo field in this new top state can be temporarily
3129accessed to retrieve values, but once the main loop is re-entered, it
3130becomes available for reuse.
3131
3132Note that the depth of the backtrack stack constantly increases during the
3133left-to-right execution of the pattern, rather than going up and down with
3134the pattern nesting. For example the stack is at its maximum at Z at the
3135end of the pattern, rather than at X in the following:
3136
3137 /(((X)+)+)+....(Y)+....Z/
3138
3139The only exceptions to this are lookahead/behind assertions and the cut,
3140(?>A), which pop all the backtrack states associated with A before
3141continuing.
3142
3143Backtrack state structs are allocated in slabs of about 4K in size.
3144PL_regmatch_state and st always point to the currently active state,
3145and PL_regmatch_slab points to the slab currently containing
3146PL_regmatch_state. The first time regmatch() is called, the first slab is
3147allocated, and is never freed until interpreter destruction. When the slab
3148is full, a new one is allocated and chained to the end. At exit from
3149regmatch(), slabs allocated since entry are freed.
3150
3151*/
3152
3153
3154#define DEBUG_STATE_pp(pp) \
3155 DEBUG_STATE_r({ \
3156 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3157 PerlIO_printf(Perl_debug_log, \
3158 " %*s"pp" %s%s%s%s%s\n", \
3159 depth*2, "", \
3160 PL_reg_name[st->resume_state], \
3161 ((st==yes_state||st==mark_state) ? "[" : ""), \
3162 ((st==yes_state) ? "Y" : ""), \
3163 ((st==mark_state) ? "M" : ""), \
3164 ((st==yes_state||st==mark_state) ? "]" : "") \
3165 ); \
3166 });
3167
3168
3169#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3170
3171#ifdef DEBUGGING
3172
3173STATIC void
3174S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3175 const char *start, const char *end, const char *blurb)
3176{
3177 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3178
3179 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3180
3181 if (!PL_colorset)
3182 reginitcolors();
3183 {
3184 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3185 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3186
3187 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3188 start, end - start, 60);
3189
3190 PerlIO_printf(Perl_debug_log,
3191 "%s%s REx%s %s against %s\n",
3192 PL_colors[4], blurb, PL_colors[5], s0, s1);
3193
3194 if (utf8_target||utf8_pat)
3195 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3196 utf8_pat ? "pattern" : "",
3197 utf8_pat && utf8_target ? " and " : "",
3198 utf8_target ? "string" : ""
3199 );
3200 }
3201}
3202
3203STATIC void
3204S_dump_exec_pos(pTHX_ const char *locinput,
3205 const regnode *scan,
3206 const char *loc_regeol,
3207 const char *loc_bostr,
3208 const char *loc_reg_starttry,
3209 const bool utf8_target)
3210{
3211 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3212 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3213 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3214 /* The part of the string before starttry has one color
3215 (pref0_len chars), between starttry and current
3216 position another one (pref_len - pref0_len chars),
3217 after the current position the third one.
3218 We assume that pref0_len <= pref_len, otherwise we
3219 decrease pref0_len. */
3220 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3221 ? (5 + taill) - l : locinput - loc_bostr;
3222 int pref0_len;
3223
3224 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3225
3226 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3227 pref_len++;
3228 pref0_len = pref_len - (locinput - loc_reg_starttry);
3229 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3230 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3231 ? (5 + taill) - pref_len : loc_regeol - locinput);
3232 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3233 l--;
3234 if (pref0_len < 0)
3235 pref0_len = 0;
3236 if (pref0_len > pref_len)
3237 pref0_len = pref_len;
3238 {
3239 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3240
3241 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3242 (locinput - pref_len),pref0_len, 60, 4, 5);
3243
3244 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3245 (locinput - pref_len + pref0_len),
3246 pref_len - pref0_len, 60, 2, 3);
3247
3248 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3249 locinput, loc_regeol - locinput, 10, 0, 1);
3250
3251 const STRLEN tlen=len0+len1+len2;
3252 PerlIO_printf(Perl_debug_log,
3253 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3254 (IV)(locinput - loc_bostr),
3255 len0, s0,
3256 len1, s1,
3257 (docolor ? "" : "> <"),
3258 len2, s2,
3259 (int)(tlen > 19 ? 0 : 19 - tlen),
3260 "");
3261 }
3262}
3263
3264#endif
3265
3266/* reg_check_named_buff_matched()
3267 * Checks to see if a named buffer has matched. The data array of
3268 * buffer numbers corresponding to the buffer is expected to reside
3269 * in the regexp->data->data array in the slot stored in the ARG() of
3270 * node involved. Note that this routine doesn't actually care about the
3271 * name, that information is not preserved from compilation to execution.
3272 * Returns the index of the leftmost defined buffer with the given name
3273 * or 0 if non of the buffers matched.
3274 */
3275STATIC I32
3276S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3277{
3278 I32 n;
3279 RXi_GET_DECL(rex,rexi);
3280 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3281 I32 *nums=(I32*)SvPVX(sv_dat);
3282
3283 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3284
3285 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3286 if ((I32)rex->lastparen >= nums[n] &&
3287 rex->offs[nums[n]].end != -1)
3288 {
3289 return nums[n];
3290 }
3291 }
3292 return 0;
3293}
3294
3295
3296/* free all slabs above current one - called during LEAVE_SCOPE */
3297
3298STATIC void
3299S_clear_backtrack_stack(pTHX_ void *p)
3300{
3301 regmatch_slab *s = PL_regmatch_slab->next;
3302 PERL_UNUSED_ARG(p);
3303
3304 if (!s)
3305 return;
3306 PL_regmatch_slab->next = NULL;
3307 while (s) {
3308 regmatch_slab * const osl = s;
3309 s = s->next;
3310 Safefree(osl);
3311 }
3312}
3313static bool
3314S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3315{
3316 /* This function determines if there are one or two characters that match
3317 * the first character of the passed-in EXACTish node <text_node>, and if
3318 * so, returns them in the passed-in pointers.
3319 *
3320 * If it determines that no possible character in the target string can
3321 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3322 * the first character in <text_node> requires UTF-8 to represent, and the
3323 * target string isn't in UTF-8.)
3324 *
3325 * If there are more than two characters that could match the beginning of
3326 * <text_node>, or if more context is required to determine a match or not,
3327 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3328 *
3329 * The motiviation behind this function is to allow the caller to set up
3330 * tight loops for matching. If <text_node> is of type EXACT, there is
3331 * only one possible character that can match its first character, and so
3332 * the situation is quite simple. But things get much more complicated if
3333 * folding is involved. It may be that the first character of an EXACTFish
3334 * node doesn't participate in any possible fold, e.g., punctuation, so it
3335 * can be matched only by itself. The vast majority of characters that are
3336 * in folds match just two things, their lower and upper-case equivalents.
3337 * But not all are like that; some have multiple possible matches, or match
3338 * sequences of more than one character. This function sorts all that out.
3339 *
3340 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3341 * loop of trying to match A*, we know we can't exit where the thing
3342 * following it isn't a B. And something can't be a B unless it is the
3343 * beginning of B. By putting a quick test for that beginning in a tight
3344 * loop, we can rule out things that can't possibly be B without having to
3345 * break out of the loop, thus avoiding work. Similarly, if A is a single
3346 * character, we can make a tight loop matching A*, using the outputs of
3347 * this function.
3348 *
3349 * If the target string to match isn't in UTF-8, and there aren't
3350 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3351 * the one or two possible octets (which are characters in this situation)
3352 * that can match. In all cases, if there is only one character that can
3353 * match, *<c1p> and *<c2p> will be identical.
3354 *
3355 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3356 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3357 * can match the beginning of <text_node>. They should be declared with at
3358 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3359 * undefined what these contain.) If one or both of the buffers are
3360 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3361 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3362 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3363 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3364 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3365
3366 const bool utf8_target = PL_reg_match_utf8;
3367
3368 UV c1 = CHRTEST_NOT_A_CP_1;
3369 UV c2 = CHRTEST_NOT_A_CP_2;
3370 bool use_chrtest_void = FALSE;
3371
3372 /* Used when we have both utf8 input and utf8 output, to avoid converting
3373 * to/from code points */
3374 bool utf8_has_been_setup = FALSE;
3375
3376 dVAR;
3377
3378 U8 *pat = (U8*)STRING(text_node);
3379
3380 if (OP(text_node) == EXACT) {
3381
3382 /* In an exact node, only one thing can be matched, that first
3383 * character. If both the pat and the target are UTF-8, we can just
3384 * copy the input to the output, avoiding finding the code point of
3385 * that character */
3386 if (! UTF_PATTERN) {
3387 c2 = c1 = *pat;
3388 }
3389 else if (utf8_target) {
3390 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3391 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3392 utf8_has_been_setup = TRUE;
3393 }
3394 else {
3395 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3396 }
3397 }
3398 else /* an EXACTFish node */
3399 if ((UTF_PATTERN
3400 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3401 pat + STR_LEN(text_node)))
3402 || (! UTF_PATTERN
3403 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3404 pat + STR_LEN(text_node))))
3405 {
3406 /* Multi-character folds require more context to sort out. Also
3407 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3408 * handled outside this routine */
3409 use_chrtest_void = TRUE;
3410 }
3411 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3412 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3413 if (c1 > 256) {
3414 /* Load the folds hash, if not already done */
3415 SV** listp;
3416 if (! PL_utf8_foldclosures) {
3417 if (! PL_utf8_tofold) {
3418 U8 dummy[UTF8_MAXBYTES+1];
3419
3420 /* Force loading this by folding an above-Latin1 char */
3421 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3422 assert(PL_utf8_tofold); /* Verify that worked */
3423 }
3424 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3425 }
3426
3427 /* The fold closures data structure is a hash with the keys being
3428 * the UTF-8 of every character that is folded to, like 'k', and
3429 * the values each an array of all code points that fold to its
3430 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3431 * not included */
3432 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3433 (char *) pat,
3434 UTF8SKIP(pat),
3435 FALSE))))
3436 {
3437 /* Not found in the hash, therefore there are no folds
3438 * containing it, so there is only a single character that
3439 * could match */
3440 c2 = c1;
3441 }
3442 else { /* Does participate in folds */
3443 AV* list = (AV*) *listp;
3444 if (av_len(list) != 1) {
3445
3446 /* If there aren't exactly two folds to this, it is outside
3447 * the scope of this function */
3448 use_chrtest_void = TRUE;
3449 }
3450 else { /* There are two. Get them */
3451 SV** c_p = av_fetch(list, 0, FALSE);
3452 if (c_p == NULL) {
3453 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3454 }
3455 c1 = SvUV(*c_p);
3456
3457 c_p = av_fetch(list, 1, FALSE);
3458 if (c_p == NULL) {
3459 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3460 }
3461 c2 = SvUV(*c_p);
3462
3463 /* Folds that cross the 255/256 boundary are forbidden if
3464 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3465 * pattern character is above 256, and its only other match
3466 * is below 256, the only legal match will be to itself.
3467 * We have thrown away the original, so have to compute
3468 * which is the one above 255 */
3469 if ((c1 < 256) != (c2 < 256)) {
3470 if (OP(text_node) == EXACTFL
3471 || (OP(text_node) == EXACTFA
3472 && (isASCII(c1) || isASCII(c2))))
3473 {
3474 if (c1 < 256) {
3475 c1 = c2;
3476 }
3477 else {
3478 c2 = c1;
3479 }
3480 }
3481 }
3482 }
3483 }
3484 }
3485 else /* Here, c1 is < 255 */
3486 if (utf8_target
3487 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3488 && OP(text_node) != EXACTFL
3489 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3490 {
3491 /* Here, there could be something above Latin1 in the target which
3492 * folds to this character in the pattern. All such cases except
3493 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3494 * involved in their folds, so are outside the scope of this
3495 * function */
3496 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3497 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3498 }
3499 else {
3500 use_chrtest_void = TRUE;
3501 }
3502 }
3503 else { /* Here nothing above Latin1 can fold to the pattern character */
3504 switch (OP(text_node)) {
3505
3506 case EXACTFL: /* /l rules */
3507 c2 = PL_fold_locale[c1];
3508 break;
3509
3510 case EXACTF:
3511 if (! utf8_target) { /* /d rules */
3512 c2 = PL_fold[c1];
3513 break;
3514 }
3515 /* FALLTHROUGH */
3516 /* /u rules for all these. This happens to work for
3517 * EXACTFA as nothing in Latin1 folds to ASCII */
3518 case EXACTFA:
3519 case EXACTFU_TRICKYFOLD:
3520 case EXACTFU_SS:
3521 case EXACTFU:
3522 c2 = PL_fold_latin1[c1];
3523 break;
3524
3525 default:
3526 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3527 assert(0); /* NOTREACHED */
3528 }
3529 }
3530 }
3531
3532 /* Here have figured things out. Set up the returns */
3533 if (use_chrtest_void) {
3534 *c2p = *c1p = CHRTEST_VOID;
3535 }
3536 else if (utf8_target) {
3537 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3538 uvchr_to_utf8(c1_utf8, c1);
3539 uvchr_to_utf8(c2_utf8, c2);
3540 }
3541
3542 /* Invariants are stored in both the utf8 and byte outputs; Use
3543 * negative numbers otherwise for the byte ones. Make sure that the
3544 * byte ones are the same iff the utf8 ones are the same */
3545 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3546 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3547 ? *c2_utf8
3548 : (c1 == c2)
3549 ? CHRTEST_NOT_A_CP_1
3550 : CHRTEST_NOT_A_CP_2;
3551 }
3552 else if (c1 > 255) {
3553 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3554 can represent */
3555 return FALSE;
3556 }
3557
3558 *c1p = *c2p = c2; /* c2 is the only representable value */
3559 }
3560 else { /* c1 is representable; see about c2 */
3561 *c1p = c1;
3562 *c2p = (c2 < 256) ? c2 : c1;
3563 }
3564
3565 return TRUE;
3566}
3567
3568/* returns -1 on failure, $+[0] on success */
3569STATIC I32
3570S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3571{
3572#if PERL_VERSION < 9 && !defined(PERL_CORE)
3573 dMY_CXT;
3574#endif
3575 dVAR;
3576 const bool utf8_target = PL_reg_match_utf8;
3577 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3578 REGEXP *rex_sv = reginfo->prog;
3579 regexp *rex = ReANY(rex_sv);
3580 RXi_GET_DECL(rex,rexi);
3581 I32 oldsave;
3582 /* the current state. This is a cached copy of PL_regmatch_state */
3583 regmatch_state *st;
3584 /* cache heavy used fields of st in registers */
3585 regnode *scan;
3586 regnode *next;
3587 U32 n = 0; /* general value; init to avoid compiler warning */
3588 I32 ln = 0; /* len or last; init to avoid compiler warning */
3589 char *locinput = startpos;
3590 char *pushinput; /* where to continue after a PUSH */
3591 I32 nextchr; /* is always set to UCHARAT(locinput) */
3592
3593 bool result = 0; /* return value of S_regmatch */
3594 int depth = 0; /* depth of backtrack stack */
3595 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3596 const U32 max_nochange_depth =
3597 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3598 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3599 regmatch_state *yes_state = NULL; /* state to pop to on success of
3600 subpattern */
3601 /* mark_state piggy backs on the yes_state logic so that when we unwind
3602 the stack on success we can update the mark_state as we go */
3603 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3604 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3605 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3606 U32 state_num;
3607 bool no_final = 0; /* prevent failure from backtracking? */
3608 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3609 char *startpoint = locinput;
3610 SV *popmark = NULL; /* are we looking for a mark? */
3611 SV *sv_commit = NULL; /* last mark name seen in failure */
3612 SV *sv_yes_mark = NULL; /* last mark name we have seen
3613 during a successful match */
3614 U32 lastopen = 0; /* last open we saw */
3615 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3616 SV* const oreplsv = GvSV(PL_replgv);
3617 /* these three flags are set by various ops to signal information to
3618 * the very next op. They have a useful lifetime of exactly one loop
3619 * iteration, and are not preserved or restored by state pushes/pops
3620 */
3621 bool sw = 0; /* the condition value in (?(cond)a|b) */
3622 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3623 int logical = 0; /* the following EVAL is:
3624 0: (?{...})
3625 1: (?(?{...})X|Y)
3626 2: (??{...})
3627 or the following IFMATCH/UNLESSM is:
3628 false: plain (?=foo)
3629 true: used as a condition: (?(?=foo))
3630 */
3631 PAD* last_pad = NULL;
3632 dMULTICALL;
3633 I32 gimme = G_SCALAR;
3634 CV *caller_cv = NULL; /* who called us */
3635 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3636 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3637 U32 maxopenparen = 0; /* max '(' index seen so far */
3638
3639#ifdef DEBUGGING
3640 GET_RE_DEBUG_FLAGS_DECL;
3641#endif
3642
3643 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3644 multicall_oldcatch = 0;
3645 multicall_cv = NULL;
3646 cx = NULL;
3647 PERL_UNUSED_VAR(multicall_cop);
3648 PERL_UNUSED_VAR(newsp);
3649
3650
3651 PERL_ARGS_ASSERT_REGMATCH;
3652
3653 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3654 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3655 }));
3656 /* on first ever call to regmatch, allocate first slab */
3657 if (!PL_regmatch_slab) {
3658 Newx(PL_regmatch_slab, 1, regmatch_slab);
3659 PL_regmatch_slab->prev = NULL;
3660 PL_regmatch_slab->next = NULL;
3661 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3662 }
3663
3664 oldsave = PL_savestack_ix;
3665 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3666 SAVEVPTR(PL_regmatch_slab);
3667 SAVEVPTR(PL_regmatch_state);
3668
3669 /* grab next free state slot */
3670 st = ++PL_regmatch_state;
3671 if (st > SLAB_LAST(PL_regmatch_slab))
3672 st = PL_regmatch_state = S_push_slab(aTHX);
3673
3674 /* Note that nextchr is a byte even in UTF */
3675 SET_nextchr;
3676 scan = prog;
3677 while (scan != NULL) {
3678
3679 DEBUG_EXECUTE_r( {
3680 SV * const prop = sv_newmortal();
3681 regnode *rnext=regnext(scan);
3682 DUMP_EXEC_POS( locinput, scan, utf8_target );
3683 regprop(rex, prop, scan);
3684
3685 PerlIO_printf(Perl_debug_log,
3686 "%3"IVdf":%*s%s(%"IVdf")\n",
3687 (IV)(scan - rexi->program), depth*2, "",
3688 SvPVX_const(prop),
3689 (PL_regkind[OP(scan)] == END || !rnext) ?
3690 0 : (IV)(rnext - rexi->program));
3691 });
3692
3693 next = scan + NEXT_OFF(scan);
3694 if (next == scan)
3695 next = NULL;
3696 state_num = OP(scan);
3697
3698 reenter_switch:
3699
3700 SET_nextchr;
3701 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3702
3703 switch (state_num) {
3704 case BOL: /* /^../ */
3705 if (locinput == PL_bostr)
3706 {
3707 /* reginfo->till = reginfo->bol; */
3708 break;
3709 }
3710 sayNO;
3711
3712 case MBOL: /* /^../m */
3713 if (locinput == PL_bostr ||
3714 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3715 {
3716 break;
3717 }
3718 sayNO;
3719
3720 case SBOL: /* /^../s */
3721 if (locinput == PL_bostr)
3722 break;
3723 sayNO;
3724
3725 case GPOS: /* \G */
3726 if (locinput == reginfo->ganch)
3727 break;
3728 sayNO;
3729
3730 case KEEPS: /* \K */
3731 /* update the startpoint */
3732 st->u.keeper.val = rex->offs[0].start;
3733 rex->offs[0].start = locinput - PL_bostr;
3734 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3735 assert(0); /*NOTREACHED*/
3736 case KEEPS_next_fail:
3737 /* rollback the start point change */
3738 rex->offs[0].start = st->u.keeper.val;
3739 sayNO_SILENT;
3740 assert(0); /*NOTREACHED*/
3741
3742 case EOL: /* /..$/ */
3743 goto seol;
3744
3745 case MEOL: /* /..$/m */
3746 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3747 sayNO;
3748 break;
3749
3750 case SEOL: /* /..$/s */
3751 seol:
3752 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3753 sayNO;
3754 if (PL_regeol - locinput > 1)
3755 sayNO;
3756 break;
3757
3758 case EOS: /* \z */
3759 if (!NEXTCHR_IS_EOS)
3760 sayNO;
3761 break;
3762
3763 case SANY: /* /./s */
3764 if (NEXTCHR_IS_EOS)
3765 sayNO;
3766 goto increment_locinput;
3767
3768 case CANY: /* \C */
3769 if (NEXTCHR_IS_EOS)
3770 sayNO;
3771 locinput++;
3772 break;
3773
3774 case REG_ANY: /* /./ */
3775 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3776 sayNO;
3777 goto increment_locinput;
3778
3779
3780#undef ST
3781#define ST st->u.trie
3782 case TRIEC: /* (ab|cd) with known charclass */
3783 /* In this case the charclass data is available inline so
3784 we can fail fast without a lot of extra overhead.
3785 */
3786 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3787 DEBUG_EXECUTE_r(
3788 PerlIO_printf(Perl_debug_log,
3789 "%*s %sfailed to match trie start class...%s\n",
3790 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3791 );
3792 sayNO_SILENT;
3793 assert(0); /* NOTREACHED */
3794 }
3795 /* FALL THROUGH */
3796 case TRIE: /* (ab|cd) */
3797 /* the basic plan of execution of the trie is:
3798 * At the beginning, run though all the states, and
3799 * find the longest-matching word. Also remember the position
3800 * of the shortest matching word. For example, this pattern:
3801 * 1 2 3 4 5
3802 * ab|a|x|abcd|abc
3803 * when matched against the string "abcde", will generate
3804 * accept states for all words except 3, with the longest
3805 * matching word being 4, and the shortest being 2 (with
3806 * the position being after char 1 of the string).
3807 *
3808 * Then for each matching word, in word order (i.e. 1,2,4,5),
3809 * we run the remainder of the pattern; on each try setting
3810 * the current position to the character following the word,
3811 * returning to try the next word on failure.
3812 *
3813 * We avoid having to build a list of words at runtime by
3814 * using a compile-time structure, wordinfo[].prev, which
3815 * gives, for each word, the previous accepting word (if any).
3816 * In the case above it would contain the mappings 1->2, 2->0,
3817 * 3->0, 4->5, 5->1. We can use this table to generate, from
3818 * the longest word (4 above), a list of all words, by
3819 * following the list of prev pointers; this gives us the
3820 * unordered list 4,5,1,2. Then given the current word we have
3821 * just tried, we can go through the list and find the
3822 * next-biggest word to try (so if we just failed on word 2,
3823 * the next in the list is 4).
3824 *
3825 * Since at runtime we don't record the matching position in
3826 * the string for each word, we have to work that out for
3827 * each word we're about to process. The wordinfo table holds
3828 * the character length of each word; given that we recorded
3829 * at the start: the position of the shortest word and its
3830 * length in chars, we just need to move the pointer the
3831 * difference between the two char lengths. Depending on
3832 * Unicode status and folding, that's cheap or expensive.
3833 *
3834 * This algorithm is optimised for the case where are only a
3835 * small number of accept states, i.e. 0,1, or maybe 2.
3836 * With lots of accepts states, and having to try all of them,
3837 * it becomes quadratic on number of accept states to find all
3838 * the next words.
3839 */
3840
3841 {
3842 /* what type of TRIE am I? (utf8 makes this contextual) */
3843 DECL_TRIE_TYPE(scan);
3844
3845 /* what trie are we using right now */
3846 reg_trie_data * const trie
3847 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3848 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3849 U32 state = trie->startstate;
3850
3851 if ( trie->bitmap
3852 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3853 {
3854 if (trie->states[ state ].wordnum) {
3855 DEBUG_EXECUTE_r(
3856 PerlIO_printf(Perl_debug_log,
3857 "%*s %smatched empty string...%s\n",
3858 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3859 );
3860 if (!trie->jump)
3861 break;
3862 } else {
3863 DEBUG_EXECUTE_r(
3864 PerlIO_printf(Perl_debug_log,
3865 "%*s %sfailed to match trie start class...%s\n",
3866 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3867 );
3868 sayNO_SILENT;
3869 }
3870 }
3871
3872 {
3873 U8 *uc = ( U8* )locinput;
3874
3875 STRLEN len = 0;
3876 STRLEN foldlen = 0;
3877 U8 *uscan = (U8*)NULL;
3878 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3879 U32 charcount = 0; /* how many input chars we have matched */
3880 U32 accepted = 0; /* have we seen any accepting states? */
3881
3882 ST.jump = trie->jump;
3883 ST.me = scan;
3884 ST.firstpos = NULL;
3885 ST.longfold = FALSE; /* char longer if folded => it's harder */
3886 ST.nextword = 0;
3887
3888 /* fully traverse the TRIE; note the position of the
3889 shortest accept state and the wordnum of the longest
3890 accept state */
3891
3892 while ( state && uc <= (U8*)PL_regeol ) {
3893 U32 base = trie->states[ state ].trans.base;
3894 UV uvc = 0;
3895 U16 charid = 0;
3896 U16 wordnum;
3897 wordnum = trie->states[ state ].wordnum;
3898
3899 if (wordnum) { /* it's an accept state */
3900 if (!accepted) {
3901 accepted = 1;
3902 /* record first match position */
3903 if (ST.longfold) {
3904 ST.firstpos = (U8*)locinput;
3905 ST.firstchars = 0;
3906 }
3907 else {
3908 ST.firstpos = uc;
3909 ST.firstchars = charcount;
3910 }
3911 }
3912 if (!ST.nextword || wordnum < ST.nextword)
3913 ST.nextword = wordnum;
3914 ST.topword = wordnum;
3915 }
3916
3917 DEBUG_TRIE_EXECUTE_r({
3918 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3919 PerlIO_printf( Perl_debug_log,
3920 "%*s %sState: %4"UVxf" Accepted: %c ",
3921 2+depth * 2, "", PL_colors[4],
3922 (UV)state, (accepted ? 'Y' : 'N'));
3923 });
3924
3925 /* read a char and goto next state */
3926 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3927 I32 offset;
3928 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3929 uscan, len, uvc, charid, foldlen,
3930 foldbuf, uniflags);
3931 charcount++;
3932 if (foldlen>0)
3933 ST.longfold = TRUE;
3934 if (charid &&
3935 ( ((offset =
3936 base + charid - 1 - trie->uniquecharcount)) >= 0)
3937
3938 && ((U32)offset < trie->lasttrans)
3939 && trie->trans[offset].check == state)
3940 {
3941 state = trie->trans[offset].next;
3942 }
3943 else {
3944 state = 0;
3945 }
3946 uc += len;
3947
3948 }
3949 else {
3950 state = 0;
3951 }
3952 DEBUG_TRIE_EXECUTE_r(
3953 PerlIO_printf( Perl_debug_log,
3954 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3955 charid, uvc, (UV)state, PL_colors[5] );
3956 );
3957 }
3958 if (!accepted)
3959 sayNO;
3960
3961 /* calculate total number of accept states */
3962 {
3963 U16 w = ST.topword;
3964 accepted = 0;
3965 while (w) {
3966 w = trie->wordinfo[w].prev;
3967 accepted++;
3968 }
3969 ST.accepted = accepted;
3970 }
3971
3972 DEBUG_EXECUTE_r(
3973 PerlIO_printf( Perl_debug_log,
3974 "%*s %sgot %"IVdf" possible matches%s\n",
3975 REPORT_CODE_OFF + depth * 2, "",
3976 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3977 );
3978 goto trie_first_try; /* jump into the fail handler */
3979 }}
3980 assert(0); /* NOTREACHED */
3981
3982 case TRIE_next_fail: /* we failed - try next alternative */
3983 {
3984 U8 *uc;
3985 if ( ST.jump) {
3986 REGCP_UNWIND(ST.cp);
3987 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3988 }
3989 if (!--ST.accepted) {
3990 DEBUG_EXECUTE_r({
3991 PerlIO_printf( Perl_debug_log,
3992 "%*s %sTRIE failed...%s\n",
3993 REPORT_CODE_OFF+depth*2, "",
3994 PL_colors[4],
3995 PL_colors[5] );
3996 });
3997 sayNO_SILENT;
3998 }
3999 {
4000 /* Find next-highest word to process. Note that this code
4001 * is O(N^2) per trie run (O(N) per branch), so keep tight */
4002 U16 min = 0;
4003 U16 word;
4004 U16 const nextword = ST.nextword;
4005 reg_trie_wordinfo * const wordinfo
4006 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4007 for (word=ST.topword; word; word=wordinfo[word].prev) {
4008 if (word > nextword && (!min || word < min))
4009 min = word;
4010 }
4011 ST.nextword = min;
4012 }
4013
4014 trie_first_try:
4015 if (do_cutgroup) {
4016 do_cutgroup = 0;
4017 no_final = 0;
4018 }
4019
4020 if ( ST.jump) {
4021 ST.lastparen = rex->lastparen;
4022 ST.lastcloseparen = rex->lastcloseparen;
4023 REGCP_SET(ST.cp);
4024 }
4025
4026 /* find start char of end of current word */
4027 {
4028 U32 chars; /* how many chars to skip */
4029 reg_trie_data * const trie
4030 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4031
4032 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4033 >= ST.firstchars);
4034 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4035 - ST.firstchars;
4036 uc = ST.firstpos;
4037
4038 if (ST.longfold) {
4039 /* the hard option - fold each char in turn and find
4040 * its folded length (which may be different */
4041 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4042 STRLEN foldlen;
4043 STRLEN len;
4044 UV uvc;
4045 U8 *uscan;
4046
4047 while (chars) {
4048 if (utf8_target) {
4049 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4050 uniflags);
4051 uc += len;
4052 }
4053 else {
4054 uvc = *uc;
4055 uc++;
4056 }
4057 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4058 uscan = foldbuf;
4059 while (foldlen) {
4060 if (!--chars)
4061 break;
4062 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4063 uniflags);
4064 uscan += len;
4065 foldlen -= len;
4066 }
4067 }
4068 }
4069 else {
4070 if (utf8_target)
4071 while (chars--)
4072 uc += UTF8SKIP(uc);
4073 else
4074 uc += chars;
4075 }
4076 }
4077
4078 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4079 ? ST.jump[ST.nextword]
4080 : NEXT_OFF(ST.me));
4081
4082 DEBUG_EXECUTE_r({
4083 PerlIO_printf( Perl_debug_log,
4084 "%*s %sTRIE matched word #%d, continuing%s\n",
4085 REPORT_CODE_OFF+depth*2, "",
4086 PL_colors[4],
4087 ST.nextword,
4088 PL_colors[5]
4089 );
4090 });
4091
4092 if (ST.accepted > 1 || has_cutgroup) {
4093 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4094 assert(0); /* NOTREACHED */
4095 }
4096 /* only one choice left - just continue */
4097 DEBUG_EXECUTE_r({
4098 AV *const trie_words
4099 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4100 SV ** const tmp = av_fetch( trie_words,
4101 ST.nextword-1, 0 );
4102 SV *sv= tmp ? sv_newmortal() : NULL;
4103
4104 PerlIO_printf( Perl_debug_log,
4105 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4106 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4107 ST.nextword,
4108 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4109 PL_colors[0], PL_colors[1],
4110 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4111 )
4112 : "not compiled under -Dr",
4113 PL_colors[5] );
4114 });
4115
4116 locinput = (char*)uc;
4117 continue; /* execute rest of RE */
4118 assert(0); /* NOTREACHED */
4119 }
4120#undef ST
4121
4122 case EXACT: { /* /abc/ */
4123 char *s = STRING(scan);
4124 ln = STR_LEN(scan);
4125 if (utf8_target != UTF_PATTERN) {
4126 /* The target and the pattern have differing utf8ness. */
4127 char *l = locinput;
4128 const char * const e = s + ln;
4129
4130 if (utf8_target) {
4131 /* The target is utf8, the pattern is not utf8.
4132 * Above-Latin1 code points can't match the pattern;
4133 * invariants match exactly, and the other Latin1 ones need
4134 * to be downgraded to a single byte in order to do the
4135 * comparison. (If we could be confident that the target
4136 * is not malformed, this could be refactored to have fewer
4137 * tests by just assuming that if the first bytes match, it
4138 * is an invariant, but there are tests in the test suite
4139 * dealing with (??{...}) which violate this) */
4140 while (s < e) {
4141 if (l >= PL_regeol)
4142 sayNO;
4143 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4144 sayNO;
4145 }
4146 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4147 if (*l != *s) {
4148 sayNO;
4149 }
4150 l++;
4151 }
4152 else {
4153 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4154 sayNO;
4155 }
4156 l += 2;
4157 }
4158 s++;
4159 }
4160 }
4161 else {
4162 /* The target is not utf8, the pattern is utf8. */
4163 while (s < e) {
4164 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4165 {
4166 sayNO;
4167 }
4168 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4169 if (*s != *l) {
4170 sayNO;
4171 }
4172 s++;
4173 }
4174 else {
4175 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4176 sayNO;
4177 }
4178 s += 2;
4179 }
4180 l++;
4181 }
4182 }
4183 locinput = l;
4184 break;
4185 }
4186 /* The target and the pattern have the same utf8ness. */
4187 /* Inline the first character, for speed. */
4188 if (UCHARAT(s) != nextchr)
4189 sayNO;
4190 if (PL_regeol - locinput < ln)
4191 sayNO;
4192 if (ln > 1 && memNE(s, locinput, ln))
4193 sayNO;
4194 locinput += ln;
4195 break;
4196 }
4197
4198 case EXACTFL: { /* /abc/il */
4199 re_fold_t folder;
4200 const U8 * fold_array;
4201 const char * s;
4202 U32 fold_utf8_flags;
4203
4204 PL_reg_flags |= RF_tainted;
4205 folder = foldEQ_locale;
4206 fold_array = PL_fold_locale;
4207 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4208 goto do_exactf;
4209
4210 case EXACTFU_SS: /* /\x{df}/iu */
4211 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4212 case EXACTFU: /* /abc/iu */
4213 folder = foldEQ_latin1;
4214 fold_array = PL_fold_latin1;
4215 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4216 goto do_exactf;
4217
4218 case EXACTFA: /* /abc/iaa */
4219 folder = foldEQ_latin1;
4220 fold_array = PL_fold_latin1;
4221 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4222 goto do_exactf;
4223
4224 case EXACTF: /* /abc/i */
4225 folder = foldEQ;
4226 fold_array = PL_fold;
4227 fold_utf8_flags = 0;
4228
4229 do_exactf:
4230 s = STRING(scan);
4231 ln = STR_LEN(scan);
4232
4233 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4234 /* Either target or the pattern are utf8, or has the issue where
4235 * the fold lengths may differ. */
4236 const char * const l = locinput;
4237 char *e = PL_regeol;
4238
4239 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
4240 l, &e, 0, utf8_target, fold_utf8_flags))
4241 {
4242 sayNO;
4243 }
4244 locinput = e;
4245 break;
4246 }
4247
4248 /* Neither the target nor the pattern are utf8 */
4249 if (UCHARAT(s) != nextchr
4250 && !NEXTCHR_IS_EOS
4251 && UCHARAT(s) != fold_array[nextchr])
4252 {
4253 sayNO;
4254 }
4255 if (PL_regeol - locinput < ln)
4256 sayNO;
4257 if (ln > 1 && ! folder(s, locinput, ln))
4258 sayNO;
4259 locinput += ln;
4260 break;
4261 }
4262
4263 /* XXX Could improve efficiency by separating these all out using a
4264 * macro or in-line function. At that point regcomp.c would no longer
4265 * have to set the FLAGS fields of these */
4266 case BOUNDL: /* /\b/l */
4267 case NBOUNDL: /* /\B/l */
4268 PL_reg_flags |= RF_tainted;
4269 /* FALL THROUGH */
4270 case BOUND: /* /\b/ */
4271 case BOUNDU: /* /\b/u */
4272 case BOUNDA: /* /\b/a */
4273 case NBOUND: /* /\B/ */
4274 case NBOUNDU: /* /\B/u */
4275 case NBOUNDA: /* /\B/a */
4276 /* was last char in word? */
4277 if (utf8_target
4278 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4279 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4280 {
4281 if (locinput == PL_bostr)
4282 ln = '\n';
4283 else {
4284 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4285
4286 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4287 }
4288 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4289 ln = isALNUM_uni(ln);
4290 if (NEXTCHR_IS_EOS)
4291 n = 0;
4292 else {
4293 LOAD_UTF8_CHARCLASS_ALNUM();
4294 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4295 utf8_target);
4296 }
4297 }
4298 else {
4299 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4300 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4301 }
4302 }
4303 else {
4304
4305 /* Here the string isn't utf8, or is utf8 and only ascii
4306 * characters are to match \w. In the latter case looking at
4307 * the byte just prior to the current one may be just the final
4308 * byte of a multi-byte character. This is ok. There are two
4309 * cases:
4310 * 1) it is a single byte character, and then the test is doing
4311 * just what it's supposed to.
4312 * 2) it is a multi-byte character, in which case the final
4313 * byte is never mistakable for ASCII, and so the test
4314 * will say it is not a word character, which is the
4315 * correct answer. */
4316 ln = (locinput != PL_bostr) ?
4317 UCHARAT(locinput - 1) : '\n';
4318 switch (FLAGS(scan)) {
4319 case REGEX_UNICODE_CHARSET:
4320 ln = isWORDCHAR_L1(ln);
4321 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4322 break;
4323 case REGEX_LOCALE_CHARSET:
4324 ln = isALNUM_LC(ln);
4325 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4326 break;
4327 case REGEX_DEPENDS_CHARSET:
4328 ln = isALNUM(ln);
4329 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4330 break;
4331 case REGEX_ASCII_RESTRICTED_CHARSET:
4332 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4333 ln = isWORDCHAR_A(ln);
4334 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4335 break;
4336 default:
4337 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4338 break;
4339 }
4340 }
4341 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4342 * regcomp.sym */
4343 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4344 sayNO;
4345 break;
4346
4347 case ANYOF: /* /[abc]/ */
4348 if (NEXTCHR_IS_EOS)
4349 sayNO;
4350 if (utf8_target) {
4351 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4352 sayNO;
4353 locinput += UTF8SKIP(locinput);
4354 break;
4355 }
4356 else {
4357 if (!REGINCLASS(rex, scan, (U8*)locinput))
4358 sayNO;
4359 locinput++;
4360 break;
4361 }
4362 break;
4363
4364 /* Special char classes: \d, \w etc.
4365 * The defines start on line 166 or so */
4366 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4367 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4368 ALNUMU, NALNUMU, isWORDCHAR_L1,
4369 ALNUMA, NALNUMA, isWORDCHAR_A,
4370 alnum, "a");
4371
4372 case SPACEL:
4373 PL_reg_flags |= RF_tainted;
4374 if (NEXTCHR_IS_EOS) {
4375 sayNO;
4376 }
4377 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4378 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4379 sayNO;
4380 }
4381 }
4382 else if (! isSPACE_LC((U8) nextchr)) {
4383 sayNO;
4384 }
4385 goto increment_locinput;
4386
4387 case NSPACEL:
4388 PL_reg_flags |= RF_tainted;
4389 if (NEXTCHR_IS_EOS) {
4390 sayNO;
4391 }
4392 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4393 if (isSPACE_LC_utf8((U8 *) locinput)) {
4394 sayNO;
4395 }
4396 }
4397 else if (isSPACE_LC(nextchr)) {
4398 sayNO;
4399 }
4400 goto increment_locinput;
4401
4402 case SPACE:
4403 if (utf8_target) {
4404 goto utf8_space;
4405 }
4406 /* FALL THROUGH */
4407 case SPACEA:
4408 if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4409 sayNO;
4410 }
4411 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4412 locinput++;
4413 break;
4414
4415 case NSPACE:
4416 if (utf8_target) {
4417 goto utf8_nspace;
4418 }
4419 /* FALL THROUGH */
4420 case NSPACEA:
4421 if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4422 sayNO;
4423 }
4424 goto increment_locinput;
4425
4426 case SPACEU:
4427 utf8_space:
4428 if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4429 sayNO;
4430 }
4431 goto increment_locinput;
4432
4433 case NSPACEU:
4434 utf8_nspace:
4435 if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4436 sayNO;
4437 }
4438 goto increment_locinput;
4439
4440 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4441 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4442 DIGITA, NDIGITA, isDIGIT_A,
4443 digit, "0");
4444
4445 case POSIXA: /* /[[:ascii:]]/ etc */
4446 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4447 sayNO;
4448 }
4449 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4450 locinput++;
4451 break;
4452
4453 case NPOSIXA: /* /[^[:ascii:]]/ etc */
4454 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4455 sayNO;
4456 }
4457 goto increment_locinput;
4458
4459 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4460 a Unicode extended Grapheme Cluster */
4461 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4462 extended Grapheme Cluster is:
4463
4464 CR LF
4465 | Prepend* Begin Extend*
4466 | .
4467
4468 Begin is: ( Special_Begin | ! Control )
4469 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4470 Extend is: ( Grapheme_Extend | Spacing_Mark )
4471 Control is: [ GCB_Control | CR | LF ]
4472 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4473
4474 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4475 we can rewrite
4476
4477 Begin is ( Regular_Begin + Special Begin )
4478
4479 It turns out that 98.4% of all Unicode code points match
4480 Regular_Begin. Doing it this way eliminates a table match in
4481 the previous implementation for almost all Unicode code points.
4482
4483 There is a subtlety with Prepend* which showed up in testing.
4484 Note that the Begin, and only the Begin is required in:
4485 | Prepend* Begin Extend*
4486 Also, Begin contains '! Control'. A Prepend must be a
4487 '! Control', which means it must also be a Begin. What it
4488 comes down to is that if we match Prepend* and then find no
4489 suitable Begin afterwards, that if we backtrack the last
4490 Prepend, that one will be a suitable Begin.
4491 */
4492
4493 if (NEXTCHR_IS_EOS)
4494 sayNO;
4495 if (! utf8_target) {
4496
4497 /* Match either CR LF or '.', as all the other possibilities
4498 * require utf8 */
4499 locinput++; /* Match the . or CR */
4500 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4501 match the LF */
4502 && locinput < PL_regeol
4503 && UCHARAT(locinput) == '\n')
4504 {
4505 locinput++;
4506 }
4507 }
4508 else {
4509
4510 /* Utf8: See if is ( CR LF ); already know that locinput <
4511 * PL_regeol, so locinput+1 is in bounds */
4512 if ( nextchr == '\r' && locinput+1 < PL_regeol
4513 && UCHARAT(locinput + 1) == '\n')
4514 {
4515 locinput += 2;
4516 }
4517 else {
4518 STRLEN len;
4519
4520 /* In case have to backtrack to beginning, then match '.' */
4521 char *starting = locinput;
4522
4523 /* In case have to backtrack the last prepend */
4524 char *previous_prepend = NULL;
4525
4526 LOAD_UTF8_CHARCLASS_GCB();
4527
4528 /* Match (prepend)* */
4529 while (locinput < PL_regeol
4530 && (len = is_GCB_Prepend_utf8(locinput)))
4531 {
4532 previous_prepend = locinput;
4533 locinput += len;
4534 }
4535
4536 /* As noted above, if we matched a prepend character, but
4537 * the next thing won't match, back off the last prepend we
4538 * matched, as it is guaranteed to match the begin */
4539 if (previous_prepend
4540 && (locinput >= PL_regeol
4541 || (! swash_fetch(PL_utf8_X_regular_begin,
4542 (U8*)locinput, utf8_target)
4543 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4544 )
4545 {
4546 locinput = previous_prepend;
4547 }
4548
4549 /* Note that here we know PL_regeol > locinput, as we
4550 * tested that upon input to this switch case, and if we
4551 * moved locinput forward, we tested the result just above
4552 * and it either passed, or we backed off so that it will
4553 * now pass */
4554 if (swash_fetch(PL_utf8_X_regular_begin,
4555 (U8*)locinput, utf8_target)) {
4556 locinput += UTF8SKIP(locinput);
4557 }
4558 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4559
4560 /* Here did not match the required 'Begin' in the
4561 * second term. So just match the very first
4562 * character, the '.' of the final term of the regex */
4563 locinput = starting + UTF8SKIP(starting);
4564 goto exit_utf8;
4565 } else {
4566
4567 /* Here is a special begin. It can be composed of
4568 * several individual characters. One possibility is
4569 * RI+ */
4570 if ((len = is_GCB_RI_utf8(locinput))) {
4571 locinput += len;
4572 while (locinput < PL_regeol
4573 && (len = is_GCB_RI_utf8(locinput)))
4574 {
4575 locinput += len;
4576 }
4577 } else if ((len = is_GCB_T_utf8(locinput))) {
4578 /* Another possibility is T+ */
4579 locinput += len;
4580 while (locinput < PL_regeol
4581 && (len = is_GCB_T_utf8(locinput)))
4582 {
4583 locinput += len;
4584 }
4585 } else {
4586
4587 /* Here, neither RI+ nor T+; must be some other
4588 * Hangul. That means it is one of the others: L,
4589 * LV, LVT or V, and matches:
4590 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4591
4592 /* Match L* */
4593 while (locinput < PL_regeol
4594 && (len = is_GCB_L_utf8(locinput)))
4595 {
4596 locinput += len;
4597 }
4598
4599 /* Here, have exhausted L*. If the next character
4600 * is not an LV, LVT nor V, it means we had to have
4601 * at least one L, so matches L+ in the original
4602 * equation, we have a complete hangul syllable.
4603 * Are done. */
4604
4605 if (locinput < PL_regeol
4606 && is_GCB_LV_LVT_V_utf8(locinput))
4607 {
4608 /* Otherwise keep going. Must be LV, LVT or V.
4609 * See if LVT, by first ruling out V, then LV */
4610 if (! is_GCB_V_utf8(locinput)
4611 /* All but every TCount one is LV */
4612 && (valid_utf8_to_uvchr((U8 *) locinput,
4613 NULL)
4614 - SBASE)
4615 % TCount != 0)
4616 {
4617 locinput += UTF8SKIP(locinput);
4618 } else {
4619
4620 /* Must be V or LV. Take it, then match
4621 * V* */
4622 locinput += UTF8SKIP(locinput);
4623 while (locinput < PL_regeol
4624 && (len = is_GCB_V_utf8(locinput)))
4625 {
4626 locinput += len;
4627 }
4628 }
4629
4630 /* And any of LV, LVT, or V can be followed
4631 * by T* */
4632 while (locinput < PL_regeol
4633 && (len = is_GCB_T_utf8(locinput)))
4634 {
4635 locinput += len;
4636 }
4637 }
4638 }
4639 }
4640
4641 /* Match any extender */
4642 while (locinput < PL_regeol
4643 && swash_fetch(PL_utf8_X_extend,
4644 (U8*)locinput, utf8_target))
4645 {
4646 locinput += UTF8SKIP(locinput);
4647 }
4648 }
4649 exit_utf8:
4650 if (locinput > PL_regeol) sayNO;
4651 }
4652 break;
4653
4654 case NREFFL: /* /\g{name}/il */
4655 { /* The capture buffer cases. The ones beginning with N for the
4656 named buffers just convert to the equivalent numbered and
4657 pretend they were called as the corresponding numbered buffer
4658 op. */
4659 /* don't initialize these in the declaration, it makes C++
4660 unhappy */
4661 char *s;
4662 char type;
4663 re_fold_t folder;
4664 const U8 *fold_array;
4665 UV utf8_fold_flags;
4666
4667 PL_reg_flags |= RF_tainted;
4668 folder = foldEQ_locale;
4669 fold_array = PL_fold_locale;
4670 type = REFFL;
4671 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4672 goto do_nref;
4673
4674 case NREFFA: /* /\g{name}/iaa */
4675 folder = foldEQ_latin1;
4676 fold_array = PL_fold_latin1;
4677 type = REFFA;
4678 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4679 goto do_nref;
4680
4681 case NREFFU: /* /\g{name}/iu */
4682 folder = foldEQ_latin1;
4683 fold_array = PL_fold_latin1;
4684 type = REFFU;
4685 utf8_fold_flags = 0;
4686 goto do_nref;
4687
4688 case NREFF: /* /\g{name}/i */
4689 folder = foldEQ;
4690 fold_array = PL_fold;
4691 type = REFF;
4692 utf8_fold_flags = 0;
4693 goto do_nref;
4694
4695 case NREF: /* /\g{name}/ */
4696 type = REF;
4697 folder = NULL;
4698 fold_array = NULL;
4699 utf8_fold_flags = 0;
4700 do_nref:
4701
4702 /* For the named back references, find the corresponding buffer
4703 * number */
4704 n = reg_check_named_buff_matched(rex,scan);
4705
4706 if ( ! n ) {
4707 sayNO;
4708 }
4709 goto do_nref_ref_common;
4710
4711 case REFFL: /* /\1/il */
4712 PL_reg_flags |= RF_tainted;
4713 folder = foldEQ_locale;
4714 fold_array = PL_fold_locale;
4715 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4716 goto do_ref;
4717
4718 case REFFA: /* /\1/iaa */
4719 folder = foldEQ_latin1;
4720 fold_array = PL_fold_latin1;
4721 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4722 goto do_ref;
4723
4724 case REFFU: /* /\1/iu */
4725 folder = foldEQ_latin1;
4726 fold_array = PL_fold_latin1;
4727 utf8_fold_flags = 0;
4728 goto do_ref;
4729
4730 case REFF: /* /\1/i */
4731 folder = foldEQ;
4732 fold_array = PL_fold;
4733 utf8_fold_flags = 0;
4734 goto do_ref;
4735
4736 case REF: /* /\1/ */
4737 folder = NULL;
4738 fold_array = NULL;
4739 utf8_fold_flags = 0;
4740
4741 do_ref:
4742 type = OP(scan);
4743 n = ARG(scan); /* which paren pair */
4744
4745 do_nref_ref_common:
4746 ln = rex->offs[n].start;
4747 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4748 if (rex->lastparen < n || ln == -1)
4749 sayNO; /* Do not match unless seen CLOSEn. */
4750 if (ln == rex->offs[n].end)
4751 break;
4752
4753 s = PL_bostr + ln;
4754 if (type != REF /* REF can do byte comparison */
4755 && (utf8_target || type == REFFU))
4756 { /* XXX handle REFFL better */
4757 char * limit = PL_regeol;
4758
4759 /* This call case insensitively compares the entire buffer
4760 * at s, with the current input starting at locinput, but
4761 * not going off the end given by PL_regeol, and returns in
4762 * <limit> upon success, how much of the current input was
4763 * matched */
4764 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4765 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4766 {
4767 sayNO;
4768 }
4769 locinput = limit;
4770 break;
4771 }
4772
4773 /* Not utf8: Inline the first character, for speed. */
4774 if (!NEXTCHR_IS_EOS &&
4775 UCHARAT(s) != nextchr &&
4776 (type == REF ||
4777 UCHARAT(s) != fold_array[nextchr]))
4778 sayNO;
4779 ln = rex->offs[n].end - ln;
4780 if (locinput + ln > PL_regeol)
4781 sayNO;
4782 if (ln > 1 && (type == REF
4783 ? memNE(s, locinput, ln)
4784 : ! folder(s, locinput, ln)))
4785 sayNO;
4786 locinput += ln;
4787 break;
4788 }
4789
4790 case NOTHING: /* null op; e.g. the 'nothing' following
4791 * the '*' in m{(a+|b)*}' */
4792 break;
4793 case TAIL: /* placeholder while compiling (A|B|C) */
4794 break;
4795
4796 case BACK: /* ??? doesn't appear to be used ??? */
4797 break;
4798
4799#undef ST
4800#define ST st->u.eval
4801 {
4802 SV *ret;
4803 REGEXP *re_sv;
4804 regexp *re;
4805 regexp_internal *rei;
4806 regnode *startpoint;
4807
4808 case GOSTART: /* (?R) */
4809 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4810 if (cur_eval && cur_eval->locinput==locinput) {
4811 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4812 Perl_croak(aTHX_ "Infinite recursion in regex");
4813 if ( ++nochange_depth > max_nochange_depth )
4814 Perl_croak(aTHX_
4815 "Pattern subroutine nesting without pos change"
4816 " exceeded limit in regex");
4817 } else {
4818 nochange_depth = 0;
4819 }
4820 re_sv = rex_sv;
4821 re = rex;
4822 rei = rexi;
4823 if (OP(scan)==GOSUB) {
4824 startpoint = scan + ARG2L(scan);
4825 ST.close_paren = ARG(scan);
4826 } else {
4827 startpoint = rei->program+1;
4828 ST.close_paren = 0;
4829 }
4830 goto eval_recurse_doit;
4831 assert(0); /* NOTREACHED */
4832
4833 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4834 if (cur_eval && cur_eval->locinput==locinput) {
4835 if ( ++nochange_depth > max_nochange_depth )
4836 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4837 } else {
4838 nochange_depth = 0;
4839 }
4840 {
4841 /* execute the code in the {...} */
4842
4843 dSP;
4844 IV before;
4845 OP * const oop = PL_op;
4846 COP * const ocurcop = PL_curcop;
4847 OP *nop;
4848 char *saved_regeol = PL_regeol;
4849 struct re_save_state saved_state;
4850 CV *newcv;
4851
4852 /* save *all* paren positions */
4853 regcppush(rex, 0, maxopenparen);
4854 REGCP_SET(runops_cp);
4855
4856 /* To not corrupt the existing regex state while executing the
4857 * eval we would normally put it on the save stack, like with
4858 * save_re_context. However, re-evals have a weird scoping so we
4859 * can't just add ENTER/LEAVE here. With that, things like
4860 *
4861 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4862 *
4863 * would break, as they expect the localisation to be unwound
4864 * only when the re-engine backtracks through the bit that
4865 * localised it.
4866 *
4867 * What we do instead is just saving the state in a local c
4868 * variable.
4869 */
4870 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4871
4872 PL_reg_state.re_reparsing = FALSE;
4873
4874 if (!caller_cv)
4875 caller_cv = find_runcv(NULL);
4876
4877 n = ARG(scan);
4878
4879 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4880 newcv = (ReANY(
4881 (REGEXP*)(rexi->data->data[n])
4882 ))->qr_anoncv
4883 ;
4884 nop = (OP*)rexi->data->data[n+1];
4885 }
4886 else if (rexi->data->what[n] == 'l') { /* literal code */
4887 newcv = caller_cv;
4888 nop = (OP*)rexi->data->data[n];
4889 assert(CvDEPTH(newcv));
4890 }
4891 else {
4892 /* literal with own CV */
4893 assert(rexi->data->what[n] == 'L');
4894 newcv = rex->qr_anoncv;
4895 nop = (OP*)rexi->data->data[n];
4896 }
4897
4898 /* normally if we're about to execute code from the same
4899 * CV that we used previously, we just use the existing
4900 * CX stack entry. However, its possible that in the
4901 * meantime we may have backtracked, popped from the save
4902 * stack, and undone the SAVECOMPPAD(s) associated with
4903 * PUSH_MULTICALL; in which case PL_comppad no longer
4904 * points to newcv's pad. */
4905 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4906 {
4907 I32 depth = (newcv == caller_cv) ? 0 : 1;
4908 if (last_pushed_cv) {
4909 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4910 }
4911 else {
4912 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4913 }
4914 last_pushed_cv = newcv;
4915 }
4916 else {
4917 /* these assignments are just to silence compiler
4918 * warnings */
4919 multicall_cop = NULL;
4920 newsp = NULL;
4921 }
4922 last_pad = PL_comppad;
4923
4924 /* the initial nextstate you would normally execute
4925 * at the start of an eval (which would cause error
4926 * messages to come from the eval), may be optimised
4927 * away from the execution path in the regex code blocks;
4928 * so manually set PL_curcop to it initially */
4929 {
4930 OP *o = cUNOPx(nop)->op_first;
4931 assert(o->op_type == OP_NULL);
4932 if (o->op_targ == OP_SCOPE) {
4933 o = cUNOPo->op_first;
4934 }
4935 else {
4936 assert(o->op_targ == OP_LEAVE);
4937 o = cUNOPo->op_first;
4938 assert(o->op_type == OP_ENTER);
4939 o = o->op_sibling;
4940 }
4941
4942 if (o->op_type != OP_STUB) {
4943 assert( o->op_type == OP_NEXTSTATE
4944 || o->op_type == OP_DBSTATE
4945 || (o->op_type == OP_NULL
4946 && ( o->op_targ == OP_NEXTSTATE
4947 || o->op_targ == OP_DBSTATE
4948 )
4949 )
4950 );
4951 PL_curcop = (COP*)o;
4952 }
4953 }
4954 nop = nop->op_next;
4955
4956 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4957 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4958
4959 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4960
4961 if (sv_yes_mark) {
4962 SV *sv_mrk = get_sv("REGMARK", 1);
4963 sv_setsv(sv_mrk, sv_yes_mark);
4964 }
4965
4966 /* we don't use MULTICALL here as we want to call the
4967 * first op of the block of interest, rather than the
4968 * first op of the sub */
4969 before = (IV)(SP-PL_stack_base);
4970 PL_op = nop;
4971 CALLRUNOPS(aTHX); /* Scalar context. */
4972 SPAGAIN;
4973 if ((IV)(SP-PL_stack_base) == before)
4974 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4975 else {
4976 ret = POPs;
4977 PUTBACK;
4978 }
4979
4980 /* before restoring everything, evaluate the returned
4981 * value, so that 'uninit' warnings don't use the wrong
4982 * PL_op or pad. Also need to process any magic vars
4983 * (e.g. $1) *before* parentheses are restored */
4984
4985 PL_op = NULL;
4986
4987 re_sv = NULL;
4988 if (logical == 0) /* (?{})/ */
4989 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4990 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4991 sw = cBOOL(SvTRUE(ret));
4992 logical = 0;
4993 }
4994 else { /* /(??{}) */
4995 /* if its overloaded, let the regex compiler handle
4996 * it; otherwise extract regex, or stringify */
4997 if (!SvAMAGIC(ret)) {
4998 SV *sv = ret;
4999 if (SvROK(sv))
5000 sv = SvRV(sv);
5001 if (SvTYPE(sv) == SVt_REGEXP)
5002 re_sv = (REGEXP*) sv;
5003 else if (SvSMAGICAL(sv)) {
5004 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
5005 if (mg)
5006 re_sv = (REGEXP *) mg->mg_obj;
5007 }
5008
5009 /* force any magic, undef warnings here */
5010 if (!re_sv) {
5011 ret = sv_mortalcopy(ret);
5012 (void) SvPV_force_nolen(ret);
5013 }
5014 }
5015
5016 }
5017
5018 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
5019
5020 /* *** Note that at this point we don't restore
5021 * PL_comppad, (or pop the CxSUB) on the assumption it may
5022 * be used again soon. This is safe as long as nothing
5023 * in the regexp code uses the pad ! */
5024 PL_op = oop;
5025 PL_curcop = ocurcop;
5026 PL_regeol = saved_regeol;
5027 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5028
5029 if (logical != 2)
5030 break;
5031 }
5032
5033 /* only /(??{})/ from now on */
5034 logical = 0;
5035 {
5036 /* extract RE object from returned value; compiling if
5037 * necessary */
5038
5039 if (re_sv) {
5040 re_sv = reg_temp_copy(NULL, re_sv);
5041 }
5042 else {
5043 U32 pm_flags = 0;
5044
5045 if (SvUTF8(ret) && IN_BYTES) {
5046 /* In use 'bytes': make a copy of the octet
5047 * sequence, but without the flag on */
5048 STRLEN len;
5049 const char *const p = SvPV(ret, len);
5050 ret = newSVpvn_flags(p, len, SVs_TEMP);
5051 }
5052 if (rex->intflags & PREGf_USE_RE_EVAL)
5053 pm_flags |= PMf_USE_RE_EVAL;
5054
5055 /* if we got here, it should be an engine which
5056 * supports compiling code blocks and stuff */
5057 assert(rex->engine && rex->engine->op_comp);
5058 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5059 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5060 rex->engine, NULL, NULL,
5061 /* copy /msix etc to inner pattern */
5062 scan->flags,
5063 pm_flags);
5064
5065 if (!(SvFLAGS(ret)
5066 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5067 | SVs_GMG))) {
5068 /* This isn't a first class regexp. Instead, it's
5069 caching a regexp onto an existing, Perl visible
5070 scalar. */
5071 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5072 }
5073 /* safe to do now that any $1 etc has been
5074 * interpolated into the new pattern string and
5075 * compiled */
5076 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5077 }
5078 SAVEFREESV(re_sv);
5079 re = ReANY(re_sv);
5080 }
5081 RXp_MATCH_COPIED_off(re);
5082 re->subbeg = rex->subbeg;
5083 re->sublen = rex->sublen;
5084 re->suboffset = rex->suboffset;
5085 re->subcoffset = rex->subcoffset;
5086 rei = RXi_GET(re);
5087 DEBUG_EXECUTE_r(
5088 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5089 "Matching embedded");
5090 );
5091 startpoint = rei->program + 1;
5092 ST.close_paren = 0; /* only used for GOSUB */
5093
5094 eval_recurse_doit: /* Share code with GOSUB below this line */
5095 /* run the pattern returned from (??{...}) */
5096
5097 /* Save *all* the positions. */
5098 ST.cp = regcppush(rex, 0, maxopenparen);
5099 REGCP_SET(ST.lastcp);
5100
5101 re->lastparen = 0;
5102 re->lastcloseparen = 0;
5103
5104 maxopenparen = 0;
5105
5106 /* XXXX This is too dramatic a measure... */
5107 PL_reg_maxiter = 0;
5108
5109 ST.toggle_reg_flags = PL_reg_flags;
5110 if (RX_UTF8(re_sv))
5111 PL_reg_flags |= RF_utf8;
5112 else
5113 PL_reg_flags &= ~RF_utf8;
5114 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5115
5116 ST.prev_rex = rex_sv;
5117 ST.prev_curlyx = cur_curlyx;
5118 rex_sv = re_sv;
5119 SET_reg_curpm(rex_sv);
5120 rex = re;
5121 rexi = rei;
5122 cur_curlyx = NULL;
5123 ST.B = next;
5124 ST.prev_eval = cur_eval;
5125 cur_eval = st;
5126 /* now continue from first node in postoned RE */
5127 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5128 assert(0); /* NOTREACHED */
5129 }
5130
5131 case EVAL_AB: /* cleanup after a successful (??{A})B */
5132 /* note: this is called twice; first after popping B, then A */
5133 PL_reg_flags ^= ST.toggle_reg_flags;
5134 rex_sv = ST.prev_rex;
5135 SET_reg_curpm(rex_sv);
5136 rex = ReANY(rex_sv);
5137 rexi = RXi_GET(rex);
5138 regcpblow(ST.cp);
5139 cur_eval = ST.prev_eval;
5140 cur_curlyx = ST.prev_curlyx;
5141
5142 /* XXXX This is too dramatic a measure... */
5143 PL_reg_maxiter = 0;
5144 if ( nochange_depth )
5145 nochange_depth--;
5146 sayYES;
5147
5148
5149 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5150 /* note: this is called twice; first after popping B, then A */
5151 PL_reg_flags ^= ST.toggle_reg_flags;
5152 rex_sv = ST.prev_rex;
5153 SET_reg_curpm(rex_sv);
5154 rex = ReANY(rex_sv);
5155 rexi = RXi_GET(rex);
5156
5157 REGCP_UNWIND(ST.lastcp);
5158 regcppop(rex, &maxopenparen);
5159 cur_eval = ST.prev_eval;
5160 cur_curlyx = ST.prev_curlyx;
5161 /* XXXX This is too dramatic a measure... */
5162 PL_reg_maxiter = 0;
5163 if ( nochange_depth )
5164 nochange_depth--;
5165 sayNO_SILENT;
5166#undef ST
5167
5168 case OPEN: /* ( */
5169 n = ARG(scan); /* which paren pair */
5170 rex->offs[n].start_tmp = locinput - PL_bostr;
5171 if (n > maxopenparen)
5172 maxopenparen = n;
5173 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5174 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5175 PTR2UV(rex),
5176 PTR2UV(rex->offs),
5177 (UV)n,
5178 (IV)rex->offs[n].start_tmp,
5179 (UV)maxopenparen
5180 ));
5181 lastopen = n;
5182 break;
5183
5184/* XXX really need to log other places start/end are set too */
5185#define CLOSE_CAPTURE \
5186 rex->offs[n].start = rex->offs[n].start_tmp; \
5187 rex->offs[n].end = locinput - PL_bostr; \
5188 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5189 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5190 PTR2UV(rex), \
5191 PTR2UV(rex->offs), \
5192 (UV)n, \
5193 (IV)rex->offs[n].start, \
5194 (IV)rex->offs[n].end \
5195 ))
5196
5197 case CLOSE: /* ) */
5198 n = ARG(scan); /* which paren pair */
5199 CLOSE_CAPTURE;
5200 if (n > rex->lastparen)
5201 rex->lastparen = n;
5202 rex->lastcloseparen = n;
5203 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5204 goto fake_end;
5205 }
5206 break;
5207
5208 case ACCEPT: /* (*ACCEPT) */
5209 if (ARG(scan)){
5210 regnode *cursor;
5211 for (cursor=scan;
5212 cursor && OP(cursor)!=END;
5213 cursor=regnext(cursor))
5214 {
5215 if ( OP(cursor)==CLOSE ){
5216 n = ARG(cursor);
5217 if ( n <= lastopen ) {
5218 CLOSE_CAPTURE;
5219 if (n > rex->lastparen)
5220 rex->lastparen = n;
5221 rex->lastcloseparen = n;
5222 if ( n == ARG(scan) || (cur_eval &&
5223 cur_eval->u.eval.close_paren == n))
5224 break;
5225 }
5226 }
5227 }
5228 }
5229 goto fake_end;
5230 /*NOTREACHED*/
5231
5232 case GROUPP: /* (?(1)) */
5233 n = ARG(scan); /* which paren pair */
5234 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5235 break;
5236
5237 case NGROUPP: /* (?(<name>)) */
5238 /* reg_check_named_buff_matched returns 0 for no match */
5239 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5240 break;
5241
5242 case INSUBP: /* (?(R)) */
5243 n = ARG(scan);
5244 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5245 break;
5246
5247 case DEFINEP: /* (?(DEFINE)) */
5248 sw = 0;
5249 break;
5250
5251 case IFTHEN: /* (?(cond)A|B) */
5252 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5253 if (sw)
5254 next = NEXTOPER(NEXTOPER(scan));
5255 else {
5256 next = scan + ARG(scan);
5257 if (OP(next) == IFTHEN) /* Fake one. */
5258 next = NEXTOPER(NEXTOPER(next));
5259 }
5260 break;
5261
5262 case LOGICAL: /* modifier for EVAL and IFMATCH */
5263 logical = scan->flags;
5264 break;
5265
5266/*******************************************************************
5267
5268The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5269pattern, where A and B are subpatterns. (For simple A, CURLYM or
5270STAR/PLUS/CURLY/CURLYN are used instead.)
5271
5272A*B is compiled as <CURLYX><A><WHILEM><B>
5273
5274On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5275state, which contains the current count, initialised to -1. It also sets
5276cur_curlyx to point to this state, with any previous value saved in the
5277state block.
5278
5279CURLYX then jumps straight to the WHILEM op, rather than executing A,
5280since the pattern may possibly match zero times (i.e. it's a while {} loop
5281rather than a do {} while loop).
5282
5283Each entry to WHILEM represents a successful match of A. The count in the
5284CURLYX block is incremented, another WHILEM state is pushed, and execution
5285passes to A or B depending on greediness and the current count.
5286
5287For example, if matching against the string a1a2a3b (where the aN are
5288substrings that match /A/), then the match progresses as follows: (the
5289pushed states are interspersed with the bits of strings matched so far):
5290
5291 <CURLYX cnt=-1>
5292 <CURLYX cnt=0><WHILEM>
5293 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5294 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5295 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5296 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5297
5298(Contrast this with something like CURLYM, which maintains only a single
5299backtrack state:
5300
5301 <CURLYM cnt=0> a1
5302 a1 <CURLYM cnt=1> a2
5303 a1 a2 <CURLYM cnt=2> a3
5304 a1 a2 a3 <CURLYM cnt=3> b
5305)
5306
5307Each WHILEM state block marks a point to backtrack to upon partial failure
5308of A or B, and also contains some minor state data related to that
5309iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5310overall state, such as the count, and pointers to the A and B ops.
5311
5312This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5313must always point to the *current* CURLYX block, the rules are:
5314
5315When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5316and set cur_curlyx to point the new block.
5317
5318When popping the CURLYX block after a successful or unsuccessful match,
5319restore the previous cur_curlyx.
5320
5321When WHILEM is about to execute B, save the current cur_curlyx, and set it
5322to the outer one saved in the CURLYX block.
5323
5324When popping the WHILEM block after a successful or unsuccessful B match,
5325restore the previous cur_curlyx.
5326
5327Here's an example for the pattern (AI* BI)*BO
5328I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5329
5330cur_
5331curlyx backtrack stack
5332------ ---------------
5333NULL
5334CO <CO prev=NULL> <WO>
5335CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5336CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5337NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5338
5339At this point the pattern succeeds, and we work back down the stack to
5340clean up, restoring as we go:
5341
5342CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5343CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5344CO <CO prev=NULL> <WO>
5345NULL
5346
5347*******************************************************************/
5348
5349#define ST st->u.curlyx
5350
5351 case CURLYX: /* start of /A*B/ (for complex A) */
5352 {
5353 /* No need to save/restore up to this paren */
5354 I32 parenfloor = scan->flags;
5355
5356 assert(next); /* keep Coverity happy */
5357 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5358 next += ARG(next);
5359
5360 /* XXXX Probably it is better to teach regpush to support
5361 parenfloor > maxopenparen ... */
5362 if (parenfloor > (I32)rex->lastparen)
5363 parenfloor = rex->lastparen; /* Pessimization... */
5364
5365 ST.prev_curlyx= cur_curlyx;
5366 cur_curlyx = st;
5367 ST.cp = PL_savestack_ix;
5368
5369 /* these fields contain the state of the current curly.
5370 * they are accessed by subsequent WHILEMs */
5371 ST.parenfloor = parenfloor;
5372 ST.me = scan;
5373 ST.B = next;
5374 ST.minmod = minmod;
5375 minmod = 0;
5376 ST.count = -1; /* this will be updated by WHILEM */
5377 ST.lastloc = NULL; /* this will be updated by WHILEM */
5378
5379 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5380 assert(0); /* NOTREACHED */
5381 }
5382
5383 case CURLYX_end: /* just finished matching all of A*B */
5384 cur_curlyx = ST.prev_curlyx;
5385 sayYES;
5386 assert(0); /* NOTREACHED */
5387
5388 case CURLYX_end_fail: /* just failed to match all of A*B */
5389 regcpblow(ST.cp);
5390 cur_curlyx = ST.prev_curlyx;
5391 sayNO;
5392 assert(0); /* NOTREACHED */
5393
5394
5395#undef ST
5396#define ST st->u.whilem
5397
5398 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5399 {
5400 /* see the discussion above about CURLYX/WHILEM */
5401 I32 n;
5402 int min = ARG1(cur_curlyx->u.curlyx.me);
5403 int max = ARG2(cur_curlyx->u.curlyx.me);
5404 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5405
5406 assert(cur_curlyx); /* keep Coverity happy */
5407 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5408 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5409 ST.cache_offset = 0;
5410 ST.cache_mask = 0;
5411
5412
5413 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5414 "%*s whilem: matched %ld out of %d..%d\n",
5415 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5416 );
5417
5418 /* First just match a string of min A's. */
5419
5420 if (n < min) {
5421 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5422 maxopenparen);
5423 cur_curlyx->u.curlyx.lastloc = locinput;
5424 REGCP_SET(ST.lastcp);
5425
5426 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5427 assert(0); /* NOTREACHED */
5428 }
5429
5430 /* If degenerate A matches "", assume A done. */
5431
5432 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5433 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5434 "%*s whilem: empty match detected, trying continuation...\n",
5435 REPORT_CODE_OFF+depth*2, "")
5436 );
5437 goto do_whilem_B_max;
5438 }
5439
5440 /* super-linear cache processing */
5441
5442 if (scan->flags) {
5443
5444 if (!PL_reg_maxiter) {
5445 /* start the countdown: Postpone detection until we
5446 * know the match is not *that* much linear. */
5447 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5448 /* possible overflow for long strings and many CURLYX's */
5449 if (PL_reg_maxiter < 0)
5450 PL_reg_maxiter = I32_MAX;
5451 PL_reg_leftiter = PL_reg_maxiter;
5452 }
5453
5454 if (PL_reg_leftiter-- == 0) {
5455 /* initialise cache */
5456 const I32 size = (PL_reg_maxiter + 7)/8;
5457 if (PL_reg_poscache) {
5458 if ((I32)PL_reg_poscache_size < size) {
5459 Renew(PL_reg_poscache, size, char);
5460 PL_reg_poscache_size = size;
5461 }
5462 Zero(PL_reg_poscache, size, char);
5463 }
5464 else {
5465 PL_reg_poscache_size = size;
5466 Newxz(PL_reg_poscache, size, char);
5467 }
5468 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5469 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5470 PL_colors[4], PL_colors[5])
5471 );
5472 }
5473
5474 if (PL_reg_leftiter < 0) {
5475 /* have we already failed at this position? */
5476 I32 offset, mask;
5477 offset = (scan->flags & 0xf) - 1
5478 + (locinput - PL_bostr) * (scan->flags>>4);
5479 mask = 1 << (offset % 8);
5480 offset /= 8;
5481 if (PL_reg_poscache[offset] & mask) {
5482 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5483 "%*s whilem: (cache) already tried at this position...\n",
5484 REPORT_CODE_OFF+depth*2, "")
5485 );
5486 sayNO; /* cache records failure */
5487 }
5488 ST.cache_offset = offset;
5489 ST.cache_mask = mask;
5490 }
5491 }
5492
5493 /* Prefer B over A for minimal matching. */
5494
5495 if (cur_curlyx->u.curlyx.minmod) {
5496 ST.save_curlyx = cur_curlyx;
5497 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5498 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5499 maxopenparen);
5500 REGCP_SET(ST.lastcp);
5501 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5502 locinput);
5503 assert(0); /* NOTREACHED */
5504 }
5505
5506 /* Prefer A over B for maximal matching. */
5507
5508 if (n < max) { /* More greed allowed? */
5509 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5510 maxopenparen);
5511 cur_curlyx->u.curlyx.lastloc = locinput;
5512 REGCP_SET(ST.lastcp);
5513 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5514 assert(0); /* NOTREACHED */
5515 }
5516 goto do_whilem_B_max;
5517 }
5518 assert(0); /* NOTREACHED */
5519
5520 case WHILEM_B_min: /* just matched B in a minimal match */
5521 case WHILEM_B_max: /* just matched B in a maximal match */
5522 cur_curlyx = ST.save_curlyx;
5523 sayYES;
5524 assert(0); /* NOTREACHED */
5525
5526 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5527 cur_curlyx = ST.save_curlyx;
5528 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5529 cur_curlyx->u.curlyx.count--;
5530 CACHEsayNO;
5531 assert(0); /* NOTREACHED */
5532
5533 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5534 /* FALL THROUGH */
5535 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5536 REGCP_UNWIND(ST.lastcp);
5537 regcppop(rex, &maxopenparen);
5538 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5539 cur_curlyx->u.curlyx.count--;
5540 CACHEsayNO;
5541 assert(0); /* NOTREACHED */
5542
5543 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5544 REGCP_UNWIND(ST.lastcp);
5545 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5546 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5547 "%*s whilem: failed, trying continuation...\n",
5548 REPORT_CODE_OFF+depth*2, "")
5549 );
5550 do_whilem_B_max:
5551 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5552 && ckWARN(WARN_REGEXP)
5553 && !(PL_reg_flags & RF_warned))
5554 {
5555 PL_reg_flags |= RF_warned;
5556 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5557 "Complex regular subexpression recursion limit (%d) "
5558 "exceeded",
5559 REG_INFTY - 1);
5560 }
5561
5562 /* now try B */
5563 ST.save_curlyx = cur_curlyx;
5564 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5565 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5566 locinput);
5567 assert(0); /* NOTREACHED */
5568
5569 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5570 cur_curlyx = ST.save_curlyx;
5571 REGCP_UNWIND(ST.lastcp);
5572 regcppop(rex, &maxopenparen);
5573
5574 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5575 /* Maximum greed exceeded */
5576 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5577 && ckWARN(WARN_REGEXP)
5578 && !(PL_reg_flags & RF_warned))
5579 {
5580 PL_reg_flags |= RF_warned;
5581 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5582 "Complex regular subexpression recursion "
5583 "limit (%d) exceeded",
5584 REG_INFTY - 1);
5585 }
5586 cur_curlyx->u.curlyx.count--;
5587 CACHEsayNO;
5588 }
5589
5590 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5591 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5592 );
5593 /* Try grabbing another A and see if it helps. */
5594 cur_curlyx->u.curlyx.lastloc = locinput;
5595 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5596 maxopenparen);
5597 REGCP_SET(ST.lastcp);
5598 PUSH_STATE_GOTO(WHILEM_A_min,
5599 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5600 locinput);
5601 assert(0); /* NOTREACHED */
5602
5603#undef ST
5604#define ST st->u.branch
5605
5606 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5607 next = scan + ARG(scan);
5608 if (next == scan)
5609 next = NULL;
5610 scan = NEXTOPER(scan);
5611 /* FALL THROUGH */
5612
5613 case BRANCH: /* /(...|A|...)/ */
5614 scan = NEXTOPER(scan); /* scan now points to inner node */
5615 ST.lastparen = rex->lastparen;
5616 ST.lastcloseparen = rex->lastcloseparen;
5617 ST.next_branch = next;
5618 REGCP_SET(ST.cp);
5619
5620 /* Now go into the branch */
5621 if (has_cutgroup) {
5622 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5623 } else {
5624 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5625 }
5626 assert(0); /* NOTREACHED */
5627
5628 case CUTGROUP: /* /(*THEN)/ */
5629 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5630 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5631 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5632 assert(0); /* NOTREACHED */
5633
5634 case CUTGROUP_next_fail:
5635 do_cutgroup = 1;
5636 no_final = 1;
5637 if (st->u.mark.mark_name)
5638 sv_commit = st->u.mark.mark_name;
5639 sayNO;
5640 assert(0); /* NOTREACHED */
5641
5642 case BRANCH_next:
5643 sayYES;
5644 assert(0); /* NOTREACHED */
5645
5646 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5647 if (do_cutgroup) {
5648 do_cutgroup = 0;
5649 no_final = 0;
5650 }
5651 REGCP_UNWIND(ST.cp);
5652 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5653 scan = ST.next_branch;
5654 /* no more branches? */
5655 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5656 DEBUG_EXECUTE_r({
5657 PerlIO_printf( Perl_debug_log,
5658 "%*s %sBRANCH failed...%s\n",
5659 REPORT_CODE_OFF+depth*2, "",
5660 PL_colors[4],
5661 PL_colors[5] );
5662 });
5663 sayNO_SILENT;
5664 }
5665 continue; /* execute next BRANCH[J] op */
5666 assert(0); /* NOTREACHED */
5667
5668 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5669 minmod = 1;
5670 break;
5671
5672#undef ST
5673#define ST st->u.curlym
5674
5675 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5676
5677 /* This is an optimisation of CURLYX that enables us to push
5678 * only a single backtracking state, no matter how many matches
5679 * there are in {m,n}. It relies on the pattern being constant
5680 * length, with no parens to influence future backrefs
5681 */
5682
5683 ST.me = scan;
5684 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5685
5686 ST.lastparen = rex->lastparen;
5687 ST.lastcloseparen = rex->lastcloseparen;
5688
5689 /* if paren positive, emulate an OPEN/CLOSE around A */
5690 if (ST.me->flags) {
5691 U32 paren = ST.me->flags;
5692 if (paren > maxopenparen)
5693 maxopenparen = paren;
5694 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5695 }
5696 ST.A = scan;
5697 ST.B = next;
5698 ST.alen = 0;
5699 ST.count = 0;
5700 ST.minmod = minmod;
5701 minmod = 0;
5702 ST.c1 = CHRTEST_UNINIT;
5703 REGCP_SET(ST.cp);
5704
5705 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5706 goto curlym_do_B;
5707
5708 curlym_do_A: /* execute the A in /A{m,n}B/ */
5709 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5710 assert(0); /* NOTREACHED */
5711
5712 case CURLYM_A: /* we've just matched an A */
5713 ST.count++;
5714 /* after first match, determine A's length: u.curlym.alen */
5715 if (ST.count == 1) {
5716 if (PL_reg_match_utf8) {
5717 char *s = st->locinput;
5718 while (s < locinput) {
5719 ST.alen++;
5720 s += UTF8SKIP(s);
5721 }
5722 }
5723 else {
5724 ST.alen = locinput - st->locinput;
5725 }
5726 if (ST.alen == 0)
5727 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5728 }
5729 DEBUG_EXECUTE_r(
5730 PerlIO_printf(Perl_debug_log,
5731 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5732 (int)(REPORT_CODE_OFF+(depth*2)), "",
5733 (IV) ST.count, (IV)ST.alen)
5734 );
5735
5736 if (cur_eval && cur_eval->u.eval.close_paren &&
5737 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5738 goto fake_end;
5739
5740 {
5741 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5742 if ( max == REG_INFTY || ST.count < max )
5743 goto curlym_do_A; /* try to match another A */
5744 }
5745 goto curlym_do_B; /* try to match B */
5746
5747 case CURLYM_A_fail: /* just failed to match an A */
5748 REGCP_UNWIND(ST.cp);
5749
5750 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5751 || (cur_eval && cur_eval->u.eval.close_paren &&
5752 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5753 sayNO;
5754
5755 curlym_do_B: /* execute the B in /A{m,n}B/ */
5756 if (ST.c1 == CHRTEST_UNINIT) {
5757 /* calculate c1 and c2 for possible match of 1st char
5758 * following curly */
5759 ST.c1 = ST.c2 = CHRTEST_VOID;
5760 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5761 regnode *text_node = ST.B;
5762 if (! HAS_TEXT(text_node))
5763 FIND_NEXT_IMPT(text_node);
5764 /* this used to be
5765
5766 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5767
5768 But the former is redundant in light of the latter.
5769
5770 if this changes back then the macro for
5771 IS_TEXT and friends need to change.
5772 */
5773 if (PL_regkind[OP(text_node)] == EXACT) {
5774 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5775 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5776 {
5777 sayNO;
5778 }
5779 }
5780 }
5781 }
5782
5783 DEBUG_EXECUTE_r(
5784 PerlIO_printf(Perl_debug_log,
5785 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5786 (int)(REPORT_CODE_OFF+(depth*2)),
5787 "", (IV)ST.count)
5788 );
5789 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5790 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5791 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5792 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5793 {
5794 /* simulate B failing */
5795 DEBUG_OPTIMISE_r(
5796 PerlIO_printf(Perl_debug_log,
5797 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5798 (int)(REPORT_CODE_OFF+(depth*2)),"",
5799 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5800 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5801 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5802 );
5803 state_num = CURLYM_B_fail;
5804 goto reenter_switch;
5805 }
5806 }
5807 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5808 /* simulate B failing */
5809 DEBUG_OPTIMISE_r(
5810 PerlIO_printf(Perl_debug_log,
5811 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5812 (int)(REPORT_CODE_OFF+(depth*2)),"",
5813 (int) nextchr, ST.c1, ST.c2)
5814 );
5815 state_num = CURLYM_B_fail;
5816 goto reenter_switch;
5817 }
5818 }
5819
5820 if (ST.me->flags) {
5821 /* emulate CLOSE: mark current A as captured */
5822 I32 paren = ST.me->flags;
5823 if (ST.count) {
5824 rex->offs[paren].start
5825 = HOPc(locinput, -ST.alen) - PL_bostr;
5826 rex->offs[paren].end = locinput - PL_bostr;
5827 if ((U32)paren > rex->lastparen)
5828 rex->lastparen = paren;
5829 rex->lastcloseparen = paren;
5830 }
5831 else
5832 rex->offs[paren].end = -1;
5833 if (cur_eval && cur_eval->u.eval.close_paren &&
5834 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5835 {
5836 if (ST.count)
5837 goto fake_end;
5838 else
5839 sayNO;
5840 }
5841 }
5842
5843 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5844 assert(0); /* NOTREACHED */
5845
5846 case CURLYM_B_fail: /* just failed to match a B */
5847 REGCP_UNWIND(ST.cp);
5848 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5849 if (ST.minmod) {
5850 I32 max = ARG2(ST.me);
5851 if (max != REG_INFTY && ST.count == max)
5852 sayNO;
5853 goto curlym_do_A; /* try to match a further A */
5854 }
5855 /* backtrack one A */
5856 if (ST.count == ARG1(ST.me) /* min */)
5857 sayNO;
5858 ST.count--;
5859 SET_locinput(HOPc(locinput, -ST.alen));
5860 goto curlym_do_B; /* try to match B */
5861
5862#undef ST
5863#define ST st->u.curly
5864
5865#define CURLY_SETPAREN(paren, success) \
5866 if (paren) { \
5867 if (success) { \
5868 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5869 rex->offs[paren].end = locinput - PL_bostr; \
5870 if (paren > rex->lastparen) \
5871 rex->lastparen = paren; \
5872 rex->lastcloseparen = paren; \
5873 } \
5874 else { \
5875 rex->offs[paren].end = -1; \
5876 rex->lastparen = ST.lastparen; \
5877 rex->lastcloseparen = ST.lastcloseparen; \
5878 } \
5879 }
5880
5881 case STAR: /* /A*B/ where A is width 1 char */
5882 ST.paren = 0;
5883 ST.min = 0;
5884 ST.max = REG_INFTY;
5885 scan = NEXTOPER(scan);
5886 goto repeat;
5887
5888 case PLUS: /* /A+B/ where A is width 1 char */
5889 ST.paren = 0;
5890 ST.min = 1;
5891 ST.max = REG_INFTY;
5892 scan = NEXTOPER(scan);
5893 goto repeat;
5894
5895 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5896 ST.paren = scan->flags; /* Which paren to set */
5897 ST.lastparen = rex->lastparen;
5898 ST.lastcloseparen = rex->lastcloseparen;
5899 if (ST.paren > maxopenparen)
5900 maxopenparen = ST.paren;
5901 ST.min = ARG1(scan); /* min to match */
5902 ST.max = ARG2(scan); /* max to match */
5903 if (cur_eval && cur_eval->u.eval.close_paren &&
5904 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5905 ST.min=1;
5906 ST.max=1;
5907 }
5908 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5909 goto repeat;
5910
5911 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5912 ST.paren = 0;
5913 ST.min = ARG1(scan); /* min to match */
5914 ST.max = ARG2(scan); /* max to match */
5915 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5916 repeat:
5917 /*
5918 * Lookahead to avoid useless match attempts
5919 * when we know what character comes next.
5920 *
5921 * Used to only do .*x and .*?x, but now it allows
5922 * for )'s, ('s and (?{ ... })'s to be in the way
5923 * of the quantifier and the EXACT-like node. -- japhy
5924 */
5925
5926 assert(ST.min <= ST.max);
5927 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5928 ST.c1 = ST.c2 = CHRTEST_VOID;
5929 }
5930 else {
5931 regnode *text_node = next;
5932
5933 if (! HAS_TEXT(text_node))
5934 FIND_NEXT_IMPT(text_node);
5935
5936 if (! HAS_TEXT(text_node))
5937 ST.c1 = ST.c2 = CHRTEST_VOID;
5938 else {
5939 if ( PL_regkind[OP(text_node)] != EXACT ) {
5940 ST.c1 = ST.c2 = CHRTEST_VOID;
5941 }
5942 else {
5943
5944 /* Currently we only get here when
5945
5946 PL_rekind[OP(text_node)] == EXACT
5947
5948 if this changes back then the macro for IS_TEXT and
5949 friends need to change. */
5950 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5951 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5952 {
5953 sayNO;
5954 }
5955 }
5956 }
5957 }
5958
5959 ST.A = scan;
5960 ST.B = next;
5961 if (minmod) {
5962 char *li = locinput;
5963 minmod = 0;
5964 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5965 sayNO;
5966 SET_locinput(li);
5967 ST.count = ST.min;
5968 REGCP_SET(ST.cp);
5969 if (ST.c1 == CHRTEST_VOID)
5970 goto curly_try_B_min;
5971
5972 ST.oldloc = locinput;
5973
5974 /* set ST.maxpos to the furthest point along the
5975 * string that could possibly match */
5976 if (ST.max == REG_INFTY) {
5977 ST.maxpos = PL_regeol - 1;
5978 if (utf8_target)
5979 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5980 ST.maxpos--;
5981 }
5982 else if (utf8_target) {
5983 int m = ST.max - ST.min;
5984 for (ST.maxpos = locinput;
5985 m >0 && ST.maxpos < PL_regeol; m--)
5986 ST.maxpos += UTF8SKIP(ST.maxpos);
5987 }
5988 else {
5989 ST.maxpos = locinput + ST.max - ST.min;
5990 if (ST.maxpos >= PL_regeol)
5991 ST.maxpos = PL_regeol - 1;
5992 }
5993 goto curly_try_B_min_known;
5994
5995 }
5996 else {
5997 /* avoid taking address of locinput, so it can remain
5998 * a register var */
5999 char *li = locinput;
6000 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
6001 if (ST.count < ST.min)
6002 sayNO;
6003 SET_locinput(li);
6004 if ((ST.count > ST.min)
6005 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6006 {
6007 /* A{m,n} must come at the end of the string, there's
6008 * no point in backing off ... */
6009 ST.min = ST.count;
6010 /* ...except that $ and \Z can match before *and* after
6011 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6012 We may back off by one in this case. */
6013 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6014 ST.min--;
6015 }
6016 REGCP_SET(ST.cp);
6017 goto curly_try_B_max;
6018 }
6019 assert(0); /* NOTREACHED */
6020
6021
6022 case CURLY_B_min_known_fail:
6023 /* failed to find B in a non-greedy match where c1,c2 valid */
6024
6025 REGCP_UNWIND(ST.cp);
6026 if (ST.paren) {
6027 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6028 }
6029 /* Couldn't or didn't -- move forward. */
6030 ST.oldloc = locinput;
6031 if (utf8_target)
6032 locinput += UTF8SKIP(locinput);
6033 else
6034 locinput++;
6035 ST.count++;
6036 curly_try_B_min_known:
6037 /* find the next place where 'B' could work, then call B */
6038 {
6039 int n;
6040 if (utf8_target) {
6041 n = (ST.oldloc == locinput) ? 0 : 1;
6042 if (ST.c1 == ST.c2) {
6043 /* set n to utf8_distance(oldloc, locinput) */
6044 while (locinput <= ST.maxpos
6045 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6046 {
6047 locinput += UTF8SKIP(locinput);
6048 n++;
6049 }
6050 }
6051 else {
6052 /* set n to utf8_distance(oldloc, locinput) */
6053 while (locinput <= ST.maxpos
6054 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6055 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6056 {
6057 locinput += UTF8SKIP(locinput);
6058 n++;
6059 }
6060 }
6061 }
6062 else { /* Not utf8_target */
6063 if (ST.c1 == ST.c2) {
6064 while (locinput <= ST.maxpos &&
6065 UCHARAT(locinput) != ST.c1)
6066 locinput++;
6067 }
6068 else {
6069 while (locinput <= ST.maxpos
6070 && UCHARAT(locinput) != ST.c1
6071 && UCHARAT(locinput) != ST.c2)
6072 locinput++;
6073 }
6074 n = locinput - ST.oldloc;
6075 }
6076 if (locinput > ST.maxpos)
6077 sayNO;
6078 if (n) {
6079 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6080 * at b; check that everything between oldloc and
6081 * locinput matches */
6082 char *li = ST.oldloc;
6083 ST.count += n;
6084 if (regrepeat(rex, &li, ST.A, n, depth) < n)
6085 sayNO;
6086 assert(n == REG_INFTY || locinput == li);
6087 }
6088 CURLY_SETPAREN(ST.paren, ST.count);
6089 if (cur_eval && cur_eval->u.eval.close_paren &&
6090 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6091 goto fake_end;
6092 }
6093 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6094 }
6095 assert(0); /* NOTREACHED */
6096
6097
6098 case CURLY_B_min_fail:
6099 /* failed to find B in a non-greedy match where c1,c2 invalid */
6100
6101 REGCP_UNWIND(ST.cp);
6102 if (ST.paren) {
6103 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6104 }
6105 /* failed -- move forward one */
6106 {
6107 char *li = locinput;
6108 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6109 sayNO;
6110 }
6111 locinput = li;
6112 }
6113 {
6114 ST.count++;
6115 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6116 ST.count > 0)) /* count overflow ? */
6117 {
6118 curly_try_B_min:
6119 CURLY_SETPAREN(ST.paren, ST.count);
6120 if (cur_eval && cur_eval->u.eval.close_paren &&
6121 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6122 goto fake_end;
6123 }
6124 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6125 }
6126 }
6127 sayNO;
6128 assert(0); /* NOTREACHED */
6129
6130
6131 curly_try_B_max:
6132 /* a successful greedy match: now try to match B */
6133 if (cur_eval && cur_eval->u.eval.close_paren &&
6134 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6135 goto fake_end;
6136 }
6137 {
6138 bool could_match = locinput < PL_regeol;
6139
6140 /* If it could work, try it. */
6141 if (ST.c1 != CHRTEST_VOID && could_match) {
6142 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6143 {
6144 could_match = memEQ(locinput,
6145 ST.c1_utf8,
6146 UTF8SKIP(locinput))
6147 || memEQ(locinput,
6148 ST.c2_utf8,
6149 UTF8SKIP(locinput));
6150 }
6151 else {
6152 could_match = UCHARAT(locinput) == ST.c1
6153 || UCHARAT(locinput) == ST.c2;
6154 }
6155 }
6156 if (ST.c1 == CHRTEST_VOID || could_match) {
6157 CURLY_SETPAREN(ST.paren, ST.count);
6158 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6159 assert(0); /* NOTREACHED */
6160 }
6161 }
6162 /* FALL THROUGH */
6163
6164 case CURLY_B_max_fail:
6165 /* failed to find B in a greedy match */
6166
6167 REGCP_UNWIND(ST.cp);
6168 if (ST.paren) {
6169 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6170 }
6171 /* back up. */
6172 if (--ST.count < ST.min)
6173 sayNO;
6174 locinput = HOPc(locinput, -1);
6175 goto curly_try_B_max;
6176
6177#undef ST
6178
6179 case END: /* last op of main pattern */
6180 fake_end:
6181 if (cur_eval) {
6182 /* we've just finished A in /(??{A})B/; now continue with B */
6183 st->u.eval.toggle_reg_flags
6184 = cur_eval->u.eval.toggle_reg_flags;
6185 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6186
6187 st->u.eval.prev_rex = rex_sv; /* inner */
6188
6189 /* Save *all* the positions. */
6190 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6191 rex_sv = cur_eval->u.eval.prev_rex;
6192 SET_reg_curpm(rex_sv);
6193 rex = ReANY(rex_sv);
6194 rexi = RXi_GET(rex);
6195 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6196
6197 REGCP_SET(st->u.eval.lastcp);
6198
6199 /* Restore parens of the outer rex without popping the
6200 * savestack */
6201 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6202 &maxopenparen);
6203
6204 st->u.eval.prev_eval = cur_eval;
6205 cur_eval = cur_eval->u.eval.prev_eval;
6206 DEBUG_EXECUTE_r(
6207 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6208 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6209 if ( nochange_depth )
6210 nochange_depth--;
6211
6212 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6213 locinput); /* match B */
6214 }
6215
6216 if (locinput < reginfo->till) {
6217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6218 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6219 PL_colors[4],
6220 (long)(locinput - PL_reg_starttry),
6221 (long)(reginfo->till - PL_reg_starttry),
6222 PL_colors[5]));
6223
6224 sayNO_SILENT; /* Cannot match: too short. */
6225 }
6226 sayYES; /* Success! */
6227
6228 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6229 DEBUG_EXECUTE_r(
6230 PerlIO_printf(Perl_debug_log,
6231 "%*s %ssubpattern success...%s\n",
6232 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6233 sayYES; /* Success! */
6234
6235#undef ST
6236#define ST st->u.ifmatch
6237
6238 {
6239 char *newstart;
6240
6241 case SUSPEND: /* (?>A) */
6242 ST.wanted = 1;
6243 newstart = locinput;
6244 goto do_ifmatch;
6245
6246 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6247 ST.wanted = 0;
6248 goto ifmatch_trivial_fail_test;
6249
6250 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6251 ST.wanted = 1;
6252 ifmatch_trivial_fail_test:
6253 if (scan->flags) {
6254 char * const s = HOPBACKc(locinput, scan->flags);
6255 if (!s) {
6256 /* trivial fail */
6257 if (logical) {
6258 logical = 0;
6259 sw = 1 - cBOOL(ST.wanted);
6260 }
6261 else if (ST.wanted)
6262 sayNO;
6263 next = scan + ARG(scan);
6264 if (next == scan)
6265 next = NULL;
6266 break;
6267 }
6268 newstart = s;
6269 }
6270 else
6271 newstart = locinput;
6272
6273 do_ifmatch:
6274 ST.me = scan;
6275 ST.logical = logical;
6276 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6277
6278 /* execute body of (?...A) */
6279 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6280 assert(0); /* NOTREACHED */
6281 }
6282
6283 case IFMATCH_A_fail: /* body of (?...A) failed */
6284 ST.wanted = !ST.wanted;
6285 /* FALL THROUGH */
6286
6287 case IFMATCH_A: /* body of (?...A) succeeded */
6288 if (ST.logical) {
6289 sw = cBOOL(ST.wanted);
6290 }
6291 else if (!ST.wanted)
6292 sayNO;
6293
6294 if (OP(ST.me) != SUSPEND) {
6295 /* restore old position except for (?>...) */
6296 locinput = st->locinput;
6297 }
6298 scan = ST.me + ARG(ST.me);
6299 if (scan == ST.me)
6300 scan = NULL;
6301 continue; /* execute B */
6302
6303#undef ST
6304
6305 case LONGJMP: /* alternative with many branches compiles to
6306 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6307 next = scan + ARG(scan);
6308 if (next == scan)
6309 next = NULL;
6310 break;
6311
6312 case COMMIT: /* (*COMMIT) */
6313 reginfo->cutpoint = PL_regeol;
6314 /* FALLTHROUGH */
6315
6316 case PRUNE: /* (*PRUNE) */
6317 if (!scan->flags)
6318 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6319 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6320 assert(0); /* NOTREACHED */
6321
6322 case COMMIT_next_fail:
6323 no_final = 1;
6324 /* FALLTHROUGH */
6325
6326 case OPFAIL: /* (*FAIL) */
6327 sayNO;
6328 assert(0); /* NOTREACHED */
6329
6330#define ST st->u.mark
6331 case MARKPOINT: /* (*MARK:foo) */
6332 ST.prev_mark = mark_state;
6333 ST.mark_name = sv_commit = sv_yes_mark
6334 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6335 mark_state = st;
6336 ST.mark_loc = locinput;
6337 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6338 assert(0); /* NOTREACHED */
6339
6340 case MARKPOINT_next:
6341 mark_state = ST.prev_mark;
6342 sayYES;
6343 assert(0); /* NOTREACHED */
6344
6345 case MARKPOINT_next_fail:
6346 if (popmark && sv_eq(ST.mark_name,popmark))
6347 {
6348 if (ST.mark_loc > startpoint)
6349 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6350 popmark = NULL; /* we found our mark */
6351 sv_commit = ST.mark_name;
6352
6353 DEBUG_EXECUTE_r({
6354 PerlIO_printf(Perl_debug_log,
6355 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6356 REPORT_CODE_OFF+depth*2, "",
6357 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6358 });
6359 }
6360 mark_state = ST.prev_mark;
6361 sv_yes_mark = mark_state ?
6362 mark_state->u.mark.mark_name : NULL;
6363 sayNO;
6364 assert(0); /* NOTREACHED */
6365
6366 case SKIP: /* (*SKIP) */
6367 if (scan->flags) {
6368 /* (*SKIP) : if we fail we cut here*/
6369 ST.mark_name = NULL;
6370 ST.mark_loc = locinput;
6371 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6372 } else {
6373 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6374 otherwise do nothing. Meaning we need to scan
6375 */
6376 regmatch_state *cur = mark_state;
6377 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6378
6379 while (cur) {
6380 if ( sv_eq( cur->u.mark.mark_name,
6381 find ) )
6382 {
6383 ST.mark_name = find;
6384 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6385 }
6386 cur = cur->u.mark.prev_mark;
6387 }
6388 }
6389 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6390 break;
6391
6392 case SKIP_next_fail:
6393 if (ST.mark_name) {
6394 /* (*CUT:NAME) - Set up to search for the name as we
6395 collapse the stack*/
6396 popmark = ST.mark_name;
6397 } else {
6398 /* (*CUT) - No name, we cut here.*/
6399 if (ST.mark_loc > startpoint)
6400 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6401 /* but we set sv_commit to latest mark_name if there
6402 is one so they can test to see how things lead to this
6403 cut */
6404 if (mark_state)
6405 sv_commit=mark_state->u.mark.mark_name;
6406 }
6407 no_final = 1;
6408 sayNO;
6409 assert(0); /* NOTREACHED */
6410#undef ST
6411
6412 case LNBREAK: /* \R */
6413 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6414 locinput += n;
6415 } else
6416 sayNO;
6417 break;
6418
6419#define CASE_CLASS(nAmE) \
6420 case nAmE: \
6421 if (NEXTCHR_IS_EOS) \
6422 sayNO; \
6423 if ((n=is_##nAmE(locinput,utf8_target))) { \
6424 locinput += n; \
6425 } else \
6426 sayNO; \
6427 break; \
6428 case N##nAmE: \
6429 if (NEXTCHR_IS_EOS) \
6430 sayNO; \
6431 if ((n=is_##nAmE(locinput,utf8_target))) { \
6432 sayNO; \
6433 } else { \
6434 locinput += UTF8SKIP(locinput); \
6435 } \
6436 break
6437
6438 CASE_CLASS(VERTWS); /* \v \V */
6439 CASE_CLASS(HORIZWS); /* \h \H */
6440#undef CASE_CLASS
6441
6442 default:
6443 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6444 PTR2UV(scan), OP(scan));
6445 Perl_croak(aTHX_ "regexp memory corruption");
6446
6447 /* this is a point to jump to in order to increment
6448 * locinput by one character */
6449 increment_locinput:
6450 assert(!NEXTCHR_IS_EOS);
6451 if (utf8_target) {
6452 locinput += PL_utf8skip[nextchr];
6453 /* locinput is allowed to go 1 char off the end, but not 2+ */
6454 if (locinput > PL_regeol)
6455 sayNO;
6456 }
6457 else
6458 locinput++;
6459 break;
6460
6461 } /* end switch */
6462
6463 /* switch break jumps here */
6464 scan = next; /* prepare to execute the next op and ... */
6465 continue; /* ... jump back to the top, reusing st */
6466 assert(0); /* NOTREACHED */
6467
6468 push_yes_state:
6469 /* push a state that backtracks on success */
6470 st->u.yes.prev_yes_state = yes_state;
6471 yes_state = st;
6472 /* FALL THROUGH */
6473 push_state:
6474 /* push a new regex state, then continue at scan */
6475 {
6476 regmatch_state *newst;
6477
6478 DEBUG_STACK_r({
6479 regmatch_state *cur = st;
6480 regmatch_state *curyes = yes_state;
6481 int curd = depth;
6482 regmatch_slab *slab = PL_regmatch_slab;
6483 for (;curd > -1;cur--,curd--) {
6484 if (cur < SLAB_FIRST(slab)) {
6485 slab = slab->prev;
6486 cur = SLAB_LAST(slab);
6487 }
6488 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6489 REPORT_CODE_OFF + 2 + depth * 2,"",
6490 curd, PL_reg_name[cur->resume_state],
6491 (curyes == cur) ? "yes" : ""
6492 );
6493 if (curyes == cur)
6494 curyes = cur->u.yes.prev_yes_state;
6495 }
6496 } else
6497 DEBUG_STATE_pp("push")
6498 );
6499 depth++;
6500 st->locinput = locinput;
6501 newst = st+1;
6502 if (newst > SLAB_LAST(PL_regmatch_slab))
6503 newst = S_push_slab(aTHX);
6504 PL_regmatch_state = newst;
6505
6506 locinput = pushinput;
6507 st = newst;
6508 continue;
6509 assert(0); /* NOTREACHED */
6510 }
6511 }
6512
6513 /*
6514 * We get here only if there's trouble -- normally "case END" is
6515 * the terminating point.
6516 */
6517 Perl_croak(aTHX_ "corrupted regexp pointers");
6518 /*NOTREACHED*/
6519 sayNO;
6520
6521yes:
6522 if (yes_state) {
6523 /* we have successfully completed a subexpression, but we must now
6524 * pop to the state marked by yes_state and continue from there */
6525 assert(st != yes_state);
6526#ifdef DEBUGGING
6527 while (st != yes_state) {
6528 st--;
6529 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6530 PL_regmatch_slab = PL_regmatch_slab->prev;
6531 st = SLAB_LAST(PL_regmatch_slab);
6532 }
6533 DEBUG_STATE_r({
6534 if (no_final) {
6535 DEBUG_STATE_pp("pop (no final)");
6536 } else {
6537 DEBUG_STATE_pp("pop (yes)");
6538 }
6539 });
6540 depth--;
6541 }
6542#else
6543 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6544 || yes_state > SLAB_LAST(PL_regmatch_slab))
6545 {
6546 /* not in this slab, pop slab */
6547 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6548 PL_regmatch_slab = PL_regmatch_slab->prev;
6549 st = SLAB_LAST(PL_regmatch_slab);
6550 }
6551 depth -= (st - yes_state);
6552#endif
6553 st = yes_state;
6554 yes_state = st->u.yes.prev_yes_state;
6555 PL_regmatch_state = st;
6556
6557 if (no_final)
6558 locinput= st->locinput;
6559 state_num = st->resume_state + no_final;
6560 goto reenter_switch;
6561 }
6562
6563 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6564 PL_colors[4], PL_colors[5]));
6565
6566 if (PL_reg_state.re_state_eval_setup_done) {
6567 /* each successfully executed (?{...}) block does the equivalent of
6568 * local $^R = do {...}
6569 * When popping the save stack, all these locals would be undone;
6570 * bypass this by setting the outermost saved $^R to the latest
6571 * value */
6572 if (oreplsv != GvSV(PL_replgv))
6573 sv_setsv(oreplsv, GvSV(PL_replgv));
6574 }
6575 result = 1;
6576 goto final_exit;
6577
6578no:
6579 DEBUG_EXECUTE_r(
6580 PerlIO_printf(Perl_debug_log,
6581 "%*s %sfailed...%s\n",
6582 REPORT_CODE_OFF+depth*2, "",
6583 PL_colors[4], PL_colors[5])
6584 );
6585
6586no_silent:
6587 if (no_final) {
6588 if (yes_state) {
6589 goto yes;
6590 } else {
6591 goto final_exit;
6592 }
6593 }
6594 if (depth) {
6595 /* there's a previous state to backtrack to */
6596 st--;
6597 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6598 PL_regmatch_slab = PL_regmatch_slab->prev;
6599 st = SLAB_LAST(PL_regmatch_slab);
6600 }
6601 PL_regmatch_state = st;
6602 locinput= st->locinput;
6603
6604 DEBUG_STATE_pp("pop");
6605 depth--;
6606 if (yes_state == st)
6607 yes_state = st->u.yes.prev_yes_state;
6608
6609 state_num = st->resume_state + 1; /* failure = success + 1 */
6610 goto reenter_switch;
6611 }
6612 result = 0;
6613
6614 final_exit:
6615 if (rex->intflags & PREGf_VERBARG_SEEN) {
6616 SV *sv_err = get_sv("REGERROR", 1);
6617 SV *sv_mrk = get_sv("REGMARK", 1);
6618 if (result) {
6619 sv_commit = &PL_sv_no;
6620 if (!sv_yes_mark)
6621 sv_yes_mark = &PL_sv_yes;
6622 } else {
6623 if (!sv_commit)
6624 sv_commit = &PL_sv_yes;
6625 sv_yes_mark = &PL_sv_no;
6626 }
6627 sv_setsv(sv_err, sv_commit);
6628 sv_setsv(sv_mrk, sv_yes_mark);
6629 }
6630
6631
6632 if (last_pushed_cv) {
6633 dSP;
6634 POP_MULTICALL;
6635 PERL_UNUSED_VAR(SP);
6636 }
6637
6638 /* clean up; in particular, free all slabs above current one */
6639 LEAVE_SCOPE(oldsave);
6640
6641 assert(!result || locinput - PL_bostr >= 0);
6642 return result ? locinput - PL_bostr : -1;
6643}
6644
6645/*
6646 - regrepeat - repeatedly match something simple, report how many
6647 *
6648 * What 'simple' means is a node which can be the operand of a quantifier like
6649 * '+', or {1,3}
6650 *
6651 * startposp - pointer a pointer to the start position. This is updated
6652 * to point to the byte following the highest successful
6653 * match.
6654 * p - the regnode to be repeatedly matched against.
6655 * max - maximum number of things to match.
6656 * depth - (for debugging) backtracking depth.
6657 */
6658STATIC I32
6659S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6660{
6661 dVAR;
6662 char *scan; /* Pointer to current position in target string */
6663 I32 c;
6664 char *loceol = PL_regeol; /* local version */
6665 I32 hardcount = 0; /* How many matches so far */
6666 bool utf8_target = PL_reg_match_utf8;
6667 UV utf8_flags;
6668#ifndef DEBUGGING
6669 PERL_UNUSED_ARG(depth);
6670#endif
6671
6672 PERL_ARGS_ASSERT_REGREPEAT;
6673
6674 scan = *startposp;
6675 if (max == REG_INFTY)
6676 max = I32_MAX;
6677 else if (! utf8_target && scan + max < loceol)
6678 loceol = scan + max;
6679
6680 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6681 * to the maximum of how far we should go in it (leaving it set to the real
6682 * end, if the maximum permissible would take us beyond that). This allows
6683 * us to make the loop exit condition that we haven't gone past <loceol> to
6684 * also mean that we haven't exceeded the max permissible count, saving a
6685 * test each time through the loop. But it assumes that the OP matches a
6686 * single byte, which is true for most of the OPs below when applied to a
6687 * non-UTF-8 target. Those relatively few OPs that don't have this
6688 * characteristic will have to compensate.
6689 *
6690 * There is no adjustment for UTF-8 targets, as the number of bytes per
6691 * character varies. OPs will have to test both that the count is less
6692 * than the max permissible (using <hardcount> to keep track), and that we
6693 * are still within the bounds of the string (using <loceol>. A few OPs
6694 * match a single byte no matter what the encoding. They can omit the max
6695 * test if, for the UTF-8 case, they do the adjustment that was skipped
6696 * above.
6697 *
6698 * Thus, the code above sets things up for the common case; and exceptional
6699 * cases need extra work; the common case is to make sure <scan> doesn't
6700 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6701 * count doesn't exceed the maximum permissible */
6702
6703 switch (OP(p)) {
6704 case REG_ANY:
6705 if (utf8_target) {
6706 while (scan < loceol && hardcount < max && *scan != '\n') {
6707 scan += UTF8SKIP(scan);
6708 hardcount++;
6709 }
6710 } else {
6711 while (scan < loceol && *scan != '\n')
6712 scan++;
6713 }
6714 break;
6715 case SANY:
6716 if (utf8_target) {
6717 while (scan < loceol && hardcount < max) {
6718 scan += UTF8SKIP(scan);
6719 hardcount++;
6720 }
6721 }
6722 else
6723 scan = loceol;
6724 break;
6725 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6726 if (utf8_target && scan + max < loceol) {
6727
6728 /* <loceol> hadn't been adjusted in the UTF-8 case */
6729 scan += max;
6730 }
6731 else {
6732 scan = loceol;
6733 }
6734 break;
6735 case EXACT:
6736 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6737
6738 c = (U8)*STRING(p);
6739
6740 /* Can use a simple loop if the pattern char to match on is invariant
6741 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6742 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6743 * true iff it doesn't matter if the argument is in UTF-8 or not */
6744 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6745 if (utf8_target && scan + max < loceol) {
6746 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6747 * since here, to match at all, 1 char == 1 byte */
6748 loceol = scan + max;
6749 }
6750 while (scan < loceol && UCHARAT(scan) == c) {
6751 scan++;
6752 }
6753 }
6754 else if (UTF_PATTERN) {
6755 if (utf8_target) {
6756 STRLEN scan_char_len;
6757
6758 /* When both target and pattern are UTF-8, we have to do
6759 * string EQ */
6760 while (hardcount < max
6761 && scan < loceol
6762 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
6763 && memEQ(scan, STRING(p), scan_char_len))
6764 {
6765 scan += scan_char_len;
6766 hardcount++;
6767 }
6768 }
6769 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6770
6771 /* Target isn't utf8; convert the character in the UTF-8
6772 * pattern to non-UTF8, and do a simple loop */
6773 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6774 while (scan < loceol && UCHARAT(scan) == c) {
6775 scan++;
6776 }
6777 } /* else pattern char is above Latin1, can't possibly match the
6778 non-UTF-8 target */
6779 }
6780 else {
6781
6782 /* Here, the string must be utf8; pattern isn't, and <c> is
6783 * different in utf8 than not, so can't compare them directly.
6784 * Outside the loop, find the two utf8 bytes that represent c, and
6785 * then look for those in sequence in the utf8 string */
6786 U8 high = UTF8_TWO_BYTE_HI(c);
6787 U8 low = UTF8_TWO_BYTE_LO(c);
6788
6789 while (hardcount < max
6790 && scan + 1 < loceol
6791 && UCHARAT(scan) == high
6792 && UCHARAT(scan + 1) == low)
6793 {
6794 scan += 2;
6795 hardcount++;
6796 }
6797 }
6798 break;
6799
6800 case EXACTFA:
6801 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6802 goto do_exactf;
6803
6804 case EXACTFL:
6805 PL_reg_flags |= RF_tainted;
6806 utf8_flags = FOLDEQ_UTF8_LOCALE;
6807 goto do_exactf;
6808
6809 case EXACTF:
6810 utf8_flags = 0;
6811 goto do_exactf;
6812
6813 case EXACTFU_SS:
6814 case EXACTFU_TRICKYFOLD:
6815 case EXACTFU:
6816 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6817
6818 do_exactf: {
6819 int c1, c2;
6820 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6821
6822 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6823
6824 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6825 if (c1 == CHRTEST_VOID) {
6826 /* Use full Unicode fold matching */
6827 char *tmpeol = PL_regeol;
6828 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6829 while (hardcount < max
6830 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6831 STRING(p), NULL, pat_len,
6832 cBOOL(UTF_PATTERN), utf8_flags))
6833 {
6834 scan = tmpeol;
6835 tmpeol = PL_regeol;
6836 hardcount++;
6837 }
6838 }
6839 else if (utf8_target) {
6840 if (c1 == c2) {
6841 while (scan < loceol
6842 && hardcount < max
6843 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6844 {
6845 scan += UTF8SKIP(scan);
6846 hardcount++;
6847 }
6848 }
6849 else {
6850 while (scan < loceol
6851 && hardcount < max
6852 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6853 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6854 {
6855 scan += UTF8SKIP(scan);
6856 hardcount++;
6857 }
6858 }
6859 }
6860 else if (c1 == c2) {
6861 while (scan < loceol && UCHARAT(scan) == c1) {
6862 scan++;
6863 }
6864 }
6865 else {
6866 while (scan < loceol &&
6867 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6868 {
6869 scan++;
6870 }
6871 }
6872 }
6873 break;
6874 }
6875 case ANYOF:
6876 if (utf8_target) {
6877 while (hardcount < max
6878 && scan < loceol
6879 && reginclass(prog, p, (U8*)scan, utf8_target))
6880 {
6881 scan += UTF8SKIP(scan);
6882 hardcount++;
6883 }
6884 } else {
6885 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6886 scan++;
6887 }
6888 break;
6889 case ALNUMU:
6890 if (utf8_target) {
6891 utf8_wordchar:
6892 LOAD_UTF8_CHARCLASS_ALNUM();
6893 while (hardcount < max && scan < loceol &&
6894 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6895 {
6896 scan += UTF8SKIP(scan);
6897 hardcount++;
6898 }
6899 } else {
6900 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6901 scan++;
6902 }
6903 }
6904 break;
6905 case ALNUM:
6906 if (utf8_target)
6907 goto utf8_wordchar;
6908 while (scan < loceol && isALNUM((U8) *scan)) {
6909 scan++;
6910 }
6911 break;
6912 case ALNUMA:
6913 if (utf8_target && scan + max < loceol) {
6914
6915 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6916 * since here, to match, 1 char == 1 byte */
6917 loceol = scan + max;
6918 }
6919 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6920 scan++;
6921 }
6922 break;
6923 case ALNUML:
6924 PL_reg_flags |= RF_tainted;
6925 if (utf8_target) {
6926 while (hardcount < max && scan < loceol &&
6927 isALNUM_LC_utf8((U8*)scan)) {
6928 scan += UTF8SKIP(scan);
6929 hardcount++;
6930 }
6931 } else {
6932 while (scan < loceol && isALNUM_LC(*scan))
6933 scan++;
6934 }
6935 break;
6936 case NALNUMU:
6937 if (utf8_target) {
6938
6939 utf8_Nwordchar:
6940
6941 LOAD_UTF8_CHARCLASS_ALNUM();
6942 while (hardcount < max && scan < loceol &&
6943 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6944 {
6945 scan += UTF8SKIP(scan);
6946 hardcount++;
6947 }
6948 } else {
6949 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6950 scan++;
6951 }
6952 }
6953 break;
6954 case NALNUM:
6955 if (utf8_target)
6956 goto utf8_Nwordchar;
6957 while (scan < loceol && ! isALNUM((U8) *scan)) {
6958 scan++;
6959 }
6960 break;
6961
6962 case POSIXA:
6963 if (utf8_target && scan + max < loceol) {
6964
6965 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6966 * since here, to match, 1 char == 1 byte */
6967 loceol = scan + max;
6968 }
6969 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6970 scan++;
6971 }
6972 break;
6973 case NPOSIXA:
6974 if (utf8_target) {
6975 while (scan < loceol && hardcount < max
6976 && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6977 {
6978 scan += UTF8SKIP(scan);
6979 hardcount++;
6980 }
6981 }
6982 else {
6983 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6984 scan++;
6985 }
6986 }
6987 break;
6988 case NALNUMA:
6989 if (utf8_target) {
6990 while (scan < loceol && hardcount < max
6991 && ! isWORDCHAR_A((U8) *scan))
6992 {
6993 scan += UTF8SKIP(scan);
6994 hardcount++;
6995 }
6996 }
6997 else {
6998 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6999 scan++;
7000 }
7001 }
7002 break;
7003 case NALNUML:
7004 PL_reg_flags |= RF_tainted;
7005 if (utf8_target) {
7006 while (hardcount < max && scan < loceol &&
7007 !isALNUM_LC_utf8((U8*)scan)) {
7008 scan += UTF8SKIP(scan);
7009 hardcount++;
7010 }
7011 } else {
7012 while (scan < loceol && !isALNUM_LC(*scan))
7013 scan++;
7014 }
7015 break;
7016 case SPACEU:
7017 if (utf8_target) {
7018
7019 utf8_space:
7020
7021 while (hardcount < max && scan < loceol
7022 && is_XPERLSPACE_utf8((U8*)scan))
7023 {
7024 scan += UTF8SKIP(scan);
7025 hardcount++;
7026 }
7027 break;
7028 }
7029 else {
7030 while (scan < loceol && isSPACE_L1((U8) *scan)) {
7031 scan++;
7032 }
7033 break;
7034 }
7035 case SPACE:
7036 if (utf8_target)
7037 goto utf8_space;
7038
7039 while (scan < loceol && isSPACE((U8) *scan)) {
7040 scan++;
7041 }
7042 break;
7043 case SPACEA:
7044 if (utf8_target && scan + max < loceol) {
7045
7046 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7047 * since here, to match, 1 char == 1 byte */
7048 loceol = scan + max;
7049 }
7050 while (scan < loceol && isSPACE_A((U8) *scan)) {
7051 scan++;
7052 }
7053 break;
7054 case SPACEL:
7055 PL_reg_flags |= RF_tainted;
7056 if (utf8_target) {
7057 while (hardcount < max && scan < loceol &&
7058 isSPACE_LC_utf8((U8*)scan)) {
7059 scan += UTF8SKIP(scan);
7060 hardcount++;
7061 }
7062 } else {
7063 while (scan < loceol && isSPACE_LC(*scan))
7064 scan++;
7065 }
7066 break;
7067 case NSPACEU:
7068 if (utf8_target) {
7069
7070 utf8_Nspace:
7071
7072 while (hardcount < max && scan < loceol
7073 && ! is_XPERLSPACE_utf8((U8*)scan))
7074 {
7075 scan += UTF8SKIP(scan);
7076 hardcount++;
7077 }
7078 break;
7079 }
7080 else {
7081 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7082 scan++;
7083 }
7084 }
7085 break;
7086 case NSPACE:
7087 if (utf8_target)
7088 goto utf8_Nspace;
7089
7090 while (scan < loceol && ! isSPACE((U8) *scan)) {
7091 scan++;
7092 }
7093 break;
7094 case NSPACEA:
7095 if (utf8_target) {
7096 while (hardcount < max && scan < loceol
7097 && ! isSPACE_A((U8) *scan))
7098 {
7099 scan += UTF8SKIP(scan);
7100 hardcount++;
7101 }
7102 }
7103 else {
7104 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7105 scan++;
7106 }
7107 }
7108 break;
7109 case NSPACEL:
7110 PL_reg_flags |= RF_tainted;
7111 if (utf8_target) {
7112 while (hardcount < max && scan < loceol &&
7113 !isSPACE_LC_utf8((U8*)scan)) {
7114 scan += UTF8SKIP(scan);
7115 hardcount++;
7116 }
7117 } else {
7118 while (scan < loceol && !isSPACE_LC(*scan))
7119 scan++;
7120 }
7121 break;
7122 case DIGIT:
7123 if (utf8_target) {
7124 LOAD_UTF8_CHARCLASS_DIGIT();
7125 while (hardcount < max && scan < loceol &&
7126 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7127 scan += UTF8SKIP(scan);
7128 hardcount++;
7129 }
7130 } else {
7131 while (scan < loceol && isDIGIT(*scan))
7132 scan++;
7133 }
7134 break;
7135 case DIGITA:
7136 if (utf8_target && scan + max < loceol) {
7137
7138 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7139 * since here, to match, 1 char == 1 byte */
7140 loceol = scan + max;
7141 }
7142 while (scan < loceol && isDIGIT_A((U8) *scan)) {
7143 scan++;
7144 }
7145 break;
7146 case DIGITL:
7147 PL_reg_flags |= RF_tainted;
7148 if (utf8_target) {
7149 while (hardcount < max && scan < loceol &&
7150 isDIGIT_LC_utf8((U8*)scan)) {
7151 scan += UTF8SKIP(scan);
7152 hardcount++;
7153 }
7154 } else {
7155 while (scan < loceol && isDIGIT_LC(*scan))
7156 scan++;
7157 }
7158 break;
7159 case NDIGIT:
7160 if (utf8_target) {
7161 LOAD_UTF8_CHARCLASS_DIGIT();
7162 while (hardcount < max && scan < loceol &&
7163 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7164 scan += UTF8SKIP(scan);
7165 hardcount++;
7166 }
7167 } else {
7168 while (scan < loceol && !isDIGIT(*scan))
7169 scan++;
7170 }
7171 break;
7172 case NDIGITA:
7173 if (utf8_target) {
7174 while (hardcount < max && scan < loceol
7175 && ! isDIGIT_A((U8) *scan)) {
7176 scan += UTF8SKIP(scan);
7177 hardcount++;
7178 }
7179 }
7180 else {
7181 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7182 scan++;
7183 }
7184 }
7185 break;
7186 case NDIGITL:
7187 PL_reg_flags |= RF_tainted;
7188 if (utf8_target) {
7189 while (hardcount < max && scan < loceol &&
7190 !isDIGIT_LC_utf8((U8*)scan)) {
7191 scan += UTF8SKIP(scan);
7192 hardcount++;
7193 }
7194 } else {
7195 while (scan < loceol && !isDIGIT_LC(*scan))
7196 scan++;
7197 }
7198 break;
7199 case LNBREAK:
7200 if (utf8_target) {
7201 while (hardcount < max && scan < loceol &&
7202 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7203 scan += c;
7204 hardcount++;
7205 }
7206 } else {
7207 /* LNBREAK can match one or two latin chars, which is ok, but we
7208 * have to use hardcount in this situation, and throw away the
7209 * adjustment to <loceol> done before the switch statement */
7210 loceol = PL_regeol;
7211 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7212 scan+=c;
7213 hardcount++;
7214 }
7215 }
7216 break;
7217 case HORIZWS:
7218 if (utf8_target) {
7219 while (hardcount < max && scan < loceol &&
7220 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7221 {
7222 scan += c;
7223 hardcount++;
7224 }
7225 } else {
7226 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7227 scan++;
7228 }
7229 break;
7230 case NHORIZWS:
7231 if (utf8_target) {
7232 while (hardcount < max && scan < loceol &&
7233 !is_HORIZWS_utf8_safe(scan, loceol))
7234 {
7235 scan += UTF8SKIP(scan);
7236 hardcount++;
7237 }
7238 } else {
7239 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7240 scan++;
7241
7242 }
7243 break;
7244 case VERTWS:
7245 if (utf8_target) {
7246 while (hardcount < max && scan < loceol &&
7247 (c=is_VERTWS_utf8_safe(scan, loceol)))
7248 {
7249 scan += c;
7250 hardcount++;
7251 }
7252 } else {
7253 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7254 scan++;
7255
7256 }
7257 break;
7258 case NVERTWS:
7259 if (utf8_target) {
7260 while (hardcount < max && scan < loceol &&
7261 !is_VERTWS_utf8_safe(scan, loceol))
7262 {
7263 scan += UTF8SKIP(scan);
7264 hardcount++;
7265 }
7266 } else {
7267 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7268 scan++;
7269
7270 }
7271 break;
7272
7273 case BOUND:
7274 case BOUNDA:
7275 case BOUNDL:
7276 case BOUNDU:
7277 case EOS:
7278 case GPOS:
7279 case KEEPS:
7280 case NBOUND:
7281 case NBOUNDA:
7282 case NBOUNDL:
7283 case NBOUNDU:
7284 case OPFAIL:
7285 case SBOL:
7286 case SEOL:
7287 /* These are all 0 width, so match right here or not at all. */
7288 break;
7289
7290 default:
7291 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7292 assert(0); /* NOTREACHED */
7293
7294 }
7295
7296 if (hardcount)
7297 c = hardcount;
7298 else
7299 c = scan - *startposp;
7300 *startposp = scan;
7301
7302 DEBUG_r({
7303 GET_RE_DEBUG_FLAGS_DECL;
7304 DEBUG_EXECUTE_r({
7305 SV * const prop = sv_newmortal();
7306 regprop(prog, prop, p);
7307 PerlIO_printf(Perl_debug_log,
7308 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7309 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7310 });
7311 });
7312
7313 return(c);
7314}
7315
7316
7317#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7318/*
7319- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7320create a copy so that changes the caller makes won't change the shared one.
7321If <altsvp> is non-null, will return NULL in it, for back-compat.
7322 */
7323SV *
7324Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7325{
7326 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7327
7328 if (altsvp) {
7329 *altsvp = NULL;
7330 }
7331
7332 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7333}
7334#endif
7335
7336STATIC SV *
7337S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp)
7338{
7339 /* Returns the swash for the input 'node' in the regex 'prog'.
7340 * If <doinit> is true, will attempt to create the swash if not already
7341 * done.
7342 * If <listsvp> is non-null, will return the swash initialization string in
7343 * it.
7344 * Tied intimately to how regcomp.c sets up the data structure */
7345
7346 dVAR;
7347 SV *sw = NULL;
7348 SV *si = NULL;
7349 SV* invlist = NULL;
7350
7351 RXi_GET_DECL(prog,progi);
7352 const struct reg_data * const data = prog ? progi->data : NULL;
7353
7354 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7355
7356 assert(ANYOF_NONBITMAP(node));
7357
7358 if (data && data->count) {
7359 const U32 n = ARG(node);
7360
7361 if (data->what[n] == 's') {
7362 SV * const rv = MUTABLE_SV(data->data[n]);
7363 AV * const av = MUTABLE_AV(SvRV(rv));
7364 SV **const ary = AvARRAY(av);
7365 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7366
7367 si = *ary; /* ary[0] = the string to initialize the swash with */
7368
7369 /* Elements 2 and 3 are either both present or both absent. [2] is
7370 * any inversion list generated at compile time; [3] indicates if
7371 * that inversion list has any user-defined properties in it. */
7372 if (av_len(av) >= 2) {
7373 invlist = ary[2];
7374 if (SvUV(ary[3])) {
7375 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7376 }
7377 }
7378 else {
7379 invlist = NULL;
7380 }
7381
7382 /* Element [1] is reserved for the set-up swash. If already there,
7383 * return it; if not, create it and store it there */
7384 if (SvROK(ary[1])) {
7385 sw = ary[1];
7386 }
7387 else if (si && doinit) {
7388
7389 sw = _core_swash_init("utf8", /* the utf8 package */
7390 "", /* nameless */
7391 si,
7392 1, /* binary */
7393 0, /* not from tr/// */
7394 invlist,
7395 &swash_init_flags);
7396 (void)av_store(av, 1, sw);
7397 }
7398 }
7399 }
7400
7401 if (listsvp) {
7402 SV* matches_string = newSVpvn("", 0);
7403
7404 /* Use the swash, if any, which has to have incorporated into it all
7405 * possibilities */
7406 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7407 && (si && si != &PL_sv_undef))
7408 {
7409
7410 /* If no swash, use the input initialization string, if available */
7411 sv_catsv(matches_string, si);
7412 }
7413
7414 /* Add the inversion list to whatever we have. This may have come from
7415 * the swash, or from an input parameter */
7416 if (invlist) {
7417 sv_catsv(matches_string, _invlist_contents(invlist));
7418 }
7419 *listsvp = matches_string;
7420 }
7421
7422 return sw;
7423}
7424
7425/*
7426 - reginclass - determine if a character falls into a character class
7427
7428 n is the ANYOF regnode
7429 p is the target string
7430 utf8_target tells whether p is in UTF-8.
7431
7432 Returns true if matched; false otherwise.
7433
7434 Note that this can be a synthetic start class, a combination of various
7435 nodes, so things you think might be mutually exclusive, such as locale,
7436 aren't. It can match both locale and non-locale
7437
7438 */
7439
7440STATIC bool
7441S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
7442{
7443 dVAR;
7444 const char flags = ANYOF_FLAGS(n);
7445 bool match = FALSE;
7446 UV c = *p;
7447
7448 PERL_ARGS_ASSERT_REGINCLASS;
7449
7450 /* If c is not already the code point, get it. Note that
7451 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7452 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7453 STRLEN c_len = 0;
7454 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7455 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7456 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7457 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7458 * UTF8_ALLOW_FFFF */
7459 if (c_len == (STRLEN)-1)
7460 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7461 }
7462
7463 /* If this character is potentially in the bitmap, check it */
7464 if (c < 256) {
7465 if (ANYOF_BITMAP_TEST(n, c))
7466 match = TRUE;
7467 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7468 && ! utf8_target
7469 && ! isASCII(c))
7470 {
7471 match = TRUE;
7472 }
7473 else if (flags & ANYOF_LOCALE) {
7474 PL_reg_flags |= RF_tainted;
7475
7476 if ((flags & ANYOF_LOC_FOLD)
7477 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7478 {
7479 match = TRUE;
7480 }
7481 else if (ANYOF_CLASS_TEST_ANY_SET(n)) {
7482
7483 /* The data structure is arranged so bits 0, 2, 4, ... are set
7484 * if the class includes the Posix character class given by
7485 * bit/2; and 1, 3, 5, ... are set if the class includes the
7486 * complemented Posix class given by int(bit/2). So we loop
7487 * through the bits, each time changing whether we complement
7488 * the result or not. Suppose for the sake of illustration
7489 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7490 * is set, it means there is a match for this ANYOF node if the
7491 * character is in the class given by the expression (0 / 2 = 0
7492 * = \w). If it is in that class, isFOO_lc() will return 1,
7493 * and since 'to_complement' is 0, the result will stay TRUE,
7494 * and we exit the loop. Suppose instead that bit 0 is 0, but
7495 * bit 1 is 1. That means there is a match if the character
7496 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7497 * but will on bit 1. On the second iteration 'to_complement'
7498 * will be 1, so the exclusive or will reverse things, so we
7499 * are testing for \W. On the third iteration, 'to_complement'
7500 * will be 0, and we would be testing for \s; the fourth
7501 * iteration would test for \S, etc. */
7502
7503 int count = 0;
7504 int to_complement = 0;
7505 while (count < ANYOF_MAX) {
7506 if (ANYOF_CLASS_TEST(n, count)
7507 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7508 {
7509 match = TRUE;
7510 break;
7511 }
7512 count++;
7513 to_complement ^= 1;
7514 }
7515 }
7516 }
7517 }
7518
7519 /* If the bitmap didn't (or couldn't) match, and something outside the
7520 * bitmap could match, try that. Locale nodes specify completely the
7521 * behavior of code points in the bit map (otherwise, a utf8 target would
7522 * cause them to be treated as Unicode and not locale), except in
7523 * the very unlikely event when this node is a synthetic start class, which
7524 * could be a combination of locale and non-locale nodes. So allow locale
7525 * to match for the synthetic start class, which will give a false
7526 * positive that will be resolved when the match is done again as not part
7527 * of the synthetic start class */
7528 if (!match) {
7529 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7530 match = TRUE; /* Everything above 255 matches */
7531 }
7532 else if (ANYOF_NONBITMAP(n)
7533 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7534 || (utf8_target
7535 && (c >=256
7536 || (! (flags & ANYOF_LOCALE))
7537 || (flags & ANYOF_IS_SYNTHETIC)))))
7538 {
7539 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7540 if (sw) {
7541 U8 * utf8_p;
7542 if (utf8_target) {
7543 utf8_p = (U8 *) p;
7544 } else { /* Convert to utf8 */
7545 STRLEN len = 1;
7546 utf8_p = bytes_to_utf8(p, &len);
7547 }
7548
7549 if (swash_fetch(sw, utf8_p, TRUE)) {
7550 match = TRUE;
7551 }
7552
7553 /* If we allocated a string above, free it */
7554 if (! utf8_target) Safefree(utf8_p);
7555 }
7556 }
7557
7558 if (UNICODE_IS_SUPER(c)
7559 && (flags & ANYOF_WARN_SUPER)
7560 && ckWARN_d(WARN_NON_UNICODE))
7561 {
7562 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7563 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7564 }
7565 }
7566
7567 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7568 return cBOOL(flags & ANYOF_INVERT) ^ match;
7569}
7570
7571STATIC U8 *
7572S_reghop3(U8 *s, I32 off, const U8* lim)
7573{
7574 /* return the position 'off' UTF-8 characters away from 's', forward if
7575 * 'off' >= 0, backwards if negative. But don't go outside of position
7576 * 'lim', which better be < s if off < 0 */
7577
7578 dVAR;
7579
7580 PERL_ARGS_ASSERT_REGHOP3;
7581
7582 if (off >= 0) {
7583 while (off-- && s < lim) {
7584 /* XXX could check well-formedness here */
7585 s += UTF8SKIP(s);
7586 }
7587 }
7588 else {
7589 while (off++ && s > lim) {
7590 s--;
7591 if (UTF8_IS_CONTINUED(*s)) {
7592 while (s > lim && UTF8_IS_CONTINUATION(*s))
7593 s--;
7594 }
7595 /* XXX could check well-formedness here */
7596 }
7597 }
7598 return s;
7599}
7600
7601#ifdef XXX_dmq
7602/* there are a bunch of places where we use two reghop3's that should
7603 be replaced with this routine. but since thats not done yet
7604 we ifdef it out - dmq
7605*/
7606STATIC U8 *
7607S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7608{
7609 dVAR;
7610
7611 PERL_ARGS_ASSERT_REGHOP4;
7612
7613 if (off >= 0) {
7614 while (off-- && s < rlim) {
7615 /* XXX could check well-formedness here */
7616 s += UTF8SKIP(s);
7617 }
7618 }
7619 else {
7620 while (off++ && s > llim) {
7621 s--;
7622 if (UTF8_IS_CONTINUED(*s)) {
7623 while (s > llim && UTF8_IS_CONTINUATION(*s))
7624 s--;
7625 }
7626 /* XXX could check well-formedness here */
7627 }
7628 }
7629 return s;
7630}
7631#endif
7632
7633STATIC U8 *
7634S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7635{
7636 dVAR;
7637
7638 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7639
7640 if (off >= 0) {
7641 while (off-- && s < lim) {
7642 /* XXX could check well-formedness here */
7643 s += UTF8SKIP(s);
7644 }
7645 if (off >= 0)
7646 return NULL;
7647 }
7648 else {
7649 while (off++ && s > lim) {
7650 s--;
7651 if (UTF8_IS_CONTINUED(*s)) {
7652 while (s > lim && UTF8_IS_CONTINUATION(*s))
7653 s--;
7654 }
7655 /* XXX could check well-formedness here */
7656 }
7657 if (off <= 0)
7658 return NULL;
7659 }
7660 return s;
7661}
7662
7663static void
7664restore_pos(pTHX_ void *arg)
7665{
7666 dVAR;
7667 regexp * const rex = (regexp *)arg;
7668 if (PL_reg_state.re_state_eval_setup_done) {
7669 if (PL_reg_oldsaved) {
7670 rex->subbeg = PL_reg_oldsaved;
7671 rex->sublen = PL_reg_oldsavedlen;
7672 rex->suboffset = PL_reg_oldsavedoffset;
7673 rex->subcoffset = PL_reg_oldsavedcoffset;
7674#ifdef PERL_ANY_COW
7675 rex->saved_copy = PL_nrs;
7676#endif
7677 RXp_MATCH_COPIED_on(rex);
7678 }
7679 PL_reg_magic->mg_len = PL_reg_oldpos;
7680 PL_reg_state.re_state_eval_setup_done = FALSE;
7681 PL_curpm = PL_reg_oldcurpm;
7682 }
7683}
7684
7685STATIC void
7686S_to_utf8_substr(pTHX_ regexp *prog)
7687{
7688 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7689 * on the converted value */
7690
7691 int i = 1;
7692
7693 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7694
7695 do {
7696 if (prog->substrs->data[i].substr
7697 && !prog->substrs->data[i].utf8_substr) {
7698 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7699 prog->substrs->data[i].utf8_substr = sv;
7700 sv_utf8_upgrade(sv);
7701 if (SvVALID(prog->substrs->data[i].substr)) {
7702 if (SvTAIL(prog->substrs->data[i].substr)) {
7703 /* Trim the trailing \n that fbm_compile added last
7704 time. */
7705 SvCUR_set(sv, SvCUR(sv) - 1);
7706 /* Whilst this makes the SV technically "invalid" (as its
7707 buffer is no longer followed by "\0") when fbm_compile()
7708 adds the "\n" back, a "\0" is restored. */
7709 fbm_compile(sv, FBMcf_TAIL);
7710 } else
7711 fbm_compile(sv, 0);
7712 }
7713 if (prog->substrs->data[i].substr == prog->check_substr)
7714 prog->check_utf8 = sv;
7715 }
7716 } while (i--);
7717}
7718
7719STATIC bool
7720S_to_byte_substr(pTHX_ regexp *prog)
7721{
7722 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7723 * on the converted value; returns FALSE if can't be converted. */
7724
7725 dVAR;
7726 int i = 1;
7727
7728 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7729
7730 do {
7731 if (prog->substrs->data[i].utf8_substr
7732 && !prog->substrs->data[i].substr) {
7733 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7734 if (! sv_utf8_downgrade(sv, TRUE)) {
7735 return FALSE;
7736 }
7737 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7738 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7739 /* Trim the trailing \n that fbm_compile added last
7740 time. */
7741 SvCUR_set(sv, SvCUR(sv) - 1);
7742 fbm_compile(sv, FBMcf_TAIL);
7743 } else
7744 fbm_compile(sv, 0);
7745 }
7746 prog->substrs->data[i].substr = sv;
7747 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7748 prog->check_substr = sv;
7749 }
7750 } while (i--);
7751
7752 return TRUE;
7753}
7754
7755/*
7756 * Local variables:
7757 * c-indentation-style: bsd
7758 * c-basic-offset: 4
7759 * indent-tabs-mode: nil
7760 * End:
7761 *
7762 * ex: set ts=8 sts=4 sw=4 et:
7763 */