This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regmatch(): silence OpenWatcom compiler warnings
[perl5.git] / regexec.c
... / ...
CommitLineData
1/* regexec.c
2 */
3
4/*
5 * One Ring to rule them all, One Ring to find them
6 &
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
10 */
11
12/* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
15 *
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
20 */
21
22/* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
24 */
25
26/* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
29 */
30
31/* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
34*/
35
36#ifdef PERL_EXT_RE_BUILD
37#include "re_top.h"
38#endif
39
40/* At least one required character in the target string is expressible only in
41 * UTF-8. */
42static const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
44
45#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
46 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
47 goto target; \
48} STMT_END
49
50/*
51 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 *
53 * Copyright (c) 1986 by University of Toronto.
54 * Written by Henry Spencer. Not derived from licensed software.
55 *
56 * Permission is granted to anyone to use this software for any
57 * purpose on any computer system, and to redistribute it freely,
58 * subject to the following restrictions:
59 *
60 * 1. The author is not responsible for the consequences of use of
61 * this software, no matter how awful, even if they arise
62 * from defects in it.
63 *
64 * 2. The origin of this software must not be misrepresented, either
65 * by explicit claim or by omission.
66 *
67 * 3. Altered versions must be plainly marked as such, and must not
68 * be misrepresented as being the original software.
69 *
70 **** Alterations to Henry's code are...
71 ****
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
74 **** by Larry Wall and others
75 ****
76 **** You may distribute under the terms of either the GNU General Public
77 **** License or the Artistic License, as specified in the README file.
78 *
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
82 */
83#include "EXTERN.h"
84#define PERL_IN_REGEXEC_C
85#include "perl.h"
86
87#ifdef PERL_IN_XSUB_RE
88# include "re_comp.h"
89#else
90# include "regcomp.h"
91#endif
92
93#include "inline_invlist.c"
94#include "unicode_constants.h"
95
96#define RF_tainted 1 /* tainted information used? e.g. locale */
97#define RF_warned 2 /* warned about big count? */
98
99#define RF_utf8 8 /* Pattern contains multibyte chars? */
100
101#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
102
103#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
104
105#ifndef STATIC
106#define STATIC static
107#endif
108
109/* Valid for non-utf8 strings: avoids the reginclass
110 * call if there are no complications: i.e., if everything matchable is
111 * straight forward in the bitmap */
112#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \
113 : ANYOF_BITMAP_TEST(p,*(c)))
114
115/*
116 * Forwards.
117 */
118
119#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
120#define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
121
122#define HOPc(pos,off) \
123 (char *)(PL_reg_match_utf8 \
124 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
125 : (U8*)(pos + off))
126#define HOPBACKc(pos, off) \
127 (char*)(PL_reg_match_utf8\
128 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
129 : (pos - off >= PL_bostr) \
130 ? (U8*)pos - off \
131 : NULL)
132
133#define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
134#define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
135
136
137#define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
138#define NEXTCHR_IS_EOS (nextchr < 0)
139
140#define SET_nextchr \
141 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
142
143#define SET_locinput(p) \
144 locinput = (p); \
145 SET_nextchr
146
147
148/* these are unrolled below in the CCC_TRY_XXX defined */
149#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
150 if (!CAT2(PL_utf8_,class)) { \
151 bool ok; \
152 ENTER; save_re_context(); \
153 ok=CAT2(is_utf8_,class)((const U8*)str); \
154 PERL_UNUSED_VAR(ok); \
155 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
156/* Doesn't do an assert to verify that is correct */
157#define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
158 if (!CAT2(PL_utf8_,class)) { \
159 bool throw_away PERL_UNUSED_DECL; \
160 ENTER; save_re_context(); \
161 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
162 PERL_UNUSED_VAR(throw_away); \
163 LEAVE; } } STMT_END
164
165#define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
166#define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
167
168#define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
169 /* No asserts are done for some of these, in case called on a */ \
170 /* Unicode version in which they map to nothing */ \
171 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
172 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
173
174#define PLACEHOLDER /* Something for the preprocessor to grab onto */
175
176/* The actual code for CCC_TRY, which uses several variables from the routine
177 * it's callable from. It is designed to be the bulk of a case statement.
178 * FUNC is the macro or function to call on non-utf8 targets that indicate if
179 * nextchr matches the class.
180 * UTF8_TEST is the whole test string to use for utf8 targets
181 * LOAD is what to use to test, and if not present to load in the swash for the
182 * class
183 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
184 * UTF8_TEST test.
185 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
186 * utf8 and a variant, load the swash if necessary and test using the utf8
187 * test. Advance to the next character if test is ok, otherwise fail; If not
188 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
189 * fails, or advance to the next character */
190
191#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
192 if (NEXTCHR_IS_EOS) { \
193 sayNO; \
194 } \
195 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
196 LOAD_UTF8_CHARCLASS(CLASS, STR); \
197 if (POS_OR_NEG (UTF8_TEST)) { \
198 sayNO; \
199 } \
200 } \
201 else if (POS_OR_NEG (FUNC(nextchr))) { \
202 sayNO; \
203 } \
204 goto increment_locinput;
205
206/* Handle the non-locale cases for a character class and its complement. It
207 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
208 * This is because that code fails when the test succeeds, so we want to have
209 * the test fail so that the code succeeds. The swash is stored in a
210 * predictable PL_ place */
211#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
212 CLASS, STR) \
213 case NAME: \
214 _CCC_TRY_CODE( !, FUNC, \
215 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
216 (U8*)locinput, TRUE)), \
217 CLASS, STR) \
218 case NNAME: \
219 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
220 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
221 (U8*)locinput, TRUE)), \
222 CLASS, STR) \
223
224/* Generate the case statements for both locale and non-locale character
225 * classes in regmatch for classes that don't have special unicode semantics.
226 * Locales don't use an immediate swash, but an intermediary special locale
227 * function that is called on the pointer to the current place in the input
228 * string. That function will resolve to needing the same swash. One might
229 * think that because we don't know what the locale will match, we shouldn't
230 * check with the swash loading function that it loaded properly; ie, that we
231 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
232 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
233 * irrelevant here */
234#define CCC_TRY(NAME, NNAME, FUNC, \
235 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
236 NAMEA, NNAMEA, FUNCA, \
237 CLASS, STR) \
238 case NAMEL: \
239 PL_reg_flags |= RF_tainted; \
240 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
241 case NNAMEL: \
242 PL_reg_flags |= RF_tainted; \
243 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
244 CLASS, STR) \
245 case NAMEA: \
246 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
247 sayNO; \
248 } \
249 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
250 locinput++; \
251 break; \
252 case NNAMEA: \
253 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
254 sayNO; \
255 } \
256 goto increment_locinput; \
257 /* Generate the non-locale cases */ \
258 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
259
260/* This is like CCC_TRY, but has an extra set of parameters for generating case
261 * statements to handle separate Unicode semantics nodes */
262#define CCC_TRY_U(NAME, NNAME, FUNC, \
263 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
264 NAMEU, NNAMEU, FUNCU, \
265 NAMEA, NNAMEA, FUNCA, \
266 CLASS, STR) \
267 CCC_TRY(NAME, NNAME, FUNC, \
268 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
269 NAMEA, NNAMEA, FUNCA, \
270 CLASS, STR) \
271 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
272
273/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
274
275/* for use after a quantifier and before an EXACT-like node -- japhy */
276/* it would be nice to rework regcomp.sym to generate this stuff. sigh
277 *
278 * NOTE that *nothing* that affects backtracking should be in here, specifically
279 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
280 * node that is in between two EXACT like nodes when ascertaining what the required
281 * "follow" character is. This should probably be moved to regex compile time
282 * although it may be done at run time beause of the REF possibility - more
283 * investigation required. -- demerphq
284*/
285#define JUMPABLE(rn) ( \
286 OP(rn) == OPEN || \
287 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
288 OP(rn) == EVAL || \
289 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
290 OP(rn) == PLUS || OP(rn) == MINMOD || \
291 OP(rn) == KEEPS || \
292 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
293)
294#define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
295
296#define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
297
298#if 0
299/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
300 we don't need this definition. */
301#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
302#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
303#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
304
305#else
306/* ... so we use this as its faster. */
307#define IS_TEXT(rn) ( OP(rn)==EXACT )
308#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
309#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
310#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
311
312#endif
313
314/*
315 Search for mandatory following text node; for lookahead, the text must
316 follow but for lookbehind (rn->flags != 0) we skip to the next step.
317*/
318#define FIND_NEXT_IMPT(rn) STMT_START { \
319 while (JUMPABLE(rn)) { \
320 const OPCODE type = OP(rn); \
321 if (type == SUSPEND || PL_regkind[type] == CURLY) \
322 rn = NEXTOPER(NEXTOPER(rn)); \
323 else if (type == PLUS) \
324 rn = NEXTOPER(rn); \
325 else if (type == IFMATCH) \
326 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
327 else rn += NEXT_OFF(rn); \
328 } \
329} STMT_END
330
331
332static void restore_pos(pTHX_ void *arg);
333
334#define REGCP_PAREN_ELEMS 3
335#define REGCP_OTHER_ELEMS 3
336#define REGCP_FRAME_ELEMS 1
337/* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
338 * are needed for the regexp context stack bookkeeping. */
339
340STATIC CHECKPOINT
341S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
342{
343 dVAR;
344 const int retval = PL_savestack_ix;
345 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
346 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
347 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
348 I32 p;
349 GET_RE_DEBUG_FLAGS_DECL;
350
351 PERL_ARGS_ASSERT_REGCPPUSH;
352
353 if (paren_elems_to_push < 0)
354 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
355 paren_elems_to_push);
356
357 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
358 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
359 " out of range (%lu-%ld)",
360 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
361
362 SSGROW(total_elems + REGCP_FRAME_ELEMS);
363
364 DEBUG_BUFFERS_r(
365 if ((int)PL_regsize > (int)parenfloor)
366 PerlIO_printf(Perl_debug_log,
367 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
368 PTR2UV(rex),
369 PTR2UV(rex->offs)
370 );
371 );
372 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
373/* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
374 SSPUSHINT(rex->offs[p].end);
375 SSPUSHINT(rex->offs[p].start);
376 SSPUSHINT(rex->offs[p].start_tmp);
377 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
378 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
379 (UV)p,
380 (IV)rex->offs[p].start,
381 (IV)rex->offs[p].start_tmp,
382 (IV)rex->offs[p].end
383 ));
384 }
385/* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
386 SSPUSHINT(PL_regsize);
387 SSPUSHINT(rex->lastparen);
388 SSPUSHINT(rex->lastcloseparen);
389 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
390
391 return retval;
392}
393
394/* These are needed since we do not localize EVAL nodes: */
395#define REGCP_SET(cp) \
396 DEBUG_STATE_r( \
397 PerlIO_printf(Perl_debug_log, \
398 " Setting an EVAL scope, savestack=%"IVdf"\n", \
399 (IV)PL_savestack_ix)); \
400 cp = PL_savestack_ix
401
402#define REGCP_UNWIND(cp) \
403 DEBUG_STATE_r( \
404 if (cp != PL_savestack_ix) \
405 PerlIO_printf(Perl_debug_log, \
406 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
407 (IV)(cp), (IV)PL_savestack_ix)); \
408 regcpblow(cp)
409
410#define UNWIND_PAREN(lp, lcp) \
411 for (n = rex->lastparen; n > lp; n--) \
412 rex->offs[n].end = -1; \
413 rex->lastparen = n; \
414 rex->lastcloseparen = lcp;
415
416
417STATIC void
418S_regcppop(pTHX_ regexp *rex)
419{
420 dVAR;
421 UV i;
422 U32 paren;
423 GET_RE_DEBUG_FLAGS_DECL;
424
425 PERL_ARGS_ASSERT_REGCPPOP;
426
427 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
428 i = SSPOPUV;
429 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
430 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
431 rex->lastcloseparen = SSPOPINT;
432 rex->lastparen = SSPOPINT;
433 PL_regsize = SSPOPINT;
434
435 i -= REGCP_OTHER_ELEMS;
436 /* Now restore the parentheses context. */
437 DEBUG_BUFFERS_r(
438 if (i || rex->lastparen + 1 <= rex->nparens)
439 PerlIO_printf(Perl_debug_log,
440 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
441 PTR2UV(rex),
442 PTR2UV(rex->offs)
443 );
444 );
445 paren = PL_regsize;
446 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
447 I32 tmps;
448 rex->offs[paren].start_tmp = SSPOPINT;
449 rex->offs[paren].start = SSPOPINT;
450 tmps = SSPOPINT;
451 if (paren <= rex->lastparen)
452 rex->offs[paren].end = tmps;
453 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
454 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
455 (UV)paren,
456 (IV)rex->offs[paren].start,
457 (IV)rex->offs[paren].start_tmp,
458 (IV)rex->offs[paren].end,
459 (paren > rex->lastparen ? "(skipped)" : ""));
460 );
461 paren--;
462 }
463#if 1
464 /* It would seem that the similar code in regtry()
465 * already takes care of this, and in fact it is in
466 * a better location to since this code can #if 0-ed out
467 * but the code in regtry() is needed or otherwise tests
468 * requiring null fields (pat.t#187 and split.t#{13,14}
469 * (as of patchlevel 7877) will fail. Then again,
470 * this code seems to be necessary or otherwise
471 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
472 * --jhi updated by dapm */
473 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
474 if (i > PL_regsize)
475 rex->offs[i].start = -1;
476 rex->offs[i].end = -1;
477 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
478 " \\%"UVuf": %s ..-1 undeffing\n",
479 (UV)i,
480 (i > PL_regsize) ? "-1" : " "
481 ));
482 }
483#endif
484}
485
486/* restore the parens and associated vars at savestack position ix,
487 * but without popping the stack */
488
489STATIC void
490S_regcp_restore(pTHX_ regexp *rex, I32 ix)
491{
492 I32 tmpix = PL_savestack_ix;
493 PL_savestack_ix = ix;
494 regcppop(rex);
495 PL_savestack_ix = tmpix;
496}
497
498#define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
499
500/*
501 * pregexec and friends
502 */
503
504#ifndef PERL_IN_XSUB_RE
505/*
506 - pregexec - match a regexp against a string
507 */
508I32
509Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
510 char *strbeg, I32 minend, SV *screamer, U32 nosave)
511/* stringarg: the point in the string at which to begin matching */
512/* strend: pointer to null at end of string */
513/* strbeg: real beginning of string */
514/* minend: end of match must be >= minend bytes after stringarg. */
515/* screamer: SV being matched: only used for utf8 flag, pos() etc; string
516 * itself is accessed via the pointers above */
517/* nosave: For optimizations. */
518{
519 PERL_ARGS_ASSERT_PREGEXEC;
520
521 return
522 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
523 nosave ? 0 : REXEC_COPY_STR);
524}
525#endif
526
527/*
528 * Need to implement the following flags for reg_anch:
529 *
530 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
531 * USE_INTUIT_ML
532 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
533 * INTUIT_AUTORITATIVE_ML
534 * INTUIT_ONCE_NOML - Intuit can match in one location only.
535 * INTUIT_ONCE_ML
536 *
537 * Another flag for this function: SECOND_TIME (so that float substrs
538 * with giant delta may be not rechecked).
539 */
540
541/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
542
543/* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
544 Otherwise, only SvCUR(sv) is used to get strbeg. */
545
546/* XXXX We assume that strpos is strbeg unless sv. */
547
548/* XXXX Some places assume that there is a fixed substring.
549 An update may be needed if optimizer marks as "INTUITable"
550 RExen without fixed substrings. Similarly, it is assumed that
551 lengths of all the strings are no more than minlen, thus they
552 cannot come from lookahead.
553 (Or minlen should take into account lookahead.)
554 NOTE: Some of this comment is not correct. minlen does now take account
555 of lookahead/behind. Further research is required. -- demerphq
556
557*/
558
559/* A failure to find a constant substring means that there is no need to make
560 an expensive call to REx engine, thus we celebrate a failure. Similarly,
561 finding a substring too deep into the string means that less calls to
562 regtry() should be needed.
563
564 REx compiler's optimizer found 4 possible hints:
565 a) Anchored substring;
566 b) Fixed substring;
567 c) Whether we are anchored (beginning-of-line or \G);
568 d) First node (of those at offset 0) which may distinguish positions;
569 We use a)b)d) and multiline-part of c), and try to find a position in the
570 string which does not contradict any of them.
571 */
572
573/* Most of decisions we do here should have been done at compile time.
574 The nodes of the REx which we used for the search should have been
575 deleted from the finite automaton. */
576
577char *
578Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
579 char *strend, const U32 flags, re_scream_pos_data *data)
580{
581 dVAR;
582 struct regexp *const prog = ReANY(rx);
583 I32 start_shift = 0;
584 /* Should be nonnegative! */
585 I32 end_shift = 0;
586 char *s;
587 SV *check;
588 char *strbeg;
589 char *t;
590 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
591 I32 ml_anch;
592 char *other_last = NULL; /* other substr checked before this */
593 char *check_at = NULL; /* check substr found at this pos */
594 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
595 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
596 RXi_GET_DECL(prog,progi);
597#ifdef DEBUGGING
598 const char * const i_strpos = strpos;
599#endif
600 GET_RE_DEBUG_FLAGS_DECL;
601
602 PERL_ARGS_ASSERT_RE_INTUIT_START;
603 PERL_UNUSED_ARG(flags);
604 PERL_UNUSED_ARG(data);
605
606 RX_MATCH_UTF8_set(rx,utf8_target);
607
608 if (RX_UTF8(rx)) {
609 PL_reg_flags |= RF_utf8;
610 }
611 DEBUG_EXECUTE_r(
612 debug_start_match(rx, utf8_target, strpos, strend,
613 sv ? "Guessing start of match in sv for"
614 : "Guessing start of match in string for");
615 );
616
617 /* CHR_DIST() would be more correct here but it makes things slow. */
618 if (prog->minlen > strend - strpos) {
619 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
620 "String too short... [re_intuit_start]\n"));
621 goto fail;
622 }
623
624 /* XXX we need to pass strbeg as a separate arg: the following is
625 * guesswork and can be wrong... */
626 if (sv && SvPOK(sv)) {
627 char * p = SvPVX(sv);
628 STRLEN cur = SvCUR(sv);
629 if (p <= strpos && strpos < p + cur) {
630 strbeg = p;
631 assert(p <= strend && strend <= p + cur);
632 }
633 else
634 strbeg = strend - cur;
635 }
636 else
637 strbeg = strpos;
638
639 PL_regeol = strend;
640 if (utf8_target) {
641 if (!prog->check_utf8 && prog->check_substr)
642 to_utf8_substr(prog);
643 check = prog->check_utf8;
644 } else {
645 if (!prog->check_substr && prog->check_utf8) {
646 if (! to_byte_substr(prog)) {
647 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
648 }
649 }
650 check = prog->check_substr;
651 }
652 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
653 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
654 || ( (prog->extflags & RXf_ANCH_BOL)
655 && !multiline ) ); /* Check after \n? */
656
657 if (!ml_anch) {
658 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
659 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
660 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
661 && sv && !SvROK(sv)
662 && (strpos != strbeg)) {
663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
664 goto fail;
665 }
666 if (prog->check_offset_min == prog->check_offset_max
667 && !(prog->extflags & RXf_CANY_SEEN)
668 && ! multiline) /* /m can cause \n's to match that aren't
669 accounted for in the string max length.
670 See [perl #115242] */
671 {
672 /* Substring at constant offset from beg-of-str... */
673 I32 slen;
674
675 s = HOP3c(strpos, prog->check_offset_min, strend);
676
677 if (SvTAIL(check)) {
678 slen = SvCUR(check); /* >= 1 */
679
680 if ( strend - s > slen || strend - s < slen - 1
681 || (strend - s == slen && strend[-1] != '\n')) {
682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
683 goto fail_finish;
684 }
685 /* Now should match s[0..slen-2] */
686 slen--;
687 if (slen && (*SvPVX_const(check) != *s
688 || (slen > 1
689 && memNE(SvPVX_const(check), s, slen)))) {
690 report_neq:
691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
692 goto fail_finish;
693 }
694 }
695 else if (*SvPVX_const(check) != *s
696 || ((slen = SvCUR(check)) > 1
697 && memNE(SvPVX_const(check), s, slen)))
698 goto report_neq;
699 check_at = s;
700 goto success_at_start;
701 }
702 }
703 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
704 s = strpos;
705 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
706 end_shift = prog->check_end_shift;
707
708 if (!ml_anch) {
709 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
710 - (SvTAIL(check) != 0);
711 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
712
713 if (end_shift < eshift)
714 end_shift = eshift;
715 }
716 }
717 else { /* Can match at random position */
718 ml_anch = 0;
719 s = strpos;
720 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
721 end_shift = prog->check_end_shift;
722
723 /* end shift should be non negative here */
724 }
725
726#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
727 if (end_shift < 0)
728 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
729 (IV)end_shift, RX_PRECOMP(prog));
730#endif
731
732 restart:
733 /* Find a possible match in the region s..strend by looking for
734 the "check" substring in the region corrected by start/end_shift. */
735
736 {
737 I32 srch_start_shift = start_shift;
738 I32 srch_end_shift = end_shift;
739 U8* start_point;
740 U8* end_point;
741 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
742 srch_end_shift -= ((strbeg - s) - srch_start_shift);
743 srch_start_shift = strbeg - s;
744 }
745 DEBUG_OPTIMISE_MORE_r({
746 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
747 (IV)prog->check_offset_min,
748 (IV)srch_start_shift,
749 (IV)srch_end_shift,
750 (IV)prog->check_end_shift);
751 });
752
753 if (prog->extflags & RXf_CANY_SEEN) {
754 start_point= (U8*)(s + srch_start_shift);
755 end_point= (U8*)(strend - srch_end_shift);
756 } else {
757 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
758 end_point= HOP3(strend, -srch_end_shift, strbeg);
759 }
760 DEBUG_OPTIMISE_MORE_r({
761 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
762 (int)(end_point - start_point),
763 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
764 start_point);
765 });
766
767 s = fbm_instr( start_point, end_point,
768 check, multiline ? FBMrf_MULTILINE : 0);
769 }
770 /* Update the count-of-usability, remove useless subpatterns,
771 unshift s. */
772
773 DEBUG_EXECUTE_r({
774 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
775 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
776 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
777 (s ? "Found" : "Did not find"),
778 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
779 ? "anchored" : "floating"),
780 quoted,
781 RE_SV_TAIL(check),
782 (s ? " at offset " : "...\n") );
783 });
784
785 if (!s)
786 goto fail_finish;
787 /* Finish the diagnostic message */
788 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
789
790 /* XXX dmq: first branch is for positive lookbehind...
791 Our check string is offset from the beginning of the pattern.
792 So we need to do any stclass tests offset forward from that
793 point. I think. :-(
794 */
795
796
797
798 check_at=s;
799
800
801 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
802 Start with the other substr.
803 XXXX no SCREAM optimization yet - and a very coarse implementation
804 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
805 *always* match. Probably should be marked during compile...
806 Probably it is right to do no SCREAM here...
807 */
808
809 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
810 : (prog->float_substr && prog->anchored_substr))
811 {
812 /* Take into account the "other" substring. */
813 /* XXXX May be hopelessly wrong for UTF... */
814 if (!other_last)
815 other_last = strpos;
816 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
817 do_other_anchored:
818 {
819 char * const last = HOP3c(s, -start_shift, strbeg);
820 char *last1, *last2;
821 char * const saved_s = s;
822 SV* must;
823
824 t = s - prog->check_offset_max;
825 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
826 && (!utf8_target
827 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
828 && t > strpos)))
829 NOOP;
830 else
831 t = strpos;
832 t = HOP3c(t, prog->anchored_offset, strend);
833 if (t < other_last) /* These positions already checked */
834 t = other_last;
835 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
836 if (last < last1)
837 last1 = last;
838 /* XXXX It is not documented what units *_offsets are in.
839 We assume bytes, but this is clearly wrong.
840 Meaning this code needs to be carefully reviewed for errors.
841 dmq.
842 */
843
844 /* On end-of-str: see comment below. */
845 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
846 if (must == &PL_sv_undef) {
847 s = (char*)NULL;
848 DEBUG_r(must = prog->anchored_utf8); /* for debug */
849 }
850 else
851 s = fbm_instr(
852 (unsigned char*)t,
853 HOP3(HOP3(last1, prog->anchored_offset, strend)
854 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
855 must,
856 multiline ? FBMrf_MULTILINE : 0
857 );
858 DEBUG_EXECUTE_r({
859 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
860 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
861 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
862 (s ? "Found" : "Contradicts"),
863 quoted, RE_SV_TAIL(must));
864 });
865
866
867 if (!s) {
868 if (last1 >= last2) {
869 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
870 ", giving up...\n"));
871 goto fail_finish;
872 }
873 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
874 ", trying floating at offset %ld...\n",
875 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
876 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
877 s = HOP3c(last, 1, strend);
878 goto restart;
879 }
880 else {
881 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
882 (long)(s - i_strpos)));
883 t = HOP3c(s, -prog->anchored_offset, strbeg);
884 other_last = HOP3c(s, 1, strend);
885 s = saved_s;
886 if (t == strpos)
887 goto try_at_start;
888 goto try_at_offset;
889 }
890 }
891 }
892 else { /* Take into account the floating substring. */
893 char *last, *last1;
894 char * const saved_s = s;
895 SV* must;
896
897 t = HOP3c(s, -start_shift, strbeg);
898 last1 = last =
899 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
900 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
901 last = HOP3c(t, prog->float_max_offset, strend);
902 s = HOP3c(t, prog->float_min_offset, strend);
903 if (s < other_last)
904 s = other_last;
905 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
906 must = utf8_target ? prog->float_utf8 : prog->float_substr;
907 /* fbm_instr() takes into account exact value of end-of-str
908 if the check is SvTAIL(ed). Since false positives are OK,
909 and end-of-str is not later than strend we are OK. */
910 if (must == &PL_sv_undef) {
911 s = (char*)NULL;
912 DEBUG_r(must = prog->float_utf8); /* for debug message */
913 }
914 else
915 s = fbm_instr((unsigned char*)s,
916 (unsigned char*)last + SvCUR(must)
917 - (SvTAIL(must)!=0),
918 must, multiline ? FBMrf_MULTILINE : 0);
919 DEBUG_EXECUTE_r({
920 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
921 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
922 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
923 (s ? "Found" : "Contradicts"),
924 quoted, RE_SV_TAIL(must));
925 });
926 if (!s) {
927 if (last1 == last) {
928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
929 ", giving up...\n"));
930 goto fail_finish;
931 }
932 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
933 ", trying anchored starting at offset %ld...\n",
934 (long)(saved_s + 1 - i_strpos)));
935 other_last = last;
936 s = HOP3c(t, 1, strend);
937 goto restart;
938 }
939 else {
940 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
941 (long)(s - i_strpos)));
942 other_last = s; /* Fix this later. --Hugo */
943 s = saved_s;
944 if (t == strpos)
945 goto try_at_start;
946 goto try_at_offset;
947 }
948 }
949 }
950
951
952 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
953
954 DEBUG_OPTIMISE_MORE_r(
955 PerlIO_printf(Perl_debug_log,
956 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
957 (IV)prog->check_offset_min,
958 (IV)prog->check_offset_max,
959 (IV)(s-strpos),
960 (IV)(t-strpos),
961 (IV)(t-s),
962 (IV)(strend-strpos)
963 )
964 );
965
966 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
967 && (!utf8_target
968 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
969 && t > strpos)))
970 {
971 /* Fixed substring is found far enough so that the match
972 cannot start at strpos. */
973 try_at_offset:
974 if (ml_anch && t[-1] != '\n') {
975 /* Eventually fbm_*() should handle this, but often
976 anchored_offset is not 0, so this check will not be wasted. */
977 /* XXXX In the code below we prefer to look for "^" even in
978 presence of anchored substrings. And we search even
979 beyond the found float position. These pessimizations
980 are historical artefacts only. */
981 find_anchor:
982 while (t < strend - prog->minlen) {
983 if (*t == '\n') {
984 if (t < check_at - prog->check_offset_min) {
985 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
986 /* Since we moved from the found position,
987 we definitely contradict the found anchored
988 substr. Due to the above check we do not
989 contradict "check" substr.
990 Thus we can arrive here only if check substr
991 is float. Redo checking for "other"=="fixed".
992 */
993 strpos = t + 1;
994 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
995 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
996 goto do_other_anchored;
997 }
998 /* We don't contradict the found floating substring. */
999 /* XXXX Why not check for STCLASS? */
1000 s = t + 1;
1001 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
1002 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
1003 goto set_useful;
1004 }
1005 /* Position contradicts check-string */
1006 /* XXXX probably better to look for check-string
1007 than for "\n", so one should lower the limit for t? */
1008 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1009 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1010 other_last = strpos = s = t + 1;
1011 goto restart;
1012 }
1013 t++;
1014 }
1015 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1016 PL_colors[0], PL_colors[1]));
1017 goto fail_finish;
1018 }
1019 else {
1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1021 PL_colors[0], PL_colors[1]));
1022 }
1023 s = t;
1024 set_useful:
1025 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1026 }
1027 else {
1028 /* The found string does not prohibit matching at strpos,
1029 - no optimization of calling REx engine can be performed,
1030 unless it was an MBOL and we are not after MBOL,
1031 or a future STCLASS check will fail this. */
1032 try_at_start:
1033 /* Even in this situation we may use MBOL flag if strpos is offset
1034 wrt the start of the string. */
1035 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1036 && (strpos != strbeg) && strpos[-1] != '\n'
1037 /* May be due to an implicit anchor of m{.*foo} */
1038 && !(prog->intflags & PREGf_IMPLICIT))
1039 {
1040 t = strpos;
1041 goto find_anchor;
1042 }
1043 DEBUG_EXECUTE_r( if (ml_anch)
1044 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1045 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1046 );
1047 success_at_start:
1048 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1049 && (utf8_target ? (
1050 prog->check_utf8 /* Could be deleted already */
1051 && --BmUSEFUL(prog->check_utf8) < 0
1052 && (prog->check_utf8 == prog->float_utf8)
1053 ) : (
1054 prog->check_substr /* Could be deleted already */
1055 && --BmUSEFUL(prog->check_substr) < 0
1056 && (prog->check_substr == prog->float_substr)
1057 )))
1058 {
1059 /* If flags & SOMETHING - do not do it many times on the same match */
1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1061 /* XXX Does the destruction order has to change with utf8_target? */
1062 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1063 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1064 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1065 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1066 check = NULL; /* abort */
1067 s = strpos;
1068 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1069 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1070 if (prog->intflags & PREGf_IMPLICIT)
1071 prog->extflags &= ~RXf_ANCH_MBOL;
1072 /* XXXX This is a remnant of the old implementation. It
1073 looks wasteful, since now INTUIT can use many
1074 other heuristics. */
1075 prog->extflags &= ~RXf_USE_INTUIT;
1076 /* XXXX What other flags might need to be cleared in this branch? */
1077 }
1078 else
1079 s = strpos;
1080 }
1081
1082 /* Last resort... */
1083 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1084 /* trie stclasses are too expensive to use here, we are better off to
1085 leave it to regmatch itself */
1086 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1087 /* minlen == 0 is possible if regstclass is \b or \B,
1088 and the fixed substr is ''$.
1089 Since minlen is already taken into account, s+1 is before strend;
1090 accidentally, minlen >= 1 guaranties no false positives at s + 1
1091 even for \b or \B. But (minlen? 1 : 0) below assumes that
1092 regstclass does not come from lookahead... */
1093 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1094 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1095 const U8* const str = (U8*)STRING(progi->regstclass);
1096 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1097 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1098 : 1);
1099 char * endpos;
1100 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1101 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1102 else if (prog->float_substr || prog->float_utf8)
1103 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1104 else
1105 endpos= strend;
1106
1107 if (checked_upto < s)
1108 checked_upto = s;
1109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1110 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1111
1112 t = s;
1113 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1114 if (s) {
1115 checked_upto = s;
1116 } else {
1117#ifdef DEBUGGING
1118 const char *what = NULL;
1119#endif
1120 if (endpos == strend) {
1121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1122 "Could not match STCLASS...\n") );
1123 goto fail;
1124 }
1125 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1126 "This position contradicts STCLASS...\n") );
1127 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1128 goto fail;
1129 checked_upto = HOPBACKc(endpos, start_shift);
1130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1131 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1132 /* Contradict one of substrings */
1133 if (prog->anchored_substr || prog->anchored_utf8) {
1134 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1135 DEBUG_EXECUTE_r( what = "anchored" );
1136 hop_and_restart:
1137 s = HOP3c(t, 1, strend);
1138 if (s + start_shift + end_shift > strend) {
1139 /* XXXX Should be taken into account earlier? */
1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141 "Could not match STCLASS...\n") );
1142 goto fail;
1143 }
1144 if (!check)
1145 goto giveup;
1146 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1147 "Looking for %s substr starting at offset %ld...\n",
1148 what, (long)(s + start_shift - i_strpos)) );
1149 goto restart;
1150 }
1151 /* Have both, check_string is floating */
1152 if (t + start_shift >= check_at) /* Contradicts floating=check */
1153 goto retry_floating_check;
1154 /* Recheck anchored substring, but not floating... */
1155 s = check_at;
1156 if (!check)
1157 goto giveup;
1158 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1159 "Looking for anchored substr starting at offset %ld...\n",
1160 (long)(other_last - i_strpos)) );
1161 goto do_other_anchored;
1162 }
1163 /* Another way we could have checked stclass at the
1164 current position only: */
1165 if (ml_anch) {
1166 s = t = t + 1;
1167 if (!check)
1168 goto giveup;
1169 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1170 "Looking for /%s^%s/m starting at offset %ld...\n",
1171 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1172 goto try_at_offset;
1173 }
1174 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1175 goto fail;
1176 /* Check is floating substring. */
1177 retry_floating_check:
1178 t = check_at - start_shift;
1179 DEBUG_EXECUTE_r( what = "floating" );
1180 goto hop_and_restart;
1181 }
1182 if (t != s) {
1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1184 "By STCLASS: moving %ld --> %ld\n",
1185 (long)(t - i_strpos), (long)(s - i_strpos))
1186 );
1187 }
1188 else {
1189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1190 "Does not contradict STCLASS...\n");
1191 );
1192 }
1193 }
1194 giveup:
1195 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1196 PL_colors[4], (check ? "Guessed" : "Giving up"),
1197 PL_colors[5], (long)(s - i_strpos)) );
1198 return s;
1199
1200 fail_finish: /* Substring not found */
1201 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1202 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1203 fail:
1204 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1205 PL_colors[4], PL_colors[5]));
1206 return NULL;
1207}
1208
1209#define DECL_TRIE_TYPE(scan) \
1210 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1211 trie_type = ((scan->flags == EXACT) \
1212 ? (utf8_target ? trie_utf8 : trie_plain) \
1213 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1214
1215#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1216uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1217 STRLEN skiplen; \
1218 switch (trie_type) { \
1219 case trie_utf8_fold: \
1220 if ( foldlen>0 ) { \
1221 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1222 foldlen -= len; \
1223 uscan += len; \
1224 len=0; \
1225 } else { \
1226 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1227 len = UTF8SKIP(uc); \
1228 skiplen = UNISKIP( uvc ); \
1229 foldlen -= skiplen; \
1230 uscan = foldbuf + skiplen; \
1231 } \
1232 break; \
1233 case trie_latin_utf8_fold: \
1234 if ( foldlen>0 ) { \
1235 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1236 foldlen -= len; \
1237 uscan += len; \
1238 len=0; \
1239 } else { \
1240 len = 1; \
1241 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1242 skiplen = UNISKIP( uvc ); \
1243 foldlen -= skiplen; \
1244 uscan = foldbuf + skiplen; \
1245 } \
1246 break; \
1247 case trie_utf8: \
1248 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1249 break; \
1250 case trie_plain: \
1251 uvc = (UV)*uc; \
1252 len = 1; \
1253 } \
1254 if (uvc < 256) { \
1255 charid = trie->charmap[ uvc ]; \
1256 } \
1257 else { \
1258 charid = 0; \
1259 if (widecharmap) { \
1260 SV** const svpp = hv_fetch(widecharmap, \
1261 (char*)&uvc, sizeof(UV), 0); \
1262 if (svpp) \
1263 charid = (U16)SvIV(*svpp); \
1264 } \
1265 } \
1266} STMT_END
1267
1268#define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1269STMT_START { \
1270 while (s <= e) { \
1271 if ( (CoNd) \
1272 && (ln == 1 || folder(s, pat_string, ln)) \
1273 && (!reginfo || regtry(reginfo, &s)) ) \
1274 goto got_it; \
1275 s++; \
1276 } \
1277} STMT_END
1278
1279#define REXEC_FBC_UTF8_SCAN(CoDe) \
1280STMT_START { \
1281 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
1282 CoDe \
1283 s += uskip; \
1284 } \
1285} STMT_END
1286
1287#define REXEC_FBC_SCAN(CoDe) \
1288STMT_START { \
1289 while (s < strend) { \
1290 CoDe \
1291 s++; \
1292 } \
1293} STMT_END
1294
1295#define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1296REXEC_FBC_UTF8_SCAN( \
1297 if (CoNd) { \
1298 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1299 goto got_it; \
1300 else \
1301 tmp = doevery; \
1302 } \
1303 else \
1304 tmp = 1; \
1305)
1306
1307#define REXEC_FBC_CLASS_SCAN(CoNd) \
1308REXEC_FBC_SCAN( \
1309 if (CoNd) { \
1310 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1311 goto got_it; \
1312 else \
1313 tmp = doevery; \
1314 } \
1315 else \
1316 tmp = 1; \
1317)
1318
1319#define REXEC_FBC_TRYIT \
1320if ((!reginfo || regtry(reginfo, &s))) \
1321 goto got_it
1322
1323#define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1324 if (utf8_target) { \
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1326 } \
1327 else { \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
1329 }
1330
1331#define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1332 if (utf8_target) { \
1333 UtFpReLoAd; \
1334 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1335 } \
1336 else { \
1337 REXEC_FBC_CLASS_SCAN(CoNd); \
1338 }
1339
1340#define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1341 PL_reg_flags |= RF_tainted; \
1342 if (utf8_target) { \
1343 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1344 } \
1345 else { \
1346 REXEC_FBC_CLASS_SCAN(CoNd); \
1347 }
1348
1349#define DUMP_EXEC_POS(li,s,doutf8) \
1350 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1351
1352
1353#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1354 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1355 tmp = TEST_NON_UTF8(tmp); \
1356 REXEC_FBC_UTF8_SCAN( \
1357 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1358 tmp = !tmp; \
1359 IF_SUCCESS; \
1360 } \
1361 else { \
1362 IF_FAIL; \
1363 } \
1364 ); \
1365
1366#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1367 if (s == PL_bostr) { \
1368 tmp = '\n'; \
1369 } \
1370 else { \
1371 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1372 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1373 } \
1374 tmp = TeSt1_UtF8; \
1375 LOAD_UTF8_CHARCLASS_ALNUM(); \
1376 REXEC_FBC_UTF8_SCAN( \
1377 if (tmp == ! (TeSt2_UtF8)) { \
1378 tmp = !tmp; \
1379 IF_SUCCESS; \
1380 } \
1381 else { \
1382 IF_FAIL; \
1383 } \
1384 ); \
1385
1386/* The only difference between the BOUND and NBOUND cases is that
1387 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1388 * NBOUND. This is accomplished by passing it in either the if or else clause,
1389 * with the other one being empty */
1390#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1391 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1392
1393#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1394 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1395
1396#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1397 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1398
1399#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1400 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1401
1402
1403/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1404 * be passed in completely with the variable name being tested, which isn't
1405 * such a clean interface, but this is easier to read than it was before. We
1406 * are looking for the boundary (or non-boundary between a word and non-word
1407 * character. The utf8 and non-utf8 cases have the same logic, but the details
1408 * must be different. Find the "wordness" of the character just prior to this
1409 * one, and compare it with the wordness of this one. If they differ, we have
1410 * a boundary. At the beginning of the string, pretend that the previous
1411 * character was a new-line */
1412#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1413 if (utf8_target) { \
1414 UTF8_CODE \
1415 } \
1416 else { /* Not utf8 */ \
1417 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1418 tmp = TEST_NON_UTF8(tmp); \
1419 REXEC_FBC_SCAN( \
1420 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1421 tmp = !tmp; \
1422 IF_SUCCESS; \
1423 } \
1424 else { \
1425 IF_FAIL; \
1426 } \
1427 ); \
1428 } \
1429 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1430 goto got_it;
1431
1432/* We know what class REx starts with. Try to find this position... */
1433/* if reginfo is NULL, its a dryrun */
1434/* annoyingly all the vars in this routine have different names from their counterparts
1435 in regmatch. /grrr */
1436
1437STATIC char *
1438S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1439 const char *strend, regmatch_info *reginfo)
1440{
1441 dVAR;
1442 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1443 char *pat_string; /* The pattern's exactish string */
1444 char *pat_end; /* ptr to end char of pat_string */
1445 re_fold_t folder; /* Function for computing non-utf8 folds */
1446 const U8 *fold_array; /* array for folding ords < 256 */
1447 STRLEN ln;
1448 STRLEN lnc;
1449 STRLEN uskip;
1450 U8 c1;
1451 U8 c2;
1452 char *e;
1453 I32 tmp = 1; /* Scratch variable? */
1454 const bool utf8_target = PL_reg_match_utf8;
1455 UV utf8_fold_flags = 0;
1456 RXi_GET_DECL(prog,progi);
1457
1458 PERL_ARGS_ASSERT_FIND_BYCLASS;
1459
1460 /* We know what class it must start with. */
1461 switch (OP(c)) {
1462 case ANYOF:
1463 if (utf8_target) {
1464 REXEC_FBC_UTF8_CLASS_SCAN(
1465 reginclass(prog, c, (U8*)s, utf8_target));
1466 }
1467 else {
1468 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1469 }
1470 break;
1471 case CANY:
1472 REXEC_FBC_SCAN(
1473 if (tmp && (!reginfo || regtry(reginfo, &s)))
1474 goto got_it;
1475 else
1476 tmp = doevery;
1477 );
1478 break;
1479
1480 case EXACTFA:
1481 if (UTF_PATTERN || utf8_target) {
1482 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1483 goto do_exactf_utf8;
1484 }
1485 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1486 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1487 goto do_exactf_non_utf8; /* isn't dealt with by these */
1488
1489 case EXACTF:
1490 if (utf8_target) {
1491
1492 /* regcomp.c already folded this if pattern is in UTF-8 */
1493 utf8_fold_flags = 0;
1494 goto do_exactf_utf8;
1495 }
1496 fold_array = PL_fold;
1497 folder = foldEQ;
1498 goto do_exactf_non_utf8;
1499
1500 case EXACTFL:
1501 if (UTF_PATTERN || utf8_target) {
1502 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1503 goto do_exactf_utf8;
1504 }
1505 fold_array = PL_fold_locale;
1506 folder = foldEQ_locale;
1507 goto do_exactf_non_utf8;
1508
1509 case EXACTFU_SS:
1510 if (UTF_PATTERN) {
1511 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1512 }
1513 goto do_exactf_utf8;
1514
1515 case EXACTFU_TRICKYFOLD:
1516 case EXACTFU:
1517 if (UTF_PATTERN || utf8_target) {
1518 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1519 goto do_exactf_utf8;
1520 }
1521
1522 /* Any 'ss' in the pattern should have been replaced by regcomp,
1523 * so we don't have to worry here about this single special case
1524 * in the Latin1 range */
1525 fold_array = PL_fold_latin1;
1526 folder = foldEQ_latin1;
1527
1528 /* FALL THROUGH */
1529
1530 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1531 are no glitches with fold-length differences
1532 between the target string and pattern */
1533
1534 /* The idea in the non-utf8 EXACTF* cases is to first find the
1535 * first character of the EXACTF* node and then, if necessary,
1536 * case-insensitively compare the full text of the node. c1 is the
1537 * first character. c2 is its fold. This logic will not work for
1538 * Unicode semantics and the german sharp ss, which hence should
1539 * not be compiled into a node that gets here. */
1540 pat_string = STRING(c);
1541 ln = STR_LEN(c); /* length to match in octets/bytes */
1542
1543 /* We know that we have to match at least 'ln' bytes (which is the
1544 * same as characters, since not utf8). If we have to match 3
1545 * characters, and there are only 2 availabe, we know without
1546 * trying that it will fail; so don't start a match past the
1547 * required minimum number from the far end */
1548 e = HOP3c(strend, -((I32)ln), s);
1549
1550 if (!reginfo && e < s) {
1551 e = s; /* Due to minlen logic of intuit() */
1552 }
1553
1554 c1 = *pat_string;
1555 c2 = fold_array[c1];
1556 if (c1 == c2) { /* If char and fold are the same */
1557 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1558 }
1559 else {
1560 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1561 }
1562 break;
1563
1564 do_exactf_utf8:
1565 {
1566 unsigned expansion;
1567
1568
1569 /* If one of the operands is in utf8, we can't use the simpler
1570 * folding above, due to the fact that many different characters
1571 * can have the same fold, or portion of a fold, or different-
1572 * length fold */
1573 pat_string = STRING(c);
1574 ln = STR_LEN(c); /* length to match in octets/bytes */
1575 pat_end = pat_string + ln;
1576 lnc = (UTF_PATTERN) /* length to match in characters */
1577 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1578 : ln;
1579
1580 /* We have 'lnc' characters to match in the pattern, but because of
1581 * multi-character folding, each character in the target can match
1582 * up to 3 characters (Unicode guarantees it will never exceed
1583 * this) if it is utf8-encoded; and up to 2 if not (based on the
1584 * fact that the Latin 1 folds are already determined, and the
1585 * only multi-char fold in that range is the sharp-s folding to
1586 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1587 * string character. Adjust lnc accordingly, rounding up, so that
1588 * if we need to match at least 4+1/3 chars, that really is 5. */
1589 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1590 lnc = (lnc + expansion - 1) / expansion;
1591
1592 /* As in the non-UTF8 case, if we have to match 3 characters, and
1593 * only 2 are left, it's guaranteed to fail, so don't start a
1594 * match that would require us to go beyond the end of the string
1595 */
1596 e = HOP3c(strend, -((I32)lnc), s);
1597
1598 if (!reginfo && e < s) {
1599 e = s; /* Due to minlen logic of intuit() */
1600 }
1601
1602 /* XXX Note that we could recalculate e to stop the loop earlier,
1603 * as the worst case expansion above will rarely be met, and as we
1604 * go along we would usually find that e moves further to the left.
1605 * This would happen only after we reached the point in the loop
1606 * where if there were no expansion we should fail. Unclear if
1607 * worth the expense */
1608
1609 while (s <= e) {
1610 char *my_strend= (char *)strend;
1611 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1612 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1613 && (!reginfo || regtry(reginfo, &s)) )
1614 {
1615 goto got_it;
1616 }
1617 s += (utf8_target) ? UTF8SKIP(s) : 1;
1618 }
1619 break;
1620 }
1621 case BOUNDL:
1622 PL_reg_flags |= RF_tainted;
1623 FBC_BOUND(isALNUM_LC,
1624 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1625 isALNUM_LC_utf8((U8*)s));
1626 break;
1627 case NBOUNDL:
1628 PL_reg_flags |= RF_tainted;
1629 FBC_NBOUND(isALNUM_LC,
1630 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1631 isALNUM_LC_utf8((U8*)s));
1632 break;
1633 case BOUND:
1634 FBC_BOUND(isWORDCHAR,
1635 isALNUM_uni(tmp),
1636 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1637 break;
1638 case BOUNDA:
1639 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1640 isWORDCHAR_A(tmp),
1641 isWORDCHAR_A((U8*)s));
1642 break;
1643 case NBOUND:
1644 FBC_NBOUND(isWORDCHAR,
1645 isALNUM_uni(tmp),
1646 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1647 break;
1648 case NBOUNDA:
1649 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1650 isWORDCHAR_A(tmp),
1651 isWORDCHAR_A((U8*)s));
1652 break;
1653 case BOUNDU:
1654 FBC_BOUND(isWORDCHAR_L1,
1655 isALNUM_uni(tmp),
1656 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1657 break;
1658 case NBOUNDU:
1659 FBC_NBOUND(isWORDCHAR_L1,
1660 isALNUM_uni(tmp),
1661 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1662 break;
1663 case ALNUML:
1664 REXEC_FBC_CSCAN_TAINT(
1665 isALNUM_LC_utf8((U8*)s),
1666 isALNUM_LC(*s)
1667 );
1668 break;
1669 case ALNUMU:
1670 REXEC_FBC_CSCAN_PRELOAD(
1671 LOAD_UTF8_CHARCLASS_ALNUM(),
1672 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1673 isWORDCHAR_L1((U8) *s)
1674 );
1675 break;
1676 case ALNUM:
1677 REXEC_FBC_CSCAN_PRELOAD(
1678 LOAD_UTF8_CHARCLASS_ALNUM(),
1679 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1680 isWORDCHAR((U8) *s)
1681 );
1682 break;
1683 case ALNUMA:
1684 /* Don't need to worry about utf8, as it can match only a single
1685 * byte invariant character */
1686 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1687 break;
1688 case NALNUMU:
1689 REXEC_FBC_CSCAN_PRELOAD(
1690 LOAD_UTF8_CHARCLASS_ALNUM(),
1691 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1692 ! isWORDCHAR_L1((U8) *s)
1693 );
1694 break;
1695 case NALNUM:
1696 REXEC_FBC_CSCAN_PRELOAD(
1697 LOAD_UTF8_CHARCLASS_ALNUM(),
1698 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1699 ! isALNUM(*s)
1700 );
1701 break;
1702 case NALNUMA:
1703 REXEC_FBC_CSCAN(
1704 !isWORDCHAR_A(*s),
1705 !isWORDCHAR_A(*s)
1706 );
1707 break;
1708 case NALNUML:
1709 REXEC_FBC_CSCAN_TAINT(
1710 !isALNUM_LC_utf8((U8*)s),
1711 !isALNUM_LC(*s)
1712 );
1713 break;
1714 case SPACEU:
1715 REXEC_FBC_CSCAN(
1716 is_XPERLSPACE_utf8(s),
1717 isSPACE_L1((U8) *s)
1718 );
1719 break;
1720 case SPACE:
1721 REXEC_FBC_CSCAN(
1722 is_XPERLSPACE_utf8(s),
1723 isSPACE((U8) *s)
1724 );
1725 break;
1726 case SPACEA:
1727 /* Don't need to worry about utf8, as it can match only a single
1728 * byte invariant character */
1729 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1730 break;
1731 case SPACEL:
1732 REXEC_FBC_CSCAN_TAINT(
1733 isSPACE_LC_utf8((U8*)s),
1734 isSPACE_LC(*s)
1735 );
1736 break;
1737 case NSPACEU:
1738 REXEC_FBC_CSCAN(
1739 ! is_XPERLSPACE_utf8(s),
1740 ! isSPACE_L1((U8) *s)
1741 );
1742 break;
1743 case NSPACE:
1744 REXEC_FBC_CSCAN(
1745 ! is_XPERLSPACE_utf8(s),
1746 ! isSPACE((U8) *s)
1747 );
1748 break;
1749 case NSPACEA:
1750 REXEC_FBC_CSCAN(
1751 !isSPACE_A(*s),
1752 !isSPACE_A(*s)
1753 );
1754 break;
1755 case NSPACEL:
1756 REXEC_FBC_CSCAN_TAINT(
1757 !isSPACE_LC_utf8((U8*)s),
1758 !isSPACE_LC(*s)
1759 );
1760 break;
1761 case DIGIT:
1762 REXEC_FBC_CSCAN_PRELOAD(
1763 LOAD_UTF8_CHARCLASS_DIGIT(),
1764 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1765 isDIGIT(*s)
1766 );
1767 break;
1768 case DIGITA:
1769 /* Don't need to worry about utf8, as it can match only a single
1770 * byte invariant character */
1771 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1772 break;
1773 case DIGITL:
1774 REXEC_FBC_CSCAN_TAINT(
1775 isDIGIT_LC_utf8((U8*)s),
1776 isDIGIT_LC(*s)
1777 );
1778 break;
1779 case NDIGIT:
1780 REXEC_FBC_CSCAN_PRELOAD(
1781 LOAD_UTF8_CHARCLASS_DIGIT(),
1782 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1783 !isDIGIT(*s)
1784 );
1785 break;
1786 case NDIGITA:
1787 REXEC_FBC_CSCAN(
1788 !isDIGIT_A(*s),
1789 !isDIGIT_A(*s)
1790 );
1791 break;
1792 case NDIGITL:
1793 REXEC_FBC_CSCAN_TAINT(
1794 !isDIGIT_LC_utf8((U8*)s),
1795 !isDIGIT_LC(*s)
1796 );
1797 break;
1798 case LNBREAK:
1799 REXEC_FBC_CSCAN(
1800 is_LNBREAK_utf8_safe(s, strend),
1801 is_LNBREAK_latin1_safe(s, strend)
1802 );
1803 break;
1804 case VERTWS:
1805 REXEC_FBC_CSCAN(
1806 is_VERTWS_utf8_safe(s, strend),
1807 is_VERTWS_latin1_safe(s, strend)
1808 );
1809 break;
1810 case NVERTWS:
1811 REXEC_FBC_CSCAN(
1812 !is_VERTWS_utf8_safe(s, strend),
1813 !is_VERTWS_latin1_safe(s, strend)
1814 );
1815 break;
1816 case HORIZWS:
1817 REXEC_FBC_CSCAN(
1818 is_HORIZWS_utf8_safe(s, strend),
1819 is_HORIZWS_latin1_safe(s, strend)
1820 );
1821 break;
1822 case NHORIZWS:
1823 REXEC_FBC_CSCAN(
1824 !is_HORIZWS_utf8_safe(s, strend),
1825 !is_HORIZWS_latin1_safe(s, strend)
1826 );
1827 break;
1828 case POSIXA:
1829 /* Don't need to worry about utf8, as it can match only a single
1830 * byte invariant character. The flag in this node type is the
1831 * class number to pass to _generic_isCC() to build a mask for
1832 * searching in PL_charclass[] */
1833 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1834 break;
1835 case NPOSIXA:
1836 REXEC_FBC_CSCAN(
1837 !_generic_isCC_A(*s, FLAGS(c)),
1838 !_generic_isCC_A(*s, FLAGS(c))
1839 );
1840 break;
1841
1842 case AHOCORASICKC:
1843 case AHOCORASICK:
1844 {
1845 DECL_TRIE_TYPE(c);
1846 /* what trie are we using right now */
1847 reg_ac_data *aho
1848 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1849 reg_trie_data *trie
1850 = (reg_trie_data*)progi->data->data[ aho->trie ];
1851 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1852
1853 const char *last_start = strend - trie->minlen;
1854#ifdef DEBUGGING
1855 const char *real_start = s;
1856#endif
1857 STRLEN maxlen = trie->maxlen;
1858 SV *sv_points;
1859 U8 **points; /* map of where we were in the input string
1860 when reading a given char. For ASCII this
1861 is unnecessary overhead as the relationship
1862 is always 1:1, but for Unicode, especially
1863 case folded Unicode this is not true. */
1864 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1865 U8 *bitmap=NULL;
1866
1867
1868 GET_RE_DEBUG_FLAGS_DECL;
1869
1870 /* We can't just allocate points here. We need to wrap it in
1871 * an SV so it gets freed properly if there is a croak while
1872 * running the match */
1873 ENTER;
1874 SAVETMPS;
1875 sv_points=newSV(maxlen * sizeof(U8 *));
1876 SvCUR_set(sv_points,
1877 maxlen * sizeof(U8 *));
1878 SvPOK_on(sv_points);
1879 sv_2mortal(sv_points);
1880 points=(U8**)SvPV_nolen(sv_points );
1881 if ( trie_type != trie_utf8_fold
1882 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1883 {
1884 if (trie->bitmap)
1885 bitmap=(U8*)trie->bitmap;
1886 else
1887 bitmap=(U8*)ANYOF_BITMAP(c);
1888 }
1889 /* this is the Aho-Corasick algorithm modified a touch
1890 to include special handling for long "unknown char"
1891 sequences. The basic idea being that we use AC as long
1892 as we are dealing with a possible matching char, when
1893 we encounter an unknown char (and we have not encountered
1894 an accepting state) we scan forward until we find a legal
1895 starting char.
1896 AC matching is basically that of trie matching, except
1897 that when we encounter a failing transition, we fall back
1898 to the current states "fail state", and try the current char
1899 again, a process we repeat until we reach the root state,
1900 state 1, or a legal transition. If we fail on the root state
1901 then we can either terminate if we have reached an accepting
1902 state previously, or restart the entire process from the beginning
1903 if we have not.
1904
1905 */
1906 while (s <= last_start) {
1907 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1908 U8 *uc = (U8*)s;
1909 U16 charid = 0;
1910 U32 base = 1;
1911 U32 state = 1;
1912 UV uvc = 0;
1913 STRLEN len = 0;
1914 STRLEN foldlen = 0;
1915 U8 *uscan = (U8*)NULL;
1916 U8 *leftmost = NULL;
1917#ifdef DEBUGGING
1918 U32 accepted_word= 0;
1919#endif
1920 U32 pointpos = 0;
1921
1922 while ( state && uc <= (U8*)strend ) {
1923 int failed=0;
1924 U32 word = aho->states[ state ].wordnum;
1925
1926 if( state==1 ) {
1927 if ( bitmap ) {
1928 DEBUG_TRIE_EXECUTE_r(
1929 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1930 dump_exec_pos( (char *)uc, c, strend, real_start,
1931 (char *)uc, utf8_target );
1932 PerlIO_printf( Perl_debug_log,
1933 " Scanning for legal start char...\n");
1934 }
1935 );
1936 if (utf8_target) {
1937 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1938 uc += UTF8SKIP(uc);
1939 }
1940 } else {
1941 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1942 uc++;
1943 }
1944 }
1945 s= (char *)uc;
1946 }
1947 if (uc >(U8*)last_start) break;
1948 }
1949
1950 if ( word ) {
1951 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1952 if (!leftmost || lpos < leftmost) {
1953 DEBUG_r(accepted_word=word);
1954 leftmost= lpos;
1955 }
1956 if (base==0) break;
1957
1958 }
1959 points[pointpos++ % maxlen]= uc;
1960 if (foldlen || uc < (U8*)strend) {
1961 REXEC_TRIE_READ_CHAR(trie_type, trie,
1962 widecharmap, uc,
1963 uscan, len, uvc, charid, foldlen,
1964 foldbuf, uniflags);
1965 DEBUG_TRIE_EXECUTE_r({
1966 dump_exec_pos( (char *)uc, c, strend,
1967 real_start, s, utf8_target);
1968 PerlIO_printf(Perl_debug_log,
1969 " Charid:%3u CP:%4"UVxf" ",
1970 charid, uvc);
1971 });
1972 }
1973 else {
1974 len = 0;
1975 charid = 0;
1976 }
1977
1978
1979 do {
1980#ifdef DEBUGGING
1981 word = aho->states[ state ].wordnum;
1982#endif
1983 base = aho->states[ state ].trans.base;
1984
1985 DEBUG_TRIE_EXECUTE_r({
1986 if (failed)
1987 dump_exec_pos( (char *)uc, c, strend, real_start,
1988 s, utf8_target );
1989 PerlIO_printf( Perl_debug_log,
1990 "%sState: %4"UVxf", word=%"UVxf,
1991 failed ? " Fail transition to " : "",
1992 (UV)state, (UV)word);
1993 });
1994 if ( base ) {
1995 U32 tmp;
1996 I32 offset;
1997 if (charid &&
1998 ( ((offset = base + charid
1999 - 1 - trie->uniquecharcount)) >= 0)
2000 && ((U32)offset < trie->lasttrans)
2001 && trie->trans[offset].check == state
2002 && (tmp=trie->trans[offset].next))
2003 {
2004 DEBUG_TRIE_EXECUTE_r(
2005 PerlIO_printf( Perl_debug_log," - legal\n"));
2006 state = tmp;
2007 break;
2008 }
2009 else {
2010 DEBUG_TRIE_EXECUTE_r(
2011 PerlIO_printf( Perl_debug_log," - fail\n"));
2012 failed = 1;
2013 state = aho->fail[state];
2014 }
2015 }
2016 else {
2017 /* we must be accepting here */
2018 DEBUG_TRIE_EXECUTE_r(
2019 PerlIO_printf( Perl_debug_log," - accepting\n"));
2020 failed = 1;
2021 break;
2022 }
2023 } while(state);
2024 uc += len;
2025 if (failed) {
2026 if (leftmost)
2027 break;
2028 if (!state) state = 1;
2029 }
2030 }
2031 if ( aho->states[ state ].wordnum ) {
2032 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2033 if (!leftmost || lpos < leftmost) {
2034 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2035 leftmost = lpos;
2036 }
2037 }
2038 if (leftmost) {
2039 s = (char*)leftmost;
2040 DEBUG_TRIE_EXECUTE_r({
2041 PerlIO_printf(
2042 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2043 (UV)accepted_word, (IV)(s - real_start)
2044 );
2045 });
2046 if (!reginfo || regtry(reginfo, &s)) {
2047 FREETMPS;
2048 LEAVE;
2049 goto got_it;
2050 }
2051 s = HOPc(s,1);
2052 DEBUG_TRIE_EXECUTE_r({
2053 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2054 });
2055 } else {
2056 DEBUG_TRIE_EXECUTE_r(
2057 PerlIO_printf( Perl_debug_log,"No match.\n"));
2058 break;
2059 }
2060 }
2061 FREETMPS;
2062 LEAVE;
2063 }
2064 break;
2065 default:
2066 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2067 break;
2068 }
2069 return 0;
2070 got_it:
2071 return s;
2072}
2073
2074
2075/*
2076 - regexec_flags - match a regexp against a string
2077 */
2078I32
2079Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2080 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2081/* stringarg: the point in the string at which to begin matching */
2082/* strend: pointer to null at end of string */
2083/* strbeg: real beginning of string */
2084/* minend: end of match must be >= minend bytes after stringarg. */
2085/* sv: SV being matched: only used for utf8 flag, pos() etc; string
2086 * itself is accessed via the pointers above */
2087/* data: May be used for some additional optimizations.
2088 Currently its only used, with a U32 cast, for transmitting
2089 the ganch offset when doing a /g match. This will change */
2090/* nosave: For optimizations. */
2091
2092{
2093 dVAR;
2094 struct regexp *const prog = ReANY(rx);
2095 /*register*/ char *s;
2096 regnode *c;
2097 /*register*/ char *startpos = stringarg;
2098 I32 minlen; /* must match at least this many chars */
2099 I32 dontbother = 0; /* how many characters not to try at end */
2100 I32 end_shift = 0; /* Same for the end. */ /* CC */
2101 I32 scream_pos = -1; /* Internal iterator of scream. */
2102 char *scream_olds = NULL;
2103 const bool utf8_target = cBOOL(DO_UTF8(sv));
2104 I32 multiline;
2105 RXi_GET_DECL(prog,progi);
2106 regmatch_info reginfo; /* create some info to pass to regtry etc */
2107 regexp_paren_pair *swap = NULL;
2108 GET_RE_DEBUG_FLAGS_DECL;
2109
2110 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2111 PERL_UNUSED_ARG(data);
2112
2113 /* Be paranoid... */
2114 if (prog == NULL || startpos == NULL) {
2115 Perl_croak(aTHX_ "NULL regexp parameter");
2116 return 0;
2117 }
2118
2119 multiline = prog->extflags & RXf_PMf_MULTILINE;
2120 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2121
2122 RX_MATCH_UTF8_set(rx, utf8_target);
2123 DEBUG_EXECUTE_r(
2124 debug_start_match(rx, utf8_target, startpos, strend,
2125 "Matching");
2126 );
2127
2128 minlen = prog->minlen;
2129
2130 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2131 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2132 "String too short [regexec_flags]...\n"));
2133 goto phooey;
2134 }
2135
2136
2137 /* Check validity of program. */
2138 if (UCHARAT(progi->program) != REG_MAGIC) {
2139 Perl_croak(aTHX_ "corrupted regexp program");
2140 }
2141
2142 PL_reg_flags = 0;
2143 PL_reg_state.re_state_eval_setup_done = FALSE;
2144 PL_reg_maxiter = 0;
2145
2146 if (RX_UTF8(rx))
2147 PL_reg_flags |= RF_utf8;
2148
2149 /* Mark beginning of line for ^ and lookbehind. */
2150 reginfo.bol = startpos; /* XXX not used ??? */
2151 PL_bostr = strbeg;
2152 reginfo.sv = sv;
2153
2154 /* Mark end of line for $ (and such) */
2155 PL_regeol = strend;
2156
2157 /* see how far we have to get to not match where we matched before */
2158 reginfo.till = startpos+minend;
2159
2160 /* If there is a "must appear" string, look for it. */
2161 s = startpos;
2162
2163 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2164 MAGIC *mg;
2165 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2166 reginfo.ganch = startpos + prog->gofs;
2167 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2168 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2169 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2170 && SvMAGIC(sv)
2171 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2172 && mg->mg_len >= 0) {
2173 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2174 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2175 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2176
2177 if (prog->extflags & RXf_ANCH_GPOS) {
2178 if (s > reginfo.ganch)
2179 goto phooey;
2180 s = reginfo.ganch - prog->gofs;
2181 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2182 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2183 if (s < strbeg)
2184 goto phooey;
2185 }
2186 }
2187 else if (data) {
2188 reginfo.ganch = strbeg + PTR2UV(data);
2189 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2190 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2191
2192 } else { /* pos() not defined */
2193 reginfo.ganch = strbeg;
2194 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2195 "GPOS: reginfo.ganch = strbeg\n"));
2196 }
2197 }
2198 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2199 /* We have to be careful. If the previous successful match
2200 was from this regex we don't want a subsequent partially
2201 successful match to clobber the old results.
2202 So when we detect this possibility we add a swap buffer
2203 to the re, and switch the buffer each match. If we fail
2204 we switch it back, otherwise we leave it swapped.
2205 */
2206 swap = prog->offs;
2207 /* do we need a save destructor here for eval dies? */
2208 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2209 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2210 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2211 PTR2UV(prog),
2212 PTR2UV(swap),
2213 PTR2UV(prog->offs)
2214 ));
2215 }
2216 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2217 re_scream_pos_data d;
2218
2219 d.scream_olds = &scream_olds;
2220 d.scream_pos = &scream_pos;
2221 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2222 if (!s) {
2223 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2224 goto phooey; /* not present */
2225 }
2226 }
2227
2228
2229
2230 /* Simplest case: anchored match need be tried only once. */
2231 /* [unless only anchor is BOL and multiline is set] */
2232 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2233 if (s == startpos && regtry(&reginfo, &startpos))
2234 goto got_it;
2235 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2236 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2237 {
2238 char *end;
2239
2240 if (minlen)
2241 dontbother = minlen - 1;
2242 end = HOP3c(strend, -dontbother, strbeg) - 1;
2243 /* for multiline we only have to try after newlines */
2244 if (prog->check_substr || prog->check_utf8) {
2245 /* because of the goto we can not easily reuse the macros for bifurcating the
2246 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2247 if (utf8_target) {
2248 if (s == startpos)
2249 goto after_try_utf8;
2250 while (1) {
2251 if (regtry(&reginfo, &s)) {
2252 goto got_it;
2253 }
2254 after_try_utf8:
2255 if (s > end) {
2256 goto phooey;
2257 }
2258 if (prog->extflags & RXf_USE_INTUIT) {
2259 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2260 if (!s) {
2261 goto phooey;
2262 }
2263 }
2264 else {
2265 s += UTF8SKIP(s);
2266 }
2267 }
2268 } /* end search for check string in unicode */
2269 else {
2270 if (s == startpos) {
2271 goto after_try_latin;
2272 }
2273 while (1) {
2274 if (regtry(&reginfo, &s)) {
2275 goto got_it;
2276 }
2277 after_try_latin:
2278 if (s > end) {
2279 goto phooey;
2280 }
2281 if (prog->extflags & RXf_USE_INTUIT) {
2282 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2283 if (!s) {
2284 goto phooey;
2285 }
2286 }
2287 else {
2288 s++;
2289 }
2290 }
2291 } /* end search for check string in latin*/
2292 } /* end search for check string */
2293 else { /* search for newline */
2294 if (s > startpos) {
2295 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2296 s--;
2297 }
2298 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2299 while (s <= end) { /* note it could be possible to match at the end of the string */
2300 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2301 if (regtry(&reginfo, &s))
2302 goto got_it;
2303 }
2304 }
2305 } /* end search for newline */
2306 } /* end anchored/multiline check string search */
2307 goto phooey;
2308 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2309 {
2310 /* the warning about reginfo.ganch being used without initialization
2311 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2312 and we only enter this block when the same bit is set. */
2313 char *tmp_s = reginfo.ganch - prog->gofs;
2314
2315 if (tmp_s >= strbeg && regtry(&reginfo, &tmp_s))
2316 goto got_it;
2317 goto phooey;
2318 }
2319
2320 /* Messy cases: unanchored match. */
2321 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2322 /* we have /x+whatever/ */
2323 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2324 char ch;
2325#ifdef DEBUGGING
2326 int did_match = 0;
2327#endif
2328 if (utf8_target) {
2329 if (! prog->anchored_utf8) {
2330 to_utf8_substr(prog);
2331 }
2332 ch = SvPVX_const(prog->anchored_utf8)[0];
2333 REXEC_FBC_SCAN(
2334 if (*s == ch) {
2335 DEBUG_EXECUTE_r( did_match = 1 );
2336 if (regtry(&reginfo, &s)) goto got_it;
2337 s += UTF8SKIP(s);
2338 while (s < strend && *s == ch)
2339 s += UTF8SKIP(s);
2340 }
2341 );
2342
2343 }
2344 else {
2345 if (! prog->anchored_substr) {
2346 if (! to_byte_substr(prog)) {
2347 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2348 }
2349 }
2350 ch = SvPVX_const(prog->anchored_substr)[0];
2351 REXEC_FBC_SCAN(
2352 if (*s == ch) {
2353 DEBUG_EXECUTE_r( did_match = 1 );
2354 if (regtry(&reginfo, &s)) goto got_it;
2355 s++;
2356 while (s < strend && *s == ch)
2357 s++;
2358 }
2359 );
2360 }
2361 DEBUG_EXECUTE_r(if (!did_match)
2362 PerlIO_printf(Perl_debug_log,
2363 "Did not find anchored character...\n")
2364 );
2365 }
2366 else if (prog->anchored_substr != NULL
2367 || prog->anchored_utf8 != NULL
2368 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2369 && prog->float_max_offset < strend - s)) {
2370 SV *must;
2371 I32 back_max;
2372 I32 back_min;
2373 char *last;
2374 char *last1; /* Last position checked before */
2375#ifdef DEBUGGING
2376 int did_match = 0;
2377#endif
2378 if (prog->anchored_substr || prog->anchored_utf8) {
2379 if (utf8_target) {
2380 if (! prog->anchored_utf8) {
2381 to_utf8_substr(prog);
2382 }
2383 must = prog->anchored_utf8;
2384 }
2385 else {
2386 if (! prog->anchored_substr) {
2387 if (! to_byte_substr(prog)) {
2388 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2389 }
2390 }
2391 must = prog->anchored_substr;
2392 }
2393 back_max = back_min = prog->anchored_offset;
2394 } else {
2395 if (utf8_target) {
2396 if (! prog->float_utf8) {
2397 to_utf8_substr(prog);
2398 }
2399 must = prog->float_utf8;
2400 }
2401 else {
2402 if (! prog->float_substr) {
2403 if (! to_byte_substr(prog)) {
2404 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2405 }
2406 }
2407 must = prog->float_substr;
2408 }
2409 back_max = prog->float_max_offset;
2410 back_min = prog->float_min_offset;
2411 }
2412
2413 if (back_min<0) {
2414 last = strend;
2415 } else {
2416 last = HOP3c(strend, /* Cannot start after this */
2417 -(I32)(CHR_SVLEN(must)
2418 - (SvTAIL(must) != 0) + back_min), strbeg);
2419 }
2420 if (s > PL_bostr)
2421 last1 = HOPc(s, -1);
2422 else
2423 last1 = s - 1; /* bogus */
2424
2425 /* XXXX check_substr already used to find "s", can optimize if
2426 check_substr==must. */
2427 scream_pos = -1;
2428 dontbother = end_shift;
2429 strend = HOPc(strend, -dontbother);
2430 while ( (s <= last) &&
2431 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2432 (unsigned char*)strend, must,
2433 multiline ? FBMrf_MULTILINE : 0)) ) {
2434 DEBUG_EXECUTE_r( did_match = 1 );
2435 if (HOPc(s, -back_max) > last1) {
2436 last1 = HOPc(s, -back_min);
2437 s = HOPc(s, -back_max);
2438 }
2439 else {
2440 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2441
2442 last1 = HOPc(s, -back_min);
2443 s = t;
2444 }
2445 if (utf8_target) {
2446 while (s <= last1) {
2447 if (regtry(&reginfo, &s))
2448 goto got_it;
2449 if (s >= last1) {
2450 s++; /* to break out of outer loop */
2451 break;
2452 }
2453 s += UTF8SKIP(s);
2454 }
2455 }
2456 else {
2457 while (s <= last1) {
2458 if (regtry(&reginfo, &s))
2459 goto got_it;
2460 s++;
2461 }
2462 }
2463 }
2464 DEBUG_EXECUTE_r(if (!did_match) {
2465 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2466 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2467 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2468 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2469 ? "anchored" : "floating"),
2470 quoted, RE_SV_TAIL(must));
2471 });
2472 goto phooey;
2473 }
2474 else if ( (c = progi->regstclass) ) {
2475 if (minlen) {
2476 const OPCODE op = OP(progi->regstclass);
2477 /* don't bother with what can't match */
2478 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2479 strend = HOPc(strend, -(minlen - 1));
2480 }
2481 DEBUG_EXECUTE_r({
2482 SV * const prop = sv_newmortal();
2483 regprop(prog, prop, c);
2484 {
2485 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2486 s,strend-s,60);
2487 PerlIO_printf(Perl_debug_log,
2488 "Matching stclass %.*s against %s (%d bytes)\n",
2489 (int)SvCUR(prop), SvPVX_const(prop),
2490 quoted, (int)(strend - s));
2491 }
2492 });
2493 if (find_byclass(prog, c, s, strend, &reginfo))
2494 goto got_it;
2495 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2496 }
2497 else {
2498 dontbother = 0;
2499 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2500 /* Trim the end. */
2501 char *last= NULL;
2502 SV* float_real;
2503 STRLEN len;
2504 const char *little;
2505
2506 if (utf8_target) {
2507 if (! prog->float_utf8) {
2508 to_utf8_substr(prog);
2509 }
2510 float_real = prog->float_utf8;
2511 }
2512 else {
2513 if (! prog->float_substr) {
2514 if (! to_byte_substr(prog)) {
2515 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2516 }
2517 }
2518 float_real = prog->float_substr;
2519 }
2520
2521 little = SvPV_const(float_real, len);
2522 if (SvTAIL(float_real)) {
2523 /* This means that float_real contains an artificial \n on
2524 * the end due to the presence of something like this:
2525 * /foo$/ where we can match both "foo" and "foo\n" at the
2526 * end of the string. So we have to compare the end of the
2527 * string first against the float_real without the \n and
2528 * then against the full float_real with the string. We
2529 * have to watch out for cases where the string might be
2530 * smaller than the float_real or the float_real without
2531 * the \n. */
2532 char *checkpos= strend - len;
2533 DEBUG_OPTIMISE_r(
2534 PerlIO_printf(Perl_debug_log,
2535 "%sChecking for float_real.%s\n",
2536 PL_colors[4], PL_colors[5]));
2537 if (checkpos + 1 < strbeg) {
2538 /* can't match, even if we remove the trailing \n
2539 * string is too short to match */
2540 DEBUG_EXECUTE_r(
2541 PerlIO_printf(Perl_debug_log,
2542 "%sString shorter than required trailing substring, cannot match.%s\n",
2543 PL_colors[4], PL_colors[5]));
2544 goto phooey;
2545 } else if (memEQ(checkpos + 1, little, len - 1)) {
2546 /* can match, the end of the string matches without the
2547 * "\n" */
2548 last = checkpos + 1;
2549 } else if (checkpos < strbeg) {
2550 /* cant match, string is too short when the "\n" is
2551 * included */
2552 DEBUG_EXECUTE_r(
2553 PerlIO_printf(Perl_debug_log,
2554 "%sString does not contain required trailing substring, cannot match.%s\n",
2555 PL_colors[4], PL_colors[5]));
2556 goto phooey;
2557 } else if (!multiline) {
2558 /* non multiline match, so compare with the "\n" at the
2559 * end of the string */
2560 if (memEQ(checkpos, little, len)) {
2561 last= checkpos;
2562 } else {
2563 DEBUG_EXECUTE_r(
2564 PerlIO_printf(Perl_debug_log,
2565 "%sString does not contain required trailing substring, cannot match.%s\n",
2566 PL_colors[4], PL_colors[5]));
2567 goto phooey;
2568 }
2569 } else {
2570 /* multiline match, so we have to search for a place
2571 * where the full string is located */
2572 goto find_last;
2573 }
2574 } else {
2575 find_last:
2576 if (len)
2577 last = rninstr(s, strend, little, little + len);
2578 else
2579 last = strend; /* matching "$" */
2580 }
2581 if (!last) {
2582 /* at one point this block contained a comment which was
2583 * probably incorrect, which said that this was a "should not
2584 * happen" case. Even if it was true when it was written I am
2585 * pretty sure it is not anymore, so I have removed the comment
2586 * and replaced it with this one. Yves */
2587 DEBUG_EXECUTE_r(
2588 PerlIO_printf(Perl_debug_log,
2589 "String does not contain required substring, cannot match.\n"
2590 ));
2591 goto phooey;
2592 }
2593 dontbother = strend - last + prog->float_min_offset;
2594 }
2595 if (minlen && (dontbother < minlen))
2596 dontbother = minlen - 1;
2597 strend -= dontbother; /* this one's always in bytes! */
2598 /* We don't know much -- general case. */
2599 if (utf8_target) {
2600 for (;;) {
2601 if (regtry(&reginfo, &s))
2602 goto got_it;
2603 if (s >= strend)
2604 break;
2605 s += UTF8SKIP(s);
2606 };
2607 }
2608 else {
2609 do {
2610 if (regtry(&reginfo, &s))
2611 goto got_it;
2612 } while (s++ < strend);
2613 }
2614 }
2615
2616 /* Failure. */
2617 goto phooey;
2618
2619got_it:
2620 DEBUG_BUFFERS_r(
2621 if (swap)
2622 PerlIO_printf(Perl_debug_log,
2623 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2624 PTR2UV(prog),
2625 PTR2UV(swap)
2626 );
2627 );
2628 Safefree(swap);
2629 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2630
2631 if (PL_reg_state.re_state_eval_setup_done)
2632 restore_pos(aTHX_ prog);
2633 if (RXp_PAREN_NAMES(prog))
2634 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2635
2636 /* make sure $`, $&, $', and $digit will work later */
2637 if ( !(flags & REXEC_NOT_FIRST) ) {
2638 if (flags & REXEC_COPY_STR) {
2639#ifdef PERL_OLD_COPY_ON_WRITE
2640 if ((SvIsCOW(sv)
2641 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2642 if (DEBUG_C_TEST) {
2643 PerlIO_printf(Perl_debug_log,
2644 "Copy on write: regexp capture, type %d\n",
2645 (int) SvTYPE(sv));
2646 }
2647 RX_MATCH_COPY_FREE(rx);
2648 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2649 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2650 assert (SvPOKp(prog->saved_copy));
2651 prog->sublen = PL_regeol - strbeg;
2652 prog->suboffset = 0;
2653 prog->subcoffset = 0;
2654 } else
2655#endif
2656 {
2657 I32 min = 0;
2658 I32 max = PL_regeol - strbeg;
2659 I32 sublen;
2660
2661 if ( (flags & REXEC_COPY_SKIP_POST)
2662 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2663 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2664 ) { /* don't copy $' part of string */
2665 U32 n = 0;
2666 max = -1;
2667 /* calculate the right-most part of the string covered
2668 * by a capture. Due to look-ahead, this may be to
2669 * the right of $&, so we have to scan all captures */
2670 while (n <= prog->lastparen) {
2671 if (prog->offs[n].end > max)
2672 max = prog->offs[n].end;
2673 n++;
2674 }
2675 if (max == -1)
2676 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2677 ? prog->offs[0].start
2678 : 0;
2679 assert(max >= 0 && max <= PL_regeol - strbeg);
2680 }
2681
2682 if ( (flags & REXEC_COPY_SKIP_PRE)
2683 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2684 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2685 ) { /* don't copy $` part of string */
2686 U32 n = 0;
2687 min = max;
2688 /* calculate the left-most part of the string covered
2689 * by a capture. Due to look-behind, this may be to
2690 * the left of $&, so we have to scan all captures */
2691 while (min && n <= prog->lastparen) {
2692 if ( prog->offs[n].start != -1
2693 && prog->offs[n].start < min)
2694 {
2695 min = prog->offs[n].start;
2696 }
2697 n++;
2698 }
2699 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2700 && min > prog->offs[0].end
2701 )
2702 min = prog->offs[0].end;
2703
2704 }
2705
2706 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2707 sublen = max - min;
2708
2709 if (RX_MATCH_COPIED(rx)) {
2710 if (sublen > prog->sublen)
2711 prog->subbeg =
2712 (char*)saferealloc(prog->subbeg, sublen+1);
2713 }
2714 else
2715 prog->subbeg = (char*)safemalloc(sublen+1);
2716 Copy(strbeg + min, prog->subbeg, sublen, char);
2717 prog->subbeg[sublen] = '\0';
2718 prog->suboffset = min;
2719 prog->sublen = sublen;
2720 RX_MATCH_COPIED_on(rx);
2721 }
2722 prog->subcoffset = prog->suboffset;
2723 if (prog->suboffset && utf8_target) {
2724 /* Convert byte offset to chars.
2725 * XXX ideally should only compute this if @-/@+
2726 * has been seen, a la PL_sawampersand ??? */
2727
2728 /* If there's a direct correspondence between the
2729 * string which we're matching and the original SV,
2730 * then we can use the utf8 len cache associated with
2731 * the SV. In particular, it means that under //g,
2732 * sv_pos_b2u() will use the previously cached
2733 * position to speed up working out the new length of
2734 * subcoffset, rather than counting from the start of
2735 * the string each time. This stops
2736 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2737 * from going quadratic */
2738 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2739 sv_pos_b2u(sv, &(prog->subcoffset));
2740 else
2741 prog->subcoffset = utf8_length((U8*)strbeg,
2742 (U8*)(strbeg+prog->suboffset));
2743 }
2744 }
2745 else {
2746 RX_MATCH_COPY_FREE(rx);
2747 prog->subbeg = strbeg;
2748 prog->suboffset = 0;
2749 prog->subcoffset = 0;
2750 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2751 }
2752 }
2753
2754 return 1;
2755
2756phooey:
2757 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2758 PL_colors[4], PL_colors[5]));
2759 if (PL_reg_state.re_state_eval_setup_done)
2760 restore_pos(aTHX_ prog);
2761 if (swap) {
2762 /* we failed :-( roll it back */
2763 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2764 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2765 PTR2UV(prog),
2766 PTR2UV(prog->offs),
2767 PTR2UV(swap)
2768 ));
2769 Safefree(prog->offs);
2770 prog->offs = swap;
2771 }
2772 return 0;
2773}
2774
2775
2776/* Set which rex is pointed to by PL_reg_state, handling ref counting.
2777 * Do inc before dec, in case old and new rex are the same */
2778#define SET_reg_curpm(Re2) \
2779 if (PL_reg_state.re_state_eval_setup_done) { \
2780 (void)ReREFCNT_inc(Re2); \
2781 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2782 PM_SETRE((PL_reg_curpm), (Re2)); \
2783 }
2784
2785
2786/*
2787 - regtry - try match at specific point
2788 */
2789STATIC I32 /* 0 failure, 1 success */
2790S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2791{
2792 dVAR;
2793 CHECKPOINT lastcp;
2794 REGEXP *const rx = reginfo->prog;
2795 regexp *const prog = ReANY(rx);
2796 I32 result;
2797 RXi_GET_DECL(prog,progi);
2798 GET_RE_DEBUG_FLAGS_DECL;
2799
2800 PERL_ARGS_ASSERT_REGTRY;
2801
2802 reginfo->cutpoint=NULL;
2803
2804 if ((prog->extflags & RXf_EVAL_SEEN)
2805 && !PL_reg_state.re_state_eval_setup_done)
2806 {
2807 MAGIC *mg;
2808
2809 PL_reg_state.re_state_eval_setup_done = TRUE;
2810 if (reginfo->sv) {
2811 /* Make $_ available to executed code. */
2812 if (reginfo->sv != DEFSV) {
2813 SAVE_DEFSV;
2814 DEFSV_set(reginfo->sv);
2815 }
2816
2817 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2818 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2819 /* prepare for quick setting of pos */
2820#ifdef PERL_OLD_COPY_ON_WRITE
2821 if (SvIsCOW(reginfo->sv))
2822 sv_force_normal_flags(reginfo->sv, 0);
2823#endif
2824 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2825 &PL_vtbl_mglob, NULL, 0);
2826 mg->mg_len = -1;
2827 }
2828 PL_reg_magic = mg;
2829 PL_reg_oldpos = mg->mg_len;
2830 SAVEDESTRUCTOR_X(restore_pos, prog);
2831 }
2832 if (!PL_reg_curpm) {
2833 Newxz(PL_reg_curpm, 1, PMOP);
2834#ifdef USE_ITHREADS
2835 {
2836 SV* const repointer = &PL_sv_undef;
2837 /* this regexp is also owned by the new PL_reg_curpm, which
2838 will try to free it. */
2839 av_push(PL_regex_padav, repointer);
2840 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2841 PL_regex_pad = AvARRAY(PL_regex_padav);
2842 }
2843#endif
2844 }
2845 SET_reg_curpm(rx);
2846 PL_reg_oldcurpm = PL_curpm;
2847 PL_curpm = PL_reg_curpm;
2848 if (RXp_MATCH_COPIED(prog)) {
2849 /* Here is a serious problem: we cannot rewrite subbeg,
2850 since it may be needed if this match fails. Thus
2851 $` inside (?{}) could fail... */
2852 PL_reg_oldsaved = prog->subbeg;
2853 PL_reg_oldsavedlen = prog->sublen;
2854 PL_reg_oldsavedoffset = prog->suboffset;
2855 PL_reg_oldsavedcoffset = prog->suboffset;
2856#ifdef PERL_OLD_COPY_ON_WRITE
2857 PL_nrs = prog->saved_copy;
2858#endif
2859 RXp_MATCH_COPIED_off(prog);
2860 }
2861 else
2862 PL_reg_oldsaved = NULL;
2863 prog->subbeg = PL_bostr;
2864 prog->suboffset = 0;
2865 prog->subcoffset = 0;
2866 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2867 }
2868#ifdef DEBUGGING
2869 PL_reg_starttry = *startposp;
2870#endif
2871 prog->offs[0].start = *startposp - PL_bostr;
2872 prog->lastparen = 0;
2873 prog->lastcloseparen = 0;
2874 PL_regsize = 0;
2875
2876 /* XXXX What this code is doing here?!!! There should be no need
2877 to do this again and again, prog->lastparen should take care of
2878 this! --ilya*/
2879
2880 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2881 * Actually, the code in regcppop() (which Ilya may be meaning by
2882 * prog->lastparen), is not needed at all by the test suite
2883 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2884 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2885 * Meanwhile, this code *is* needed for the
2886 * above-mentioned test suite tests to succeed. The common theme
2887 * on those tests seems to be returning null fields from matches.
2888 * --jhi updated by dapm */
2889#if 1
2890 if (prog->nparens) {
2891 regexp_paren_pair *pp = prog->offs;
2892 I32 i;
2893 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2894 ++pp;
2895 pp->start = -1;
2896 pp->end = -1;
2897 }
2898 }
2899#endif
2900 REGCP_SET(lastcp);
2901 result = regmatch(reginfo, *startposp, progi->program + 1);
2902 if (result != -1) {
2903 prog->offs[0].end = result;
2904 return 1;
2905 }
2906 if (reginfo->cutpoint)
2907 *startposp= reginfo->cutpoint;
2908 REGCP_UNWIND(lastcp);
2909 return 0;
2910}
2911
2912
2913#define sayYES goto yes
2914#define sayNO goto no
2915#define sayNO_SILENT goto no_silent
2916
2917/* we dont use STMT_START/END here because it leads to
2918 "unreachable code" warnings, which are bogus, but distracting. */
2919#define CACHEsayNO \
2920 if (ST.cache_mask) \
2921 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2922 sayNO
2923
2924/* this is used to determine how far from the left messages like
2925 'failed...' are printed. It should be set such that messages
2926 are inline with the regop output that created them.
2927*/
2928#define REPORT_CODE_OFF 32
2929
2930
2931#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2932#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2933#define CHRTEST_NOT_A_CP_1 -999
2934#define CHRTEST_NOT_A_CP_2 -998
2935
2936#define SLAB_FIRST(s) (&(s)->states[0])
2937#define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2938
2939/* grab a new slab and return the first slot in it */
2940
2941STATIC regmatch_state *
2942S_push_slab(pTHX)
2943{
2944#if PERL_VERSION < 9 && !defined(PERL_CORE)
2945 dMY_CXT;
2946#endif
2947 regmatch_slab *s = PL_regmatch_slab->next;
2948 if (!s) {
2949 Newx(s, 1, regmatch_slab);
2950 s->prev = PL_regmatch_slab;
2951 s->next = NULL;
2952 PL_regmatch_slab->next = s;
2953 }
2954 PL_regmatch_slab = s;
2955 return SLAB_FIRST(s);
2956}
2957
2958
2959/* push a new state then goto it */
2960
2961#define PUSH_STATE_GOTO(state, node, input) \
2962 pushinput = input; \
2963 scan = node; \
2964 st->resume_state = state; \
2965 goto push_state;
2966
2967/* push a new state with success backtracking, then goto it */
2968
2969#define PUSH_YES_STATE_GOTO(state, node, input) \
2970 pushinput = input; \
2971 scan = node; \
2972 st->resume_state = state; \
2973 goto push_yes_state;
2974
2975
2976
2977
2978/*
2979
2980regmatch() - main matching routine
2981
2982This is basically one big switch statement in a loop. We execute an op,
2983set 'next' to point the next op, and continue. If we come to a point which
2984we may need to backtrack to on failure such as (A|B|C), we push a
2985backtrack state onto the backtrack stack. On failure, we pop the top
2986state, and re-enter the loop at the state indicated. If there are no more
2987states to pop, we return failure.
2988
2989Sometimes we also need to backtrack on success; for example /A+/, where
2990after successfully matching one A, we need to go back and try to
2991match another one; similarly for lookahead assertions: if the assertion
2992completes successfully, we backtrack to the state just before the assertion
2993and then carry on. In these cases, the pushed state is marked as
2994'backtrack on success too'. This marking is in fact done by a chain of
2995pointers, each pointing to the previous 'yes' state. On success, we pop to
2996the nearest yes state, discarding any intermediate failure-only states.
2997Sometimes a yes state is pushed just to force some cleanup code to be
2998called at the end of a successful match or submatch; e.g. (??{$re}) uses
2999it to free the inner regex.
3000
3001Note that failure backtracking rewinds the cursor position, while
3002success backtracking leaves it alone.
3003
3004A pattern is complete when the END op is executed, while a subpattern
3005such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3006ops trigger the "pop to last yes state if any, otherwise return true"
3007behaviour.
3008
3009A common convention in this function is to use A and B to refer to the two
3010subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3011the subpattern to be matched possibly multiple times, while B is the entire
3012rest of the pattern. Variable and state names reflect this convention.
3013
3014The states in the main switch are the union of ops and failure/success of
3015substates associated with with that op. For example, IFMATCH is the op
3016that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3017'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3018successfully matched A and IFMATCH_A_fail is a state saying that we have
3019just failed to match A. Resume states always come in pairs. The backtrack
3020state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3021at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3022on success or failure.
3023
3024The struct that holds a backtracking state is actually a big union, with
3025one variant for each major type of op. The variable st points to the
3026top-most backtrack struct. To make the code clearer, within each
3027block of code we #define ST to alias the relevant union.
3028
3029Here's a concrete example of a (vastly oversimplified) IFMATCH
3030implementation:
3031
3032 switch (state) {
3033 ....
3034
3035#define ST st->u.ifmatch
3036
3037 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3038 ST.foo = ...; // some state we wish to save
3039 ...
3040 // push a yes backtrack state with a resume value of
3041 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3042 // first node of A:
3043 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3044 // NOTREACHED
3045
3046 case IFMATCH_A: // we have successfully executed A; now continue with B
3047 next = B;
3048 bar = ST.foo; // do something with the preserved value
3049 break;
3050
3051 case IFMATCH_A_fail: // A failed, so the assertion failed
3052 ...; // do some housekeeping, then ...
3053 sayNO; // propagate the failure
3054
3055#undef ST
3056
3057 ...
3058 }
3059
3060For any old-timers reading this who are familiar with the old recursive
3061approach, the code above is equivalent to:
3062
3063 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3064 {
3065 int foo = ...
3066 ...
3067 if (regmatch(A)) {
3068 next = B;
3069 bar = foo;
3070 break;
3071 }
3072 ...; // do some housekeeping, then ...
3073 sayNO; // propagate the failure
3074 }
3075
3076The topmost backtrack state, pointed to by st, is usually free. If you
3077want to claim it, populate any ST.foo fields in it with values you wish to
3078save, then do one of
3079
3080 PUSH_STATE_GOTO(resume_state, node, newinput);
3081 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3082
3083which sets that backtrack state's resume value to 'resume_state', pushes a
3084new free entry to the top of the backtrack stack, then goes to 'node'.
3085On backtracking, the free slot is popped, and the saved state becomes the
3086new free state. An ST.foo field in this new top state can be temporarily
3087accessed to retrieve values, but once the main loop is re-entered, it
3088becomes available for reuse.
3089
3090Note that the depth of the backtrack stack constantly increases during the
3091left-to-right execution of the pattern, rather than going up and down with
3092the pattern nesting. For example the stack is at its maximum at Z at the
3093end of the pattern, rather than at X in the following:
3094
3095 /(((X)+)+)+....(Y)+....Z/
3096
3097The only exceptions to this are lookahead/behind assertions and the cut,
3098(?>A), which pop all the backtrack states associated with A before
3099continuing.
3100
3101Backtrack state structs are allocated in slabs of about 4K in size.
3102PL_regmatch_state and st always point to the currently active state,
3103and PL_regmatch_slab points to the slab currently containing
3104PL_regmatch_state. The first time regmatch() is called, the first slab is
3105allocated, and is never freed until interpreter destruction. When the slab
3106is full, a new one is allocated and chained to the end. At exit from
3107regmatch(), slabs allocated since entry are freed.
3108
3109*/
3110
3111
3112#define DEBUG_STATE_pp(pp) \
3113 DEBUG_STATE_r({ \
3114 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3115 PerlIO_printf(Perl_debug_log, \
3116 " %*s"pp" %s%s%s%s%s\n", \
3117 depth*2, "", \
3118 PL_reg_name[st->resume_state], \
3119 ((st==yes_state||st==mark_state) ? "[" : ""), \
3120 ((st==yes_state) ? "Y" : ""), \
3121 ((st==mark_state) ? "M" : ""), \
3122 ((st==yes_state||st==mark_state) ? "]" : "") \
3123 ); \
3124 });
3125
3126
3127#define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3128
3129#ifdef DEBUGGING
3130
3131STATIC void
3132S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3133 const char *start, const char *end, const char *blurb)
3134{
3135 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3136
3137 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3138
3139 if (!PL_colorset)
3140 reginitcolors();
3141 {
3142 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3143 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3144
3145 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3146 start, end - start, 60);
3147
3148 PerlIO_printf(Perl_debug_log,
3149 "%s%s REx%s %s against %s\n",
3150 PL_colors[4], blurb, PL_colors[5], s0, s1);
3151
3152 if (utf8_target||utf8_pat)
3153 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3154 utf8_pat ? "pattern" : "",
3155 utf8_pat && utf8_target ? " and " : "",
3156 utf8_target ? "string" : ""
3157 );
3158 }
3159}
3160
3161STATIC void
3162S_dump_exec_pos(pTHX_ const char *locinput,
3163 const regnode *scan,
3164 const char *loc_regeol,
3165 const char *loc_bostr,
3166 const char *loc_reg_starttry,
3167 const bool utf8_target)
3168{
3169 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3170 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3171 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3172 /* The part of the string before starttry has one color
3173 (pref0_len chars), between starttry and current
3174 position another one (pref_len - pref0_len chars),
3175 after the current position the third one.
3176 We assume that pref0_len <= pref_len, otherwise we
3177 decrease pref0_len. */
3178 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3179 ? (5 + taill) - l : locinput - loc_bostr;
3180 int pref0_len;
3181
3182 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3183
3184 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3185 pref_len++;
3186 pref0_len = pref_len - (locinput - loc_reg_starttry);
3187 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3188 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3189 ? (5 + taill) - pref_len : loc_regeol - locinput);
3190 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3191 l--;
3192 if (pref0_len < 0)
3193 pref0_len = 0;
3194 if (pref0_len > pref_len)
3195 pref0_len = pref_len;
3196 {
3197 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3198
3199 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3200 (locinput - pref_len),pref0_len, 60, 4, 5);
3201
3202 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3203 (locinput - pref_len + pref0_len),
3204 pref_len - pref0_len, 60, 2, 3);
3205
3206 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3207 locinput, loc_regeol - locinput, 10, 0, 1);
3208
3209 const STRLEN tlen=len0+len1+len2;
3210 PerlIO_printf(Perl_debug_log,
3211 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3212 (IV)(locinput - loc_bostr),
3213 len0, s0,
3214 len1, s1,
3215 (docolor ? "" : "> <"),
3216 len2, s2,
3217 (int)(tlen > 19 ? 0 : 19 - tlen),
3218 "");
3219 }
3220}
3221
3222#endif
3223
3224/* reg_check_named_buff_matched()
3225 * Checks to see if a named buffer has matched. The data array of
3226 * buffer numbers corresponding to the buffer is expected to reside
3227 * in the regexp->data->data array in the slot stored in the ARG() of
3228 * node involved. Note that this routine doesn't actually care about the
3229 * name, that information is not preserved from compilation to execution.
3230 * Returns the index of the leftmost defined buffer with the given name
3231 * or 0 if non of the buffers matched.
3232 */
3233STATIC I32
3234S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3235{
3236 I32 n;
3237 RXi_GET_DECL(rex,rexi);
3238 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3239 I32 *nums=(I32*)SvPVX(sv_dat);
3240
3241 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3242
3243 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3244 if ((I32)rex->lastparen >= nums[n] &&
3245 rex->offs[nums[n]].end != -1)
3246 {
3247 return nums[n];
3248 }
3249 }
3250 return 0;
3251}
3252
3253
3254/* free all slabs above current one - called during LEAVE_SCOPE */
3255
3256STATIC void
3257S_clear_backtrack_stack(pTHX_ void *p)
3258{
3259 regmatch_slab *s = PL_regmatch_slab->next;
3260 PERL_UNUSED_ARG(p);
3261
3262 if (!s)
3263 return;
3264 PL_regmatch_slab->next = NULL;
3265 while (s) {
3266 regmatch_slab * const osl = s;
3267 s = s->next;
3268 Safefree(osl);
3269 }
3270}
3271static bool
3272S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3273{
3274 /* This function determines if there are one or two characters that match
3275 * the first character of the passed-in EXACTish node <text_node>, and if
3276 * so, returns them in the passed-in pointers.
3277 *
3278 * If it determines that no possible character in the target string can
3279 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3280 * the first character in <text_node> requires UTF-8 to represent, and the
3281 * target string isn't in UTF-8.)
3282 *
3283 * If there are more than two characters that could match the beginning of
3284 * <text_node>, or if more context is required to determine a match or not,
3285 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3286 *
3287 * The motiviation behind this function is to allow the caller to set up
3288 * tight loops for matching. If <text_node> is of type EXACT, there is
3289 * only one possible character that can match its first character, and so
3290 * the situation is quite simple. But things get much more complicated if
3291 * folding is involved. It may be that the first character of an EXACTFish
3292 * node doesn't participate in any possible fold, e.g., punctuation, so it
3293 * can be matched only by itself. The vast majority of characters that are
3294 * in folds match just two things, their lower and upper-case equivalents.
3295 * But not all are like that; some have multiple possible matches, or match
3296 * sequences of more than one character. This function sorts all that out.
3297 *
3298 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3299 * loop of trying to match A*, we know we can't exit where the thing
3300 * following it isn't a B. And something can't be a B unless it is the
3301 * beginning of B. By putting a quick test for that beginning in a tight
3302 * loop, we can rule out things that can't possibly be B without having to
3303 * break out of the loop, thus avoiding work. Similarly, if A is a single
3304 * character, we can make a tight loop matching A*, using the outputs of
3305 * this function.
3306 *
3307 * If the target string to match isn't in UTF-8, and there aren't
3308 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3309 * the one or two possible octets (which are characters in this situation)
3310 * that can match. In all cases, if there is only one character that can
3311 * match, *<c1p> and *<c2p> will be identical.
3312 *
3313 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3314 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3315 * can match the beginning of <text_node>. They should be declared with at
3316 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3317 * undefined what these contain.) If one or both of the buffers are
3318 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3319 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3320 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3321 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3322 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3323
3324 const bool utf8_target = PL_reg_match_utf8;
3325
3326 UV c1 = CHRTEST_NOT_A_CP_1;
3327 UV c2 = CHRTEST_NOT_A_CP_2;
3328 bool use_chrtest_void = FALSE;
3329
3330 /* Used when we have both utf8 input and utf8 output, to avoid converting
3331 * to/from code points */
3332 bool utf8_has_been_setup = FALSE;
3333
3334 dVAR;
3335
3336 U8 *pat = (U8*)STRING(text_node);
3337
3338 if (OP(text_node) == EXACT) {
3339
3340 /* In an exact node, only one thing can be matched, that first
3341 * character. If both the pat and the target are UTF-8, we can just
3342 * copy the input to the output, avoiding finding the code point of
3343 * that character */
3344 if (! UTF_PATTERN) {
3345 c2 = c1 = *pat;
3346 }
3347 else if (utf8_target) {
3348 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3349 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3350 utf8_has_been_setup = TRUE;
3351 }
3352 else {
3353 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3354 }
3355 }
3356 else /* an EXACTFish node */
3357 if ((UTF_PATTERN
3358 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3359 pat + STR_LEN(text_node)))
3360 || (! UTF_PATTERN
3361 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3362 pat + STR_LEN(text_node))))
3363 {
3364 /* Multi-character folds require more context to sort out. Also
3365 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3366 * handled outside this routine */
3367 use_chrtest_void = TRUE;
3368 }
3369 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3370 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3371 if (c1 > 256) {
3372 /* Load the folds hash, if not already done */
3373 SV** listp;
3374 if (! PL_utf8_foldclosures) {
3375 if (! PL_utf8_tofold) {
3376 U8 dummy[UTF8_MAXBYTES+1];
3377
3378 /* Force loading this by folding an above-Latin1 char */
3379 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3380 assert(PL_utf8_tofold); /* Verify that worked */
3381 }
3382 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3383 }
3384
3385 /* The fold closures data structure is a hash with the keys being
3386 * the UTF-8 of every character that is folded to, like 'k', and
3387 * the values each an array of all code points that fold to its
3388 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3389 * not included */
3390 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3391 (char *) pat,
3392 UTF8SKIP(pat),
3393 FALSE))))
3394 {
3395 /* Not found in the hash, therefore there are no folds
3396 * containing it, so there is only a single character that
3397 * could match */
3398 c2 = c1;
3399 }
3400 else { /* Does participate in folds */
3401 AV* list = (AV*) *listp;
3402 if (av_len(list) != 1) {
3403
3404 /* If there aren't exactly two folds to this, it is outside
3405 * the scope of this function */
3406 use_chrtest_void = TRUE;
3407 }
3408 else { /* There are two. Get them */
3409 SV** c_p = av_fetch(list, 0, FALSE);
3410 if (c_p == NULL) {
3411 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3412 }
3413 c1 = SvUV(*c_p);
3414
3415 c_p = av_fetch(list, 1, FALSE);
3416 if (c_p == NULL) {
3417 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3418 }
3419 c2 = SvUV(*c_p);
3420
3421 /* Folds that cross the 255/256 boundary are forbidden if
3422 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3423 * pattern character is above 256, and its only other match
3424 * is below 256, the only legal match will be to itself.
3425 * We have thrown away the original, so have to compute
3426 * which is the one above 255 */
3427 if ((c1 < 256) != (c2 < 256)) {
3428 if (OP(text_node) == EXACTFL
3429 || (OP(text_node) == EXACTFA
3430 && (isASCII(c1) || isASCII(c2))))
3431 {
3432 if (c1 < 256) {
3433 c1 = c2;
3434 }
3435 else {
3436 c2 = c1;
3437 }
3438 }
3439 }
3440 }
3441 }
3442 }
3443 else /* Here, c1 is < 255 */
3444 if (utf8_target
3445 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3446 && OP(text_node) != EXACTFL
3447 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3448 {
3449 /* Here, there could be something above Latin1 in the target which
3450 * folds to this character in the pattern. All such cases except
3451 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3452 * involved in their folds, so are outside the scope of this
3453 * function */
3454 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3455 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3456 }
3457 else {
3458 use_chrtest_void = TRUE;
3459 }
3460 }
3461 else { /* Here nothing above Latin1 can fold to the pattern character */
3462 switch (OP(text_node)) {
3463
3464 case EXACTFL: /* /l rules */
3465 c2 = PL_fold_locale[c1];
3466 break;
3467
3468 case EXACTF:
3469 if (! utf8_target) { /* /d rules */
3470 c2 = PL_fold[c1];
3471 break;
3472 }
3473 /* FALLTHROUGH */
3474 /* /u rules for all these. This happens to work for
3475 * EXACTFA as nothing in Latin1 folds to ASCII */
3476 case EXACTFA:
3477 case EXACTFU_TRICKYFOLD:
3478 case EXACTFU_SS:
3479 case EXACTFU:
3480 c2 = PL_fold_latin1[c1];
3481 break;
3482
3483 default:
3484 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3485 assert(0); /* NOTREACHED */
3486 }
3487 }
3488 }
3489
3490 /* Here have figured things out. Set up the returns */
3491 if (use_chrtest_void) {
3492 *c2p = *c1p = CHRTEST_VOID;
3493 }
3494 else if (utf8_target) {
3495 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3496 uvchr_to_utf8(c1_utf8, c1);
3497 uvchr_to_utf8(c2_utf8, c2);
3498 }
3499
3500 /* Invariants are stored in both the utf8 and byte outputs; Use
3501 * negative numbers otherwise for the byte ones. Make sure that the
3502 * byte ones are the same iff the utf8 ones are the same */
3503 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3504 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3505 ? *c2_utf8
3506 : (c1 == c2)
3507 ? CHRTEST_NOT_A_CP_1
3508 : CHRTEST_NOT_A_CP_2;
3509 }
3510 else if (c1 > 255) {
3511 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3512 can represent */
3513 return FALSE;
3514 }
3515
3516 *c1p = *c2p = c2; /* c2 is the only representable value */
3517 }
3518 else { /* c1 is representable; see about c2 */
3519 *c1p = c1;
3520 *c2p = (c2 < 256) ? c2 : c1;
3521 }
3522
3523 return TRUE;
3524}
3525
3526/* returns -1 on failure, $+[0] on success */
3527STATIC I32
3528S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3529{
3530#if PERL_VERSION < 9 && !defined(PERL_CORE)
3531 dMY_CXT;
3532#endif
3533 dVAR;
3534 const bool utf8_target = PL_reg_match_utf8;
3535 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3536 REGEXP *rex_sv = reginfo->prog;
3537 regexp *rex = ReANY(rex_sv);
3538 RXi_GET_DECL(rex,rexi);
3539 I32 oldsave;
3540 /* the current state. This is a cached copy of PL_regmatch_state */
3541 regmatch_state *st;
3542 /* cache heavy used fields of st in registers */
3543 regnode *scan;
3544 regnode *next;
3545 U32 n = 0; /* general value; init to avoid compiler warning */
3546 I32 ln = 0; /* len or last; init to avoid compiler warning */
3547 char *locinput = startpos;
3548 char *pushinput; /* where to continue after a PUSH */
3549 I32 nextchr; /* is always set to UCHARAT(locinput) */
3550
3551 bool result = 0; /* return value of S_regmatch */
3552 int depth = 0; /* depth of backtrack stack */
3553 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3554 const U32 max_nochange_depth =
3555 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3556 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3557 regmatch_state *yes_state = NULL; /* state to pop to on success of
3558 subpattern */
3559 /* mark_state piggy backs on the yes_state logic so that when we unwind
3560 the stack on success we can update the mark_state as we go */
3561 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3562 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3563 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3564 U32 state_num;
3565 bool no_final = 0; /* prevent failure from backtracking? */
3566 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3567 char *startpoint = locinput;
3568 SV *popmark = NULL; /* are we looking for a mark? */
3569 SV *sv_commit = NULL; /* last mark name seen in failure */
3570 SV *sv_yes_mark = NULL; /* last mark name we have seen
3571 during a successful match */
3572 U32 lastopen = 0; /* last open we saw */
3573 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3574 SV* const oreplsv = GvSV(PL_replgv);
3575 /* these three flags are set by various ops to signal information to
3576 * the very next op. They have a useful lifetime of exactly one loop
3577 * iteration, and are not preserved or restored by state pushes/pops
3578 */
3579 bool sw = 0; /* the condition value in (?(cond)a|b) */
3580 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3581 int logical = 0; /* the following EVAL is:
3582 0: (?{...})
3583 1: (?(?{...})X|Y)
3584 2: (??{...})
3585 or the following IFMATCH/UNLESSM is:
3586 false: plain (?=foo)
3587 true: used as a condition: (?(?=foo))
3588 */
3589 PAD* last_pad = NULL;
3590 dMULTICALL;
3591 I32 gimme = G_SCALAR;
3592 CV *caller_cv = NULL; /* who called us */
3593 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3594 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3595
3596#ifdef DEBUGGING
3597 GET_RE_DEBUG_FLAGS_DECL;
3598#endif
3599
3600 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3601 multicall_oldcatch = 0;
3602 multicall_cv = NULL;
3603 cx = NULL;
3604 PERL_UNUSED_VAR(multicall_cop);
3605 PERL_UNUSED_VAR(newsp);
3606
3607
3608 PERL_ARGS_ASSERT_REGMATCH;
3609
3610 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3611 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3612 }));
3613 /* on first ever call to regmatch, allocate first slab */
3614 if (!PL_regmatch_slab) {
3615 Newx(PL_regmatch_slab, 1, regmatch_slab);
3616 PL_regmatch_slab->prev = NULL;
3617 PL_regmatch_slab->next = NULL;
3618 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3619 }
3620
3621 oldsave = PL_savestack_ix;
3622 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3623 SAVEVPTR(PL_regmatch_slab);
3624 SAVEVPTR(PL_regmatch_state);
3625
3626 /* grab next free state slot */
3627 st = ++PL_regmatch_state;
3628 if (st > SLAB_LAST(PL_regmatch_slab))
3629 st = PL_regmatch_state = S_push_slab(aTHX);
3630
3631 /* Note that nextchr is a byte even in UTF */
3632 SET_nextchr;
3633 scan = prog;
3634 while (scan != NULL) {
3635
3636 DEBUG_EXECUTE_r( {
3637 SV * const prop = sv_newmortal();
3638 regnode *rnext=regnext(scan);
3639 DUMP_EXEC_POS( locinput, scan, utf8_target );
3640 regprop(rex, prop, scan);
3641
3642 PerlIO_printf(Perl_debug_log,
3643 "%3"IVdf":%*s%s(%"IVdf")\n",
3644 (IV)(scan - rexi->program), depth*2, "",
3645 SvPVX_const(prop),
3646 (PL_regkind[OP(scan)] == END || !rnext) ?
3647 0 : (IV)(rnext - rexi->program));
3648 });
3649
3650 next = scan + NEXT_OFF(scan);
3651 if (next == scan)
3652 next = NULL;
3653 state_num = OP(scan);
3654
3655 reenter_switch:
3656
3657 SET_nextchr;
3658 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3659
3660 switch (state_num) {
3661 case BOL: /* /^../ */
3662 if (locinput == PL_bostr)
3663 {
3664 /* reginfo->till = reginfo->bol; */
3665 break;
3666 }
3667 sayNO;
3668
3669 case MBOL: /* /^../m */
3670 if (locinput == PL_bostr ||
3671 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3672 {
3673 break;
3674 }
3675 sayNO;
3676
3677 case SBOL: /* /^../s */
3678 if (locinput == PL_bostr)
3679 break;
3680 sayNO;
3681
3682 case GPOS: /* \G */
3683 if (locinput == reginfo->ganch)
3684 break;
3685 sayNO;
3686
3687 case KEEPS: /* \K */
3688 /* update the startpoint */
3689 st->u.keeper.val = rex->offs[0].start;
3690 rex->offs[0].start = locinput - PL_bostr;
3691 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3692 assert(0); /*NOTREACHED*/
3693 case KEEPS_next_fail:
3694 /* rollback the start point change */
3695 rex->offs[0].start = st->u.keeper.val;
3696 sayNO_SILENT;
3697 assert(0); /*NOTREACHED*/
3698
3699 case EOL: /* /..$/ */
3700 goto seol;
3701
3702 case MEOL: /* /..$/m */
3703 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3704 sayNO;
3705 break;
3706
3707 case SEOL: /* /..$/s */
3708 seol:
3709 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3710 sayNO;
3711 if (PL_regeol - locinput > 1)
3712 sayNO;
3713 break;
3714
3715 case EOS: /* \z */
3716 if (!NEXTCHR_IS_EOS)
3717 sayNO;
3718 break;
3719
3720 case SANY: /* /./s */
3721 if (NEXTCHR_IS_EOS)
3722 sayNO;
3723 goto increment_locinput;
3724
3725 case CANY: /* \C */
3726 if (NEXTCHR_IS_EOS)
3727 sayNO;
3728 locinput++;
3729 break;
3730
3731 case REG_ANY: /* /./ */
3732 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3733 sayNO;
3734 goto increment_locinput;
3735
3736
3737#undef ST
3738#define ST st->u.trie
3739 case TRIEC: /* (ab|cd) with known charclass */
3740 /* In this case the charclass data is available inline so
3741 we can fail fast without a lot of extra overhead.
3742 */
3743 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3744 DEBUG_EXECUTE_r(
3745 PerlIO_printf(Perl_debug_log,
3746 "%*s %sfailed to match trie start class...%s\n",
3747 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3748 );
3749 sayNO_SILENT;
3750 assert(0); /* NOTREACHED */
3751 }
3752 /* FALL THROUGH */
3753 case TRIE: /* (ab|cd) */
3754 /* the basic plan of execution of the trie is:
3755 * At the beginning, run though all the states, and
3756 * find the longest-matching word. Also remember the position
3757 * of the shortest matching word. For example, this pattern:
3758 * 1 2 3 4 5
3759 * ab|a|x|abcd|abc
3760 * when matched against the string "abcde", will generate
3761 * accept states for all words except 3, with the longest
3762 * matching word being 4, and the shortest being 2 (with
3763 * the position being after char 1 of the string).
3764 *
3765 * Then for each matching word, in word order (i.e. 1,2,4,5),
3766 * we run the remainder of the pattern; on each try setting
3767 * the current position to the character following the word,
3768 * returning to try the next word on failure.
3769 *
3770 * We avoid having to build a list of words at runtime by
3771 * using a compile-time structure, wordinfo[].prev, which
3772 * gives, for each word, the previous accepting word (if any).
3773 * In the case above it would contain the mappings 1->2, 2->0,
3774 * 3->0, 4->5, 5->1. We can use this table to generate, from
3775 * the longest word (4 above), a list of all words, by
3776 * following the list of prev pointers; this gives us the
3777 * unordered list 4,5,1,2. Then given the current word we have
3778 * just tried, we can go through the list and find the
3779 * next-biggest word to try (so if we just failed on word 2,
3780 * the next in the list is 4).
3781 *
3782 * Since at runtime we don't record the matching position in
3783 * the string for each word, we have to work that out for
3784 * each word we're about to process. The wordinfo table holds
3785 * the character length of each word; given that we recorded
3786 * at the start: the position of the shortest word and its
3787 * length in chars, we just need to move the pointer the
3788 * difference between the two char lengths. Depending on
3789 * Unicode status and folding, that's cheap or expensive.
3790 *
3791 * This algorithm is optimised for the case where are only a
3792 * small number of accept states, i.e. 0,1, or maybe 2.
3793 * With lots of accepts states, and having to try all of them,
3794 * it becomes quadratic on number of accept states to find all
3795 * the next words.
3796 */
3797
3798 {
3799 /* what type of TRIE am I? (utf8 makes this contextual) */
3800 DECL_TRIE_TYPE(scan);
3801
3802 /* what trie are we using right now */
3803 reg_trie_data * const trie
3804 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3805 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3806 U32 state = trie->startstate;
3807
3808 if ( trie->bitmap
3809 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3810 {
3811 if (trie->states[ state ].wordnum) {
3812 DEBUG_EXECUTE_r(
3813 PerlIO_printf(Perl_debug_log,
3814 "%*s %smatched empty string...%s\n",
3815 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3816 );
3817 if (!trie->jump)
3818 break;
3819 } else {
3820 DEBUG_EXECUTE_r(
3821 PerlIO_printf(Perl_debug_log,
3822 "%*s %sfailed to match trie start class...%s\n",
3823 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3824 );
3825 sayNO_SILENT;
3826 }
3827 }
3828
3829 {
3830 U8 *uc = ( U8* )locinput;
3831
3832 STRLEN len = 0;
3833 STRLEN foldlen = 0;
3834 U8 *uscan = (U8*)NULL;
3835 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3836 U32 charcount = 0; /* how many input chars we have matched */
3837 U32 accepted = 0; /* have we seen any accepting states? */
3838
3839 ST.jump = trie->jump;
3840 ST.me = scan;
3841 ST.firstpos = NULL;
3842 ST.longfold = FALSE; /* char longer if folded => it's harder */
3843 ST.nextword = 0;
3844
3845 /* fully traverse the TRIE; note the position of the
3846 shortest accept state and the wordnum of the longest
3847 accept state */
3848
3849 while ( state && uc <= (U8*)PL_regeol ) {
3850 U32 base = trie->states[ state ].trans.base;
3851 UV uvc = 0;
3852 U16 charid = 0;
3853 U16 wordnum;
3854 wordnum = trie->states[ state ].wordnum;
3855
3856 if (wordnum) { /* it's an accept state */
3857 if (!accepted) {
3858 accepted = 1;
3859 /* record first match position */
3860 if (ST.longfold) {
3861 ST.firstpos = (U8*)locinput;
3862 ST.firstchars = 0;
3863 }
3864 else {
3865 ST.firstpos = uc;
3866 ST.firstchars = charcount;
3867 }
3868 }
3869 if (!ST.nextword || wordnum < ST.nextword)
3870 ST.nextword = wordnum;
3871 ST.topword = wordnum;
3872 }
3873
3874 DEBUG_TRIE_EXECUTE_r({
3875 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3876 PerlIO_printf( Perl_debug_log,
3877 "%*s %sState: %4"UVxf" Accepted: %c ",
3878 2+depth * 2, "", PL_colors[4],
3879 (UV)state, (accepted ? 'Y' : 'N'));
3880 });
3881
3882 /* read a char and goto next state */
3883 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3884 I32 offset;
3885 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3886 uscan, len, uvc, charid, foldlen,
3887 foldbuf, uniflags);
3888 charcount++;
3889 if (foldlen>0)
3890 ST.longfold = TRUE;
3891 if (charid &&
3892 ( ((offset =
3893 base + charid - 1 - trie->uniquecharcount)) >= 0)
3894
3895 && ((U32)offset < trie->lasttrans)
3896 && trie->trans[offset].check == state)
3897 {
3898 state = trie->trans[offset].next;
3899 }
3900 else {
3901 state = 0;
3902 }
3903 uc += len;
3904
3905 }
3906 else {
3907 state = 0;
3908 }
3909 DEBUG_TRIE_EXECUTE_r(
3910 PerlIO_printf( Perl_debug_log,
3911 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3912 charid, uvc, (UV)state, PL_colors[5] );
3913 );
3914 }
3915 if (!accepted)
3916 sayNO;
3917
3918 /* calculate total number of accept states */
3919 {
3920 U16 w = ST.topword;
3921 accepted = 0;
3922 while (w) {
3923 w = trie->wordinfo[w].prev;
3924 accepted++;
3925 }
3926 ST.accepted = accepted;
3927 }
3928
3929 DEBUG_EXECUTE_r(
3930 PerlIO_printf( Perl_debug_log,
3931 "%*s %sgot %"IVdf" possible matches%s\n",
3932 REPORT_CODE_OFF + depth * 2, "",
3933 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3934 );
3935 goto trie_first_try; /* jump into the fail handler */
3936 }}
3937 assert(0); /* NOTREACHED */
3938
3939 case TRIE_next_fail: /* we failed - try next alternative */
3940 {
3941 U8 *uc;
3942 if ( ST.jump) {
3943 REGCP_UNWIND(ST.cp);
3944 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3945 }
3946 if (!--ST.accepted) {
3947 DEBUG_EXECUTE_r({
3948 PerlIO_printf( Perl_debug_log,
3949 "%*s %sTRIE failed...%s\n",
3950 REPORT_CODE_OFF+depth*2, "",
3951 PL_colors[4],
3952 PL_colors[5] );
3953 });
3954 sayNO_SILENT;
3955 }
3956 {
3957 /* Find next-highest word to process. Note that this code
3958 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3959 U16 min = 0;
3960 U16 word;
3961 U16 const nextword = ST.nextword;
3962 reg_trie_wordinfo * const wordinfo
3963 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3964 for (word=ST.topword; word; word=wordinfo[word].prev) {
3965 if (word > nextword && (!min || word < min))
3966 min = word;
3967 }
3968 ST.nextword = min;
3969 }
3970
3971 trie_first_try:
3972 if (do_cutgroup) {
3973 do_cutgroup = 0;
3974 no_final = 0;
3975 }
3976
3977 if ( ST.jump) {
3978 ST.lastparen = rex->lastparen;
3979 ST.lastcloseparen = rex->lastcloseparen;
3980 REGCP_SET(ST.cp);
3981 }
3982
3983 /* find start char of end of current word */
3984 {
3985 U32 chars; /* how many chars to skip */
3986 reg_trie_data * const trie
3987 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3988
3989 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3990 >= ST.firstchars);
3991 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3992 - ST.firstchars;
3993 uc = ST.firstpos;
3994
3995 if (ST.longfold) {
3996 /* the hard option - fold each char in turn and find
3997 * its folded length (which may be different */
3998 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3999 STRLEN foldlen;
4000 STRLEN len;
4001 UV uvc;
4002 U8 *uscan;
4003
4004 while (chars) {
4005 if (utf8_target) {
4006 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4007 uniflags);
4008 uc += len;
4009 }
4010 else {
4011 uvc = *uc;
4012 uc++;
4013 }
4014 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4015 uscan = foldbuf;
4016 while (foldlen) {
4017 if (!--chars)
4018 break;
4019 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4020 uniflags);
4021 uscan += len;
4022 foldlen -= len;
4023 }
4024 }
4025 }
4026 else {
4027 if (utf8_target)
4028 while (chars--)
4029 uc += UTF8SKIP(uc);
4030 else
4031 uc += chars;
4032 }
4033 }
4034
4035 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4036 ? ST.jump[ST.nextword]
4037 : NEXT_OFF(ST.me));
4038
4039 DEBUG_EXECUTE_r({
4040 PerlIO_printf( Perl_debug_log,
4041 "%*s %sTRIE matched word #%d, continuing%s\n",
4042 REPORT_CODE_OFF+depth*2, "",
4043 PL_colors[4],
4044 ST.nextword,
4045 PL_colors[5]
4046 );
4047 });
4048
4049 if (ST.accepted > 1 || has_cutgroup) {
4050 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4051 assert(0); /* NOTREACHED */
4052 }
4053 /* only one choice left - just continue */
4054 DEBUG_EXECUTE_r({
4055 AV *const trie_words
4056 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4057 SV ** const tmp = av_fetch( trie_words,
4058 ST.nextword-1, 0 );
4059 SV *sv= tmp ? sv_newmortal() : NULL;
4060
4061 PerlIO_printf( Perl_debug_log,
4062 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4063 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4064 ST.nextword,
4065 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4066 PL_colors[0], PL_colors[1],
4067 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4068 )
4069 : "not compiled under -Dr",
4070 PL_colors[5] );
4071 });
4072
4073 locinput = (char*)uc;
4074 continue; /* execute rest of RE */
4075 assert(0); /* NOTREACHED */
4076 }
4077#undef ST
4078
4079 case EXACT: { /* /abc/ */
4080 char *s = STRING(scan);
4081 ln = STR_LEN(scan);
4082 if (utf8_target != UTF_PATTERN) {
4083 /* The target and the pattern have differing utf8ness. */
4084 char *l = locinput;
4085 const char * const e = s + ln;
4086
4087 if (utf8_target) {
4088 /* The target is utf8, the pattern is not utf8.
4089 * Above-Latin1 code points can't match the pattern;
4090 * invariants match exactly, and the other Latin1 ones need
4091 * to be downgraded to a single byte in order to do the
4092 * comparison. (If we could be confident that the target
4093 * is not malformed, this could be refactored to have fewer
4094 * tests by just assuming that if the first bytes match, it
4095 * is an invariant, but there are tests in the test suite
4096 * dealing with (??{...}) which violate this) */
4097 while (s < e) {
4098 if (l >= PL_regeol)
4099 sayNO;
4100 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4101 sayNO;
4102 }
4103 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4104 if (*l != *s) {
4105 sayNO;
4106 }
4107 l++;
4108 }
4109 else {
4110 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4111 sayNO;
4112 }
4113 l += 2;
4114 }
4115 s++;
4116 }
4117 }
4118 else {
4119 /* The target is not utf8, the pattern is utf8. */
4120 while (s < e) {
4121 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4122 {
4123 sayNO;
4124 }
4125 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4126 if (*s != *l) {
4127 sayNO;
4128 }
4129 s++;
4130 }
4131 else {
4132 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4133 sayNO;
4134 }
4135 s += 2;
4136 }
4137 l++;
4138 }
4139 }
4140 locinput = l;
4141 break;
4142 }
4143 /* The target and the pattern have the same utf8ness. */
4144 /* Inline the first character, for speed. */
4145 if (UCHARAT(s) != nextchr)
4146 sayNO;
4147 if (PL_regeol - locinput < ln)
4148 sayNO;
4149 if (ln > 1 && memNE(s, locinput, ln))
4150 sayNO;
4151 locinput += ln;
4152 break;
4153 }
4154
4155 case EXACTFL: { /* /abc/il */
4156 re_fold_t folder;
4157 const U8 * fold_array;
4158 const char * s;
4159 U32 fold_utf8_flags;
4160
4161 PL_reg_flags |= RF_tainted;
4162 folder = foldEQ_locale;
4163 fold_array = PL_fold_locale;
4164 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4165 goto do_exactf;
4166
4167 case EXACTFU_SS: /* /\x{df}/iu */
4168 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4169 case EXACTFU: /* /abc/iu */
4170 folder = foldEQ_latin1;
4171 fold_array = PL_fold_latin1;
4172 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4173 goto do_exactf;
4174
4175 case EXACTFA: /* /abc/iaa */
4176 folder = foldEQ_latin1;
4177 fold_array = PL_fold_latin1;
4178 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4179 goto do_exactf;
4180
4181 case EXACTF: /* /abc/i */
4182 folder = foldEQ;
4183 fold_array = PL_fold;
4184 fold_utf8_flags = 0;
4185
4186 do_exactf:
4187 s = STRING(scan);
4188 ln = STR_LEN(scan);
4189
4190 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4191 /* Either target or the pattern are utf8, or has the issue where
4192 * the fold lengths may differ. */
4193 const char * const l = locinput;
4194 char *e = PL_regeol;
4195
4196 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
4197 l, &e, 0, utf8_target, fold_utf8_flags))
4198 {
4199 sayNO;
4200 }
4201 locinput = e;
4202 break;
4203 }
4204
4205 /* Neither the target nor the pattern are utf8 */
4206 if (UCHARAT(s) != nextchr
4207 && !NEXTCHR_IS_EOS
4208 && UCHARAT(s) != fold_array[nextchr])
4209 {
4210 sayNO;
4211 }
4212 if (PL_regeol - locinput < ln)
4213 sayNO;
4214 if (ln > 1 && ! folder(s, locinput, ln))
4215 sayNO;
4216 locinput += ln;
4217 break;
4218 }
4219
4220 /* XXX Could improve efficiency by separating these all out using a
4221 * macro or in-line function. At that point regcomp.c would no longer
4222 * have to set the FLAGS fields of these */
4223 case BOUNDL: /* /\b/l */
4224 case NBOUNDL: /* /\B/l */
4225 PL_reg_flags |= RF_tainted;
4226 /* FALL THROUGH */
4227 case BOUND: /* /\b/ */
4228 case BOUNDU: /* /\b/u */
4229 case BOUNDA: /* /\b/a */
4230 case NBOUND: /* /\B/ */
4231 case NBOUNDU: /* /\B/u */
4232 case NBOUNDA: /* /\B/a */
4233 /* was last char in word? */
4234 if (utf8_target
4235 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4236 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4237 {
4238 if (locinput == PL_bostr)
4239 ln = '\n';
4240 else {
4241 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4242
4243 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4244 }
4245 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4246 ln = isALNUM_uni(ln);
4247 if (NEXTCHR_IS_EOS)
4248 n = 0;
4249 else {
4250 LOAD_UTF8_CHARCLASS_ALNUM();
4251 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4252 utf8_target);
4253 }
4254 }
4255 else {
4256 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4257 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4258 }
4259 }
4260 else {
4261
4262 /* Here the string isn't utf8, or is utf8 and only ascii
4263 * characters are to match \w. In the latter case looking at
4264 * the byte just prior to the current one may be just the final
4265 * byte of a multi-byte character. This is ok. There are two
4266 * cases:
4267 * 1) it is a single byte character, and then the test is doing
4268 * just what it's supposed to.
4269 * 2) it is a multi-byte character, in which case the final
4270 * byte is never mistakable for ASCII, and so the test
4271 * will say it is not a word character, which is the
4272 * correct answer. */
4273 ln = (locinput != PL_bostr) ?
4274 UCHARAT(locinput - 1) : '\n';
4275 switch (FLAGS(scan)) {
4276 case REGEX_UNICODE_CHARSET:
4277 ln = isWORDCHAR_L1(ln);
4278 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4279 break;
4280 case REGEX_LOCALE_CHARSET:
4281 ln = isALNUM_LC(ln);
4282 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4283 break;
4284 case REGEX_DEPENDS_CHARSET:
4285 ln = isALNUM(ln);
4286 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4287 break;
4288 case REGEX_ASCII_RESTRICTED_CHARSET:
4289 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4290 ln = isWORDCHAR_A(ln);
4291 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4292 break;
4293 default:
4294 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4295 break;
4296 }
4297 }
4298 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4299 * regcomp.sym */
4300 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4301 sayNO;
4302 break;
4303
4304 case ANYOF: /* /[abc]/ */
4305 if (NEXTCHR_IS_EOS)
4306 sayNO;
4307 if (utf8_target) {
4308 if (!reginclass(rex, scan, (U8*)locinput, utf8_target))
4309 sayNO;
4310 locinput += UTF8SKIP(locinput);
4311 break;
4312 }
4313 else {
4314 if (!REGINCLASS(rex, scan, (U8*)locinput))
4315 sayNO;
4316 locinput++;
4317 break;
4318 }
4319 break;
4320
4321 /* Special char classes: \d, \w etc.
4322 * The defines start on line 166 or so */
4323 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4324 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4325 ALNUMU, NALNUMU, isWORDCHAR_L1,
4326 ALNUMA, NALNUMA, isWORDCHAR_A,
4327 alnum, "a");
4328
4329 case SPACEL:
4330 PL_reg_flags |= RF_tainted;
4331 if (NEXTCHR_IS_EOS) {
4332 sayNO;
4333 }
4334 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4335 if (! isSPACE_LC_utf8((U8 *) locinput)) {
4336 sayNO;
4337 }
4338 }
4339 else if (! isSPACE_LC((U8) nextchr)) {
4340 sayNO;
4341 }
4342 goto increment_locinput;
4343
4344 case NSPACEL:
4345 PL_reg_flags |= RF_tainted;
4346 if (NEXTCHR_IS_EOS) {
4347 sayNO;
4348 }
4349 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) {
4350 if (isSPACE_LC_utf8((U8 *) locinput)) {
4351 sayNO;
4352 }
4353 }
4354 else if (isSPACE_LC(nextchr)) {
4355 sayNO;
4356 }
4357 goto increment_locinput;
4358
4359 case SPACE:
4360 if (utf8_target) {
4361 goto utf8_space;
4362 }
4363 /* FALL THROUGH */
4364 case SPACEA:
4365 if (NEXTCHR_IS_EOS || ! isSPACE_A(nextchr)) {
4366 sayNO;
4367 }
4368 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4369 locinput++;
4370 break;
4371
4372 case NSPACE:
4373 if (utf8_target) {
4374 goto utf8_nspace;
4375 }
4376 /* FALL THROUGH */
4377 case NSPACEA:
4378 if (NEXTCHR_IS_EOS || isSPACE_A(nextchr)) {
4379 sayNO;
4380 }
4381 goto increment_locinput;
4382
4383 case SPACEU:
4384 utf8_space:
4385 if (NEXTCHR_IS_EOS || ! is_XPERLSPACE(locinput, utf8_target)) {
4386 sayNO;
4387 }
4388 goto increment_locinput;
4389
4390 case NSPACEU:
4391 utf8_nspace:
4392 if (NEXTCHR_IS_EOS || is_XPERLSPACE(locinput, utf8_target)) {
4393 sayNO;
4394 }
4395 goto increment_locinput;
4396
4397 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4398 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4399 DIGITA, NDIGITA, isDIGIT_A,
4400 digit, "0");
4401
4402 case POSIXA: /* /[[:ascii:]]/ etc */
4403 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4404 sayNO;
4405 }
4406 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4407 locinput++;
4408 break;
4409
4410 case NPOSIXA: /* /[^[:ascii:]]/ etc */
4411 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4412 sayNO;
4413 }
4414 goto increment_locinput;
4415
4416 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4417 a Unicode extended Grapheme Cluster */
4418 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4419 extended Grapheme Cluster is:
4420
4421 CR LF
4422 | Prepend* Begin Extend*
4423 | .
4424
4425 Begin is: ( Special_Begin | ! Control )
4426 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4427 Extend is: ( Grapheme_Extend | Spacing_Mark )
4428 Control is: [ GCB_Control CR LF ]
4429 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4430
4431 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4432 we can rewrite
4433
4434 Begin is ( Regular_Begin + Special Begin )
4435
4436 It turns out that 98.4% of all Unicode code points match
4437 Regular_Begin. Doing it this way eliminates a table match in
4438 the previous implementation for almost all Unicode code points.
4439
4440 There is a subtlety with Prepend* which showed up in testing.
4441 Note that the Begin, and only the Begin is required in:
4442 | Prepend* Begin Extend*
4443 Also, Begin contains '! Control'. A Prepend must be a
4444 '! Control', which means it must also be a Begin. What it
4445 comes down to is that if we match Prepend* and then find no
4446 suitable Begin afterwards, that if we backtrack the last
4447 Prepend, that one will be a suitable Begin.
4448 */
4449
4450 if (NEXTCHR_IS_EOS)
4451 sayNO;
4452 if (! utf8_target) {
4453
4454 /* Match either CR LF or '.', as all the other possibilities
4455 * require utf8 */
4456 locinput++; /* Match the . or CR */
4457 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4458 match the LF */
4459 && locinput < PL_regeol
4460 && UCHARAT(locinput) == '\n') locinput++;
4461 }
4462 else {
4463
4464 /* Utf8: See if is ( CR LF ); already know that locinput <
4465 * PL_regeol, so locinput+1 is in bounds */
4466 if ( nextchr == '\r' && locinput+1 < PL_regeol
4467 && UCHARAT(locinput + 1) == '\n')
4468 {
4469 locinput += 2;
4470 }
4471 else {
4472 STRLEN len;
4473
4474 /* In case have to backtrack to beginning, then match '.' */
4475 char *starting = locinput;
4476
4477 /* In case have to backtrack the last prepend */
4478 char *previous_prepend = 0;
4479
4480 LOAD_UTF8_CHARCLASS_GCB();
4481
4482 /* Match (prepend)* */
4483 while (locinput < PL_regeol
4484 && (len = is_GCB_Prepend_utf8(locinput)))
4485 {
4486 previous_prepend = locinput;
4487 locinput += len;
4488 }
4489
4490 /* As noted above, if we matched a prepend character, but
4491 * the next thing won't match, back off the last prepend we
4492 * matched, as it is guaranteed to match the begin */
4493 if (previous_prepend
4494 && (locinput >= PL_regeol
4495 || (! swash_fetch(PL_utf8_X_regular_begin,
4496 (U8*)locinput, utf8_target)
4497 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4498 )
4499 {
4500 locinput = previous_prepend;
4501 }
4502
4503 /* Note that here we know PL_regeol > locinput, as we
4504 * tested that upon input to this switch case, and if we
4505 * moved locinput forward, we tested the result just above
4506 * and it either passed, or we backed off so that it will
4507 * now pass */
4508 if (swash_fetch(PL_utf8_X_regular_begin,
4509 (U8*)locinput, utf8_target)) {
4510 locinput += UTF8SKIP(locinput);
4511 }
4512 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4513
4514 /* Here did not match the required 'Begin' in the
4515 * second term. So just match the very first
4516 * character, the '.' of the final term of the regex */
4517 locinput = starting + UTF8SKIP(starting);
4518 goto exit_utf8;
4519 } else {
4520
4521 /* Here is a special begin. It can be composed of
4522 * several individual characters. One possibility is
4523 * RI+ */
4524 if ((len = is_GCB_RI_utf8(locinput))) {
4525 locinput += len;
4526 while (locinput < PL_regeol
4527 && (len = is_GCB_RI_utf8(locinput)))
4528 {
4529 locinput += len;
4530 }
4531 } else if ((len = is_GCB_T_utf8(locinput))) {
4532 /* Another possibility is T+ */
4533 locinput += len;
4534 while (locinput < PL_regeol
4535 && (len = is_GCB_T_utf8(locinput)))
4536 {
4537 locinput += len;
4538 }
4539 } else {
4540
4541 /* Here, neither RI+ nor T+; must be some other
4542 * Hangul. That means it is one of the others: L,
4543 * LV, LVT or V, and matches:
4544 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4545
4546 /* Match L* */
4547 while (locinput < PL_regeol
4548 && (len = is_GCB_L_utf8(locinput)))
4549 {
4550 locinput += len;
4551 }
4552
4553 /* Here, have exhausted L*. If the next character
4554 * is not an LV, LVT nor V, it means we had to have
4555 * at least one L, so matches L+ in the original
4556 * equation, we have a complete hangul syllable.
4557 * Are done. */
4558
4559 if (locinput < PL_regeol
4560 && is_GCB_LV_LVT_V_utf8(locinput))
4561 {
4562
4563 /* Otherwise keep going. Must be LV, LVT or V.
4564 * See if LVT */
4565 if (is_utf8_X_LVT((U8*)locinput)) {
4566 locinput += UTF8SKIP(locinput);
4567 } else {
4568
4569 /* Must be V or LV. Take it, then match
4570 * V* */
4571 locinput += UTF8SKIP(locinput);
4572 while (locinput < PL_regeol
4573 && (len = is_GCB_V_utf8(locinput)))
4574 {
4575 locinput += len;
4576 }
4577 }
4578
4579 /* And any of LV, LVT, or V can be followed
4580 * by T* */
4581 while (locinput < PL_regeol
4582 && (len = is_GCB_T_utf8(locinput)))
4583 {
4584 locinput += len;
4585 }
4586 }
4587 }
4588 }
4589
4590 /* Match any extender */
4591 while (locinput < PL_regeol
4592 && swash_fetch(PL_utf8_X_extend,
4593 (U8*)locinput, utf8_target))
4594 {
4595 locinput += UTF8SKIP(locinput);
4596 }
4597 }
4598 exit_utf8:
4599 if (locinput > PL_regeol) sayNO;
4600 }
4601 break;
4602
4603 case NREFFL: /* /\g{name}/il */
4604 { /* The capture buffer cases. The ones beginning with N for the
4605 named buffers just convert to the equivalent numbered and
4606 pretend they were called as the corresponding numbered buffer
4607 op. */
4608 /* don't initialize these in the declaration, it makes C++
4609 unhappy */
4610 char *s;
4611 char type;
4612 re_fold_t folder;
4613 const U8 *fold_array;
4614 UV utf8_fold_flags;
4615
4616 PL_reg_flags |= RF_tainted;
4617 folder = foldEQ_locale;
4618 fold_array = PL_fold_locale;
4619 type = REFFL;
4620 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4621 goto do_nref;
4622
4623 case NREFFA: /* /\g{name}/iaa */
4624 folder = foldEQ_latin1;
4625 fold_array = PL_fold_latin1;
4626 type = REFFA;
4627 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4628 goto do_nref;
4629
4630 case NREFFU: /* /\g{name}/iu */
4631 folder = foldEQ_latin1;
4632 fold_array = PL_fold_latin1;
4633 type = REFFU;
4634 utf8_fold_flags = 0;
4635 goto do_nref;
4636
4637 case NREFF: /* /\g{name}/i */
4638 folder = foldEQ;
4639 fold_array = PL_fold;
4640 type = REFF;
4641 utf8_fold_flags = 0;
4642 goto do_nref;
4643
4644 case NREF: /* /\g{name}/ */
4645 type = REF;
4646 folder = NULL;
4647 fold_array = NULL;
4648 utf8_fold_flags = 0;
4649 do_nref:
4650
4651 /* For the named back references, find the corresponding buffer
4652 * number */
4653 n = reg_check_named_buff_matched(rex,scan);
4654
4655 if ( ! n ) {
4656 sayNO;
4657 }
4658 goto do_nref_ref_common;
4659
4660 case REFFL: /* /\1/il */
4661 PL_reg_flags |= RF_tainted;
4662 folder = foldEQ_locale;
4663 fold_array = PL_fold_locale;
4664 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4665 goto do_ref;
4666
4667 case REFFA: /* /\1/iaa */
4668 folder = foldEQ_latin1;
4669 fold_array = PL_fold_latin1;
4670 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4671 goto do_ref;
4672
4673 case REFFU: /* /\1/iu */
4674 folder = foldEQ_latin1;
4675 fold_array = PL_fold_latin1;
4676 utf8_fold_flags = 0;
4677 goto do_ref;
4678
4679 case REFF: /* /\1/i */
4680 folder = foldEQ;
4681 fold_array = PL_fold;
4682 utf8_fold_flags = 0;
4683 goto do_ref;
4684
4685 case REF: /* /\1/ */
4686 folder = NULL;
4687 fold_array = NULL;
4688 utf8_fold_flags = 0;
4689
4690 do_ref:
4691 type = OP(scan);
4692 n = ARG(scan); /* which paren pair */
4693
4694 do_nref_ref_common:
4695 ln = rex->offs[n].start;
4696 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4697 if (rex->lastparen < n || ln == -1)
4698 sayNO; /* Do not match unless seen CLOSEn. */
4699 if (ln == rex->offs[n].end)
4700 break;
4701
4702 s = PL_bostr + ln;
4703 if (type != REF /* REF can do byte comparison */
4704 && (utf8_target || type == REFFU))
4705 { /* XXX handle REFFL better */
4706 char * limit = PL_regeol;
4707
4708 /* This call case insensitively compares the entire buffer
4709 * at s, with the current input starting at locinput, but
4710 * not going off the end given by PL_regeol, and returns in
4711 * <limit> upon success, how much of the current input was
4712 * matched */
4713 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4714 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4715 {
4716 sayNO;
4717 }
4718 locinput = limit;
4719 break;
4720 }
4721
4722 /* Not utf8: Inline the first character, for speed. */
4723 if (!NEXTCHR_IS_EOS &&
4724 UCHARAT(s) != nextchr &&
4725 (type == REF ||
4726 UCHARAT(s) != fold_array[nextchr]))
4727 sayNO;
4728 ln = rex->offs[n].end - ln;
4729 if (locinput + ln > PL_regeol)
4730 sayNO;
4731 if (ln > 1 && (type == REF
4732 ? memNE(s, locinput, ln)
4733 : ! folder(s, locinput, ln)))
4734 sayNO;
4735 locinput += ln;
4736 break;
4737 }
4738
4739 case NOTHING: /* null op; e.g. the 'nothing' following
4740 * the '*' in m{(a+|b)*}' */
4741 break;
4742 case TAIL: /* placeholder while compiling (A|B|C) */
4743 break;
4744
4745 case BACK: /* ??? doesn't appear to be used ??? */
4746 break;
4747
4748#undef ST
4749#define ST st->u.eval
4750 {
4751 SV *ret;
4752 REGEXP *re_sv;
4753 regexp *re;
4754 regexp_internal *rei;
4755 regnode *startpoint;
4756
4757 case GOSTART: /* (?R) */
4758 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4759 if (cur_eval && cur_eval->locinput==locinput) {
4760 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4761 Perl_croak(aTHX_ "Infinite recursion in regex");
4762 if ( ++nochange_depth > max_nochange_depth )
4763 Perl_croak(aTHX_
4764 "Pattern subroutine nesting without pos change"
4765 " exceeded limit in regex");
4766 } else {
4767 nochange_depth = 0;
4768 }
4769 re_sv = rex_sv;
4770 re = rex;
4771 rei = rexi;
4772 if (OP(scan)==GOSUB) {
4773 startpoint = scan + ARG2L(scan);
4774 ST.close_paren = ARG(scan);
4775 } else {
4776 startpoint = rei->program+1;
4777 ST.close_paren = 0;
4778 }
4779 goto eval_recurse_doit;
4780 assert(0); /* NOTREACHED */
4781
4782 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4783 if (cur_eval && cur_eval->locinput==locinput) {
4784 if ( ++nochange_depth > max_nochange_depth )
4785 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4786 } else {
4787 nochange_depth = 0;
4788 }
4789 {
4790 /* execute the code in the {...} */
4791
4792 dSP;
4793 PADOFFSET before;
4794 OP * const oop = PL_op;
4795 COP * const ocurcop = PL_curcop;
4796 OP *nop;
4797 char *saved_regeol = PL_regeol;
4798 struct re_save_state saved_state;
4799 CV *newcv;
4800
4801 /* save *all* paren positions */
4802 regcppush(rex, 0);
4803 REGCP_SET(runops_cp);
4804
4805 /* To not corrupt the existing regex state while executing the
4806 * eval we would normally put it on the save stack, like with
4807 * save_re_context. However, re-evals have a weird scoping so we
4808 * can't just add ENTER/LEAVE here. With that, things like
4809 *
4810 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4811 *
4812 * would break, as they expect the localisation to be unwound
4813 * only when the re-engine backtracks through the bit that
4814 * localised it.
4815 *
4816 * What we do instead is just saving the state in a local c
4817 * variable.
4818 */
4819 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4820
4821 PL_reg_state.re_reparsing = FALSE;
4822
4823 if (!caller_cv)
4824 caller_cv = find_runcv(NULL);
4825
4826 n = ARG(scan);
4827
4828 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4829 newcv = (ReANY(
4830 (REGEXP*)(rexi->data->data[n])
4831 ))->qr_anoncv
4832 ;
4833 nop = (OP*)rexi->data->data[n+1];
4834 }
4835 else if (rexi->data->what[n] == 'l') { /* literal code */
4836 newcv = caller_cv;
4837 nop = (OP*)rexi->data->data[n];
4838 assert(CvDEPTH(newcv));
4839 }
4840 else {
4841 /* literal with own CV */
4842 assert(rexi->data->what[n] == 'L');
4843 newcv = rex->qr_anoncv;
4844 nop = (OP*)rexi->data->data[n];
4845 }
4846
4847 /* normally if we're about to execute code from the same
4848 * CV that we used previously, we just use the existing
4849 * CX stack entry. However, its possible that in the
4850 * meantime we may have backtracked, popped from the save
4851 * stack, and undone the SAVECOMPPAD(s) associated with
4852 * PUSH_MULTICALL; in which case PL_comppad no longer
4853 * points to newcv's pad. */
4854 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4855 {
4856 I32 depth = (newcv == caller_cv) ? 0 : 1;
4857 if (last_pushed_cv) {
4858 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4859 }
4860 else {
4861 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4862 }
4863 last_pushed_cv = newcv;
4864 }
4865 else {
4866 /* these assignments are just to silence compiler
4867 * warnings */
4868 multicall_cop = NULL;
4869 newsp = NULL;
4870 }
4871 last_pad = PL_comppad;
4872
4873 /* the initial nextstate you would normally execute
4874 * at the start of an eval (which would cause error
4875 * messages to come from the eval), may be optimised
4876 * away from the execution path in the regex code blocks;
4877 * so manually set PL_curcop to it initially */
4878 {
4879 OP *o = cUNOPx(nop)->op_first;
4880 assert(o->op_type == OP_NULL);
4881 if (o->op_targ == OP_SCOPE) {
4882 o = cUNOPo->op_first;
4883 }
4884 else {
4885 assert(o->op_targ == OP_LEAVE);
4886 o = cUNOPo->op_first;
4887 assert(o->op_type == OP_ENTER);
4888 o = o->op_sibling;
4889 }
4890
4891 if (o->op_type != OP_STUB) {
4892 assert( o->op_type == OP_NEXTSTATE
4893 || o->op_type == OP_DBSTATE
4894 || (o->op_type == OP_NULL
4895 && ( o->op_targ == OP_NEXTSTATE
4896 || o->op_targ == OP_DBSTATE
4897 )
4898 )
4899 );
4900 PL_curcop = (COP*)o;
4901 }
4902 }
4903 nop = nop->op_next;
4904
4905 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4906 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4907
4908 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4909
4910 if (sv_yes_mark) {
4911 SV *sv_mrk = get_sv("REGMARK", 1);
4912 sv_setsv(sv_mrk, sv_yes_mark);
4913 }
4914
4915 /* we don't use MULTICALL here as we want to call the
4916 * first op of the block of interest, rather than the
4917 * first op of the sub */
4918 before = SP-PL_stack_base;
4919 PL_op = nop;
4920 CALLRUNOPS(aTHX); /* Scalar context. */
4921 SPAGAIN;
4922 if (SP-PL_stack_base == before)
4923 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4924 else {
4925 ret = POPs;
4926 PUTBACK;
4927 }
4928
4929 /* before restoring everything, evaluate the returned
4930 * value, so that 'uninit' warnings don't use the wrong
4931 * PL_op or pad. Also need to process any magic vars
4932 * (e.g. $1) *before* parentheses are restored */
4933
4934 PL_op = NULL;
4935
4936 re_sv = NULL;
4937 if (logical == 0) /* (?{})/ */
4938 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4939 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4940 sw = cBOOL(SvTRUE(ret));
4941 logical = 0;
4942 }
4943 else { /* /(??{}) */
4944 /* if its overloaded, let the regex compiler handle
4945 * it; otherwise extract regex, or stringify */
4946 if (!SvAMAGIC(ret)) {
4947 SV *sv = ret;
4948 if (SvROK(sv))
4949 sv = SvRV(sv);
4950 if (SvTYPE(sv) == SVt_REGEXP)
4951 re_sv = (REGEXP*) sv;
4952 else if (SvSMAGICAL(sv)) {
4953 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4954 if (mg)
4955 re_sv = (REGEXP *) mg->mg_obj;
4956 }
4957
4958 /* force any magic, undef warnings here */
4959 if (!re_sv) {
4960 ret = sv_mortalcopy(ret);
4961 (void) SvPV_force_nolen(ret);
4962 }
4963 }
4964
4965 }
4966
4967 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4968
4969 /* *** Note that at this point we don't restore
4970 * PL_comppad, (or pop the CxSUB) on the assumption it may
4971 * be used again soon. This is safe as long as nothing
4972 * in the regexp code uses the pad ! */
4973 PL_op = oop;
4974 PL_curcop = ocurcop;
4975 PL_regeol = saved_regeol;
4976 S_regcp_restore(aTHX_ rex, runops_cp);
4977
4978 if (logical != 2)
4979 break;
4980 }
4981
4982 /* only /(??{})/ from now on */
4983 logical = 0;
4984 {
4985 /* extract RE object from returned value; compiling if
4986 * necessary */
4987
4988 if (re_sv) {
4989 re_sv = reg_temp_copy(NULL, re_sv);
4990 }
4991 else {
4992 U32 pm_flags = 0;
4993 const I32 osize = PL_regsize;
4994
4995 if (SvUTF8(ret) && IN_BYTES) {
4996 /* In use 'bytes': make a copy of the octet
4997 * sequence, but without the flag on */
4998 STRLEN len;
4999 const char *const p = SvPV(ret, len);
5000 ret = newSVpvn_flags(p, len, SVs_TEMP);
5001 }
5002 if (rex->intflags & PREGf_USE_RE_EVAL)
5003 pm_flags |= PMf_USE_RE_EVAL;
5004
5005 /* if we got here, it should be an engine which
5006 * supports compiling code blocks and stuff */
5007 assert(rex->engine && rex->engine->op_comp);
5008 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5009 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5010 rex->engine, NULL, NULL,
5011 /* copy /msix etc to inner pattern */
5012 scan->flags,
5013 pm_flags);
5014
5015 if (!(SvFLAGS(ret)
5016 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
5017 | SVs_GMG))) {
5018 /* This isn't a first class regexp. Instead, it's
5019 caching a regexp onto an existing, Perl visible
5020 scalar. */
5021 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5022 }
5023 PL_regsize = osize;
5024 /* safe to do now that any $1 etc has been
5025 * interpolated into the new pattern string and
5026 * compiled */
5027 S_regcp_restore(aTHX_ rex, runops_cp);
5028 }
5029 SAVEFREESV(re_sv);
5030 re = ReANY(re_sv);
5031 }
5032 RXp_MATCH_COPIED_off(re);
5033 re->subbeg = rex->subbeg;
5034 re->sublen = rex->sublen;
5035 re->suboffset = rex->suboffset;
5036 re->subcoffset = rex->subcoffset;
5037 rei = RXi_GET(re);
5038 DEBUG_EXECUTE_r(
5039 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
5040 "Matching embedded");
5041 );
5042 startpoint = rei->program + 1;
5043 ST.close_paren = 0; /* only used for GOSUB */
5044
5045 eval_recurse_doit: /* Share code with GOSUB below this line */
5046 /* run the pattern returned from (??{...}) */
5047 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
5048 REGCP_SET(ST.lastcp);
5049
5050 re->lastparen = 0;
5051 re->lastcloseparen = 0;
5052
5053 PL_regsize = 0;
5054
5055 /* XXXX This is too dramatic a measure... */
5056 PL_reg_maxiter = 0;
5057
5058 ST.toggle_reg_flags = PL_reg_flags;
5059 if (RX_UTF8(re_sv))
5060 PL_reg_flags |= RF_utf8;
5061 else
5062 PL_reg_flags &= ~RF_utf8;
5063 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5064
5065 ST.prev_rex = rex_sv;
5066 ST.prev_curlyx = cur_curlyx;
5067 rex_sv = re_sv;
5068 SET_reg_curpm(rex_sv);
5069 rex = re;
5070 rexi = rei;
5071 cur_curlyx = NULL;
5072 ST.B = next;
5073 ST.prev_eval = cur_eval;
5074 cur_eval = st;
5075 /* now continue from first node in postoned RE */
5076 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5077 assert(0); /* NOTREACHED */
5078 }
5079
5080 case EVAL_AB: /* cleanup after a successful (??{A})B */
5081 /* note: this is called twice; first after popping B, then A */
5082 PL_reg_flags ^= ST.toggle_reg_flags;
5083 rex_sv = ST.prev_rex;
5084 SET_reg_curpm(rex_sv);
5085 rex = ReANY(rex_sv);
5086 rexi = RXi_GET(rex);
5087 regcpblow(ST.cp);
5088 cur_eval = ST.prev_eval;
5089 cur_curlyx = ST.prev_curlyx;
5090
5091 /* XXXX This is too dramatic a measure... */
5092 PL_reg_maxiter = 0;
5093 if ( nochange_depth )
5094 nochange_depth--;
5095 sayYES;
5096
5097
5098 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5099 /* note: this is called twice; first after popping B, then A */
5100 PL_reg_flags ^= ST.toggle_reg_flags;
5101 rex_sv = ST.prev_rex;
5102 SET_reg_curpm(rex_sv);
5103 rex = ReANY(rex_sv);
5104 rexi = RXi_GET(rex);
5105
5106 REGCP_UNWIND(ST.lastcp);
5107 regcppop(rex);
5108 cur_eval = ST.prev_eval;
5109 cur_curlyx = ST.prev_curlyx;
5110 /* XXXX This is too dramatic a measure... */
5111 PL_reg_maxiter = 0;
5112 if ( nochange_depth )
5113 nochange_depth--;
5114 sayNO_SILENT;
5115#undef ST
5116
5117 case OPEN: /* ( */
5118 n = ARG(scan); /* which paren pair */
5119 rex->offs[n].start_tmp = locinput - PL_bostr;
5120 if (n > PL_regsize)
5121 PL_regsize = n;
5122 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5123 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5124 PTR2UV(rex),
5125 PTR2UV(rex->offs),
5126 (UV)n,
5127 (IV)rex->offs[n].start_tmp,
5128 (UV)PL_regsize
5129 ));
5130 lastopen = n;
5131 break;
5132
5133/* XXX really need to log other places start/end are set too */
5134#define CLOSE_CAPTURE \
5135 rex->offs[n].start = rex->offs[n].start_tmp; \
5136 rex->offs[n].end = locinput - PL_bostr; \
5137 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5138 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5139 PTR2UV(rex), \
5140 PTR2UV(rex->offs), \
5141 (UV)n, \
5142 (IV)rex->offs[n].start, \
5143 (IV)rex->offs[n].end \
5144 ))
5145
5146 case CLOSE: /* ) */
5147 n = ARG(scan); /* which paren pair */
5148 CLOSE_CAPTURE;
5149 /*if (n > PL_regsize)
5150 PL_regsize = n;*/
5151 if (n > rex->lastparen)
5152 rex->lastparen = n;
5153 rex->lastcloseparen = n;
5154 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5155 goto fake_end;
5156 }
5157 break;
5158
5159 case ACCEPT: /* (*ACCEPT) */
5160 if (ARG(scan)){
5161 regnode *cursor;
5162 for (cursor=scan;
5163 cursor && OP(cursor)!=END;
5164 cursor=regnext(cursor))
5165 {
5166 if ( OP(cursor)==CLOSE ){
5167 n = ARG(cursor);
5168 if ( n <= lastopen ) {
5169 CLOSE_CAPTURE;
5170 /*if (n > PL_regsize)
5171 PL_regsize = n;*/
5172 if (n > rex->lastparen)
5173 rex->lastparen = n;
5174 rex->lastcloseparen = n;
5175 if ( n == ARG(scan) || (cur_eval &&
5176 cur_eval->u.eval.close_paren == n))
5177 break;
5178 }
5179 }
5180 }
5181 }
5182 goto fake_end;
5183 /*NOTREACHED*/
5184
5185 case GROUPP: /* (?(1)) */
5186 n = ARG(scan); /* which paren pair */
5187 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5188 break;
5189
5190 case NGROUPP: /* (?(<name>)) */
5191 /* reg_check_named_buff_matched returns 0 for no match */
5192 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5193 break;
5194
5195 case INSUBP: /* (?(R)) */
5196 n = ARG(scan);
5197 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5198 break;
5199
5200 case DEFINEP: /* (?(DEFINE)) */
5201 sw = 0;
5202 break;
5203
5204 case IFTHEN: /* (?(cond)A|B) */
5205 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5206 if (sw)
5207 next = NEXTOPER(NEXTOPER(scan));
5208 else {
5209 next = scan + ARG(scan);
5210 if (OP(next) == IFTHEN) /* Fake one. */
5211 next = NEXTOPER(NEXTOPER(next));
5212 }
5213 break;
5214
5215 case LOGICAL: /* modifier for EVAL and IFMATCH */
5216 logical = scan->flags;
5217 break;
5218
5219/*******************************************************************
5220
5221The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5222pattern, where A and B are subpatterns. (For simple A, CURLYM or
5223STAR/PLUS/CURLY/CURLYN are used instead.)
5224
5225A*B is compiled as <CURLYX><A><WHILEM><B>
5226
5227On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5228state, which contains the current count, initialised to -1. It also sets
5229cur_curlyx to point to this state, with any previous value saved in the
5230state block.
5231
5232CURLYX then jumps straight to the WHILEM op, rather than executing A,
5233since the pattern may possibly match zero times (i.e. it's a while {} loop
5234rather than a do {} while loop).
5235
5236Each entry to WHILEM represents a successful match of A. The count in the
5237CURLYX block is incremented, another WHILEM state is pushed, and execution
5238passes to A or B depending on greediness and the current count.
5239
5240For example, if matching against the string a1a2a3b (where the aN are
5241substrings that match /A/), then the match progresses as follows: (the
5242pushed states are interspersed with the bits of strings matched so far):
5243
5244 <CURLYX cnt=-1>
5245 <CURLYX cnt=0><WHILEM>
5246 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5247 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5248 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5249 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5250
5251(Contrast this with something like CURLYM, which maintains only a single
5252backtrack state:
5253
5254 <CURLYM cnt=0> a1
5255 a1 <CURLYM cnt=1> a2
5256 a1 a2 <CURLYM cnt=2> a3
5257 a1 a2 a3 <CURLYM cnt=3> b
5258)
5259
5260Each WHILEM state block marks a point to backtrack to upon partial failure
5261of A or B, and also contains some minor state data related to that
5262iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5263overall state, such as the count, and pointers to the A and B ops.
5264
5265This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5266must always point to the *current* CURLYX block, the rules are:
5267
5268When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5269and set cur_curlyx to point the new block.
5270
5271When popping the CURLYX block after a successful or unsuccessful match,
5272restore the previous cur_curlyx.
5273
5274When WHILEM is about to execute B, save the current cur_curlyx, and set it
5275to the outer one saved in the CURLYX block.
5276
5277When popping the WHILEM block after a successful or unsuccessful B match,
5278restore the previous cur_curlyx.
5279
5280Here's an example for the pattern (AI* BI)*BO
5281I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5282
5283cur_
5284curlyx backtrack stack
5285------ ---------------
5286NULL
5287CO <CO prev=NULL> <WO>
5288CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5289CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5290NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5291
5292At this point the pattern succeeds, and we work back down the stack to
5293clean up, restoring as we go:
5294
5295CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5296CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5297CO <CO prev=NULL> <WO>
5298NULL
5299
5300*******************************************************************/
5301
5302#define ST st->u.curlyx
5303
5304 case CURLYX: /* start of /A*B/ (for complex A) */
5305 {
5306 /* No need to save/restore up to this paren */
5307 I32 parenfloor = scan->flags;
5308
5309 assert(next); /* keep Coverity happy */
5310 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5311 next += ARG(next);
5312
5313 /* XXXX Probably it is better to teach regpush to support
5314 parenfloor > PL_regsize... */
5315 if (parenfloor > (I32)rex->lastparen)
5316 parenfloor = rex->lastparen; /* Pessimization... */
5317
5318 ST.prev_curlyx= cur_curlyx;
5319 cur_curlyx = st;
5320 ST.cp = PL_savestack_ix;
5321
5322 /* these fields contain the state of the current curly.
5323 * they are accessed by subsequent WHILEMs */
5324 ST.parenfloor = parenfloor;
5325 ST.me = scan;
5326 ST.B = next;
5327 ST.minmod = minmod;
5328 minmod = 0;
5329 ST.count = -1; /* this will be updated by WHILEM */
5330 ST.lastloc = NULL; /* this will be updated by WHILEM */
5331
5332 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5333 assert(0); /* NOTREACHED */
5334 }
5335
5336 case CURLYX_end: /* just finished matching all of A*B */
5337 cur_curlyx = ST.prev_curlyx;
5338 sayYES;
5339 assert(0); /* NOTREACHED */
5340
5341 case CURLYX_end_fail: /* just failed to match all of A*B */
5342 regcpblow(ST.cp);
5343 cur_curlyx = ST.prev_curlyx;
5344 sayNO;
5345 assert(0); /* NOTREACHED */
5346
5347
5348#undef ST
5349#define ST st->u.whilem
5350
5351 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5352 {
5353 /* see the discussion above about CURLYX/WHILEM */
5354 I32 n;
5355 int min = ARG1(cur_curlyx->u.curlyx.me);
5356 int max = ARG2(cur_curlyx->u.curlyx.me);
5357 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5358
5359 assert(cur_curlyx); /* keep Coverity happy */
5360 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5361 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5362 ST.cache_offset = 0;
5363 ST.cache_mask = 0;
5364
5365
5366 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5367 "%*s whilem: matched %ld out of %d..%d\n",
5368 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5369 );
5370
5371 /* First just match a string of min A's. */
5372
5373 if (n < min) {
5374 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5375 cur_curlyx->u.curlyx.lastloc = locinput;
5376 REGCP_SET(ST.lastcp);
5377
5378 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5379 assert(0); /* NOTREACHED */
5380 }
5381
5382 /* If degenerate A matches "", assume A done. */
5383
5384 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5385 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5386 "%*s whilem: empty match detected, trying continuation...\n",
5387 REPORT_CODE_OFF+depth*2, "")
5388 );
5389 goto do_whilem_B_max;
5390 }
5391
5392 /* super-linear cache processing */
5393
5394 if (scan->flags) {
5395
5396 if (!PL_reg_maxiter) {
5397 /* start the countdown: Postpone detection until we
5398 * know the match is not *that* much linear. */
5399 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5400 /* possible overflow for long strings and many CURLYX's */
5401 if (PL_reg_maxiter < 0)
5402 PL_reg_maxiter = I32_MAX;
5403 PL_reg_leftiter = PL_reg_maxiter;
5404 }
5405
5406 if (PL_reg_leftiter-- == 0) {
5407 /* initialise cache */
5408 const I32 size = (PL_reg_maxiter + 7)/8;
5409 if (PL_reg_poscache) {
5410 if ((I32)PL_reg_poscache_size < size) {
5411 Renew(PL_reg_poscache, size, char);
5412 PL_reg_poscache_size = size;
5413 }
5414 Zero(PL_reg_poscache, size, char);
5415 }
5416 else {
5417 PL_reg_poscache_size = size;
5418 Newxz(PL_reg_poscache, size, char);
5419 }
5420 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5421 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5422 PL_colors[4], PL_colors[5])
5423 );
5424 }
5425
5426 if (PL_reg_leftiter < 0) {
5427 /* have we already failed at this position? */
5428 I32 offset, mask;
5429 offset = (scan->flags & 0xf) - 1
5430 + (locinput - PL_bostr) * (scan->flags>>4);
5431 mask = 1 << (offset % 8);
5432 offset /= 8;
5433 if (PL_reg_poscache[offset] & mask) {
5434 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5435 "%*s whilem: (cache) already tried at this position...\n",
5436 REPORT_CODE_OFF+depth*2, "")
5437 );
5438 sayNO; /* cache records failure */
5439 }
5440 ST.cache_offset = offset;
5441 ST.cache_mask = mask;
5442 }
5443 }
5444
5445 /* Prefer B over A for minimal matching. */
5446
5447 if (cur_curlyx->u.curlyx.minmod) {
5448 ST.save_curlyx = cur_curlyx;
5449 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5450 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5451 REGCP_SET(ST.lastcp);
5452 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5453 locinput);
5454 assert(0); /* NOTREACHED */
5455 }
5456
5457 /* Prefer A over B for maximal matching. */
5458
5459 if (n < max) { /* More greed allowed? */
5460 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5461 cur_curlyx->u.curlyx.lastloc = locinput;
5462 REGCP_SET(ST.lastcp);
5463 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5464 assert(0); /* NOTREACHED */
5465 }
5466 goto do_whilem_B_max;
5467 }
5468 assert(0); /* NOTREACHED */
5469
5470 case WHILEM_B_min: /* just matched B in a minimal match */
5471 case WHILEM_B_max: /* just matched B in a maximal match */
5472 cur_curlyx = ST.save_curlyx;
5473 sayYES;
5474 assert(0); /* NOTREACHED */
5475
5476 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5477 cur_curlyx = ST.save_curlyx;
5478 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5479 cur_curlyx->u.curlyx.count--;
5480 CACHEsayNO;
5481 assert(0); /* NOTREACHED */
5482
5483 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5484 /* FALL THROUGH */
5485 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5486 REGCP_UNWIND(ST.lastcp);
5487 regcppop(rex);
5488 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5489 cur_curlyx->u.curlyx.count--;
5490 CACHEsayNO;
5491 assert(0); /* NOTREACHED */
5492
5493 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5494 REGCP_UNWIND(ST.lastcp);
5495 regcppop(rex); /* Restore some previous $<digit>s? */
5496 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5497 "%*s whilem: failed, trying continuation...\n",
5498 REPORT_CODE_OFF+depth*2, "")
5499 );
5500 do_whilem_B_max:
5501 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5502 && ckWARN(WARN_REGEXP)
5503 && !(PL_reg_flags & RF_warned))
5504 {
5505 PL_reg_flags |= RF_warned;
5506 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5507 "Complex regular subexpression recursion limit (%d) "
5508 "exceeded",
5509 REG_INFTY - 1);
5510 }
5511
5512 /* now try B */
5513 ST.save_curlyx = cur_curlyx;
5514 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5515 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5516 locinput);
5517 assert(0); /* NOTREACHED */
5518
5519 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5520 cur_curlyx = ST.save_curlyx;
5521 REGCP_UNWIND(ST.lastcp);
5522 regcppop(rex);
5523
5524 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5525 /* Maximum greed exceeded */
5526 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5527 && ckWARN(WARN_REGEXP)
5528 && !(PL_reg_flags & RF_warned))
5529 {
5530 PL_reg_flags |= RF_warned;
5531 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5532 "Complex regular subexpression recursion "
5533 "limit (%d) exceeded",
5534 REG_INFTY - 1);
5535 }
5536 cur_curlyx->u.curlyx.count--;
5537 CACHEsayNO;
5538 }
5539
5540 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5541 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5542 );
5543 /* Try grabbing another A and see if it helps. */
5544 cur_curlyx->u.curlyx.lastloc = locinput;
5545 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5546 REGCP_SET(ST.lastcp);
5547 PUSH_STATE_GOTO(WHILEM_A_min,
5548 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5549 locinput);
5550 assert(0); /* NOTREACHED */
5551
5552#undef ST
5553#define ST st->u.branch
5554
5555 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5556 next = scan + ARG(scan);
5557 if (next == scan)
5558 next = NULL;
5559 scan = NEXTOPER(scan);
5560 /* FALL THROUGH */
5561
5562 case BRANCH: /* /(...|A|...)/ */
5563 scan = NEXTOPER(scan); /* scan now points to inner node */
5564 ST.lastparen = rex->lastparen;
5565 ST.lastcloseparen = rex->lastcloseparen;
5566 ST.next_branch = next;
5567 REGCP_SET(ST.cp);
5568
5569 /* Now go into the branch */
5570 if (has_cutgroup) {
5571 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5572 } else {
5573 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5574 }
5575 assert(0); /* NOTREACHED */
5576
5577 case CUTGROUP: /* /(*THEN)/ */
5578 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5579 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5580 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5581 assert(0); /* NOTREACHED */
5582
5583 case CUTGROUP_next_fail:
5584 do_cutgroup = 1;
5585 no_final = 1;
5586 if (st->u.mark.mark_name)
5587 sv_commit = st->u.mark.mark_name;
5588 sayNO;
5589 assert(0); /* NOTREACHED */
5590
5591 case BRANCH_next:
5592 sayYES;
5593 assert(0); /* NOTREACHED */
5594
5595 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5596 if (do_cutgroup) {
5597 do_cutgroup = 0;
5598 no_final = 0;
5599 }
5600 REGCP_UNWIND(ST.cp);
5601 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5602 scan = ST.next_branch;
5603 /* no more branches? */
5604 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5605 DEBUG_EXECUTE_r({
5606 PerlIO_printf( Perl_debug_log,
5607 "%*s %sBRANCH failed...%s\n",
5608 REPORT_CODE_OFF+depth*2, "",
5609 PL_colors[4],
5610 PL_colors[5] );
5611 });
5612 sayNO_SILENT;
5613 }
5614 continue; /* execute next BRANCH[J] op */
5615 assert(0); /* NOTREACHED */
5616
5617 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5618 minmod = 1;
5619 break;
5620
5621#undef ST
5622#define ST st->u.curlym
5623
5624 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5625
5626 /* This is an optimisation of CURLYX that enables us to push
5627 * only a single backtracking state, no matter how many matches
5628 * there are in {m,n}. It relies on the pattern being constant
5629 * length, with no parens to influence future backrefs
5630 */
5631
5632 ST.me = scan;
5633 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5634
5635 ST.lastparen = rex->lastparen;
5636 ST.lastcloseparen = rex->lastcloseparen;
5637
5638 /* if paren positive, emulate an OPEN/CLOSE around A */
5639 if (ST.me->flags) {
5640 U32 paren = ST.me->flags;
5641 if (paren > PL_regsize)
5642 PL_regsize = paren;
5643 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5644 }
5645 ST.A = scan;
5646 ST.B = next;
5647 ST.alen = 0;
5648 ST.count = 0;
5649 ST.minmod = minmod;
5650 minmod = 0;
5651 ST.c1 = CHRTEST_UNINIT;
5652 REGCP_SET(ST.cp);
5653
5654 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5655 goto curlym_do_B;
5656
5657 curlym_do_A: /* execute the A in /A{m,n}B/ */
5658 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5659 assert(0); /* NOTREACHED */
5660
5661 case CURLYM_A: /* we've just matched an A */
5662 ST.count++;
5663 /* after first match, determine A's length: u.curlym.alen */
5664 if (ST.count == 1) {
5665 if (PL_reg_match_utf8) {
5666 char *s = st->locinput;
5667 while (s < locinput) {
5668 ST.alen++;
5669 s += UTF8SKIP(s);
5670 }
5671 }
5672 else {
5673 ST.alen = locinput - st->locinput;
5674 }
5675 if (ST.alen == 0)
5676 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5677 }
5678 DEBUG_EXECUTE_r(
5679 PerlIO_printf(Perl_debug_log,
5680 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5681 (int)(REPORT_CODE_OFF+(depth*2)), "",
5682 (IV) ST.count, (IV)ST.alen)
5683 );
5684
5685 if (cur_eval && cur_eval->u.eval.close_paren &&
5686 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5687 goto fake_end;
5688
5689 {
5690 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5691 if ( max == REG_INFTY || ST.count < max )
5692 goto curlym_do_A; /* try to match another A */
5693 }
5694 goto curlym_do_B; /* try to match B */
5695
5696 case CURLYM_A_fail: /* just failed to match an A */
5697 REGCP_UNWIND(ST.cp);
5698
5699 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5700 || (cur_eval && cur_eval->u.eval.close_paren &&
5701 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5702 sayNO;
5703
5704 curlym_do_B: /* execute the B in /A{m,n}B/ */
5705 if (ST.c1 == CHRTEST_UNINIT) {
5706 /* calculate c1 and c2 for possible match of 1st char
5707 * following curly */
5708 ST.c1 = ST.c2 = CHRTEST_VOID;
5709 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5710 regnode *text_node = ST.B;
5711 if (! HAS_TEXT(text_node))
5712 FIND_NEXT_IMPT(text_node);
5713 /* this used to be
5714
5715 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5716
5717 But the former is redundant in light of the latter.
5718
5719 if this changes back then the macro for
5720 IS_TEXT and friends need to change.
5721 */
5722 if (PL_regkind[OP(text_node)] == EXACT) {
5723 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5724 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5725 {
5726 sayNO;
5727 }
5728 }
5729 }
5730 }
5731
5732 DEBUG_EXECUTE_r(
5733 PerlIO_printf(Perl_debug_log,
5734 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5735 (int)(REPORT_CODE_OFF+(depth*2)),
5736 "", (IV)ST.count)
5737 );
5738 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5739 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5740 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5741 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5742 {
5743 /* simulate B failing */
5744 DEBUG_OPTIMISE_r(
5745 PerlIO_printf(Perl_debug_log,
5746 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5747 (int)(REPORT_CODE_OFF+(depth*2)),"",
5748 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5749 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5750 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5751 );
5752 state_num = CURLYM_B_fail;
5753 goto reenter_switch;
5754 }
5755 }
5756 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5757 /* simulate B failing */
5758 DEBUG_OPTIMISE_r(
5759 PerlIO_printf(Perl_debug_log,
5760 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5761 (int)(REPORT_CODE_OFF+(depth*2)),"",
5762 (int) nextchr, ST.c1, ST.c2)
5763 );
5764 state_num = CURLYM_B_fail;
5765 goto reenter_switch;
5766 }
5767 }
5768
5769 if (ST.me->flags) {
5770 /* emulate CLOSE: mark current A as captured */
5771 I32 paren = ST.me->flags;
5772 if (ST.count) {
5773 rex->offs[paren].start
5774 = HOPc(locinput, -ST.alen) - PL_bostr;
5775 rex->offs[paren].end = locinput - PL_bostr;
5776 if ((U32)paren > rex->lastparen)
5777 rex->lastparen = paren;
5778 rex->lastcloseparen = paren;
5779 }
5780 else
5781 rex->offs[paren].end = -1;
5782 if (cur_eval && cur_eval->u.eval.close_paren &&
5783 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5784 {
5785 if (ST.count)
5786 goto fake_end;
5787 else
5788 sayNO;
5789 }
5790 }
5791
5792 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5793 assert(0); /* NOTREACHED */
5794
5795 case CURLYM_B_fail: /* just failed to match a B */
5796 REGCP_UNWIND(ST.cp);
5797 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5798 if (ST.minmod) {
5799 I32 max = ARG2(ST.me);
5800 if (max != REG_INFTY && ST.count == max)
5801 sayNO;
5802 goto curlym_do_A; /* try to match a further A */
5803 }
5804 /* backtrack one A */
5805 if (ST.count == ARG1(ST.me) /* min */)
5806 sayNO;
5807 ST.count--;
5808 SET_locinput(HOPc(locinput, -ST.alen));
5809 goto curlym_do_B; /* try to match B */
5810
5811#undef ST
5812#define ST st->u.curly
5813
5814#define CURLY_SETPAREN(paren, success) \
5815 if (paren) { \
5816 if (success) { \
5817 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5818 rex->offs[paren].end = locinput - PL_bostr; \
5819 if (paren > rex->lastparen) \
5820 rex->lastparen = paren; \
5821 rex->lastcloseparen = paren; \
5822 } \
5823 else { \
5824 rex->offs[paren].end = -1; \
5825 rex->lastparen = ST.lastparen; \
5826 rex->lastcloseparen = ST.lastcloseparen; \
5827 } \
5828 }
5829
5830 case STAR: /* /A*B/ where A is width 1 char */
5831 ST.paren = 0;
5832 ST.min = 0;
5833 ST.max = REG_INFTY;
5834 scan = NEXTOPER(scan);
5835 goto repeat;
5836
5837 case PLUS: /* /A+B/ where A is width 1 char */
5838 ST.paren = 0;
5839 ST.min = 1;
5840 ST.max = REG_INFTY;
5841 scan = NEXTOPER(scan);
5842 goto repeat;
5843
5844 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5845 ST.paren = scan->flags; /* Which paren to set */
5846 ST.lastparen = rex->lastparen;
5847 ST.lastcloseparen = rex->lastcloseparen;
5848 if (ST.paren > PL_regsize)
5849 PL_regsize = ST.paren;
5850 ST.min = ARG1(scan); /* min to match */
5851 ST.max = ARG2(scan); /* max to match */
5852 if (cur_eval && cur_eval->u.eval.close_paren &&
5853 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5854 ST.min=1;
5855 ST.max=1;
5856 }
5857 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5858 goto repeat;
5859
5860 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5861 ST.paren = 0;
5862 ST.min = ARG1(scan); /* min to match */
5863 ST.max = ARG2(scan); /* max to match */
5864 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5865 repeat:
5866 /*
5867 * Lookahead to avoid useless match attempts
5868 * when we know what character comes next.
5869 *
5870 * Used to only do .*x and .*?x, but now it allows
5871 * for )'s, ('s and (?{ ... })'s to be in the way
5872 * of the quantifier and the EXACT-like node. -- japhy
5873 */
5874
5875 assert(ST.min <= ST.max);
5876 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5877 ST.c1 = ST.c2 = CHRTEST_VOID;
5878 }
5879 else {
5880 regnode *text_node = next;
5881
5882 if (! HAS_TEXT(text_node))
5883 FIND_NEXT_IMPT(text_node);
5884
5885 if (! HAS_TEXT(text_node))
5886 ST.c1 = ST.c2 = CHRTEST_VOID;
5887 else {
5888 if ( PL_regkind[OP(text_node)] != EXACT ) {
5889 ST.c1 = ST.c2 = CHRTEST_VOID;
5890 }
5891 else {
5892
5893 /* Currently we only get here when
5894
5895 PL_rekind[OP(text_node)] == EXACT
5896
5897 if this changes back then the macro for IS_TEXT and
5898 friends need to change. */
5899 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5900 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5901 {
5902 sayNO;
5903 }
5904 }
5905 }
5906 }
5907
5908 ST.A = scan;
5909 ST.B = next;
5910 if (minmod) {
5911 char *li = locinput;
5912 minmod = 0;
5913 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5914 sayNO;
5915 SET_locinput(li);
5916 ST.count = ST.min;
5917 REGCP_SET(ST.cp);
5918 if (ST.c1 == CHRTEST_VOID)
5919 goto curly_try_B_min;
5920
5921 ST.oldloc = locinput;
5922
5923 /* set ST.maxpos to the furthest point along the
5924 * string that could possibly match */
5925 if (ST.max == REG_INFTY) {
5926 ST.maxpos = PL_regeol - 1;
5927 if (utf8_target)
5928 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5929 ST.maxpos--;
5930 }
5931 else if (utf8_target) {
5932 int m = ST.max - ST.min;
5933 for (ST.maxpos = locinput;
5934 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5935 ST.maxpos += UTF8SKIP(ST.maxpos);
5936 }
5937 else {
5938 ST.maxpos = locinput + ST.max - ST.min;
5939 if (ST.maxpos >= PL_regeol)
5940 ST.maxpos = PL_regeol - 1;
5941 }
5942 goto curly_try_B_min_known;
5943
5944 }
5945 else {
5946 /* avoid taking address of locinput, so it can remain
5947 * a register var */
5948 char *li = locinput;
5949 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5950 if (ST.count < ST.min)
5951 sayNO;
5952 SET_locinput(li);
5953 if ((ST.count > ST.min)
5954 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5955 {
5956 /* A{m,n} must come at the end of the string, there's
5957 * no point in backing off ... */
5958 ST.min = ST.count;
5959 /* ...except that $ and \Z can match before *and* after
5960 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5961 We may back off by one in this case. */
5962 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5963 ST.min--;
5964 }
5965 REGCP_SET(ST.cp);
5966 goto curly_try_B_max;
5967 }
5968 assert(0); /* NOTREACHED */
5969
5970
5971 case CURLY_B_min_known_fail:
5972 /* failed to find B in a non-greedy match where c1,c2 valid */
5973
5974 REGCP_UNWIND(ST.cp);
5975 if (ST.paren) {
5976 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5977 }
5978 /* Couldn't or didn't -- move forward. */
5979 ST.oldloc = locinput;
5980 if (utf8_target)
5981 locinput += UTF8SKIP(locinput);
5982 else
5983 locinput++;
5984 ST.count++;
5985 curly_try_B_min_known:
5986 /* find the next place where 'B' could work, then call B */
5987 {
5988 int n;
5989 if (utf8_target) {
5990 n = (ST.oldloc == locinput) ? 0 : 1;
5991 if (ST.c1 == ST.c2) {
5992 /* set n to utf8_distance(oldloc, locinput) */
5993 while (locinput <= ST.maxpos
5994 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5995 {
5996 locinput += UTF8SKIP(locinput);
5997 n++;
5998 }
5999 }
6000 else {
6001 /* set n to utf8_distance(oldloc, locinput) */
6002 while (locinput <= ST.maxpos
6003 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6004 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6005 {
6006 locinput += UTF8SKIP(locinput);
6007 n++;
6008 }
6009 }
6010 }
6011 else { /* Not utf8_target */
6012 if (ST.c1 == ST.c2) {
6013 while (locinput <= ST.maxpos &&
6014 UCHARAT(locinput) != ST.c1)
6015 locinput++;
6016 }
6017 else {
6018 while (locinput <= ST.maxpos
6019 && UCHARAT(locinput) != ST.c1
6020 && UCHARAT(locinput) != ST.c2)
6021 locinput++;
6022 }
6023 n = locinput - ST.oldloc;
6024 }
6025 if (locinput > ST.maxpos)
6026 sayNO;
6027 if (n) {
6028 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6029 * at b; check that everything between oldloc and
6030 * locinput matches */
6031 char *li = ST.oldloc;
6032 ST.count += n;
6033 if (regrepeat(rex, &li, ST.A, n, depth) < n)
6034 sayNO;
6035 assert(n == REG_INFTY || locinput == li);
6036 }
6037 CURLY_SETPAREN(ST.paren, ST.count);
6038 if (cur_eval && cur_eval->u.eval.close_paren &&
6039 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6040 goto fake_end;
6041 }
6042 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6043 }
6044 assert(0); /* NOTREACHED */
6045
6046
6047 case CURLY_B_min_fail:
6048 /* failed to find B in a non-greedy match where c1,c2 invalid */
6049
6050 REGCP_UNWIND(ST.cp);
6051 if (ST.paren) {
6052 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6053 }
6054 /* failed -- move forward one */
6055 {
6056 char *li = locinput;
6057 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6058 sayNO;
6059 }
6060 locinput = li;
6061 }
6062 {
6063 ST.count++;
6064 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6065 ST.count > 0)) /* count overflow ? */
6066 {
6067 curly_try_B_min:
6068 CURLY_SETPAREN(ST.paren, ST.count);
6069 if (cur_eval && cur_eval->u.eval.close_paren &&
6070 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6071 goto fake_end;
6072 }
6073 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6074 }
6075 }
6076 sayNO;
6077 assert(0); /* NOTREACHED */
6078
6079
6080 curly_try_B_max:
6081 /* a successful greedy match: now try to match B */
6082 if (cur_eval && cur_eval->u.eval.close_paren &&
6083 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6084 goto fake_end;
6085 }
6086 {
6087 bool could_match = locinput < PL_regeol;
6088
6089 /* If it could work, try it. */
6090 if (ST.c1 != CHRTEST_VOID && could_match) {
6091 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6092 {
6093 could_match = memEQ(locinput,
6094 ST.c1_utf8,
6095 UTF8SKIP(locinput))
6096 || memEQ(locinput,
6097 ST.c2_utf8,
6098 UTF8SKIP(locinput));
6099 }
6100 else {
6101 could_match = UCHARAT(locinput) == ST.c1
6102 || UCHARAT(locinput) == ST.c2;
6103 }
6104 }
6105 if (ST.c1 == CHRTEST_VOID || could_match) {
6106 CURLY_SETPAREN(ST.paren, ST.count);
6107 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6108 assert(0); /* NOTREACHED */
6109 }
6110 }
6111 /* FALL THROUGH */
6112
6113 case CURLY_B_max_fail:
6114 /* failed to find B in a greedy match */
6115
6116 REGCP_UNWIND(ST.cp);
6117 if (ST.paren) {
6118 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6119 }
6120 /* back up. */
6121 if (--ST.count < ST.min)
6122 sayNO;
6123 locinput = HOPc(locinput, -1);
6124 goto curly_try_B_max;
6125
6126#undef ST
6127
6128 case END: /* last op of main pattern */
6129 fake_end:
6130 if (cur_eval) {
6131 /* we've just finished A in /(??{A})B/; now continue with B */
6132 st->u.eval.toggle_reg_flags
6133 = cur_eval->u.eval.toggle_reg_flags;
6134 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6135
6136 st->u.eval.prev_rex = rex_sv; /* inner */
6137 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6138 rex_sv = cur_eval->u.eval.prev_rex;
6139 SET_reg_curpm(rex_sv);
6140 rex = ReANY(rex_sv);
6141 rexi = RXi_GET(rex);
6142 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6143
6144 REGCP_SET(st->u.eval.lastcp);
6145
6146 /* Restore parens of the outer rex without popping the
6147 * savestack */
6148 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6149
6150 st->u.eval.prev_eval = cur_eval;
6151 cur_eval = cur_eval->u.eval.prev_eval;
6152 DEBUG_EXECUTE_r(
6153 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6154 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6155 if ( nochange_depth )
6156 nochange_depth--;
6157
6158 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6159 locinput); /* match B */
6160 }
6161
6162 if (locinput < reginfo->till) {
6163 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6164 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6165 PL_colors[4],
6166 (long)(locinput - PL_reg_starttry),
6167 (long)(reginfo->till - PL_reg_starttry),
6168 PL_colors[5]));
6169
6170 sayNO_SILENT; /* Cannot match: too short. */
6171 }
6172 sayYES; /* Success! */
6173
6174 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6175 DEBUG_EXECUTE_r(
6176 PerlIO_printf(Perl_debug_log,
6177 "%*s %ssubpattern success...%s\n",
6178 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6179 sayYES; /* Success! */
6180
6181#undef ST
6182#define ST st->u.ifmatch
6183
6184 {
6185 char *newstart;
6186
6187 case SUSPEND: /* (?>A) */
6188 ST.wanted = 1;
6189 newstart = locinput;
6190 goto do_ifmatch;
6191
6192 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6193 ST.wanted = 0;
6194 goto ifmatch_trivial_fail_test;
6195
6196 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6197 ST.wanted = 1;
6198 ifmatch_trivial_fail_test:
6199 if (scan->flags) {
6200 char * const s = HOPBACKc(locinput, scan->flags);
6201 if (!s) {
6202 /* trivial fail */
6203 if (logical) {
6204 logical = 0;
6205 sw = 1 - cBOOL(ST.wanted);
6206 }
6207 else if (ST.wanted)
6208 sayNO;
6209 next = scan + ARG(scan);
6210 if (next == scan)
6211 next = NULL;
6212 break;
6213 }
6214 newstart = s;
6215 }
6216 else
6217 newstart = locinput;
6218
6219 do_ifmatch:
6220 ST.me = scan;
6221 ST.logical = logical;
6222 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6223
6224 /* execute body of (?...A) */
6225 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6226 assert(0); /* NOTREACHED */
6227 }
6228
6229 case IFMATCH_A_fail: /* body of (?...A) failed */
6230 ST.wanted = !ST.wanted;
6231 /* FALL THROUGH */
6232
6233 case IFMATCH_A: /* body of (?...A) succeeded */
6234 if (ST.logical) {
6235 sw = cBOOL(ST.wanted);
6236 }
6237 else if (!ST.wanted)
6238 sayNO;
6239
6240 if (OP(ST.me) != SUSPEND) {
6241 /* restore old position except for (?>...) */
6242 locinput = st->locinput;
6243 }
6244 scan = ST.me + ARG(ST.me);
6245 if (scan == ST.me)
6246 scan = NULL;
6247 continue; /* execute B */
6248
6249#undef ST
6250
6251 case LONGJMP: /* alternative with many branches compiles to
6252 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6253 next = scan + ARG(scan);
6254 if (next == scan)
6255 next = NULL;
6256 break;
6257
6258 case COMMIT: /* (*COMMIT) */
6259 reginfo->cutpoint = PL_regeol;
6260 /* FALLTHROUGH */
6261
6262 case PRUNE: /* (*PRUNE) */
6263 if (!scan->flags)
6264 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6265 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6266 assert(0); /* NOTREACHED */
6267
6268 case COMMIT_next_fail:
6269 no_final = 1;
6270 /* FALLTHROUGH */
6271
6272 case OPFAIL: /* (*FAIL) */
6273 sayNO;
6274 assert(0); /* NOTREACHED */
6275
6276#define ST st->u.mark
6277 case MARKPOINT: /* (*MARK:foo) */
6278 ST.prev_mark = mark_state;
6279 ST.mark_name = sv_commit = sv_yes_mark
6280 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6281 mark_state = st;
6282 ST.mark_loc = locinput;
6283 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6284 assert(0); /* NOTREACHED */
6285
6286 case MARKPOINT_next:
6287 mark_state = ST.prev_mark;
6288 sayYES;
6289 assert(0); /* NOTREACHED */
6290
6291 case MARKPOINT_next_fail:
6292 if (popmark && sv_eq(ST.mark_name,popmark))
6293 {
6294 if (ST.mark_loc > startpoint)
6295 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6296 popmark = NULL; /* we found our mark */
6297 sv_commit = ST.mark_name;
6298
6299 DEBUG_EXECUTE_r({
6300 PerlIO_printf(Perl_debug_log,
6301 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6302 REPORT_CODE_OFF+depth*2, "",
6303 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6304 });
6305 }
6306 mark_state = ST.prev_mark;
6307 sv_yes_mark = mark_state ?
6308 mark_state->u.mark.mark_name : NULL;
6309 sayNO;
6310 assert(0); /* NOTREACHED */
6311
6312 case SKIP: /* (*SKIP) */
6313 if (scan->flags) {
6314 /* (*SKIP) : if we fail we cut here*/
6315 ST.mark_name = NULL;
6316 ST.mark_loc = locinput;
6317 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6318 } else {
6319 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6320 otherwise do nothing. Meaning we need to scan
6321 */
6322 regmatch_state *cur = mark_state;
6323 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6324
6325 while (cur) {
6326 if ( sv_eq( cur->u.mark.mark_name,
6327 find ) )
6328 {
6329 ST.mark_name = find;
6330 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6331 }
6332 cur = cur->u.mark.prev_mark;
6333 }
6334 }
6335 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6336 break;
6337
6338 case SKIP_next_fail:
6339 if (ST.mark_name) {
6340 /* (*CUT:NAME) - Set up to search for the name as we
6341 collapse the stack*/
6342 popmark = ST.mark_name;
6343 } else {
6344 /* (*CUT) - No name, we cut here.*/
6345 if (ST.mark_loc > startpoint)
6346 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6347 /* but we set sv_commit to latest mark_name if there
6348 is one so they can test to see how things lead to this
6349 cut */
6350 if (mark_state)
6351 sv_commit=mark_state->u.mark.mark_name;
6352 }
6353 no_final = 1;
6354 sayNO;
6355 assert(0); /* NOTREACHED */
6356#undef ST
6357
6358 case LNBREAK: /* \R */
6359 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6360 locinput += n;
6361 } else
6362 sayNO;
6363 break;
6364
6365#define CASE_CLASS(nAmE) \
6366 case nAmE: \
6367 if (NEXTCHR_IS_EOS) \
6368 sayNO; \
6369 if ((n=is_##nAmE(locinput,utf8_target))) { \
6370 locinput += n; \
6371 } else \
6372 sayNO; \
6373 break; \
6374 case N##nAmE: \
6375 if (NEXTCHR_IS_EOS) \
6376 sayNO; \
6377 if ((n=is_##nAmE(locinput,utf8_target))) { \
6378 sayNO; \
6379 } else { \
6380 locinput += UTF8SKIP(locinput); \
6381 } \
6382 break
6383
6384 CASE_CLASS(VERTWS); /* \v \V */
6385 CASE_CLASS(HORIZWS); /* \h \H */
6386#undef CASE_CLASS
6387
6388 default:
6389 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6390 PTR2UV(scan), OP(scan));
6391 Perl_croak(aTHX_ "regexp memory corruption");
6392
6393 /* this is a point to jump to in order to increment
6394 * locinput by one character */
6395 increment_locinput:
6396 assert(!NEXTCHR_IS_EOS);
6397 if (utf8_target) {
6398 locinput += PL_utf8skip[nextchr];
6399 /* locinput is allowed to go 1 char off the end, but not 2+ */
6400 if (locinput > PL_regeol)
6401 sayNO;
6402 }
6403 else
6404 locinput++;
6405 break;
6406
6407 } /* end switch */
6408
6409 /* switch break jumps here */
6410 scan = next; /* prepare to execute the next op and ... */
6411 continue; /* ... jump back to the top, reusing st */
6412 assert(0); /* NOTREACHED */
6413
6414 push_yes_state:
6415 /* push a state that backtracks on success */
6416 st->u.yes.prev_yes_state = yes_state;
6417 yes_state = st;
6418 /* FALL THROUGH */
6419 push_state:
6420 /* push a new regex state, then continue at scan */
6421 {
6422 regmatch_state *newst;
6423
6424 DEBUG_STACK_r({
6425 regmatch_state *cur = st;
6426 regmatch_state *curyes = yes_state;
6427 int curd = depth;
6428 regmatch_slab *slab = PL_regmatch_slab;
6429 for (;curd > -1;cur--,curd--) {
6430 if (cur < SLAB_FIRST(slab)) {
6431 slab = slab->prev;
6432 cur = SLAB_LAST(slab);
6433 }
6434 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6435 REPORT_CODE_OFF + 2 + depth * 2,"",
6436 curd, PL_reg_name[cur->resume_state],
6437 (curyes == cur) ? "yes" : ""
6438 );
6439 if (curyes == cur)
6440 curyes = cur->u.yes.prev_yes_state;
6441 }
6442 } else
6443 DEBUG_STATE_pp("push")
6444 );
6445 depth++;
6446 st->locinput = locinput;
6447 newst = st+1;
6448 if (newst > SLAB_LAST(PL_regmatch_slab))
6449 newst = S_push_slab(aTHX);
6450 PL_regmatch_state = newst;
6451
6452 locinput = pushinput;
6453 st = newst;
6454 continue;
6455 assert(0); /* NOTREACHED */
6456 }
6457 }
6458
6459 /*
6460 * We get here only if there's trouble -- normally "case END" is
6461 * the terminating point.
6462 */
6463 Perl_croak(aTHX_ "corrupted regexp pointers");
6464 /*NOTREACHED*/
6465 sayNO;
6466
6467yes:
6468 if (yes_state) {
6469 /* we have successfully completed a subexpression, but we must now
6470 * pop to the state marked by yes_state and continue from there */
6471 assert(st != yes_state);
6472#ifdef DEBUGGING
6473 while (st != yes_state) {
6474 st--;
6475 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6476 PL_regmatch_slab = PL_regmatch_slab->prev;
6477 st = SLAB_LAST(PL_regmatch_slab);
6478 }
6479 DEBUG_STATE_r({
6480 if (no_final) {
6481 DEBUG_STATE_pp("pop (no final)");
6482 } else {
6483 DEBUG_STATE_pp("pop (yes)");
6484 }
6485 });
6486 depth--;
6487 }
6488#else
6489 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6490 || yes_state > SLAB_LAST(PL_regmatch_slab))
6491 {
6492 /* not in this slab, pop slab */
6493 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6494 PL_regmatch_slab = PL_regmatch_slab->prev;
6495 st = SLAB_LAST(PL_regmatch_slab);
6496 }
6497 depth -= (st - yes_state);
6498#endif
6499 st = yes_state;
6500 yes_state = st->u.yes.prev_yes_state;
6501 PL_regmatch_state = st;
6502
6503 if (no_final)
6504 locinput= st->locinput;
6505 state_num = st->resume_state + no_final;
6506 goto reenter_switch;
6507 }
6508
6509 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6510 PL_colors[4], PL_colors[5]));
6511
6512 if (PL_reg_state.re_state_eval_setup_done) {
6513 /* each successfully executed (?{...}) block does the equivalent of
6514 * local $^R = do {...}
6515 * When popping the save stack, all these locals would be undone;
6516 * bypass this by setting the outermost saved $^R to the latest
6517 * value */
6518 if (oreplsv != GvSV(PL_replgv))
6519 sv_setsv(oreplsv, GvSV(PL_replgv));
6520 }
6521 result = 1;
6522 goto final_exit;
6523
6524no:
6525 DEBUG_EXECUTE_r(
6526 PerlIO_printf(Perl_debug_log,
6527 "%*s %sfailed...%s\n",
6528 REPORT_CODE_OFF+depth*2, "",
6529 PL_colors[4], PL_colors[5])
6530 );
6531
6532no_silent:
6533 if (no_final) {
6534 if (yes_state) {
6535 goto yes;
6536 } else {
6537 goto final_exit;
6538 }
6539 }
6540 if (depth) {
6541 /* there's a previous state to backtrack to */
6542 st--;
6543 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6544 PL_regmatch_slab = PL_regmatch_slab->prev;
6545 st = SLAB_LAST(PL_regmatch_slab);
6546 }
6547 PL_regmatch_state = st;
6548 locinput= st->locinput;
6549
6550 DEBUG_STATE_pp("pop");
6551 depth--;
6552 if (yes_state == st)
6553 yes_state = st->u.yes.prev_yes_state;
6554
6555 state_num = st->resume_state + 1; /* failure = success + 1 */
6556 goto reenter_switch;
6557 }
6558 result = 0;
6559
6560 final_exit:
6561 if (rex->intflags & PREGf_VERBARG_SEEN) {
6562 SV *sv_err = get_sv("REGERROR", 1);
6563 SV *sv_mrk = get_sv("REGMARK", 1);
6564 if (result) {
6565 sv_commit = &PL_sv_no;
6566 if (!sv_yes_mark)
6567 sv_yes_mark = &PL_sv_yes;
6568 } else {
6569 if (!sv_commit)
6570 sv_commit = &PL_sv_yes;
6571 sv_yes_mark = &PL_sv_no;
6572 }
6573 sv_setsv(sv_err, sv_commit);
6574 sv_setsv(sv_mrk, sv_yes_mark);
6575 }
6576
6577
6578 if (last_pushed_cv) {
6579 dSP;
6580 POP_MULTICALL;
6581 PERL_UNUSED_VAR(SP);
6582 }
6583
6584 /* clean up; in particular, free all slabs above current one */
6585 LEAVE_SCOPE(oldsave);
6586
6587 assert(!result || locinput - PL_bostr >= 0);
6588 return result ? locinput - PL_bostr : -1;
6589}
6590
6591/*
6592 - regrepeat - repeatedly match something simple, report how many
6593 *
6594 * What 'simple' means is a node which can be the operand of a quantifier like
6595 * '+', or {1,3}
6596 *
6597 * startposp - pointer a pointer to the start position. This is updated
6598 * to point to the byte following the highest successful
6599 * match.
6600 * p - the regnode to be repeatedly matched against.
6601 * max - maximum number of things to match.
6602 * depth - (for debugging) backtracking depth.
6603 */
6604STATIC I32
6605S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6606{
6607 dVAR;
6608 char *scan; /* Pointer to current position in target string */
6609 I32 c;
6610 char *loceol = PL_regeol; /* local version */
6611 I32 hardcount = 0; /* How many matches so far */
6612 bool utf8_target = PL_reg_match_utf8;
6613 UV utf8_flags;
6614#ifndef DEBUGGING
6615 PERL_UNUSED_ARG(depth);
6616#endif
6617
6618 PERL_ARGS_ASSERT_REGREPEAT;
6619
6620 scan = *startposp;
6621 if (max == REG_INFTY)
6622 max = I32_MAX;
6623 else if (! utf8_target && scan + max < loceol)
6624 loceol = scan + max;
6625
6626 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
6627 * to the maximum of how far we should go in it (leaving it set to the real
6628 * end, if the maximum permissible would take us beyond that). This allows
6629 * us to make the loop exit condition that we haven't gone past <loceol> to
6630 * also mean that we haven't exceeded the max permissible count, saving a
6631 * test each time through the loop. But it assumes that the OP matches a
6632 * single byte, which is true for most of the OPs below when applied to a
6633 * non-UTF-8 target. Those relatively few OPs that don't have this
6634 * characteristic will have to compensate.
6635 *
6636 * There is no adjustment for UTF-8 targets, as the number of bytes per
6637 * character varies. OPs will have to test both that the count is less
6638 * than the max permissible (using <hardcount> to keep track), and that we
6639 * are still within the bounds of the string (using <loceol>. A few OPs
6640 * match a single byte no matter what the encoding. They can omit the max
6641 * test if, for the UTF-8 case, they do the adjustment that was skipped
6642 * above.
6643 *
6644 * Thus, the code above sets things up for the common case; and exceptional
6645 * cases need extra work; the common case is to make sure <scan> doesn't
6646 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
6647 * count doesn't exceed the maximum permissible */
6648
6649 switch (OP(p)) {
6650 case REG_ANY:
6651 if (utf8_target) {
6652 while (scan < loceol && hardcount < max && *scan != '\n') {
6653 scan += UTF8SKIP(scan);
6654 hardcount++;
6655 }
6656 } else {
6657 while (scan < loceol && *scan != '\n')
6658 scan++;
6659 }
6660 break;
6661 case SANY:
6662 if (utf8_target) {
6663 while (scan < loceol && hardcount < max) {
6664 scan += UTF8SKIP(scan);
6665 hardcount++;
6666 }
6667 }
6668 else
6669 scan = loceol;
6670 break;
6671 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
6672 if (utf8_target && scan + max < loceol) {
6673
6674 /* <loceol> hadn't been adjusted in the UTF-8 case */
6675 scan += max;
6676 }
6677 else {
6678 scan = loceol;
6679 }
6680 break;
6681 case EXACT:
6682 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6683
6684 c = (U8)*STRING(p);
6685
6686 /* Can use a simple loop if the pattern char to match on is invariant
6687 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6688 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6689 * true iff it doesn't matter if the argument is in UTF-8 or not */
6690 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6691 if (utf8_target && scan + max < loceol) {
6692 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6693 * since here, to match at all, 1 char == 1 byte */
6694 loceol = scan + max;
6695 }
6696 while (scan < loceol && UCHARAT(scan) == c) {
6697 scan++;
6698 }
6699 }
6700 else if (UTF_PATTERN) {
6701 if (utf8_target) {
6702 STRLEN scan_char_len;
6703
6704 /* When both target and pattern are UTF-8, we have to do
6705 * string EQ */
6706 while (hardcount < max
6707 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6708 && scan_char_len <= STR_LEN(p)
6709 && memEQ(scan, STRING(p), scan_char_len))
6710 {
6711 scan += scan_char_len;
6712 hardcount++;
6713 }
6714 }
6715 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6716
6717 /* Target isn't utf8; convert the character in the UTF-8
6718 * pattern to non-UTF8, and do a simple loop */
6719 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6720 while (scan < loceol && UCHARAT(scan) == c) {
6721 scan++;
6722 }
6723 } /* else pattern char is above Latin1, can't possibly match the
6724 non-UTF-8 target */
6725 }
6726 else {
6727
6728 /* Here, the string must be utf8; pattern isn't, and <c> is
6729 * different in utf8 than not, so can't compare them directly.
6730 * Outside the loop, find the two utf8 bytes that represent c, and
6731 * then look for those in sequence in the utf8 string */
6732 U8 high = UTF8_TWO_BYTE_HI(c);
6733 U8 low = UTF8_TWO_BYTE_LO(c);
6734
6735 while (hardcount < max
6736 && scan + 1 < loceol
6737 && UCHARAT(scan) == high
6738 && UCHARAT(scan + 1) == low)
6739 {
6740 scan += 2;
6741 hardcount++;
6742 }
6743 }
6744 break;
6745
6746 case EXACTFA:
6747 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6748 goto do_exactf;
6749
6750 case EXACTFL:
6751 PL_reg_flags |= RF_tainted;
6752 utf8_flags = FOLDEQ_UTF8_LOCALE;
6753 goto do_exactf;
6754
6755 case EXACTF:
6756 utf8_flags = 0;
6757 goto do_exactf;
6758
6759 case EXACTFU_SS:
6760 case EXACTFU_TRICKYFOLD:
6761 case EXACTFU:
6762 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6763
6764 do_exactf: {
6765 int c1, c2;
6766 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6767
6768 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6769
6770 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6771 if (c1 == CHRTEST_VOID) {
6772 /* Use full Unicode fold matching */
6773 char *tmpeol = PL_regeol;
6774 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6775 while (hardcount < max
6776 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6777 STRING(p), NULL, pat_len,
6778 cBOOL(UTF_PATTERN), utf8_flags))
6779 {
6780 scan = tmpeol;
6781 tmpeol = PL_regeol;
6782 hardcount++;
6783 }
6784 }
6785 else if (utf8_target) {
6786 if (c1 == c2) {
6787 while (scan < loceol
6788 && hardcount < max
6789 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6790 {
6791 scan += UTF8SKIP(scan);
6792 hardcount++;
6793 }
6794 }
6795 else {
6796 while (scan < loceol
6797 && hardcount < max
6798 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6799 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6800 {
6801 scan += UTF8SKIP(scan);
6802 hardcount++;
6803 }
6804 }
6805 }
6806 else if (c1 == c2) {
6807 while (scan < loceol && UCHARAT(scan) == c1) {
6808 scan++;
6809 }
6810 }
6811 else {
6812 while (scan < loceol &&
6813 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6814 {
6815 scan++;
6816 }
6817 }
6818 }
6819 break;
6820 }
6821 case ANYOF:
6822 if (utf8_target) {
6823 STRLEN inclasslen;
6824 while (hardcount < max
6825 && scan + (inclasslen = UTF8SKIP(scan)) <= loceol
6826 && reginclass(prog, p, (U8*)scan, utf8_target))
6827 {
6828 scan += inclasslen;
6829 hardcount++;
6830 }
6831 } else {
6832 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6833 scan++;
6834 }
6835 break;
6836 case ALNUMU:
6837 if (utf8_target) {
6838 utf8_wordchar:
6839 LOAD_UTF8_CHARCLASS_ALNUM();
6840 while (hardcount < max && scan < loceol &&
6841 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6842 {
6843 scan += UTF8SKIP(scan);
6844 hardcount++;
6845 }
6846 } else {
6847 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6848 scan++;
6849 }
6850 }
6851 break;
6852 case ALNUM:
6853 if (utf8_target)
6854 goto utf8_wordchar;
6855 while (scan < loceol && isALNUM((U8) *scan)) {
6856 scan++;
6857 }
6858 break;
6859 case ALNUMA:
6860 if (utf8_target && scan + max < loceol) {
6861
6862 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6863 * since here, to match, 1 char == 1 byte */
6864 loceol = scan + max;
6865 }
6866 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6867 scan++;
6868 }
6869 break;
6870 case ALNUML:
6871 PL_reg_flags |= RF_tainted;
6872 if (utf8_target) {
6873 while (hardcount < max && scan < loceol &&
6874 isALNUM_LC_utf8((U8*)scan)) {
6875 scan += UTF8SKIP(scan);
6876 hardcount++;
6877 }
6878 } else {
6879 while (scan < loceol && isALNUM_LC(*scan))
6880 scan++;
6881 }
6882 break;
6883 case NALNUMU:
6884 if (utf8_target) {
6885
6886 utf8_Nwordchar:
6887
6888 LOAD_UTF8_CHARCLASS_ALNUM();
6889 while (hardcount < max && scan < loceol &&
6890 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6891 {
6892 scan += UTF8SKIP(scan);
6893 hardcount++;
6894 }
6895 } else {
6896 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6897 scan++;
6898 }
6899 }
6900 break;
6901 case NALNUM:
6902 if (utf8_target)
6903 goto utf8_Nwordchar;
6904 while (scan < loceol && ! isALNUM((U8) *scan)) {
6905 scan++;
6906 }
6907 break;
6908
6909 case POSIXA:
6910 if (utf8_target && scan + max < loceol) {
6911
6912 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6913 * since here, to match, 1 char == 1 byte */
6914 loceol = scan + max;
6915 }
6916 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6917 scan++;
6918 }
6919 break;
6920 case NPOSIXA:
6921 if (utf8_target) {
6922 while (scan < loceol && hardcount < max
6923 && ! _generic_isCC_A((U8) *scan, FLAGS(p)))
6924 {
6925 scan += UTF8SKIP(scan);
6926 hardcount++;
6927 }
6928 }
6929 else {
6930 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6931 scan++;
6932 }
6933 }
6934 break;
6935 case NALNUMA:
6936 if (utf8_target) {
6937 while (scan < loceol && hardcount < max
6938 && ! isWORDCHAR_A((U8) *scan))
6939 {
6940 scan += UTF8SKIP(scan);
6941 hardcount++;
6942 }
6943 }
6944 else {
6945 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6946 scan++;
6947 }
6948 }
6949 break;
6950 case NALNUML:
6951 PL_reg_flags |= RF_tainted;
6952 if (utf8_target) {
6953 while (hardcount < max && scan < loceol &&
6954 !isALNUM_LC_utf8((U8*)scan)) {
6955 scan += UTF8SKIP(scan);
6956 hardcount++;
6957 }
6958 } else {
6959 while (scan < loceol && !isALNUM_LC(*scan))
6960 scan++;
6961 }
6962 break;
6963 case SPACEU:
6964 if (utf8_target) {
6965
6966 utf8_space:
6967
6968 while (hardcount < max && scan < loceol
6969 && is_XPERLSPACE_utf8((U8*)scan))
6970 {
6971 scan += UTF8SKIP(scan);
6972 hardcount++;
6973 }
6974 break;
6975 }
6976 else {
6977 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6978 scan++;
6979 }
6980 break;
6981 }
6982 case SPACE:
6983 if (utf8_target)
6984 goto utf8_space;
6985
6986 while (scan < loceol && isSPACE((U8) *scan)) {
6987 scan++;
6988 }
6989 break;
6990 case SPACEA:
6991 if (utf8_target && scan + max < loceol) {
6992
6993 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
6994 * since here, to match, 1 char == 1 byte */
6995 loceol = scan + max;
6996 }
6997 while (scan < loceol && isSPACE_A((U8) *scan)) {
6998 scan++;
6999 }
7000 break;
7001 case SPACEL:
7002 PL_reg_flags |= RF_tainted;
7003 if (utf8_target) {
7004 while (hardcount < max && scan < loceol &&
7005 isSPACE_LC_utf8((U8*)scan)) {
7006 scan += UTF8SKIP(scan);
7007 hardcount++;
7008 }
7009 } else {
7010 while (scan < loceol && isSPACE_LC(*scan))
7011 scan++;
7012 }
7013 break;
7014 case NSPACEU:
7015 if (utf8_target) {
7016
7017 utf8_Nspace:
7018
7019 while (hardcount < max && scan < loceol
7020 && ! is_XPERLSPACE_utf8((U8*)scan))
7021 {
7022 scan += UTF8SKIP(scan);
7023 hardcount++;
7024 }
7025 break;
7026 }
7027 else {
7028 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
7029 scan++;
7030 }
7031 }
7032 break;
7033 case NSPACE:
7034 if (utf8_target)
7035 goto utf8_Nspace;
7036
7037 while (scan < loceol && ! isSPACE((U8) *scan)) {
7038 scan++;
7039 }
7040 break;
7041 case NSPACEA:
7042 if (utf8_target) {
7043 while (hardcount < max && scan < loceol
7044 && ! isSPACE_A((U8) *scan))
7045 {
7046 scan += UTF8SKIP(scan);
7047 hardcount++;
7048 }
7049 }
7050 else {
7051 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
7052 scan++;
7053 }
7054 }
7055 break;
7056 case NSPACEL:
7057 PL_reg_flags |= RF_tainted;
7058 if (utf8_target) {
7059 while (hardcount < max && scan < loceol &&
7060 !isSPACE_LC_utf8((U8*)scan)) {
7061 scan += UTF8SKIP(scan);
7062 hardcount++;
7063 }
7064 } else {
7065 while (scan < loceol && !isSPACE_LC(*scan))
7066 scan++;
7067 }
7068 break;
7069 case DIGIT:
7070 if (utf8_target) {
7071 LOAD_UTF8_CHARCLASS_DIGIT();
7072 while (hardcount < max && scan < loceol &&
7073 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7074 scan += UTF8SKIP(scan);
7075 hardcount++;
7076 }
7077 } else {
7078 while (scan < loceol && isDIGIT(*scan))
7079 scan++;
7080 }
7081 break;
7082 case DIGITA:
7083 if (utf8_target && scan + max < loceol) {
7084
7085 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7086 * since here, to match, 1 char == 1 byte */
7087 loceol = scan + max;
7088 }
7089 while (scan < loceol && isDIGIT_A((U8) *scan)) {
7090 scan++;
7091 }
7092 break;
7093 case DIGITL:
7094 PL_reg_flags |= RF_tainted;
7095 if (utf8_target) {
7096 while (hardcount < max && scan < loceol &&
7097 isDIGIT_LC_utf8((U8*)scan)) {
7098 scan += UTF8SKIP(scan);
7099 hardcount++;
7100 }
7101 } else {
7102 while (scan < loceol && isDIGIT_LC(*scan))
7103 scan++;
7104 }
7105 break;
7106 case NDIGIT:
7107 if (utf8_target) {
7108 LOAD_UTF8_CHARCLASS_DIGIT();
7109 while (hardcount < max && scan < loceol &&
7110 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
7111 scan += UTF8SKIP(scan);
7112 hardcount++;
7113 }
7114 } else {
7115 while (scan < loceol && !isDIGIT(*scan))
7116 scan++;
7117 }
7118 break;
7119 case NDIGITA:
7120 if (utf8_target) {
7121 while (hardcount < max && scan < loceol
7122 && ! isDIGIT_A((U8) *scan)) {
7123 scan += UTF8SKIP(scan);
7124 hardcount++;
7125 }
7126 }
7127 else {
7128 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7129 scan++;
7130 }
7131 }
7132 break;
7133 case NDIGITL:
7134 PL_reg_flags |= RF_tainted;
7135 if (utf8_target) {
7136 while (hardcount < max && scan < loceol &&
7137 !isDIGIT_LC_utf8((U8*)scan)) {
7138 scan += UTF8SKIP(scan);
7139 hardcount++;
7140 }
7141 } else {
7142 while (scan < loceol && !isDIGIT_LC(*scan))
7143 scan++;
7144 }
7145 break;
7146 case LNBREAK:
7147 if (utf8_target) {
7148 while (hardcount < max && scan < loceol &&
7149 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7150 scan += c;
7151 hardcount++;
7152 }
7153 } else {
7154 /* LNBREAK can match one or two latin chars, which is ok, but we
7155 * have to use hardcount in this situation, and throw away the
7156 * adjustment to <loceol> done before the switch statement */
7157 loceol = PL_regeol;
7158 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7159 scan+=c;
7160 hardcount++;
7161 }
7162 }
7163 break;
7164 case HORIZWS:
7165 if (utf8_target) {
7166 while (hardcount < max && scan < loceol &&
7167 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7168 {
7169 scan += c;
7170 hardcount++;
7171 }
7172 } else {
7173 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7174 scan++;
7175 }
7176 break;
7177 case NHORIZWS:
7178 if (utf8_target) {
7179 while (hardcount < max && scan < loceol &&
7180 !is_HORIZWS_utf8_safe(scan, loceol))
7181 {
7182 scan += UTF8SKIP(scan);
7183 hardcount++;
7184 }
7185 } else {
7186 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7187 scan++;
7188
7189 }
7190 break;
7191 case VERTWS:
7192 if (utf8_target) {
7193 while (hardcount < max && scan < loceol &&
7194 (c=is_VERTWS_utf8_safe(scan, loceol)))
7195 {
7196 scan += c;
7197 hardcount++;
7198 }
7199 } else {
7200 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7201 scan++;
7202
7203 }
7204 break;
7205 case NVERTWS:
7206 if (utf8_target) {
7207 while (hardcount < max && scan < loceol &&
7208 !is_VERTWS_utf8_safe(scan, loceol))
7209 {
7210 scan += UTF8SKIP(scan);
7211 hardcount++;
7212 }
7213 } else {
7214 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7215 scan++;
7216
7217 }
7218 break;
7219
7220 case BOUND:
7221 case BOUNDA:
7222 case BOUNDL:
7223 case BOUNDU:
7224 case EOS:
7225 case GPOS:
7226 case KEEPS:
7227 case NBOUND:
7228 case NBOUNDA:
7229 case NBOUNDL:
7230 case NBOUNDU:
7231 case OPFAIL:
7232 case SBOL:
7233 case SEOL:
7234 /* These are all 0 width, so match right here or not at all. */
7235 break;
7236
7237 default:
7238 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7239 assert(0); /* NOTREACHED */
7240
7241 }
7242
7243 if (hardcount)
7244 c = hardcount;
7245 else
7246 c = scan - *startposp;
7247 *startposp = scan;
7248
7249 DEBUG_r({
7250 GET_RE_DEBUG_FLAGS_DECL;
7251 DEBUG_EXECUTE_r({
7252 SV * const prop = sv_newmortal();
7253 regprop(prog, prop, p);
7254 PerlIO_printf(Perl_debug_log,
7255 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7256 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7257 });
7258 });
7259
7260 return(c);
7261}
7262
7263
7264#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7265/*
7266- regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7267create a copy so that changes the caller makes won't change the shared one.
7268If <altsvp> is non-null, will return NULL in it, for back-compat.
7269 */
7270SV *
7271Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7272{
7273 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7274
7275 if (altsvp) {
7276 *altsvp = NULL;
7277 }
7278
7279 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7280}
7281#endif
7282
7283STATIC SV *
7284S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7285{
7286 /* Returns the swash for the input 'node' in the regex 'prog'.
7287 * If <doinit> is true, will attempt to create the swash if not already
7288 * done.
7289 * If <listsvp> is non-null, will return the swash initialization string in
7290 * it.
7291 * Tied intimately to how regcomp.c sets up the data structure */
7292
7293 dVAR;
7294 SV *sw = NULL;
7295 SV *si = NULL;
7296 SV* invlist = NULL;
7297
7298 RXi_GET_DECL(prog,progi);
7299 const struct reg_data * const data = prog ? progi->data : NULL;
7300
7301 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7302
7303 assert(ANYOF_NONBITMAP(node));
7304
7305 if (data && data->count) {
7306 const U32 n = ARG(node);
7307
7308 if (data->what[n] == 's') {
7309 SV * const rv = MUTABLE_SV(data->data[n]);
7310 AV * const av = MUTABLE_AV(SvRV(rv));
7311 SV **const ary = AvARRAY(av);
7312 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7313
7314 si = *ary; /* ary[0] = the string to initialize the swash with */
7315
7316 /* Elements 2 and 3 are either both present or both absent. [2] is
7317 * any inversion list generated at compile time; [3] indicates if
7318 * that inversion list has any user-defined properties in it. */
7319 if (av_len(av) >= 2) {
7320 invlist = ary[2];
7321 if (SvUV(ary[3])) {
7322 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7323 }
7324 }
7325 else {
7326 invlist = NULL;
7327 }
7328
7329 /* Element [1] is reserved for the set-up swash. If already there,
7330 * return it; if not, create it and store it there */
7331 if (SvROK(ary[1])) {
7332 sw = ary[1];
7333 }
7334 else if (si && doinit) {
7335
7336 sw = _core_swash_init("utf8", /* the utf8 package */
7337 "", /* nameless */
7338 si,
7339 1, /* binary */
7340 0, /* not from tr/// */
7341 invlist,
7342 &swash_init_flags);
7343 (void)av_store(av, 1, sw);
7344 }
7345 }
7346 }
7347
7348 if (listsvp) {
7349 SV* matches_string = newSVpvn("", 0);
7350
7351 /* Use the swash, if any, which has to have incorporated into it all
7352 * possibilities */
7353 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7354 && (si && si != &PL_sv_undef))
7355 {
7356
7357 /* If no swash, use the input initialization string, if available */
7358 sv_catsv(matches_string, si);
7359 }
7360
7361 /* Add the inversion list to whatever we have. This may have come from
7362 * the swash, or from an input parameter */
7363 if (invlist) {
7364 sv_catsv(matches_string, _invlist_contents(invlist));
7365 }
7366 *listsvp = matches_string;
7367 }
7368
7369 return sw;
7370}
7371
7372/*
7373 - reginclass - determine if a character falls into a character class
7374
7375 n is the ANYOF regnode
7376 p is the target string
7377 utf8_target tells whether p is in UTF-8.
7378
7379 Returns true if matched; false otherwise.
7380
7381 Note that this can be a synthetic start class, a combination of various
7382 nodes, so things you think might be mutually exclusive, such as locale,
7383 aren't. It can match both locale and non-locale
7384
7385 */
7386
7387STATIC bool
7388S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target)
7389{
7390 dVAR;
7391 const char flags = ANYOF_FLAGS(n);
7392 bool match = FALSE;
7393 UV c = *p;
7394
7395 PERL_ARGS_ASSERT_REGINCLASS;
7396
7397 /* If c is not already the code point, get it. Note that
7398 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7399 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7400 STRLEN c_len = 0;
7401 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7402 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7403 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7404 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7405 * UTF8_ALLOW_FFFF */
7406 if (c_len == (STRLEN)-1)
7407 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7408 }
7409
7410 /* If this character is potentially in the bitmap, check it */
7411 if (c < 256) {
7412 if (ANYOF_BITMAP_TEST(n, c))
7413 match = TRUE;
7414 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7415 && ! utf8_target
7416 && ! isASCII(c))
7417 {
7418 match = TRUE;
7419 }
7420 else if (flags & ANYOF_LOCALE) {
7421 PL_reg_flags |= RF_tainted;
7422
7423 if ((flags & ANYOF_LOC_FOLD)
7424 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7425 {
7426 match = TRUE;
7427 }
7428 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7429 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
7430 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
7431 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
7432 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
7433 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
7434 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
7435 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
7436 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7437 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
7438 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
7439 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
7440 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
7441 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
7442 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
7443 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
7444 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
7445 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
7446 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
7447 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
7448 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
7449 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
7450 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
7451 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
7452 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
7453 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
7454 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
7455 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
7456 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
7457 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
7458 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
7459 ) /* How's that for a conditional? */
7460 ) {
7461 match = TRUE;
7462 }
7463 }
7464 }
7465
7466 /* If the bitmap didn't (or couldn't) match, and something outside the
7467 * bitmap could match, try that. Locale nodes specify completely the
7468 * behavior of code points in the bit map (otherwise, a utf8 target would
7469 * cause them to be treated as Unicode and not locale), except in
7470 * the very unlikely event when this node is a synthetic start class, which
7471 * could be a combination of locale and non-locale nodes. So allow locale
7472 * to match for the synthetic start class, which will give a false
7473 * positive that will be resolved when the match is done again as not part
7474 * of the synthetic start class */
7475 if (!match) {
7476 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7477 match = TRUE; /* Everything above 255 matches */
7478 }
7479 else if (ANYOF_NONBITMAP(n)
7480 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7481 || (utf8_target
7482 && (c >=256
7483 || (! (flags & ANYOF_LOCALE))
7484 || (flags & ANYOF_IS_SYNTHETIC)))))
7485 {
7486 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7487 if (sw) {
7488 U8 * utf8_p;
7489 if (utf8_target) {
7490 utf8_p = (U8 *) p;
7491 } else { /* Convert to utf8 */
7492 STRLEN len = 1;
7493 utf8_p = bytes_to_utf8(p, &len);
7494 }
7495
7496 if (swash_fetch(sw, utf8_p, TRUE)) {
7497 match = TRUE;
7498 }
7499
7500 /* If we allocated a string above, free it */
7501 if (! utf8_target) Safefree(utf8_p);
7502 }
7503 }
7504
7505 if (UNICODE_IS_SUPER(c)
7506 && (flags & ANYOF_WARN_SUPER)
7507 && ckWARN_d(WARN_NON_UNICODE))
7508 {
7509 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7510 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7511 }
7512 }
7513
7514 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7515 return cBOOL(flags & ANYOF_INVERT) ^ match;
7516}
7517
7518STATIC U8 *
7519S_reghop3(U8 *s, I32 off, const U8* lim)
7520{
7521 /* return the position 'off' UTF-8 characters away from 's', forward if
7522 * 'off' >= 0, backwards if negative. But don't go outside of position
7523 * 'lim', which better be < s if off < 0 */
7524
7525 dVAR;
7526
7527 PERL_ARGS_ASSERT_REGHOP3;
7528
7529 if (off >= 0) {
7530 while (off-- && s < lim) {
7531 /* XXX could check well-formedness here */
7532 s += UTF8SKIP(s);
7533 }
7534 }
7535 else {
7536 while (off++ && s > lim) {
7537 s--;
7538 if (UTF8_IS_CONTINUED(*s)) {
7539 while (s > lim && UTF8_IS_CONTINUATION(*s))
7540 s--;
7541 }
7542 /* XXX could check well-formedness here */
7543 }
7544 }
7545 return s;
7546}
7547
7548#ifdef XXX_dmq
7549/* there are a bunch of places where we use two reghop3's that should
7550 be replaced with this routine. but since thats not done yet
7551 we ifdef it out - dmq
7552*/
7553STATIC U8 *
7554S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7555{
7556 dVAR;
7557
7558 PERL_ARGS_ASSERT_REGHOP4;
7559
7560 if (off >= 0) {
7561 while (off-- && s < rlim) {
7562 /* XXX could check well-formedness here */
7563 s += UTF8SKIP(s);
7564 }
7565 }
7566 else {
7567 while (off++ && s > llim) {
7568 s--;
7569 if (UTF8_IS_CONTINUED(*s)) {
7570 while (s > llim && UTF8_IS_CONTINUATION(*s))
7571 s--;
7572 }
7573 /* XXX could check well-formedness here */
7574 }
7575 }
7576 return s;
7577}
7578#endif
7579
7580STATIC U8 *
7581S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7582{
7583 dVAR;
7584
7585 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7586
7587 if (off >= 0) {
7588 while (off-- && s < lim) {
7589 /* XXX could check well-formedness here */
7590 s += UTF8SKIP(s);
7591 }
7592 if (off >= 0)
7593 return NULL;
7594 }
7595 else {
7596 while (off++ && s > lim) {
7597 s--;
7598 if (UTF8_IS_CONTINUED(*s)) {
7599 while (s > lim && UTF8_IS_CONTINUATION(*s))
7600 s--;
7601 }
7602 /* XXX could check well-formedness here */
7603 }
7604 if (off <= 0)
7605 return NULL;
7606 }
7607 return s;
7608}
7609
7610static void
7611restore_pos(pTHX_ void *arg)
7612{
7613 dVAR;
7614 regexp * const rex = (regexp *)arg;
7615 if (PL_reg_state.re_state_eval_setup_done) {
7616 if (PL_reg_oldsaved) {
7617 rex->subbeg = PL_reg_oldsaved;
7618 rex->sublen = PL_reg_oldsavedlen;
7619 rex->suboffset = PL_reg_oldsavedoffset;
7620 rex->subcoffset = PL_reg_oldsavedcoffset;
7621#ifdef PERL_OLD_COPY_ON_WRITE
7622 rex->saved_copy = PL_nrs;
7623#endif
7624 RXp_MATCH_COPIED_on(rex);
7625 }
7626 PL_reg_magic->mg_len = PL_reg_oldpos;
7627 PL_reg_state.re_state_eval_setup_done = FALSE;
7628 PL_curpm = PL_reg_oldcurpm;
7629 }
7630}
7631
7632STATIC void
7633S_to_utf8_substr(pTHX_ register regexp *prog)
7634{
7635 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7636 * on the converted value */
7637
7638 int i = 1;
7639
7640 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7641
7642 do {
7643 if (prog->substrs->data[i].substr
7644 && !prog->substrs->data[i].utf8_substr) {
7645 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7646 prog->substrs->data[i].utf8_substr = sv;
7647 sv_utf8_upgrade(sv);
7648 if (SvVALID(prog->substrs->data[i].substr)) {
7649 if (SvTAIL(prog->substrs->data[i].substr)) {
7650 /* Trim the trailing \n that fbm_compile added last
7651 time. */
7652 SvCUR_set(sv, SvCUR(sv) - 1);
7653 /* Whilst this makes the SV technically "invalid" (as its
7654 buffer is no longer followed by "\0") when fbm_compile()
7655 adds the "\n" back, a "\0" is restored. */
7656 fbm_compile(sv, FBMcf_TAIL);
7657 } else
7658 fbm_compile(sv, 0);
7659 }
7660 if (prog->substrs->data[i].substr == prog->check_substr)
7661 prog->check_utf8 = sv;
7662 }
7663 } while (i--);
7664}
7665
7666STATIC bool
7667S_to_byte_substr(pTHX_ register regexp *prog)
7668{
7669 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7670 * on the converted value; returns FALSE if can't be converted. */
7671
7672 dVAR;
7673 int i = 1;
7674
7675 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7676
7677 do {
7678 if (prog->substrs->data[i].utf8_substr
7679 && !prog->substrs->data[i].substr) {
7680 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7681 if (! sv_utf8_downgrade(sv, TRUE)) {
7682 return FALSE;
7683 }
7684 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7685 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7686 /* Trim the trailing \n that fbm_compile added last
7687 time. */
7688 SvCUR_set(sv, SvCUR(sv) - 1);
7689 fbm_compile(sv, FBMcf_TAIL);
7690 } else
7691 fbm_compile(sv, 0);
7692 }
7693 prog->substrs->data[i].substr = sv;
7694 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7695 prog->check_substr = sv;
7696 }
7697 } while (i--);
7698
7699 return TRUE;
7700}
7701
7702/* These constants are for finding GCB=LV and GCB=LVT. These are for the
7703 * pre-composed Hangul syllables, which are all in a contiguous block and
7704 * arranged there in such a way so as to facilitate alorithmic determination of
7705 * their characteristics. As such, they don't need a swash, but can be
7706 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7707 * is a GCB=LV */
7708#define SBASE 0xAC00 /* Start of block */
7709#define SCount 11172 /* Length of block */
7710#define TCount 28
7711
7712#if 0 /* This routine is not currently used */
7713PERL_STATIC_INLINE bool
7714S_is_utf8_X_LV(pTHX_ const U8 *p)
7715{
7716 /* Unlike most other similarly named routines here, this does not create a
7717 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7718
7719 dVAR;
7720
7721 UV cp = valid_utf8_to_uvchr(p, NULL);
7722
7723 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7724
7725 /* The earliest Unicode releases did not have these precomposed Hangul
7726 * syllables. Set to point to undef in that case, so will return false on
7727 * every call */
7728 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7729 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7730 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7731 SvREFCNT_dec(PL_utf8_X_LV);
7732 PL_utf8_X_LV = &PL_sv_undef;
7733 }
7734 }
7735
7736 return (PL_utf8_X_LV != &PL_sv_undef
7737 && cp >= SBASE && cp < SBASE + SCount
7738 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7739}
7740#endif
7741
7742PERL_STATIC_INLINE bool
7743S_is_utf8_X_LVT(pTHX_ const U8 *p)
7744{
7745 /* Unlike most other similarly named routines here, this does not create a
7746 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7747
7748 dVAR;
7749
7750 UV cp = valid_utf8_to_uvchr(p, NULL);
7751
7752 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7753
7754 /* The earliest Unicode releases did not have these precomposed Hangul
7755 * syllables. Set to point to undef in that case, so will return false on
7756 * every call */
7757 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7758 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7759 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7760 SvREFCNT_dec(PL_utf8_X_LVT);
7761 PL_utf8_X_LVT = &PL_sv_undef;
7762 }
7763 }
7764
7765 return (PL_utf8_X_LVT != &PL_sv_undef
7766 && cp >= SBASE && cp < SBASE + SCount
7767 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7768}
7769
7770/*
7771 * Local variables:
7772 * c-indentation-style: bsd
7773 * c-basic-offset: 4
7774 * indent-tabs-mode: nil
7775 * End:
7776 *
7777 * ex: set ts=8 sts=4 sw=4 et:
7778 */