5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
287 #define Simple_vFAIL(m) STMT_START { \
288 const IV offset = RExC_parse - RExC_precomp; \
289 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
290 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
294 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
296 #define vFAIL(m) STMT_START { \
298 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
303 * Like Simple_vFAIL(), but accepts two arguments.
305 #define Simple_vFAIL2(m,a1) STMT_START { \
306 const IV offset = RExC_parse - RExC_precomp; \
307 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
308 (int)offset, RExC_precomp, RExC_precomp + offset); \
312 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
314 #define vFAIL2(m,a1) STMT_START { \
316 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
317 Simple_vFAIL2(m, a1); \
322 * Like Simple_vFAIL(), but accepts three arguments.
324 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
325 const IV offset = RExC_parse - RExC_precomp; \
326 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
327 (int)offset, RExC_precomp, RExC_precomp + offset); \
331 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
333 #define vFAIL3(m,a1,a2) STMT_START { \
335 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
336 Simple_vFAIL3(m, a1, a2); \
340 * Like Simple_vFAIL(), but accepts four arguments.
342 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
343 const IV offset = RExC_parse - RExC_precomp; \
344 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
345 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 #define vWARN(loc,m) STMT_START { \
349 const IV offset = loc - RExC_precomp; \
350 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
351 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
354 #define vWARNdep(loc,m) STMT_START { \
355 const IV offset = loc - RExC_precomp; \
356 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
357 "%s" REPORT_LOCATION, \
358 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
362 #define vWARN2(loc, m, a1) STMT_START { \
363 const IV offset = loc - RExC_precomp; \
364 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
365 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
368 #define vWARN3(loc, m, a1, a2) STMT_START { \
369 const IV offset = loc - RExC_precomp; \
370 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
371 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
374 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
375 const IV offset = loc - RExC_precomp; \
376 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
377 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
381 const IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
383 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
387 /* Allow for side effects in s */
388 #define REGC(c,s) STMT_START { \
389 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
392 /* Macros for recording node offsets. 20001227 mjd@plover.com
393 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
394 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
395 * Element 0 holds the number n.
398 #define MJD_OFFSET_DEBUG(x)
399 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
402 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
404 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
405 __LINE__, (node), (byte))); \
407 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
409 RExC_offsets[2*(node)-1] = (byte); \
414 #define Set_Node_Offset(node,byte) \
415 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
416 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
418 #define Set_Node_Length_To_R(node,len) STMT_START { \
420 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
421 __LINE__, (int)(node), (int)(len))); \
423 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
425 RExC_offsets[2*(node)] = (len); \
430 #define Set_Node_Length(node,len) \
431 Set_Node_Length_To_R((node)-RExC_emit_start, len)
432 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
433 #define Set_Node_Cur_Length(node) \
434 Set_Node_Length(node, RExC_parse - parse_start)
436 /* Get offsets and lengths */
437 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
438 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
440 static void clear_re(pTHX_ void *r);
442 /* Mark that we cannot extend a found fixed substring at this point.
443 Updata the longest found anchored substring and the longest found
444 floating substrings if needed. */
447 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data)
449 const STRLEN l = CHR_SVLEN(data->last_found);
450 const STRLEN old_l = CHR_SVLEN(*data->longest);
452 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
453 SvSetMagicSV(*data->longest, data->last_found);
454 if (*data->longest == data->longest_fixed) {
455 data->offset_fixed = l ? data->last_start_min : data->pos_min;
456 if (data->flags & SF_BEFORE_EOL)
458 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
460 data->flags &= ~SF_FIX_BEFORE_EOL;
463 data->offset_float_min = l ? data->last_start_min : data->pos_min;
464 data->offset_float_max = (l
465 ? data->last_start_max
466 : data->pos_min + data->pos_delta);
467 if ((U32)data->offset_float_max > (U32)I32_MAX)
468 data->offset_float_max = I32_MAX;
469 if (data->flags & SF_BEFORE_EOL)
471 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
473 data->flags &= ~SF_FL_BEFORE_EOL;
476 SvCUR_set(data->last_found, 0);
478 SV * const sv = data->last_found;
479 if (SvUTF8(sv) && SvMAGICAL(sv)) {
480 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
486 data->flags &= ~SF_BEFORE_EOL;
489 /* Can match anything (initialization) */
491 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
493 ANYOF_CLASS_ZERO(cl);
494 ANYOF_BITMAP_SETALL(cl);
495 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
497 cl->flags |= ANYOF_LOCALE;
500 /* Can match anything (initialization) */
502 S_cl_is_anything(const struct regnode_charclass_class *cl)
506 for (value = 0; value <= ANYOF_MAX; value += 2)
507 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
509 if (!(cl->flags & ANYOF_UNICODE_ALL))
511 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
516 /* Can match anything (initialization) */
518 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
520 Zero(cl, 1, struct regnode_charclass_class);
522 cl_anything(pRExC_state, cl);
526 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
528 Zero(cl, 1, struct regnode_charclass_class);
530 cl_anything(pRExC_state, cl);
532 cl->flags |= ANYOF_LOCALE;
535 /* 'And' a given class with another one. Can create false positives */
536 /* We assume that cl is not inverted */
538 S_cl_and(struct regnode_charclass_class *cl,
539 const struct regnode_charclass_class *and_with)
541 if (!(and_with->flags & ANYOF_CLASS)
542 && !(cl->flags & ANYOF_CLASS)
543 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
544 && !(and_with->flags & ANYOF_FOLD)
545 && !(cl->flags & ANYOF_FOLD)) {
548 if (and_with->flags & ANYOF_INVERT)
549 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
550 cl->bitmap[i] &= ~and_with->bitmap[i];
552 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
553 cl->bitmap[i] &= and_with->bitmap[i];
554 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
555 if (!(and_with->flags & ANYOF_EOS))
556 cl->flags &= ~ANYOF_EOS;
558 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
559 !(and_with->flags & ANYOF_INVERT)) {
560 cl->flags &= ~ANYOF_UNICODE_ALL;
561 cl->flags |= ANYOF_UNICODE;
562 ARG_SET(cl, ARG(and_with));
564 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
565 !(and_with->flags & ANYOF_INVERT))
566 cl->flags &= ~ANYOF_UNICODE_ALL;
567 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
568 !(and_with->flags & ANYOF_INVERT))
569 cl->flags &= ~ANYOF_UNICODE;
572 /* 'OR' a given class with another one. Can create false positives */
573 /* We assume that cl is not inverted */
575 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
577 if (or_with->flags & ANYOF_INVERT) {
579 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
580 * <= (B1 | !B2) | (CL1 | !CL2)
581 * which is wasteful if CL2 is small, but we ignore CL2:
582 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
583 * XXXX Can we handle case-fold? Unclear:
584 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
585 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
587 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
588 && !(or_with->flags & ANYOF_FOLD)
589 && !(cl->flags & ANYOF_FOLD) ) {
592 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
593 cl->bitmap[i] |= ~or_with->bitmap[i];
594 } /* XXXX: logic is complicated otherwise */
596 cl_anything(pRExC_state, cl);
599 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
600 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
601 && (!(or_with->flags & ANYOF_FOLD)
602 || (cl->flags & ANYOF_FOLD)) ) {
605 /* OR char bitmap and class bitmap separately */
606 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
607 cl->bitmap[i] |= or_with->bitmap[i];
608 if (or_with->flags & ANYOF_CLASS) {
609 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
610 cl->classflags[i] |= or_with->classflags[i];
611 cl->flags |= ANYOF_CLASS;
614 else { /* XXXX: logic is complicated, leave it along for a moment. */
615 cl_anything(pRExC_state, cl);
618 if (or_with->flags & ANYOF_EOS)
619 cl->flags |= ANYOF_EOS;
621 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
622 ARG(cl) != ARG(or_with)) {
623 cl->flags |= ANYOF_UNICODE_ALL;
624 cl->flags &= ~ANYOF_UNICODE;
626 if (or_with->flags & ANYOF_UNICODE_ALL) {
627 cl->flags |= ANYOF_UNICODE_ALL;
628 cl->flags &= ~ANYOF_UNICODE;
633 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
634 * These need to be revisited when a newer toolchain becomes available.
636 #if defined(__sparc64__) && defined(__GNUC__)
637 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
638 # undef SPARC64_GCC_WORKAROUND
639 # define SPARC64_GCC_WORKAROUND 1
643 /* REx optimizer. Converts nodes into quickier variants "in place".
644 Finds fixed substrings. */
646 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
647 to the position after last scanned or to NULL. */
650 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
651 /* scanp: Start here (read-write). */
652 /* deltap: Write maxlen-minlen here. */
653 /* last: Stop before this one. */
655 I32 min = 0, pars = 0, code;
656 regnode *scan = *scanp, *next;
658 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
659 int is_inf_internal = 0; /* The studied chunk is infinite */
660 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
661 scan_data_t data_fake;
662 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
664 while (scan && OP(scan) != END && scan < last) {
665 /* Peephole optimizer: */
667 if (PL_regkind[(U8)OP(scan)] == EXACT) {
668 /* Merge several consecutive EXACTish nodes into one. */
669 regnode *n = regnext(scan);
672 regnode *stop = scan;
675 next = scan + NODE_SZ_STR(scan);
676 /* Skip NOTHING, merge EXACT*. */
678 ( PL_regkind[(U8)OP(n)] == NOTHING ||
679 (stringok && (OP(n) == OP(scan))))
681 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
682 if (OP(n) == TAIL || n > next)
684 if (PL_regkind[(U8)OP(n)] == NOTHING) {
685 NEXT_OFF(scan) += NEXT_OFF(n);
686 next = n + NODE_STEP_REGNODE;
694 const int oldl = STR_LEN(scan);
695 regnode * const nnext = regnext(n);
697 if (oldl + STR_LEN(n) > U8_MAX)
699 NEXT_OFF(scan) += NEXT_OFF(n);
700 STR_LEN(scan) += STR_LEN(n);
701 next = n + NODE_SZ_STR(n);
702 /* Now we can overwrite *n : */
703 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
711 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
713 Two problematic code points in Unicode casefolding of EXACT nodes:
715 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
716 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
722 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
723 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
725 This means that in case-insensitive matching (or "loose matching",
726 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
727 length of the above casefolded versions) can match a target string
728 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
729 This would rather mess up the minimum length computation.
731 What we'll do is to look for the tail four bytes, and then peek
732 at the preceding two bytes to see whether we need to decrease
733 the minimum length by four (six minus two).
735 Thanks to the design of UTF-8, there cannot be false matches:
736 A sequence of valid UTF-8 bytes cannot be a subsequence of
737 another valid sequence of UTF-8 bytes.
740 char * const s0 = STRING(scan), *s, *t;
741 char * const s1 = s0 + STR_LEN(scan) - 1;
742 char * const s2 = s1 - 4;
743 const char t0[] = "\xcc\x88\xcc\x81";
744 const char * const t1 = t0 + 3;
747 s < s2 && (t = ninstr(s, s1, t0, t1));
749 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
750 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
757 n = scan + NODE_SZ_STR(scan);
759 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
767 /* Follow the next-chain of the current node and optimize
768 away all the NOTHINGs from it. */
769 if (OP(scan) != CURLYX) {
770 const int max = (reg_off_by_arg[OP(scan)]
772 /* I32 may be smaller than U16 on CRAYs! */
773 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
774 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
778 /* Skip NOTHING and LONGJMP. */
779 while ((n = regnext(n))
780 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
781 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
784 if (reg_off_by_arg[OP(scan)])
787 NEXT_OFF(scan) = off;
789 /* The principal pseudo-switch. Cannot be a switch, since we
790 look into several different things. */
791 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
792 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
793 next = regnext(scan);
796 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
797 I32 max1 = 0, min1 = I32_MAX, num = 0;
798 struct regnode_charclass_class accum;
800 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
801 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
802 if (flags & SCF_DO_STCLASS)
803 cl_init_zero(pRExC_state, &accum);
804 while (OP(scan) == code) {
805 I32 deltanext, minnext, f = 0, fake;
806 struct regnode_charclass_class this_class;
811 data_fake.whilem_c = data->whilem_c;
812 data_fake.last_closep = data->last_closep;
815 data_fake.last_closep = &fake;
816 next = regnext(scan);
817 scan = NEXTOPER(scan);
819 scan = NEXTOPER(scan);
820 if (flags & SCF_DO_STCLASS) {
821 cl_init(pRExC_state, &this_class);
822 data_fake.start_class = &this_class;
823 f = SCF_DO_STCLASS_AND;
825 if (flags & SCF_WHILEM_VISITED_POS)
826 f |= SCF_WHILEM_VISITED_POS;
827 /* we suppose the run is continuous, last=next...*/
828 minnext = study_chunk(pRExC_state, &scan, &deltanext,
829 next, &data_fake, f);
832 if (max1 < minnext + deltanext)
833 max1 = minnext + deltanext;
834 if (deltanext == I32_MAX)
835 is_inf = is_inf_internal = 1;
837 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
839 if (data && (data_fake.flags & SF_HAS_EVAL))
840 data->flags |= SF_HAS_EVAL;
842 data->whilem_c = data_fake.whilem_c;
843 if (flags & SCF_DO_STCLASS)
844 cl_or(pRExC_state, &accum, &this_class);
848 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
850 if (flags & SCF_DO_SUBSTR) {
851 data->pos_min += min1;
852 data->pos_delta += max1 - min1;
853 if (max1 != min1 || is_inf)
854 data->longest = &(data->longest_float);
857 delta += max1 - min1;
858 if (flags & SCF_DO_STCLASS_OR) {
859 cl_or(pRExC_state, data->start_class, &accum);
861 cl_and(data->start_class, &and_with);
862 flags &= ~SCF_DO_STCLASS;
865 else if (flags & SCF_DO_STCLASS_AND) {
867 cl_and(data->start_class, &accum);
868 flags &= ~SCF_DO_STCLASS;
871 /* Switch to OR mode: cache the old value of
872 * data->start_class */
873 StructCopy(data->start_class, &and_with,
874 struct regnode_charclass_class);
875 flags &= ~SCF_DO_STCLASS_AND;
876 StructCopy(&accum, data->start_class,
877 struct regnode_charclass_class);
878 flags |= SCF_DO_STCLASS_OR;
879 data->start_class->flags |= ANYOF_EOS;
884 else if (code == BRANCHJ) /* single branch is optimized. */
885 scan = NEXTOPER(NEXTOPER(scan));
886 else /* single branch is optimized. */
887 scan = NEXTOPER(scan);
890 else if (OP(scan) == EXACT) {
891 I32 l = STR_LEN(scan);
894 const U8 * const s = (U8*)STRING(scan);
895 l = utf8_length((U8 *)s, (U8 *)s + l);
896 uc = utf8_to_uvchr((U8 *)s, NULL);
898 uc = *((U8*)STRING(scan));
901 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
902 /* The code below prefers earlier match for fixed
903 offset, later match for variable offset. */
904 if (data->last_end == -1) { /* Update the start info. */
905 data->last_start_min = data->pos_min;
906 data->last_start_max = is_inf
907 ? I32_MAX : data->pos_min + data->pos_delta;
909 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
911 SvUTF8_on(data->last_found);
913 SV * const sv = data->last_found;
914 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
915 mg_find(sv, PERL_MAGIC_utf8) : NULL;
916 if (mg && mg->mg_len >= 0)
917 mg->mg_len += utf8_length((U8*)STRING(scan),
918 (U8*)STRING(scan)+STR_LEN(scan));
920 data->last_end = data->pos_min + l;
921 data->pos_min += l; /* As in the first entry. */
922 data->flags &= ~SF_BEFORE_EOL;
924 if (flags & SCF_DO_STCLASS_AND) {
925 /* Check whether it is compatible with what we know already! */
929 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
930 && !ANYOF_BITMAP_TEST(data->start_class, uc)
931 && (!(data->start_class->flags & ANYOF_FOLD)
932 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
935 ANYOF_CLASS_ZERO(data->start_class);
936 ANYOF_BITMAP_ZERO(data->start_class);
938 ANYOF_BITMAP_SET(data->start_class, uc);
939 data->start_class->flags &= ~ANYOF_EOS;
941 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
943 else if (flags & SCF_DO_STCLASS_OR) {
944 /* false positive possible if the class is case-folded */
946 ANYOF_BITMAP_SET(data->start_class, uc);
948 data->start_class->flags |= ANYOF_UNICODE_ALL;
949 data->start_class->flags &= ~ANYOF_EOS;
950 cl_and(data->start_class, &and_with);
952 flags &= ~SCF_DO_STCLASS;
954 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
955 I32 l = STR_LEN(scan);
956 UV uc = *((U8*)STRING(scan));
958 /* Search for fixed substrings supports EXACT only. */
959 if (flags & SCF_DO_SUBSTR) {
961 scan_commit(pRExC_state, data);
964 U8 * const s = (U8 *)STRING(scan);
965 l = utf8_length(s, s + l);
966 uc = utf8_to_uvchr(s, NULL);
969 if (flags & SCF_DO_SUBSTR)
971 if (flags & SCF_DO_STCLASS_AND) {
972 /* Check whether it is compatible with what we know already! */
976 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
977 && !ANYOF_BITMAP_TEST(data->start_class, uc)
978 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
980 ANYOF_CLASS_ZERO(data->start_class);
981 ANYOF_BITMAP_ZERO(data->start_class);
983 ANYOF_BITMAP_SET(data->start_class, uc);
984 data->start_class->flags &= ~ANYOF_EOS;
985 data->start_class->flags |= ANYOF_FOLD;
986 if (OP(scan) == EXACTFL)
987 data->start_class->flags |= ANYOF_LOCALE;
990 else if (flags & SCF_DO_STCLASS_OR) {
991 if (data->start_class->flags & ANYOF_FOLD) {
992 /* false positive possible if the class is case-folded.
993 Assume that the locale settings are the same... */
995 ANYOF_BITMAP_SET(data->start_class, uc);
996 data->start_class->flags &= ~ANYOF_EOS;
998 cl_and(data->start_class, &and_with);
1000 flags &= ~SCF_DO_STCLASS;
1002 else if (strchr((const char*)PL_varies,OP(scan))) {
1003 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1004 I32 f = flags, pos_before = 0;
1005 regnode * const oscan = scan;
1006 struct regnode_charclass_class this_class;
1007 struct regnode_charclass_class *oclass = NULL;
1008 I32 next_is_eval = 0;
1010 switch (PL_regkind[(U8)OP(scan)]) {
1011 case WHILEM: /* End of (?:...)* . */
1012 scan = NEXTOPER(scan);
1015 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1016 next = NEXTOPER(scan);
1017 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1019 maxcount = REG_INFTY;
1020 next = regnext(scan);
1021 scan = NEXTOPER(scan);
1025 if (flags & SCF_DO_SUBSTR)
1030 if (flags & SCF_DO_STCLASS) {
1032 maxcount = REG_INFTY;
1033 next = regnext(scan);
1034 scan = NEXTOPER(scan);
1037 is_inf = is_inf_internal = 1;
1038 scan = regnext(scan);
1039 if (flags & SCF_DO_SUBSTR) {
1040 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1041 data->longest = &(data->longest_float);
1043 goto optimize_curly_tail;
1045 mincount = ARG1(scan);
1046 maxcount = ARG2(scan);
1047 next = regnext(scan);
1048 if (OP(scan) == CURLYX) {
1049 I32 lp = (data ? *(data->last_closep) : 0);
1051 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1053 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1054 next_is_eval = (OP(scan) == EVAL);
1056 if (flags & SCF_DO_SUBSTR) {
1057 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1058 pos_before = data->pos_min;
1062 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1064 data->flags |= SF_IS_INF;
1066 if (flags & SCF_DO_STCLASS) {
1067 cl_init(pRExC_state, &this_class);
1068 oclass = data->start_class;
1069 data->start_class = &this_class;
1070 f |= SCF_DO_STCLASS_AND;
1071 f &= ~SCF_DO_STCLASS_OR;
1073 /* These are the cases when once a subexpression
1074 fails at a particular position, it cannot succeed
1075 even after backtracking at the enclosing scope.
1077 XXXX what if minimal match and we are at the
1078 initial run of {n,m}? */
1079 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1080 f &= ~SCF_WHILEM_VISITED_POS;
1082 /* This will finish on WHILEM, setting scan, or on NULL: */
1083 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1085 ? (f & ~SCF_DO_SUBSTR) : f);
1087 if (flags & SCF_DO_STCLASS)
1088 data->start_class = oclass;
1089 if (mincount == 0 || minnext == 0) {
1090 if (flags & SCF_DO_STCLASS_OR) {
1091 cl_or(pRExC_state, data->start_class, &this_class);
1093 else if (flags & SCF_DO_STCLASS_AND) {
1094 /* Switch to OR mode: cache the old value of
1095 * data->start_class */
1096 StructCopy(data->start_class, &and_with,
1097 struct regnode_charclass_class);
1098 flags &= ~SCF_DO_STCLASS_AND;
1099 StructCopy(&this_class, data->start_class,
1100 struct regnode_charclass_class);
1101 flags |= SCF_DO_STCLASS_OR;
1102 data->start_class->flags |= ANYOF_EOS;
1104 } else { /* Non-zero len */
1105 if (flags & SCF_DO_STCLASS_OR) {
1106 cl_or(pRExC_state, data->start_class, &this_class);
1107 cl_and(data->start_class, &and_with);
1109 else if (flags & SCF_DO_STCLASS_AND)
1110 cl_and(data->start_class, &this_class);
1111 flags &= ~SCF_DO_STCLASS;
1113 if (!scan) /* It was not CURLYX, but CURLY. */
1115 if ( /* ? quantifier ok, except for (?{ ... }) */
1116 (next_is_eval || !(mincount == 0 && maxcount == 1))
1117 && (minnext == 0) && (deltanext == 0)
1118 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1119 && maxcount <= REG_INFTY/3 /* Complement check for big count */
1120 && ckWARN(WARN_REGEXP))
1123 "Quantifier unexpected on zero-length expression");
1126 min += minnext * mincount;
1127 is_inf_internal |= ((maxcount == REG_INFTY
1128 && (minnext + deltanext) > 0)
1129 || deltanext == I32_MAX);
1130 is_inf |= is_inf_internal;
1131 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1133 /* Try powerful optimization CURLYX => CURLYN. */
1134 if ( OP(oscan) == CURLYX && data
1135 && data->flags & SF_IN_PAR
1136 && !(data->flags & SF_HAS_EVAL)
1137 && !deltanext && minnext == 1 ) {
1138 /* Try to optimize to CURLYN. */
1139 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1140 regnode * const nxt1 = nxt;
1147 if (!strchr((const char*)PL_simple,OP(nxt))
1148 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1149 && STR_LEN(nxt) == 1))
1155 if (OP(nxt) != CLOSE)
1157 /* Now we know that nxt2 is the only contents: */
1158 oscan->flags = (U8)ARG(nxt);
1160 OP(nxt1) = NOTHING; /* was OPEN. */
1162 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1163 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1164 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1165 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1166 OP(nxt + 1) = OPTIMIZED; /* was count. */
1167 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1172 /* Try optimization CURLYX => CURLYM. */
1173 if ( OP(oscan) == CURLYX && data
1174 && !(data->flags & SF_HAS_PAR)
1175 && !(data->flags & SF_HAS_EVAL)
1176 && !deltanext /* atom is fixed width */
1177 && minnext != 0 /* CURLYM can't handle zero width */
1179 /* XXXX How to optimize if data == 0? */
1180 /* Optimize to a simpler form. */
1181 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1185 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1186 && (OP(nxt2) != WHILEM))
1188 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1189 /* Need to optimize away parenths. */
1190 if (data->flags & SF_IN_PAR) {
1191 /* Set the parenth number. */
1192 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1194 if (OP(nxt) != CLOSE)
1195 FAIL("Panic opt close");
1196 oscan->flags = (U8)ARG(nxt);
1197 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1198 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1200 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1201 OP(nxt + 1) = OPTIMIZED; /* was count. */
1202 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1203 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1206 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1207 regnode *nnxt = regnext(nxt1);
1210 if (reg_off_by_arg[OP(nxt1)])
1211 ARG_SET(nxt1, nxt2 - nxt1);
1212 else if (nxt2 - nxt1 < U16_MAX)
1213 NEXT_OFF(nxt1) = nxt2 - nxt1;
1215 OP(nxt) = NOTHING; /* Cannot beautify */
1220 /* Optimize again: */
1221 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1227 else if ((OP(oscan) == CURLYX)
1228 && (flags & SCF_WHILEM_VISITED_POS)
1229 /* See the comment on a similar expression above.
1230 However, this time it not a subexpression
1231 we care about, but the expression itself. */
1232 && (maxcount == REG_INFTY)
1233 && data && ++data->whilem_c < 16) {
1234 /* This stays as CURLYX, we can put the count/of pair. */
1235 /* Find WHILEM (as in regexec.c) */
1236 regnode *nxt = oscan + NEXT_OFF(oscan);
1238 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1240 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1241 | (RExC_whilem_seen << 4)); /* On WHILEM */
1243 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1245 if (flags & SCF_DO_SUBSTR) {
1246 SV *last_str = NULL;
1247 int counted = mincount != 0;
1249 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1250 #if defined(SPARC64_GCC_WORKAROUND)
1253 const char *s = NULL;
1256 if (pos_before >= data->last_start_min)
1259 b = data->last_start_min;
1262 s = SvPV_const(data->last_found, l);
1263 old = b - data->last_start_min;
1266 I32 b = pos_before >= data->last_start_min
1267 ? pos_before : data->last_start_min;
1269 const char * const s = SvPV_const(data->last_found, l);
1270 I32 old = b - data->last_start_min;
1274 old = utf8_hop((U8*)s, old) - (U8*)s;
1277 /* Get the added string: */
1278 last_str = newSVpvn(s + old, l);
1280 SvUTF8_on(last_str);
1281 if (deltanext == 0 && pos_before == b) {
1282 /* What was added is a constant string */
1284 SvGROW(last_str, (mincount * l) + 1);
1285 repeatcpy(SvPVX(last_str) + l,
1286 SvPVX_const(last_str), l, mincount - 1);
1287 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1288 /* Add additional parts. */
1289 SvCUR_set(data->last_found,
1290 SvCUR(data->last_found) - l);
1291 sv_catsv(data->last_found, last_str);
1293 SV * sv = data->last_found;
1295 SvUTF8(sv) && SvMAGICAL(sv) ?
1296 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1297 if (mg && mg->mg_len >= 0)
1298 mg->mg_len += CHR_SVLEN(last_str);
1300 data->last_end += l * (mincount - 1);
1303 /* start offset must point into the last copy */
1304 data->last_start_min += minnext * (mincount - 1);
1305 data->last_start_max += is_inf ? I32_MAX
1306 : (maxcount - 1) * (minnext + data->pos_delta);
1309 /* It is counted once already... */
1310 data->pos_min += minnext * (mincount - counted);
1311 data->pos_delta += - counted * deltanext +
1312 (minnext + deltanext) * maxcount - minnext * mincount;
1313 if (mincount != maxcount) {
1314 /* Cannot extend fixed substrings found inside
1316 scan_commit(pRExC_state,data);
1317 if (mincount && last_str) {
1318 SV * const sv = data->last_found;
1319 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1320 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1324 sv_setsv(sv, last_str);
1325 data->last_end = data->pos_min;
1326 data->last_start_min =
1327 data->pos_min - CHR_SVLEN(last_str);
1328 data->last_start_max = is_inf
1330 : data->pos_min + data->pos_delta
1331 - CHR_SVLEN(last_str);
1333 data->longest = &(data->longest_float);
1335 SvREFCNT_dec(last_str);
1337 if (data && (fl & SF_HAS_EVAL))
1338 data->flags |= SF_HAS_EVAL;
1339 optimize_curly_tail:
1340 if (OP(oscan) != CURLYX) {
1341 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1343 NEXT_OFF(oscan) += NEXT_OFF(next);
1346 default: /* REF and CLUMP only? */
1347 if (flags & SCF_DO_SUBSTR) {
1348 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1349 data->longest = &(data->longest_float);
1351 is_inf = is_inf_internal = 1;
1352 if (flags & SCF_DO_STCLASS_OR)
1353 cl_anything(pRExC_state, data->start_class);
1354 flags &= ~SCF_DO_STCLASS;
1358 else if (strchr((const char*)PL_simple,OP(scan))) {
1361 if (flags & SCF_DO_SUBSTR) {
1362 scan_commit(pRExC_state,data);
1366 if (flags & SCF_DO_STCLASS) {
1367 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1369 /* Some of the logic below assumes that switching
1370 locale on will only add false positives. */
1371 switch (PL_regkind[(U8)OP(scan)]) {
1375 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1376 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1377 cl_anything(pRExC_state, data->start_class);
1380 if (OP(scan) == SANY)
1382 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1383 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1384 || (data->start_class->flags & ANYOF_CLASS));
1385 cl_anything(pRExC_state, data->start_class);
1387 if (flags & SCF_DO_STCLASS_AND || !value)
1388 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1391 if (flags & SCF_DO_STCLASS_AND)
1392 cl_and(data->start_class,
1393 (struct regnode_charclass_class*)scan);
1395 cl_or(pRExC_state, data->start_class,
1396 (struct regnode_charclass_class*)scan);
1399 if (flags & SCF_DO_STCLASS_AND) {
1400 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1401 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1402 for (value = 0; value < 256; value++)
1403 if (!isALNUM(value))
1404 ANYOF_BITMAP_CLEAR(data->start_class, value);
1408 if (data->start_class->flags & ANYOF_LOCALE)
1409 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1411 for (value = 0; value < 256; value++)
1413 ANYOF_BITMAP_SET(data->start_class, value);
1418 if (flags & SCF_DO_STCLASS_AND) {
1419 if (data->start_class->flags & ANYOF_LOCALE)
1420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1424 data->start_class->flags |= ANYOF_LOCALE;
1428 if (flags & SCF_DO_STCLASS_AND) {
1429 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1430 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1431 for (value = 0; value < 256; value++)
1433 ANYOF_BITMAP_CLEAR(data->start_class, value);
1437 if (data->start_class->flags & ANYOF_LOCALE)
1438 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1440 for (value = 0; value < 256; value++)
1441 if (!isALNUM(value))
1442 ANYOF_BITMAP_SET(data->start_class, value);
1447 if (flags & SCF_DO_STCLASS_AND) {
1448 if (data->start_class->flags & ANYOF_LOCALE)
1449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1452 data->start_class->flags |= ANYOF_LOCALE;
1453 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1457 if (flags & SCF_DO_STCLASS_AND) {
1458 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1459 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1460 for (value = 0; value < 256; value++)
1461 if (!isSPACE(value))
1462 ANYOF_BITMAP_CLEAR(data->start_class, value);
1466 if (data->start_class->flags & ANYOF_LOCALE)
1467 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1469 for (value = 0; value < 256; value++)
1471 ANYOF_BITMAP_SET(data->start_class, value);
1476 if (flags & SCF_DO_STCLASS_AND) {
1477 if (data->start_class->flags & ANYOF_LOCALE)
1478 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1481 data->start_class->flags |= ANYOF_LOCALE;
1482 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1486 if (flags & SCF_DO_STCLASS_AND) {
1487 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1488 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1489 for (value = 0; value < 256; value++)
1491 ANYOF_BITMAP_CLEAR(data->start_class, value);
1495 if (data->start_class->flags & ANYOF_LOCALE)
1496 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1498 for (value = 0; value < 256; value++)
1499 if (!isSPACE(value))
1500 ANYOF_BITMAP_SET(data->start_class, value);
1505 if (flags & SCF_DO_STCLASS_AND) {
1506 if (data->start_class->flags & ANYOF_LOCALE) {
1507 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508 for (value = 0; value < 256; value++)
1509 if (!isSPACE(value))
1510 ANYOF_BITMAP_CLEAR(data->start_class, value);
1514 data->start_class->flags |= ANYOF_LOCALE;
1515 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1519 if (flags & SCF_DO_STCLASS_AND) {
1520 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1521 for (value = 0; value < 256; value++)
1522 if (!isDIGIT(value))
1523 ANYOF_BITMAP_CLEAR(data->start_class, value);
1526 if (data->start_class->flags & ANYOF_LOCALE)
1527 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1529 for (value = 0; value < 256; value++)
1531 ANYOF_BITMAP_SET(data->start_class, value);
1536 if (flags & SCF_DO_STCLASS_AND) {
1537 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1538 for (value = 0; value < 256; value++)
1540 ANYOF_BITMAP_CLEAR(data->start_class, value);
1543 if (data->start_class->flags & ANYOF_LOCALE)
1544 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1546 for (value = 0; value < 256; value++)
1547 if (!isDIGIT(value))
1548 ANYOF_BITMAP_SET(data->start_class, value);
1553 if (flags & SCF_DO_STCLASS_OR)
1554 cl_and(data->start_class, &and_with);
1555 flags &= ~SCF_DO_STCLASS;
1558 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1559 data->flags |= (OP(scan) == MEOL
1563 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1564 /* Lookbehind, or need to calculate parens/evals/stclass: */
1565 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1566 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1567 /* Lookahead/lookbehind */
1568 I32 deltanext, minnext, fake = 0;
1570 struct regnode_charclass_class intrnl;
1573 data_fake.flags = 0;
1575 data_fake.whilem_c = data->whilem_c;
1576 data_fake.last_closep = data->last_closep;
1579 data_fake.last_closep = &fake;
1580 if ( flags & SCF_DO_STCLASS && !scan->flags
1581 && OP(scan) == IFMATCH ) { /* Lookahead */
1582 cl_init(pRExC_state, &intrnl);
1583 data_fake.start_class = &intrnl;
1584 f |= SCF_DO_STCLASS_AND;
1586 if (flags & SCF_WHILEM_VISITED_POS)
1587 f |= SCF_WHILEM_VISITED_POS;
1588 next = regnext(scan);
1589 nscan = NEXTOPER(NEXTOPER(scan));
1590 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1593 vFAIL("Variable length lookbehind not implemented");
1595 else if (minnext > U8_MAX) {
1596 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1598 scan->flags = (U8)minnext;
1600 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1602 if (data && (data_fake.flags & SF_HAS_EVAL))
1603 data->flags |= SF_HAS_EVAL;
1605 data->whilem_c = data_fake.whilem_c;
1606 if (f & SCF_DO_STCLASS_AND) {
1607 const int was = (data->start_class->flags & ANYOF_EOS);
1609 cl_and(data->start_class, &intrnl);
1611 data->start_class->flags |= ANYOF_EOS;
1614 else if (OP(scan) == OPEN) {
1617 else if (OP(scan) == CLOSE) {
1618 if ((I32)ARG(scan) == is_par) {
1619 next = regnext(scan);
1621 if ( next && (OP(next) != WHILEM) && next < last)
1622 is_par = 0; /* Disable optimization */
1625 *(data->last_closep) = ARG(scan);
1627 else if (OP(scan) == EVAL) {
1629 data->flags |= SF_HAS_EVAL;
1631 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1632 if (flags & SCF_DO_SUBSTR) {
1633 scan_commit(pRExC_state,data);
1634 data->longest = &(data->longest_float);
1636 is_inf = is_inf_internal = 1;
1637 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1638 cl_anything(pRExC_state, data->start_class);
1639 flags &= ~SCF_DO_STCLASS;
1641 /* Else: zero-length, ignore. */
1642 scan = regnext(scan);
1647 *deltap = is_inf_internal ? I32_MAX : delta;
1648 if (flags & SCF_DO_SUBSTR && is_inf)
1649 data->pos_delta = I32_MAX - data->pos_min;
1650 if (is_par > U8_MAX)
1652 if (is_par && pars==1 && data) {
1653 data->flags |= SF_IN_PAR;
1654 data->flags &= ~SF_HAS_PAR;
1656 else if (pars && data) {
1657 data->flags |= SF_HAS_PAR;
1658 data->flags &= ~SF_IN_PAR;
1660 if (flags & SCF_DO_STCLASS_OR)
1661 cl_and(data->start_class, &and_with);
1666 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
1668 U32 count = RExC_rx->data ? RExC_rx->data->count : 0;
1670 Renewc(RExC_rx->data,
1671 sizeof(*RExC_rx->data) + sizeof(void*) * (count + n - 1),
1672 char, struct reg_data);
1674 Renew(RExC_rx->data->what, count + n, U8);
1676 Newx(RExC_rx->data->what, n, U8);
1677 RExC_rx->data->count = count + n;
1678 Copy(s, RExC_rx->data->what + count, n, U8);
1683 Perl_reginitcolors(pTHX)
1685 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1687 char *t = savepv(s);
1691 t = strchr(t, '\t');
1697 PL_colors[i] = t = (char *)"";
1702 PL_colors[i++] = (char *)"";
1709 - pregcomp - compile a regular expression into internal code
1711 * We can't allocate space until we know how big the compiled form will be,
1712 * but we can't compile it (and thus know how big it is) until we've got a
1713 * place to put the code. So we cheat: we compile it twice, once with code
1714 * generation turned off and size counting turned on, and once "for real".
1715 * This also means that we don't allocate space until we are sure that the
1716 * thing really will compile successfully, and we never have to move the
1717 * code and thus invalidate pointers into it. (Note that it has to be in
1718 * one piece because free() must be able to free it all.) [NB: not true in perl]
1720 * Beware that the optimization-preparation code in here knows about some
1721 * of the structure of the compiled regexp. [I'll say.]
1724 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1734 RExC_state_t RExC_state;
1735 RExC_state_t *pRExC_state = &RExC_state;
1738 FAIL("NULL regexp argument");
1740 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1744 if (!PL_colorset) reginitcolors();
1745 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1746 PL_colors[4],PL_colors[5],PL_colors[0],
1747 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1749 RExC_flags = pm->op_pmflags;
1753 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1754 RExC_seen_evals = 0;
1757 /* First pass: determine size, legality. */
1764 RExC_emit = &PL_regdummy;
1765 RExC_whilem_seen = 0;
1766 #if 0 /* REGC() is (currently) a NOP at the first pass.
1767 * Clever compilers notice this and complain. --jhi */
1768 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1770 if (reg(pRExC_state, 0, &flags) == NULL) {
1771 RExC_precomp = NULL;
1774 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1776 /* Small enough for pointer-storage convention?
1777 If extralen==0, this means that we will not need long jumps. */
1778 if (RExC_size >= 0x10000L && RExC_extralen)
1779 RExC_size += RExC_extralen;
1782 if (RExC_whilem_seen > 15)
1783 RExC_whilem_seen = 15;
1785 /* Allocate space and initialize. */
1786 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1789 FAIL("Regexp out of space");
1792 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1793 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1796 r->prelen = xend - exp;
1797 r->precomp = savepvn(RExC_precomp, r->prelen);
1799 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1800 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1801 r->lastparen = 0; /* mg.c reads this. */
1803 r->substrs = 0; /* Useful during FAIL. */
1804 r->startp = 0; /* Useful during FAIL. */
1805 r->endp = 0; /* Useful during FAIL. */
1807 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1809 r->offsets[0] = RExC_size;
1811 DEBUG_r(PerlIO_printf(Perl_debug_log,
1812 "%s %"UVuf" bytes for offset annotations.\n",
1813 r->offsets ? "Got" : "Couldn't get",
1814 (UV)((2*RExC_size+1) * sizeof(U32))));
1818 /* Second pass: emit code. */
1819 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1824 RExC_emit_start = r->program;
1825 RExC_emit = r->program;
1826 /* Store the count of eval-groups for security checks: */
1827 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1828 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1830 if (reg(pRExC_state, 0, &flags) == NULL)
1833 /* Dig out information for optimizations. */
1834 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1835 pm->op_pmflags = RExC_flags;
1837 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1838 r->regstclass = NULL;
1839 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1840 r->reganch |= ROPT_NAUGHTY;
1841 scan = r->program + 1; /* First BRANCH. */
1843 /* XXXX To minimize changes to RE engine we always allocate
1844 3-units-long substrs field. */
1845 Newxz(r->substrs, 1, struct reg_substr_data);
1847 StructCopy(&zero_scan_data, &data, scan_data_t);
1848 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1849 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1851 STRLEN longest_float_length, longest_fixed_length;
1852 struct regnode_charclass_class ch_class;
1857 /* Skip introductions and multiplicators >= 1. */
1858 while ((OP(first) == OPEN && (sawopen = 1)) ||
1859 /* An OR of *one* alternative - should not happen now. */
1860 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1861 (OP(first) == PLUS) ||
1862 (OP(first) == MINMOD) ||
1863 /* An {n,m} with n>0 */
1864 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1865 if (OP(first) == PLUS)
1868 first += regarglen[(U8)OP(first)];
1869 first = NEXTOPER(first);
1872 /* Starting-point info. */
1874 if (PL_regkind[(U8)OP(first)] == EXACT) {
1875 if (OP(first) == EXACT)
1876 NOOP; /* Empty, get anchored substr later. */
1877 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1878 r->regstclass = first;
1880 else if (strchr((const char*)PL_simple,OP(first)))
1881 r->regstclass = first;
1882 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1883 PL_regkind[(U8)OP(first)] == NBOUND)
1884 r->regstclass = first;
1885 else if (PL_regkind[(U8)OP(first)] == BOL) {
1886 r->reganch |= (OP(first) == MBOL
1888 : (OP(first) == SBOL
1891 first = NEXTOPER(first);
1894 else if (OP(first) == GPOS) {
1895 r->reganch |= ROPT_ANCH_GPOS;
1896 first = NEXTOPER(first);
1899 else if ((!sawopen || !RExC_sawback) &&
1900 (OP(first) == STAR &&
1901 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1902 !(r->reganch & ROPT_ANCH) )
1904 /* turn .* into ^.* with an implied $*=1 */
1906 (OP(NEXTOPER(first)) == REG_ANY)
1909 r->reganch |= type | ROPT_IMPLICIT;
1910 first = NEXTOPER(first);
1913 if (sawplus && (!sawopen || !RExC_sawback)
1914 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1915 /* x+ must match at the 1st pos of run of x's */
1916 r->reganch |= ROPT_SKIP;
1918 /* Scan is after the zeroth branch, first is atomic matcher. */
1919 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1920 (IV)(first - scan + 1)));
1922 * If there's something expensive in the r.e., find the
1923 * longest literal string that must appear and make it the
1924 * regmust. Resolve ties in favor of later strings, since
1925 * the regstart check works with the beginning of the r.e.
1926 * and avoiding duplication strengthens checking. Not a
1927 * strong reason, but sufficient in the absence of others.
1928 * [Now we resolve ties in favor of the earlier string if
1929 * it happens that c_offset_min has been invalidated, since the
1930 * earlier string may buy us something the later one won't.]
1934 data.longest_fixed = newSVpvs("");
1935 data.longest_float = newSVpvs("");
1936 data.last_found = newSVpvs("");
1937 data.longest = &(data.longest_fixed);
1939 if (!r->regstclass) {
1940 cl_init(pRExC_state, &ch_class);
1941 data.start_class = &ch_class;
1942 stclass_flag = SCF_DO_STCLASS_AND;
1943 } else /* XXXX Check for BOUND? */
1945 data.last_closep = &last_close;
1947 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1948 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1949 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1950 && data.last_start_min == 0 && data.last_end > 0
1951 && !RExC_seen_zerolen
1952 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1953 r->reganch |= ROPT_CHECK_ALL;
1954 scan_commit(pRExC_state, &data);
1955 SvREFCNT_dec(data.last_found);
1957 longest_float_length = CHR_SVLEN(data.longest_float);
1958 if (longest_float_length
1959 || (data.flags & SF_FL_BEFORE_EOL
1960 && (!(data.flags & SF_FL_BEFORE_MEOL)
1961 || (RExC_flags & PMf_MULTILINE)))) {
1964 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1965 && data.offset_fixed == data.offset_float_min
1966 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1967 goto remove_float; /* As in (a)+. */
1969 if (SvUTF8(data.longest_float)) {
1970 r->float_utf8 = data.longest_float;
1971 r->float_substr = NULL;
1973 r->float_substr = data.longest_float;
1974 r->float_utf8 = NULL;
1976 r->float_min_offset = data.offset_float_min;
1977 r->float_max_offset = data.offset_float_max;
1978 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1979 && (!(data.flags & SF_FL_BEFORE_MEOL)
1980 || (RExC_flags & PMf_MULTILINE)));
1981 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1985 r->float_substr = r->float_utf8 = NULL;
1986 SvREFCNT_dec(data.longest_float);
1987 longest_float_length = 0;
1990 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1991 if (longest_fixed_length
1992 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1993 && (!(data.flags & SF_FIX_BEFORE_MEOL)
1994 || (RExC_flags & PMf_MULTILINE)))) {
1997 if (SvUTF8(data.longest_fixed)) {
1998 r->anchored_utf8 = data.longest_fixed;
1999 r->anchored_substr = NULL;
2001 r->anchored_substr = data.longest_fixed;
2002 r->anchored_utf8 = NULL;
2004 r->anchored_offset = data.offset_fixed;
2005 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2006 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2007 || (RExC_flags & PMf_MULTILINE)));
2008 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2011 r->anchored_substr = r->anchored_utf8 = NULL;
2012 SvREFCNT_dec(data.longest_fixed);
2013 longest_fixed_length = 0;
2016 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2017 r->regstclass = NULL;
2018 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2020 && !(data.start_class->flags & ANYOF_EOS)
2021 && !cl_is_anything(data.start_class))
2023 const U32 n = add_data(pRExC_state, 1, "f");
2025 Newx(RExC_rx->data->data[n], 1,
2026 struct regnode_charclass_class);
2027 StructCopy(data.start_class,
2028 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2029 struct regnode_charclass_class);
2030 r->regstclass = (regnode*)RExC_rx->data->data[n];
2031 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2032 PL_regdata = r->data; /* for regprop() */
2033 DEBUG_r({ SV *sv = sv_newmortal();
2034 regprop(sv, (regnode*)data.start_class);
2035 PerlIO_printf(Perl_debug_log,
2036 "synthetic stclass \"%s\".\n",
2037 SvPVX_const(sv));});
2040 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2041 if (longest_fixed_length > longest_float_length) {
2042 r->check_substr = r->anchored_substr;
2043 r->check_utf8 = r->anchored_utf8;
2044 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2045 if (r->reganch & ROPT_ANCH_SINGLE)
2046 r->reganch |= ROPT_NOSCAN;
2049 r->check_substr = r->float_substr;
2050 r->check_utf8 = r->float_utf8;
2051 r->check_offset_min = data.offset_float_min;
2052 r->check_offset_max = data.offset_float_max;
2054 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2055 This should be changed ASAP! */
2056 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2057 r->reganch |= RE_USE_INTUIT;
2058 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2059 r->reganch |= RE_INTUIT_TAIL;
2063 /* Several toplevels. Best we can is to set minlen. */
2065 struct regnode_charclass_class ch_class;
2068 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2069 scan = r->program + 1;
2070 cl_init(pRExC_state, &ch_class);
2071 data.start_class = &ch_class;
2072 data.last_closep = &last_close;
2073 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2074 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2075 = r->float_substr = r->float_utf8 = NULL;
2076 if (!(data.start_class->flags & ANYOF_EOS)
2077 && !cl_is_anything(data.start_class))
2079 const U32 n = add_data(pRExC_state, 1, "f");
2081 Newx(RExC_rx->data->data[n], 1,
2082 struct regnode_charclass_class);
2083 StructCopy(data.start_class,
2084 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2085 struct regnode_charclass_class);
2086 r->regstclass = (regnode*)RExC_rx->data->data[n];
2087 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2088 DEBUG_r({ SV* sv = sv_newmortal();
2089 regprop(sv, (regnode*)data.start_class);
2090 PerlIO_printf(Perl_debug_log,
2091 "synthetic stclass \"%s\".\n",
2092 SvPVX_const(sv));});
2097 if (RExC_seen & REG_SEEN_GPOS)
2098 r->reganch |= ROPT_GPOS_SEEN;
2099 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2100 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2101 if (RExC_seen & REG_SEEN_EVAL)
2102 r->reganch |= ROPT_EVAL_SEEN;
2103 if (RExC_seen & REG_SEEN_CANY)
2104 r->reganch |= ROPT_CANY_SEEN;
2105 Newxz(r->startp, RExC_npar, I32);
2106 Newxz(r->endp, RExC_npar, I32);
2107 PL_regdata = r->data; /* for regprop() */
2108 DEBUG_r(regdump(r));
2113 - reg - regular expression, i.e. main body or parenthesized thing
2115 * Caller must absorb opening parenthesis.
2117 * Combining parenthesis handling with the base level of regular expression
2118 * is a trifle forced, but the need to tie the tails of the branches to what
2119 * follows makes it hard to avoid.
2122 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2123 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2125 register regnode *ret; /* Will be the head of the group. */
2126 register regnode *br;
2127 register regnode *lastbr;
2128 register regnode *ender = NULL;
2129 register I32 parno = 0;
2131 const I32 oregflags = RExC_flags;
2132 bool have_branch = 0;
2135 /* for (?g), (?gc), and (?o) warnings; warning
2136 about (?c) will warn about (?g) -- japhy */
2138 #define WASTED_O 0x01
2139 #define WASTED_G 0x02
2140 #define WASTED_C 0x04
2141 #define WASTED_GC (0x02|0x04)
2142 I32 wastedflags = 0x00;
2144 char * parse_start = RExC_parse; /* MJD */
2145 char * const oregcomp_parse = RExC_parse;
2147 *flagp = 0; /* Tentatively. */
2150 /* Make an OPEN node, if parenthesized. */
2152 if (*RExC_parse == '?') { /* (?...) */
2153 U32 posflags = 0, negflags = 0;
2154 U32 *flagsp = &posflags;
2155 bool is_logical = 0;
2156 const char * const seqstart = RExC_parse;
2159 paren = *RExC_parse++;
2160 ret = NULL; /* For look-ahead/behind. */
2162 case '<': /* (?<...) */
2163 RExC_seen |= REG_SEEN_LOOKBEHIND;
2164 if (*RExC_parse == '!')
2166 if (*RExC_parse != '=' && *RExC_parse != '!')
2169 case '=': /* (?=...) */
2170 case '!': /* (?!...) */
2171 RExC_seen_zerolen++;
2172 case ':': /* (?:...) */
2173 case '>': /* (?>...) */
2175 case '$': /* (?$...) */
2176 case '@': /* (?@...) */
2177 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2179 case '#': /* (?#...) */
2180 while (*RExC_parse && *RExC_parse != ')')
2182 if (*RExC_parse != ')')
2183 FAIL("Sequence (?#... not terminated");
2184 nextchar(pRExC_state);
2187 case 'p': /* (?p...) */
2188 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2189 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2191 case '?': /* (??...) */
2193 if (*RExC_parse != '{')
2195 paren = *RExC_parse++;
2197 case '{': /* (?{...}) */
2202 char *s = RExC_parse;
2204 RExC_seen_zerolen++;
2205 RExC_seen |= REG_SEEN_EVAL;
2206 while (count && (c = *RExC_parse)) {
2217 if (*RExC_parse != ')') {
2219 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2223 OP_4tree *sop, *rop;
2224 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
2227 Perl_save_re_context(aTHX);
2228 rop = sv_compile_2op(sv, &sop, "re", &pad);
2229 sop->op_private |= OPpREFCOUNTED;
2230 /* re_dup will OpREFCNT_inc */
2231 OpREFCNT_set(sop, 1);
2234 n = add_data(pRExC_state, 3, "nop");
2235 RExC_rx->data->data[n] = (void*)rop;
2236 RExC_rx->data->data[n+1] = (void*)sop;
2237 RExC_rx->data->data[n+2] = (void*)pad;
2240 else { /* First pass */
2241 if (PL_reginterp_cnt < ++RExC_seen_evals
2243 /* No compiled RE interpolated, has runtime
2244 components ===> unsafe. */
2245 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2246 if (PL_tainting && PL_tainted)
2247 FAIL("Eval-group in insecure regular expression");
2250 nextchar(pRExC_state);
2252 ret = reg_node(pRExC_state, LOGICAL);
2255 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2256 /* deal with the length of this later - MJD */
2259 ret = reganode(pRExC_state, EVAL, n);
2260 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2261 Set_Node_Offset(ret, parse_start);
2264 case '(': /* (?(?{...})...) and (?(?=...)...) */
2266 if (RExC_parse[0] == '?') { /* (?(?...)) */
2267 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2268 || RExC_parse[1] == '<'
2269 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2272 ret = reg_node(pRExC_state, LOGICAL);
2275 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2279 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2282 parno = atoi(RExC_parse++);
2284 while (isDIGIT(*RExC_parse))
2286 ret = reganode(pRExC_state, GROUPP, parno);
2288 if ((c = *nextchar(pRExC_state)) != ')')
2289 vFAIL("Switch condition not recognized");
2291 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2292 br = regbranch(pRExC_state, &flags, 1);
2294 br = reganode(pRExC_state, LONGJMP, 0);
2296 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2297 c = *nextchar(pRExC_state);
2301 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2302 regbranch(pRExC_state, &flags, 1);
2303 regtail(pRExC_state, ret, lastbr);
2306 c = *nextchar(pRExC_state);
2311 vFAIL("Switch (?(condition)... contains too many branches");
2312 ender = reg_node(pRExC_state, TAIL);
2313 regtail(pRExC_state, br, ender);
2315 regtail(pRExC_state, lastbr, ender);
2316 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2319 regtail(pRExC_state, ret, ender);
2323 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2327 RExC_parse--; /* for vFAIL to print correctly */
2328 vFAIL("Sequence (? incomplete");
2332 parse_flags: /* (?i) */
2333 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2334 /* (?g), (?gc) and (?o) are useless here
2335 and must be globally applied -- japhy */
2337 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2338 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2339 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
2340 if (! (wastedflags & wflagbit) ) {
2341 wastedflags |= wflagbit;
2344 "Useless (%s%c) - %suse /%c modifier",
2345 flagsp == &negflags ? "?-" : "?",
2347 flagsp == &negflags ? "don't " : "",
2353 else if (*RExC_parse == 'c') {
2354 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2355 if (! (wastedflags & WASTED_C) ) {
2356 wastedflags |= WASTED_GC;
2359 "Useless (%sc) - %suse /gc modifier",
2360 flagsp == &negflags ? "?-" : "?",
2361 flagsp == &negflags ? "don't " : ""
2366 else { pmflag(flagsp, *RExC_parse); }
2370 if (*RExC_parse == '-') {
2372 wastedflags = 0; /* reset so (?g-c) warns twice */
2376 RExC_flags |= posflags;
2377 RExC_flags &= ~negflags;
2378 if (*RExC_parse == ':') {
2384 if (*RExC_parse != ')') {
2386 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2388 nextchar(pRExC_state);
2396 ret = reganode(pRExC_state, OPEN, parno);
2397 Set_Node_Length(ret, 1); /* MJD */
2398 Set_Node_Offset(ret, RExC_parse); /* MJD */
2405 /* Pick up the branches, linking them together. */
2406 parse_start = RExC_parse; /* MJD */
2407 br = regbranch(pRExC_state, &flags, 1);
2408 /* branch_len = (paren != 0); */
2412 if (*RExC_parse == '|') {
2413 if (!SIZE_ONLY && RExC_extralen) {
2414 reginsert(pRExC_state, BRANCHJ, br);
2417 reginsert(pRExC_state, BRANCH, br);
2418 Set_Node_Length(br, paren != 0);
2419 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2423 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2425 else if (paren == ':') {
2426 *flagp |= flags&SIMPLE;
2428 if (is_open) { /* Starts with OPEN. */
2429 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2431 else if (paren != '?') /* Not Conditional */
2433 *flagp |= flags & (SPSTART | HASWIDTH);
2435 while (*RExC_parse == '|') {
2436 if (!SIZE_ONLY && RExC_extralen) {
2437 ender = reganode(pRExC_state, LONGJMP,0);
2438 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2441 RExC_extralen += 2; /* Account for LONGJMP. */
2442 nextchar(pRExC_state);
2443 br = regbranch(pRExC_state, &flags, 0);
2447 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2451 *flagp |= flags&SPSTART;
2454 if (have_branch || paren != ':') {
2455 /* Make a closing node, and hook it on the end. */
2458 ender = reg_node(pRExC_state, TAIL);
2461 ender = reganode(pRExC_state, CLOSE, parno);
2462 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2463 Set_Node_Length(ender,1); /* MJD */
2469 *flagp &= ~HASWIDTH;
2472 ender = reg_node(pRExC_state, SUCCEED);
2475 ender = reg_node(pRExC_state, END);
2478 regtail(pRExC_state, lastbr, ender);
2480 if (have_branch && !SIZE_ONLY) {
2481 /* Hook the tails of the branches to the closing node. */
2482 for (br = ret; br; br = regnext(br)) {
2483 const U8 op = PL_regkind[OP(br)];
2485 regtail(pRExC_state, NEXTOPER(br), ender);
2487 else if (op == BRANCHJ) {
2488 regtail(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
2496 static const char parens[] = "=!<,>";
2498 if (paren && (p = strchr(parens, paren))) {
2499 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2500 int flag = (p - parens) > 1;
2503 node = SUSPEND, flag = 0;
2504 reginsert(pRExC_state, node,ret);
2505 Set_Node_Cur_Length(ret);
2506 Set_Node_Offset(ret, parse_start + 1);
2508 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2512 /* Check for proper termination. */
2514 RExC_flags = oregflags;
2515 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2516 RExC_parse = oregcomp_parse;
2517 vFAIL("Unmatched (");
2520 else if (!paren && RExC_parse < RExC_end) {
2521 if (*RExC_parse == ')') {
2523 vFAIL("Unmatched )");
2526 FAIL("Junk on end of regexp"); /* "Can't happen". */
2534 - regbranch - one alternative of an | operator
2536 * Implements the concatenation operator.
2539 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2541 register regnode *ret;
2542 register regnode *chain = NULL;
2543 register regnode *latest;
2544 I32 flags = 0, c = 0;
2549 if (!SIZE_ONLY && RExC_extralen)
2550 ret = reganode(pRExC_state, BRANCHJ,0);
2552 ret = reg_node(pRExC_state, BRANCH);
2553 Set_Node_Length(ret, 1);
2557 if (!first && SIZE_ONLY)
2558 RExC_extralen += 1; /* BRANCHJ */
2560 *flagp = WORST; /* Tentatively. */
2563 nextchar(pRExC_state);
2564 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2566 latest = regpiece(pRExC_state, &flags);
2567 if (latest == NULL) {
2568 if (flags & TRYAGAIN)
2572 else if (ret == NULL)
2574 *flagp |= flags&HASWIDTH;
2575 if (chain == NULL) /* First piece. */
2576 *flagp |= flags&SPSTART;
2579 regtail(pRExC_state, chain, latest);
2584 if (chain == NULL) { /* Loop ran zero times. */
2585 chain = reg_node(pRExC_state, NOTHING);
2590 *flagp |= flags&SIMPLE;
2597 - regpiece - something followed by possible [*+?]
2599 * Note that the branching code sequences used for ? and the general cases
2600 * of * and + are somewhat optimized: they use the same NOTHING node as
2601 * both the endmarker for their branch list and the body of the last branch.
2602 * It might seem that this node could be dispensed with entirely, but the
2603 * endmarker role is not redundant.
2606 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2608 register regnode *ret;
2610 register char *next;
2612 const char * const origparse = RExC_parse;
2614 I32 max = REG_INFTY;
2616 const char *maxpos = NULL;
2618 ret = regatom(pRExC_state, &flags);
2620 if (flags & TRYAGAIN)
2627 if (op == '{' && regcurly(RExC_parse)) {
2628 parse_start = RExC_parse; /* MJD */
2629 next = RExC_parse + 1;
2631 while (isDIGIT(*next) || *next == ',') {
2640 if (*next == '}') { /* got one */
2644 min = atoi(RExC_parse);
2648 maxpos = RExC_parse;
2650 if (!max && *maxpos != '0')
2651 max = REG_INFTY; /* meaning "infinity" */
2652 else if (max >= REG_INFTY)
2653 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2655 nextchar(pRExC_state);
2658 if ((flags&SIMPLE)) {
2659 RExC_naughty += 2 + RExC_naughty / 2;
2660 reginsert(pRExC_state, CURLY, ret);
2661 Set_Node_Offset(ret, parse_start+1); /* MJD */
2662 Set_Node_Cur_Length(ret);
2665 regnode *w = reg_node(pRExC_state, WHILEM);
2668 regtail(pRExC_state, ret, w);
2669 if (!SIZE_ONLY && RExC_extralen) {
2670 reginsert(pRExC_state, LONGJMP,ret);
2671 reginsert(pRExC_state, NOTHING,ret);
2672 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2674 reginsert(pRExC_state, CURLYX,ret);
2676 Set_Node_Offset(ret, parse_start+1);
2677 Set_Node_Length(ret,
2678 op == '{' ? (RExC_parse - parse_start) : 1);
2680 if (!SIZE_ONLY && RExC_extralen)
2681 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2682 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2684 RExC_whilem_seen++, RExC_extralen += 3;
2685 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2693 if (max && max < min)
2694 vFAIL("Can't do {n,m} with n > m");
2696 ARG1_SET(ret, (U16)min);
2697 ARG2_SET(ret, (U16)max);
2709 #if 0 /* Now runtime fix should be reliable. */
2711 /* if this is reinstated, don't forget to put this back into perldiag:
2713 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2715 (F) The part of the regexp subject to either the * or + quantifier
2716 could match an empty string. The {#} shows in the regular
2717 expression about where the problem was discovered.
2721 if (!(flags&HASWIDTH) && op != '?')
2722 vFAIL("Regexp *+ operand could be empty");
2725 parse_start = RExC_parse;
2726 nextchar(pRExC_state);
2728 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2730 if (op == '*' && (flags&SIMPLE)) {
2731 reginsert(pRExC_state, STAR, ret);
2735 else if (op == '*') {
2739 else if (op == '+' && (flags&SIMPLE)) {
2740 reginsert(pRExC_state, PLUS, ret);
2744 else if (op == '+') {
2748 else if (op == '?') {
2753 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
2755 "%.*s matches null string many times",
2756 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
2760 if (*RExC_parse == '?') {
2761 nextchar(pRExC_state);
2762 reginsert(pRExC_state, MINMOD, ret);
2763 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2765 if (ISMULT2(RExC_parse)) {
2767 vFAIL("Nested quantifiers");
2774 - regatom - the lowest level
2776 * Optimization: gobbles an entire sequence of ordinary characters so that
2777 * it can turn them into a single node, which is smaller to store and
2778 * faster to run. Backslashed characters are exceptions, each becoming a
2779 * separate node; the code is simpler that way and it's not worth fixing.
2781 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2783 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2785 register regnode *ret = NULL;
2787 char *parse_start = RExC_parse;
2789 *flagp = WORST; /* Tentatively. */
2792 switch (*RExC_parse) {
2794 RExC_seen_zerolen++;
2795 nextchar(pRExC_state);
2796 if (RExC_flags & PMf_MULTILINE)
2797 ret = reg_node(pRExC_state, MBOL);
2798 else if (RExC_flags & PMf_SINGLELINE)
2799 ret = reg_node(pRExC_state, SBOL);
2801 ret = reg_node(pRExC_state, BOL);
2802 Set_Node_Length(ret, 1); /* MJD */
2805 nextchar(pRExC_state);
2807 RExC_seen_zerolen++;
2808 if (RExC_flags & PMf_MULTILINE)
2809 ret = reg_node(pRExC_state, MEOL);
2810 else if (RExC_flags & PMf_SINGLELINE)
2811 ret = reg_node(pRExC_state, SEOL);
2813 ret = reg_node(pRExC_state, EOL);
2814 Set_Node_Length(ret, 1); /* MJD */
2817 nextchar(pRExC_state);
2818 if (RExC_flags & PMf_SINGLELINE)
2819 ret = reg_node(pRExC_state, SANY);
2821 ret = reg_node(pRExC_state, REG_ANY);
2822 *flagp |= HASWIDTH|SIMPLE;
2824 Set_Node_Length(ret, 1); /* MJD */
2828 char *oregcomp_parse = ++RExC_parse;
2829 ret = regclass(pRExC_state);
2830 if (*RExC_parse != ']') {
2831 RExC_parse = oregcomp_parse;
2832 vFAIL("Unmatched [");
2834 nextchar(pRExC_state);
2835 *flagp |= HASWIDTH|SIMPLE;
2836 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2840 nextchar(pRExC_state);
2841 ret = reg(pRExC_state, 1, &flags);
2843 if (flags & TRYAGAIN) {
2844 if (RExC_parse == RExC_end) {
2845 /* Make parent create an empty node if needed. */
2853 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2857 if (flags & TRYAGAIN) {
2861 vFAIL("Internal urp");
2862 /* Supposed to be caught earlier. */
2865 if (!regcurly(RExC_parse)) {
2874 vFAIL("Quantifier follows nothing");
2877 switch (*++RExC_parse) {
2879 RExC_seen_zerolen++;
2880 ret = reg_node(pRExC_state, SBOL);
2882 nextchar(pRExC_state);
2883 Set_Node_Length(ret, 2); /* MJD */
2886 ret = reg_node(pRExC_state, GPOS);
2887 RExC_seen |= REG_SEEN_GPOS;
2889 nextchar(pRExC_state);
2890 Set_Node_Length(ret, 2); /* MJD */
2893 ret = reg_node(pRExC_state, SEOL);
2895 RExC_seen_zerolen++; /* Do not optimize RE away */
2896 nextchar(pRExC_state);
2899 ret = reg_node(pRExC_state, EOS);
2901 RExC_seen_zerolen++; /* Do not optimize RE away */
2902 nextchar(pRExC_state);
2903 Set_Node_Length(ret, 2); /* MJD */
2906 ret = reg_node(pRExC_state, CANY);
2907 RExC_seen |= REG_SEEN_CANY;
2908 *flagp |= HASWIDTH|SIMPLE;
2909 nextchar(pRExC_state);
2910 Set_Node_Length(ret, 2); /* MJD */
2913 ret = reg_node(pRExC_state, CLUMP);
2915 nextchar(pRExC_state);
2916 Set_Node_Length(ret, 2); /* MJD */
2919 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2920 *flagp |= HASWIDTH|SIMPLE;
2921 nextchar(pRExC_state);
2922 Set_Node_Length(ret, 2); /* MJD */
2925 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2926 *flagp |= HASWIDTH|SIMPLE;
2927 nextchar(pRExC_state);
2928 Set_Node_Length(ret, 2); /* MJD */
2931 RExC_seen_zerolen++;
2932 RExC_seen |= REG_SEEN_LOOKBEHIND;
2933 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2939 RExC_seen_zerolen++;
2940 RExC_seen |= REG_SEEN_LOOKBEHIND;
2941 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2943 nextchar(pRExC_state);
2944 Set_Node_Length(ret, 2); /* MJD */
2947 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2948 *flagp |= HASWIDTH|SIMPLE;
2949 nextchar(pRExC_state);
2950 Set_Node_Length(ret, 2); /* MJD */
2953 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2954 *flagp |= HASWIDTH|SIMPLE;
2955 nextchar(pRExC_state);
2956 Set_Node_Length(ret, 2); /* MJD */
2959 ret = reg_node(pRExC_state, DIGIT);
2960 *flagp |= HASWIDTH|SIMPLE;
2961 nextchar(pRExC_state);
2962 Set_Node_Length(ret, 2); /* MJD */
2965 ret = reg_node(pRExC_state, NDIGIT);
2966 *flagp |= HASWIDTH|SIMPLE;
2967 nextchar(pRExC_state);
2968 Set_Node_Length(ret, 2); /* MJD */
2973 char* oldregxend = RExC_end;
2974 char* parse_start = RExC_parse - 2;
2976 if (RExC_parse[1] == '{') {
2977 /* a lovely hack--pretend we saw [\pX] instead */
2978 RExC_end = strchr(RExC_parse, '}');
2980 U8 c = (U8)*RExC_parse;
2982 RExC_end = oldregxend;
2983 vFAIL2("Missing right brace on \\%c{}", c);
2988 RExC_end = RExC_parse + 2;
2989 if (RExC_end > oldregxend)
2990 RExC_end = oldregxend;
2994 ret = regclass(pRExC_state);
2996 RExC_end = oldregxend;
2999 Set_Node_Offset(ret, parse_start + 2);
3000 Set_Node_Cur_Length(ret);
3001 nextchar(pRExC_state);
3002 *flagp |= HASWIDTH|SIMPLE;
3015 case '1': case '2': case '3': case '4':
3016 case '5': case '6': case '7': case '8': case '9':
3018 const I32 num = atoi(RExC_parse);
3020 if (num > 9 && num >= RExC_npar)
3023 char * parse_start = RExC_parse - 1; /* MJD */
3024 while (isDIGIT(*RExC_parse))
3027 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3028 vFAIL("Reference to nonexistent group");
3030 ret = reganode(pRExC_state,
3031 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3035 /* override incorrect value set in reganode MJD */
3036 Set_Node_Offset(ret, parse_start+1);
3037 Set_Node_Cur_Length(ret); /* MJD */
3039 nextchar(pRExC_state);
3044 if (RExC_parse >= RExC_end)
3045 FAIL("Trailing \\");
3048 /* Do not generate "unrecognized" warnings here, we fall
3049 back into the quick-grab loop below */
3056 if (RExC_flags & PMf_EXTENDED) {
3057 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3058 if (RExC_parse < RExC_end)
3064 register STRLEN len;
3069 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3071 parse_start = RExC_parse - 1;
3077 ret = reg_node(pRExC_state,
3078 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3080 for (len = 0, p = RExC_parse - 1;
3081 len < 127 && p < RExC_end;
3086 if (RExC_flags & PMf_EXTENDED)
3087 p = regwhite(p, RExC_end);
3134 ender = ASCII_TO_NATIVE('\033');
3138 ender = ASCII_TO_NATIVE('\007');
3143 char* const e = strchr(p, '}');
3147 vFAIL("Missing right brace on \\x{}");
3150 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3151 | PERL_SCAN_DISALLOW_PREFIX;
3152 STRLEN numlen = e - p - 1;
3153 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3160 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3162 ender = grok_hex(p, &numlen, &flags, NULL);
3168 ender = UCHARAT(p++);
3169 ender = toCTRL(ender);
3171 case '0': case '1': case '2': case '3':case '4':
3172 case '5': case '6': case '7': case '8':case '9':
3174 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3177 ender = grok_oct(p, &numlen, &flags, NULL);
3187 FAIL("Trailing \\");
3190 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
3191 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3192 goto normal_default;
3197 if (UTF8_IS_START(*p) && UTF) {
3199 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3207 if (RExC_flags & PMf_EXTENDED)
3208 p = regwhite(p, RExC_end);
3210 /* Prime the casefolded buffer. */
3211 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3213 if (ISMULT2(p)) { /* Back off on ?+*. */
3218 /* Emit all the Unicode characters. */
3220 for (foldbuf = tmpbuf;
3222 foldlen -= numlen) {
3223 ender = utf8_to_uvchr(foldbuf, &numlen);
3225 const STRLEN unilen = reguni(pRExC_state, ender, s);
3228 /* In EBCDIC the numlen
3229 * and unilen can differ. */
3231 if (numlen >= foldlen)
3235 break; /* "Can't happen." */
3239 const STRLEN unilen = reguni(pRExC_state, ender, s);
3248 REGC((char)ender, s++);
3254 /* Emit all the Unicode characters. */
3256 for (foldbuf = tmpbuf;
3258 foldlen -= numlen) {
3259 ender = utf8_to_uvchr(foldbuf, &numlen);
3261 const STRLEN unilen = reguni(pRExC_state, ender, s);
3264 /* In EBCDIC the numlen
3265 * and unilen can differ. */
3267 if (numlen >= foldlen)
3275 const STRLEN unilen = reguni(pRExC_state, ender, s);
3284 REGC((char)ender, s++);
3288 Set_Node_Cur_Length(ret); /* MJD */
3289 nextchar(pRExC_state);
3291 /* len is STRLEN which is unsigned, need to copy to signed */
3294 vFAIL("Internal disaster");
3298 if (len == 1 && UNI_IS_INVARIANT(ender))
3303 RExC_size += STR_SZ(len);
3305 RExC_emit += STR_SZ(len);
3310 /* If the encoding pragma is in effect recode the text of
3311 * any EXACT-kind nodes. */
3312 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3313 STRLEN oldlen = STR_LEN(ret);
3314 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3318 if (sv_utf8_downgrade(sv, TRUE)) {
3319 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3320 const STRLEN newlen = SvCUR(sv);
3325 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3326 (int)oldlen, STRING(ret),
3328 Copy(s, STRING(ret), newlen, char);
3329 STR_LEN(ret) += newlen - oldlen;
3330 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3332 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3340 S_regwhite(char *p, const char *e)
3345 else if (*p == '#') {
3348 } while (p < e && *p != '\n');
3356 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3357 Character classes ([:foo:]) can also be negated ([:^foo:]).
3358 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3359 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3360 but trigger failures because they are currently unimplemented. */
3362 #define POSIXCC_DONE(c) ((c) == ':')
3363 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3364 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3367 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3369 I32 namedclass = OOB_NAMEDCLASS;
3371 if (value == '[' && RExC_parse + 1 < RExC_end &&
3372 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3373 POSIXCC(UCHARAT(RExC_parse))) {
3374 const char c = UCHARAT(RExC_parse);
3375 char* const s = RExC_parse++;
3377 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3379 if (RExC_parse == RExC_end)
3380 /* Grandfather lone [:, [=, [. */
3383 const char* t = RExC_parse++; /* skip over the c */
3384 const char *posixcc;
3388 if (UCHARAT(RExC_parse) == ']') {
3389 RExC_parse++; /* skip over the ending ] */
3392 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3393 const I32 skip = t - posixcc;
3395 /* Initially switch on the length of the name. */
3398 if (memEQ(posixcc, "word", 4)) {
3399 /* this is not POSIX, this is the Perl \w */;
3401 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3405 /* Names all of length 5. */
3406 /* alnum alpha ascii blank cntrl digit graph lower
3407 print punct space upper */
3408 /* Offset 4 gives the best switch position. */
3409 switch (posixcc[4]) {
3411 if (memEQ(posixcc, "alph", 4)) {
3414 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3418 if (memEQ(posixcc, "spac", 4)) {
3421 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3425 if (memEQ(posixcc, "grap", 4)) {
3428 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3432 if (memEQ(posixcc, "asci", 4)) {
3435 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3439 if (memEQ(posixcc, "blan", 4)) {
3442 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3446 if (memEQ(posixcc, "cntr", 4)) {
3449 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3453 if (memEQ(posixcc, "alnu", 4)) {
3456 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3460 if (memEQ(posixcc, "lowe", 4)) {
3463 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3465 if (memEQ(posixcc, "uppe", 4)) {
3468 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3472 if (memEQ(posixcc, "digi", 4)) {
3475 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3477 if (memEQ(posixcc, "prin", 4)) {
3480 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3482 if (memEQ(posixcc, "punc", 4)) {
3485 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3491 if (memEQ(posixcc, "xdigit", 6)) {
3493 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3498 if (namedclass == OOB_NAMEDCLASS)
3500 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3503 assert (posixcc[skip] == ':');
3504 assert (posixcc[skip+1] == ']');
3505 } else if (!SIZE_ONLY) {
3506 /* [[=foo=]] and [[.foo.]] are still future. */
3508 /* adjust RExC_parse so the warning shows after
3510 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3512 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3515 /* Maternal grandfather:
3516 * "[:" ending in ":" but not in ":]" */
3526 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3528 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3529 const char *s = RExC_parse;
3530 const char c = *s++;
3532 while(*s && isALNUM(*s))
3534 if (*s && c == *s && s[1] == ']') {
3535 if (ckWARN(WARN_REGEXP))
3537 "POSIX syntax [%c %c] belongs inside character classes",
3540 /* [[=foo=]] and [[.foo.]] are still future. */
3541 if (POSIXCC_NOTYET(c)) {
3542 /* adjust RExC_parse so the error shows after
3544 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3546 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3553 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3555 register UV value = 0;
3556 register UV nextvalue;
3557 register IV prevvalue = OOB_UNICODE;
3558 register IV range = 0;
3559 register regnode *ret;
3562 char *rangebegin = NULL;
3563 bool need_class = 0;
3567 bool optimize_invert = TRUE;
3568 AV* unicode_alternate = NULL;
3570 UV literal_endpoint = 0;
3573 ret = reganode(pRExC_state, ANYOF, 0);
3576 ANYOF_FLAGS(ret) = 0;
3578 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3582 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3586 RExC_size += ANYOF_SKIP;
3587 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
3590 RExC_emit += ANYOF_SKIP;
3592 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3594 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3595 ANYOF_BITMAP_ZERO(ret);
3596 listsv = newSVpvs("# comment\n");
3599 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3601 if (!SIZE_ONLY && POSIXCC(nextvalue))
3602 checkposixcc(pRExC_state);
3604 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3605 if (UCHARAT(RExC_parse) == ']')
3608 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3612 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3615 rangebegin = RExC_parse;
3617 value = utf8n_to_uvchr((U8*)RExC_parse,
3618 RExC_end - RExC_parse,
3620 RExC_parse += numlen;
3623 value = UCHARAT(RExC_parse++);
3624 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3625 if (value == '[' && POSIXCC(nextvalue))
3626 namedclass = regpposixcc(pRExC_state, value);
3627 else if (value == '\\') {
3629 value = utf8n_to_uvchr((U8*)RExC_parse,
3630 RExC_end - RExC_parse,
3632 RExC_parse += numlen;
3635 value = UCHARAT(RExC_parse++);
3636 /* Some compilers cannot handle switching on 64-bit integer
3637 * values, therefore value cannot be an UV. Yes, this will
3638 * be a problem later if we want switch on Unicode.
3639 * A similar issue a little bit later when switching on
3640 * namedclass. --jhi */
3641 switch ((I32)value) {
3642 case 'w': namedclass = ANYOF_ALNUM; break;
3643 case 'W': namedclass = ANYOF_NALNUM; break;
3644 case 's': namedclass = ANYOF_SPACE; break;
3645 case 'S': namedclass = ANYOF_NSPACE; break;
3646 case 'd': namedclass = ANYOF_DIGIT; break;
3647 case 'D': namedclass = ANYOF_NDIGIT; break;
3650 if (RExC_parse >= RExC_end)
3651 vFAIL2("Empty \\%c{}", (U8)value);
3652 if (*RExC_parse == '{') {
3653 const U8 c = (U8)value;
3654 e = strchr(RExC_parse++, '}');
3656 vFAIL2("Missing right brace on \\%c{}", c);
3657 while (isSPACE(UCHARAT(RExC_parse)))
3659 if (e == RExC_parse)
3660 vFAIL2("Empty \\%c{}", c);
3662 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3670 if (UCHARAT(RExC_parse) == '^') {
3673 value = value == 'p' ? 'P' : 'p'; /* toggle */
3674 while (isSPACE(UCHARAT(RExC_parse))) {
3679 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
3680 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
3683 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3684 namedclass = ANYOF_MAX; /* no official name, but it's named */
3686 case 'n': value = '\n'; break;
3687 case 'r': value = '\r'; break;
3688 case 't': value = '\t'; break;
3689 case 'f': value = '\f'; break;
3690 case 'b': value = '\b'; break;
3691 case 'e': value = ASCII_TO_NATIVE('\033');break;
3692 case 'a': value = ASCII_TO_NATIVE('\007');break;
3694 if (*RExC_parse == '{') {
3695 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3696 | PERL_SCAN_DISALLOW_PREFIX;
3697 e = strchr(RExC_parse++, '}');
3699 vFAIL("Missing right brace on \\x{}");
3701 numlen = e - RExC_parse;
3702 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3706 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3708 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3709 RExC_parse += numlen;
3713 value = UCHARAT(RExC_parse++);
3714 value = toCTRL(value);
3716 case '0': case '1': case '2': case '3': case '4':
3717 case '5': case '6': case '7': case '8': case '9':
3721 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3722 RExC_parse += numlen;
3726 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
3728 "Unrecognized escape \\%c in character class passed through",
3732 } /* end of \blah */
3738 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3740 if (!SIZE_ONLY && !need_class)
3741 ANYOF_CLASS_ZERO(ret);
3745 /* a bad range like a-\d, a-[:digit:] ? */
3748 if (ckWARN(WARN_REGEXP)) {
3750 RExC_parse >= rangebegin ?
3751 RExC_parse - rangebegin : 0;
3753 "False [] range \"%*.*s\"",
3756 if (prevvalue < 256) {
3757 ANYOF_BITMAP_SET(ret, prevvalue);
3758 ANYOF_BITMAP_SET(ret, '-');
3761 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3762 Perl_sv_catpvf(aTHX_ listsv,
3763 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3767 range = 0; /* this was not a true range */
3771 const char *what = NULL;
3774 if (namedclass > OOB_NAMEDCLASS)
3775 optimize_invert = FALSE;
3776 /* Possible truncation here but in some 64-bit environments
3777 * the compiler gets heartburn about switch on 64-bit values.
3778 * A similar issue a little earlier when switching on value.
3780 switch ((I32)namedclass) {
3783 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3785 for (value = 0; value < 256; value++)
3787 ANYOF_BITMAP_SET(ret, value);
3794 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3796 for (value = 0; value < 256; value++)
3797 if (!isALNUM(value))
3798 ANYOF_BITMAP_SET(ret, value);
3805 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3807 for (value = 0; value < 256; value++)
3808 if (isALNUMC(value))
3809 ANYOF_BITMAP_SET(ret, value);
3816 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3818 for (value = 0; value < 256; value++)
3819 if (!isALNUMC(value))
3820 ANYOF_BITMAP_SET(ret, value);
3827 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3829 for (value = 0; value < 256; value++)
3831 ANYOF_BITMAP_SET(ret, value);
3838 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3840 for (value = 0; value < 256; value++)
3841 if (!isALPHA(value))
3842 ANYOF_BITMAP_SET(ret, value);
3849 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3852 for (value = 0; value < 128; value++)
3853 ANYOF_BITMAP_SET(ret, value);
3855 for (value = 0; value < 256; value++) {
3857 ANYOF_BITMAP_SET(ret, value);
3866 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3869 for (value = 128; value < 256; value++)
3870 ANYOF_BITMAP_SET(ret, value);
3872 for (value = 0; value < 256; value++) {
3873 if (!isASCII(value))
3874 ANYOF_BITMAP_SET(ret, value);
3883 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3885 for (value = 0; value < 256; value++)
3887 ANYOF_BITMAP_SET(ret, value);
3894 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3896 for (value = 0; value < 256; value++)
3897 if (!isBLANK(value))
3898 ANYOF_BITMAP_SET(ret, value);
3905 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3907 for (value = 0; value < 256; value++)
3909 ANYOF_BITMAP_SET(ret, value);
3916 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3918 for (value = 0; value < 256; value++)
3919 if (!isCNTRL(value))
3920 ANYOF_BITMAP_SET(ret, value);
3927 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3929 /* consecutive digits assumed */
3930 for (value = '0'; value <= '9'; value++)
3931 ANYOF_BITMAP_SET(ret, value);
3938 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3940 /* consecutive digits assumed */
3941 for (value = 0; value < '0'; value++)
3942 ANYOF_BITMAP_SET(ret, value);
3943 for (value = '9' + 1; value < 256; value++)
3944 ANYOF_BITMAP_SET(ret, value);
3951 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3953 for (value = 0; value < 256; value++)
3955 ANYOF_BITMAP_SET(ret, value);
3962 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3964 for (value = 0; value < 256; value++)
3965 if (!isGRAPH(value))
3966 ANYOF_BITMAP_SET(ret, value);
3973 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3975 for (value = 0; value < 256; value++)
3977 ANYOF_BITMAP_SET(ret, value);
3984 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3986 for (value = 0; value < 256; value++)
3987 if (!isLOWER(value))
3988 ANYOF_BITMAP_SET(ret, value);
3995 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3997 for (value = 0; value < 256; value++)
3999 ANYOF_BITMAP_SET(ret, value);
4006 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4008 for (value = 0; value < 256; value++)
4009 if (!isPRINT(value))
4010 ANYOF_BITMAP_SET(ret, value);
4017 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4019 for (value = 0; value < 256; value++)
4020 if (isPSXSPC(value))
4021 ANYOF_BITMAP_SET(ret, value);
4028 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4030 for (value = 0; value < 256; value++)
4031 if (!isPSXSPC(value))
4032 ANYOF_BITMAP_SET(ret, value);
4039 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4041 for (value = 0; value < 256; value++)
4043 ANYOF_BITMAP_SET(ret, value);
4050 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4052 for (value = 0; value < 256; value++)
4053 if (!isPUNCT(value))
4054 ANYOF_BITMAP_SET(ret, value);
4061 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4063 for (value = 0; value < 256; value++)
4065 ANYOF_BITMAP_SET(ret, value);
4072 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4074 for (value = 0; value < 256; value++)
4075 if (!isSPACE(value))
4076 ANYOF_BITMAP_SET(ret, value);
4083 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4085 for (value = 0; value < 256; value++)
4087 ANYOF_BITMAP_SET(ret, value);
4094 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4096 for (value = 0; value < 256; value++)
4097 if (!isUPPER(value))
4098 ANYOF_BITMAP_SET(ret, value);
4105 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4107 for (value = 0; value < 256; value++)
4108 if (isXDIGIT(value))
4109 ANYOF_BITMAP_SET(ret, value);
4116 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4118 for (value = 0; value < 256; value++)
4119 if (!isXDIGIT(value))
4120 ANYOF_BITMAP_SET(ret, value);
4126 /* this is to handle \p and \P */
4129 vFAIL("Invalid [::] class");
4133 /* Strings such as "+utf8::isWord\n" */
4134 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4137 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4140 } /* end of namedclass \blah */
4143 if (prevvalue > (IV)value) /* b-a */ {
4144 const int w = RExC_parse - rangebegin;
4145 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
4146 range = 0; /* not a valid range */
4150 prevvalue = value; /* save the beginning of the range */
4151 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4152 RExC_parse[1] != ']') {
4155 /* a bad range like \w-, [:word:]- ? */
4156 if (namedclass > OOB_NAMEDCLASS) {
4157 if (ckWARN(WARN_REGEXP)) {
4159 RExC_parse >= rangebegin ?
4160 RExC_parse - rangebegin : 0;
4162 "False [] range \"%*.*s\"",
4166 ANYOF_BITMAP_SET(ret, '-');
4168 range = 1; /* yeah, it's a range! */
4169 continue; /* but do it the next time */
4173 /* now is the next time */
4177 if (prevvalue < 256) {
4178 const IV ceilvalue = value < 256 ? value : 255;
4181 /* In EBCDIC [\x89-\x91] should include
4182 * the \x8e but [i-j] should not. */
4183 if (literal_endpoint == 2 &&
4184 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4185 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4187 if (isLOWER(prevvalue)) {
4188 for (i = prevvalue; i <= ceilvalue; i++)
4190 ANYOF_BITMAP_SET(ret, i);
4192 for (i = prevvalue; i <= ceilvalue; i++)
4194 ANYOF_BITMAP_SET(ret, i);
4199 for (i = prevvalue; i <= ceilvalue; i++)
4200 ANYOF_BITMAP_SET(ret, i);
4202 if (value > 255 || UTF) {
4203 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4204 const UV natvalue = NATIVE_TO_UNI(value);
4206 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4207 if (prevnatvalue < natvalue) { /* what about > ? */
4208 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4209 prevnatvalue, natvalue);
4211 else if (prevnatvalue == natvalue) {
4212 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4214 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4216 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4218 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
4219 if (RExC_precomp[0] == ':' &&
4220 RExC_precomp[1] == '[' &&
4221 (f == 0xDF || f == 0x92)) {
4222 f = NATIVE_TO_UNI(f);
4225 /* If folding and foldable and a single
4226 * character, insert also the folded version
4227 * to the charclass. */
4229 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
4230 if ((RExC_precomp[0] == ':' &&
4231 RExC_precomp[1] == '[' &&
4233 (value == 0xFB05 || value == 0xFB06))) ?
4234 foldlen == ((STRLEN)UNISKIP(f) - 1) :
4235 foldlen == (STRLEN)UNISKIP(f) )
4237 if (foldlen == (STRLEN)UNISKIP(f))
4239 Perl_sv_catpvf(aTHX_ listsv,
4242 /* Any multicharacter foldings
4243 * require the following transform:
4244 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4245 * where E folds into "pq" and F folds
4246 * into "rst", all other characters
4247 * fold to single characters. We save
4248 * away these multicharacter foldings,
4249 * to be later saved as part of the
4250 * additional "s" data. */
4253 if (!unicode_alternate)
4254 unicode_alternate = newAV();
4255 sv = newSVpvn((char*)foldbuf, foldlen);
4257 av_push(unicode_alternate, sv);
4261 /* If folding and the value is one of the Greek
4262 * sigmas insert a few more sigmas to make the
4263 * folding rules of the sigmas to work right.
4264 * Note that not all the possible combinations
4265 * are handled here: some of them are handled
4266 * by the standard folding rules, and some of
4267 * them (literal or EXACTF cases) are handled
4268 * during runtime in regexec.c:S_find_byclass(). */
4269 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4270 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4271 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4272 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4273 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4275 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4276 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4277 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4282 literal_endpoint = 0;
4286 range = 0; /* this range (if it was one) is done now */
4290 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4292 RExC_size += ANYOF_CLASS_ADD_SKIP;
4294 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4297 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4299 /* If the only flag is folding (plus possibly inversion). */
4300 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4302 for (value = 0; value < 256; ++value) {
4303 if (ANYOF_BITMAP_TEST(ret, value)) {
4304 UV fold = PL_fold[value];
4307 ANYOF_BITMAP_SET(ret, fold);
4310 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4313 /* optimize inverted simple patterns (e.g. [^a-z]) */
4314 if (!SIZE_ONLY && optimize_invert &&
4315 /* If the only flag is inversion. */
4316 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4317 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4318 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4319 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4323 AV * const av = newAV();
4326 /* The 0th element stores the character class description
4327 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4328 * to initialize the appropriate swash (which gets stored in
4329 * the 1st element), and also useful for dumping the regnode.
4330 * The 2nd element stores the multicharacter foldings,
4331 * used later (regexec.c:S_reginclass()). */
4332 av_store(av, 0, listsv);
4333 av_store(av, 1, NULL);
4334 av_store(av, 2, (SV*)unicode_alternate);
4335 rv = newRV_noinc((SV*)av);
4336 n = add_data(pRExC_state, 1, "s");
4337 RExC_rx->data->data[n] = (void*)rv;
4345 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4347 char* const retval = RExC_parse++;
4350 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4351 RExC_parse[2] == '#') {
4352 while (*RExC_parse != ')') {
4353 if (RExC_parse == RExC_end)
4354 FAIL("Sequence (?#... not terminated");
4360 if (RExC_flags & PMf_EXTENDED) {
4361 if (isSPACE(*RExC_parse)) {
4365 else if (*RExC_parse == '#') {
4366 while (RExC_parse < RExC_end)
4367 if (*RExC_parse++ == '\n') break;
4376 - reg_node - emit a node
4378 STATIC regnode * /* Location. */
4379 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4381 register regnode *ptr;
4382 regnode * const ret = RExC_emit;
4385 SIZE_ALIGN(RExC_size);
4390 NODE_ALIGN_FILL(ret);
4392 FILL_ADVANCE_NODE(ptr, op);
4393 if (RExC_offsets) { /* MJD */
4394 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4395 "reg_node", __LINE__,
4397 RExC_emit - RExC_emit_start > RExC_offsets[0]
4398 ? "Overwriting end of array!\n" : "OK",
4399 RExC_emit - RExC_emit_start,
4400 RExC_parse - RExC_start,
4402 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4411 - reganode - emit a node with an argument
4413 STATIC regnode * /* Location. */
4414 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4416 register regnode *ptr;
4417 regnode * const ret = RExC_emit;
4420 SIZE_ALIGN(RExC_size);
4425 NODE_ALIGN_FILL(ret);
4427 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4428 if (RExC_offsets) { /* MJD */
4429 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4433 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4434 "Overwriting end of array!\n" : "OK",
4435 RExC_emit - RExC_emit_start,
4436 RExC_parse - RExC_start,
4438 Set_Cur_Node_Offset;
4447 - reguni - emit (if appropriate) a Unicode character
4450 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
4452 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4456 - reginsert - insert an operator in front of already-emitted operand
4458 * Means relocating the operand.
4461 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4463 register regnode *src;
4464 register regnode *dst;
4465 register regnode *place;
4466 const int offset = regarglen[(U8)op];
4468 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4471 RExC_size += NODE_STEP_REGNODE + offset;
4476 RExC_emit += NODE_STEP_REGNODE + offset;
4478 while (src > opnd) {
4479 StructCopy(--src, --dst, regnode);
4480 if (RExC_offsets) { /* MJD 20010112 */
4481 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4485 dst - RExC_emit_start > RExC_offsets[0]
4486 ? "Overwriting end of array!\n" : "OK",
4487 src - RExC_emit_start,
4488 dst - RExC_emit_start,
4490 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4491 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4496 place = opnd; /* Op node, where operand used to be. */
4497 if (RExC_offsets) { /* MJD */
4498 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4502 place - RExC_emit_start > RExC_offsets[0]
4503 ? "Overwriting end of array!\n" : "OK",
4504 place - RExC_emit_start,
4505 RExC_parse - RExC_start,
4507 Set_Node_Offset(place, RExC_parse);
4508 Set_Node_Length(place, 1);
4510 src = NEXTOPER(place);
4511 FILL_ADVANCE_NODE(place, op);
4512 Zero(src, offset, regnode);
4516 - regtail - set the next-pointer at the end of a node chain of p to val.
4518 /* TODO: All three parms should be const */
4520 S_regtail(pTHX_ const RExC_state_t *pRExC_state, regnode *p, const regnode *val)
4522 register regnode *scan;
4527 /* Find last node. */
4530 regnode * const temp = regnext(scan);
4536 if (reg_off_by_arg[OP(scan)]) {
4537 ARG_SET(scan, val - scan);
4540 NEXT_OFF(scan) = val - scan;
4545 - regcurly - a little FSA that accepts {\d+,?\d*}
4548 S_regcurly(register const char *s)
4566 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4569 Perl_regdump(pTHX_ const regexp *r)
4572 SV * const sv = sv_newmortal();
4574 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4576 /* Header fields of interest. */
4577 if (r->anchored_substr)
4578 PerlIO_printf(Perl_debug_log,
4579 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4581 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4582 SvPVX_const(r->anchored_substr),
4584 SvTAIL(r->anchored_substr) ? "$" : "",
4585 (IV)r->anchored_offset);
4586 else if (r->anchored_utf8)
4587 PerlIO_printf(Perl_debug_log,
4588 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4590 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4591 SvPVX_const(r->anchored_utf8),
4593 SvTAIL(r->anchored_utf8) ? "$" : "",
4594 (IV)r->anchored_offset);
4595 if (r->float_substr)
4596 PerlIO_printf(Perl_debug_log,
4597 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4599 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4600 SvPVX_const(r->float_substr),
4602 SvTAIL(r->float_substr) ? "$" : "",
4603 (IV)r->float_min_offset, (UV)r->float_max_offset);
4604 else if (r->float_utf8)
4605 PerlIO_printf(Perl_debug_log,
4606 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4608 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4609 SvPVX_const(r->float_utf8),
4611 SvTAIL(r->float_utf8) ? "$" : "",
4612 (IV)r->float_min_offset, (UV)r->float_max_offset);
4613 if (r->check_substr || r->check_utf8)
4614 PerlIO_printf(Perl_debug_log,
4616 (r->check_substr == r->float_substr
4617 && r->check_utf8 == r->float_utf8
4618 ? "(checking floating" : "(checking anchored"));
4619 if (r->reganch & ROPT_NOSCAN)
4620 PerlIO_printf(Perl_debug_log, " noscan");
4621 if (r->reganch & ROPT_CHECK_ALL)
4622 PerlIO_printf(Perl_debug_log, " isall");
4623 if (r->check_substr || r->check_utf8)
4624 PerlIO_printf(Perl_debug_log, ") ");
4626 if (r->regstclass) {
4627 regprop(sv, r->regstclass);
4628 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4630 if (r->reganch & ROPT_ANCH) {
4631 PerlIO_printf(Perl_debug_log, "anchored");
4632 if (r->reganch & ROPT_ANCH_BOL)
4633 PerlIO_printf(Perl_debug_log, "(BOL)");
4634 if (r->reganch & ROPT_ANCH_MBOL)
4635 PerlIO_printf(Perl_debug_log, "(MBOL)");
4636 if (r->reganch & ROPT_ANCH_SBOL)
4637 PerlIO_printf(Perl_debug_log, "(SBOL)");
4638 if (r->reganch & ROPT_ANCH_GPOS)
4639 PerlIO_printf(Perl_debug_log, "(GPOS)");
4640 PerlIO_putc(Perl_debug_log, ' ');
4642 if (r->reganch & ROPT_GPOS_SEEN)
4643 PerlIO_printf(Perl_debug_log, "GPOS ");
4644 if (r->reganch & ROPT_SKIP)
4645 PerlIO_printf(Perl_debug_log, "plus ");
4646 if (r->reganch & ROPT_IMPLICIT)
4647 PerlIO_printf(Perl_debug_log, "implicit ");
4648 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4649 if (r->reganch & ROPT_EVAL_SEEN)
4650 PerlIO_printf(Perl_debug_log, "with eval ");
4651 PerlIO_printf(Perl_debug_log, "\n");
4654 const U32 len = r->offsets[0];
4655 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4656 for (i = 1; i <= len; i++)
4657 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4658 (UV)r->offsets[i*2-1],
4659 (UV)r->offsets[i*2]);
4660 PerlIO_printf(Perl_debug_log, "\n");
4663 PERL_UNUSED_CONTEXT;
4665 #endif /* DEBUGGING */
4669 - regprop - printable representation of opcode
4672 Perl_regprop(pTHX_ SV *sv, regnode *o)
4677 sv_setpvn(sv, "", 0);
4678 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4679 /* It would be nice to FAIL() here, but this may be called from
4680 regexec.c, and it would be hard to supply pRExC_state. */
4681 Perl_croak(aTHX_ "Corrupted regexp opcode");
4682 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4684 k = PL_regkind[(U8)OP(o)];
4687 SV * const dsv = sv_2mortal(newSVpvs(""));
4688 /* Using is_utf8_string() is a crude hack but it may
4689 * be the best for now since we have no flag "this EXACTish
4690 * node was UTF-8" --jhi */
4691 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4692 const char * const s = do_utf8 ?
4693 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4694 UNI_DISPLAY_REGEX) :
4696 const int len = do_utf8 ?
4699 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4704 else if (k == CURLY) {
4705 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4706 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4707 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4709 else if (k == WHILEM && o->flags) /* Ordinal/of */
4710 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4711 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4712 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4713 else if (k == LOGICAL)
4714 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4715 else if (k == ANYOF) {
4716 int i, rangestart = -1;
4717 const U8 flags = ANYOF_FLAGS(o);
4719 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
4720 static const char * const anyofs[] = {
4753 if (flags & ANYOF_LOCALE)
4754 sv_catpvs(sv, "{loc}");
4755 if (flags & ANYOF_FOLD)
4756 sv_catpvs(sv, "{i}");
4757 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4758 if (flags & ANYOF_INVERT)
4760 for (i = 0; i <= 256; i++) {
4761 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4762 if (rangestart == -1)
4764 } else if (rangestart != -1) {
4765 if (i <= rangestart + 3)
4766 for (; rangestart < i; rangestart++)
4767 put_byte(sv, rangestart);
4769 put_byte(sv, rangestart);
4771 put_byte(sv, i - 1);
4777 if (o->flags & ANYOF_CLASS)
4778 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
4779 if (ANYOF_CLASS_TEST(o,i))
4780 sv_catpv(sv, anyofs[i]);
4782 if (flags & ANYOF_UNICODE)
4783 sv_catpvs(sv, "{unicode}");
4784 else if (flags & ANYOF_UNICODE_ALL)
4785 sv_catpvs(sv, "{unicode_all}");
4789 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
4793 U8 s[UTF8_MAXBYTES_CASE+1];
4795 for (i = 0; i <= 256; i++) { /* just the first 256 */
4796 uvchr_to_utf8(s, i);
4798 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4799 if (rangestart == -1)
4801 } else if (rangestart != -1) {
4802 if (i <= rangestart + 3)
4803 for (; rangestart < i; rangestart++) {
4804 const U8 * const e = uvchr_to_utf8(s,rangestart);
4806 for(p = s; p < e; p++)
4810 const U8 *e = uvchr_to_utf8(s,rangestart);
4812 for (p = s; p < e; p++)
4815 e = uvchr_to_utf8(s, i-1);
4816 for (p = s; p < e; p++)
4823 sv_catpvs(sv, "..."); /* et cetera */
4827 char *s = savesvpv(lv);
4828 char * const origs = s;
4830 while(*s && *s != '\n') s++;
4833 const char * const t = ++s;
4851 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4853 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4854 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4856 PERL_UNUSED_CONTEXT;
4857 PERL_UNUSED_ARG(sv);
4859 #endif /* DEBUGGING */
4863 Perl_re_intuit_string(pTHX_ regexp *prog)
4864 { /* Assume that RE_INTUIT is set */
4867 const char * const s = SvPV_nolen_const(prog->check_substr
4868 ? prog->check_substr : prog->check_utf8);
4870 if (!PL_colorset) reginitcolors();
4871 PerlIO_printf(Perl_debug_log,
4872 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4874 prog->check_substr ? "" : "utf8 ",
4875 PL_colors[5],PL_colors[0],
4878 (strlen(s) > 60 ? "..." : ""));
4881 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4885 Perl_pregfree(pTHX_ struct regexp *r)
4888 SV * const dsv = PERL_DEBUG_PAD_ZERO(0);
4891 if (!r || (--r->refcnt > 0))
4894 const char * const s = (r->reganch & ROPT_UTF8)
4895 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4896 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4897 const int len = SvCUR(dsv);
4900 PerlIO_printf(Perl_debug_log,
4901 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4902 PL_colors[4],PL_colors[5],PL_colors[0],
4905 len > 60 ? "..." : "");
4908 /* gcov results gave these as non-null 100% of the time, so there's no
4909 optimisation in checking them before calling Safefree */
4910 Safefree(r->precomp);
4911 Safefree(r->offsets); /* 20010421 MJD */
4912 if (RX_MATCH_COPIED(r))
4913 Safefree(r->subbeg);
4915 if (r->anchored_substr)
4916 SvREFCNT_dec(r->anchored_substr);
4917 if (r->anchored_utf8)
4918 SvREFCNT_dec(r->anchored_utf8);
4919 if (r->float_substr)
4920 SvREFCNT_dec(r->float_substr);
4922 SvREFCNT_dec(r->float_utf8);
4923 Safefree(r->substrs);
4926 int n = r->data->count;
4927 PAD* new_comppad = NULL;
4932 /* If you add a ->what type here, update the comment in regcomp.h */
4933 switch (r->data->what[n]) {
4935 SvREFCNT_dec((SV*)r->data->data[n]);
4938 Safefree(r->data->data[n]);
4941 new_comppad = (AV*)r->data->data[n];
4944 if (new_comppad == NULL)
4945 Perl_croak(aTHX_ "panic: pregfree comppad");
4946 PAD_SAVE_LOCAL(old_comppad,
4947 /* Watch out for global destruction's random ordering. */
4948 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
4951 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4954 op_free((OP_4tree*)r->data->data[n]);
4956 PAD_RESTORE_LOCAL(old_comppad);
4957 SvREFCNT_dec((SV*)new_comppad);
4963 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4966 Safefree(r->data->what);
4969 Safefree(r->startp);
4975 - regnext - dig the "next" pointer out of a node
4977 * [Note, when REGALIGN is defined there are two places in regmatch()
4978 * that bypass this code for speed.]
4981 Perl_regnext(pTHX_ register regnode *p)
4983 register I32 offset;
4985 if (p == &PL_regdummy)
4988 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
4996 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
4999 STRLEN l1 = strlen(pat1);
5000 STRLEN l2 = strlen(pat2);
5003 const char *message;
5009 Copy(pat1, buf, l1 , char);
5010 Copy(pat2, buf + l1, l2 , char);
5011 buf[l1 + l2] = '\n';
5012 buf[l1 + l2 + 1] = '\0';
5014 /* ANSI variant takes additional second argument */
5015 va_start(args, pat2);
5019 msv = vmess(buf, &args);
5021 message = SvPV_const(msv,l1);
5024 Copy(message, buf, l1 , char);
5025 buf[l1-1] = '\0'; /* Overwrite \n */
5026 Perl_croak(aTHX_ "%s", buf);
5029 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5032 Perl_save_re_context(pTHX)
5035 struct re_save_state *state;
5037 SAVEVPTR(PL_curcop);
5038 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
5040 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
5041 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
5042 SSPUSHINT(SAVEt_RE_STATE);
5044 state->re_state_reg_flags = PL_reg_flags;
5045 state->re_state_bostr = PL_bostr;
5046 state->re_state_reginput = PL_reginput;
5047 state->re_state_regbol = PL_regbol;
5048 state->re_state_regeol = PL_regeol;
5049 state->re_state_regstartp = PL_regstartp;
5050 state->re_state_regendp = PL_regendp;
5051 state->re_state_reglastparen = PL_reglastparen;
5052 state->re_state_reglastcloseparen = PL_reglastcloseparen;
5053 state->re_state_regtill = PL_regtill;
5054 state->re_state_reg_start_tmp = PL_reg_start_tmp;
5055 state->re_state_reg_start_tmpl = PL_reg_start_tmpl;
5056 state->re_state_reg_eval_set = PL_reg_eval_set;
5057 state->re_state_regnarrate = PL_regnarrate;
5058 state->re_state_regindent = PL_regindent;
5059 state->re_state_reg_call_cc = PL_reg_call_cc;
5060 state->re_state_reg_re = PL_reg_re;
5061 state->re_state_reg_ganch = PL_reg_ganch;
5062 state->re_state_reg_sv = PL_reg_sv;
5063 state->re_state_reg_match_utf8 = PL_reg_match_utf8;
5064 state->re_state_reg_magic = PL_reg_magic;
5065 state->re_state_reg_oldpos = PL_reg_oldpos;
5066 state->re_state_reg_oldcurpm = PL_reg_oldcurpm;
5067 state->re_state_reg_curpm = PL_reg_curpm;
5068 state->re_state_reg_oldsaved = PL_reg_oldsaved;
5069 state->re_state_reg_oldsavedlen = PL_reg_oldsavedlen;
5070 state->re_state_reg_maxiter = PL_reg_maxiter;
5071 state->re_state_reg_leftiter = PL_reg_leftiter;
5072 state->re_state_reg_poscache = PL_reg_poscache;
5073 state->re_state_reg_poscache_size = PL_reg_poscache_size;
5074 state->re_state_regsize = PL_regsize;
5075 state->re_state_reg_starttry = PL_reg_starttry;
5077 /* These variables have been eliminated from 5.10: */
5078 state->re_state_regdata = PL_regdata;
5079 state->re_state_regprogram = PL_regprogram;
5080 state->re_state_regcc = PL_regcc;
5081 state->re_state_regprecomp = PL_regprecomp;
5082 state->re_state_regnpar = PL_regnpar;
5084 PL_reg_start_tmp = 0;
5085 PL_reg_start_tmpl = 0;
5086 PL_reg_oldsaved = NULL;
5087 PL_reg_oldsavedlen = 0;
5089 PL_reg_leftiter = 0;
5090 PL_reg_poscache = NULL;
5091 PL_reg_poscache_size = 0;
5093 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5095 const REGEXP * const rx = PM_GETRE(PL_curpm);
5098 for (i = 1; i <= rx->nparens; i++) {
5100 char digits[TYPE_CHARS(long)];
5101 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
5102 GV *const *const gvp
5103 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
5105 if (gvp && SvTYPE(gv = *gvp) == SVt_PVGV && GvSV(gv)) {
5114 clear_re(pTHX_ void *r)
5116 ReREFCNT_dec((regexp *)r);
5122 S_put_byte(pTHX_ SV *sv, int c)
5124 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5125 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5126 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5127 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5129 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5133 STATIC const regnode *
5134 S_dumpuntil(pTHX_ const regnode *start, const regnode *node,
5135 const regnode *last, SV* sv, I32 l)
5137 register U8 op = EXACT; /* Arbitrary non-END op. */
5138 register const regnode *next;
5140 while (op != END && (!last || node < last)) {
5141 /* While that wasn't END last time... */
5147 next = regnext((regnode *)node);
5149 if (OP(node) == OPTIMIZED)
5151 regprop(sv, (regnode *) node);
5152 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5153 (int)(2*l + 1), "", SvPVX_const(sv));
5154 if (next == NULL) /* Next ptr. */
5155 PerlIO_printf(Perl_debug_log, "(0)");
5157 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5158 (void)PerlIO_putc(Perl_debug_log, '\n');
5160 if (PL_regkind[(U8)op] == BRANCHJ) {
5161 register const regnode *nnode = (OP(next) == LONGJMP
5162 ? regnext((regnode *)next)
5164 if (last && nnode > last)
5166 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5168 else if (PL_regkind[(U8)op] == BRANCH) {
5169 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5171 else if ( op == CURLY) { /* "next" might be very big: optimizer */
5172 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5173 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5175 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5176 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5179 else if ( op == PLUS || op == STAR) {
5180 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5182 else if (op == ANYOF) {
5183 /* arglen 1 + class block */
5184 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5185 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5186 node = NEXTOPER(node);
5188 else if (PL_regkind[(U8)op] == EXACT) {
5189 /* Literal string, where present. */
5190 node += NODE_SZ_STR(node) - 1;
5191 node = NEXTOPER(node);
5194 node = NEXTOPER(node);
5195 node += regarglen[(U8)op];
5197 if (op == CURLYX || op == OPEN)
5199 else if (op == WHILEM)
5205 #endif /* DEBUGGING */
5209 * c-indentation-style: bsd
5211 * indent-tabs-mode: t
5214 * ex: set ts=8 sts=4 sw=4 noet: