5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
59 * pregcomp and pregexec -- regsub and regerror are not used in perl
61 * Copyright (c) 1986 by University of Toronto.
62 * Written by Henry Spencer. Not derived from licensed software.
64 * Permission is granted to anyone to use this software for any
65 * purpose on any computer system, and to redistribute it freely,
66 * subject to the following restrictions:
68 * 1. The author is not responsible for the consequences of use of
69 * this software, no matter how awful, even if they arise
72 * 2. The origin of this software must not be misrepresented, either
73 * by explicit claim or by omission.
75 * 3. Altered versions must be plainly marked as such, and must not
76 * be misrepresented as being the original software.
79 **** Alterations to Henry's code are...
81 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
82 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, by Larry Wall and others
84 **** You may distribute under the terms of either the GNU General Public
85 **** License or the Artistic License, as specified in the README file.
88 * Beware that some of this code is subtly aware of the way operator
89 * precedence is structured in regular expressions. Serious changes in
90 * regular-expression syntax might require a total rethink.
93 #define PERL_IN_REGCOMP_C
96 #ifndef PERL_IN_XSUB_RE
108 # if defined(BUGGY_MSC6)
109 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
110 # pragma optimize("a",off)
111 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
112 # pragma optimize("w",on )
113 # endif /* BUGGY_MSC6 */
117 #define STATIC static
120 typedef struct RExC_state_t {
121 U32 flags; /* are we folding, multilining? */
122 char *precomp; /* uncompiled string. */
124 char *start; /* Start of input for compile */
125 char *end; /* End of input for compile */
126 char *parse; /* Input-scan pointer. */
127 I32 whilem_seen; /* number of WHILEM in this expr */
128 regnode *emit_start; /* Start of emitted-code area */
129 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
133 I32 size; /* Code size. */
134 I32 npar; /* () count. */
140 char *starttry; /* -Dr: where regtry was called. */
141 #define RExC_starttry (pRExC_state->starttry)
145 #define RExC_flags (pRExC_state->flags)
146 #define RExC_precomp (pRExC_state->precomp)
147 #define RExC_rx (pRExC_state->rx)
148 #define RExC_start (pRExC_state->start)
149 #define RExC_end (pRExC_state->end)
150 #define RExC_parse (pRExC_state->parse)
151 #define RExC_whilem_seen (pRExC_state->whilem_seen)
152 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
153 #define RExC_emit (pRExC_state->emit)
154 #define RExC_emit_start (pRExC_state->emit_start)
155 #define RExC_naughty (pRExC_state->naughty)
156 #define RExC_sawback (pRExC_state->sawback)
157 #define RExC_seen (pRExC_state->seen)
158 #define RExC_size (pRExC_state->size)
159 #define RExC_npar (pRExC_state->npar)
160 #define RExC_extralen (pRExC_state->extralen)
161 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
162 #define RExC_seen_evals (pRExC_state->seen_evals)
163 #define RExC_utf8 (pRExC_state->utf8)
165 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
166 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
167 ((*s) == '{' && regcurly(s)))
170 #undef SPSTART /* dratted cpp namespace... */
173 * Flags to be passed up and down.
175 #define WORST 0 /* Worst case. */
176 #define HASWIDTH 0x1 /* Known to match non-null strings. */
177 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
178 #define SPSTART 0x4 /* Starts with * or +. */
179 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
181 /* Length of a variant. */
183 typedef struct scan_data_t {
189 I32 last_end; /* min value, <0 unless valid. */
192 SV **longest; /* Either &l_fixed, or &l_float. */
196 I32 offset_float_min;
197 I32 offset_float_max;
201 struct regnode_charclass_class *start_class;
205 * Forward declarations for pregcomp()'s friends.
208 static const scan_data_t zero_scan_data =
209 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
211 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
212 #define SF_BEFORE_SEOL 0x1
213 #define SF_BEFORE_MEOL 0x2
214 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
215 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
218 # define SF_FIX_SHIFT_EOL (0+2)
219 # define SF_FL_SHIFT_EOL (0+4)
221 # define SF_FIX_SHIFT_EOL (+2)
222 # define SF_FL_SHIFT_EOL (+4)
225 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
226 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
228 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
229 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
230 #define SF_IS_INF 0x40
231 #define SF_HAS_PAR 0x80
232 #define SF_IN_PAR 0x100
233 #define SF_HAS_EVAL 0x200
234 #define SCF_DO_SUBSTR 0x400
235 #define SCF_DO_STCLASS_AND 0x0800
236 #define SCF_DO_STCLASS_OR 0x1000
237 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
238 #define SCF_WHILEM_VISITED_POS 0x2000
240 #define UTF (RExC_utf8 != 0)
241 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
242 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
244 #define OOB_UNICODE 12345678
245 #define OOB_NAMEDCLASS -1
247 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
248 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
251 /* length of regex to show in messages that don't mark a position within */
252 #define RegexLengthToShowInErrorMessages 127
255 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
256 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
257 * op/pragma/warn/regcomp.
259 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
260 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
262 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
265 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
266 * arg. Show regex, up to a maximum length. If it's too long, chop and add
269 #define FAIL(msg) STMT_START { \
270 const char *ellipses = ""; \
271 IV len = RExC_end - RExC_precomp; \
274 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
275 if (len > RegexLengthToShowInErrorMessages) { \
276 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
277 len = RegexLengthToShowInErrorMessages - 10; \
280 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
281 msg, (int)len, RExC_precomp, ellipses); \
285 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
286 * args. Show regex, up to a maximum length. If it's too long, chop and add
289 #define FAIL2(pat,msg) STMT_START { \
290 const char *ellipses = ""; \
291 IV len = RExC_end - RExC_precomp; \
294 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
295 if (len > RegexLengthToShowInErrorMessages) { \
296 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
297 len = RegexLengthToShowInErrorMessages - 10; \
300 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
301 msg, (int)len, RExC_precomp, ellipses); \
306 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
308 #define Simple_vFAIL(m) STMT_START { \
309 const IV offset = RExC_parse - RExC_precomp; \
310 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
311 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
315 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
317 #define vFAIL(m) STMT_START { \
319 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
324 * Like Simple_vFAIL(), but accepts two arguments.
326 #define Simple_vFAIL2(m,a1) STMT_START { \
327 const IV offset = RExC_parse - RExC_precomp; \
328 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
329 (int)offset, RExC_precomp, RExC_precomp + offset); \
333 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
335 #define vFAIL2(m,a1) STMT_START { \
337 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
338 Simple_vFAIL2(m, a1); \
343 * Like Simple_vFAIL(), but accepts three arguments.
345 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
346 const IV offset = RExC_parse - RExC_precomp; \
347 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
348 (int)offset, RExC_precomp, RExC_precomp + offset); \
352 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
354 #define vFAIL3(m,a1,a2) STMT_START { \
356 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
357 Simple_vFAIL3(m, a1, a2); \
361 * Like Simple_vFAIL(), but accepts four arguments.
363 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
364 const IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 const IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 const IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 const IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 const IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 const IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 const IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (int)(node), (int)(len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 const STRLEN l = CHR_SVLEN(data->last_found);
471 const STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 SvSetMagicSV(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * const sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ const struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 const struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
579 !(and_with->flags & ANYOF_INVERT)) {
580 cl->flags &= ~ANYOF_UNICODE_ALL;
581 cl->flags |= ANYOF_UNICODE;
582 ARG_SET(cl, ARG(and_with));
584 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
585 !(and_with->flags & ANYOF_INVERT))
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
588 !(and_with->flags & ANYOF_INVERT))
589 cl->flags &= ~ANYOF_UNICODE;
592 /* 'OR' a given class with another one. Can create false positives */
593 /* We assume that cl is not inverted */
595 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
597 if (or_with->flags & ANYOF_INVERT) {
599 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
600 * <= (B1 | !B2) | (CL1 | !CL2)
601 * which is wasteful if CL2 is small, but we ignore CL2:
602 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
603 * XXXX Can we handle case-fold? Unclear:
604 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
605 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
607 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
608 && !(or_with->flags & ANYOF_FOLD)
609 && !(cl->flags & ANYOF_FOLD) ) {
612 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
613 cl->bitmap[i] |= ~or_with->bitmap[i];
614 } /* XXXX: logic is complicated otherwise */
616 cl_anything(pRExC_state, cl);
619 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
620 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
621 && (!(or_with->flags & ANYOF_FOLD)
622 || (cl->flags & ANYOF_FOLD)) ) {
625 /* OR char bitmap and class bitmap separately */
626 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
627 cl->bitmap[i] |= or_with->bitmap[i];
628 if (or_with->flags & ANYOF_CLASS) {
629 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
630 cl->classflags[i] |= or_with->classflags[i];
631 cl->flags |= ANYOF_CLASS;
634 else { /* XXXX: logic is complicated, leave it along for a moment. */
635 cl_anything(pRExC_state, cl);
638 if (or_with->flags & ANYOF_EOS)
639 cl->flags |= ANYOF_EOS;
641 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
642 ARG(cl) != ARG(or_with)) {
643 cl->flags |= ANYOF_UNICODE_ALL;
644 cl->flags &= ~ANYOF_UNICODE;
646 if (or_with->flags & ANYOF_UNICODE_ALL) {
647 cl->flags |= ANYOF_UNICODE_ALL;
648 cl->flags &= ~ANYOF_UNICODE;
653 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
654 * These need to be revisited when a newer toolchain becomes available.
656 #if defined(__sparc64__) && defined(__GNUC__)
657 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
658 # undef SPARC64_GCC_WORKAROUND
659 # define SPARC64_GCC_WORKAROUND 1
663 /* REx optimizer. Converts nodes into quickier variants "in place".
664 Finds fixed substrings. */
666 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
667 to the position after last scanned or to NULL. */
670 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
671 /* scanp: Start here (read-write). */
672 /* deltap: Write maxlen-minlen here. */
673 /* last: Stop before this one. */
675 I32 min = 0, pars = 0, code;
676 regnode *scan = *scanp, *next;
678 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
679 int is_inf_internal = 0; /* The studied chunk is infinite */
680 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
681 scan_data_t data_fake;
682 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
684 while (scan && OP(scan) != END && scan < last) {
685 /* Peephole optimizer: */
687 if (PL_regkind[(U8)OP(scan)] == EXACT) {
688 /* Merge several consecutive EXACTish nodes into one. */
689 regnode *n = regnext(scan);
692 regnode *stop = scan;
695 next = scan + NODE_SZ_STR(scan);
696 /* Skip NOTHING, merge EXACT*. */
698 ( PL_regkind[(U8)OP(n)] == NOTHING ||
699 (stringok && (OP(n) == OP(scan))))
701 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
702 if (OP(n) == TAIL || n > next)
704 if (PL_regkind[(U8)OP(n)] == NOTHING) {
705 NEXT_OFF(scan) += NEXT_OFF(n);
706 next = n + NODE_STEP_REGNODE;
714 const int oldl = STR_LEN(scan);
715 regnode *nnext = regnext(n);
717 if (oldl + STR_LEN(n) > U8_MAX)
719 NEXT_OFF(scan) += NEXT_OFF(n);
720 STR_LEN(scan) += STR_LEN(n);
721 next = n + NODE_SZ_STR(n);
722 /* Now we can overwrite *n : */
723 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
731 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
733 Two problematic code points in Unicode casefolding of EXACT nodes:
735 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
736 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
742 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
743 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
745 This means that in case-insensitive matching (or "loose matching",
746 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
747 length of the above casefolded versions) can match a target string
748 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
749 This would rather mess up the minimum length computation.
751 What we'll do is to look for the tail four bytes, and then peek
752 at the preceding two bytes to see whether we need to decrease
753 the minimum length by four (six minus two).
755 Thanks to the design of UTF-8, there cannot be false matches:
756 A sequence of valid UTF-8 bytes cannot be a subsequence of
757 another valid sequence of UTF-8 bytes.
760 char *s0 = STRING(scan), *s, *t;
761 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
762 const char * const t0 = "\xcc\x88\xcc\x81";
763 const char * const t1 = t0 + 3;
766 s < s2 && (t = ninstr(s, s1, t0, t1));
768 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
769 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
776 n = scan + NODE_SZ_STR(scan);
778 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
786 /* Follow the next-chain of the current node and optimize
787 away all the NOTHINGs from it. */
788 if (OP(scan) != CURLYX) {
789 const int max = (reg_off_by_arg[OP(scan)]
791 /* I32 may be smaller than U16 on CRAYs! */
792 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
793 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
797 /* Skip NOTHING and LONGJMP. */
798 while ((n = regnext(n))
799 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
800 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
803 if (reg_off_by_arg[OP(scan)])
806 NEXT_OFF(scan) = off;
808 /* The principal pseudo-switch. Cannot be a switch, since we
809 look into several different things. */
810 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
811 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
812 next = regnext(scan);
815 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
816 I32 max1 = 0, min1 = I32_MAX, num = 0;
817 struct regnode_charclass_class accum;
819 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
820 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
821 if (flags & SCF_DO_STCLASS)
822 cl_init_zero(pRExC_state, &accum);
823 while (OP(scan) == code) {
824 I32 deltanext, minnext, f = 0, fake;
825 struct regnode_charclass_class this_class;
830 data_fake.whilem_c = data->whilem_c;
831 data_fake.last_closep = data->last_closep;
834 data_fake.last_closep = &fake;
835 next = regnext(scan);
836 scan = NEXTOPER(scan);
838 scan = NEXTOPER(scan);
839 if (flags & SCF_DO_STCLASS) {
840 cl_init(pRExC_state, &this_class);
841 data_fake.start_class = &this_class;
842 f = SCF_DO_STCLASS_AND;
844 if (flags & SCF_WHILEM_VISITED_POS)
845 f |= SCF_WHILEM_VISITED_POS;
846 /* we suppose the run is continuous, last=next...*/
847 minnext = study_chunk(pRExC_state, &scan, &deltanext,
848 next, &data_fake, f);
851 if (max1 < minnext + deltanext)
852 max1 = minnext + deltanext;
853 if (deltanext == I32_MAX)
854 is_inf = is_inf_internal = 1;
856 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
858 if (data && (data_fake.flags & SF_HAS_EVAL))
859 data->flags |= SF_HAS_EVAL;
861 data->whilem_c = data_fake.whilem_c;
862 if (flags & SCF_DO_STCLASS)
863 cl_or(pRExC_state, &accum, &this_class);
867 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
869 if (flags & SCF_DO_SUBSTR) {
870 data->pos_min += min1;
871 data->pos_delta += max1 - min1;
872 if (max1 != min1 || is_inf)
873 data->longest = &(data->longest_float);
876 delta += max1 - min1;
877 if (flags & SCF_DO_STCLASS_OR) {
878 cl_or(pRExC_state, data->start_class, &accum);
880 cl_and(data->start_class, &and_with);
881 flags &= ~SCF_DO_STCLASS;
884 else if (flags & SCF_DO_STCLASS_AND) {
886 cl_and(data->start_class, &accum);
887 flags &= ~SCF_DO_STCLASS;
890 /* Switch to OR mode: cache the old value of
891 * data->start_class */
892 StructCopy(data->start_class, &and_with,
893 struct regnode_charclass_class);
894 flags &= ~SCF_DO_STCLASS_AND;
895 StructCopy(&accum, data->start_class,
896 struct regnode_charclass_class);
897 flags |= SCF_DO_STCLASS_OR;
898 data->start_class->flags |= ANYOF_EOS;
903 else if (code == BRANCHJ) /* single branch is optimized. */
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
909 else if (OP(scan) == EXACT) {
910 I32 l = STR_LEN(scan);
911 UV uc = *((U8*)STRING(scan));
913 const U8 * const s = (U8*)STRING(scan);
914 l = utf8_length((U8 *)s, (U8 *)s + l);
915 uc = utf8_to_uvchr((U8 *)s, NULL);
918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
924 ? I32_MAX : data->pos_min + data->pos_delta;
926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
928 SvUTF8_on(data->last_found);
930 SV * const sv = data->last_found;
931 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
932 mg_find(sv, PERL_MAGIC_utf8) : NULL;
933 if (mg && mg->mg_len >= 0)
934 mg->mg_len += utf8_length((U8*)STRING(scan),
935 (U8*)STRING(scan)+STR_LEN(scan));
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
948 && (!(data->start_class->flags & ANYOF_FOLD)
949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
955 ANYOF_BITMAP_SET(data->start_class, uc);
956 data->start_class->flags &= ~ANYOF_EOS;
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
963 ANYOF_BITMAP_SET(data->start_class, uc);
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
969 flags &= ~SCF_DO_STCLASS;
971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972 I32 l = STR_LEN(scan);
973 UV uc = *((U8*)STRING(scan));
975 /* Search for fixed substrings supports EXACT only. */
976 if (flags & SCF_DO_SUBSTR)
977 scan_commit(pRExC_state, data);
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
981 uc = utf8_to_uvchr(s, NULL);
984 if (data && (flags & SCF_DO_SUBSTR))
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
998 ANYOF_BITMAP_SET(data->start_class, uc);
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1010 ANYOF_BITMAP_SET(data->start_class, uc);
1011 data->start_class->flags &= ~ANYOF_EOS;
1013 cl_and(data->start_class, &and_with);
1015 flags &= ~SCF_DO_STCLASS;
1017 else if (strchr((const char*)PL_varies,OP(scan))) {
1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019 I32 f = flags, pos_before = 0;
1020 regnode *oscan = scan;
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
1023 I32 next_is_eval = 0;
1025 switch (PL_regkind[(U8)OP(scan)]) {
1026 case WHILEM: /* End of (?:...)* . */
1027 scan = NEXTOPER(scan);
1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031 next = NEXTOPER(scan);
1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034 maxcount = REG_INFTY;
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1040 if (flags & SCF_DO_SUBSTR)
1045 if (flags & SCF_DO_STCLASS) {
1047 maxcount = REG_INFTY;
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1052 is_inf = is_inf_internal = 1;
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056 data->longest = &(data->longest_float);
1058 goto optimize_curly_tail;
1060 mincount = ARG1(scan);
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069 next_is_eval = (OP(scan) == EVAL);
1071 if (flags & SCF_DO_SUBSTR) {
1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073 pos_before = data->pos_min;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079 data->flags |= SF_IS_INF;
1081 if (flags & SCF_DO_STCLASS) {
1082 cl_init(pRExC_state, &this_class);
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
1097 /* This will finish on WHILEM, setting scan, or on NULL: */
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
1106 cl_or(pRExC_state, data->start_class, &this_class);
1108 else if (flags & SCF_DO_STCLASS_AND) {
1109 /* Switch to OR mode: cache the old value of
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
1121 cl_or(pRExC_state, data->start_class, &this_class);
1122 cl_and(data->start_class, &and_with);
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1130 if ( /* ? quantifier ok, except for (?{ ... }) */
1131 (next_is_eval || !(mincount == 0 && maxcount == 1))
1132 && (minnext == 0) && (deltanext == 0)
1133 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1134 && maxcount <= REG_INFTY/3 /* Complement check for big count */
1135 && ckWARN(WARN_REGEXP))
1138 "Quantifier unexpected on zero-length expression");
1141 min += minnext * mincount;
1142 is_inf_internal |= ((maxcount == REG_INFTY
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
1145 is_inf |= is_inf_internal;
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148 /* Try powerful optimization CURLYX => CURLYN. */
1149 if ( OP(oscan) == CURLYX && data
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155 regnode *nxt1 = nxt;
1162 if (!strchr((const char*)PL_simple,OP(nxt))
1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164 && STR_LEN(nxt) == 1))
1170 if (OP(nxt) != CLOSE)
1172 /* Now we know that nxt2 is the only contents: */
1173 oscan->flags = (U8)ARG(nxt);
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1187 /* Try optimization CURLYX => CURLYM. */
1188 if ( OP(oscan) == CURLYX && data
1189 && !(data->flags & SF_HAS_PAR)
1190 && !(data->flags & SF_HAS_EVAL)
1191 && !deltanext /* atom is fixed width */
1192 && minnext != 0 /* CURLYM can't handle zero width */
1194 /* XXXX How to optimize if data == 0? */
1195 /* Optimize to a simpler form. */
1196 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1200 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1201 && (OP(nxt2) != WHILEM))
1203 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1204 /* Need to optimize away parenths. */
1205 if (data->flags & SF_IN_PAR) {
1206 /* Set the parenth number. */
1207 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1209 if (OP(nxt) != CLOSE)
1210 FAIL("Panic opt close");
1211 oscan->flags = (U8)ARG(nxt);
1212 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1213 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1215 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1216 OP(nxt + 1) = OPTIMIZED; /* was count. */
1217 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1218 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1221 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1222 regnode *nnxt = regnext(nxt1);
1225 if (reg_off_by_arg[OP(nxt1)])
1226 ARG_SET(nxt1, nxt2 - nxt1);
1227 else if (nxt2 - nxt1 < U16_MAX)
1228 NEXT_OFF(nxt1) = nxt2 - nxt1;
1230 OP(nxt) = NOTHING; /* Cannot beautify */
1235 /* Optimize again: */
1236 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1242 else if ((OP(oscan) == CURLYX)
1243 && (flags & SCF_WHILEM_VISITED_POS)
1244 /* See the comment on a similar expression above.
1245 However, this time it not a subexpression
1246 we care about, but the expression itself. */
1247 && (maxcount == REG_INFTY)
1248 && data && ++data->whilem_c < 16) {
1249 /* This stays as CURLYX, we can put the count/of pair. */
1250 /* Find WHILEM (as in regexec.c) */
1251 regnode *nxt = oscan + NEXT_OFF(oscan);
1253 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1255 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1256 | (RExC_whilem_seen << 4)); /* On WHILEM */
1258 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1260 if (flags & SCF_DO_SUBSTR) {
1261 SV *last_str = Nullsv;
1262 int counted = mincount != 0;
1264 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1265 #if defined(SPARC64_GCC_WORKAROUND)
1268 const char *s = NULL;
1271 if (pos_before >= data->last_start_min)
1274 b = data->last_start_min;
1277 s = SvPV_const(data->last_found, l);
1278 old = b - data->last_start_min;
1281 I32 b = pos_before >= data->last_start_min
1282 ? pos_before : data->last_start_min;
1284 const char *s = SvPV_const(data->last_found, l);
1285 I32 old = b - data->last_start_min;
1289 old = utf8_hop((U8*)s, old) - (U8*)s;
1292 /* Get the added string: */
1293 last_str = newSVpvn(s + old, l);
1295 SvUTF8_on(last_str);
1296 if (deltanext == 0 && pos_before == b) {
1297 /* What was added is a constant string */
1299 SvGROW(last_str, (mincount * l) + 1);
1300 repeatcpy(SvPVX(last_str) + l,
1301 SvPVX_const(last_str), l, mincount - 1);
1302 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1303 /* Add additional parts. */
1304 SvCUR_set(data->last_found,
1305 SvCUR(data->last_found) - l);
1306 sv_catsv(data->last_found, last_str);
1308 SV * sv = data->last_found;
1310 SvUTF8(sv) && SvMAGICAL(sv) ?
1311 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1312 if (mg && mg->mg_len >= 0)
1313 mg->mg_len += CHR_SVLEN(last_str);
1315 data->last_end += l * (mincount - 1);
1318 /* start offset must point into the last copy */
1319 data->last_start_min += minnext * (mincount - 1);
1320 data->last_start_max += is_inf ? I32_MAX
1321 : (maxcount - 1) * (minnext + data->pos_delta);
1324 /* It is counted once already... */
1325 data->pos_min += minnext * (mincount - counted);
1326 data->pos_delta += - counted * deltanext +
1327 (minnext + deltanext) * maxcount - minnext * mincount;
1328 if (mincount != maxcount) {
1329 /* Cannot extend fixed substrings found inside
1331 scan_commit(pRExC_state,data);
1332 if (mincount && last_str) {
1333 SV *sv = data->last_found;
1334 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
1335 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1339 sv_setsv(sv, last_str);
1340 data->last_end = data->pos_min;
1341 data->last_start_min =
1342 data->pos_min - CHR_SVLEN(last_str);
1343 data->last_start_max = is_inf
1345 : data->pos_min + data->pos_delta
1346 - CHR_SVLEN(last_str);
1348 data->longest = &(data->longest_float);
1350 SvREFCNT_dec(last_str);
1352 if (data && (fl & SF_HAS_EVAL))
1353 data->flags |= SF_HAS_EVAL;
1354 optimize_curly_tail:
1355 if (OP(oscan) != CURLYX) {
1356 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1358 NEXT_OFF(oscan) += NEXT_OFF(next);
1361 default: /* REF and CLUMP only? */
1362 if (flags & SCF_DO_SUBSTR) {
1363 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1364 data->longest = &(data->longest_float);
1366 is_inf = is_inf_internal = 1;
1367 if (flags & SCF_DO_STCLASS_OR)
1368 cl_anything(pRExC_state, data->start_class);
1369 flags &= ~SCF_DO_STCLASS;
1373 else if (strchr((const char*)PL_simple,OP(scan))) {
1376 if (flags & SCF_DO_SUBSTR) {
1377 scan_commit(pRExC_state,data);
1381 if (flags & SCF_DO_STCLASS) {
1382 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1384 /* Some of the logic below assumes that switching
1385 locale on will only add false positives. */
1386 switch (PL_regkind[(U8)OP(scan)]) {
1390 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1391 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1392 cl_anything(pRExC_state, data->start_class);
1395 if (OP(scan) == SANY)
1397 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1398 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1399 || (data->start_class->flags & ANYOF_CLASS));
1400 cl_anything(pRExC_state, data->start_class);
1402 if (flags & SCF_DO_STCLASS_AND || !value)
1403 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1406 if (flags & SCF_DO_STCLASS_AND)
1407 cl_and(data->start_class,
1408 (struct regnode_charclass_class*)scan);
1410 cl_or(pRExC_state, data->start_class,
1411 (struct regnode_charclass_class*)scan);
1414 if (flags & SCF_DO_STCLASS_AND) {
1415 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1416 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1417 for (value = 0; value < 256; value++)
1418 if (!isALNUM(value))
1419 ANYOF_BITMAP_CLEAR(data->start_class, value);
1423 if (data->start_class->flags & ANYOF_LOCALE)
1424 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1426 for (value = 0; value < 256; value++)
1428 ANYOF_BITMAP_SET(data->start_class, value);
1433 if (flags & SCF_DO_STCLASS_AND) {
1434 if (data->start_class->flags & ANYOF_LOCALE)
1435 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1438 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1439 data->start_class->flags |= ANYOF_LOCALE;
1443 if (flags & SCF_DO_STCLASS_AND) {
1444 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1445 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1446 for (value = 0; value < 256; value++)
1448 ANYOF_BITMAP_CLEAR(data->start_class, value);
1452 if (data->start_class->flags & ANYOF_LOCALE)
1453 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1455 for (value = 0; value < 256; value++)
1456 if (!isALNUM(value))
1457 ANYOF_BITMAP_SET(data->start_class, value);
1462 if (flags & SCF_DO_STCLASS_AND) {
1463 if (data->start_class->flags & ANYOF_LOCALE)
1464 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1467 data->start_class->flags |= ANYOF_LOCALE;
1468 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1472 if (flags & SCF_DO_STCLASS_AND) {
1473 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1474 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1475 for (value = 0; value < 256; value++)
1476 if (!isSPACE(value))
1477 ANYOF_BITMAP_CLEAR(data->start_class, value);
1481 if (data->start_class->flags & ANYOF_LOCALE)
1482 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1484 for (value = 0; value < 256; value++)
1486 ANYOF_BITMAP_SET(data->start_class, value);
1491 if (flags & SCF_DO_STCLASS_AND) {
1492 if (data->start_class->flags & ANYOF_LOCALE)
1493 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1496 data->start_class->flags |= ANYOF_LOCALE;
1497 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1501 if (flags & SCF_DO_STCLASS_AND) {
1502 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1503 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1504 for (value = 0; value < 256; value++)
1506 ANYOF_BITMAP_CLEAR(data->start_class, value);
1510 if (data->start_class->flags & ANYOF_LOCALE)
1511 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1513 for (value = 0; value < 256; value++)
1514 if (!isSPACE(value))
1515 ANYOF_BITMAP_SET(data->start_class, value);
1520 if (flags & SCF_DO_STCLASS_AND) {
1521 if (data->start_class->flags & ANYOF_LOCALE) {
1522 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1523 for (value = 0; value < 256; value++)
1524 if (!isSPACE(value))
1525 ANYOF_BITMAP_CLEAR(data->start_class, value);
1529 data->start_class->flags |= ANYOF_LOCALE;
1530 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1534 if (flags & SCF_DO_STCLASS_AND) {
1535 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1536 for (value = 0; value < 256; value++)
1537 if (!isDIGIT(value))
1538 ANYOF_BITMAP_CLEAR(data->start_class, value);
1541 if (data->start_class->flags & ANYOF_LOCALE)
1542 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1544 for (value = 0; value < 256; value++)
1546 ANYOF_BITMAP_SET(data->start_class, value);
1551 if (flags & SCF_DO_STCLASS_AND) {
1552 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1553 for (value = 0; value < 256; value++)
1555 ANYOF_BITMAP_CLEAR(data->start_class, value);
1558 if (data->start_class->flags & ANYOF_LOCALE)
1559 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1561 for (value = 0; value < 256; value++)
1562 if (!isDIGIT(value))
1563 ANYOF_BITMAP_SET(data->start_class, value);
1568 if (flags & SCF_DO_STCLASS_OR)
1569 cl_and(data->start_class, &and_with);
1570 flags &= ~SCF_DO_STCLASS;
1573 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1574 data->flags |= (OP(scan) == MEOL
1578 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1579 /* Lookbehind, or need to calculate parens/evals/stclass: */
1580 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1581 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1582 /* Lookahead/lookbehind */
1583 I32 deltanext, minnext, fake = 0;
1585 struct regnode_charclass_class intrnl;
1588 data_fake.flags = 0;
1590 data_fake.whilem_c = data->whilem_c;
1591 data_fake.last_closep = data->last_closep;
1594 data_fake.last_closep = &fake;
1595 if ( flags & SCF_DO_STCLASS && !scan->flags
1596 && OP(scan) == IFMATCH ) { /* Lookahead */
1597 cl_init(pRExC_state, &intrnl);
1598 data_fake.start_class = &intrnl;
1599 f |= SCF_DO_STCLASS_AND;
1601 if (flags & SCF_WHILEM_VISITED_POS)
1602 f |= SCF_WHILEM_VISITED_POS;
1603 next = regnext(scan);
1604 nscan = NEXTOPER(NEXTOPER(scan));
1605 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1608 vFAIL("Variable length lookbehind not implemented");
1610 else if (minnext > U8_MAX) {
1611 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1613 scan->flags = (U8)minnext;
1615 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1617 if (data && (data_fake.flags & SF_HAS_EVAL))
1618 data->flags |= SF_HAS_EVAL;
1620 data->whilem_c = data_fake.whilem_c;
1621 if (f & SCF_DO_STCLASS_AND) {
1622 const int was = (data->start_class->flags & ANYOF_EOS);
1624 cl_and(data->start_class, &intrnl);
1626 data->start_class->flags |= ANYOF_EOS;
1629 else if (OP(scan) == OPEN) {
1632 else if (OP(scan) == CLOSE) {
1633 if ((I32)ARG(scan) == is_par) {
1634 next = regnext(scan);
1636 if ( next && (OP(next) != WHILEM) && next < last)
1637 is_par = 0; /* Disable optimization */
1640 *(data->last_closep) = ARG(scan);
1642 else if (OP(scan) == EVAL) {
1644 data->flags |= SF_HAS_EVAL;
1646 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1647 if (flags & SCF_DO_SUBSTR) {
1648 scan_commit(pRExC_state,data);
1649 data->longest = &(data->longest_float);
1651 is_inf = is_inf_internal = 1;
1652 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1653 cl_anything(pRExC_state, data->start_class);
1654 flags &= ~SCF_DO_STCLASS;
1656 /* Else: zero-length, ignore. */
1657 scan = regnext(scan);
1662 *deltap = is_inf_internal ? I32_MAX : delta;
1663 if (flags & SCF_DO_SUBSTR && is_inf)
1664 data->pos_delta = I32_MAX - data->pos_min;
1665 if (is_par > U8_MAX)
1667 if (is_par && pars==1 && data) {
1668 data->flags |= SF_IN_PAR;
1669 data->flags &= ~SF_HAS_PAR;
1671 else if (pars && data) {
1672 data->flags |= SF_HAS_PAR;
1673 data->flags &= ~SF_IN_PAR;
1675 if (flags & SCF_DO_STCLASS_OR)
1676 cl_and(data->start_class, &and_with);
1681 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
1683 if (RExC_rx->data) {
1684 Renewc(RExC_rx->data,
1685 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1686 char, struct reg_data);
1687 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1688 RExC_rx->data->count += n;
1691 Newxc(RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1692 char, struct reg_data);
1693 Newx(RExC_rx->data->what, n, U8);
1694 RExC_rx->data->count = n;
1696 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1697 return RExC_rx->data->count - n;
1701 Perl_reginitcolors(pTHX)
1703 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1705 char *t = savepv(s);
1709 t = strchr(t, '\t');
1715 PL_colors[i] = t = (char *)"";
1720 PL_colors[i++] = (char *)"";
1727 - pregcomp - compile a regular expression into internal code
1729 * We can't allocate space until we know how big the compiled form will be,
1730 * but we can't compile it (and thus know how big it is) until we've got a
1731 * place to put the code. So we cheat: we compile it twice, once with code
1732 * generation turned off and size counting turned on, and once "for real".
1733 * This also means that we don't allocate space until we are sure that the
1734 * thing really will compile successfully, and we never have to move the
1735 * code and thus invalidate pointers into it. (Note that it has to be in
1736 * one piece because free() must be able to free it all.) [NB: not true in perl]
1738 * Beware that the optimization-preparation code in here knows about some
1739 * of the structure of the compiled regexp. [I'll say.]
1742 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1752 RExC_state_t RExC_state;
1753 RExC_state_t *pRExC_state = &RExC_state;
1756 FAIL("NULL regexp argument");
1758 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1762 if (!PL_colorset) reginitcolors();
1763 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1764 PL_colors[4],PL_colors[5],PL_colors[0],
1765 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1767 RExC_flags = pm->op_pmflags;
1771 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1772 RExC_seen_evals = 0;
1775 /* First pass: determine size, legality. */
1782 RExC_emit = &PL_regdummy;
1783 RExC_whilem_seen = 0;
1784 #if 0 /* REGC() is (currently) a NOP at the first pass.
1785 * Clever compilers notice this and complain. --jhi */
1786 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1788 if (reg(pRExC_state, 0, &flags) == NULL) {
1789 RExC_precomp = Nullch;
1792 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1794 /* Small enough for pointer-storage convention?
1795 If extralen==0, this means that we will not need long jumps. */
1796 if (RExC_size >= 0x10000L && RExC_extralen)
1797 RExC_size += RExC_extralen;
1800 if (RExC_whilem_seen > 15)
1801 RExC_whilem_seen = 15;
1803 /* Allocate space and initialize. */
1804 Newxc(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1807 FAIL("Regexp out of space");
1810 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1811 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1814 r->prelen = xend - exp;
1815 r->precomp = savepvn(RExC_precomp, r->prelen);
1817 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1818 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1819 r->lastparen = 0; /* mg.c reads this. */
1821 r->substrs = 0; /* Useful during FAIL. */
1822 r->startp = 0; /* Useful during FAIL. */
1823 r->endp = 0; /* Useful during FAIL. */
1825 Newxz(r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1827 r->offsets[0] = RExC_size;
1829 DEBUG_r(PerlIO_printf(Perl_debug_log,
1830 "%s %"UVuf" bytes for offset annotations.\n",
1831 r->offsets ? "Got" : "Couldn't get",
1832 (UV)((2*RExC_size+1) * sizeof(U32))));
1836 /* Second pass: emit code. */
1837 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1842 RExC_emit_start = r->program;
1843 RExC_emit = r->program;
1844 /* Store the count of eval-groups for security checks: */
1845 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1846 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1848 if (reg(pRExC_state, 0, &flags) == NULL)
1851 /* Dig out information for optimizations. */
1852 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1853 pm->op_pmflags = RExC_flags;
1855 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1856 r->regstclass = NULL;
1857 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1858 r->reganch |= ROPT_NAUGHTY;
1859 scan = r->program + 1; /* First BRANCH. */
1861 /* XXXX To minimize changes to RE engine we always allocate
1862 3-units-long substrs field. */
1863 Newxz(r->substrs, 1, struct reg_substr_data);
1865 StructCopy(&zero_scan_data, &data, scan_data_t);
1866 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1867 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1869 STRLEN longest_float_length, longest_fixed_length;
1870 struct regnode_charclass_class ch_class;
1875 /* Skip introductions and multiplicators >= 1. */
1876 while ((OP(first) == OPEN && (sawopen = 1)) ||
1877 /* An OR of *one* alternative - should not happen now. */
1878 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1879 (OP(first) == PLUS) ||
1880 (OP(first) == MINMOD) ||
1881 /* An {n,m} with n>0 */
1882 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1883 if (OP(first) == PLUS)
1886 first += regarglen[(U8)OP(first)];
1887 first = NEXTOPER(first);
1890 /* Starting-point info. */
1892 if (PL_regkind[(U8)OP(first)] == EXACT) {
1893 if (OP(first) == EXACT)
1894 ; /* Empty, get anchored substr later. */
1895 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1896 r->regstclass = first;
1898 else if (strchr((const char*)PL_simple,OP(first)))
1899 r->regstclass = first;
1900 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1901 PL_regkind[(U8)OP(first)] == NBOUND)
1902 r->regstclass = first;
1903 else if (PL_regkind[(U8)OP(first)] == BOL) {
1904 r->reganch |= (OP(first) == MBOL
1906 : (OP(first) == SBOL
1909 first = NEXTOPER(first);
1912 else if (OP(first) == GPOS) {
1913 r->reganch |= ROPT_ANCH_GPOS;
1914 first = NEXTOPER(first);
1917 else if (!sawopen && (OP(first) == STAR &&
1918 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1919 !(r->reganch & ROPT_ANCH) )
1921 /* turn .* into ^.* with an implied $*=1 */
1923 (OP(NEXTOPER(first)) == REG_ANY)
1926 r->reganch |= type | ROPT_IMPLICIT;
1927 first = NEXTOPER(first);
1930 if (sawplus && (!sawopen || !RExC_sawback)
1931 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1932 /* x+ must match at the 1st pos of run of x's */
1933 r->reganch |= ROPT_SKIP;
1935 /* Scan is after the zeroth branch, first is atomic matcher. */
1936 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1937 (IV)(first - scan + 1)));
1939 * If there's something expensive in the r.e., find the
1940 * longest literal string that must appear and make it the
1941 * regmust. Resolve ties in favor of later strings, since
1942 * the regstart check works with the beginning of the r.e.
1943 * and avoiding duplication strengthens checking. Not a
1944 * strong reason, but sufficient in the absence of others.
1945 * [Now we resolve ties in favor of the earlier string if
1946 * it happens that c_offset_min has been invalidated, since the
1947 * earlier string may buy us something the later one won't.]
1951 data.longest_fixed = newSVpvn("",0);
1952 data.longest_float = newSVpvn("",0);
1953 data.last_found = newSVpvn("",0);
1954 data.longest = &(data.longest_fixed);
1956 if (!r->regstclass) {
1957 cl_init(pRExC_state, &ch_class);
1958 data.start_class = &ch_class;
1959 stclass_flag = SCF_DO_STCLASS_AND;
1960 } else /* XXXX Check for BOUND? */
1962 data.last_closep = &last_close;
1964 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1965 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1966 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1967 && data.last_start_min == 0 && data.last_end > 0
1968 && !RExC_seen_zerolen
1969 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1970 r->reganch |= ROPT_CHECK_ALL;
1971 scan_commit(pRExC_state, &data);
1972 SvREFCNT_dec(data.last_found);
1974 longest_float_length = CHR_SVLEN(data.longest_float);
1975 if (longest_float_length
1976 || (data.flags & SF_FL_BEFORE_EOL
1977 && (!(data.flags & SF_FL_BEFORE_MEOL)
1978 || (RExC_flags & PMf_MULTILINE)))) {
1981 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1982 && data.offset_fixed == data.offset_float_min
1983 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1984 goto remove_float; /* As in (a)+. */
1986 if (SvUTF8(data.longest_float)) {
1987 r->float_utf8 = data.longest_float;
1988 r->float_substr = Nullsv;
1990 r->float_substr = data.longest_float;
1991 r->float_utf8 = Nullsv;
1993 r->float_min_offset = data.offset_float_min;
1994 r->float_max_offset = data.offset_float_max;
1995 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1996 && (!(data.flags & SF_FL_BEFORE_MEOL)
1997 || (RExC_flags & PMf_MULTILINE)));
1998 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2002 r->float_substr = r->float_utf8 = Nullsv;
2003 SvREFCNT_dec(data.longest_float);
2004 longest_float_length = 0;
2007 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2008 if (longest_fixed_length
2009 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2010 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2011 || (RExC_flags & PMf_MULTILINE)))) {
2014 if (SvUTF8(data.longest_fixed)) {
2015 r->anchored_utf8 = data.longest_fixed;
2016 r->anchored_substr = Nullsv;
2018 r->anchored_substr = data.longest_fixed;
2019 r->anchored_utf8 = Nullsv;
2021 r->anchored_offset = data.offset_fixed;
2022 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2023 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2024 || (RExC_flags & PMf_MULTILINE)));
2025 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2028 r->anchored_substr = r->anchored_utf8 = Nullsv;
2029 SvREFCNT_dec(data.longest_fixed);
2030 longest_fixed_length = 0;
2033 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2034 r->regstclass = NULL;
2035 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2037 && !(data.start_class->flags & ANYOF_EOS)
2038 && !cl_is_anything(data.start_class))
2040 const I32 n = add_data(pRExC_state, 1, "f");
2042 Newx(RExC_rx->data->data[n], 1,
2043 struct regnode_charclass_class);
2044 StructCopy(data.start_class,
2045 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2046 struct regnode_charclass_class);
2047 r->regstclass = (regnode*)RExC_rx->data->data[n];
2048 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2049 PL_regdata = r->data; /* for regprop() */
2050 DEBUG_r({ SV *sv = sv_newmortal();
2051 regprop(sv, (regnode*)data.start_class);
2052 PerlIO_printf(Perl_debug_log,
2053 "synthetic stclass \"%s\".\n",
2054 SvPVX_const(sv));});
2057 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2058 if (longest_fixed_length > longest_float_length) {
2059 r->check_substr = r->anchored_substr;
2060 r->check_utf8 = r->anchored_utf8;
2061 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2062 if (r->reganch & ROPT_ANCH_SINGLE)
2063 r->reganch |= ROPT_NOSCAN;
2066 r->check_substr = r->float_substr;
2067 r->check_utf8 = r->float_utf8;
2068 r->check_offset_min = data.offset_float_min;
2069 r->check_offset_max = data.offset_float_max;
2071 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2072 This should be changed ASAP! */
2073 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2074 r->reganch |= RE_USE_INTUIT;
2075 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2076 r->reganch |= RE_INTUIT_TAIL;
2080 /* Several toplevels. Best we can is to set minlen. */
2082 struct regnode_charclass_class ch_class;
2085 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2086 scan = r->program + 1;
2087 cl_init(pRExC_state, &ch_class);
2088 data.start_class = &ch_class;
2089 data.last_closep = &last_close;
2090 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2091 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2092 = r->float_substr = r->float_utf8 = Nullsv;
2093 if (!(data.start_class->flags & ANYOF_EOS)
2094 && !cl_is_anything(data.start_class))
2096 const I32 n = add_data(pRExC_state, 1, "f");
2098 Newx(RExC_rx->data->data[n], 1,
2099 struct regnode_charclass_class);
2100 StructCopy(data.start_class,
2101 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2102 struct regnode_charclass_class);
2103 r->regstclass = (regnode*)RExC_rx->data->data[n];
2104 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2105 DEBUG_r({ SV* sv = sv_newmortal();
2106 regprop(sv, (regnode*)data.start_class);
2107 PerlIO_printf(Perl_debug_log,
2108 "synthetic stclass \"%s\".\n",
2109 SvPVX_const(sv));});
2114 if (RExC_seen & REG_SEEN_GPOS)
2115 r->reganch |= ROPT_GPOS_SEEN;
2116 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2117 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2118 if (RExC_seen & REG_SEEN_EVAL)
2119 r->reganch |= ROPT_EVAL_SEEN;
2120 if (RExC_seen & REG_SEEN_CANY)
2121 r->reganch |= ROPT_CANY_SEEN;
2122 Newxz(r->startp, RExC_npar, I32);
2123 Newxz(r->endp, RExC_npar, I32);
2124 PL_regdata = r->data; /* for regprop() */
2125 DEBUG_r(regdump(r));
2130 - reg - regular expression, i.e. main body or parenthesized thing
2132 * Caller must absorb opening parenthesis.
2134 * Combining parenthesis handling with the base level of regular expression
2135 * is a trifle forced, but the need to tie the tails of the branches to what
2136 * follows makes it hard to avoid.
2139 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2140 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2142 register regnode *ret; /* Will be the head of the group. */
2143 register regnode *br;
2144 register regnode *lastbr;
2145 register regnode *ender = 0;
2146 register I32 parno = 0;
2147 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2149 /* for (?g), (?gc), and (?o) warnings; warning
2150 about (?c) will warn about (?g) -- japhy */
2152 I32 wastedflags = 0x00,
2155 wasted_gc = 0x02 | 0x04,
2158 char * parse_start = RExC_parse; /* MJD */
2159 char * const oregcomp_parse = RExC_parse;
2162 *flagp = 0; /* Tentatively. */
2165 /* Make an OPEN node, if parenthesized. */
2167 if (*RExC_parse == '?') { /* (?...) */
2168 U32 posflags = 0, negflags = 0;
2169 U32 *flagsp = &posflags;
2171 const char * const seqstart = RExC_parse;
2174 paren = *RExC_parse++;
2175 ret = NULL; /* For look-ahead/behind. */
2177 case '<': /* (?<...) */
2178 RExC_seen |= REG_SEEN_LOOKBEHIND;
2179 if (*RExC_parse == '!')
2181 if (*RExC_parse != '=' && *RExC_parse != '!')
2184 case '=': /* (?=...) */
2185 case '!': /* (?!...) */
2186 RExC_seen_zerolen++;
2187 case ':': /* (?:...) */
2188 case '>': /* (?>...) */
2190 case '$': /* (?$...) */
2191 case '@': /* (?@...) */
2192 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2194 case '#': /* (?#...) */
2195 while (*RExC_parse && *RExC_parse != ')')
2197 if (*RExC_parse != ')')
2198 FAIL("Sequence (?#... not terminated");
2199 nextchar(pRExC_state);
2202 case 'p': /* (?p...) */
2203 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2204 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2206 case '?': /* (??...) */
2208 if (*RExC_parse != '{')
2210 paren = *RExC_parse++;
2212 case '{': /* (?{...}) */
2214 I32 count = 1, n = 0;
2216 char *s = RExC_parse;
2218 OP_4tree *sop, *rop;
2220 RExC_seen_zerolen++;
2221 RExC_seen |= REG_SEEN_EVAL;
2222 while (count && (c = *RExC_parse)) {
2223 if (c == '\\' && RExC_parse[1])
2231 if (*RExC_parse != ')')
2234 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2239 if (RExC_parse - 1 - s)
2240 sv = newSVpvn(s, RExC_parse - 1 - s);
2242 sv = newSVpvn("", 0);
2245 Perl_save_re_context(aTHX);
2246 rop = sv_compile_2op(sv, &sop, "re", &pad);
2247 sop->op_private |= OPpREFCOUNTED;
2248 /* re_dup will OpREFCNT_inc */
2249 OpREFCNT_set(sop, 1);
2252 n = add_data(pRExC_state, 3, "nop");
2253 RExC_rx->data->data[n] = (void*)rop;
2254 RExC_rx->data->data[n+1] = (void*)sop;
2255 RExC_rx->data->data[n+2] = (void*)pad;
2258 else { /* First pass */
2259 if (PL_reginterp_cnt < ++RExC_seen_evals
2261 /* No compiled RE interpolated, has runtime
2262 components ===> unsafe. */
2263 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2264 if (PL_tainting && PL_tainted)
2265 FAIL("Eval-group in insecure regular expression");
2268 nextchar(pRExC_state);
2270 ret = reg_node(pRExC_state, LOGICAL);
2273 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2274 /* deal with the length of this later - MJD */
2277 ret = reganode(pRExC_state, EVAL, n);
2278 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2279 Set_Node_Offset(ret, parse_start);
2282 case '(': /* (?(?{...})...) and (?(?=...)...) */
2284 if (RExC_parse[0] == '?') { /* (?(?...)) */
2285 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2286 || RExC_parse[1] == '<'
2287 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2290 ret = reg_node(pRExC_state, LOGICAL);
2293 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2297 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2299 parno = atoi(RExC_parse++);
2301 while (isDIGIT(*RExC_parse))
2303 ret = reganode(pRExC_state, GROUPP, parno);
2305 if ((c = *nextchar(pRExC_state)) != ')')
2306 vFAIL("Switch condition not recognized");
2308 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2309 br = regbranch(pRExC_state, &flags, 1);
2311 br = reganode(pRExC_state, LONGJMP, 0);
2313 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2314 c = *nextchar(pRExC_state);
2318 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2319 regbranch(pRExC_state, &flags, 1);
2320 regtail(pRExC_state, ret, lastbr);
2323 c = *nextchar(pRExC_state);
2328 vFAIL("Switch (?(condition)... contains too many branches");
2329 ender = reg_node(pRExC_state, TAIL);
2330 regtail(pRExC_state, br, ender);
2332 regtail(pRExC_state, lastbr, ender);
2333 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2336 regtail(pRExC_state, ret, ender);
2340 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2344 RExC_parse--; /* for vFAIL to print correctly */
2345 vFAIL("Sequence (? incomplete");
2349 parse_flags: /* (?i) */
2350 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2351 /* (?g), (?gc) and (?o) are useless here
2352 and must be globally applied -- japhy */
2354 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2355 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2356 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2357 if (! (wastedflags & wflagbit) ) {
2358 wastedflags |= wflagbit;
2361 "Useless (%s%c) - %suse /%c modifier",
2362 flagsp == &negflags ? "?-" : "?",
2364 flagsp == &negflags ? "don't " : "",
2370 else if (*RExC_parse == 'c') {
2371 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2372 if (! (wastedflags & wasted_c) ) {
2373 wastedflags |= wasted_gc;
2376 "Useless (%sc) - %suse /gc modifier",
2377 flagsp == &negflags ? "?-" : "?",
2378 flagsp == &negflags ? "don't " : ""
2383 else { pmflag(flagsp, *RExC_parse); }
2387 if (*RExC_parse == '-') {
2389 wastedflags = 0; /* reset so (?g-c) warns twice */
2393 RExC_flags |= posflags;
2394 RExC_flags &= ~negflags;
2395 if (*RExC_parse == ':') {
2401 if (*RExC_parse != ')') {
2403 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2405 nextchar(pRExC_state);
2413 ret = reganode(pRExC_state, OPEN, parno);
2414 Set_Node_Length(ret, 1); /* MJD */
2415 Set_Node_Offset(ret, RExC_parse); /* MJD */
2422 /* Pick up the branches, linking them together. */
2423 parse_start = RExC_parse; /* MJD */
2424 br = regbranch(pRExC_state, &flags, 1);
2425 /* branch_len = (paren != 0); */
2429 if (*RExC_parse == '|') {
2430 if (!SIZE_ONLY && RExC_extralen) {
2431 reginsert(pRExC_state, BRANCHJ, br);
2434 reginsert(pRExC_state, BRANCH, br);
2435 Set_Node_Length(br, paren != 0);
2436 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2440 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2442 else if (paren == ':') {
2443 *flagp |= flags&SIMPLE;
2445 if (open) { /* Starts with OPEN. */
2446 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2448 else if (paren != '?') /* Not Conditional */
2450 *flagp |= flags & (SPSTART | HASWIDTH);
2452 while (*RExC_parse == '|') {
2453 if (!SIZE_ONLY && RExC_extralen) {
2454 ender = reganode(pRExC_state, LONGJMP,0);
2455 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2458 RExC_extralen += 2; /* Account for LONGJMP. */
2459 nextchar(pRExC_state);
2460 br = regbranch(pRExC_state, &flags, 0);
2464 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2468 *flagp |= flags&SPSTART;
2471 if (have_branch || paren != ':') {
2472 /* Make a closing node, and hook it on the end. */
2475 ender = reg_node(pRExC_state, TAIL);
2478 ender = reganode(pRExC_state, CLOSE, parno);
2479 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2480 Set_Node_Length(ender,1); /* MJD */
2486 *flagp &= ~HASWIDTH;
2489 ender = reg_node(pRExC_state, SUCCEED);
2492 ender = reg_node(pRExC_state, END);
2495 regtail(pRExC_state, lastbr, ender);
2498 /* Hook the tails of the branches to the closing node. */
2499 for (br = ret; br != NULL; br = regnext(br)) {
2500 regoptail(pRExC_state, br, ender);
2507 static const char parens[] = "=!<,>";
2509 if (paren && (p = strchr(parens, paren))) {
2510 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2511 int flag = (p - parens) > 1;
2514 node = SUSPEND, flag = 0;
2515 reginsert(pRExC_state, node,ret);
2516 Set_Node_Cur_Length(ret);
2517 Set_Node_Offset(ret, parse_start + 1);
2519 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2523 /* Check for proper termination. */
2525 RExC_flags = oregflags;
2526 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2527 RExC_parse = oregcomp_parse;
2528 vFAIL("Unmatched (");
2531 else if (!paren && RExC_parse < RExC_end) {
2532 if (*RExC_parse == ')') {
2534 vFAIL("Unmatched )");
2537 FAIL("Junk on end of regexp"); /* "Can't happen". */
2545 - regbranch - one alternative of an | operator
2547 * Implements the concatenation operator.
2550 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2552 register regnode *ret;
2553 register regnode *chain = NULL;
2554 register regnode *latest;
2555 I32 flags = 0, c = 0;
2560 if (!SIZE_ONLY && RExC_extralen)
2561 ret = reganode(pRExC_state, BRANCHJ,0);
2563 ret = reg_node(pRExC_state, BRANCH);
2564 Set_Node_Length(ret, 1);
2568 if (!first && SIZE_ONLY)
2569 RExC_extralen += 1; /* BRANCHJ */
2571 *flagp = WORST; /* Tentatively. */
2574 nextchar(pRExC_state);
2575 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2577 latest = regpiece(pRExC_state, &flags);
2578 if (latest == NULL) {
2579 if (flags & TRYAGAIN)
2583 else if (ret == NULL)
2585 *flagp |= flags&HASWIDTH;
2586 if (chain == NULL) /* First piece. */
2587 *flagp |= flags&SPSTART;
2590 regtail(pRExC_state, chain, latest);
2595 if (chain == NULL) { /* Loop ran zero times. */
2596 chain = reg_node(pRExC_state, NOTHING);
2601 *flagp |= flags&SIMPLE;
2608 - regpiece - something followed by possible [*+?]
2610 * Note that the branching code sequences used for ? and the general cases
2611 * of * and + are somewhat optimized: they use the same NOTHING node as
2612 * both the endmarker for their branch list and the body of the last branch.
2613 * It might seem that this node could be dispensed with entirely, but the
2614 * endmarker role is not redundant.
2617 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2619 register regnode *ret;
2621 register char *next;
2623 const char * const origparse = RExC_parse;
2626 I32 max = REG_INFTY;
2629 ret = regatom(pRExC_state, &flags);
2631 if (flags & TRYAGAIN)
2638 if (op == '{' && regcurly(RExC_parse)) {
2639 parse_start = RExC_parse; /* MJD */
2640 next = RExC_parse + 1;
2642 while (isDIGIT(*next) || *next == ',') {
2651 if (*next == '}') { /* got one */
2655 min = atoi(RExC_parse);
2659 maxpos = RExC_parse;
2661 if (!max && *maxpos != '0')
2662 max = REG_INFTY; /* meaning "infinity" */
2663 else if (max >= REG_INFTY)
2664 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2666 nextchar(pRExC_state);
2669 if ((flags&SIMPLE)) {
2670 RExC_naughty += 2 + RExC_naughty / 2;
2671 reginsert(pRExC_state, CURLY, ret);
2672 Set_Node_Offset(ret, parse_start+1); /* MJD */
2673 Set_Node_Cur_Length(ret);
2676 regnode *w = reg_node(pRExC_state, WHILEM);
2679 regtail(pRExC_state, ret, w);
2680 if (!SIZE_ONLY && RExC_extralen) {
2681 reginsert(pRExC_state, LONGJMP,ret);
2682 reginsert(pRExC_state, NOTHING,ret);
2683 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2685 reginsert(pRExC_state, CURLYX,ret);
2687 Set_Node_Offset(ret, parse_start+1);
2688 Set_Node_Length(ret,
2689 op == '{' ? (RExC_parse - parse_start) : 1);
2691 if (!SIZE_ONLY && RExC_extralen)
2692 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2693 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2695 RExC_whilem_seen++, RExC_extralen += 3;
2696 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2704 if (max && max < min)
2705 vFAIL("Can't do {n,m} with n > m");
2707 ARG1_SET(ret, (U16)min);
2708 ARG2_SET(ret, (U16)max);
2720 #if 0 /* Now runtime fix should be reliable. */
2722 /* if this is reinstated, don't forget to put this back into perldiag:
2724 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2726 (F) The part of the regexp subject to either the * or + quantifier
2727 could match an empty string. The {#} shows in the regular
2728 expression about where the problem was discovered.
2732 if (!(flags&HASWIDTH) && op != '?')
2733 vFAIL("Regexp *+ operand could be empty");
2736 parse_start = RExC_parse;
2737 nextchar(pRExC_state);
2739 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2741 if (op == '*' && (flags&SIMPLE)) {
2742 reginsert(pRExC_state, STAR, ret);
2746 else if (op == '*') {
2750 else if (op == '+' && (flags&SIMPLE)) {
2751 reginsert(pRExC_state, PLUS, ret);
2755 else if (op == '+') {
2759 else if (op == '?') {
2764 if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
2766 "%.*s matches null string many times",
2767 RExC_parse - origparse,
2771 if (*RExC_parse == '?') {
2772 nextchar(pRExC_state);
2773 reginsert(pRExC_state, MINMOD, ret);
2774 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2776 if (ISMULT2(RExC_parse)) {
2778 vFAIL("Nested quantifiers");
2785 - regatom - the lowest level
2787 * Optimization: gobbles an entire sequence of ordinary characters so that
2788 * it can turn them into a single node, which is smaller to store and
2789 * faster to run. Backslashed characters are exceptions, each becoming a
2790 * separate node; the code is simpler that way and it's not worth fixing.
2792 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2794 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2796 register regnode *ret = 0;
2798 char *parse_start = RExC_parse;
2800 *flagp = WORST; /* Tentatively. */
2803 switch (*RExC_parse) {
2805 RExC_seen_zerolen++;
2806 nextchar(pRExC_state);
2807 if (RExC_flags & PMf_MULTILINE)
2808 ret = reg_node(pRExC_state, MBOL);
2809 else if (RExC_flags & PMf_SINGLELINE)
2810 ret = reg_node(pRExC_state, SBOL);
2812 ret = reg_node(pRExC_state, BOL);
2813 Set_Node_Length(ret, 1); /* MJD */
2816 nextchar(pRExC_state);
2818 RExC_seen_zerolen++;
2819 if (RExC_flags & PMf_MULTILINE)
2820 ret = reg_node(pRExC_state, MEOL);
2821 else if (RExC_flags & PMf_SINGLELINE)
2822 ret = reg_node(pRExC_state, SEOL);
2824 ret = reg_node(pRExC_state, EOL);
2825 Set_Node_Length(ret, 1); /* MJD */
2828 nextchar(pRExC_state);
2829 if (RExC_flags & PMf_SINGLELINE)
2830 ret = reg_node(pRExC_state, SANY);
2832 ret = reg_node(pRExC_state, REG_ANY);
2833 *flagp |= HASWIDTH|SIMPLE;
2835 Set_Node_Length(ret, 1); /* MJD */
2839 char *oregcomp_parse = ++RExC_parse;
2840 ret = regclass(pRExC_state);
2841 if (*RExC_parse != ']') {
2842 RExC_parse = oregcomp_parse;
2843 vFAIL("Unmatched [");
2845 nextchar(pRExC_state);
2846 *flagp |= HASWIDTH|SIMPLE;
2847 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2851 nextchar(pRExC_state);
2852 ret = reg(pRExC_state, 1, &flags);
2854 if (flags & TRYAGAIN) {
2855 if (RExC_parse == RExC_end) {
2856 /* Make parent create an empty node if needed. */
2864 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2868 if (flags & TRYAGAIN) {
2872 vFAIL("Internal urp");
2873 /* Supposed to be caught earlier. */
2876 if (!regcurly(RExC_parse)) {
2885 vFAIL("Quantifier follows nothing");
2888 switch (*++RExC_parse) {
2890 RExC_seen_zerolen++;
2891 ret = reg_node(pRExC_state, SBOL);
2893 nextchar(pRExC_state);
2894 Set_Node_Length(ret, 2); /* MJD */
2897 ret = reg_node(pRExC_state, GPOS);
2898 RExC_seen |= REG_SEEN_GPOS;
2900 nextchar(pRExC_state);
2901 Set_Node_Length(ret, 2); /* MJD */
2904 ret = reg_node(pRExC_state, SEOL);
2906 RExC_seen_zerolen++; /* Do not optimize RE away */
2907 nextchar(pRExC_state);
2910 ret = reg_node(pRExC_state, EOS);
2912 RExC_seen_zerolen++; /* Do not optimize RE away */
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2917 ret = reg_node(pRExC_state, CANY);
2918 RExC_seen |= REG_SEEN_CANY;
2919 *flagp |= HASWIDTH|SIMPLE;
2920 nextchar(pRExC_state);
2921 Set_Node_Length(ret, 2); /* MJD */
2924 ret = reg_node(pRExC_state, CLUMP);
2926 nextchar(pRExC_state);
2927 Set_Node_Length(ret, 2); /* MJD */
2930 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2931 *flagp |= HASWIDTH|SIMPLE;
2932 nextchar(pRExC_state);
2933 Set_Node_Length(ret, 2); /* MJD */
2936 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2937 *flagp |= HASWIDTH|SIMPLE;
2938 nextchar(pRExC_state);
2939 Set_Node_Length(ret, 2); /* MJD */
2942 RExC_seen_zerolen++;
2943 RExC_seen |= REG_SEEN_LOOKBEHIND;
2944 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2946 nextchar(pRExC_state);
2947 Set_Node_Length(ret, 2); /* MJD */
2950 RExC_seen_zerolen++;
2951 RExC_seen |= REG_SEEN_LOOKBEHIND;
2952 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2954 nextchar(pRExC_state);
2955 Set_Node_Length(ret, 2); /* MJD */
2958 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2959 *flagp |= HASWIDTH|SIMPLE;
2960 nextchar(pRExC_state);
2961 Set_Node_Length(ret, 2); /* MJD */
2964 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2965 *flagp |= HASWIDTH|SIMPLE;
2966 nextchar(pRExC_state);
2967 Set_Node_Length(ret, 2); /* MJD */
2970 ret = reg_node(pRExC_state, DIGIT);
2971 *flagp |= HASWIDTH|SIMPLE;
2972 nextchar(pRExC_state);
2973 Set_Node_Length(ret, 2); /* MJD */
2976 ret = reg_node(pRExC_state, NDIGIT);
2977 *flagp |= HASWIDTH|SIMPLE;
2978 nextchar(pRExC_state);
2979 Set_Node_Length(ret, 2); /* MJD */
2984 char* oldregxend = RExC_end;
2985 char* parse_start = RExC_parse - 2;
2987 if (RExC_parse[1] == '{') {
2988 /* a lovely hack--pretend we saw [\pX] instead */
2989 RExC_end = strchr(RExC_parse, '}');
2991 U8 c = (U8)*RExC_parse;
2993 RExC_end = oldregxend;
2994 vFAIL2("Missing right brace on \\%c{}", c);
2999 RExC_end = RExC_parse + 2;
3000 if (RExC_end > oldregxend)
3001 RExC_end = oldregxend;
3005 ret = regclass(pRExC_state);
3007 RExC_end = oldregxend;
3010 Set_Node_Offset(ret, parse_start + 2);
3011 Set_Node_Cur_Length(ret);
3012 nextchar(pRExC_state);
3013 *flagp |= HASWIDTH|SIMPLE;
3026 case '1': case '2': case '3': case '4':
3027 case '5': case '6': case '7': case '8': case '9':
3029 const I32 num = atoi(RExC_parse);
3031 if (num > 9 && num >= RExC_npar)
3034 char * parse_start = RExC_parse - 1; /* MJD */
3035 while (isDIGIT(*RExC_parse))
3038 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3039 vFAIL("Reference to nonexistent group");
3041 ret = reganode(pRExC_state,
3042 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3046 /* override incorrect value set in reganode MJD */
3047 Set_Node_Offset(ret, parse_start+1);
3048 Set_Node_Cur_Length(ret); /* MJD */
3050 nextchar(pRExC_state);
3055 if (RExC_parse >= RExC_end)
3056 FAIL("Trailing \\");
3059 /* Do not generate "unrecognized" warnings here, we fall
3060 back into the quick-grab loop below */
3067 if (RExC_flags & PMf_EXTENDED) {
3068 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3069 if (RExC_parse < RExC_end)
3075 register STRLEN len;
3080 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3082 parse_start = RExC_parse - 1;
3088 ret = reg_node(pRExC_state,
3089 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3091 for (len = 0, p = RExC_parse - 1;
3092 len < 127 && p < RExC_end;
3097 if (RExC_flags & PMf_EXTENDED)
3098 p = regwhite(p, RExC_end);
3145 ender = ASCII_TO_NATIVE('\033');
3149 ender = ASCII_TO_NATIVE('\007');
3154 char* const e = strchr(p, '}');
3158 vFAIL("Missing right brace on \\x{}");
3161 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3162 | PERL_SCAN_DISALLOW_PREFIX;
3163 STRLEN numlen = e - p - 1;
3164 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3171 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3173 ender = grok_hex(p, &numlen, &flags, NULL);
3179 ender = UCHARAT(p++);
3180 ender = toCTRL(ender);
3182 case '0': case '1': case '2': case '3':case '4':
3183 case '5': case '6': case '7': case '8':case '9':
3185 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3188 ender = grok_oct(p, &numlen, &flags, NULL);
3198 FAIL("Trailing \\");
3201 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
3202 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3203 goto normal_default;
3208 if (UTF8_IS_START(*p) && UTF) {
3210 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3218 if (RExC_flags & PMf_EXTENDED)
3219 p = regwhite(p, RExC_end);
3221 /* Prime the casefolded buffer. */
3222 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3224 if (ISMULT2(p)) { /* Back off on ?+*. */
3231 /* Emit all the Unicode characters. */
3233 for (foldbuf = tmpbuf;
3235 foldlen -= numlen) {
3236 ender = utf8_to_uvchr(foldbuf, &numlen);
3238 reguni(pRExC_state, ender, s, &unilen);
3241 /* In EBCDIC the numlen
3242 * and unilen can differ. */
3244 if (numlen >= foldlen)
3248 break; /* "Can't happen." */
3252 reguni(pRExC_state, ender, s, &unilen);
3261 REGC((char)ender, s++);
3269 /* Emit all the Unicode characters. */
3271 for (foldbuf = tmpbuf;
3273 foldlen -= numlen) {
3274 ender = utf8_to_uvchr(foldbuf, &numlen);
3276 reguni(pRExC_state, ender, s, &unilen);
3279 /* In EBCDIC the numlen
3280 * and unilen can differ. */
3282 if (numlen >= foldlen)
3290 reguni(pRExC_state, ender, s, &unilen);
3299 REGC((char)ender, s++);
3303 Set_Node_Cur_Length(ret); /* MJD */
3304 nextchar(pRExC_state);
3306 /* len is STRLEN which is unsigned, need to copy to signed */
3309 vFAIL("Internal disaster");
3313 if (len == 1 && UNI_IS_INVARIANT(ender))
3318 RExC_size += STR_SZ(len);
3320 RExC_emit += STR_SZ(len);
3325 /* If the encoding pragma is in effect recode the text of
3326 * any EXACT-kind nodes. */
3327 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3328 STRLEN oldlen = STR_LEN(ret);
3329 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3333 if (sv_utf8_downgrade(sv, TRUE)) {
3334 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3335 const STRLEN newlen = SvCUR(sv);
3340 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3341 (int)oldlen, STRING(ret),
3343 Copy(s, STRING(ret), newlen, char);
3344 STR_LEN(ret) += newlen - oldlen;
3345 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3347 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3355 S_regwhite(pTHX_ char *p, const char *e)
3360 else if (*p == '#') {
3363 } while (p < e && *p != '\n');
3371 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3372 Character classes ([:foo:]) can also be negated ([:^foo:]).
3373 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3374 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3375 but trigger failures because they are currently unimplemented. */
3377 #define POSIXCC_DONE(c) ((c) == ':')
3378 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3379 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3382 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3384 I32 namedclass = OOB_NAMEDCLASS;
3386 if (value == '[' && RExC_parse + 1 < RExC_end &&
3387 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3388 POSIXCC(UCHARAT(RExC_parse))) {
3389 const char c = UCHARAT(RExC_parse);
3390 char* s = RExC_parse++;
3392 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3394 if (RExC_parse == RExC_end)
3395 /* Grandfather lone [:, [=, [. */
3398 const char* t = RExC_parse++; /* skip over the c */
3399 const char *posixcc;
3403 if (UCHARAT(RExC_parse) == ']') {
3404 RExC_parse++; /* skip over the ending ] */
3407 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3408 const I32 skip = t - posixcc;
3410 /* Initially switch on the length of the name. */
3413 if (memEQ(posixcc, "word", 4)) {
3414 /* this is not POSIX, this is the Perl \w */;
3416 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3420 /* Names all of length 5. */
3421 /* alnum alpha ascii blank cntrl digit graph lower
3422 print punct space upper */
3423 /* Offset 4 gives the best switch position. */
3424 switch (posixcc[4]) {
3426 if (memEQ(posixcc, "alph", 4)) {
3429 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3433 if (memEQ(posixcc, "spac", 4)) {
3436 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3440 if (memEQ(posixcc, "grap", 4)) {
3443 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3447 if (memEQ(posixcc, "asci", 4)) {
3450 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3454 if (memEQ(posixcc, "blan", 4)) {
3457 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3461 if (memEQ(posixcc, "cntr", 4)) {
3464 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3468 if (memEQ(posixcc, "alnu", 4)) {
3471 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3475 if (memEQ(posixcc, "lowe", 4)) {
3478 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3480 if (memEQ(posixcc, "uppe", 4)) {
3483 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3487 if (memEQ(posixcc, "digi", 4)) {
3490 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3492 if (memEQ(posixcc, "prin", 4)) {
3495 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3497 if (memEQ(posixcc, "punc", 4)) {
3500 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3506 if (memEQ(posixcc, "xdigit", 6)) {
3508 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3513 if (namedclass == OOB_NAMEDCLASS)
3515 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3518 assert (posixcc[skip] == ':');
3519 assert (posixcc[skip+1] == ']');
3520 } else if (!SIZE_ONLY) {
3521 /* [[=foo=]] and [[.foo.]] are still future. */
3523 /* adjust RExC_parse so the warning shows after
3525 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3527 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3530 /* Maternal grandfather:
3531 * "[:" ending in ":" but not in ":]" */
3541 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3543 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3544 const char *s = RExC_parse;
3545 const char c = *s++;
3547 while(*s && isALNUM(*s))
3549 if (*s && c == *s && s[1] == ']') {
3550 if (ckWARN(WARN_REGEXP))
3552 "POSIX syntax [%c %c] belongs inside character classes",
3555 /* [[=foo=]] and [[.foo.]] are still future. */
3556 if (POSIXCC_NOTYET(c)) {
3557 /* adjust RExC_parse so the error shows after
3559 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3561 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3568 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3571 register UV nextvalue;
3572 register IV prevvalue = OOB_UNICODE;
3573 register IV range = 0;
3574 register regnode *ret;
3577 char *rangebegin = 0;
3578 bool need_class = 0;
3579 SV *listsv = Nullsv;
3582 bool optimize_invert = TRUE;
3583 AV* unicode_alternate = 0;
3585 UV literal_endpoint = 0;
3588 ret = reganode(pRExC_state, ANYOF, 0);
3591 ANYOF_FLAGS(ret) = 0;
3593 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3597 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3601 RExC_size += ANYOF_SKIP;
3603 RExC_emit += ANYOF_SKIP;
3605 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3607 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3608 ANYOF_BITMAP_ZERO(ret);
3609 listsv = newSVpvn("# comment\n", 10);
3612 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3614 if (!SIZE_ONLY && POSIXCC(nextvalue))
3615 checkposixcc(pRExC_state);
3617 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3618 if (UCHARAT(RExC_parse) == ']')
3621 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3625 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3628 rangebegin = RExC_parse;
3630 value = utf8n_to_uvchr((U8*)RExC_parse,
3631 RExC_end - RExC_parse,
3633 RExC_parse += numlen;
3636 value = UCHARAT(RExC_parse++);
3637 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3638 if (value == '[' && POSIXCC(nextvalue))
3639 namedclass = regpposixcc(pRExC_state, value);
3640 else if (value == '\\') {
3642 value = utf8n_to_uvchr((U8*)RExC_parse,
3643 RExC_end - RExC_parse,
3645 RExC_parse += numlen;
3648 value = UCHARAT(RExC_parse++);
3649 /* Some compilers cannot handle switching on 64-bit integer
3650 * values, therefore value cannot be an UV. Yes, this will
3651 * be a problem later if we want switch on Unicode.
3652 * A similar issue a little bit later when switching on
3653 * namedclass. --jhi */
3654 switch ((I32)value) {
3655 case 'w': namedclass = ANYOF_ALNUM; break;
3656 case 'W': namedclass = ANYOF_NALNUM; break;
3657 case 's': namedclass = ANYOF_SPACE; break;
3658 case 'S': namedclass = ANYOF_NSPACE; break;
3659 case 'd': namedclass = ANYOF_DIGIT; break;
3660 case 'D': namedclass = ANYOF_NDIGIT; break;
3663 if (RExC_parse >= RExC_end)
3664 vFAIL2("Empty \\%c{}", (U8)value);
3665 if (*RExC_parse == '{') {
3666 const U8 c = (U8)value;
3667 e = strchr(RExC_parse++, '}');
3669 vFAIL2("Missing right brace on \\%c{}", c);
3670 while (isSPACE(UCHARAT(RExC_parse)))
3672 if (e == RExC_parse)
3673 vFAIL2("Empty \\%c{}", c);
3675 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3683 if (UCHARAT(RExC_parse) == '^') {
3686 value = value == 'p' ? 'P' : 'p'; /* toggle */
3687 while (isSPACE(UCHARAT(RExC_parse))) {
3693 Perl_sv_catpvf(aTHX_ listsv,
3694 "+utf8::%.*s\n", (int)n, RExC_parse);
3696 Perl_sv_catpvf(aTHX_ listsv,
3697 "!utf8::%.*s\n", (int)n, RExC_parse);
3700 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3701 namedclass = ANYOF_MAX; /* no official name, but it's named */
3703 case 'n': value = '\n'; break;
3704 case 'r': value = '\r'; break;
3705 case 't': value = '\t'; break;
3706 case 'f': value = '\f'; break;
3707 case 'b': value = '\b'; break;
3708 case 'e': value = ASCII_TO_NATIVE('\033');break;
3709 case 'a': value = ASCII_TO_NATIVE('\007');break;
3711 if (*RExC_parse == '{') {
3712 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3713 | PERL_SCAN_DISALLOW_PREFIX;
3714 e = strchr(RExC_parse++, '}');
3716 vFAIL("Missing right brace on \\x{}");
3718 numlen = e - RExC_parse;
3719 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3723 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3725 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3726 RExC_parse += numlen;
3730 value = UCHARAT(RExC_parse++);
3731 value = toCTRL(value);
3733 case '0': case '1': case '2': case '3': case '4':
3734 case '5': case '6': case '7': case '8': case '9':
3738 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3739 RExC_parse += numlen;
3743 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
3745 "Unrecognized escape \\%c in character class passed through",
3749 } /* end of \blah */
3755 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3757 if (!SIZE_ONLY && !need_class)
3758 ANYOF_CLASS_ZERO(ret);
3762 /* a bad range like a-\d, a-[:digit:] ? */
3765 if (ckWARN(WARN_REGEXP))
3767 "False [] range \"%*.*s\"",
3768 RExC_parse - rangebegin,
3769 RExC_parse - rangebegin,
3771 if (prevvalue < 256) {
3772 ANYOF_BITMAP_SET(ret, prevvalue);
3773 ANYOF_BITMAP_SET(ret, '-');
3776 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3777 Perl_sv_catpvf(aTHX_ listsv,
3778 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3782 range = 0; /* this was not a true range */
3786 const char *what = NULL;
3789 if (namedclass > OOB_NAMEDCLASS)
3790 optimize_invert = FALSE;
3791 /* Possible truncation here but in some 64-bit environments
3792 * the compiler gets heartburn about switch on 64-bit values.
3793 * A similar issue a little earlier when switching on value.
3795 switch ((I32)namedclass) {
3798 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3800 for (value = 0; value < 256; value++)
3802 ANYOF_BITMAP_SET(ret, value);
3809 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3811 for (value = 0; value < 256; value++)
3812 if (!isALNUM(value))
3813 ANYOF_BITMAP_SET(ret, value);
3820 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3822 for (value = 0; value < 256; value++)
3823 if (isALNUMC(value))
3824 ANYOF_BITMAP_SET(ret, value);
3831 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3833 for (value = 0; value < 256; value++)
3834 if (!isALNUMC(value))
3835 ANYOF_BITMAP_SET(ret, value);
3842 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3844 for (value = 0; value < 256; value++)
3846 ANYOF_BITMAP_SET(ret, value);
3853 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3855 for (value = 0; value < 256; value++)
3856 if (!isALPHA(value))
3857 ANYOF_BITMAP_SET(ret, value);
3864 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3867 for (value = 0; value < 128; value++)
3868 ANYOF_BITMAP_SET(ret, value);
3870 for (value = 0; value < 256; value++) {
3872 ANYOF_BITMAP_SET(ret, value);
3881 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3884 for (value = 128; value < 256; value++)
3885 ANYOF_BITMAP_SET(ret, value);
3887 for (value = 0; value < 256; value++) {
3888 if (!isASCII(value))
3889 ANYOF_BITMAP_SET(ret, value);
3898 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3900 for (value = 0; value < 256; value++)
3902 ANYOF_BITMAP_SET(ret, value);
3909 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3911 for (value = 0; value < 256; value++)
3912 if (!isBLANK(value))
3913 ANYOF_BITMAP_SET(ret, value);
3920 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3922 for (value = 0; value < 256; value++)
3924 ANYOF_BITMAP_SET(ret, value);
3931 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3933 for (value = 0; value < 256; value++)
3934 if (!isCNTRL(value))
3935 ANYOF_BITMAP_SET(ret, value);
3942 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3944 /* consecutive digits assumed */
3945 for (value = '0'; value <= '9'; value++)
3946 ANYOF_BITMAP_SET(ret, value);
3953 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3955 /* consecutive digits assumed */
3956 for (value = 0; value < '0'; value++)
3957 ANYOF_BITMAP_SET(ret, value);
3958 for (value = '9' + 1; value < 256; value++)
3959 ANYOF_BITMAP_SET(ret, value);
3966 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3968 for (value = 0; value < 256; value++)
3970 ANYOF_BITMAP_SET(ret, value);
3977 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3979 for (value = 0; value < 256; value++)
3980 if (!isGRAPH(value))
3981 ANYOF_BITMAP_SET(ret, value);
3988 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3990 for (value = 0; value < 256; value++)
3992 ANYOF_BITMAP_SET(ret, value);
3999 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
4001 for (value = 0; value < 256; value++)
4002 if (!isLOWER(value))
4003 ANYOF_BITMAP_SET(ret, value);
4010 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
4012 for (value = 0; value < 256; value++)
4014 ANYOF_BITMAP_SET(ret, value);
4021 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4023 for (value = 0; value < 256; value++)
4024 if (!isPRINT(value))
4025 ANYOF_BITMAP_SET(ret, value);
4032 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4034 for (value = 0; value < 256; value++)
4035 if (isPSXSPC(value))
4036 ANYOF_BITMAP_SET(ret, value);
4043 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4045 for (value = 0; value < 256; value++)
4046 if (!isPSXSPC(value))
4047 ANYOF_BITMAP_SET(ret, value);
4054 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4056 for (value = 0; value < 256; value++)
4058 ANYOF_BITMAP_SET(ret, value);
4065 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4067 for (value = 0; value < 256; value++)
4068 if (!isPUNCT(value))
4069 ANYOF_BITMAP_SET(ret, value);
4076 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4078 for (value = 0; value < 256; value++)
4080 ANYOF_BITMAP_SET(ret, value);
4087 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4089 for (value = 0; value < 256; value++)
4090 if (!isSPACE(value))
4091 ANYOF_BITMAP_SET(ret, value);
4098 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4100 for (value = 0; value < 256; value++)
4102 ANYOF_BITMAP_SET(ret, value);
4109 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4111 for (value = 0; value < 256; value++)
4112 if (!isUPPER(value))
4113 ANYOF_BITMAP_SET(ret, value);
4120 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4122 for (value = 0; value < 256; value++)
4123 if (isXDIGIT(value))
4124 ANYOF_BITMAP_SET(ret, value);
4131 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4133 for (value = 0; value < 256; value++)
4134 if (!isXDIGIT(value))
4135 ANYOF_BITMAP_SET(ret, value);
4141 /* this is to handle \p and \P */
4144 vFAIL("Invalid [::] class");
4148 /* Strings such as "+utf8::isWord\n" */
4149 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4152 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4155 } /* end of namedclass \blah */
4158 if (prevvalue > (IV)value) /* b-a */ {
4159 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4160 RExC_parse - rangebegin,
4161 RExC_parse - rangebegin,
4163 range = 0; /* not a valid range */
4167 prevvalue = value; /* save the beginning of the range */
4168 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4169 RExC_parse[1] != ']') {
4172 /* a bad range like \w-, [:word:]- ? */
4173 if (namedclass > OOB_NAMEDCLASS) {
4174 if (ckWARN(WARN_REGEXP))
4176 "False [] range \"%*.*s\"",
4177 RExC_parse - rangebegin,
4178 RExC_parse - rangebegin,
4181 ANYOF_BITMAP_SET(ret, '-');
4183 range = 1; /* yeah, it's a range! */
4184 continue; /* but do it the next time */
4188 /* now is the next time */
4192 if (prevvalue < 256) {
4193 const IV ceilvalue = value < 256 ? value : 255;
4196 /* In EBCDIC [\x89-\x91] should include
4197 * the \x8e but [i-j] should not. */
4198 if (literal_endpoint == 2 &&
4199 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4200 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4202 if (isLOWER(prevvalue)) {
4203 for (i = prevvalue; i <= ceilvalue; i++)
4205 ANYOF_BITMAP_SET(ret, i);
4207 for (i = prevvalue; i <= ceilvalue; i++)
4209 ANYOF_BITMAP_SET(ret, i);
4214 for (i = prevvalue; i <= ceilvalue; i++)
4215 ANYOF_BITMAP_SET(ret, i);
4217 if (value > 255 || UTF) {
4218 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4219 const UV natvalue = NATIVE_TO_UNI(value);
4221 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4222 if (prevnatvalue < natvalue) { /* what about > ? */
4223 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4224 prevnatvalue, natvalue);
4226 else if (prevnatvalue == natvalue) {
4227 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4229 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4231 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4233 /* If folding and foldable and a single
4234 * character, insert also the folded version
4235 * to the charclass. */
4237 if (foldlen == (STRLEN)UNISKIP(f))
4238 Perl_sv_catpvf(aTHX_ listsv,
4241 /* Any multicharacter foldings
4242 * require the following transform:
4243 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4244 * where E folds into "pq" and F folds
4245 * into "rst", all other characters
4246 * fold to single characters. We save
4247 * away these multicharacter foldings,
4248 * to be later saved as part of the
4249 * additional "s" data. */
4252 if (!unicode_alternate)
4253 unicode_alternate = newAV();
4254 sv = newSVpvn((char*)foldbuf, foldlen);
4256 av_push(unicode_alternate, sv);
4260 /* If folding and the value is one of the Greek
4261 * sigmas insert a few more sigmas to make the
4262 * folding rules of the sigmas to work right.
4263 * Note that not all the possible combinations
4264 * are handled here: some of them are handled
4265 * by the standard folding rules, and some of
4266 * them (literal or EXACTF cases) are handled
4267 * during runtime in regexec.c:S_find_byclass(). */
4268 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4269 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4270 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4271 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4272 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4274 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4275 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4276 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4281 literal_endpoint = 0;
4285 range = 0; /* this range (if it was one) is done now */
4289 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4291 RExC_size += ANYOF_CLASS_ADD_SKIP;
4293 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4296 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4298 /* If the only flag is folding (plus possibly inversion). */
4299 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4301 for (value = 0; value < 256; ++value) {
4302 if (ANYOF_BITMAP_TEST(ret, value)) {
4303 UV fold = PL_fold[value];
4306 ANYOF_BITMAP_SET(ret, fold);
4309 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4312 /* optimize inverted simple patterns (e.g. [^a-z]) */
4313 if (!SIZE_ONLY && optimize_invert &&
4314 /* If the only flag is inversion. */
4315 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4316 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4317 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4318 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4325 /* The 0th element stores the character class description
4326 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4327 * to initialize the appropriate swash (which gets stored in
4328 * the 1st element), and also useful for dumping the regnode.
4329 * The 2nd element stores the multicharacter foldings,
4330 * used later (regexec.c:S_reginclass()). */
4331 av_store(av, 0, listsv);
4332 av_store(av, 1, NULL);
4333 av_store(av, 2, (SV*)unicode_alternate);
4334 rv = newRV_noinc((SV*)av);
4335 n = add_data(pRExC_state, 1, "s");
4336 RExC_rx->data->data[n] = (void*)rv;
4344 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4346 char* retval = RExC_parse++;
4349 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4350 RExC_parse[2] == '#') {
4351 while (*RExC_parse != ')') {
4352 if (RExC_parse == RExC_end)
4353 FAIL("Sequence (?#... not terminated");
4359 if (RExC_flags & PMf_EXTENDED) {
4360 if (isSPACE(*RExC_parse)) {
4364 else if (*RExC_parse == '#') {
4365 while (RExC_parse < RExC_end)
4366 if (*RExC_parse++ == '\n') break;
4375 - reg_node - emit a node
4377 STATIC regnode * /* Location. */
4378 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4380 register regnode *ptr;
4381 regnode * const ret = RExC_emit;
4384 SIZE_ALIGN(RExC_size);
4389 NODE_ALIGN_FILL(ret);
4391 FILL_ADVANCE_NODE(ptr, op);
4392 if (RExC_offsets) { /* MJD */
4393 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4394 "reg_node", __LINE__,
4396 RExC_emit - RExC_emit_start > RExC_offsets[0]
4397 ? "Overwriting end of array!\n" : "OK",
4398 RExC_emit - RExC_emit_start,
4399 RExC_parse - RExC_start,
4401 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4410 - reganode - emit a node with an argument
4412 STATIC regnode * /* Location. */
4413 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4415 register regnode *ptr;
4416 regnode * const ret = RExC_emit;
4419 SIZE_ALIGN(RExC_size);
4424 NODE_ALIGN_FILL(ret);
4426 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4427 if (RExC_offsets) { /* MJD */
4428 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4432 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4433 "Overwriting end of array!\n" : "OK",
4434 RExC_emit - RExC_emit_start,
4435 RExC_parse - RExC_start,
4437 Set_Cur_Node_Offset;
4446 - reguni - emit (if appropriate) a Unicode character
4449 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4451 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4455 - reginsert - insert an operator in front of already-emitted operand
4457 * Means relocating the operand.
4460 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4462 register regnode *src;
4463 register regnode *dst;
4464 register regnode *place;
4465 const int offset = regarglen[(U8)op];
4467 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4470 RExC_size += NODE_STEP_REGNODE + offset;
4475 RExC_emit += NODE_STEP_REGNODE + offset;
4477 while (src > opnd) {
4478 StructCopy(--src, --dst, regnode);
4479 if (RExC_offsets) { /* MJD 20010112 */
4480 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4484 dst - RExC_emit_start > RExC_offsets[0]
4485 ? "Overwriting end of array!\n" : "OK",
4486 src - RExC_emit_start,
4487 dst - RExC_emit_start,
4489 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4490 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4495 place = opnd; /* Op node, where operand used to be. */
4496 if (RExC_offsets) { /* MJD */
4497 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4501 place - RExC_emit_start > RExC_offsets[0]
4502 ? "Overwriting end of array!\n" : "OK",
4503 place - RExC_emit_start,
4504 RExC_parse - RExC_start,
4506 Set_Node_Offset(place, RExC_parse);
4507 Set_Node_Length(place, 1);
4509 src = NEXTOPER(place);
4510 FILL_ADVANCE_NODE(place, op);
4511 Zero(src, offset, regnode);
4515 - regtail - set the next-pointer at the end of a node chain of p to val.
4518 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4520 register regnode *scan;
4525 /* Find last node. */
4528 regnode * const temp = regnext(scan);
4534 if (reg_off_by_arg[OP(scan)]) {
4535 ARG_SET(scan, val - scan);
4538 NEXT_OFF(scan) = val - scan;
4543 - regoptail - regtail on operand of first argument; nop if operandless
4546 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4548 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4549 if (p == NULL || SIZE_ONLY)
4551 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4552 regtail(pRExC_state, NEXTOPER(p), val);
4554 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4555 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4562 - regcurly - a little FSA that accepts {\d+,?\d*}
4565 S_regcurly(pTHX_ register const char *s)
4583 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4586 Perl_regdump(pTHX_ regexp *r)
4589 SV *sv = sv_newmortal();
4591 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4593 /* Header fields of interest. */
4594 if (r->anchored_substr)
4595 PerlIO_printf(Perl_debug_log,
4596 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4598 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4599 SvPVX_const(r->anchored_substr),
4601 SvTAIL(r->anchored_substr) ? "$" : "",
4602 (IV)r->anchored_offset);
4603 else if (r->anchored_utf8)
4604 PerlIO_printf(Perl_debug_log,
4605 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4607 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4608 SvPVX_const(r->anchored_utf8),
4610 SvTAIL(r->anchored_utf8) ? "$" : "",
4611 (IV)r->anchored_offset);
4612 if (r->float_substr)
4613 PerlIO_printf(Perl_debug_log,
4614 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4616 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4617 SvPVX_const(r->float_substr),
4619 SvTAIL(r->float_substr) ? "$" : "",
4620 (IV)r->float_min_offset, (UV)r->float_max_offset);
4621 else if (r->float_utf8)
4622 PerlIO_printf(Perl_debug_log,
4623 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4625 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4626 SvPVX_const(r->float_utf8),
4628 SvTAIL(r->float_utf8) ? "$" : "",
4629 (IV)r->float_min_offset, (UV)r->float_max_offset);
4630 if (r->check_substr || r->check_utf8)
4631 PerlIO_printf(Perl_debug_log,
4632 r->check_substr == r->float_substr
4633 && r->check_utf8 == r->float_utf8
4634 ? "(checking floating" : "(checking anchored");
4635 if (r->reganch & ROPT_NOSCAN)
4636 PerlIO_printf(Perl_debug_log, " noscan");
4637 if (r->reganch & ROPT_CHECK_ALL)
4638 PerlIO_printf(Perl_debug_log, " isall");
4639 if (r->check_substr || r->check_utf8)
4640 PerlIO_printf(Perl_debug_log, ") ");
4642 if (r->regstclass) {
4643 regprop(sv, r->regstclass);
4644 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4646 if (r->reganch & ROPT_ANCH) {
4647 PerlIO_printf(Perl_debug_log, "anchored");
4648 if (r->reganch & ROPT_ANCH_BOL)
4649 PerlIO_printf(Perl_debug_log, "(BOL)");
4650 if (r->reganch & ROPT_ANCH_MBOL)
4651 PerlIO_printf(Perl_debug_log, "(MBOL)");
4652 if (r->reganch & ROPT_ANCH_SBOL)
4653 PerlIO_printf(Perl_debug_log, "(SBOL)");
4654 if (r->reganch & ROPT_ANCH_GPOS)
4655 PerlIO_printf(Perl_debug_log, "(GPOS)");
4656 PerlIO_putc(Perl_debug_log, ' ');
4658 if (r->reganch & ROPT_GPOS_SEEN)
4659 PerlIO_printf(Perl_debug_log, "GPOS ");
4660 if (r->reganch & ROPT_SKIP)
4661 PerlIO_printf(Perl_debug_log, "plus ");
4662 if (r->reganch & ROPT_IMPLICIT)
4663 PerlIO_printf(Perl_debug_log, "implicit ");
4664 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4665 if (r->reganch & ROPT_EVAL_SEEN)
4666 PerlIO_printf(Perl_debug_log, "with eval ");
4667 PerlIO_printf(Perl_debug_log, "\n");
4670 const U32 len = r->offsets[0];
4671 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4672 for (i = 1; i <= len; i++)
4673 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4674 (UV)r->offsets[i*2-1],
4675 (UV)r->offsets[i*2]);
4676 PerlIO_printf(Perl_debug_log, "\n");
4678 #endif /* DEBUGGING */
4682 - regprop - printable representation of opcode
4685 Perl_regprop(pTHX_ SV *sv, regnode *o)
4690 sv_setpvn(sv, "", 0);
4691 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4692 /* It would be nice to FAIL() here, but this may be called from
4693 regexec.c, and it would be hard to supply pRExC_state. */
4694 Perl_croak(aTHX_ "Corrupted regexp opcode");
4695 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4697 k = PL_regkind[(U8)OP(o)];
4700 SV * const dsv = sv_2mortal(newSVpvn("", 0));
4701 /* Using is_utf8_string() is a crude hack but it may
4702 * be the best for now since we have no flag "this EXACTish
4703 * node was UTF-8" --jhi */
4704 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4705 const char * const s = do_utf8 ?
4706 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4707 UNI_DISPLAY_REGEX) :
4709 const int len = do_utf8 ?
4712 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4717 else if (k == CURLY) {
4718 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4719 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4720 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4722 else if (k == WHILEM && o->flags) /* Ordinal/of */
4723 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4724 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4725 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4726 else if (k == LOGICAL)
4727 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4728 else if (k == ANYOF) {
4729 int i, rangestart = -1;
4730 const U8 flags = ANYOF_FLAGS(o);
4731 const char * const anyofs[] = { /* Should be synchronized with
4732 * ANYOF_ #xdefines in regcomp.h */
4765 if (flags & ANYOF_LOCALE)
4766 sv_catpv(sv, "{loc}");
4767 if (flags & ANYOF_FOLD)
4768 sv_catpv(sv, "{i}");
4769 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4770 if (flags & ANYOF_INVERT)
4772 for (i = 0; i <= 256; i++) {
4773 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4774 if (rangestart == -1)
4776 } else if (rangestart != -1) {
4777 if (i <= rangestart + 3)
4778 for (; rangestart < i; rangestart++)
4779 put_byte(sv, rangestart);
4781 put_byte(sv, rangestart);
4783 put_byte(sv, i - 1);
4789 if (o->flags & ANYOF_CLASS)
4790 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4791 if (ANYOF_CLASS_TEST(o,i))
4792 sv_catpv(sv, anyofs[i]);
4794 if (flags & ANYOF_UNICODE)
4795 sv_catpv(sv, "{unicode}");
4796 else if (flags & ANYOF_UNICODE_ALL)
4797 sv_catpv(sv, "{unicode_all}");
4801 SV * const sw = regclass_swash(o, FALSE, &lv, 0);
4805 U8 s[UTF8_MAXBYTES_CASE+1];
4807 for (i = 0; i <= 256; i++) { /* just the first 256 */
4808 uvchr_to_utf8(s, i);
4810 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4811 if (rangestart == -1)
4813 } else if (rangestart != -1) {
4814 if (i <= rangestart + 3)
4815 for (; rangestart < i; rangestart++) {
4816 const U8 * const e = uvchr_to_utf8(s,rangestart);
4818 for(p = s; p < e; p++)
4822 const U8 *e = uvchr_to_utf8(s,rangestart);
4824 for (p = s; p < e; p++)
4826 sv_catpvn(sv, "-", 1);
4827 e = uvchr_to_utf8(s, i-1);
4828 for (p = s; p < e; p++)
4835 sv_catpv(sv, "..."); /* et cetera */
4839 char *s = savesvpv(lv);
4842 while(*s && *s != '\n') s++;
4845 const char * const t = ++s;
4863 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4865 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4866 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4867 #endif /* DEBUGGING */
4871 Perl_re_intuit_string(pTHX_ regexp *prog)
4872 { /* Assume that RE_INTUIT is set */
4875 const char * const s = SvPV_nolen_const(prog->check_substr
4876 ? prog->check_substr : prog->check_utf8);
4878 if (!PL_colorset) reginitcolors();
4879 PerlIO_printf(Perl_debug_log,
4880 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4882 prog->check_substr ? "" : "utf8 ",
4883 PL_colors[5],PL_colors[0],
4886 (strlen(s) > 60 ? "..." : ""));
4889 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4893 Perl_pregfree(pTHX_ struct regexp *r)
4896 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4899 if (!r || (--r->refcnt > 0))
4902 const char *s = (r->reganch & ROPT_UTF8)
4903 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4904 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4905 const int len = SvCUR(dsv);
4908 PerlIO_printf(Perl_debug_log,
4909 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4910 PL_colors[4],PL_colors[5],PL_colors[0],
4913 len > 60 ? "..." : "");
4916 /* gcov results gave these as non-null 100% of the time, so there's no
4917 optimisation in checking them before calling Safefree */
4918 Safefree(r->precomp);
4919 Safefree(r->offsets); /* 20010421 MJD */
4920 if (RX_MATCH_COPIED(r))
4921 Safefree(r->subbeg);
4923 if (r->anchored_substr)
4924 SvREFCNT_dec(r->anchored_substr);
4925 if (r->anchored_utf8)
4926 SvREFCNT_dec(r->anchored_utf8);
4927 if (r->float_substr)
4928 SvREFCNT_dec(r->float_substr);
4930 SvREFCNT_dec(r->float_utf8);
4931 Safefree(r->substrs);
4934 int n = r->data->count;
4935 PAD* new_comppad = NULL;
4940 /* If you add a ->what type here, update the comment in regcomp.h */
4941 switch (r->data->what[n]) {
4943 SvREFCNT_dec((SV*)r->data->data[n]);
4946 Safefree(r->data->data[n]);
4949 new_comppad = (AV*)r->data->data[n];
4952 if (new_comppad == NULL)
4953 Perl_croak(aTHX_ "panic: pregfree comppad");
4954 PAD_SAVE_LOCAL(old_comppad,
4955 /* Watch out for global destruction's random ordering. */
4956 (SvTYPE(new_comppad) == SVt_PVAV) ?
4957 new_comppad : Null(PAD *)
4960 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4963 op_free((OP_4tree*)r->data->data[n]);
4965 PAD_RESTORE_LOCAL(old_comppad);
4966 SvREFCNT_dec((SV*)new_comppad);
4972 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4975 Safefree(r->data->what);
4978 Safefree(r->startp);
4984 - regnext - dig the "next" pointer out of a node
4986 * [Note, when REGALIGN is defined there are two places in regmatch()
4987 * that bypass this code for speed.]
4990 Perl_regnext(pTHX_ register regnode *p)
4992 register I32 offset;
4994 if (p == &PL_regdummy)
4997 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5005 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5008 STRLEN l1 = strlen(pat1);
5009 STRLEN l2 = strlen(pat2);
5012 const char *message;
5018 Copy(pat1, buf, l1 , char);
5019 Copy(pat2, buf + l1, l2 , char);
5020 buf[l1 + l2] = '\n';
5021 buf[l1 + l2 + 1] = '\0';
5023 /* ANSI variant takes additional second argument */
5024 va_start(args, pat2);
5028 msv = vmess(buf, &args);
5030 message = SvPV_const(msv,l1);
5033 Copy(message, buf, l1 , char);
5034 buf[l1-1] = '\0'; /* Overwrite \n */
5035 Perl_croak(aTHX_ "%s", buf);
5038 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5041 Perl_save_re_context(pTHX)
5044 struct re_save_state *state;
5046 SAVEVPTR(PL_curcop);
5047 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
5049 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
5050 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
5051 SSPUSHINT(SAVEt_RE_STATE);
5053 state->re_state_reg_flags = PL_reg_flags;
5054 state->re_state_bostr = PL_bostr;
5055 state->re_state_reginput = PL_reginput;
5056 state->re_state_regbol = PL_regbol;
5057 state->re_state_regeol = PL_regeol;
5058 state->re_state_regstartp = PL_regstartp;
5059 state->re_state_regendp = PL_regendp;
5060 state->re_state_reglastparen = PL_reglastparen;
5061 state->re_state_reglastcloseparen = PL_reglastcloseparen;
5062 state->re_state_regtill = PL_regtill;
5063 state->re_state_reg_start_tmp = PL_reg_start_tmp;
5064 state->re_state_reg_start_tmpl = PL_reg_start_tmpl;
5065 state->re_state_reg_eval_set = PL_reg_eval_set;
5066 state->re_state_regnarrate = PL_regnarrate;
5067 state->re_state_regindent = PL_regindent;
5068 state->re_state_reg_call_cc = PL_reg_call_cc;
5069 state->re_state_reg_re = PL_reg_re;
5070 state->re_state_reg_ganch = PL_reg_ganch;
5071 state->re_state_reg_sv = PL_reg_sv;
5072 state->re_state_reg_match_utf8 = PL_reg_match_utf8;
5073 state->re_state_reg_magic = PL_reg_magic;
5074 state->re_state_reg_oldpos = PL_reg_oldpos;
5075 state->re_state_reg_oldcurpm = PL_reg_oldcurpm;
5076 state->re_state_reg_curpm = PL_reg_curpm;
5077 state->re_state_reg_oldsaved = PL_reg_oldsaved;
5078 state->re_state_reg_oldsavedlen = PL_reg_oldsavedlen;
5079 state->re_state_reg_maxiter = PL_reg_maxiter;
5080 state->re_state_reg_leftiter = PL_reg_leftiter;
5081 state->re_state_reg_poscache = PL_reg_poscache;
5082 state->re_state_reg_poscache_size = PL_reg_poscache_size;
5083 state->re_state_regsize = PL_regsize;
5084 state->re_state_reg_starttry = PL_reg_starttry;
5086 /* These variables have been eliminated from 5.10: */
5087 state->re_state_regdata = PL_regdata;
5088 state->re_state_regprogram = PL_regprogram;
5089 state->re_state_regcc = PL_regcc;
5090 state->re_state_regprecomp = PL_regprecomp;
5091 state->re_state_regnpar = PL_regnpar;
5093 PL_reg_start_tmp = 0;
5094 PL_reg_start_tmpl = 0;
5095 PL_reg_oldsaved = Nullch;
5096 PL_reg_oldsavedlen = 0;
5098 PL_reg_leftiter = 0;
5099 PL_reg_poscache = Nullch;
5100 PL_reg_poscache_size = 0;
5103 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5106 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5108 for (i = 1; i <= rx->nparens; i++) {
5110 char digits[TYPE_CHARS(long)];
5112 const STRLEN len = snprintf(digits, sizeof(digits), "%lu", (long)i);
5114 const STRLEN len = my_sprintf(digits, "%lu", (long)i);
5115 #endif /* #ifdef USE_SNPRINTF */
5116 if ((mgv = gv_fetchpvn_flags(digits, len, 0, SVt_PV)))
5124 clear_re(pTHX_ void *r)
5126 ReREFCNT_dec((regexp *)r);
5132 S_put_byte(pTHX_ SV *sv, int c)
5134 if (isCNTRL(c) || c == 255 || !isPRINT(c))
5135 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
5136 else if (c == '-' || c == ']' || c == '\\' || c == '^')
5137 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
5139 Perl_sv_catpvf(aTHX_ sv, "%c", c);
5144 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
5146 register U8 op = EXACT; /* Arbitrary non-END op. */
5147 register regnode *next;
5149 while (op != END && (!last || node < last)) {
5150 /* While that wasn't END last time... */
5156 next = regnext(node);
5158 if (OP(node) == OPTIMIZED)
5161 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
5162 (int)(2*l + 1), "", SvPVX_const(sv));
5163 if (next == NULL) /* Next ptr. */
5164 PerlIO_printf(Perl_debug_log, "(0)");
5166 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
5167 (void)PerlIO_putc(Perl_debug_log, '\n');
5169 if (PL_regkind[(U8)op] == BRANCHJ) {
5170 register regnode *nnode = (OP(next) == LONGJMP
5173 if (last && nnode > last)
5175 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
5177 else if (PL_regkind[(U8)op] == BRANCH) {
5178 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
5180 else if ( op == CURLY) { /* "next" might be very big: optimizer */
5181 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5182 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
5184 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
5185 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
5188 else if ( op == PLUS || op == STAR) {
5189 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
5191 else if (op == ANYOF) {
5192 /* arglen 1 + class block */
5193 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
5194 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
5195 node = NEXTOPER(node);
5197 else if (PL_regkind[(U8)op] == EXACT) {
5198 /* Literal string, where present. */
5199 node += NODE_SZ_STR(node) - 1;
5200 node = NEXTOPER(node);
5203 node = NEXTOPER(node);
5204 node += regarglen[(U8)op];
5206 if (op == CURLYX || op == OPEN)
5208 else if (op == WHILEM)
5214 #endif /* DEBUGGING */
5218 * c-indentation-style: bsd
5220 * indent-tabs-mode: t
5223 * ex: set ts=8 sts=4 sw=4 noet: