5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (c) 1991-2003, Larry Wall
74 **** You may distribute under the terms of either the GNU General Public
75 **** License or the Artistic License, as specified in the README file.
78 * Beware that some of this code is subtly aware of the way operator
79 * precedence is structured in regular expressions. Serious changes in
80 * regular-expression syntax might require a total rethink.
83 #define PERL_IN_REGCOMP_C
86 #ifndef PERL_IN_XSUB_RE
98 # if defined(BUGGY_MSC6)
99 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
100 # pragma optimize("a",off)
101 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
102 # pragma optimize("w",on )
103 # endif /* BUGGY_MSC6 */
107 #define STATIC static
110 typedef struct RExC_state_t {
111 U32 flags; /* are we folding, multilining? */
112 char *precomp; /* uncompiled string. */
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* () count. */
130 char *starttry; /* -Dr: where regtry was called. */
131 #define RExC_starttry (pRExC_state->starttry)
135 #define RExC_flags (pRExC_state->flags)
136 #define RExC_precomp (pRExC_state->precomp)
137 #define RExC_rx (pRExC_state->rx)
138 #define RExC_start (pRExC_state->start)
139 #define RExC_end (pRExC_state->end)
140 #define RExC_parse (pRExC_state->parse)
141 #define RExC_whilem_seen (pRExC_state->whilem_seen)
142 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
143 #define RExC_emit (pRExC_state->emit)
144 #define RExC_emit_start (pRExC_state->emit_start)
145 #define RExC_naughty (pRExC_state->naughty)
146 #define RExC_sawback (pRExC_state->sawback)
147 #define RExC_seen (pRExC_state->seen)
148 #define RExC_size (pRExC_state->size)
149 #define RExC_npar (pRExC_state->npar)
150 #define RExC_extralen (pRExC_state->extralen)
151 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
152 #define RExC_seen_evals (pRExC_state->seen_evals)
153 #define RExC_utf8 (pRExC_state->utf8)
155 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
156 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
157 ((*s) == '{' && regcurly(s)))
160 #undef SPSTART /* dratted cpp namespace... */
163 * Flags to be passed up and down.
165 #define WORST 0 /* Worst case. */
166 #define HASWIDTH 0x1 /* Known to match non-null strings. */
167 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
168 #define SPSTART 0x4 /* Starts with * or +. */
169 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
171 /* Length of a variant. */
173 typedef struct scan_data_t {
179 I32 last_end; /* min value, <0 unless valid. */
182 SV **longest; /* Either &l_fixed, or &l_float. */
186 I32 offset_float_min;
187 I32 offset_float_max;
191 struct regnode_charclass_class *start_class;
195 * Forward declarations for pregcomp()'s friends.
198 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
202 #define SF_BEFORE_SEOL 0x1
203 #define SF_BEFORE_MEOL 0x2
204 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
205 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
208 # define SF_FIX_SHIFT_EOL (0+2)
209 # define SF_FL_SHIFT_EOL (0+4)
211 # define SF_FIX_SHIFT_EOL (+2)
212 # define SF_FL_SHIFT_EOL (+4)
215 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
216 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
218 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
219 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
220 #define SF_IS_INF 0x40
221 #define SF_HAS_PAR 0x80
222 #define SF_IN_PAR 0x100
223 #define SF_HAS_EVAL 0x200
224 #define SCF_DO_SUBSTR 0x400
225 #define SCF_DO_STCLASS_AND 0x0800
226 #define SCF_DO_STCLASS_OR 0x1000
227 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
228 #define SCF_WHILEM_VISITED_POS 0x2000
230 #define UTF (RExC_utf8 != 0)
231 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
232 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
234 #define OOB_UNICODE 12345678
235 #define OOB_NAMEDCLASS -1
237 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
238 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
241 /* length of regex to show in messages that don't mark a position within */
242 #define RegexLengthToShowInErrorMessages 127
245 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
246 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
247 * op/pragma/warn/regcomp.
249 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
250 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
252 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
255 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
256 * arg. Show regex, up to a maximum length. If it's too long, chop and add
259 #define FAIL(msg) STMT_START { \
260 char *ellipses = ""; \
261 IV len = RExC_end - RExC_precomp; \
264 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
265 if (len > RegexLengthToShowInErrorMessages) { \
266 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
267 len = RegexLengthToShowInErrorMessages - 10; \
270 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
271 msg, (int)len, RExC_precomp, ellipses); \
275 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
276 * args. Show regex, up to a maximum length. If it's too long, chop and add
279 #define FAIL2(pat,msg) STMT_START { \
280 char *ellipses = ""; \
281 IV len = RExC_end - RExC_precomp; \
284 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
285 if (len > RegexLengthToShowInErrorMessages) { \
286 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
287 len = RegexLengthToShowInErrorMessages - 10; \
290 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
291 msg, (int)len, RExC_precomp, ellipses); \
296 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
298 #define Simple_vFAIL(m) STMT_START { \
299 IV offset = RExC_parse - RExC_precomp; \
300 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
301 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
305 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
307 #define vFAIL(m) STMT_START { \
309 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
314 * Like Simple_vFAIL(), but accepts two arguments.
316 #define Simple_vFAIL2(m,a1) STMT_START { \
317 IV offset = RExC_parse - RExC_precomp; \
318 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
319 (int)offset, RExC_precomp, RExC_precomp + offset); \
323 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
325 #define vFAIL2(m,a1) STMT_START { \
327 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
328 Simple_vFAIL2(m, a1); \
333 * Like Simple_vFAIL(), but accepts three arguments.
335 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
336 IV offset = RExC_parse - RExC_precomp; \
337 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
338 (int)offset, RExC_precomp, RExC_precomp + offset); \
342 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
344 #define vFAIL3(m,a1,a2) STMT_START { \
346 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
347 Simple_vFAIL3(m, a1, a2); \
351 * Like Simple_vFAIL(), but accepts four arguments.
353 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
354 IV offset = RExC_parse - RExC_precomp; \
355 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
356 (int)offset, RExC_precomp, RExC_precomp + offset); \
360 * Like Simple_vFAIL(), but accepts five arguments.
362 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
363 IV offset = RExC_parse - RExC_precomp; \
364 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
365 (int)offset, RExC_precomp, RExC_precomp + offset); \
369 #define vWARN(loc,m) STMT_START { \
370 IV offset = loc - RExC_precomp; \
371 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
372 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARNdep(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
378 "%s" REPORT_LOCATION, \
379 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
383 #define vWARN2(loc, m, a1) STMT_START { \
384 IV offset = loc - RExC_precomp; \
385 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
386 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN3(loc, m, a1, a2) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
408 /* Allow for side effects in s */
409 #define REGC(c,s) STMT_START { \
410 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
413 /* Macros for recording node offsets. 20001227 mjd@plover.com
414 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
415 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
416 * Element 0 holds the number n.
419 #define MJD_OFFSET_DEBUG(x)
420 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
423 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
425 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
426 __LINE__, (node), (byte))); \
428 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
430 RExC_offsets[2*(node)-1] = (byte); \
435 #define Set_Node_Offset(node,byte) \
436 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
437 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
439 #define Set_Node_Length_To_R(node,len) STMT_START { \
441 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
442 __LINE__, (node), (len))); \
444 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
446 RExC_offsets[2*(node)] = (len); \
451 #define Set_Node_Length(node,len) \
452 Set_Node_Length_To_R((node)-RExC_emit_start, len)
453 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
454 #define Set_Node_Cur_Length(node) \
455 Set_Node_Length(node, RExC_parse - parse_start)
457 /* Get offsets and lengths */
458 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
459 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
461 static void clear_re(pTHX_ void *r);
463 /* Mark that we cannot extend a found fixed substring at this point.
464 Updata the longest found anchored substring and the longest found
465 floating substrings if needed. */
468 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
470 STRLEN l = CHR_SVLEN(data->last_found);
471 STRLEN old_l = CHR_SVLEN(*data->longest);
473 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
474 sv_setsv(*data->longest, data->last_found);
475 if (*data->longest == data->longest_fixed) {
476 data->offset_fixed = l ? data->last_start_min : data->pos_min;
477 if (data->flags & SF_BEFORE_EOL)
479 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
481 data->flags &= ~SF_FIX_BEFORE_EOL;
484 data->offset_float_min = l ? data->last_start_min : data->pos_min;
485 data->offset_float_max = (l
486 ? data->last_start_max
487 : data->pos_min + data->pos_delta);
488 if ((U32)data->offset_float_max > (U32)I32_MAX)
489 data->offset_float_max = I32_MAX;
490 if (data->flags & SF_BEFORE_EOL)
492 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
494 data->flags &= ~SF_FL_BEFORE_EOL;
497 SvCUR_set(data->last_found, 0);
499 SV * sv = data->last_found;
501 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
502 if (mg && mg->mg_len > 0)
506 data->flags &= ~SF_BEFORE_EOL;
509 /* Can match anything (initialization) */
511 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
513 ANYOF_CLASS_ZERO(cl);
514 ANYOF_BITMAP_SETALL(cl);
515 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
517 cl->flags |= ANYOF_LOCALE;
520 /* Can match anything (initialization) */
522 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
526 for (value = 0; value <= ANYOF_MAX; value += 2)
527 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
529 if (!(cl->flags & ANYOF_UNICODE_ALL))
531 if (!ANYOF_BITMAP_TESTALLSET(cl))
536 /* Can match anything (initialization) */
538 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
540 Zero(cl, 1, struct regnode_charclass_class);
542 cl_anything(pRExC_state, cl);
546 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
548 Zero(cl, 1, struct regnode_charclass_class);
550 cl_anything(pRExC_state, cl);
552 cl->flags |= ANYOF_LOCALE;
555 /* 'And' a given class with another one. Can create false positives */
556 /* We assume that cl is not inverted */
558 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
559 struct regnode_charclass_class *and_with)
561 if (!(and_with->flags & ANYOF_CLASS)
562 && !(cl->flags & ANYOF_CLASS)
563 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
564 && !(and_with->flags & ANYOF_FOLD)
565 && !(cl->flags & ANYOF_FOLD)) {
568 if (and_with->flags & ANYOF_INVERT)
569 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
570 cl->bitmap[i] &= ~and_with->bitmap[i];
572 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
573 cl->bitmap[i] &= and_with->bitmap[i];
574 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
575 if (!(and_with->flags & ANYOF_EOS))
576 cl->flags &= ~ANYOF_EOS;
578 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
579 cl->flags &= ~ANYOF_UNICODE_ALL;
580 cl->flags |= ANYOF_UNICODE;
581 ARG_SET(cl, ARG(and_with));
583 if (!(and_with->flags & ANYOF_UNICODE_ALL))
584 cl->flags &= ~ANYOF_UNICODE_ALL;
585 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
586 cl->flags &= ~ANYOF_UNICODE;
589 /* 'OR' a given class with another one. Can create false positives */
590 /* We assume that cl is not inverted */
592 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
594 if (or_with->flags & ANYOF_INVERT) {
596 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
597 * <= (B1 | !B2) | (CL1 | !CL2)
598 * which is wasteful if CL2 is small, but we ignore CL2:
599 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
600 * XXXX Can we handle case-fold? Unclear:
601 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
602 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
604 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
605 && !(or_with->flags & ANYOF_FOLD)
606 && !(cl->flags & ANYOF_FOLD) ) {
609 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
610 cl->bitmap[i] |= ~or_with->bitmap[i];
611 } /* XXXX: logic is complicated otherwise */
613 cl_anything(pRExC_state, cl);
616 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
617 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
618 && (!(or_with->flags & ANYOF_FOLD)
619 || (cl->flags & ANYOF_FOLD)) ) {
622 /* OR char bitmap and class bitmap separately */
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= or_with->bitmap[i];
625 if (or_with->flags & ANYOF_CLASS) {
626 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
627 cl->classflags[i] |= or_with->classflags[i];
628 cl->flags |= ANYOF_CLASS;
631 else { /* XXXX: logic is complicated, leave it along for a moment. */
632 cl_anything(pRExC_state, cl);
635 if (or_with->flags & ANYOF_EOS)
636 cl->flags |= ANYOF_EOS;
638 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
639 ARG(cl) != ARG(or_with)) {
640 cl->flags |= ANYOF_UNICODE_ALL;
641 cl->flags &= ~ANYOF_UNICODE;
643 if (or_with->flags & ANYOF_UNICODE_ALL) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
650 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
651 * These need to be revisited when a newer toolchain becomes available.
653 #if defined(__sparc64__) && defined(__GNUC__)
654 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
655 # undef SPARC64_GCC_WORKAROUND
656 # define SPARC64_GCC_WORKAROUND 1
660 /* REx optimizer. Converts nodes into quickier variants "in place".
661 Finds fixed substrings. */
663 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
664 to the position after last scanned or to NULL. */
667 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
668 /* scanp: Start here (read-write). */
669 /* deltap: Write maxlen-minlen here. */
670 /* last: Stop before this one. */
672 I32 min = 0, pars = 0, code;
673 regnode *scan = *scanp, *next;
675 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
676 int is_inf_internal = 0; /* The studied chunk is infinite */
677 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
678 scan_data_t data_fake;
679 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
681 while (scan && OP(scan) != END && scan < last) {
682 /* Peephole optimizer: */
684 if (PL_regkind[(U8)OP(scan)] == EXACT) {
685 /* Merge several consecutive EXACTish nodes into one. */
686 regnode *n = regnext(scan);
689 regnode *stop = scan;
692 next = scan + NODE_SZ_STR(scan);
693 /* Skip NOTHING, merge EXACT*. */
695 ( PL_regkind[(U8)OP(n)] == NOTHING ||
696 (stringok && (OP(n) == OP(scan))))
698 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
699 if (OP(n) == TAIL || n > next)
701 if (PL_regkind[(U8)OP(n)] == NOTHING) {
702 NEXT_OFF(scan) += NEXT_OFF(n);
703 next = n + NODE_STEP_REGNODE;
711 int oldl = STR_LEN(scan);
712 regnode *nnext = regnext(n);
714 if (oldl + STR_LEN(n) > U8_MAX)
716 NEXT_OFF(scan) += NEXT_OFF(n);
717 STR_LEN(scan) += STR_LEN(n);
718 next = n + NODE_SZ_STR(n);
719 /* Now we can overwrite *n : */
720 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
728 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
730 Two problematic code points in Unicode casefolding of EXACT nodes:
732 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
733 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
739 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
740 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
742 This means that in case-insensitive matching (or "loose matching",
743 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
744 length of the above casefolded versions) can match a target string
745 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
746 This would rather mess up the minimum length computation.
748 What we'll do is to look for the tail four bytes, and then peek
749 at the preceding two bytes to see whether we need to decrease
750 the minimum length by four (six minus two).
752 Thanks to the design of UTF-8, there cannot be false matches:
753 A sequence of valid UTF-8 bytes cannot be a subsequence of
754 another valid sequence of UTF-8 bytes.
757 char *s0 = STRING(scan), *s, *t;
758 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
759 char *t0 = "\xcc\x88\xcc\x81";
763 s < s2 && (t = ninstr(s, s1, t0, t1));
765 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
766 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
773 n = scan + NODE_SZ_STR(scan);
775 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
783 /* Follow the next-chain of the current node and optimize
784 away all the NOTHINGs from it. */
785 if (OP(scan) != CURLYX) {
786 int max = (reg_off_by_arg[OP(scan)]
788 /* I32 may be smaller than U16 on CRAYs! */
789 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
790 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
794 /* Skip NOTHING and LONGJMP. */
795 while ((n = regnext(n))
796 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
797 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
800 if (reg_off_by_arg[OP(scan)])
803 NEXT_OFF(scan) = off;
805 /* The principal pseudo-switch. Cannot be a switch, since we
806 look into several different things. */
807 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
808 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
809 next = regnext(scan);
812 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
813 I32 max1 = 0, min1 = I32_MAX, num = 0;
814 struct regnode_charclass_class accum;
816 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
817 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
818 if (flags & SCF_DO_STCLASS)
819 cl_init_zero(pRExC_state, &accum);
820 while (OP(scan) == code) {
821 I32 deltanext, minnext, f = 0, fake;
822 struct regnode_charclass_class this_class;
827 data_fake.whilem_c = data->whilem_c;
828 data_fake.last_closep = data->last_closep;
831 data_fake.last_closep = &fake;
832 next = regnext(scan);
833 scan = NEXTOPER(scan);
835 scan = NEXTOPER(scan);
836 if (flags & SCF_DO_STCLASS) {
837 cl_init(pRExC_state, &this_class);
838 data_fake.start_class = &this_class;
839 f = SCF_DO_STCLASS_AND;
841 if (flags & SCF_WHILEM_VISITED_POS)
842 f |= SCF_WHILEM_VISITED_POS;
843 /* we suppose the run is continuous, last=next...*/
844 minnext = study_chunk(pRExC_state, &scan, &deltanext,
845 next, &data_fake, f);
848 if (max1 < minnext + deltanext)
849 max1 = minnext + deltanext;
850 if (deltanext == I32_MAX)
851 is_inf = is_inf_internal = 1;
853 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
855 if (data && (data_fake.flags & SF_HAS_EVAL))
856 data->flags |= SF_HAS_EVAL;
858 data->whilem_c = data_fake.whilem_c;
859 if (flags & SCF_DO_STCLASS)
860 cl_or(pRExC_state, &accum, &this_class);
864 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
866 if (flags & SCF_DO_SUBSTR) {
867 data->pos_min += min1;
868 data->pos_delta += max1 - min1;
869 if (max1 != min1 || is_inf)
870 data->longest = &(data->longest_float);
873 delta += max1 - min1;
874 if (flags & SCF_DO_STCLASS_OR) {
875 cl_or(pRExC_state, data->start_class, &accum);
877 cl_and(data->start_class, &and_with);
878 flags &= ~SCF_DO_STCLASS;
881 else if (flags & SCF_DO_STCLASS_AND) {
883 cl_and(data->start_class, &accum);
884 flags &= ~SCF_DO_STCLASS;
887 /* Switch to OR mode: cache the old value of
888 * data->start_class */
889 StructCopy(data->start_class, &and_with,
890 struct regnode_charclass_class);
891 flags &= ~SCF_DO_STCLASS_AND;
892 StructCopy(&accum, data->start_class,
893 struct regnode_charclass_class);
894 flags |= SCF_DO_STCLASS_OR;
895 data->start_class->flags |= ANYOF_EOS;
899 else if (code == BRANCHJ) /* single branch is optimized. */
900 scan = NEXTOPER(NEXTOPER(scan));
901 else /* single branch is optimized. */
902 scan = NEXTOPER(scan);
905 else if (OP(scan) == EXACT) {
906 I32 l = STR_LEN(scan);
907 UV uc = *((U8*)STRING(scan));
909 U8 *s = (U8*)STRING(scan);
910 l = utf8_length(s, s + l);
911 uc = utf8_to_uvchr(s, NULL);
914 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
915 /* The code below prefers earlier match for fixed
916 offset, later match for variable offset. */
917 if (data->last_end == -1) { /* Update the start info. */
918 data->last_start_min = data->pos_min;
919 data->last_start_max = is_inf
920 ? I32_MAX : data->pos_min + data->pos_delta;
922 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
924 SV * sv = data->last_found;
925 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
926 mg_find(sv, PERL_MAGIC_utf8) : NULL;
927 if (mg && mg->mg_len >= 0)
928 mg->mg_len += utf8_length(STRING(scan),
929 STRING(scan)+STR_LEN(scan));
932 SvUTF8_on(data->last_found);
933 data->last_end = data->pos_min + l;
934 data->pos_min += l; /* As in the first entry. */
935 data->flags &= ~SF_BEFORE_EOL;
937 if (flags & SCF_DO_STCLASS_AND) {
938 /* Check whether it is compatible with what we know already! */
942 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
943 && !ANYOF_BITMAP_TEST(data->start_class, uc)
944 && (!(data->start_class->flags & ANYOF_FOLD)
945 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
948 ANYOF_CLASS_ZERO(data->start_class);
949 ANYOF_BITMAP_ZERO(data->start_class);
951 ANYOF_BITMAP_SET(data->start_class, uc);
952 data->start_class->flags &= ~ANYOF_EOS;
954 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
956 else if (flags & SCF_DO_STCLASS_OR) {
957 /* false positive possible if the class is case-folded */
959 ANYOF_BITMAP_SET(data->start_class, uc);
961 data->start_class->flags |= ANYOF_UNICODE_ALL;
962 data->start_class->flags &= ~ANYOF_EOS;
963 cl_and(data->start_class, &and_with);
965 flags &= ~SCF_DO_STCLASS;
967 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
968 I32 l = STR_LEN(scan);
969 UV uc = *((U8*)STRING(scan));
971 /* Search for fixed substrings supports EXACT only. */
972 if (flags & SCF_DO_SUBSTR)
973 scan_commit(pRExC_state, data);
975 U8 *s = (U8 *)STRING(scan);
976 l = utf8_length(s, s + l);
977 uc = utf8_to_uvchr(s, NULL);
980 if (data && (flags & SCF_DO_SUBSTR))
982 if (flags & SCF_DO_STCLASS_AND) {
983 /* Check whether it is compatible with what we know already! */
987 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
988 && !ANYOF_BITMAP_TEST(data->start_class, uc)
989 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
991 ANYOF_CLASS_ZERO(data->start_class);
992 ANYOF_BITMAP_ZERO(data->start_class);
994 ANYOF_BITMAP_SET(data->start_class, uc);
995 data->start_class->flags &= ~ANYOF_EOS;
996 data->start_class->flags |= ANYOF_FOLD;
997 if (OP(scan) == EXACTFL)
998 data->start_class->flags |= ANYOF_LOCALE;
1001 else if (flags & SCF_DO_STCLASS_OR) {
1002 if (data->start_class->flags & ANYOF_FOLD) {
1003 /* false positive possible if the class is case-folded.
1004 Assume that the locale settings are the same... */
1006 ANYOF_BITMAP_SET(data->start_class, uc);
1007 data->start_class->flags &= ~ANYOF_EOS;
1009 cl_and(data->start_class, &and_with);
1011 flags &= ~SCF_DO_STCLASS;
1013 else if (strchr((char*)PL_varies,OP(scan))) {
1014 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1015 I32 f = flags, pos_before = 0;
1016 regnode *oscan = scan;
1017 struct regnode_charclass_class this_class;
1018 struct regnode_charclass_class *oclass = NULL;
1019 I32 next_is_eval = 0;
1021 switch (PL_regkind[(U8)OP(scan)]) {
1022 case WHILEM: /* End of (?:...)* . */
1023 scan = NEXTOPER(scan);
1026 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1027 next = NEXTOPER(scan);
1028 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1030 maxcount = REG_INFTY;
1031 next = regnext(scan);
1032 scan = NEXTOPER(scan);
1036 if (flags & SCF_DO_SUBSTR)
1041 if (flags & SCF_DO_STCLASS) {
1043 maxcount = REG_INFTY;
1044 next = regnext(scan);
1045 scan = NEXTOPER(scan);
1048 is_inf = is_inf_internal = 1;
1049 scan = regnext(scan);
1050 if (flags & SCF_DO_SUBSTR) {
1051 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1052 data->longest = &(data->longest_float);
1054 goto optimize_curly_tail;
1056 mincount = ARG1(scan);
1057 maxcount = ARG2(scan);
1058 next = regnext(scan);
1059 if (OP(scan) == CURLYX) {
1060 I32 lp = (data ? *(data->last_closep) : 0);
1062 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1064 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1065 next_is_eval = (OP(scan) == EVAL);
1067 if (flags & SCF_DO_SUBSTR) {
1068 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1069 pos_before = data->pos_min;
1073 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1075 data->flags |= SF_IS_INF;
1077 if (flags & SCF_DO_STCLASS) {
1078 cl_init(pRExC_state, &this_class);
1079 oclass = data->start_class;
1080 data->start_class = &this_class;
1081 f |= SCF_DO_STCLASS_AND;
1082 f &= ~SCF_DO_STCLASS_OR;
1084 /* These are the cases when once a subexpression
1085 fails at a particular position, it cannot succeed
1086 even after backtracking at the enclosing scope.
1088 XXXX what if minimal match and we are at the
1089 initial run of {n,m}? */
1090 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1091 f &= ~SCF_WHILEM_VISITED_POS;
1093 /* This will finish on WHILEM, setting scan, or on NULL: */
1094 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1096 ? (f & ~SCF_DO_SUBSTR) : f);
1098 if (flags & SCF_DO_STCLASS)
1099 data->start_class = oclass;
1100 if (mincount == 0 || minnext == 0) {
1101 if (flags & SCF_DO_STCLASS_OR) {
1102 cl_or(pRExC_state, data->start_class, &this_class);
1104 else if (flags & SCF_DO_STCLASS_AND) {
1105 /* Switch to OR mode: cache the old value of
1106 * data->start_class */
1107 StructCopy(data->start_class, &and_with,
1108 struct regnode_charclass_class);
1109 flags &= ~SCF_DO_STCLASS_AND;
1110 StructCopy(&this_class, data->start_class,
1111 struct regnode_charclass_class);
1112 flags |= SCF_DO_STCLASS_OR;
1113 data->start_class->flags |= ANYOF_EOS;
1115 } else { /* Non-zero len */
1116 if (flags & SCF_DO_STCLASS_OR) {
1117 cl_or(pRExC_state, data->start_class, &this_class);
1118 cl_and(data->start_class, &and_with);
1120 else if (flags & SCF_DO_STCLASS_AND)
1121 cl_and(data->start_class, &this_class);
1122 flags &= ~SCF_DO_STCLASS;
1124 if (!scan) /* It was not CURLYX, but CURLY. */
1126 if (ckWARN(WARN_REGEXP)
1127 /* ? quantifier ok, except for (?{ ... }) */
1128 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1129 && (minnext == 0) && (deltanext == 0)
1130 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1131 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1134 "Quantifier unexpected on zero-length expression");
1137 min += minnext * mincount;
1138 is_inf_internal |= ((maxcount == REG_INFTY
1139 && (minnext + deltanext) > 0)
1140 || deltanext == I32_MAX);
1141 is_inf |= is_inf_internal;
1142 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1144 /* Try powerful optimization CURLYX => CURLYN. */
1145 if ( OP(oscan) == CURLYX && data
1146 && data->flags & SF_IN_PAR
1147 && !(data->flags & SF_HAS_EVAL)
1148 && !deltanext && minnext == 1 ) {
1149 /* Try to optimize to CURLYN. */
1150 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1151 regnode *nxt1 = nxt;
1158 if (!strchr((char*)PL_simple,OP(nxt))
1159 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1160 && STR_LEN(nxt) == 1))
1166 if (OP(nxt) != CLOSE)
1168 /* Now we know that nxt2 is the only contents: */
1169 oscan->flags = (U8)ARG(nxt);
1171 OP(nxt1) = NOTHING; /* was OPEN. */
1173 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1174 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1175 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1176 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1177 OP(nxt + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1183 /* Try optimization CURLYX => CURLYM. */
1184 if ( OP(oscan) == CURLYX && data
1185 && !(data->flags & SF_HAS_PAR)
1186 && !(data->flags & SF_HAS_EVAL)
1188 /* XXXX How to optimize if data == 0? */
1189 /* Optimize to a simpler form. */
1190 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1194 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1195 && (OP(nxt2) != WHILEM))
1197 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1198 /* Need to optimize away parenths. */
1199 if (data->flags & SF_IN_PAR) {
1200 /* Set the parenth number. */
1201 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1203 if (OP(nxt) != CLOSE)
1204 FAIL("Panic opt close");
1205 oscan->flags = (U8)ARG(nxt);
1206 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1207 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1209 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1210 OP(nxt + 1) = OPTIMIZED; /* was count. */
1211 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1212 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1215 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1216 regnode *nnxt = regnext(nxt1);
1219 if (reg_off_by_arg[OP(nxt1)])
1220 ARG_SET(nxt1, nxt2 - nxt1);
1221 else if (nxt2 - nxt1 < U16_MAX)
1222 NEXT_OFF(nxt1) = nxt2 - nxt1;
1224 OP(nxt) = NOTHING; /* Cannot beautify */
1229 /* Optimize again: */
1230 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1236 else if ((OP(oscan) == CURLYX)
1237 && (flags & SCF_WHILEM_VISITED_POS)
1238 /* See the comment on a similar expression above.
1239 However, this time it not a subexpression
1240 we care about, but the expression itself. */
1241 && (maxcount == REG_INFTY)
1242 && data && ++data->whilem_c < 16) {
1243 /* This stays as CURLYX, we can put the count/of pair. */
1244 /* Find WHILEM (as in regexec.c) */
1245 regnode *nxt = oscan + NEXT_OFF(oscan);
1247 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1249 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1250 | (RExC_whilem_seen << 4)); /* On WHILEM */
1252 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1254 if (flags & SCF_DO_SUBSTR) {
1255 SV *last_str = Nullsv;
1256 int counted = mincount != 0;
1258 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1259 #if defined(SPARC64_GCC_WORKAROUND)
1265 if (pos_before >= data->last_start_min)
1268 b = data->last_start_min;
1271 s = SvPV(data->last_found, l);
1272 old = b - data->last_start_min;
1275 I32 b = pos_before >= data->last_start_min
1276 ? pos_before : data->last_start_min;
1278 char *s = SvPV(data->last_found, l);
1279 I32 old = b - data->last_start_min;
1283 old = utf8_hop((U8*)s, old) - (U8*)s;
1286 /* Get the added string: */
1287 last_str = newSVpvn(s + old, l);
1289 SvUTF8_on(last_str);
1290 if (deltanext == 0 && pos_before == b) {
1291 /* What was added is a constant string */
1293 SvGROW(last_str, (mincount * l) + 1);
1294 repeatcpy(SvPVX(last_str) + l,
1295 SvPVX(last_str), l, mincount - 1);
1296 SvCUR(last_str) *= mincount;
1297 /* Add additional parts. */
1298 SvCUR_set(data->last_found,
1299 SvCUR(data->last_found) - l);
1300 sv_catsv(data->last_found, last_str);
1302 SV * sv = data->last_found;
1304 SvUTF8(sv) && SvMAGICAL(sv) ?
1305 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1306 if (mg && mg->mg_len >= 0)
1307 mg->mg_len += CHR_SVLEN(last_str);
1309 data->last_end += l * (mincount - 1);
1312 /* start offset must point into the last copy */
1313 data->last_start_min += minnext * (mincount - 1);
1314 data->last_start_max += is_inf ? I32_MAX
1315 : (maxcount - 1) * (minnext + data->pos_delta);
1318 /* It is counted once already... */
1319 data->pos_min += minnext * (mincount - counted);
1320 data->pos_delta += - counted * deltanext +
1321 (minnext + deltanext) * maxcount - minnext * mincount;
1322 if (mincount != maxcount) {
1323 /* Cannot extend fixed substrings found inside
1325 scan_commit(pRExC_state,data);
1326 if (mincount && last_str) {
1327 sv_setsv(data->last_found, last_str);
1328 data->last_end = data->pos_min;
1329 data->last_start_min =
1330 data->pos_min - CHR_SVLEN(last_str);
1331 data->last_start_max = is_inf
1333 : data->pos_min + data->pos_delta
1334 - CHR_SVLEN(last_str);
1336 data->longest = &(data->longest_float);
1338 SvREFCNT_dec(last_str);
1340 if (data && (fl & SF_HAS_EVAL))
1341 data->flags |= SF_HAS_EVAL;
1342 optimize_curly_tail:
1343 if (OP(oscan) != CURLYX) {
1344 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1346 NEXT_OFF(oscan) += NEXT_OFF(next);
1349 default: /* REF and CLUMP only? */
1350 if (flags & SCF_DO_SUBSTR) {
1351 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1352 data->longest = &(data->longest_float);
1354 is_inf = is_inf_internal = 1;
1355 if (flags & SCF_DO_STCLASS_OR)
1356 cl_anything(pRExC_state, data->start_class);
1357 flags &= ~SCF_DO_STCLASS;
1361 else if (strchr((char*)PL_simple,OP(scan))) {
1364 if (flags & SCF_DO_SUBSTR) {
1365 scan_commit(pRExC_state,data);
1369 if (flags & SCF_DO_STCLASS) {
1370 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1372 /* Some of the logic below assumes that switching
1373 locale on will only add false positives. */
1374 switch (PL_regkind[(U8)OP(scan)]) {
1378 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1379 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1380 cl_anything(pRExC_state, data->start_class);
1383 if (OP(scan) == SANY)
1385 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1386 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1387 || (data->start_class->flags & ANYOF_CLASS));
1388 cl_anything(pRExC_state, data->start_class);
1390 if (flags & SCF_DO_STCLASS_AND || !value)
1391 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1394 if (flags & SCF_DO_STCLASS_AND)
1395 cl_and(data->start_class,
1396 (struct regnode_charclass_class*)scan);
1398 cl_or(pRExC_state, data->start_class,
1399 (struct regnode_charclass_class*)scan);
1402 if (flags & SCF_DO_STCLASS_AND) {
1403 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1404 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1405 for (value = 0; value < 256; value++)
1406 if (!isALNUM(value))
1407 ANYOF_BITMAP_CLEAR(data->start_class, value);
1411 if (data->start_class->flags & ANYOF_LOCALE)
1412 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1414 for (value = 0; value < 256; value++)
1416 ANYOF_BITMAP_SET(data->start_class, value);
1421 if (flags & SCF_DO_STCLASS_AND) {
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1426 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1427 data->start_class->flags |= ANYOF_LOCALE;
1431 if (flags & SCF_DO_STCLASS_AND) {
1432 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1433 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1434 for (value = 0; value < 256; value++)
1436 ANYOF_BITMAP_CLEAR(data->start_class, value);
1440 if (data->start_class->flags & ANYOF_LOCALE)
1441 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1443 for (value = 0; value < 256; value++)
1444 if (!isALNUM(value))
1445 ANYOF_BITMAP_SET(data->start_class, value);
1450 if (flags & SCF_DO_STCLASS_AND) {
1451 if (data->start_class->flags & ANYOF_LOCALE)
1452 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1455 data->start_class->flags |= ANYOF_LOCALE;
1456 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1460 if (flags & SCF_DO_STCLASS_AND) {
1461 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1462 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1463 for (value = 0; value < 256; value++)
1464 if (!isSPACE(value))
1465 ANYOF_BITMAP_CLEAR(data->start_class, value);
1469 if (data->start_class->flags & ANYOF_LOCALE)
1470 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1472 for (value = 0; value < 256; value++)
1474 ANYOF_BITMAP_SET(data->start_class, value);
1479 if (flags & SCF_DO_STCLASS_AND) {
1480 if (data->start_class->flags & ANYOF_LOCALE)
1481 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1484 data->start_class->flags |= ANYOF_LOCALE;
1485 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1489 if (flags & SCF_DO_STCLASS_AND) {
1490 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1491 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1492 for (value = 0; value < 256; value++)
1494 ANYOF_BITMAP_CLEAR(data->start_class, value);
1498 if (data->start_class->flags & ANYOF_LOCALE)
1499 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1501 for (value = 0; value < 256; value++)
1502 if (!isSPACE(value))
1503 ANYOF_BITMAP_SET(data->start_class, value);
1508 if (flags & SCF_DO_STCLASS_AND) {
1509 if (data->start_class->flags & ANYOF_LOCALE) {
1510 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1511 for (value = 0; value < 256; value++)
1512 if (!isSPACE(value))
1513 ANYOF_BITMAP_CLEAR(data->start_class, value);
1517 data->start_class->flags |= ANYOF_LOCALE;
1518 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1522 if (flags & SCF_DO_STCLASS_AND) {
1523 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1524 for (value = 0; value < 256; value++)
1525 if (!isDIGIT(value))
1526 ANYOF_BITMAP_CLEAR(data->start_class, value);
1529 if (data->start_class->flags & ANYOF_LOCALE)
1530 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1532 for (value = 0; value < 256; value++)
1534 ANYOF_BITMAP_SET(data->start_class, value);
1539 if (flags & SCF_DO_STCLASS_AND) {
1540 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1541 for (value = 0; value < 256; value++)
1543 ANYOF_BITMAP_CLEAR(data->start_class, value);
1546 if (data->start_class->flags & ANYOF_LOCALE)
1547 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1549 for (value = 0; value < 256; value++)
1550 if (!isDIGIT(value))
1551 ANYOF_BITMAP_SET(data->start_class, value);
1556 if (flags & SCF_DO_STCLASS_OR)
1557 cl_and(data->start_class, &and_with);
1558 flags &= ~SCF_DO_STCLASS;
1561 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1562 data->flags |= (OP(scan) == MEOL
1566 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1567 /* Lookbehind, or need to calculate parens/evals/stclass: */
1568 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1569 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1570 /* Lookahead/lookbehind */
1571 I32 deltanext, minnext, fake = 0;
1573 struct regnode_charclass_class intrnl;
1576 data_fake.flags = 0;
1578 data_fake.whilem_c = data->whilem_c;
1579 data_fake.last_closep = data->last_closep;
1582 data_fake.last_closep = &fake;
1583 if ( flags & SCF_DO_STCLASS && !scan->flags
1584 && OP(scan) == IFMATCH ) { /* Lookahead */
1585 cl_init(pRExC_state, &intrnl);
1586 data_fake.start_class = &intrnl;
1587 f |= SCF_DO_STCLASS_AND;
1589 if (flags & SCF_WHILEM_VISITED_POS)
1590 f |= SCF_WHILEM_VISITED_POS;
1591 next = regnext(scan);
1592 nscan = NEXTOPER(NEXTOPER(scan));
1593 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1596 vFAIL("Variable length lookbehind not implemented");
1598 else if (minnext > U8_MAX) {
1599 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1601 scan->flags = (U8)minnext;
1603 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1605 if (data && (data_fake.flags & SF_HAS_EVAL))
1606 data->flags |= SF_HAS_EVAL;
1608 data->whilem_c = data_fake.whilem_c;
1609 if (f & SCF_DO_STCLASS_AND) {
1610 int was = (data->start_class->flags & ANYOF_EOS);
1612 cl_and(data->start_class, &intrnl);
1614 data->start_class->flags |= ANYOF_EOS;
1617 else if (OP(scan) == OPEN) {
1620 else if (OP(scan) == CLOSE) {
1621 if ((I32)ARG(scan) == is_par) {
1622 next = regnext(scan);
1624 if ( next && (OP(next) != WHILEM) && next < last)
1625 is_par = 0; /* Disable optimization */
1628 *(data->last_closep) = ARG(scan);
1630 else if (OP(scan) == EVAL) {
1632 data->flags |= SF_HAS_EVAL;
1634 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1635 if (flags & SCF_DO_SUBSTR) {
1636 scan_commit(pRExC_state,data);
1637 data->longest = &(data->longest_float);
1639 is_inf = is_inf_internal = 1;
1640 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1641 cl_anything(pRExC_state, data->start_class);
1642 flags &= ~SCF_DO_STCLASS;
1644 /* Else: zero-length, ignore. */
1645 scan = regnext(scan);
1650 *deltap = is_inf_internal ? I32_MAX : delta;
1651 if (flags & SCF_DO_SUBSTR && is_inf)
1652 data->pos_delta = I32_MAX - data->pos_min;
1653 if (is_par > U8_MAX)
1655 if (is_par && pars==1 && data) {
1656 data->flags |= SF_IN_PAR;
1657 data->flags &= ~SF_HAS_PAR;
1659 else if (pars && data) {
1660 data->flags |= SF_HAS_PAR;
1661 data->flags &= ~SF_IN_PAR;
1663 if (flags & SCF_DO_STCLASS_OR)
1664 cl_and(data->start_class, &and_with);
1669 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1671 if (RExC_rx->data) {
1672 Renewc(RExC_rx->data,
1673 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1674 char, struct reg_data);
1675 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1676 RExC_rx->data->count += n;
1679 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1680 char, struct reg_data);
1681 New(1208, RExC_rx->data->what, n, U8);
1682 RExC_rx->data->count = n;
1684 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1685 return RExC_rx->data->count - n;
1689 Perl_reginitcolors(pTHX)
1692 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1695 PL_colors[0] = s = savepv(s);
1697 s = strchr(s, '\t');
1703 PL_colors[i] = s = "";
1707 PL_colors[i++] = "";
1714 - pregcomp - compile a regular expression into internal code
1716 * We can't allocate space until we know how big the compiled form will be,
1717 * but we can't compile it (and thus know how big it is) until we've got a
1718 * place to put the code. So we cheat: we compile it twice, once with code
1719 * generation turned off and size counting turned on, and once "for real".
1720 * This also means that we don't allocate space until we are sure that the
1721 * thing really will compile successfully, and we never have to move the
1722 * code and thus invalidate pointers into it. (Note that it has to be in
1723 * one piece because free() must be able to free it all.) [NB: not true in perl]
1725 * Beware that the optimization-preparation code in here knows about some
1726 * of the structure of the compiled regexp. [I'll say.]
1729 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1739 RExC_state_t RExC_state;
1740 RExC_state_t *pRExC_state = &RExC_state;
1743 FAIL("NULL regexp argument");
1745 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1749 if (!PL_colorset) reginitcolors();
1750 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1751 PL_colors[4],PL_colors[5],PL_colors[0],
1752 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1754 RExC_flags = pm->op_pmflags;
1758 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1759 RExC_seen_evals = 0;
1762 /* First pass: determine size, legality. */
1769 RExC_emit = &PL_regdummy;
1770 RExC_whilem_seen = 0;
1771 #if 0 /* REGC() is (currently) a NOP at the first pass.
1772 * Clever compilers notice this and complain. --jhi */
1773 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1775 if (reg(pRExC_state, 0, &flags) == NULL) {
1776 RExC_precomp = Nullch;
1779 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1781 /* Small enough for pointer-storage convention?
1782 If extralen==0, this means that we will not need long jumps. */
1783 if (RExC_size >= 0x10000L && RExC_extralen)
1784 RExC_size += RExC_extralen;
1787 if (RExC_whilem_seen > 15)
1788 RExC_whilem_seen = 15;
1790 /* Allocate space and initialize. */
1791 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1794 FAIL("Regexp out of space");
1797 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1798 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1801 r->prelen = xend - exp;
1802 r->precomp = savepvn(RExC_precomp, r->prelen);
1804 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1805 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1807 r->substrs = 0; /* Useful during FAIL. */
1808 r->startp = 0; /* Useful during FAIL. */
1809 r->endp = 0; /* Useful during FAIL. */
1811 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1813 r->offsets[0] = RExC_size;
1815 DEBUG_r(PerlIO_printf(Perl_debug_log,
1816 "%s %"UVuf" bytes for offset annotations.\n",
1817 r->offsets ? "Got" : "Couldn't get",
1818 (UV)((2*RExC_size+1) * sizeof(U32))));
1822 /* Second pass: emit code. */
1823 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1828 RExC_emit_start = r->program;
1829 RExC_emit = r->program;
1830 /* Store the count of eval-groups for security checks: */
1831 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1832 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1834 if (reg(pRExC_state, 0, &flags) == NULL)
1837 /* Dig out information for optimizations. */
1838 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1839 pm->op_pmflags = RExC_flags;
1841 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1842 r->regstclass = NULL;
1843 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1844 r->reganch |= ROPT_NAUGHTY;
1845 scan = r->program + 1; /* First BRANCH. */
1847 /* XXXX To minimize changes to RE engine we always allocate
1848 3-units-long substrs field. */
1849 Newz(1004, r->substrs, 1, struct reg_substr_data);
1851 StructCopy(&zero_scan_data, &data, scan_data_t);
1852 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1853 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1855 STRLEN longest_float_length, longest_fixed_length;
1856 struct regnode_charclass_class ch_class;
1861 /* Skip introductions and multiplicators >= 1. */
1862 while ((OP(first) == OPEN && (sawopen = 1)) ||
1863 /* An OR of *one* alternative - should not happen now. */
1864 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1865 (OP(first) == PLUS) ||
1866 (OP(first) == MINMOD) ||
1867 /* An {n,m} with n>0 */
1868 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1869 if (OP(first) == PLUS)
1872 first += regarglen[(U8)OP(first)];
1873 first = NEXTOPER(first);
1876 /* Starting-point info. */
1878 if (PL_regkind[(U8)OP(first)] == EXACT) {
1879 if (OP(first) == EXACT)
1880 ; /* Empty, get anchored substr later. */
1881 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1882 r->regstclass = first;
1884 else if (strchr((char*)PL_simple,OP(first)))
1885 r->regstclass = first;
1886 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1887 PL_regkind[(U8)OP(first)] == NBOUND)
1888 r->regstclass = first;
1889 else if (PL_regkind[(U8)OP(first)] == BOL) {
1890 r->reganch |= (OP(first) == MBOL
1892 : (OP(first) == SBOL
1895 first = NEXTOPER(first);
1898 else if (OP(first) == GPOS) {
1899 r->reganch |= ROPT_ANCH_GPOS;
1900 first = NEXTOPER(first);
1903 else if (!sawopen && (OP(first) == STAR &&
1904 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1905 !(r->reganch & ROPT_ANCH) )
1907 /* turn .* into ^.* with an implied $*=1 */
1908 int type = OP(NEXTOPER(first));
1910 if (type == REG_ANY)
1911 type = ROPT_ANCH_MBOL;
1913 type = ROPT_ANCH_SBOL;
1915 r->reganch |= type | ROPT_IMPLICIT;
1916 first = NEXTOPER(first);
1919 if (sawplus && (!sawopen || !RExC_sawback)
1920 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1921 /* x+ must match at the 1st pos of run of x's */
1922 r->reganch |= ROPT_SKIP;
1924 /* Scan is after the zeroth branch, first is atomic matcher. */
1925 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1926 (IV)(first - scan + 1)));
1928 * If there's something expensive in the r.e., find the
1929 * longest literal string that must appear and make it the
1930 * regmust. Resolve ties in favor of later strings, since
1931 * the regstart check works with the beginning of the r.e.
1932 * and avoiding duplication strengthens checking. Not a
1933 * strong reason, but sufficient in the absence of others.
1934 * [Now we resolve ties in favor of the earlier string if
1935 * it happens that c_offset_min has been invalidated, since the
1936 * earlier string may buy us something the later one won't.]
1940 data.longest_fixed = newSVpvn("",0);
1941 data.longest_float = newSVpvn("",0);
1942 data.last_found = newSVpvn("",0);
1943 data.longest = &(data.longest_fixed);
1945 if (!r->regstclass) {
1946 cl_init(pRExC_state, &ch_class);
1947 data.start_class = &ch_class;
1948 stclass_flag = SCF_DO_STCLASS_AND;
1949 } else /* XXXX Check for BOUND? */
1951 data.last_closep = &last_close;
1953 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1954 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1955 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1956 && data.last_start_min == 0 && data.last_end > 0
1957 && !RExC_seen_zerolen
1958 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1959 r->reganch |= ROPT_CHECK_ALL;
1960 scan_commit(pRExC_state, &data);
1961 SvREFCNT_dec(data.last_found);
1963 longest_float_length = CHR_SVLEN(data.longest_float);
1964 if (longest_float_length
1965 || (data.flags & SF_FL_BEFORE_EOL
1966 && (!(data.flags & SF_FL_BEFORE_MEOL)
1967 || (RExC_flags & PMf_MULTILINE)))) {
1970 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1971 && data.offset_fixed == data.offset_float_min
1972 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1973 goto remove_float; /* As in (a)+. */
1975 if (SvUTF8(data.longest_float)) {
1976 r->float_utf8 = data.longest_float;
1977 r->float_substr = Nullsv;
1979 r->float_substr = data.longest_float;
1980 r->float_utf8 = Nullsv;
1982 r->float_min_offset = data.offset_float_min;
1983 r->float_max_offset = data.offset_float_max;
1984 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1985 && (!(data.flags & SF_FL_BEFORE_MEOL)
1986 || (RExC_flags & PMf_MULTILINE)));
1987 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1991 r->float_substr = r->float_utf8 = Nullsv;
1992 SvREFCNT_dec(data.longest_float);
1993 longest_float_length = 0;
1996 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
1997 if (longest_fixed_length
1998 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
1999 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2000 || (RExC_flags & PMf_MULTILINE)))) {
2003 if (SvUTF8(data.longest_fixed)) {
2004 r->anchored_utf8 = data.longest_fixed;
2005 r->anchored_substr = Nullsv;
2007 r->anchored_substr = data.longest_fixed;
2008 r->anchored_utf8 = Nullsv;
2010 r->anchored_offset = data.offset_fixed;
2011 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2012 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2013 || (RExC_flags & PMf_MULTILINE)));
2014 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2017 r->anchored_substr = r->anchored_utf8 = Nullsv;
2018 SvREFCNT_dec(data.longest_fixed);
2019 longest_fixed_length = 0;
2022 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2023 r->regstclass = NULL;
2024 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2026 && !(data.start_class->flags & ANYOF_EOS)
2027 && !cl_is_anything(data.start_class))
2029 I32 n = add_data(pRExC_state, 1, "f");
2031 New(1006, RExC_rx->data->data[n], 1,
2032 struct regnode_charclass_class);
2033 StructCopy(data.start_class,
2034 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2035 struct regnode_charclass_class);
2036 r->regstclass = (regnode*)RExC_rx->data->data[n];
2037 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2038 PL_regdata = r->data; /* for regprop() */
2039 DEBUG_r({ SV *sv = sv_newmortal();
2040 regprop(sv, (regnode*)data.start_class);
2041 PerlIO_printf(Perl_debug_log,
2042 "synthetic stclass `%s'.\n",
2046 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2047 if (longest_fixed_length > longest_float_length) {
2048 r->check_substr = r->anchored_substr;
2049 r->check_utf8 = r->anchored_utf8;
2050 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2051 if (r->reganch & ROPT_ANCH_SINGLE)
2052 r->reganch |= ROPT_NOSCAN;
2055 r->check_substr = r->float_substr;
2056 r->check_utf8 = r->float_utf8;
2057 r->check_offset_min = data.offset_float_min;
2058 r->check_offset_max = data.offset_float_max;
2060 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2061 This should be changed ASAP! */
2062 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2063 r->reganch |= RE_USE_INTUIT;
2064 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2065 r->reganch |= RE_INTUIT_TAIL;
2069 /* Several toplevels. Best we can is to set minlen. */
2071 struct regnode_charclass_class ch_class;
2074 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2075 scan = r->program + 1;
2076 cl_init(pRExC_state, &ch_class);
2077 data.start_class = &ch_class;
2078 data.last_closep = &last_close;
2079 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2080 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2081 = r->float_substr = r->float_utf8 = Nullsv;
2082 if (!(data.start_class->flags & ANYOF_EOS)
2083 && !cl_is_anything(data.start_class))
2085 I32 n = add_data(pRExC_state, 1, "f");
2087 New(1006, RExC_rx->data->data[n], 1,
2088 struct regnode_charclass_class);
2089 StructCopy(data.start_class,
2090 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2091 struct regnode_charclass_class);
2092 r->regstclass = (regnode*)RExC_rx->data->data[n];
2093 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2094 DEBUG_r({ SV* sv = sv_newmortal();
2095 regprop(sv, (regnode*)data.start_class);
2096 PerlIO_printf(Perl_debug_log,
2097 "synthetic stclass `%s'.\n",
2103 if (RExC_seen & REG_SEEN_GPOS)
2104 r->reganch |= ROPT_GPOS_SEEN;
2105 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2106 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2107 if (RExC_seen & REG_SEEN_EVAL)
2108 r->reganch |= ROPT_EVAL_SEEN;
2109 if (RExC_seen & REG_SEEN_CANY)
2110 r->reganch |= ROPT_CANY_SEEN;
2111 Newz(1002, r->startp, RExC_npar, I32);
2112 Newz(1002, r->endp, RExC_npar, I32);
2113 PL_regdata = r->data; /* for regprop() */
2114 DEBUG_r(regdump(r));
2119 - reg - regular expression, i.e. main body or parenthesized thing
2121 * Caller must absorb opening parenthesis.
2123 * Combining parenthesis handling with the base level of regular expression
2124 * is a trifle forced, but the need to tie the tails of the branches to what
2125 * follows makes it hard to avoid.
2128 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2129 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2131 register regnode *ret; /* Will be the head of the group. */
2132 register regnode *br;
2133 register regnode *lastbr;
2134 register regnode *ender = 0;
2135 register I32 parno = 0;
2136 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2138 /* for (?g), (?gc), and (?o) warnings; warning
2139 about (?c) will warn about (?g) -- japhy */
2141 I32 wastedflags = 0x00,
2144 wasted_gc = 0x02 | 0x04,
2147 char * parse_start = RExC_parse; /* MJD */
2148 char *oregcomp_parse = RExC_parse;
2151 *flagp = 0; /* Tentatively. */
2154 /* Make an OPEN node, if parenthesized. */
2156 if (*RExC_parse == '?') { /* (?...) */
2157 U32 posflags = 0, negflags = 0;
2158 U32 *flagsp = &posflags;
2160 char *seqstart = RExC_parse;
2163 paren = *RExC_parse++;
2164 ret = NULL; /* For look-ahead/behind. */
2166 case '<': /* (?<...) */
2167 RExC_seen |= REG_SEEN_LOOKBEHIND;
2168 if (*RExC_parse == '!')
2170 if (*RExC_parse != '=' && *RExC_parse != '!')
2173 case '=': /* (?=...) */
2174 case '!': /* (?!...) */
2175 RExC_seen_zerolen++;
2176 case ':': /* (?:...) */
2177 case '>': /* (?>...) */
2179 case '$': /* (?$...) */
2180 case '@': /* (?@...) */
2181 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2183 case '#': /* (?#...) */
2184 while (*RExC_parse && *RExC_parse != ')')
2186 if (*RExC_parse != ')')
2187 FAIL("Sequence (?#... not terminated");
2188 nextchar(pRExC_state);
2191 case 'p': /* (?p...) */
2192 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2193 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2195 case '?': /* (??...) */
2197 if (*RExC_parse != '{')
2199 paren = *RExC_parse++;
2201 case '{': /* (?{...}) */
2203 I32 count = 1, n = 0;
2205 char *s = RExC_parse;
2207 OP_4tree *sop, *rop;
2209 RExC_seen_zerolen++;
2210 RExC_seen |= REG_SEEN_EVAL;
2211 while (count && (c = *RExC_parse)) {
2212 if (c == '\\' && RExC_parse[1])
2220 if (*RExC_parse != ')')
2223 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2228 if (RExC_parse - 1 - s)
2229 sv = newSVpvn(s, RExC_parse - 1 - s);
2231 sv = newSVpvn("", 0);
2234 Perl_save_re_context(aTHX);
2235 rop = sv_compile_2op(sv, &sop, "re", &pad);
2236 sop->op_private |= OPpREFCOUNTED;
2237 /* re_dup will OpREFCNT_inc */
2238 OpREFCNT_set(sop, 1);
2241 n = add_data(pRExC_state, 3, "nop");
2242 RExC_rx->data->data[n] = (void*)rop;
2243 RExC_rx->data->data[n+1] = (void*)sop;
2244 RExC_rx->data->data[n+2] = (void*)pad;
2247 else { /* First pass */
2248 if (PL_reginterp_cnt < ++RExC_seen_evals
2249 && PL_curcop != &PL_compiling)
2250 /* No compiled RE interpolated, has runtime
2251 components ===> unsafe. */
2252 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2253 if (PL_tainting && PL_tainted)
2254 FAIL("Eval-group in insecure regular expression");
2257 nextchar(pRExC_state);
2259 ret = reg_node(pRExC_state, LOGICAL);
2262 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2263 /* deal with the length of this later - MJD */
2266 ret = reganode(pRExC_state, EVAL, n);
2267 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2268 Set_Node_Offset(ret, parse_start);
2271 case '(': /* (?(?{...})...) and (?(?=...)...) */
2273 if (RExC_parse[0] == '?') { /* (?(?...)) */
2274 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2275 || RExC_parse[1] == '<'
2276 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2279 ret = reg_node(pRExC_state, LOGICAL);
2282 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2286 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2288 parno = atoi(RExC_parse++);
2290 while (isDIGIT(*RExC_parse))
2292 ret = reganode(pRExC_state, GROUPP, parno);
2294 if ((c = *nextchar(pRExC_state)) != ')')
2295 vFAIL("Switch condition not recognized");
2297 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2298 br = regbranch(pRExC_state, &flags, 1);
2300 br = reganode(pRExC_state, LONGJMP, 0);
2302 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2303 c = *nextchar(pRExC_state);
2307 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2308 regbranch(pRExC_state, &flags, 1);
2309 regtail(pRExC_state, ret, lastbr);
2312 c = *nextchar(pRExC_state);
2317 vFAIL("Switch (?(condition)... contains too many branches");
2318 ender = reg_node(pRExC_state, TAIL);
2319 regtail(pRExC_state, br, ender);
2321 regtail(pRExC_state, lastbr, ender);
2322 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2325 regtail(pRExC_state, ret, ender);
2329 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2333 RExC_parse--; /* for vFAIL to print correctly */
2334 vFAIL("Sequence (? incomplete");
2338 parse_flags: /* (?i) */
2339 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2340 /* (?g), (?gc) and (?o) are useless here
2341 and must be globally applied -- japhy */
2343 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2344 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2345 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2346 if (! (wastedflags & wflagbit) ) {
2347 wastedflags |= wflagbit;
2350 "Useless (%s%c) - %suse /%c modifier",
2351 flagsp == &negflags ? "?-" : "?",
2353 flagsp == &negflags ? "don't " : "",
2359 else if (*RExC_parse == 'c') {
2360 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361 if (! (wastedflags & wasted_c) ) {
2362 wastedflags |= wasted_gc;
2365 "Useless (%sc) - %suse /gc modifier",
2366 flagsp == &negflags ? "?-" : "?",
2367 flagsp == &negflags ? "don't " : ""
2372 else { pmflag(flagsp, *RExC_parse); }
2376 if (*RExC_parse == '-') {
2378 wastedflags = 0; /* reset so (?g-c) warns twice */
2382 RExC_flags |= posflags;
2383 RExC_flags &= ~negflags;
2384 if (*RExC_parse == ':') {
2390 if (*RExC_parse != ')') {
2392 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2394 nextchar(pRExC_state);
2402 ret = reganode(pRExC_state, OPEN, parno);
2403 Set_Node_Length(ret, 1); /* MJD */
2404 Set_Node_Offset(ret, RExC_parse); /* MJD */
2411 /* Pick up the branches, linking them together. */
2412 parse_start = RExC_parse; /* MJD */
2413 br = regbranch(pRExC_state, &flags, 1);
2414 /* branch_len = (paren != 0); */
2418 if (*RExC_parse == '|') {
2419 if (!SIZE_ONLY && RExC_extralen) {
2420 reginsert(pRExC_state, BRANCHJ, br);
2423 reginsert(pRExC_state, BRANCH, br);
2424 Set_Node_Length(br, paren != 0);
2425 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2429 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2431 else if (paren == ':') {
2432 *flagp |= flags&SIMPLE;
2434 if (open) { /* Starts with OPEN. */
2435 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2437 else if (paren != '?') /* Not Conditional */
2439 *flagp |= flags & (SPSTART | HASWIDTH);
2441 while (*RExC_parse == '|') {
2442 if (!SIZE_ONLY && RExC_extralen) {
2443 ender = reganode(pRExC_state, LONGJMP,0);
2444 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2447 RExC_extralen += 2; /* Account for LONGJMP. */
2448 nextchar(pRExC_state);
2449 br = regbranch(pRExC_state, &flags, 0);
2453 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2457 *flagp |= flags&SPSTART;
2460 if (have_branch || paren != ':') {
2461 /* Make a closing node, and hook it on the end. */
2464 ender = reg_node(pRExC_state, TAIL);
2467 ender = reganode(pRExC_state, CLOSE, parno);
2468 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2469 Set_Node_Length(ender,1); /* MJD */
2475 *flagp &= ~HASWIDTH;
2478 ender = reg_node(pRExC_state, SUCCEED);
2481 ender = reg_node(pRExC_state, END);
2484 regtail(pRExC_state, lastbr, ender);
2487 /* Hook the tails of the branches to the closing node. */
2488 for (br = ret; br != NULL; br = regnext(br)) {
2489 regoptail(pRExC_state, br, ender);
2496 static char parens[] = "=!<,>";
2498 if (paren && (p = strchr(parens, paren))) {
2499 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2500 int flag = (p - parens) > 1;
2503 node = SUSPEND, flag = 0;
2504 reginsert(pRExC_state, node,ret);
2505 Set_Node_Offset(ret, oregcomp_parse);
2506 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2508 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2512 /* Check for proper termination. */
2514 RExC_flags = oregflags;
2515 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2516 RExC_parse = oregcomp_parse;
2517 vFAIL("Unmatched (");
2520 else if (!paren && RExC_parse < RExC_end) {
2521 if (*RExC_parse == ')') {
2523 vFAIL("Unmatched )");
2526 FAIL("Junk on end of regexp"); /* "Can't happen". */
2534 - regbranch - one alternative of an | operator
2536 * Implements the concatenation operator.
2539 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2541 register regnode *ret;
2542 register regnode *chain = NULL;
2543 register regnode *latest;
2544 I32 flags = 0, c = 0;
2549 if (!SIZE_ONLY && RExC_extralen)
2550 ret = reganode(pRExC_state, BRANCHJ,0);
2552 ret = reg_node(pRExC_state, BRANCH);
2553 Set_Node_Length(ret, 1);
2557 if (!first && SIZE_ONLY)
2558 RExC_extralen += 1; /* BRANCHJ */
2560 *flagp = WORST; /* Tentatively. */
2563 nextchar(pRExC_state);
2564 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2566 latest = regpiece(pRExC_state, &flags);
2567 if (latest == NULL) {
2568 if (flags & TRYAGAIN)
2572 else if (ret == NULL)
2574 *flagp |= flags&HASWIDTH;
2575 if (chain == NULL) /* First piece. */
2576 *flagp |= flags&SPSTART;
2579 regtail(pRExC_state, chain, latest);
2584 if (chain == NULL) { /* Loop ran zero times. */
2585 chain = reg_node(pRExC_state, NOTHING);
2590 *flagp |= flags&SIMPLE;
2597 - regpiece - something followed by possible [*+?]
2599 * Note that the branching code sequences used for ? and the general cases
2600 * of * and + are somewhat optimized: they use the same NOTHING node as
2601 * both the endmarker for their branch list and the body of the last branch.
2602 * It might seem that this node could be dispensed with entirely, but the
2603 * endmarker role is not redundant.
2606 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2608 register regnode *ret;
2610 register char *next;
2612 char *origparse = RExC_parse;
2615 I32 max = REG_INFTY;
2618 ret = regatom(pRExC_state, &flags);
2620 if (flags & TRYAGAIN)
2627 if (op == '{' && regcurly(RExC_parse)) {
2628 parse_start = RExC_parse; /* MJD */
2629 next = RExC_parse + 1;
2631 while (isDIGIT(*next) || *next == ',') {
2640 if (*next == '}') { /* got one */
2644 min = atoi(RExC_parse);
2648 maxpos = RExC_parse;
2650 if (!max && *maxpos != '0')
2651 max = REG_INFTY; /* meaning "infinity" */
2652 else if (max >= REG_INFTY)
2653 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2655 nextchar(pRExC_state);
2658 if ((flags&SIMPLE)) {
2659 RExC_naughty += 2 + RExC_naughty / 2;
2660 reginsert(pRExC_state, CURLY, ret);
2661 Set_Node_Offset(ret, parse_start+1); /* MJD */
2662 Set_Node_Cur_Length(ret);
2665 regnode *w = reg_node(pRExC_state, WHILEM);
2668 regtail(pRExC_state, ret, w);
2669 if (!SIZE_ONLY && RExC_extralen) {
2670 reginsert(pRExC_state, LONGJMP,ret);
2671 reginsert(pRExC_state, NOTHING,ret);
2672 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2674 reginsert(pRExC_state, CURLYX,ret);
2676 Set_Node_Offset(ret, parse_start+1);
2677 Set_Node_Length(ret,
2678 op == '{' ? (RExC_parse - parse_start) : 1);
2680 if (!SIZE_ONLY && RExC_extralen)
2681 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2682 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2684 RExC_whilem_seen++, RExC_extralen += 3;
2685 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2693 if (max && max < min)
2694 vFAIL("Can't do {n,m} with n > m");
2696 ARG1_SET(ret, (U16)min);
2697 ARG2_SET(ret, (U16)max);
2709 #if 0 /* Now runtime fix should be reliable. */
2711 /* if this is reinstated, don't forget to put this back into perldiag:
2713 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2715 (F) The part of the regexp subject to either the * or + quantifier
2716 could match an empty string. The {#} shows in the regular
2717 expression about where the problem was discovered.
2721 if (!(flags&HASWIDTH) && op != '?')
2722 vFAIL("Regexp *+ operand could be empty");
2725 parse_start = RExC_parse;
2726 nextchar(pRExC_state);
2728 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2730 if (op == '*' && (flags&SIMPLE)) {
2731 reginsert(pRExC_state, STAR, ret);
2735 else if (op == '*') {
2739 else if (op == '+' && (flags&SIMPLE)) {
2740 reginsert(pRExC_state, PLUS, ret);
2744 else if (op == '+') {
2748 else if (op == '?') {
2753 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2755 "%.*s matches null string many times",
2756 RExC_parse - origparse,
2760 if (*RExC_parse == '?') {
2761 nextchar(pRExC_state);
2762 reginsert(pRExC_state, MINMOD, ret);
2763 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2765 if (ISMULT2(RExC_parse)) {
2767 vFAIL("Nested quantifiers");
2774 - regatom - the lowest level
2776 * Optimization: gobbles an entire sequence of ordinary characters so that
2777 * it can turn them into a single node, which is smaller to store and
2778 * faster to run. Backslashed characters are exceptions, each becoming a
2779 * separate node; the code is simpler that way and it's not worth fixing.
2781 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2783 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2785 register regnode *ret = 0;
2787 char *parse_start = 0;
2789 *flagp = WORST; /* Tentatively. */
2792 switch (*RExC_parse) {
2794 RExC_seen_zerolen++;
2795 nextchar(pRExC_state);
2796 if (RExC_flags & PMf_MULTILINE)
2797 ret = reg_node(pRExC_state, MBOL);
2798 else if (RExC_flags & PMf_SINGLELINE)
2799 ret = reg_node(pRExC_state, SBOL);
2801 ret = reg_node(pRExC_state, BOL);
2802 Set_Node_Length(ret, 1); /* MJD */
2805 nextchar(pRExC_state);
2807 RExC_seen_zerolen++;
2808 if (RExC_flags & PMf_MULTILINE)
2809 ret = reg_node(pRExC_state, MEOL);
2810 else if (RExC_flags & PMf_SINGLELINE)
2811 ret = reg_node(pRExC_state, SEOL);
2813 ret = reg_node(pRExC_state, EOL);
2814 Set_Node_Length(ret, 1); /* MJD */
2817 nextchar(pRExC_state);
2818 if (RExC_flags & PMf_SINGLELINE)
2819 ret = reg_node(pRExC_state, SANY);
2821 ret = reg_node(pRExC_state, REG_ANY);
2822 *flagp |= HASWIDTH|SIMPLE;
2824 Set_Node_Length(ret, 1); /* MJD */
2828 char *oregcomp_parse = ++RExC_parse;
2829 ret = regclass(pRExC_state);
2830 if (*RExC_parse != ']') {
2831 RExC_parse = oregcomp_parse;
2832 vFAIL("Unmatched [");
2834 nextchar(pRExC_state);
2835 *flagp |= HASWIDTH|SIMPLE;
2836 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2840 nextchar(pRExC_state);
2841 ret = reg(pRExC_state, 1, &flags);
2843 if (flags & TRYAGAIN) {
2844 if (RExC_parse == RExC_end) {
2845 /* Make parent create an empty node if needed. */
2853 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2857 if (flags & TRYAGAIN) {
2861 vFAIL("Internal urp");
2862 /* Supposed to be caught earlier. */
2865 if (!regcurly(RExC_parse)) {
2874 vFAIL("Quantifier follows nothing");
2877 switch (*++RExC_parse) {
2879 RExC_seen_zerolen++;
2880 ret = reg_node(pRExC_state, SBOL);
2882 nextchar(pRExC_state);
2883 Set_Node_Length(ret, 2); /* MJD */
2886 ret = reg_node(pRExC_state, GPOS);
2887 RExC_seen |= REG_SEEN_GPOS;
2889 nextchar(pRExC_state);
2890 Set_Node_Length(ret, 2); /* MJD */
2893 ret = reg_node(pRExC_state, SEOL);
2895 RExC_seen_zerolen++; /* Do not optimize RE away */
2896 nextchar(pRExC_state);
2899 ret = reg_node(pRExC_state, EOS);
2901 RExC_seen_zerolen++; /* Do not optimize RE away */
2902 nextchar(pRExC_state);
2903 Set_Node_Length(ret, 2); /* MJD */
2906 ret = reg_node(pRExC_state, CANY);
2907 RExC_seen |= REG_SEEN_CANY;
2908 *flagp |= HASWIDTH|SIMPLE;
2909 nextchar(pRExC_state);
2910 Set_Node_Length(ret, 2); /* MJD */
2913 ret = reg_node(pRExC_state, CLUMP);
2915 nextchar(pRExC_state);
2916 Set_Node_Length(ret, 2); /* MJD */
2919 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2920 *flagp |= HASWIDTH|SIMPLE;
2921 nextchar(pRExC_state);
2922 Set_Node_Length(ret, 2); /* MJD */
2925 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2926 *flagp |= HASWIDTH|SIMPLE;
2927 nextchar(pRExC_state);
2928 Set_Node_Length(ret, 2); /* MJD */
2931 RExC_seen_zerolen++;
2932 RExC_seen |= REG_SEEN_LOOKBEHIND;
2933 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2939 RExC_seen_zerolen++;
2940 RExC_seen |= REG_SEEN_LOOKBEHIND;
2941 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2943 nextchar(pRExC_state);
2944 Set_Node_Length(ret, 2); /* MJD */
2947 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2948 *flagp |= HASWIDTH|SIMPLE;
2949 nextchar(pRExC_state);
2950 Set_Node_Length(ret, 2); /* MJD */
2953 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2954 *flagp |= HASWIDTH|SIMPLE;
2955 nextchar(pRExC_state);
2956 Set_Node_Length(ret, 2); /* MJD */
2959 ret = reg_node(pRExC_state, DIGIT);
2960 *flagp |= HASWIDTH|SIMPLE;
2961 nextchar(pRExC_state);
2962 Set_Node_Length(ret, 2); /* MJD */
2965 ret = reg_node(pRExC_state, NDIGIT);
2966 *flagp |= HASWIDTH|SIMPLE;
2967 nextchar(pRExC_state);
2968 Set_Node_Length(ret, 2); /* MJD */
2973 char* oldregxend = RExC_end;
2974 char* parse_start = RExC_parse - 2;
2976 if (RExC_parse[1] == '{') {
2977 /* a lovely hack--pretend we saw [\pX] instead */
2978 RExC_end = strchr(RExC_parse, '}');
2980 U8 c = (U8)*RExC_parse;
2982 RExC_end = oldregxend;
2983 vFAIL2("Missing right brace on \\%c{}", c);
2988 RExC_end = RExC_parse + 2;
2989 if (RExC_end > oldregxend)
2990 RExC_end = oldregxend;
2994 ret = regclass(pRExC_state);
2996 RExC_end = oldregxend;
2999 Set_Node_Offset(ret, parse_start + 2);
3000 Set_Node_Cur_Length(ret);
3001 nextchar(pRExC_state);
3002 *flagp |= HASWIDTH|SIMPLE;
3015 case '1': case '2': case '3': case '4':
3016 case '5': case '6': case '7': case '8': case '9':
3018 I32 num = atoi(RExC_parse);
3020 if (num > 9 && num >= RExC_npar)
3023 char * parse_start = RExC_parse - 1; /* MJD */
3024 while (isDIGIT(*RExC_parse))
3027 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3028 vFAIL("Reference to nonexistent group");
3030 ret = reganode(pRExC_state,
3031 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3035 /* override incorrect value set in reganode MJD */
3036 Set_Node_Offset(ret, parse_start+1);
3037 Set_Node_Cur_Length(ret); /* MJD */
3039 nextchar(pRExC_state);
3044 if (RExC_parse >= RExC_end)
3045 FAIL("Trailing \\");
3048 /* Do not generate `unrecognized' warnings here, we fall
3049 back into the quick-grab loop below */
3055 if (RExC_flags & PMf_EXTENDED) {
3056 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3057 if (RExC_parse < RExC_end)
3063 register STRLEN len;
3069 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3071 parse_start = RExC_parse - 1;
3077 ret = reg_node(pRExC_state,
3078 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3080 for (len = 0, p = RExC_parse - 1;
3081 len < 127 && p < RExC_end;
3086 if (RExC_flags & PMf_EXTENDED)
3087 p = regwhite(p, RExC_end);
3134 ender = ASCII_TO_NATIVE('\033');
3138 ender = ASCII_TO_NATIVE('\007');
3143 char* e = strchr(p, '}');
3147 vFAIL("Missing right brace on \\x{}");
3150 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3151 | PERL_SCAN_DISALLOW_PREFIX;
3153 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3156 /* numlen is generous */
3157 if (numlen + len >= 127) {
3165 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3167 ender = grok_hex(p, &numlen, &flags, NULL);
3173 ender = UCHARAT(p++);
3174 ender = toCTRL(ender);
3176 case '0': case '1': case '2': case '3':case '4':
3177 case '5': case '6': case '7': case '8':case '9':
3179 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3182 ender = grok_oct(p, &numlen, &flags, NULL);
3192 FAIL("Trailing \\");
3195 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3196 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3197 goto normal_default;
3202 if (UTF8_IS_START(*p) && UTF) {
3203 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3211 if (RExC_flags & PMf_EXTENDED)
3212 p = regwhite(p, RExC_end);
3214 /* Prime the casefolded buffer. */
3215 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3217 if (ISMULT2(p)) { /* Back off on ?+*. */
3224 /* Emit all the Unicode characters. */
3225 for (foldbuf = tmpbuf;
3227 foldlen -= numlen) {
3228 ender = utf8_to_uvchr(foldbuf, &numlen);
3230 reguni(pRExC_state, ender, s, &unilen);
3233 /* In EBCDIC the numlen
3234 * and unilen can differ. */
3236 if (numlen >= foldlen)
3240 break; /* "Can't happen." */
3244 reguni(pRExC_state, ender, s, &unilen);
3253 REGC((char)ender, s++);
3261 /* Emit all the Unicode characters. */
3262 for (foldbuf = tmpbuf;
3264 foldlen -= numlen) {
3265 ender = utf8_to_uvchr(foldbuf, &numlen);
3267 reguni(pRExC_state, ender, s, &unilen);
3270 /* In EBCDIC the numlen
3271 * and unilen can differ. */
3273 if (numlen >= foldlen)
3281 reguni(pRExC_state, ender, s, &unilen);
3290 REGC((char)ender, s++);
3294 Set_Node_Cur_Length(ret); /* MJD */
3295 nextchar(pRExC_state);
3297 /* len is STRLEN which is unsigned, need to copy to signed */
3300 vFAIL("Internal disaster");
3309 RExC_size += STR_SZ(len);
3311 RExC_emit += STR_SZ(len);
3316 /* If the encoding pragma is in effect recode the text of
3317 * any EXACT-kind nodes. */
3318 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3319 STRLEN oldlen = STR_LEN(ret);
3320 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3324 if (sv_utf8_downgrade(sv, TRUE)) {
3325 char *s = sv_recode_to_utf8(sv, PL_encoding);
3326 STRLEN newlen = SvCUR(sv);
3331 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3332 (int)oldlen, STRING(ret),
3334 Copy(s, STRING(ret), newlen, char);
3335 STR_LEN(ret) += newlen - oldlen;
3336 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3338 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3346 S_regwhite(pTHX_ char *p, char *e)
3351 else if (*p == '#') {
3354 } while (p < e && *p != '\n');
3362 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3363 Character classes ([:foo:]) can also be negated ([:^foo:]).
3364 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3365 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3366 but trigger failures because they are currently unimplemented. */
3368 #define POSIXCC_DONE(c) ((c) == ':')
3369 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3370 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3373 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3376 I32 namedclass = OOB_NAMEDCLASS;
3378 if (value == '[' && RExC_parse + 1 < RExC_end &&
3379 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3380 POSIXCC(UCHARAT(RExC_parse))) {
3381 char c = UCHARAT(RExC_parse);
3382 char* s = RExC_parse++;
3384 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3386 if (RExC_parse == RExC_end)
3387 /* Grandfather lone [:, [=, [. */
3390 char* t = RExC_parse++; /* skip over the c */
3392 if (UCHARAT(RExC_parse) == ']') {
3393 RExC_parse++; /* skip over the ending ] */
3396 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3397 I32 skip = 5; /* the most common skip */
3401 if (strnEQ(posixcc, "alnum", 5))
3403 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3404 else if (strnEQ(posixcc, "alpha", 5))
3406 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3407 else if (strnEQ(posixcc, "ascii", 5))
3409 complement ? ANYOF_NASCII : ANYOF_ASCII;
3412 if (strnEQ(posixcc, "blank", 5))
3414 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3417 if (strnEQ(posixcc, "cntrl", 5))
3419 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3422 if (strnEQ(posixcc, "digit", 5))
3424 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3427 if (strnEQ(posixcc, "graph", 5))
3429 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3432 if (strnEQ(posixcc, "lower", 5))
3434 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3437 if (strnEQ(posixcc, "print", 5))
3439 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3440 else if (strnEQ(posixcc, "punct", 5))
3442 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3445 if (strnEQ(posixcc, "space", 5))
3447 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3450 if (strnEQ(posixcc, "upper", 5))
3452 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3454 case 'w': /* this is not POSIX, this is the Perl \w */
3455 if (strnEQ(posixcc, "word", 4)) {
3457 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3462 if (strnEQ(posixcc, "xdigit", 6)) {
3464 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3469 if (namedclass == OOB_NAMEDCLASS ||
3470 posixcc[skip] != ':' ||
3471 posixcc[skip+1] != ']')
3473 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3476 } else if (!SIZE_ONLY) {
3477 /* [[=foo=]] and [[.foo.]] are still future. */
3479 /* adjust RExC_parse so the warning shows after
3481 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3483 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3486 /* Maternal grandfather:
3487 * "[:" ending in ":" but not in ":]" */
3497 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3499 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3500 char *s = RExC_parse;
3503 while(*s && isALNUM(*s))
3505 if (*s && c == *s && s[1] == ']') {
3506 if (ckWARN(WARN_REGEXP))
3508 "POSIX syntax [%c %c] belongs inside character classes",
3511 /* [[=foo=]] and [[.foo.]] are still future. */
3512 if (POSIXCC_NOTYET(c)) {
3513 /* adjust RExC_parse so the error shows after
3515 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3517 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3524 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3527 register UV nextvalue;
3528 register IV prevvalue = OOB_UNICODE;
3529 register IV range = 0;
3530 register regnode *ret;
3533 char *rangebegin = 0;
3534 bool need_class = 0;
3535 SV *listsv = Nullsv;
3538 bool optimize_invert = TRUE;
3539 AV* unicode_alternate = 0;
3541 UV literal_endpoint = 0;
3544 ret = reganode(pRExC_state, ANYOF, 0);
3547 ANYOF_FLAGS(ret) = 0;
3549 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3553 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3557 RExC_size += ANYOF_SKIP;
3559 RExC_emit += ANYOF_SKIP;
3561 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3563 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3564 ANYOF_BITMAP_ZERO(ret);
3565 listsv = newSVpvn("# comment\n", 10);
3568 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3570 if (!SIZE_ONLY && POSIXCC(nextvalue))
3571 checkposixcc(pRExC_state);
3573 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3574 if (UCHARAT(RExC_parse) == ']')
3577 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3581 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3584 rangebegin = RExC_parse;
3586 value = utf8n_to_uvchr((U8*)RExC_parse,
3587 RExC_end - RExC_parse,
3589 RExC_parse += numlen;
3592 value = UCHARAT(RExC_parse++);
3593 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3594 if (value == '[' && POSIXCC(nextvalue))
3595 namedclass = regpposixcc(pRExC_state, value);
3596 else if (value == '\\') {
3598 value = utf8n_to_uvchr((U8*)RExC_parse,
3599 RExC_end - RExC_parse,
3601 RExC_parse += numlen;
3604 value = UCHARAT(RExC_parse++);
3605 /* Some compilers cannot handle switching on 64-bit integer
3606 * values, therefore value cannot be an UV. Yes, this will
3607 * be a problem later if we want switch on Unicode.
3608 * A similar issue a little bit later when switching on
3609 * namedclass. --jhi */
3610 switch ((I32)value) {
3611 case 'w': namedclass = ANYOF_ALNUM; break;
3612 case 'W': namedclass = ANYOF_NALNUM; break;
3613 case 's': namedclass = ANYOF_SPACE; break;
3614 case 'S': namedclass = ANYOF_NSPACE; break;
3615 case 'd': namedclass = ANYOF_DIGIT; break;
3616 case 'D': namedclass = ANYOF_NDIGIT; break;
3619 if (RExC_parse >= RExC_end)
3620 vFAIL2("Empty \\%c{}", (U8)value);
3621 if (*RExC_parse == '{') {
3623 e = strchr(RExC_parse++, '}');
3625 vFAIL2("Missing right brace on \\%c{}", c);
3626 while (isSPACE(UCHARAT(RExC_parse)))
3628 if (e == RExC_parse)
3629 vFAIL2("Empty \\%c{}", c);
3631 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3639 if (UCHARAT(RExC_parse) == '^') {
3642 value = value == 'p' ? 'P' : 'p'; /* toggle */
3643 while (isSPACE(UCHARAT(RExC_parse))) {
3649 Perl_sv_catpvf(aTHX_ listsv,
3650 "+utf8::%.*s\n", (int)n, RExC_parse);
3652 Perl_sv_catpvf(aTHX_ listsv,
3653 "!utf8::%.*s\n", (int)n, RExC_parse);
3656 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3658 case 'n': value = '\n'; break;
3659 case 'r': value = '\r'; break;
3660 case 't': value = '\t'; break;
3661 case 'f': value = '\f'; break;
3662 case 'b': value = '\b'; break;
3663 case 'e': value = ASCII_TO_NATIVE('\033');break;
3664 case 'a': value = ASCII_TO_NATIVE('\007');break;
3666 if (*RExC_parse == '{') {
3667 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3668 | PERL_SCAN_DISALLOW_PREFIX;
3669 e = strchr(RExC_parse++, '}');
3671 vFAIL("Missing right brace on \\x{}");
3673 numlen = e - RExC_parse;
3674 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3678 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3680 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3681 RExC_parse += numlen;
3685 value = UCHARAT(RExC_parse++);
3686 value = toCTRL(value);
3688 case '0': case '1': case '2': case '3': case '4':
3689 case '5': case '6': case '7': case '8': case '9':
3693 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3694 RExC_parse += numlen;
3698 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3700 "Unrecognized escape \\%c in character class passed through",
3704 } /* end of \blah */
3710 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3712 if (!SIZE_ONLY && !need_class)
3713 ANYOF_CLASS_ZERO(ret);
3717 /* a bad range like a-\d, a-[:digit:] ? */
3720 if (ckWARN(WARN_REGEXP))
3722 "False [] range \"%*.*s\"",
3723 RExC_parse - rangebegin,
3724 RExC_parse - rangebegin,
3726 if (prevvalue < 256) {
3727 ANYOF_BITMAP_SET(ret, prevvalue);
3728 ANYOF_BITMAP_SET(ret, '-');
3731 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3732 Perl_sv_catpvf(aTHX_ listsv,
3733 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3737 range = 0; /* this was not a true range */
3741 if (namedclass > OOB_NAMEDCLASS)
3742 optimize_invert = FALSE;
3743 /* Possible truncation here but in some 64-bit environments
3744 * the compiler gets heartburn about switch on 64-bit values.
3745 * A similar issue a little earlier when switching on value.
3747 switch ((I32)namedclass) {
3750 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3752 for (value = 0; value < 256; value++)
3754 ANYOF_BITMAP_SET(ret, value);
3756 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3760 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3762 for (value = 0; value < 256; value++)
3763 if (!isALNUM(value))
3764 ANYOF_BITMAP_SET(ret, value);
3766 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3770 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3772 for (value = 0; value < 256; value++)
3773 if (isALNUMC(value))
3774 ANYOF_BITMAP_SET(ret, value);
3776 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3780 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3782 for (value = 0; value < 256; value++)
3783 if (!isALNUMC(value))
3784 ANYOF_BITMAP_SET(ret, value);
3786 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3790 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3792 for (value = 0; value < 256; value++)
3794 ANYOF_BITMAP_SET(ret, value);
3796 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3800 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3802 for (value = 0; value < 256; value++)
3803 if (!isALPHA(value))
3804 ANYOF_BITMAP_SET(ret, value);
3806 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3810 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3813 for (value = 0; value < 128; value++)
3814 ANYOF_BITMAP_SET(ret, value);
3816 for (value = 0; value < 256; value++) {
3818 ANYOF_BITMAP_SET(ret, value);
3822 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3826 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3829 for (value = 128; value < 256; value++)
3830 ANYOF_BITMAP_SET(ret, value);
3832 for (value = 0; value < 256; value++) {
3833 if (!isASCII(value))
3834 ANYOF_BITMAP_SET(ret, value);
3838 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3842 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3844 for (value = 0; value < 256; value++)
3846 ANYOF_BITMAP_SET(ret, value);
3848 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3852 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3854 for (value = 0; value < 256; value++)
3855 if (!isBLANK(value))
3856 ANYOF_BITMAP_SET(ret, value);
3858 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3862 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3864 for (value = 0; value < 256; value++)
3866 ANYOF_BITMAP_SET(ret, value);
3868 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3872 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3874 for (value = 0; value < 256; value++)
3875 if (!isCNTRL(value))
3876 ANYOF_BITMAP_SET(ret, value);
3878 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3882 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3884 /* consecutive digits assumed */
3885 for (value = '0'; value <= '9'; value++)
3886 ANYOF_BITMAP_SET(ret, value);
3888 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3892 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3894 /* consecutive digits assumed */
3895 for (value = 0; value < '0'; value++)
3896 ANYOF_BITMAP_SET(ret, value);
3897 for (value = '9' + 1; value < 256; value++)
3898 ANYOF_BITMAP_SET(ret, value);
3900 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3904 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3906 for (value = 0; value < 256; value++)
3908 ANYOF_BITMAP_SET(ret, value);
3910 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3914 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3916 for (value = 0; value < 256; value++)
3917 if (!isGRAPH(value))
3918 ANYOF_BITMAP_SET(ret, value);
3920 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3924 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3926 for (value = 0; value < 256; value++)
3928 ANYOF_BITMAP_SET(ret, value);
3930 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3934 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3936 for (value = 0; value < 256; value++)
3937 if (!isLOWER(value))
3938 ANYOF_BITMAP_SET(ret, value);
3940 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3944 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3946 for (value = 0; value < 256; value++)
3948 ANYOF_BITMAP_SET(ret, value);
3950 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3954 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3956 for (value = 0; value < 256; value++)
3957 if (!isPRINT(value))
3958 ANYOF_BITMAP_SET(ret, value);
3960 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3964 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3966 for (value = 0; value < 256; value++)
3967 if (isPSXSPC(value))
3968 ANYOF_BITMAP_SET(ret, value);
3970 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3974 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3976 for (value = 0; value < 256; value++)
3977 if (!isPSXSPC(value))
3978 ANYOF_BITMAP_SET(ret, value);
3980 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3984 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3986 for (value = 0; value < 256; value++)
3988 ANYOF_BITMAP_SET(ret, value);
3990 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3994 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3996 for (value = 0; value < 256; value++)
3997 if (!isPUNCT(value))
3998 ANYOF_BITMAP_SET(ret, value);
4000 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4004 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4006 for (value = 0; value < 256; value++)
4008 ANYOF_BITMAP_SET(ret, value);
4010 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4014 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4016 for (value = 0; value < 256; value++)
4017 if (!isSPACE(value))
4018 ANYOF_BITMAP_SET(ret, value);
4020 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4024 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4026 for (value = 0; value < 256; value++)
4028 ANYOF_BITMAP_SET(ret, value);
4030 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4034 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4036 for (value = 0; value < 256; value++)
4037 if (!isUPPER(value))
4038 ANYOF_BITMAP_SET(ret, value);
4040 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4044 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4046 for (value = 0; value < 256; value++)
4047 if (isXDIGIT(value))
4048 ANYOF_BITMAP_SET(ret, value);
4050 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4054 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4056 for (value = 0; value < 256; value++)
4057 if (!isXDIGIT(value))
4058 ANYOF_BITMAP_SET(ret, value);
4060 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4063 vFAIL("Invalid [::] class");
4067 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4070 } /* end of namedclass \blah */
4073 if (prevvalue > (IV)value) /* b-a */ {
4074 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4075 RExC_parse - rangebegin,
4076 RExC_parse - rangebegin,
4078 range = 0; /* not a valid range */
4082 prevvalue = value; /* save the beginning of the range */
4083 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4084 RExC_parse[1] != ']') {
4087 /* a bad range like \w-, [:word:]- ? */
4088 if (namedclass > OOB_NAMEDCLASS) {
4089 if (ckWARN(WARN_REGEXP))
4091 "False [] range \"%*.*s\"",
4092 RExC_parse - rangebegin,
4093 RExC_parse - rangebegin,
4096 ANYOF_BITMAP_SET(ret, '-');
4098 range = 1; /* yeah, it's a range! */
4099 continue; /* but do it the next time */
4103 /* now is the next time */
4107 if (prevvalue < 256) {
4108 IV ceilvalue = value < 256 ? value : 255;
4111 /* In EBCDIC [\x89-\x91] should include
4112 * the \x8e but [i-j] should not. */
4113 if (literal_endpoint == 2 &&
4114 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4115 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4117 if (isLOWER(prevvalue)) {
4118 for (i = prevvalue; i <= ceilvalue; i++)
4120 ANYOF_BITMAP_SET(ret, i);
4122 for (i = prevvalue; i <= ceilvalue; i++)
4124 ANYOF_BITMAP_SET(ret, i);
4129 for (i = prevvalue; i <= ceilvalue; i++)
4130 ANYOF_BITMAP_SET(ret, i);
4132 if (value > 255 || UTF) {
4133 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4134 UV natvalue = NATIVE_TO_UNI(value);
4136 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4137 if (prevnatvalue < natvalue) { /* what about > ? */
4138 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4139 prevnatvalue, natvalue);
4141 else if (prevnatvalue == natvalue) {
4142 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4144 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4146 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4148 /* If folding and foldable and a single
4149 * character, insert also the folded version
4150 * to the charclass. */
4152 if (foldlen == (STRLEN)UNISKIP(f))
4153 Perl_sv_catpvf(aTHX_ listsv,
4156 /* Any multicharacter foldings
4157 * require the following transform:
4158 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4159 * where E folds into "pq" and F folds
4160 * into "rst", all other characters
4161 * fold to single characters. We save
4162 * away these multicharacter foldings,
4163 * to be later saved as part of the
4164 * additional "s" data. */
4167 if (!unicode_alternate)
4168 unicode_alternate = newAV();
4169 sv = newSVpvn((char*)foldbuf, foldlen);
4171 av_push(unicode_alternate, sv);
4175 /* If folding and the value is one of the Greek
4176 * sigmas insert a few more sigmas to make the
4177 * folding rules of the sigmas to work right.
4178 * Note that not all the possible combinations
4179 * are handled here: some of them are handled
4180 * by the standard folding rules, and some of
4181 * them (literal or EXACTF cases) are handled
4182 * during runtime in regexec.c:S_find_byclass(). */
4183 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4184 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4185 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4186 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4187 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4189 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4190 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4191 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4196 literal_endpoint = 0;
4200 range = 0; /* this range (if it was one) is done now */
4204 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4206 RExC_size += ANYOF_CLASS_ADD_SKIP;
4208 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4211 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4213 /* If the only flag is folding (plus possibly inversion). */
4214 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4216 for (value = 0; value < 256; ++value) {
4217 if (ANYOF_BITMAP_TEST(ret, value)) {
4218 UV fold = PL_fold[value];
4221 ANYOF_BITMAP_SET(ret, fold);
4224 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4227 /* optimize inverted simple patterns (e.g. [^a-z]) */
4228 if (!SIZE_ONLY && optimize_invert &&
4229 /* If the only flag is inversion. */
4230 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4231 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4232 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4233 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4240 /* The 0th element stores the character class description
4241 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4242 * to initialize the appropriate swash (which gets stored in
4243 * the 1st element), and also useful for dumping the regnode.
4244 * The 2nd element stores the multicharacter foldings,
4245 * used later (regexec.c:S_reginclass()). */
4246 av_store(av, 0, listsv);
4247 av_store(av, 1, NULL);
4248 av_store(av, 2, (SV*)unicode_alternate);
4249 rv = newRV_noinc((SV*)av);
4250 n = add_data(pRExC_state, 1, "s");
4251 RExC_rx->data->data[n] = (void*)rv;
4259 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4261 char* retval = RExC_parse++;
4264 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4265 RExC_parse[2] == '#') {
4266 while (*RExC_parse && *RExC_parse != ')')
4271 if (RExC_flags & PMf_EXTENDED) {
4272 if (isSPACE(*RExC_parse)) {
4276 else if (*RExC_parse == '#') {
4277 while (*RExC_parse && *RExC_parse != '\n')
4288 - reg_node - emit a node
4290 STATIC regnode * /* Location. */
4291 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4293 register regnode *ret;
4294 register regnode *ptr;
4298 SIZE_ALIGN(RExC_size);
4303 NODE_ALIGN_FILL(ret);
4305 FILL_ADVANCE_NODE(ptr, op);
4306 if (RExC_offsets) { /* MJD */
4307 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4308 "reg_node", __LINE__,
4310 RExC_emit - RExC_emit_start > RExC_offsets[0]
4311 ? "Overwriting end of array!\n" : "OK",
4312 RExC_emit - RExC_emit_start,
4313 RExC_parse - RExC_start,
4315 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4324 - reganode - emit a node with an argument
4326 STATIC regnode * /* Location. */
4327 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4329 register regnode *ret;
4330 register regnode *ptr;
4334 SIZE_ALIGN(RExC_size);
4339 NODE_ALIGN_FILL(ret);
4341 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4342 if (RExC_offsets) { /* MJD */
4343 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4347 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4348 "Overwriting end of array!\n" : "OK",
4349 RExC_emit - RExC_emit_start,
4350 RExC_parse - RExC_start,
4352 Set_Cur_Node_Offset;
4361 - reguni - emit (if appropriate) a Unicode character
4364 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4366 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4370 - reginsert - insert an operator in front of already-emitted operand
4372 * Means relocating the operand.
4375 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4377 register regnode *src;
4378 register regnode *dst;
4379 register regnode *place;
4380 register int offset = regarglen[(U8)op];
4382 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4385 RExC_size += NODE_STEP_REGNODE + offset;
4390 RExC_emit += NODE_STEP_REGNODE + offset;
4392 while (src > opnd) {
4393 StructCopy(--src, --dst, regnode);
4394 if (RExC_offsets) { /* MJD 20010112 */
4395 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4399 dst - RExC_emit_start > RExC_offsets[0]
4400 ? "Overwriting end of array!\n" : "OK",
4401 src - RExC_emit_start,
4402 dst - RExC_emit_start,
4404 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4405 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4410 place = opnd; /* Op node, where operand used to be. */
4411 if (RExC_offsets) { /* MJD */
4412 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4416 place - RExC_emit_start > RExC_offsets[0]
4417 ? "Overwriting end of array!\n" : "OK",
4418 place - RExC_emit_start,
4419 RExC_parse - RExC_start,
4421 Set_Node_Offset(place, RExC_parse);
4423 src = NEXTOPER(place);
4424 FILL_ADVANCE_NODE(place, op);
4425 Zero(src, offset, regnode);
4429 - regtail - set the next-pointer at the end of a node chain of p to val.
4432 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4434 register regnode *scan;
4435 register regnode *temp;
4440 /* Find last node. */
4443 temp = regnext(scan);
4449 if (reg_off_by_arg[OP(scan)]) {
4450 ARG_SET(scan, val - scan);
4453 NEXT_OFF(scan) = val - scan;
4458 - regoptail - regtail on operand of first argument; nop if operandless
4461 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4463 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4464 if (p == NULL || SIZE_ONLY)
4466 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4467 regtail(pRExC_state, NEXTOPER(p), val);
4469 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4470 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4477 - regcurly - a little FSA that accepts {\d+,?\d*}
4480 S_regcurly(pTHX_ register char *s)
4501 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4503 register U8 op = EXACT; /* Arbitrary non-END op. */
4504 register regnode *next;
4506 while (op != END && (!last || node < last)) {
4507 /* While that wasn't END last time... */
4513 next = regnext(node);
4515 if (OP(node) == OPTIMIZED)
4518 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4519 (int)(2*l + 1), "", SvPVX(sv));
4520 if (next == NULL) /* Next ptr. */
4521 PerlIO_printf(Perl_debug_log, "(0)");
4523 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4524 (void)PerlIO_putc(Perl_debug_log, '\n');
4526 if (PL_regkind[(U8)op] == BRANCHJ) {
4527 register regnode *nnode = (OP(next) == LONGJMP
4530 if (last && nnode > last)
4532 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4534 else if (PL_regkind[(U8)op] == BRANCH) {
4535 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4537 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4538 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4539 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4541 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4542 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4545 else if ( op == PLUS || op == STAR) {
4546 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4548 else if (op == ANYOF) {
4549 /* arglen 1 + class block */
4550 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4551 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4552 node = NEXTOPER(node);
4554 else if (PL_regkind[(U8)op] == EXACT) {
4555 /* Literal string, where present. */
4556 node += NODE_SZ_STR(node) - 1;
4557 node = NEXTOPER(node);
4560 node = NEXTOPER(node);
4561 node += regarglen[(U8)op];
4563 if (op == CURLYX || op == OPEN)
4565 else if (op == WHILEM)
4571 #endif /* DEBUGGING */
4574 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4577 Perl_regdump(pTHX_ regexp *r)
4580 SV *sv = sv_newmortal();
4582 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4584 /* Header fields of interest. */
4585 if (r->anchored_substr)
4586 PerlIO_printf(Perl_debug_log,
4587 "anchored `%s%.*s%s'%s at %"IVdf" ",
4589 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4590 SvPVX(r->anchored_substr),
4592 SvTAIL(r->anchored_substr) ? "$" : "",
4593 (IV)r->anchored_offset);
4594 else if (r->anchored_utf8)
4595 PerlIO_printf(Perl_debug_log,
4596 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4598 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4599 SvPVX(r->anchored_utf8),
4601 SvTAIL(r->anchored_utf8) ? "$" : "",
4602 (IV)r->anchored_offset);
4603 if (r->float_substr)
4604 PerlIO_printf(Perl_debug_log,
4605 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4607 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4608 SvPVX(r->float_substr),
4610 SvTAIL(r->float_substr) ? "$" : "",
4611 (IV)r->float_min_offset, (UV)r->float_max_offset);
4612 else if (r->float_utf8)
4613 PerlIO_printf(Perl_debug_log,
4614 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4616 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4617 SvPVX(r->float_utf8),
4619 SvTAIL(r->float_utf8) ? "$" : "",
4620 (IV)r->float_min_offset, (UV)r->float_max_offset);
4621 if (r->check_substr || r->check_utf8)
4622 PerlIO_printf(Perl_debug_log,
4623 r->check_substr == r->float_substr
4624 && r->check_utf8 == r->float_utf8
4625 ? "(checking floating" : "(checking anchored");
4626 if (r->reganch & ROPT_NOSCAN)
4627 PerlIO_printf(Perl_debug_log, " noscan");
4628 if (r->reganch & ROPT_CHECK_ALL)
4629 PerlIO_printf(Perl_debug_log, " isall");
4630 if (r->check_substr || r->check_utf8)
4631 PerlIO_printf(Perl_debug_log, ") ");
4633 if (r->regstclass) {
4634 regprop(sv, r->regstclass);
4635 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4637 if (r->reganch & ROPT_ANCH) {
4638 PerlIO_printf(Perl_debug_log, "anchored");
4639 if (r->reganch & ROPT_ANCH_BOL)
4640 PerlIO_printf(Perl_debug_log, "(BOL)");
4641 if (r->reganch & ROPT_ANCH_MBOL)
4642 PerlIO_printf(Perl_debug_log, "(MBOL)");
4643 if (r->reganch & ROPT_ANCH_SBOL)
4644 PerlIO_printf(Perl_debug_log, "(SBOL)");
4645 if (r->reganch & ROPT_ANCH_GPOS)
4646 PerlIO_printf(Perl_debug_log, "(GPOS)");
4647 PerlIO_putc(Perl_debug_log, ' ');
4649 if (r->reganch & ROPT_GPOS_SEEN)
4650 PerlIO_printf(Perl_debug_log, "GPOS ");
4651 if (r->reganch & ROPT_SKIP)
4652 PerlIO_printf(Perl_debug_log, "plus ");
4653 if (r->reganch & ROPT_IMPLICIT)
4654 PerlIO_printf(Perl_debug_log, "implicit ");
4655 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4656 if (r->reganch & ROPT_EVAL_SEEN)
4657 PerlIO_printf(Perl_debug_log, "with eval ");
4658 PerlIO_printf(Perl_debug_log, "\n");
4661 U32 len = r->offsets[0];
4662 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4663 for (i = 1; i <= len; i++)
4664 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4665 (UV)r->offsets[i*2-1],
4666 (UV)r->offsets[i*2]);
4667 PerlIO_printf(Perl_debug_log, "\n");
4669 #endif /* DEBUGGING */
4675 S_put_byte(pTHX_ SV *sv, int c)
4677 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4678 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4679 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4680 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4682 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4685 #endif /* DEBUGGING */
4688 - regprop - printable representation of opcode
4691 Perl_regprop(pTHX_ SV *sv, regnode *o)
4696 sv_setpvn(sv, "", 0);
4697 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4698 /* It would be nice to FAIL() here, but this may be called from
4699 regexec.c, and it would be hard to supply pRExC_state. */
4700 Perl_croak(aTHX_ "Corrupted regexp opcode");
4701 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4703 k = PL_regkind[(U8)OP(o)];
4706 SV *dsv = sv_2mortal(newSVpvn("", 0));
4707 /* Using is_utf8_string() is a crude hack but it may
4708 * be the best for now since we have no flag "this EXACTish
4709 * node was UTF-8" --jhi */
4710 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4712 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4713 UNI_DISPLAY_REGEX) :
4718 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4723 else if (k == CURLY) {
4724 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4725 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4726 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4728 else if (k == WHILEM && o->flags) /* Ordinal/of */
4729 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4730 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4731 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4732 else if (k == LOGICAL)
4733 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4734 else if (k == ANYOF) {
4735 int i, rangestart = -1;
4736 U8 flags = ANYOF_FLAGS(o);
4737 const char * const anyofs[] = { /* Should be synchronized with
4738 * ANYOF_ #xdefines in regcomp.h */
4771 if (flags & ANYOF_LOCALE)
4772 sv_catpv(sv, "{loc}");
4773 if (flags & ANYOF_FOLD)
4774 sv_catpv(sv, "{i}");
4775 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4776 if (flags & ANYOF_INVERT)
4778 for (i = 0; i <= 256; i++) {
4779 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4780 if (rangestart == -1)
4782 } else if (rangestart != -1) {
4783 if (i <= rangestart + 3)
4784 for (; rangestart < i; rangestart++)
4785 put_byte(sv, rangestart);
4787 put_byte(sv, rangestart);
4789 put_byte(sv, i - 1);
4795 if (o->flags & ANYOF_CLASS)
4796 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4797 if (ANYOF_CLASS_TEST(o,i))
4798 sv_catpv(sv, anyofs[i]);
4800 if (flags & ANYOF_UNICODE)
4801 sv_catpv(sv, "{unicode}");
4802 else if (flags & ANYOF_UNICODE_ALL)
4803 sv_catpv(sv, "{unicode_all}");
4807 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4811 U8 s[UTF8_MAXLEN+1];
4813 for (i = 0; i <= 256; i++) { /* just the first 256 */
4814 U8 *e = uvchr_to_utf8(s, i);
4816 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4817 if (rangestart == -1)
4819 } else if (rangestart != -1) {
4822 if (i <= rangestart + 3)
4823 for (; rangestart < i; rangestart++) {
4824 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4828 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4831 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4838 sv_catpv(sv, "..."); /* et cetera */
4842 char *s = savepv(SvPVX(lv));
4845 while(*s && *s != '\n') s++;
4866 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4868 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4869 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4870 #endif /* DEBUGGING */
4874 Perl_re_intuit_string(pTHX_ regexp *prog)
4875 { /* Assume that RE_INTUIT is set */
4878 char *s = SvPV(prog->check_substr
4879 ? prog->check_substr : prog->check_utf8, n_a);
4881 if (!PL_colorset) reginitcolors();
4882 PerlIO_printf(Perl_debug_log,
4883 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4885 prog->check_substr ? "" : "utf8 ",
4886 PL_colors[5],PL_colors[0],
4889 (strlen(s) > 60 ? "..." : ""));
4892 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4896 Perl_pregfree(pTHX_ struct regexp *r)
4899 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4902 if (!r || (--r->refcnt > 0))
4908 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4909 r->prelen, 60, UNI_DISPLAY_REGEX)
4910 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4914 PerlIO_printf(Perl_debug_log,
4915 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4916 PL_colors[4],PL_colors[5],PL_colors[0],
4919 len > 60 ? "..." : "");
4923 Safefree(r->precomp);
4924 if (r->offsets) /* 20010421 MJD */
4925 Safefree(r->offsets);
4926 if (RX_MATCH_COPIED(r))
4927 Safefree(r->subbeg);
4929 if (r->anchored_substr)
4930 SvREFCNT_dec(r->anchored_substr);
4931 if (r->anchored_utf8)
4932 SvREFCNT_dec(r->anchored_utf8);
4933 if (r->float_substr)
4934 SvREFCNT_dec(r->float_substr);
4936 SvREFCNT_dec(r->float_utf8);
4937 Safefree(r->substrs);
4940 int n = r->data->count;
4941 PAD* new_comppad = NULL;
4945 /* If you add a ->what type here, update the comment in regcomp.h */
4946 switch (r->data->what[n]) {
4948 SvREFCNT_dec((SV*)r->data->data[n]);
4951 Safefree(r->data->data[n]);
4954 new_comppad = (AV*)r->data->data[n];
4957 if (new_comppad == NULL)
4958 Perl_croak(aTHX_ "panic: pregfree comppad");
4959 PAD_SAVE_LOCAL(old_comppad,
4960 /* Watch out for global destruction's random ordering. */
4961 (SvTYPE(new_comppad) == SVt_PVAV) ?
4962 new_comppad : Null(PAD *)
4964 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4965 op_free((OP_4tree*)r->data->data[n]);
4968 PAD_RESTORE_LOCAL(old_comppad);
4969 SvREFCNT_dec((SV*)new_comppad);
4975 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4978 Safefree(r->data->what);
4981 Safefree(r->startp);
4987 - regnext - dig the "next" pointer out of a node
4989 * [Note, when REGALIGN is defined there are two places in regmatch()
4990 * that bypass this code for speed.]
4993 Perl_regnext(pTHX_ register regnode *p)
4995 register I32 offset;
4997 if (p == &PL_regdummy)
5000 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5008 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5011 STRLEN l1 = strlen(pat1);
5012 STRLEN l2 = strlen(pat2);
5021 Copy(pat1, buf, l1 , char);
5022 Copy(pat2, buf + l1, l2 , char);
5023 buf[l1 + l2] = '\n';
5024 buf[l1 + l2 + 1] = '\0';
5026 /* ANSI variant takes additional second argument */
5027 va_start(args, pat2);
5031 msv = vmess(buf, &args);
5033 message = SvPV(msv,l1);
5036 Copy(message, buf, l1 , char);
5037 buf[l1] = '\0'; /* Overwrite \n */
5038 Perl_croak(aTHX_ "%s", buf);
5041 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5044 Perl_save_re_context(pTHX)
5046 SAVEI32(PL_reg_flags); /* from regexec.c */
5048 SAVEPPTR(PL_reginput); /* String-input pointer. */
5049 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5050 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5051 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5052 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5053 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5054 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5055 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5056 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5057 PL_reg_start_tmp = 0;
5058 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5059 PL_reg_start_tmpl = 0;
5060 SAVEVPTR(PL_regdata);
5061 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5062 SAVEI32(PL_regnarrate); /* from regexec.c */
5063 SAVEVPTR(PL_regprogram); /* from regexec.c */
5064 SAVEINT(PL_regindent); /* from regexec.c */
5065 SAVEVPTR(PL_regcc); /* from regexec.c */
5066 SAVEVPTR(PL_curcop);
5067 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5068 SAVEVPTR(PL_reg_re); /* from regexec.c */
5069 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5070 SAVESPTR(PL_reg_sv); /* from regexec.c */
5071 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5072 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5073 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5074 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5075 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5076 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5077 PL_reg_oldsaved = Nullch;
5078 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5079 PL_reg_oldsavedlen = 0;
5080 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5082 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5083 PL_reg_leftiter = 0;
5084 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5085 PL_reg_poscache = Nullch;
5086 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5087 PL_reg_poscache_size = 0;
5088 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5089 SAVEI32(PL_regnpar); /* () count. */
5090 SAVEI32(PL_regsize); /* from regexec.c */
5093 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5099 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5100 for (i = 1; i <= rx->nparens; i++) {
5101 sprintf(digits, "%lu", (long)i);
5102 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5109 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5114 clear_re(pTHX_ void *r)
5116 ReREFCNT_dec((regexp *)r);