5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
9 * confused with the original package (see point 3 below). Thanks, Henry!
12 /* Additional note: this code is very heavily munged from Henry's version
13 * in places. In some spots I've traded clarity for efficiency, so don't
14 * blame Henry for some of the lack of readability.
17 /* The names of the functions have been changed from regcomp and
18 * regexec to pregcomp and pregexec in order to avoid conflicts
19 * with the POSIX routines of the same names.
22 #ifdef PERL_EXT_RE_BUILD
23 /* need to replace pregcomp et al, so enable that */
24 # ifndef PERL_IN_XSUB_RE
25 # define PERL_IN_XSUB_RE
27 /* need access to debugger hooks */
28 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
33 #ifdef PERL_IN_XSUB_RE
34 /* We *really* need to overwrite these symbols: */
35 # define Perl_pregcomp my_regcomp
36 # define Perl_regdump my_regdump
37 # define Perl_regprop my_regprop
38 # define Perl_pregfree my_regfree
39 # define Perl_re_intuit_string my_re_intuit_string
40 /* *These* symbols are masked to allow static link. */
41 # define Perl_regnext my_regnext
42 # define Perl_save_re_context my_save_re_context
43 # define Perl_reginitcolors my_reginitcolors
45 # define PERL_NO_GET_CONTEXT
50 * pregcomp and pregexec -- regsub and regerror are not used in perl
52 * Copyright (c) 1986 by University of Toronto.
53 * Written by Henry Spencer. Not derived from licensed software.
55 * Permission is granted to anyone to use this software for any
56 * purpose on any computer system, and to redistribute it freely,
57 * subject to the following restrictions:
59 * 1. The author is not responsible for the consequences of use of
60 * this software, no matter how awful, even if they arise
63 * 2. The origin of this software must not be misrepresented, either
64 * by explicit claim or by omission.
66 * 3. Altered versions must be plainly marked as such, and must not
67 * be misrepresented as being the original software.
70 **** Alterations to Henry's code are...
72 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
73 **** 2000, 2001, 2002, 2003, by Larry Wall and others
75 **** You may distribute under the terms of either the GNU General Public
76 **** License or the Artistic License, as specified in the README file.
79 * Beware that some of this code is subtly aware of the way operator
80 * precedence is structured in regular expressions. Serious changes in
81 * regular-expression syntax might require a total rethink.
84 #define PERL_IN_REGCOMP_C
87 #ifndef PERL_IN_XSUB_RE
99 # if defined(BUGGY_MSC6)
100 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
101 # pragma optimize("a",off)
102 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
103 # pragma optimize("w",on )
104 # endif /* BUGGY_MSC6 */
108 #define STATIC static
111 typedef struct RExC_state_t {
112 U32 flags; /* are we folding, multilining? */
113 char *precomp; /* uncompiled string. */
115 char *start; /* Start of input for compile */
116 char *end; /* End of input for compile */
117 char *parse; /* Input-scan pointer. */
118 I32 whilem_seen; /* number of WHILEM in this expr */
119 regnode *emit_start; /* Start of emitted-code area */
120 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
124 I32 size; /* Code size. */
125 I32 npar; /* () count. */
131 char *starttry; /* -Dr: where regtry was called. */
132 #define RExC_starttry (pRExC_state->starttry)
136 #define RExC_flags (pRExC_state->flags)
137 #define RExC_precomp (pRExC_state->precomp)
138 #define RExC_rx (pRExC_state->rx)
139 #define RExC_start (pRExC_state->start)
140 #define RExC_end (pRExC_state->end)
141 #define RExC_parse (pRExC_state->parse)
142 #define RExC_whilem_seen (pRExC_state->whilem_seen)
143 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
144 #define RExC_emit (pRExC_state->emit)
145 #define RExC_emit_start (pRExC_state->emit_start)
146 #define RExC_naughty (pRExC_state->naughty)
147 #define RExC_sawback (pRExC_state->sawback)
148 #define RExC_seen (pRExC_state->seen)
149 #define RExC_size (pRExC_state->size)
150 #define RExC_npar (pRExC_state->npar)
151 #define RExC_extralen (pRExC_state->extralen)
152 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
153 #define RExC_seen_evals (pRExC_state->seen_evals)
154 #define RExC_utf8 (pRExC_state->utf8)
156 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
157 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
158 ((*s) == '{' && regcurly(s)))
161 #undef SPSTART /* dratted cpp namespace... */
164 * Flags to be passed up and down.
166 #define WORST 0 /* Worst case. */
167 #define HASWIDTH 0x1 /* Known to match non-null strings. */
168 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
169 #define SPSTART 0x4 /* Starts with * or +. */
170 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
172 /* Length of a variant. */
174 typedef struct scan_data_t {
180 I32 last_end; /* min value, <0 unless valid. */
183 SV **longest; /* Either &l_fixed, or &l_float. */
187 I32 offset_float_min;
188 I32 offset_float_max;
192 struct regnode_charclass_class *start_class;
196 * Forward declarations for pregcomp()'s friends.
199 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
203 #define SF_BEFORE_SEOL 0x1
204 #define SF_BEFORE_MEOL 0x2
205 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
206 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
209 # define SF_FIX_SHIFT_EOL (0+2)
210 # define SF_FL_SHIFT_EOL (0+4)
212 # define SF_FIX_SHIFT_EOL (+2)
213 # define SF_FL_SHIFT_EOL (+4)
216 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
217 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
219 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
220 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
221 #define SF_IS_INF 0x40
222 #define SF_HAS_PAR 0x80
223 #define SF_IN_PAR 0x100
224 #define SF_HAS_EVAL 0x200
225 #define SCF_DO_SUBSTR 0x400
226 #define SCF_DO_STCLASS_AND 0x0800
227 #define SCF_DO_STCLASS_OR 0x1000
228 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
229 #define SCF_WHILEM_VISITED_POS 0x2000
231 #define UTF (RExC_utf8 != 0)
232 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
233 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
235 #define OOB_UNICODE 12345678
236 #define OOB_NAMEDCLASS -1
238 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
239 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
242 /* length of regex to show in messages that don't mark a position within */
243 #define RegexLengthToShowInErrorMessages 127
246 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
247 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
248 * op/pragma/warn/regcomp.
250 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
251 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
253 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
256 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
257 * arg. Show regex, up to a maximum length. If it's too long, chop and add
260 #define FAIL(msg) STMT_START { \
261 char *ellipses = ""; \
262 IV len = RExC_end - RExC_precomp; \
265 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
266 if (len > RegexLengthToShowInErrorMessages) { \
267 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
268 len = RegexLengthToShowInErrorMessages - 10; \
271 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
272 msg, (int)len, RExC_precomp, ellipses); \
276 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
277 * args. Show regex, up to a maximum length. If it's too long, chop and add
280 #define FAIL2(pat,msg) STMT_START { \
281 char *ellipses = ""; \
282 IV len = RExC_end - RExC_precomp; \
285 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
286 if (len > RegexLengthToShowInErrorMessages) { \
287 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
288 len = RegexLengthToShowInErrorMessages - 10; \
291 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
292 msg, (int)len, RExC_precomp, ellipses); \
297 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
299 #define Simple_vFAIL(m) STMT_START { \
300 IV offset = RExC_parse - RExC_precomp; \
301 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
302 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
306 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
308 #define vFAIL(m) STMT_START { \
310 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
315 * Like Simple_vFAIL(), but accepts two arguments.
317 #define Simple_vFAIL2(m,a1) STMT_START { \
318 IV offset = RExC_parse - RExC_precomp; \
319 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
320 (int)offset, RExC_precomp, RExC_precomp + offset); \
324 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
326 #define vFAIL2(m,a1) STMT_START { \
328 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
329 Simple_vFAIL2(m, a1); \
334 * Like Simple_vFAIL(), but accepts three arguments.
336 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
337 IV offset = RExC_parse - RExC_precomp; \
338 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
339 (int)offset, RExC_precomp, RExC_precomp + offset); \
343 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
345 #define vFAIL3(m,a1,a2) STMT_START { \
347 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
348 Simple_vFAIL3(m, a1, a2); \
352 * Like Simple_vFAIL(), but accepts four arguments.
354 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
355 IV offset = RExC_parse - RExC_precomp; \
356 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
357 (int)offset, RExC_precomp, RExC_precomp + offset); \
361 * Like Simple_vFAIL(), but accepts five arguments.
363 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
364 IV offset = RExC_parse - RExC_precomp; \
365 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
366 (int)offset, RExC_precomp, RExC_precomp + offset); \
370 #define vWARN(loc,m) STMT_START { \
371 IV offset = loc - RExC_precomp; \
372 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
373 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
376 #define vWARNdep(loc,m) STMT_START { \
377 IV offset = loc - RExC_precomp; \
378 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
379 "%s" REPORT_LOCATION, \
380 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
384 #define vWARN2(loc, m, a1) STMT_START { \
385 IV offset = loc - RExC_precomp; \
386 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
387 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
390 #define vWARN3(loc, m, a1, a2) STMT_START { \
391 IV offset = loc - RExC_precomp; \
392 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
393 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
396 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
397 IV offset = loc - RExC_precomp; \
398 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
399 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
402 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
403 IV offset = loc - RExC_precomp; \
404 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
405 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
409 /* Allow for side effects in s */
410 #define REGC(c,s) STMT_START { \
411 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
414 /* Macros for recording node offsets. 20001227 mjd@plover.com
415 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
416 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
417 * Element 0 holds the number n.
420 #define MJD_OFFSET_DEBUG(x)
421 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
424 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
426 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
427 __LINE__, (node), (byte))); \
429 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
431 RExC_offsets[2*(node)-1] = (byte); \
436 #define Set_Node_Offset(node,byte) \
437 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
438 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
440 #define Set_Node_Length_To_R(node,len) STMT_START { \
442 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
443 __LINE__, (node), (len))); \
445 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
447 RExC_offsets[2*(node)] = (len); \
452 #define Set_Node_Length(node,len) \
453 Set_Node_Length_To_R((node)-RExC_emit_start, len)
454 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
455 #define Set_Node_Cur_Length(node) \
456 Set_Node_Length(node, RExC_parse - parse_start)
458 /* Get offsets and lengths */
459 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
460 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
462 static void clear_re(pTHX_ void *r);
464 /* Mark that we cannot extend a found fixed substring at this point.
465 Updata the longest found anchored substring and the longest found
466 floating substrings if needed. */
469 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
471 STRLEN l = CHR_SVLEN(data->last_found);
472 STRLEN old_l = CHR_SVLEN(*data->longest);
474 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
475 SvSetMagicSV(*data->longest, data->last_found);
476 if (*data->longest == data->longest_fixed) {
477 data->offset_fixed = l ? data->last_start_min : data->pos_min;
478 if (data->flags & SF_BEFORE_EOL)
480 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
482 data->flags &= ~SF_FIX_BEFORE_EOL;
485 data->offset_float_min = l ? data->last_start_min : data->pos_min;
486 data->offset_float_max = (l
487 ? data->last_start_max
488 : data->pos_min + data->pos_delta);
489 if ((U32)data->offset_float_max > (U32)I32_MAX)
490 data->offset_float_max = I32_MAX;
491 if (data->flags & SF_BEFORE_EOL)
493 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
495 data->flags &= ~SF_FL_BEFORE_EOL;
498 SvCUR_set(data->last_found, 0);
500 SV * sv = data->last_found;
502 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
503 if (mg && mg->mg_len > 0)
507 data->flags &= ~SF_BEFORE_EOL;
510 /* Can match anything (initialization) */
512 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
514 ANYOF_CLASS_ZERO(cl);
515 ANYOF_BITMAP_SETALL(cl);
516 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
518 cl->flags |= ANYOF_LOCALE;
521 /* Can match anything (initialization) */
523 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
527 for (value = 0; value <= ANYOF_MAX; value += 2)
528 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
530 if (!(cl->flags & ANYOF_UNICODE_ALL))
532 if (!ANYOF_BITMAP_TESTALLSET(cl))
537 /* Can match anything (initialization) */
539 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
541 Zero(cl, 1, struct regnode_charclass_class);
543 cl_anything(pRExC_state, cl);
547 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
549 Zero(cl, 1, struct regnode_charclass_class);
551 cl_anything(pRExC_state, cl);
553 cl->flags |= ANYOF_LOCALE;
556 /* 'And' a given class with another one. Can create false positives */
557 /* We assume that cl is not inverted */
559 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
560 struct regnode_charclass_class *and_with)
562 if (!(and_with->flags & ANYOF_CLASS)
563 && !(cl->flags & ANYOF_CLASS)
564 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
565 && !(and_with->flags & ANYOF_FOLD)
566 && !(cl->flags & ANYOF_FOLD)) {
569 if (and_with->flags & ANYOF_INVERT)
570 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
571 cl->bitmap[i] &= ~and_with->bitmap[i];
573 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
574 cl->bitmap[i] &= and_with->bitmap[i];
575 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
576 if (!(and_with->flags & ANYOF_EOS))
577 cl->flags &= ~ANYOF_EOS;
579 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
580 !(and_with->flags & ANYOF_INVERT)) {
581 cl->flags &= ~ANYOF_UNICODE_ALL;
582 cl->flags |= ANYOF_UNICODE;
583 ARG_SET(cl, ARG(and_with));
585 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
586 !(and_with->flags & ANYOF_INVERT))
587 cl->flags &= ~ANYOF_UNICODE_ALL;
588 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
589 !(and_with->flags & ANYOF_INVERT))
590 cl->flags &= ~ANYOF_UNICODE;
593 /* 'OR' a given class with another one. Can create false positives */
594 /* We assume that cl is not inverted */
596 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
598 if (or_with->flags & ANYOF_INVERT) {
600 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
601 * <= (B1 | !B2) | (CL1 | !CL2)
602 * which is wasteful if CL2 is small, but we ignore CL2:
603 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
604 * XXXX Can we handle case-fold? Unclear:
605 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
606 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
608 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
609 && !(or_with->flags & ANYOF_FOLD)
610 && !(cl->flags & ANYOF_FOLD) ) {
613 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
614 cl->bitmap[i] |= ~or_with->bitmap[i];
615 } /* XXXX: logic is complicated otherwise */
617 cl_anything(pRExC_state, cl);
620 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
621 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
622 && (!(or_with->flags & ANYOF_FOLD)
623 || (cl->flags & ANYOF_FOLD)) ) {
626 /* OR char bitmap and class bitmap separately */
627 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
628 cl->bitmap[i] |= or_with->bitmap[i];
629 if (or_with->flags & ANYOF_CLASS) {
630 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
631 cl->classflags[i] |= or_with->classflags[i];
632 cl->flags |= ANYOF_CLASS;
635 else { /* XXXX: logic is complicated, leave it along for a moment. */
636 cl_anything(pRExC_state, cl);
639 if (or_with->flags & ANYOF_EOS)
640 cl->flags |= ANYOF_EOS;
642 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
643 ARG(cl) != ARG(or_with)) {
644 cl->flags |= ANYOF_UNICODE_ALL;
645 cl->flags &= ~ANYOF_UNICODE;
647 if (or_with->flags & ANYOF_UNICODE_ALL) {
648 cl->flags |= ANYOF_UNICODE_ALL;
649 cl->flags &= ~ANYOF_UNICODE;
654 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
655 * These need to be revisited when a newer toolchain becomes available.
657 #if defined(__sparc64__) && defined(__GNUC__)
658 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
659 # undef SPARC64_GCC_WORKAROUND
660 # define SPARC64_GCC_WORKAROUND 1
664 /* REx optimizer. Converts nodes into quickier variants "in place".
665 Finds fixed substrings. */
667 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
668 to the position after last scanned or to NULL. */
671 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
672 /* scanp: Start here (read-write). */
673 /* deltap: Write maxlen-minlen here. */
674 /* last: Stop before this one. */
676 I32 min = 0, pars = 0, code;
677 regnode *scan = *scanp, *next;
679 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
680 int is_inf_internal = 0; /* The studied chunk is infinite */
681 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
682 scan_data_t data_fake;
683 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
685 while (scan && OP(scan) != END && scan < last) {
686 /* Peephole optimizer: */
688 if (PL_regkind[(U8)OP(scan)] == EXACT) {
689 /* Merge several consecutive EXACTish nodes into one. */
690 regnode *n = regnext(scan);
693 regnode *stop = scan;
696 next = scan + NODE_SZ_STR(scan);
697 /* Skip NOTHING, merge EXACT*. */
699 ( PL_regkind[(U8)OP(n)] == NOTHING ||
700 (stringok && (OP(n) == OP(scan))))
702 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
703 if (OP(n) == TAIL || n > next)
705 if (PL_regkind[(U8)OP(n)] == NOTHING) {
706 NEXT_OFF(scan) += NEXT_OFF(n);
707 next = n + NODE_STEP_REGNODE;
715 int oldl = STR_LEN(scan);
716 regnode *nnext = regnext(n);
718 if (oldl + STR_LEN(n) > U8_MAX)
720 NEXT_OFF(scan) += NEXT_OFF(n);
721 STR_LEN(scan) += STR_LEN(n);
722 next = n + NODE_SZ_STR(n);
723 /* Now we can overwrite *n : */
724 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
732 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
734 Two problematic code points in Unicode casefolding of EXACT nodes:
736 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
737 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
743 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
744 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
746 This means that in case-insensitive matching (or "loose matching",
747 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
748 length of the above casefolded versions) can match a target string
749 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
750 This would rather mess up the minimum length computation.
752 What we'll do is to look for the tail four bytes, and then peek
753 at the preceding two bytes to see whether we need to decrease
754 the minimum length by four (six minus two).
756 Thanks to the design of UTF-8, there cannot be false matches:
757 A sequence of valid UTF-8 bytes cannot be a subsequence of
758 another valid sequence of UTF-8 bytes.
761 char *s0 = STRING(scan), *s, *t;
762 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
763 char *t0 = "\xcc\x88\xcc\x81";
767 s < s2 && (t = ninstr(s, s1, t0, t1));
769 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
770 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
777 n = scan + NODE_SZ_STR(scan);
779 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
787 /* Follow the next-chain of the current node and optimize
788 away all the NOTHINGs from it. */
789 if (OP(scan) != CURLYX) {
790 int max = (reg_off_by_arg[OP(scan)]
792 /* I32 may be smaller than U16 on CRAYs! */
793 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
794 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
798 /* Skip NOTHING and LONGJMP. */
799 while ((n = regnext(n))
800 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
801 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
804 if (reg_off_by_arg[OP(scan)])
807 NEXT_OFF(scan) = off;
809 /* The principal pseudo-switch. Cannot be a switch, since we
810 look into several different things. */
811 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
812 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
813 next = regnext(scan);
816 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
817 I32 max1 = 0, min1 = I32_MAX, num = 0;
818 struct regnode_charclass_class accum;
820 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
821 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
822 if (flags & SCF_DO_STCLASS)
823 cl_init_zero(pRExC_state, &accum);
824 while (OP(scan) == code) {
825 I32 deltanext, minnext, f = 0, fake;
826 struct regnode_charclass_class this_class;
831 data_fake.whilem_c = data->whilem_c;
832 data_fake.last_closep = data->last_closep;
835 data_fake.last_closep = &fake;
836 next = regnext(scan);
837 scan = NEXTOPER(scan);
839 scan = NEXTOPER(scan);
840 if (flags & SCF_DO_STCLASS) {
841 cl_init(pRExC_state, &this_class);
842 data_fake.start_class = &this_class;
843 f = SCF_DO_STCLASS_AND;
845 if (flags & SCF_WHILEM_VISITED_POS)
846 f |= SCF_WHILEM_VISITED_POS;
847 /* we suppose the run is continuous, last=next...*/
848 minnext = study_chunk(pRExC_state, &scan, &deltanext,
849 next, &data_fake, f);
852 if (max1 < minnext + deltanext)
853 max1 = minnext + deltanext;
854 if (deltanext == I32_MAX)
855 is_inf = is_inf_internal = 1;
857 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
859 if (data && (data_fake.flags & SF_HAS_EVAL))
860 data->flags |= SF_HAS_EVAL;
862 data->whilem_c = data_fake.whilem_c;
863 if (flags & SCF_DO_STCLASS)
864 cl_or(pRExC_state, &accum, &this_class);
868 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
870 if (flags & SCF_DO_SUBSTR) {
871 data->pos_min += min1;
872 data->pos_delta += max1 - min1;
873 if (max1 != min1 || is_inf)
874 data->longest = &(data->longest_float);
877 delta += max1 - min1;
878 if (flags & SCF_DO_STCLASS_OR) {
879 cl_or(pRExC_state, data->start_class, &accum);
881 cl_and(data->start_class, &and_with);
882 flags &= ~SCF_DO_STCLASS;
885 else if (flags & SCF_DO_STCLASS_AND) {
887 cl_and(data->start_class, &accum);
888 flags &= ~SCF_DO_STCLASS;
891 /* Switch to OR mode: cache the old value of
892 * data->start_class */
893 StructCopy(data->start_class, &and_with,
894 struct regnode_charclass_class);
895 flags &= ~SCF_DO_STCLASS_AND;
896 StructCopy(&accum, data->start_class,
897 struct regnode_charclass_class);
898 flags |= SCF_DO_STCLASS_OR;
899 data->start_class->flags |= ANYOF_EOS;
903 else if (code == BRANCHJ) /* single branch is optimized. */
904 scan = NEXTOPER(NEXTOPER(scan));
905 else /* single branch is optimized. */
906 scan = NEXTOPER(scan);
909 else if (OP(scan) == EXACT) {
910 I32 l = STR_LEN(scan);
911 UV uc = *((U8*)STRING(scan));
913 U8 *s = (U8*)STRING(scan);
914 l = utf8_length(s, s + l);
915 uc = utf8_to_uvchr(s, NULL);
918 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
919 /* The code below prefers earlier match for fixed
920 offset, later match for variable offset. */
921 if (data->last_end == -1) { /* Update the start info. */
922 data->last_start_min = data->pos_min;
923 data->last_start_max = is_inf
924 ? I32_MAX : data->pos_min + data->pos_delta;
926 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
928 SV * sv = data->last_found;
929 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
930 mg_find(sv, PERL_MAGIC_utf8) : NULL;
931 if (mg && mg->mg_len >= 0)
932 mg->mg_len += utf8_length((U8*)STRING(scan),
933 (U8*)STRING(scan)+STR_LEN(scan));
936 SvUTF8_on(data->last_found);
937 data->last_end = data->pos_min + l;
938 data->pos_min += l; /* As in the first entry. */
939 data->flags &= ~SF_BEFORE_EOL;
941 if (flags & SCF_DO_STCLASS_AND) {
942 /* Check whether it is compatible with what we know already! */
946 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
947 && !ANYOF_BITMAP_TEST(data->start_class, uc)
948 && (!(data->start_class->flags & ANYOF_FOLD)
949 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
952 ANYOF_CLASS_ZERO(data->start_class);
953 ANYOF_BITMAP_ZERO(data->start_class);
955 ANYOF_BITMAP_SET(data->start_class, uc);
956 data->start_class->flags &= ~ANYOF_EOS;
958 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
960 else if (flags & SCF_DO_STCLASS_OR) {
961 /* false positive possible if the class is case-folded */
963 ANYOF_BITMAP_SET(data->start_class, uc);
965 data->start_class->flags |= ANYOF_UNICODE_ALL;
966 data->start_class->flags &= ~ANYOF_EOS;
967 cl_and(data->start_class, &and_with);
969 flags &= ~SCF_DO_STCLASS;
971 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
972 I32 l = STR_LEN(scan);
973 UV uc = *((U8*)STRING(scan));
975 /* Search for fixed substrings supports EXACT only. */
976 if (flags & SCF_DO_SUBSTR)
977 scan_commit(pRExC_state, data);
979 U8 *s = (U8 *)STRING(scan);
980 l = utf8_length(s, s + l);
981 uc = utf8_to_uvchr(s, NULL);
984 if (data && (flags & SCF_DO_SUBSTR))
986 if (flags & SCF_DO_STCLASS_AND) {
987 /* Check whether it is compatible with what we know already! */
991 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
992 && !ANYOF_BITMAP_TEST(data->start_class, uc)
993 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
995 ANYOF_CLASS_ZERO(data->start_class);
996 ANYOF_BITMAP_ZERO(data->start_class);
998 ANYOF_BITMAP_SET(data->start_class, uc);
999 data->start_class->flags &= ~ANYOF_EOS;
1000 data->start_class->flags |= ANYOF_FOLD;
1001 if (OP(scan) == EXACTFL)
1002 data->start_class->flags |= ANYOF_LOCALE;
1005 else if (flags & SCF_DO_STCLASS_OR) {
1006 if (data->start_class->flags & ANYOF_FOLD) {
1007 /* false positive possible if the class is case-folded.
1008 Assume that the locale settings are the same... */
1010 ANYOF_BITMAP_SET(data->start_class, uc);
1011 data->start_class->flags &= ~ANYOF_EOS;
1013 cl_and(data->start_class, &and_with);
1015 flags &= ~SCF_DO_STCLASS;
1017 else if (strchr((char*)PL_varies,OP(scan))) {
1018 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1019 I32 f = flags, pos_before = 0;
1020 regnode *oscan = scan;
1021 struct regnode_charclass_class this_class;
1022 struct regnode_charclass_class *oclass = NULL;
1023 I32 next_is_eval = 0;
1025 switch (PL_regkind[(U8)OP(scan)]) {
1026 case WHILEM: /* End of (?:...)* . */
1027 scan = NEXTOPER(scan);
1030 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1031 next = NEXTOPER(scan);
1032 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1034 maxcount = REG_INFTY;
1035 next = regnext(scan);
1036 scan = NEXTOPER(scan);
1040 if (flags & SCF_DO_SUBSTR)
1045 if (flags & SCF_DO_STCLASS) {
1047 maxcount = REG_INFTY;
1048 next = regnext(scan);
1049 scan = NEXTOPER(scan);
1052 is_inf = is_inf_internal = 1;
1053 scan = regnext(scan);
1054 if (flags & SCF_DO_SUBSTR) {
1055 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1056 data->longest = &(data->longest_float);
1058 goto optimize_curly_tail;
1060 mincount = ARG1(scan);
1061 maxcount = ARG2(scan);
1062 next = regnext(scan);
1063 if (OP(scan) == CURLYX) {
1064 I32 lp = (data ? *(data->last_closep) : 0);
1066 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1068 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1069 next_is_eval = (OP(scan) == EVAL);
1071 if (flags & SCF_DO_SUBSTR) {
1072 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1073 pos_before = data->pos_min;
1077 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1079 data->flags |= SF_IS_INF;
1081 if (flags & SCF_DO_STCLASS) {
1082 cl_init(pRExC_state, &this_class);
1083 oclass = data->start_class;
1084 data->start_class = &this_class;
1085 f |= SCF_DO_STCLASS_AND;
1086 f &= ~SCF_DO_STCLASS_OR;
1088 /* These are the cases when once a subexpression
1089 fails at a particular position, it cannot succeed
1090 even after backtracking at the enclosing scope.
1092 XXXX what if minimal match and we are at the
1093 initial run of {n,m}? */
1094 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1095 f &= ~SCF_WHILEM_VISITED_POS;
1097 /* This will finish on WHILEM, setting scan, or on NULL: */
1098 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1100 ? (f & ~SCF_DO_SUBSTR) : f);
1102 if (flags & SCF_DO_STCLASS)
1103 data->start_class = oclass;
1104 if (mincount == 0 || minnext == 0) {
1105 if (flags & SCF_DO_STCLASS_OR) {
1106 cl_or(pRExC_state, data->start_class, &this_class);
1108 else if (flags & SCF_DO_STCLASS_AND) {
1109 /* Switch to OR mode: cache the old value of
1110 * data->start_class */
1111 StructCopy(data->start_class, &and_with,
1112 struct regnode_charclass_class);
1113 flags &= ~SCF_DO_STCLASS_AND;
1114 StructCopy(&this_class, data->start_class,
1115 struct regnode_charclass_class);
1116 flags |= SCF_DO_STCLASS_OR;
1117 data->start_class->flags |= ANYOF_EOS;
1119 } else { /* Non-zero len */
1120 if (flags & SCF_DO_STCLASS_OR) {
1121 cl_or(pRExC_state, data->start_class, &this_class);
1122 cl_and(data->start_class, &and_with);
1124 else if (flags & SCF_DO_STCLASS_AND)
1125 cl_and(data->start_class, &this_class);
1126 flags &= ~SCF_DO_STCLASS;
1128 if (!scan) /* It was not CURLYX, but CURLY. */
1130 if (ckWARN(WARN_REGEXP)
1131 /* ? quantifier ok, except for (?{ ... }) */
1132 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1133 && (minnext == 0) && (deltanext == 0)
1134 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1135 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1138 "Quantifier unexpected on zero-length expression");
1141 min += minnext * mincount;
1142 is_inf_internal |= ((maxcount == REG_INFTY
1143 && (minnext + deltanext) > 0)
1144 || deltanext == I32_MAX);
1145 is_inf |= is_inf_internal;
1146 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1148 /* Try powerful optimization CURLYX => CURLYN. */
1149 if ( OP(oscan) == CURLYX && data
1150 && data->flags & SF_IN_PAR
1151 && !(data->flags & SF_HAS_EVAL)
1152 && !deltanext && minnext == 1 ) {
1153 /* Try to optimize to CURLYN. */
1154 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1155 regnode *nxt1 = nxt;
1162 if (!strchr((char*)PL_simple,OP(nxt))
1163 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1164 && STR_LEN(nxt) == 1))
1170 if (OP(nxt) != CLOSE)
1172 /* Now we know that nxt2 is the only contents: */
1173 oscan->flags = (U8)ARG(nxt);
1175 OP(nxt1) = NOTHING; /* was OPEN. */
1177 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1178 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1179 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1180 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1181 OP(nxt + 1) = OPTIMIZED; /* was count. */
1182 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1187 /* Try optimization CURLYX => CURLYM. */
1188 if ( OP(oscan) == CURLYX && data
1189 && !(data->flags & SF_HAS_PAR)
1190 && !(data->flags & SF_HAS_EVAL)
1192 /* XXXX How to optimize if data == 0? */
1193 /* Optimize to a simpler form. */
1194 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1198 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1199 && (OP(nxt2) != WHILEM))
1201 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1202 /* Need to optimize away parenths. */
1203 if (data->flags & SF_IN_PAR) {
1204 /* Set the parenth number. */
1205 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1207 if (OP(nxt) != CLOSE)
1208 FAIL("Panic opt close");
1209 oscan->flags = (U8)ARG(nxt);
1210 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1211 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1213 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1214 OP(nxt + 1) = OPTIMIZED; /* was count. */
1215 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1216 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1219 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1220 regnode *nnxt = regnext(nxt1);
1223 if (reg_off_by_arg[OP(nxt1)])
1224 ARG_SET(nxt1, nxt2 - nxt1);
1225 else if (nxt2 - nxt1 < U16_MAX)
1226 NEXT_OFF(nxt1) = nxt2 - nxt1;
1228 OP(nxt) = NOTHING; /* Cannot beautify */
1233 /* Optimize again: */
1234 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1240 else if ((OP(oscan) == CURLYX)
1241 && (flags & SCF_WHILEM_VISITED_POS)
1242 /* See the comment on a similar expression above.
1243 However, this time it not a subexpression
1244 we care about, but the expression itself. */
1245 && (maxcount == REG_INFTY)
1246 && data && ++data->whilem_c < 16) {
1247 /* This stays as CURLYX, we can put the count/of pair. */
1248 /* Find WHILEM (as in regexec.c) */
1249 regnode *nxt = oscan + NEXT_OFF(oscan);
1251 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1253 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1254 | (RExC_whilem_seen << 4)); /* On WHILEM */
1256 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1258 if (flags & SCF_DO_SUBSTR) {
1259 SV *last_str = Nullsv;
1260 int counted = mincount != 0;
1262 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1263 #if defined(SPARC64_GCC_WORKAROUND)
1269 if (pos_before >= data->last_start_min)
1272 b = data->last_start_min;
1275 s = SvPV(data->last_found, l);
1276 old = b - data->last_start_min;
1279 I32 b = pos_before >= data->last_start_min
1280 ? pos_before : data->last_start_min;
1282 char *s = SvPV(data->last_found, l);
1283 I32 old = b - data->last_start_min;
1287 old = utf8_hop((U8*)s, old) - (U8*)s;
1290 /* Get the added string: */
1291 last_str = newSVpvn(s + old, l);
1293 SvUTF8_on(last_str);
1294 if (deltanext == 0 && pos_before == b) {
1295 /* What was added is a constant string */
1297 SvGROW(last_str, (mincount * l) + 1);
1298 repeatcpy(SvPVX(last_str) + l,
1299 SvPVX(last_str), l, mincount - 1);
1300 SvCUR(last_str) *= mincount;
1301 /* Add additional parts. */
1302 SvCUR_set(data->last_found,
1303 SvCUR(data->last_found) - l);
1304 sv_catsv(data->last_found, last_str);
1306 SV * sv = data->last_found;
1308 SvUTF8(sv) && SvMAGICAL(sv) ?
1309 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1310 if (mg && mg->mg_len >= 0)
1311 mg->mg_len += CHR_SVLEN(last_str);
1313 data->last_end += l * (mincount - 1);
1316 /* start offset must point into the last copy */
1317 data->last_start_min += minnext * (mincount - 1);
1318 data->last_start_max += is_inf ? I32_MAX
1319 : (maxcount - 1) * (minnext + data->pos_delta);
1322 /* It is counted once already... */
1323 data->pos_min += minnext * (mincount - counted);
1324 data->pos_delta += - counted * deltanext +
1325 (minnext + deltanext) * maxcount - minnext * mincount;
1326 if (mincount != maxcount) {
1327 /* Cannot extend fixed substrings found inside
1329 scan_commit(pRExC_state,data);
1330 if (mincount && last_str) {
1331 sv_setsv(data->last_found, last_str);
1332 data->last_end = data->pos_min;
1333 data->last_start_min =
1334 data->pos_min - CHR_SVLEN(last_str);
1335 data->last_start_max = is_inf
1337 : data->pos_min + data->pos_delta
1338 - CHR_SVLEN(last_str);
1340 data->longest = &(data->longest_float);
1342 SvREFCNT_dec(last_str);
1344 if (data && (fl & SF_HAS_EVAL))
1345 data->flags |= SF_HAS_EVAL;
1346 optimize_curly_tail:
1347 if (OP(oscan) != CURLYX) {
1348 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1350 NEXT_OFF(oscan) += NEXT_OFF(next);
1353 default: /* REF and CLUMP only? */
1354 if (flags & SCF_DO_SUBSTR) {
1355 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1356 data->longest = &(data->longest_float);
1358 is_inf = is_inf_internal = 1;
1359 if (flags & SCF_DO_STCLASS_OR)
1360 cl_anything(pRExC_state, data->start_class);
1361 flags &= ~SCF_DO_STCLASS;
1365 else if (strchr((char*)PL_simple,OP(scan))) {
1368 if (flags & SCF_DO_SUBSTR) {
1369 scan_commit(pRExC_state,data);
1373 if (flags & SCF_DO_STCLASS) {
1374 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1376 /* Some of the logic below assumes that switching
1377 locale on will only add false positives. */
1378 switch (PL_regkind[(U8)OP(scan)]) {
1382 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1383 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1384 cl_anything(pRExC_state, data->start_class);
1387 if (OP(scan) == SANY)
1389 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1390 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1391 || (data->start_class->flags & ANYOF_CLASS));
1392 cl_anything(pRExC_state, data->start_class);
1394 if (flags & SCF_DO_STCLASS_AND || !value)
1395 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1398 if (flags & SCF_DO_STCLASS_AND)
1399 cl_and(data->start_class,
1400 (struct regnode_charclass_class*)scan);
1402 cl_or(pRExC_state, data->start_class,
1403 (struct regnode_charclass_class*)scan);
1406 if (flags & SCF_DO_STCLASS_AND) {
1407 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1408 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1409 for (value = 0; value < 256; value++)
1410 if (!isALNUM(value))
1411 ANYOF_BITMAP_CLEAR(data->start_class, value);
1415 if (data->start_class->flags & ANYOF_LOCALE)
1416 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1418 for (value = 0; value < 256; value++)
1420 ANYOF_BITMAP_SET(data->start_class, value);
1425 if (flags & SCF_DO_STCLASS_AND) {
1426 if (data->start_class->flags & ANYOF_LOCALE)
1427 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1430 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1431 data->start_class->flags |= ANYOF_LOCALE;
1435 if (flags & SCF_DO_STCLASS_AND) {
1436 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1437 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1438 for (value = 0; value < 256; value++)
1440 ANYOF_BITMAP_CLEAR(data->start_class, value);
1444 if (data->start_class->flags & ANYOF_LOCALE)
1445 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1447 for (value = 0; value < 256; value++)
1448 if (!isALNUM(value))
1449 ANYOF_BITMAP_SET(data->start_class, value);
1454 if (flags & SCF_DO_STCLASS_AND) {
1455 if (data->start_class->flags & ANYOF_LOCALE)
1456 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1459 data->start_class->flags |= ANYOF_LOCALE;
1460 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1464 if (flags & SCF_DO_STCLASS_AND) {
1465 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1466 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1467 for (value = 0; value < 256; value++)
1468 if (!isSPACE(value))
1469 ANYOF_BITMAP_CLEAR(data->start_class, value);
1473 if (data->start_class->flags & ANYOF_LOCALE)
1474 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1476 for (value = 0; value < 256; value++)
1478 ANYOF_BITMAP_SET(data->start_class, value);
1483 if (flags & SCF_DO_STCLASS_AND) {
1484 if (data->start_class->flags & ANYOF_LOCALE)
1485 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1488 data->start_class->flags |= ANYOF_LOCALE;
1489 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1493 if (flags & SCF_DO_STCLASS_AND) {
1494 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1495 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1496 for (value = 0; value < 256; value++)
1498 ANYOF_BITMAP_CLEAR(data->start_class, value);
1502 if (data->start_class->flags & ANYOF_LOCALE)
1503 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1505 for (value = 0; value < 256; value++)
1506 if (!isSPACE(value))
1507 ANYOF_BITMAP_SET(data->start_class, value);
1512 if (flags & SCF_DO_STCLASS_AND) {
1513 if (data->start_class->flags & ANYOF_LOCALE) {
1514 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1515 for (value = 0; value < 256; value++)
1516 if (!isSPACE(value))
1517 ANYOF_BITMAP_CLEAR(data->start_class, value);
1521 data->start_class->flags |= ANYOF_LOCALE;
1522 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1526 if (flags & SCF_DO_STCLASS_AND) {
1527 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1528 for (value = 0; value < 256; value++)
1529 if (!isDIGIT(value))
1530 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 if (data->start_class->flags & ANYOF_LOCALE)
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1536 for (value = 0; value < 256; value++)
1538 ANYOF_BITMAP_SET(data->start_class, value);
1543 if (flags & SCF_DO_STCLASS_AND) {
1544 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1545 for (value = 0; value < 256; value++)
1547 ANYOF_BITMAP_CLEAR(data->start_class, value);
1550 if (data->start_class->flags & ANYOF_LOCALE)
1551 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1553 for (value = 0; value < 256; value++)
1554 if (!isDIGIT(value))
1555 ANYOF_BITMAP_SET(data->start_class, value);
1560 if (flags & SCF_DO_STCLASS_OR)
1561 cl_and(data->start_class, &and_with);
1562 flags &= ~SCF_DO_STCLASS;
1565 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1566 data->flags |= (OP(scan) == MEOL
1570 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1571 /* Lookbehind, or need to calculate parens/evals/stclass: */
1572 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1573 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1574 /* Lookahead/lookbehind */
1575 I32 deltanext, minnext, fake = 0;
1577 struct regnode_charclass_class intrnl;
1580 data_fake.flags = 0;
1582 data_fake.whilem_c = data->whilem_c;
1583 data_fake.last_closep = data->last_closep;
1586 data_fake.last_closep = &fake;
1587 if ( flags & SCF_DO_STCLASS && !scan->flags
1588 && OP(scan) == IFMATCH ) { /* Lookahead */
1589 cl_init(pRExC_state, &intrnl);
1590 data_fake.start_class = &intrnl;
1591 f |= SCF_DO_STCLASS_AND;
1593 if (flags & SCF_WHILEM_VISITED_POS)
1594 f |= SCF_WHILEM_VISITED_POS;
1595 next = regnext(scan);
1596 nscan = NEXTOPER(NEXTOPER(scan));
1597 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1600 vFAIL("Variable length lookbehind not implemented");
1602 else if (minnext > U8_MAX) {
1603 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1605 scan->flags = (U8)minnext;
1607 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1609 if (data && (data_fake.flags & SF_HAS_EVAL))
1610 data->flags |= SF_HAS_EVAL;
1612 data->whilem_c = data_fake.whilem_c;
1613 if (f & SCF_DO_STCLASS_AND) {
1614 int was = (data->start_class->flags & ANYOF_EOS);
1616 cl_and(data->start_class, &intrnl);
1618 data->start_class->flags |= ANYOF_EOS;
1621 else if (OP(scan) == OPEN) {
1624 else if (OP(scan) == CLOSE) {
1625 if ((I32)ARG(scan) == is_par) {
1626 next = regnext(scan);
1628 if ( next && (OP(next) != WHILEM) && next < last)
1629 is_par = 0; /* Disable optimization */
1632 *(data->last_closep) = ARG(scan);
1634 else if (OP(scan) == EVAL) {
1636 data->flags |= SF_HAS_EVAL;
1638 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1639 if (flags & SCF_DO_SUBSTR) {
1640 scan_commit(pRExC_state,data);
1641 data->longest = &(data->longest_float);
1643 is_inf = is_inf_internal = 1;
1644 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1645 cl_anything(pRExC_state, data->start_class);
1646 flags &= ~SCF_DO_STCLASS;
1648 /* Else: zero-length, ignore. */
1649 scan = regnext(scan);
1654 *deltap = is_inf_internal ? I32_MAX : delta;
1655 if (flags & SCF_DO_SUBSTR && is_inf)
1656 data->pos_delta = I32_MAX - data->pos_min;
1657 if (is_par > U8_MAX)
1659 if (is_par && pars==1 && data) {
1660 data->flags |= SF_IN_PAR;
1661 data->flags &= ~SF_HAS_PAR;
1663 else if (pars && data) {
1664 data->flags |= SF_HAS_PAR;
1665 data->flags &= ~SF_IN_PAR;
1667 if (flags & SCF_DO_STCLASS_OR)
1668 cl_and(data->start_class, &and_with);
1673 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1675 if (RExC_rx->data) {
1676 Renewc(RExC_rx->data,
1677 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1678 char, struct reg_data);
1679 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1680 RExC_rx->data->count += n;
1683 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1684 char, struct reg_data);
1685 New(1208, RExC_rx->data->what, n, U8);
1686 RExC_rx->data->count = n;
1688 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1689 return RExC_rx->data->count - n;
1693 Perl_reginitcolors(pTHX)
1696 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1699 PL_colors[0] = s = savepv(s);
1701 s = strchr(s, '\t');
1707 PL_colors[i] = s = "";
1711 PL_colors[i++] = "";
1718 - pregcomp - compile a regular expression into internal code
1720 * We can't allocate space until we know how big the compiled form will be,
1721 * but we can't compile it (and thus know how big it is) until we've got a
1722 * place to put the code. So we cheat: we compile it twice, once with code
1723 * generation turned off and size counting turned on, and once "for real".
1724 * This also means that we don't allocate space until we are sure that the
1725 * thing really will compile successfully, and we never have to move the
1726 * code and thus invalidate pointers into it. (Note that it has to be in
1727 * one piece because free() must be able to free it all.) [NB: not true in perl]
1729 * Beware that the optimization-preparation code in here knows about some
1730 * of the structure of the compiled regexp. [I'll say.]
1733 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1743 RExC_state_t RExC_state;
1744 RExC_state_t *pRExC_state = &RExC_state;
1747 FAIL("NULL regexp argument");
1749 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1753 if (!PL_colorset) reginitcolors();
1754 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1755 PL_colors[4],PL_colors[5],PL_colors[0],
1756 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1758 RExC_flags = pm->op_pmflags;
1762 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1763 RExC_seen_evals = 0;
1766 /* First pass: determine size, legality. */
1773 RExC_emit = &PL_regdummy;
1774 RExC_whilem_seen = 0;
1775 #if 0 /* REGC() is (currently) a NOP at the first pass.
1776 * Clever compilers notice this and complain. --jhi */
1777 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1779 if (reg(pRExC_state, 0, &flags) == NULL) {
1780 RExC_precomp = Nullch;
1783 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1785 /* Small enough for pointer-storage convention?
1786 If extralen==0, this means that we will not need long jumps. */
1787 if (RExC_size >= 0x10000L && RExC_extralen)
1788 RExC_size += RExC_extralen;
1791 if (RExC_whilem_seen > 15)
1792 RExC_whilem_seen = 15;
1794 /* Allocate space and initialize. */
1795 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1798 FAIL("Regexp out of space");
1801 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1802 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1805 r->prelen = xend - exp;
1806 r->precomp = savepvn(RExC_precomp, r->prelen);
1808 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1809 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1811 r->substrs = 0; /* Useful during FAIL. */
1812 r->startp = 0; /* Useful during FAIL. */
1813 r->endp = 0; /* Useful during FAIL. */
1815 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1817 r->offsets[0] = RExC_size;
1819 DEBUG_r(PerlIO_printf(Perl_debug_log,
1820 "%s %"UVuf" bytes for offset annotations.\n",
1821 r->offsets ? "Got" : "Couldn't get",
1822 (UV)((2*RExC_size+1) * sizeof(U32))));
1826 /* Second pass: emit code. */
1827 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1832 RExC_emit_start = r->program;
1833 RExC_emit = r->program;
1834 /* Store the count of eval-groups for security checks: */
1835 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1836 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1838 if (reg(pRExC_state, 0, &flags) == NULL)
1841 /* Dig out information for optimizations. */
1842 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1843 pm->op_pmflags = RExC_flags;
1845 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1846 r->regstclass = NULL;
1847 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1848 r->reganch |= ROPT_NAUGHTY;
1849 scan = r->program + 1; /* First BRANCH. */
1851 /* XXXX To minimize changes to RE engine we always allocate
1852 3-units-long substrs field. */
1853 Newz(1004, r->substrs, 1, struct reg_substr_data);
1855 StructCopy(&zero_scan_data, &data, scan_data_t);
1856 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1857 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1859 STRLEN longest_float_length, longest_fixed_length;
1860 struct regnode_charclass_class ch_class;
1865 /* Skip introductions and multiplicators >= 1. */
1866 while ((OP(first) == OPEN && (sawopen = 1)) ||
1867 /* An OR of *one* alternative - should not happen now. */
1868 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1869 (OP(first) == PLUS) ||
1870 (OP(first) == MINMOD) ||
1871 /* An {n,m} with n>0 */
1872 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1873 if (OP(first) == PLUS)
1876 first += regarglen[(U8)OP(first)];
1877 first = NEXTOPER(first);
1880 /* Starting-point info. */
1882 if (PL_regkind[(U8)OP(first)] == EXACT) {
1883 if (OP(first) == EXACT)
1884 ; /* Empty, get anchored substr later. */
1885 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1886 r->regstclass = first;
1888 else if (strchr((char*)PL_simple,OP(first)))
1889 r->regstclass = first;
1890 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1891 PL_regkind[(U8)OP(first)] == NBOUND)
1892 r->regstclass = first;
1893 else if (PL_regkind[(U8)OP(first)] == BOL) {
1894 r->reganch |= (OP(first) == MBOL
1896 : (OP(first) == SBOL
1899 first = NEXTOPER(first);
1902 else if (OP(first) == GPOS) {
1903 r->reganch |= ROPT_ANCH_GPOS;
1904 first = NEXTOPER(first);
1907 else if (!sawopen && (OP(first) == STAR &&
1908 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1909 !(r->reganch & ROPT_ANCH) )
1911 /* turn .* into ^.* with an implied $*=1 */
1912 int type = OP(NEXTOPER(first));
1914 if (type == REG_ANY)
1915 type = ROPT_ANCH_MBOL;
1917 type = ROPT_ANCH_SBOL;
1919 r->reganch |= type | ROPT_IMPLICIT;
1920 first = NEXTOPER(first);
1923 if (sawplus && (!sawopen || !RExC_sawback)
1924 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1925 /* x+ must match at the 1st pos of run of x's */
1926 r->reganch |= ROPT_SKIP;
1928 /* Scan is after the zeroth branch, first is atomic matcher. */
1929 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1930 (IV)(first - scan + 1)));
1932 * If there's something expensive in the r.e., find the
1933 * longest literal string that must appear and make it the
1934 * regmust. Resolve ties in favor of later strings, since
1935 * the regstart check works with the beginning of the r.e.
1936 * and avoiding duplication strengthens checking. Not a
1937 * strong reason, but sufficient in the absence of others.
1938 * [Now we resolve ties in favor of the earlier string if
1939 * it happens that c_offset_min has been invalidated, since the
1940 * earlier string may buy us something the later one won't.]
1944 data.longest_fixed = newSVpvn("",0);
1945 data.longest_float = newSVpvn("",0);
1946 data.last_found = newSVpvn("",0);
1947 data.longest = &(data.longest_fixed);
1949 if (!r->regstclass) {
1950 cl_init(pRExC_state, &ch_class);
1951 data.start_class = &ch_class;
1952 stclass_flag = SCF_DO_STCLASS_AND;
1953 } else /* XXXX Check for BOUND? */
1955 data.last_closep = &last_close;
1957 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1958 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1959 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1960 && data.last_start_min == 0 && data.last_end > 0
1961 && !RExC_seen_zerolen
1962 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1963 r->reganch |= ROPT_CHECK_ALL;
1964 scan_commit(pRExC_state, &data);
1965 SvREFCNT_dec(data.last_found);
1967 longest_float_length = CHR_SVLEN(data.longest_float);
1968 if (longest_float_length
1969 || (data.flags & SF_FL_BEFORE_EOL
1970 && (!(data.flags & SF_FL_BEFORE_MEOL)
1971 || (RExC_flags & PMf_MULTILINE)))) {
1974 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1975 && data.offset_fixed == data.offset_float_min
1976 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1977 goto remove_float; /* As in (a)+. */
1979 if (SvUTF8(data.longest_float)) {
1980 r->float_utf8 = data.longest_float;
1981 r->float_substr = Nullsv;
1983 r->float_substr = data.longest_float;
1984 r->float_utf8 = Nullsv;
1986 r->float_min_offset = data.offset_float_min;
1987 r->float_max_offset = data.offset_float_max;
1988 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1989 && (!(data.flags & SF_FL_BEFORE_MEOL)
1990 || (RExC_flags & PMf_MULTILINE)));
1991 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
1995 r->float_substr = r->float_utf8 = Nullsv;
1996 SvREFCNT_dec(data.longest_float);
1997 longest_float_length = 0;
2000 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2001 if (longest_fixed_length
2002 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2003 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2004 || (RExC_flags & PMf_MULTILINE)))) {
2007 if (SvUTF8(data.longest_fixed)) {
2008 r->anchored_utf8 = data.longest_fixed;
2009 r->anchored_substr = Nullsv;
2011 r->anchored_substr = data.longest_fixed;
2012 r->anchored_utf8 = Nullsv;
2014 r->anchored_offset = data.offset_fixed;
2015 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2016 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2017 || (RExC_flags & PMf_MULTILINE)));
2018 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2021 r->anchored_substr = r->anchored_utf8 = Nullsv;
2022 SvREFCNT_dec(data.longest_fixed);
2023 longest_fixed_length = 0;
2026 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2027 r->regstclass = NULL;
2028 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2030 && !(data.start_class->flags & ANYOF_EOS)
2031 && !cl_is_anything(data.start_class))
2033 I32 n = add_data(pRExC_state, 1, "f");
2035 New(1006, RExC_rx->data->data[n], 1,
2036 struct regnode_charclass_class);
2037 StructCopy(data.start_class,
2038 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2039 struct regnode_charclass_class);
2040 r->regstclass = (regnode*)RExC_rx->data->data[n];
2041 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2042 PL_regdata = r->data; /* for regprop() */
2043 DEBUG_r({ SV *sv = sv_newmortal();
2044 regprop(sv, (regnode*)data.start_class);
2045 PerlIO_printf(Perl_debug_log,
2046 "synthetic stclass `%s'.\n",
2050 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2051 if (longest_fixed_length > longest_float_length) {
2052 r->check_substr = r->anchored_substr;
2053 r->check_utf8 = r->anchored_utf8;
2054 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2055 if (r->reganch & ROPT_ANCH_SINGLE)
2056 r->reganch |= ROPT_NOSCAN;
2059 r->check_substr = r->float_substr;
2060 r->check_utf8 = r->float_utf8;
2061 r->check_offset_min = data.offset_float_min;
2062 r->check_offset_max = data.offset_float_max;
2064 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2065 This should be changed ASAP! */
2066 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2067 r->reganch |= RE_USE_INTUIT;
2068 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2069 r->reganch |= RE_INTUIT_TAIL;
2073 /* Several toplevels. Best we can is to set minlen. */
2075 struct regnode_charclass_class ch_class;
2078 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2079 scan = r->program + 1;
2080 cl_init(pRExC_state, &ch_class);
2081 data.start_class = &ch_class;
2082 data.last_closep = &last_close;
2083 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2084 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2085 = r->float_substr = r->float_utf8 = Nullsv;
2086 if (!(data.start_class->flags & ANYOF_EOS)
2087 && !cl_is_anything(data.start_class))
2089 I32 n = add_data(pRExC_state, 1, "f");
2091 New(1006, RExC_rx->data->data[n], 1,
2092 struct regnode_charclass_class);
2093 StructCopy(data.start_class,
2094 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2095 struct regnode_charclass_class);
2096 r->regstclass = (regnode*)RExC_rx->data->data[n];
2097 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2098 DEBUG_r({ SV* sv = sv_newmortal();
2099 regprop(sv, (regnode*)data.start_class);
2100 PerlIO_printf(Perl_debug_log,
2101 "synthetic stclass `%s'.\n",
2107 if (RExC_seen & REG_SEEN_GPOS)
2108 r->reganch |= ROPT_GPOS_SEEN;
2109 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2110 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2111 if (RExC_seen & REG_SEEN_EVAL)
2112 r->reganch |= ROPT_EVAL_SEEN;
2113 if (RExC_seen & REG_SEEN_CANY)
2114 r->reganch |= ROPT_CANY_SEEN;
2115 Newz(1002, r->startp, RExC_npar, I32);
2116 Newz(1002, r->endp, RExC_npar, I32);
2117 PL_regdata = r->data; /* for regprop() */
2118 DEBUG_r(regdump(r));
2123 - reg - regular expression, i.e. main body or parenthesized thing
2125 * Caller must absorb opening parenthesis.
2127 * Combining parenthesis handling with the base level of regular expression
2128 * is a trifle forced, but the need to tie the tails of the branches to what
2129 * follows makes it hard to avoid.
2132 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2133 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2135 register regnode *ret; /* Will be the head of the group. */
2136 register regnode *br;
2137 register regnode *lastbr;
2138 register regnode *ender = 0;
2139 register I32 parno = 0;
2140 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2142 /* for (?g), (?gc), and (?o) warnings; warning
2143 about (?c) will warn about (?g) -- japhy */
2145 I32 wastedflags = 0x00,
2148 wasted_gc = 0x02 | 0x04,
2151 char * parse_start = RExC_parse; /* MJD */
2152 char *oregcomp_parse = RExC_parse;
2155 *flagp = 0; /* Tentatively. */
2158 /* Make an OPEN node, if parenthesized. */
2160 if (*RExC_parse == '?') { /* (?...) */
2161 U32 posflags = 0, negflags = 0;
2162 U32 *flagsp = &posflags;
2164 char *seqstart = RExC_parse;
2167 paren = *RExC_parse++;
2168 ret = NULL; /* For look-ahead/behind. */
2170 case '<': /* (?<...) */
2171 RExC_seen |= REG_SEEN_LOOKBEHIND;
2172 if (*RExC_parse == '!')
2174 if (*RExC_parse != '=' && *RExC_parse != '!')
2177 case '=': /* (?=...) */
2178 case '!': /* (?!...) */
2179 RExC_seen_zerolen++;
2180 case ':': /* (?:...) */
2181 case '>': /* (?>...) */
2183 case '$': /* (?$...) */
2184 case '@': /* (?@...) */
2185 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2187 case '#': /* (?#...) */
2188 while (*RExC_parse && *RExC_parse != ')')
2190 if (*RExC_parse != ')')
2191 FAIL("Sequence (?#... not terminated");
2192 nextchar(pRExC_state);
2195 case 'p': /* (?p...) */
2196 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2197 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2199 case '?': /* (??...) */
2201 if (*RExC_parse != '{')
2203 paren = *RExC_parse++;
2205 case '{': /* (?{...}) */
2207 I32 count = 1, n = 0;
2209 char *s = RExC_parse;
2211 OP_4tree *sop, *rop;
2213 RExC_seen_zerolen++;
2214 RExC_seen |= REG_SEEN_EVAL;
2215 while (count && (c = *RExC_parse)) {
2216 if (c == '\\' && RExC_parse[1])
2224 if (*RExC_parse != ')')
2227 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2232 if (RExC_parse - 1 - s)
2233 sv = newSVpvn(s, RExC_parse - 1 - s);
2235 sv = newSVpvn("", 0);
2238 Perl_save_re_context(aTHX);
2239 rop = sv_compile_2op(sv, &sop, "re", &pad);
2240 sop->op_private |= OPpREFCOUNTED;
2241 /* re_dup will OpREFCNT_inc */
2242 OpREFCNT_set(sop, 1);
2245 n = add_data(pRExC_state, 3, "nop");
2246 RExC_rx->data->data[n] = (void*)rop;
2247 RExC_rx->data->data[n+1] = (void*)sop;
2248 RExC_rx->data->data[n+2] = (void*)pad;
2251 else { /* First pass */
2252 if (PL_reginterp_cnt < ++RExC_seen_evals
2253 && PL_curcop != &PL_compiling)
2254 /* No compiled RE interpolated, has runtime
2255 components ===> unsafe. */
2256 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2257 if (PL_tainting && PL_tainted)
2258 FAIL("Eval-group in insecure regular expression");
2261 nextchar(pRExC_state);
2263 ret = reg_node(pRExC_state, LOGICAL);
2266 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2267 /* deal with the length of this later - MJD */
2270 ret = reganode(pRExC_state, EVAL, n);
2271 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2272 Set_Node_Offset(ret, parse_start);
2275 case '(': /* (?(?{...})...) and (?(?=...)...) */
2277 if (RExC_parse[0] == '?') { /* (?(?...)) */
2278 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2279 || RExC_parse[1] == '<'
2280 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2283 ret = reg_node(pRExC_state, LOGICAL);
2286 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2290 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2292 parno = atoi(RExC_parse++);
2294 while (isDIGIT(*RExC_parse))
2296 ret = reganode(pRExC_state, GROUPP, parno);
2298 if ((c = *nextchar(pRExC_state)) != ')')
2299 vFAIL("Switch condition not recognized");
2301 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2302 br = regbranch(pRExC_state, &flags, 1);
2304 br = reganode(pRExC_state, LONGJMP, 0);
2306 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2307 c = *nextchar(pRExC_state);
2311 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2312 regbranch(pRExC_state, &flags, 1);
2313 regtail(pRExC_state, ret, lastbr);
2316 c = *nextchar(pRExC_state);
2321 vFAIL("Switch (?(condition)... contains too many branches");
2322 ender = reg_node(pRExC_state, TAIL);
2323 regtail(pRExC_state, br, ender);
2325 regtail(pRExC_state, lastbr, ender);
2326 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2329 regtail(pRExC_state, ret, ender);
2333 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2337 RExC_parse--; /* for vFAIL to print correctly */
2338 vFAIL("Sequence (? incomplete");
2342 parse_flags: /* (?i) */
2343 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2344 /* (?g), (?gc) and (?o) are useless here
2345 and must be globally applied -- japhy */
2347 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2349 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2350 if (! (wastedflags & wflagbit) ) {
2351 wastedflags |= wflagbit;
2354 "Useless (%s%c) - %suse /%c modifier",
2355 flagsp == &negflags ? "?-" : "?",
2357 flagsp == &negflags ? "don't " : "",
2363 else if (*RExC_parse == 'c') {
2364 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2365 if (! (wastedflags & wasted_c) ) {
2366 wastedflags |= wasted_gc;
2369 "Useless (%sc) - %suse /gc modifier",
2370 flagsp == &negflags ? "?-" : "?",
2371 flagsp == &negflags ? "don't " : ""
2376 else { pmflag(flagsp, *RExC_parse); }
2380 if (*RExC_parse == '-') {
2382 wastedflags = 0; /* reset so (?g-c) warns twice */
2386 RExC_flags |= posflags;
2387 RExC_flags &= ~negflags;
2388 if (*RExC_parse == ':') {
2394 if (*RExC_parse != ')') {
2396 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2398 nextchar(pRExC_state);
2406 ret = reganode(pRExC_state, OPEN, parno);
2407 Set_Node_Length(ret, 1); /* MJD */
2408 Set_Node_Offset(ret, RExC_parse); /* MJD */
2415 /* Pick up the branches, linking them together. */
2416 parse_start = RExC_parse; /* MJD */
2417 br = regbranch(pRExC_state, &flags, 1);
2418 /* branch_len = (paren != 0); */
2422 if (*RExC_parse == '|') {
2423 if (!SIZE_ONLY && RExC_extralen) {
2424 reginsert(pRExC_state, BRANCHJ, br);
2427 reginsert(pRExC_state, BRANCH, br);
2428 Set_Node_Length(br, paren != 0);
2429 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2433 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2435 else if (paren == ':') {
2436 *flagp |= flags&SIMPLE;
2438 if (open) { /* Starts with OPEN. */
2439 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2441 else if (paren != '?') /* Not Conditional */
2443 *flagp |= flags & (SPSTART | HASWIDTH);
2445 while (*RExC_parse == '|') {
2446 if (!SIZE_ONLY && RExC_extralen) {
2447 ender = reganode(pRExC_state, LONGJMP,0);
2448 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2451 RExC_extralen += 2; /* Account for LONGJMP. */
2452 nextchar(pRExC_state);
2453 br = regbranch(pRExC_state, &flags, 0);
2457 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2461 *flagp |= flags&SPSTART;
2464 if (have_branch || paren != ':') {
2465 /* Make a closing node, and hook it on the end. */
2468 ender = reg_node(pRExC_state, TAIL);
2471 ender = reganode(pRExC_state, CLOSE, parno);
2472 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2473 Set_Node_Length(ender,1); /* MJD */
2479 *flagp &= ~HASWIDTH;
2482 ender = reg_node(pRExC_state, SUCCEED);
2485 ender = reg_node(pRExC_state, END);
2488 regtail(pRExC_state, lastbr, ender);
2491 /* Hook the tails of the branches to the closing node. */
2492 for (br = ret; br != NULL; br = regnext(br)) {
2493 regoptail(pRExC_state, br, ender);
2500 static char parens[] = "=!<,>";
2502 if (paren && (p = strchr(parens, paren))) {
2503 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2504 int flag = (p - parens) > 1;
2507 node = SUSPEND, flag = 0;
2508 reginsert(pRExC_state, node,ret);
2509 Set_Node_Offset(ret, oregcomp_parse);
2510 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 2);
2512 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2516 /* Check for proper termination. */
2518 RExC_flags = oregflags;
2519 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2520 RExC_parse = oregcomp_parse;
2521 vFAIL("Unmatched (");
2524 else if (!paren && RExC_parse < RExC_end) {
2525 if (*RExC_parse == ')') {
2527 vFAIL("Unmatched )");
2530 FAIL("Junk on end of regexp"); /* "Can't happen". */
2538 - regbranch - one alternative of an | operator
2540 * Implements the concatenation operator.
2543 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2545 register regnode *ret;
2546 register regnode *chain = NULL;
2547 register regnode *latest;
2548 I32 flags = 0, c = 0;
2553 if (!SIZE_ONLY && RExC_extralen)
2554 ret = reganode(pRExC_state, BRANCHJ,0);
2556 ret = reg_node(pRExC_state, BRANCH);
2557 Set_Node_Length(ret, 1);
2561 if (!first && SIZE_ONLY)
2562 RExC_extralen += 1; /* BRANCHJ */
2564 *flagp = WORST; /* Tentatively. */
2567 nextchar(pRExC_state);
2568 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2570 latest = regpiece(pRExC_state, &flags);
2571 if (latest == NULL) {
2572 if (flags & TRYAGAIN)
2576 else if (ret == NULL)
2578 *flagp |= flags&HASWIDTH;
2579 if (chain == NULL) /* First piece. */
2580 *flagp |= flags&SPSTART;
2583 regtail(pRExC_state, chain, latest);
2588 if (chain == NULL) { /* Loop ran zero times. */
2589 chain = reg_node(pRExC_state, NOTHING);
2594 *flagp |= flags&SIMPLE;
2601 - regpiece - something followed by possible [*+?]
2603 * Note that the branching code sequences used for ? and the general cases
2604 * of * and + are somewhat optimized: they use the same NOTHING node as
2605 * both the endmarker for their branch list and the body of the last branch.
2606 * It might seem that this node could be dispensed with entirely, but the
2607 * endmarker role is not redundant.
2610 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2612 register regnode *ret;
2614 register char *next;
2616 char *origparse = RExC_parse;
2619 I32 max = REG_INFTY;
2622 ret = regatom(pRExC_state, &flags);
2624 if (flags & TRYAGAIN)
2631 if (op == '{' && regcurly(RExC_parse)) {
2632 parse_start = RExC_parse; /* MJD */
2633 next = RExC_parse + 1;
2635 while (isDIGIT(*next) || *next == ',') {
2644 if (*next == '}') { /* got one */
2648 min = atoi(RExC_parse);
2652 maxpos = RExC_parse;
2654 if (!max && *maxpos != '0')
2655 max = REG_INFTY; /* meaning "infinity" */
2656 else if (max >= REG_INFTY)
2657 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2659 nextchar(pRExC_state);
2662 if ((flags&SIMPLE)) {
2663 RExC_naughty += 2 + RExC_naughty / 2;
2664 reginsert(pRExC_state, CURLY, ret);
2665 Set_Node_Offset(ret, parse_start+1); /* MJD */
2666 Set_Node_Cur_Length(ret);
2669 regnode *w = reg_node(pRExC_state, WHILEM);
2672 regtail(pRExC_state, ret, w);
2673 if (!SIZE_ONLY && RExC_extralen) {
2674 reginsert(pRExC_state, LONGJMP,ret);
2675 reginsert(pRExC_state, NOTHING,ret);
2676 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2678 reginsert(pRExC_state, CURLYX,ret);
2680 Set_Node_Offset(ret, parse_start+1);
2681 Set_Node_Length(ret,
2682 op == '{' ? (RExC_parse - parse_start) : 1);
2684 if (!SIZE_ONLY && RExC_extralen)
2685 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2686 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2688 RExC_whilem_seen++, RExC_extralen += 3;
2689 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2697 if (max && max < min)
2698 vFAIL("Can't do {n,m} with n > m");
2700 ARG1_SET(ret, (U16)min);
2701 ARG2_SET(ret, (U16)max);
2713 #if 0 /* Now runtime fix should be reliable. */
2715 /* if this is reinstated, don't forget to put this back into perldiag:
2717 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2719 (F) The part of the regexp subject to either the * or + quantifier
2720 could match an empty string. The {#} shows in the regular
2721 expression about where the problem was discovered.
2725 if (!(flags&HASWIDTH) && op != '?')
2726 vFAIL("Regexp *+ operand could be empty");
2729 parse_start = RExC_parse;
2730 nextchar(pRExC_state);
2732 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2734 if (op == '*' && (flags&SIMPLE)) {
2735 reginsert(pRExC_state, STAR, ret);
2739 else if (op == '*') {
2743 else if (op == '+' && (flags&SIMPLE)) {
2744 reginsert(pRExC_state, PLUS, ret);
2748 else if (op == '+') {
2752 else if (op == '?') {
2757 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2759 "%.*s matches null string many times",
2760 RExC_parse - origparse,
2764 if (*RExC_parse == '?') {
2765 nextchar(pRExC_state);
2766 reginsert(pRExC_state, MINMOD, ret);
2767 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2769 if (ISMULT2(RExC_parse)) {
2771 vFAIL("Nested quantifiers");
2778 - regatom - the lowest level
2780 * Optimization: gobbles an entire sequence of ordinary characters so that
2781 * it can turn them into a single node, which is smaller to store and
2782 * faster to run. Backslashed characters are exceptions, each becoming a
2783 * separate node; the code is simpler that way and it's not worth fixing.
2785 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2787 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2789 register regnode *ret = 0;
2791 char *parse_start = 0;
2793 *flagp = WORST; /* Tentatively. */
2796 switch (*RExC_parse) {
2798 RExC_seen_zerolen++;
2799 nextchar(pRExC_state);
2800 if (RExC_flags & PMf_MULTILINE)
2801 ret = reg_node(pRExC_state, MBOL);
2802 else if (RExC_flags & PMf_SINGLELINE)
2803 ret = reg_node(pRExC_state, SBOL);
2805 ret = reg_node(pRExC_state, BOL);
2806 Set_Node_Length(ret, 1); /* MJD */
2809 nextchar(pRExC_state);
2811 RExC_seen_zerolen++;
2812 if (RExC_flags & PMf_MULTILINE)
2813 ret = reg_node(pRExC_state, MEOL);
2814 else if (RExC_flags & PMf_SINGLELINE)
2815 ret = reg_node(pRExC_state, SEOL);
2817 ret = reg_node(pRExC_state, EOL);
2818 Set_Node_Length(ret, 1); /* MJD */
2821 nextchar(pRExC_state);
2822 if (RExC_flags & PMf_SINGLELINE)
2823 ret = reg_node(pRExC_state, SANY);
2825 ret = reg_node(pRExC_state, REG_ANY);
2826 *flagp |= HASWIDTH|SIMPLE;
2828 Set_Node_Length(ret, 1); /* MJD */
2832 char *oregcomp_parse = ++RExC_parse;
2833 ret = regclass(pRExC_state);
2834 if (*RExC_parse != ']') {
2835 RExC_parse = oregcomp_parse;
2836 vFAIL("Unmatched [");
2838 nextchar(pRExC_state);
2839 *flagp |= HASWIDTH|SIMPLE;
2840 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2844 nextchar(pRExC_state);
2845 ret = reg(pRExC_state, 1, &flags);
2847 if (flags & TRYAGAIN) {
2848 if (RExC_parse == RExC_end) {
2849 /* Make parent create an empty node if needed. */
2857 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2861 if (flags & TRYAGAIN) {
2865 vFAIL("Internal urp");
2866 /* Supposed to be caught earlier. */
2869 if (!regcurly(RExC_parse)) {
2878 vFAIL("Quantifier follows nothing");
2881 switch (*++RExC_parse) {
2883 RExC_seen_zerolen++;
2884 ret = reg_node(pRExC_state, SBOL);
2886 nextchar(pRExC_state);
2887 Set_Node_Length(ret, 2); /* MJD */
2890 ret = reg_node(pRExC_state, GPOS);
2891 RExC_seen |= REG_SEEN_GPOS;
2893 nextchar(pRExC_state);
2894 Set_Node_Length(ret, 2); /* MJD */
2897 ret = reg_node(pRExC_state, SEOL);
2899 RExC_seen_zerolen++; /* Do not optimize RE away */
2900 nextchar(pRExC_state);
2903 ret = reg_node(pRExC_state, EOS);
2905 RExC_seen_zerolen++; /* Do not optimize RE away */
2906 nextchar(pRExC_state);
2907 Set_Node_Length(ret, 2); /* MJD */
2910 ret = reg_node(pRExC_state, CANY);
2911 RExC_seen |= REG_SEEN_CANY;
2912 *flagp |= HASWIDTH|SIMPLE;
2913 nextchar(pRExC_state);
2914 Set_Node_Length(ret, 2); /* MJD */
2917 ret = reg_node(pRExC_state, CLUMP);
2919 nextchar(pRExC_state);
2920 Set_Node_Length(ret, 2); /* MJD */
2923 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2924 *flagp |= HASWIDTH|SIMPLE;
2925 nextchar(pRExC_state);
2926 Set_Node_Length(ret, 2); /* MJD */
2929 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2930 *flagp |= HASWIDTH|SIMPLE;
2931 nextchar(pRExC_state);
2932 Set_Node_Length(ret, 2); /* MJD */
2935 RExC_seen_zerolen++;
2936 RExC_seen |= REG_SEEN_LOOKBEHIND;
2937 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2939 nextchar(pRExC_state);
2940 Set_Node_Length(ret, 2); /* MJD */
2943 RExC_seen_zerolen++;
2944 RExC_seen |= REG_SEEN_LOOKBEHIND;
2945 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2947 nextchar(pRExC_state);
2948 Set_Node_Length(ret, 2); /* MJD */
2951 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2952 *flagp |= HASWIDTH|SIMPLE;
2953 nextchar(pRExC_state);
2954 Set_Node_Length(ret, 2); /* MJD */
2957 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2958 *flagp |= HASWIDTH|SIMPLE;
2959 nextchar(pRExC_state);
2960 Set_Node_Length(ret, 2); /* MJD */
2963 ret = reg_node(pRExC_state, DIGIT);
2964 *flagp |= HASWIDTH|SIMPLE;
2965 nextchar(pRExC_state);
2966 Set_Node_Length(ret, 2); /* MJD */
2969 ret = reg_node(pRExC_state, NDIGIT);
2970 *flagp |= HASWIDTH|SIMPLE;
2971 nextchar(pRExC_state);
2972 Set_Node_Length(ret, 2); /* MJD */
2977 char* oldregxend = RExC_end;
2978 char* parse_start = RExC_parse - 2;
2980 if (RExC_parse[1] == '{') {
2981 /* a lovely hack--pretend we saw [\pX] instead */
2982 RExC_end = strchr(RExC_parse, '}');
2984 U8 c = (U8)*RExC_parse;
2986 RExC_end = oldregxend;
2987 vFAIL2("Missing right brace on \\%c{}", c);
2992 RExC_end = RExC_parse + 2;
2993 if (RExC_end > oldregxend)
2994 RExC_end = oldregxend;
2998 ret = regclass(pRExC_state);
3000 RExC_end = oldregxend;
3003 Set_Node_Offset(ret, parse_start + 2);
3004 Set_Node_Cur_Length(ret);
3005 nextchar(pRExC_state);
3006 *flagp |= HASWIDTH|SIMPLE;
3019 case '1': case '2': case '3': case '4':
3020 case '5': case '6': case '7': case '8': case '9':
3022 I32 num = atoi(RExC_parse);
3024 if (num > 9 && num >= RExC_npar)
3027 char * parse_start = RExC_parse - 1; /* MJD */
3028 while (isDIGIT(*RExC_parse))
3031 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3032 vFAIL("Reference to nonexistent group");
3034 ret = reganode(pRExC_state,
3035 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3039 /* override incorrect value set in reganode MJD */
3040 Set_Node_Offset(ret, parse_start+1);
3041 Set_Node_Cur_Length(ret); /* MJD */
3043 nextchar(pRExC_state);
3048 if (RExC_parse >= RExC_end)
3049 FAIL("Trailing \\");
3052 /* Do not generate `unrecognized' warnings here, we fall
3053 back into the quick-grab loop below */
3059 if (RExC_flags & PMf_EXTENDED) {
3060 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3061 if (RExC_parse < RExC_end)
3067 register STRLEN len;
3073 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3075 parse_start = RExC_parse - 1;
3081 ret = reg_node(pRExC_state,
3082 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3084 for (len = 0, p = RExC_parse - 1;
3085 len < 127 && p < RExC_end;
3090 if (RExC_flags & PMf_EXTENDED)
3091 p = regwhite(p, RExC_end);
3138 ender = ASCII_TO_NATIVE('\033');
3142 ender = ASCII_TO_NATIVE('\007');
3147 char* e = strchr(p, '}');
3151 vFAIL("Missing right brace on \\x{}");
3154 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3155 | PERL_SCAN_DISALLOW_PREFIX;
3157 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3164 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3166 ender = grok_hex(p, &numlen, &flags, NULL);
3172 ender = UCHARAT(p++);
3173 ender = toCTRL(ender);
3175 case '0': case '1': case '2': case '3':case '4':
3176 case '5': case '6': case '7': case '8':case '9':
3178 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3181 ender = grok_oct(p, &numlen, &flags, NULL);
3191 FAIL("Trailing \\");
3194 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3195 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3196 goto normal_default;
3201 if (UTF8_IS_START(*p) && UTF) {
3202 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3210 if (RExC_flags & PMf_EXTENDED)
3211 p = regwhite(p, RExC_end);
3213 /* Prime the casefolded buffer. */
3214 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3216 if (ISMULT2(p)) { /* Back off on ?+*. */
3223 /* Emit all the Unicode characters. */
3224 for (foldbuf = tmpbuf;
3226 foldlen -= numlen) {
3227 ender = utf8_to_uvchr(foldbuf, &numlen);
3229 reguni(pRExC_state, ender, s, &unilen);
3232 /* In EBCDIC the numlen
3233 * and unilen can differ. */
3235 if (numlen >= foldlen)
3239 break; /* "Can't happen." */
3243 reguni(pRExC_state, ender, s, &unilen);
3252 REGC((char)ender, s++);
3260 /* Emit all the Unicode characters. */
3261 for (foldbuf = tmpbuf;
3263 foldlen -= numlen) {
3264 ender = utf8_to_uvchr(foldbuf, &numlen);
3266 reguni(pRExC_state, ender, s, &unilen);
3269 /* In EBCDIC the numlen
3270 * and unilen can differ. */
3272 if (numlen >= foldlen)
3280 reguni(pRExC_state, ender, s, &unilen);
3289 REGC((char)ender, s++);
3293 Set_Node_Cur_Length(ret); /* MJD */
3294 nextchar(pRExC_state);
3296 /* len is STRLEN which is unsigned, need to copy to signed */
3299 vFAIL("Internal disaster");
3303 if (len == 1 && UNI_IS_INVARIANT(ender))
3308 RExC_size += STR_SZ(len);
3310 RExC_emit += STR_SZ(len);
3315 /* If the encoding pragma is in effect recode the text of
3316 * any EXACT-kind nodes. */
3317 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3318 STRLEN oldlen = STR_LEN(ret);
3319 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3323 if (sv_utf8_downgrade(sv, TRUE)) {
3324 char *s = sv_recode_to_utf8(sv, PL_encoding);
3325 STRLEN newlen = SvCUR(sv);
3330 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3331 (int)oldlen, STRING(ret),
3333 Copy(s, STRING(ret), newlen, char);
3334 STR_LEN(ret) += newlen - oldlen;
3335 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3337 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3345 S_regwhite(pTHX_ char *p, char *e)
3350 else if (*p == '#') {
3353 } while (p < e && *p != '\n');
3361 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3362 Character classes ([:foo:]) can also be negated ([:^foo:]).
3363 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3364 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3365 but trigger failures because they are currently unimplemented. */
3367 #define POSIXCC_DONE(c) ((c) == ':')
3368 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3369 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3372 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3375 I32 namedclass = OOB_NAMEDCLASS;
3377 if (value == '[' && RExC_parse + 1 < RExC_end &&
3378 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3379 POSIXCC(UCHARAT(RExC_parse))) {
3380 char c = UCHARAT(RExC_parse);
3381 char* s = RExC_parse++;
3383 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3385 if (RExC_parse == RExC_end)
3386 /* Grandfather lone [:, [=, [. */
3389 char* t = RExC_parse++; /* skip over the c */
3391 if (UCHARAT(RExC_parse) == ']') {
3392 RExC_parse++; /* skip over the ending ] */
3395 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3396 I32 skip = 5; /* the most common skip */
3400 if (strnEQ(posixcc, "alnum", 5))
3402 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3403 else if (strnEQ(posixcc, "alpha", 5))
3405 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3406 else if (strnEQ(posixcc, "ascii", 5))
3408 complement ? ANYOF_NASCII : ANYOF_ASCII;
3411 if (strnEQ(posixcc, "blank", 5))
3413 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3416 if (strnEQ(posixcc, "cntrl", 5))
3418 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3421 if (strnEQ(posixcc, "digit", 5))
3423 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3426 if (strnEQ(posixcc, "graph", 5))
3428 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3431 if (strnEQ(posixcc, "lower", 5))
3433 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3436 if (strnEQ(posixcc, "print", 5))
3438 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3439 else if (strnEQ(posixcc, "punct", 5))
3441 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3444 if (strnEQ(posixcc, "space", 5))
3446 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3449 if (strnEQ(posixcc, "upper", 5))
3451 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3453 case 'w': /* this is not POSIX, this is the Perl \w */
3454 if (strnEQ(posixcc, "word", 4)) {
3456 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3461 if (strnEQ(posixcc, "xdigit", 6)) {
3463 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3468 if (namedclass == OOB_NAMEDCLASS ||
3469 posixcc[skip] != ':' ||
3470 posixcc[skip+1] != ']')
3472 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3475 } else if (!SIZE_ONLY) {
3476 /* [[=foo=]] and [[.foo.]] are still future. */
3478 /* adjust RExC_parse so the warning shows after
3480 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3482 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3485 /* Maternal grandfather:
3486 * "[:" ending in ":" but not in ":]" */
3496 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3498 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3499 char *s = RExC_parse;
3502 while(*s && isALNUM(*s))
3504 if (*s && c == *s && s[1] == ']') {
3505 if (ckWARN(WARN_REGEXP))
3507 "POSIX syntax [%c %c] belongs inside character classes",
3510 /* [[=foo=]] and [[.foo.]] are still future. */
3511 if (POSIXCC_NOTYET(c)) {
3512 /* adjust RExC_parse so the error shows after
3514 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3516 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3523 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3526 register UV nextvalue;
3527 register IV prevvalue = OOB_UNICODE;
3528 register IV range = 0;
3529 register regnode *ret;
3532 char *rangebegin = 0;
3533 bool need_class = 0;
3534 SV *listsv = Nullsv;
3537 bool optimize_invert = TRUE;
3538 AV* unicode_alternate = 0;
3540 UV literal_endpoint = 0;
3543 ret = reganode(pRExC_state, ANYOF, 0);
3546 ANYOF_FLAGS(ret) = 0;
3548 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3552 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3556 RExC_size += ANYOF_SKIP;
3558 RExC_emit += ANYOF_SKIP;
3560 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3562 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3563 ANYOF_BITMAP_ZERO(ret);
3564 listsv = newSVpvn("# comment\n", 10);
3567 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3569 if (!SIZE_ONLY && POSIXCC(nextvalue))
3570 checkposixcc(pRExC_state);
3572 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3573 if (UCHARAT(RExC_parse) == ']')
3576 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3580 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3583 rangebegin = RExC_parse;
3585 value = utf8n_to_uvchr((U8*)RExC_parse,
3586 RExC_end - RExC_parse,
3588 RExC_parse += numlen;
3591 value = UCHARAT(RExC_parse++);
3592 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3593 if (value == '[' && POSIXCC(nextvalue))
3594 namedclass = regpposixcc(pRExC_state, value);
3595 else if (value == '\\') {
3597 value = utf8n_to_uvchr((U8*)RExC_parse,
3598 RExC_end - RExC_parse,
3600 RExC_parse += numlen;
3603 value = UCHARAT(RExC_parse++);
3604 /* Some compilers cannot handle switching on 64-bit integer
3605 * values, therefore value cannot be an UV. Yes, this will
3606 * be a problem later if we want switch on Unicode.
3607 * A similar issue a little bit later when switching on
3608 * namedclass. --jhi */
3609 switch ((I32)value) {
3610 case 'w': namedclass = ANYOF_ALNUM; break;
3611 case 'W': namedclass = ANYOF_NALNUM; break;
3612 case 's': namedclass = ANYOF_SPACE; break;
3613 case 'S': namedclass = ANYOF_NSPACE; break;
3614 case 'd': namedclass = ANYOF_DIGIT; break;
3615 case 'D': namedclass = ANYOF_NDIGIT; break;
3618 if (RExC_parse >= RExC_end)
3619 vFAIL2("Empty \\%c{}", (U8)value);
3620 if (*RExC_parse == '{') {
3622 e = strchr(RExC_parse++, '}');
3624 vFAIL2("Missing right brace on \\%c{}", c);
3625 while (isSPACE(UCHARAT(RExC_parse)))
3627 if (e == RExC_parse)
3628 vFAIL2("Empty \\%c{}", c);
3630 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3638 if (UCHARAT(RExC_parse) == '^') {
3641 value = value == 'p' ? 'P' : 'p'; /* toggle */
3642 while (isSPACE(UCHARAT(RExC_parse))) {
3648 Perl_sv_catpvf(aTHX_ listsv,
3649 "+utf8::%.*s\n", (int)n, RExC_parse);
3651 Perl_sv_catpvf(aTHX_ listsv,
3652 "!utf8::%.*s\n", (int)n, RExC_parse);
3655 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3657 case 'n': value = '\n'; break;
3658 case 'r': value = '\r'; break;
3659 case 't': value = '\t'; break;
3660 case 'f': value = '\f'; break;
3661 case 'b': value = '\b'; break;
3662 case 'e': value = ASCII_TO_NATIVE('\033');break;
3663 case 'a': value = ASCII_TO_NATIVE('\007');break;
3665 if (*RExC_parse == '{') {
3666 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3667 | PERL_SCAN_DISALLOW_PREFIX;
3668 e = strchr(RExC_parse++, '}');
3670 vFAIL("Missing right brace on \\x{}");
3672 numlen = e - RExC_parse;
3673 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3677 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3679 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3680 RExC_parse += numlen;
3684 value = UCHARAT(RExC_parse++);
3685 value = toCTRL(value);
3687 case '0': case '1': case '2': case '3': case '4':
3688 case '5': case '6': case '7': case '8': case '9':
3692 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3693 RExC_parse += numlen;
3697 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3699 "Unrecognized escape \\%c in character class passed through",
3703 } /* end of \blah */
3709 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3711 if (!SIZE_ONLY && !need_class)
3712 ANYOF_CLASS_ZERO(ret);
3716 /* a bad range like a-\d, a-[:digit:] ? */
3719 if (ckWARN(WARN_REGEXP))
3721 "False [] range \"%*.*s\"",
3722 RExC_parse - rangebegin,
3723 RExC_parse - rangebegin,
3725 if (prevvalue < 256) {
3726 ANYOF_BITMAP_SET(ret, prevvalue);
3727 ANYOF_BITMAP_SET(ret, '-');
3730 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3731 Perl_sv_catpvf(aTHX_ listsv,
3732 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3736 range = 0; /* this was not a true range */
3740 if (namedclass > OOB_NAMEDCLASS)
3741 optimize_invert = FALSE;
3742 /* Possible truncation here but in some 64-bit environments
3743 * the compiler gets heartburn about switch on 64-bit values.
3744 * A similar issue a little earlier when switching on value.
3746 switch ((I32)namedclass) {
3749 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3751 for (value = 0; value < 256; value++)
3753 ANYOF_BITMAP_SET(ret, value);
3755 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3759 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3761 for (value = 0; value < 256; value++)
3762 if (!isALNUM(value))
3763 ANYOF_BITMAP_SET(ret, value);
3765 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3769 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3771 for (value = 0; value < 256; value++)
3772 if (isALNUMC(value))
3773 ANYOF_BITMAP_SET(ret, value);
3775 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3779 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3781 for (value = 0; value < 256; value++)
3782 if (!isALNUMC(value))
3783 ANYOF_BITMAP_SET(ret, value);
3785 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3789 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3791 for (value = 0; value < 256; value++)
3793 ANYOF_BITMAP_SET(ret, value);
3795 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3799 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3801 for (value = 0; value < 256; value++)
3802 if (!isALPHA(value))
3803 ANYOF_BITMAP_SET(ret, value);
3805 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3809 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3812 for (value = 0; value < 128; value++)
3813 ANYOF_BITMAP_SET(ret, value);
3815 for (value = 0; value < 256; value++) {
3817 ANYOF_BITMAP_SET(ret, value);
3821 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3825 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3828 for (value = 128; value < 256; value++)
3829 ANYOF_BITMAP_SET(ret, value);
3831 for (value = 0; value < 256; value++) {
3832 if (!isASCII(value))
3833 ANYOF_BITMAP_SET(ret, value);
3837 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3841 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3843 for (value = 0; value < 256; value++)
3845 ANYOF_BITMAP_SET(ret, value);
3847 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3851 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3853 for (value = 0; value < 256; value++)
3854 if (!isBLANK(value))
3855 ANYOF_BITMAP_SET(ret, value);
3857 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3861 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3863 for (value = 0; value < 256; value++)
3865 ANYOF_BITMAP_SET(ret, value);
3867 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3871 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3873 for (value = 0; value < 256; value++)
3874 if (!isCNTRL(value))
3875 ANYOF_BITMAP_SET(ret, value);
3877 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3881 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3883 /* consecutive digits assumed */
3884 for (value = '0'; value <= '9'; value++)
3885 ANYOF_BITMAP_SET(ret, value);
3887 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3891 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3893 /* consecutive digits assumed */
3894 for (value = 0; value < '0'; value++)
3895 ANYOF_BITMAP_SET(ret, value);
3896 for (value = '9' + 1; value < 256; value++)
3897 ANYOF_BITMAP_SET(ret, value);
3899 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3903 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3905 for (value = 0; value < 256; value++)
3907 ANYOF_BITMAP_SET(ret, value);
3909 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3913 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3915 for (value = 0; value < 256; value++)
3916 if (!isGRAPH(value))
3917 ANYOF_BITMAP_SET(ret, value);
3919 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3923 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3925 for (value = 0; value < 256; value++)
3927 ANYOF_BITMAP_SET(ret, value);
3929 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3933 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3935 for (value = 0; value < 256; value++)
3936 if (!isLOWER(value))
3937 ANYOF_BITMAP_SET(ret, value);
3939 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3943 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3945 for (value = 0; value < 256; value++)
3947 ANYOF_BITMAP_SET(ret, value);
3949 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3953 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3955 for (value = 0; value < 256; value++)
3956 if (!isPRINT(value))
3957 ANYOF_BITMAP_SET(ret, value);
3959 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3963 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3965 for (value = 0; value < 256; value++)
3966 if (isPSXSPC(value))
3967 ANYOF_BITMAP_SET(ret, value);
3969 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3973 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3975 for (value = 0; value < 256; value++)
3976 if (!isPSXSPC(value))
3977 ANYOF_BITMAP_SET(ret, value);
3979 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3983 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3985 for (value = 0; value < 256; value++)
3987 ANYOF_BITMAP_SET(ret, value);
3989 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
3993 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
3995 for (value = 0; value < 256; value++)
3996 if (!isPUNCT(value))
3997 ANYOF_BITMAP_SET(ret, value);
3999 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4003 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4005 for (value = 0; value < 256; value++)
4007 ANYOF_BITMAP_SET(ret, value);
4009 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4013 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4015 for (value = 0; value < 256; value++)
4016 if (!isSPACE(value))
4017 ANYOF_BITMAP_SET(ret, value);
4019 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4023 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4025 for (value = 0; value < 256; value++)
4027 ANYOF_BITMAP_SET(ret, value);
4029 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4033 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4035 for (value = 0; value < 256; value++)
4036 if (!isUPPER(value))
4037 ANYOF_BITMAP_SET(ret, value);
4039 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4043 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4045 for (value = 0; value < 256; value++)
4046 if (isXDIGIT(value))
4047 ANYOF_BITMAP_SET(ret, value);
4049 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4053 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4055 for (value = 0; value < 256; value++)
4056 if (!isXDIGIT(value))
4057 ANYOF_BITMAP_SET(ret, value);
4059 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4062 vFAIL("Invalid [::] class");
4066 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4069 } /* end of namedclass \blah */
4072 if (prevvalue > (IV)value) /* b-a */ {
4073 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4074 RExC_parse - rangebegin,
4075 RExC_parse - rangebegin,
4077 range = 0; /* not a valid range */
4081 prevvalue = value; /* save the beginning of the range */
4082 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4083 RExC_parse[1] != ']') {
4086 /* a bad range like \w-, [:word:]- ? */
4087 if (namedclass > OOB_NAMEDCLASS) {
4088 if (ckWARN(WARN_REGEXP))
4090 "False [] range \"%*.*s\"",
4091 RExC_parse - rangebegin,
4092 RExC_parse - rangebegin,
4095 ANYOF_BITMAP_SET(ret, '-');
4097 range = 1; /* yeah, it's a range! */
4098 continue; /* but do it the next time */
4102 /* now is the next time */
4106 if (prevvalue < 256) {
4107 IV ceilvalue = value < 256 ? value : 255;
4110 /* In EBCDIC [\x89-\x91] should include
4111 * the \x8e but [i-j] should not. */
4112 if (literal_endpoint == 2 &&
4113 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4114 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4116 if (isLOWER(prevvalue)) {
4117 for (i = prevvalue; i <= ceilvalue; i++)
4119 ANYOF_BITMAP_SET(ret, i);
4121 for (i = prevvalue; i <= ceilvalue; i++)
4123 ANYOF_BITMAP_SET(ret, i);
4128 for (i = prevvalue; i <= ceilvalue; i++)
4129 ANYOF_BITMAP_SET(ret, i);
4131 if (value > 255 || UTF) {
4132 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4133 UV natvalue = NATIVE_TO_UNI(value);
4135 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4136 if (prevnatvalue < natvalue) { /* what about > ? */
4137 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4138 prevnatvalue, natvalue);
4140 else if (prevnatvalue == natvalue) {
4141 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4143 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4145 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4147 /* If folding and foldable and a single
4148 * character, insert also the folded version
4149 * to the charclass. */
4151 if (foldlen == (STRLEN)UNISKIP(f))
4152 Perl_sv_catpvf(aTHX_ listsv,
4155 /* Any multicharacter foldings
4156 * require the following transform:
4157 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4158 * where E folds into "pq" and F folds
4159 * into "rst", all other characters
4160 * fold to single characters. We save
4161 * away these multicharacter foldings,
4162 * to be later saved as part of the
4163 * additional "s" data. */
4166 if (!unicode_alternate)
4167 unicode_alternate = newAV();
4168 sv = newSVpvn((char*)foldbuf, foldlen);
4170 av_push(unicode_alternate, sv);
4174 /* If folding and the value is one of the Greek
4175 * sigmas insert a few more sigmas to make the
4176 * folding rules of the sigmas to work right.
4177 * Note that not all the possible combinations
4178 * are handled here: some of them are handled
4179 * by the standard folding rules, and some of
4180 * them (literal or EXACTF cases) are handled
4181 * during runtime in regexec.c:S_find_byclass(). */
4182 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4183 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4184 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4185 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4186 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4188 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4189 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4190 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4195 literal_endpoint = 0;
4199 range = 0; /* this range (if it was one) is done now */
4203 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4205 RExC_size += ANYOF_CLASS_ADD_SKIP;
4207 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4210 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4212 /* If the only flag is folding (plus possibly inversion). */
4213 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4215 for (value = 0; value < 256; ++value) {
4216 if (ANYOF_BITMAP_TEST(ret, value)) {
4217 UV fold = PL_fold[value];
4220 ANYOF_BITMAP_SET(ret, fold);
4223 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4226 /* optimize inverted simple patterns (e.g. [^a-z]) */
4227 if (!SIZE_ONLY && optimize_invert &&
4228 /* If the only flag is inversion. */
4229 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4230 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4231 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4232 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4239 /* The 0th element stores the character class description
4240 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4241 * to initialize the appropriate swash (which gets stored in
4242 * the 1st element), and also useful for dumping the regnode.
4243 * The 2nd element stores the multicharacter foldings,
4244 * used later (regexec.c:S_reginclass()). */
4245 av_store(av, 0, listsv);
4246 av_store(av, 1, NULL);
4247 av_store(av, 2, (SV*)unicode_alternate);
4248 rv = newRV_noinc((SV*)av);
4249 n = add_data(pRExC_state, 1, "s");
4250 RExC_rx->data->data[n] = (void*)rv;
4258 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4260 char* retval = RExC_parse++;
4263 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4264 RExC_parse[2] == '#') {
4265 while (*RExC_parse != ')') {
4266 if (RExC_parse == RExC_end)
4267 FAIL("Sequence (?#... not terminated");
4273 if (RExC_flags & PMf_EXTENDED) {
4274 if (isSPACE(*RExC_parse)) {
4278 else if (*RExC_parse == '#') {
4279 while (RExC_parse < RExC_end)
4280 if (*RExC_parse++ == '\n') break;
4289 - reg_node - emit a node
4291 STATIC regnode * /* Location. */
4292 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4294 register regnode *ret;
4295 register regnode *ptr;
4299 SIZE_ALIGN(RExC_size);
4304 NODE_ALIGN_FILL(ret);
4306 FILL_ADVANCE_NODE(ptr, op);
4307 if (RExC_offsets) { /* MJD */
4308 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4309 "reg_node", __LINE__,
4311 RExC_emit - RExC_emit_start > RExC_offsets[0]
4312 ? "Overwriting end of array!\n" : "OK",
4313 RExC_emit - RExC_emit_start,
4314 RExC_parse - RExC_start,
4316 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4325 - reganode - emit a node with an argument
4327 STATIC regnode * /* Location. */
4328 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4330 register regnode *ret;
4331 register regnode *ptr;
4335 SIZE_ALIGN(RExC_size);
4340 NODE_ALIGN_FILL(ret);
4342 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4343 if (RExC_offsets) { /* MJD */
4344 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4348 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4349 "Overwriting end of array!\n" : "OK",
4350 RExC_emit - RExC_emit_start,
4351 RExC_parse - RExC_start,
4353 Set_Cur_Node_Offset;
4362 - reguni - emit (if appropriate) a Unicode character
4365 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4367 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4371 - reginsert - insert an operator in front of already-emitted operand
4373 * Means relocating the operand.
4376 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4378 register regnode *src;
4379 register regnode *dst;
4380 register regnode *place;
4381 register int offset = regarglen[(U8)op];
4383 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4386 RExC_size += NODE_STEP_REGNODE + offset;
4391 RExC_emit += NODE_STEP_REGNODE + offset;
4393 while (src > opnd) {
4394 StructCopy(--src, --dst, regnode);
4395 if (RExC_offsets) { /* MJD 20010112 */
4396 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4400 dst - RExC_emit_start > RExC_offsets[0]
4401 ? "Overwriting end of array!\n" : "OK",
4402 src - RExC_emit_start,
4403 dst - RExC_emit_start,
4405 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4406 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4411 place = opnd; /* Op node, where operand used to be. */
4412 if (RExC_offsets) { /* MJD */
4413 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4417 place - RExC_emit_start > RExC_offsets[0]
4418 ? "Overwriting end of array!\n" : "OK",
4419 place - RExC_emit_start,
4420 RExC_parse - RExC_start,
4422 Set_Node_Offset(place, RExC_parse);
4424 src = NEXTOPER(place);
4425 FILL_ADVANCE_NODE(place, op);
4426 Zero(src, offset, regnode);
4430 - regtail - set the next-pointer at the end of a node chain of p to val.
4433 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4435 register regnode *scan;
4436 register regnode *temp;
4441 /* Find last node. */
4444 temp = regnext(scan);
4450 if (reg_off_by_arg[OP(scan)]) {
4451 ARG_SET(scan, val - scan);
4454 NEXT_OFF(scan) = val - scan;
4459 - regoptail - regtail on operand of first argument; nop if operandless
4462 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4464 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4465 if (p == NULL || SIZE_ONLY)
4467 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4468 regtail(pRExC_state, NEXTOPER(p), val);
4470 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4471 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4478 - regcurly - a little FSA that accepts {\d+,?\d*}
4481 S_regcurly(pTHX_ register char *s)
4502 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4504 register U8 op = EXACT; /* Arbitrary non-END op. */
4505 register regnode *next;
4507 while (op != END && (!last || node < last)) {
4508 /* While that wasn't END last time... */
4514 next = regnext(node);
4516 if (OP(node) == OPTIMIZED)
4519 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4520 (int)(2*l + 1), "", SvPVX(sv));
4521 if (next == NULL) /* Next ptr. */
4522 PerlIO_printf(Perl_debug_log, "(0)");
4524 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4525 (void)PerlIO_putc(Perl_debug_log, '\n');
4527 if (PL_regkind[(U8)op] == BRANCHJ) {
4528 register regnode *nnode = (OP(next) == LONGJMP
4531 if (last && nnode > last)
4533 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4535 else if (PL_regkind[(U8)op] == BRANCH) {
4536 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4538 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4539 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4540 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4542 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4543 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4546 else if ( op == PLUS || op == STAR) {
4547 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4549 else if (op == ANYOF) {
4550 /* arglen 1 + class block */
4551 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4552 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4553 node = NEXTOPER(node);
4555 else if (PL_regkind[(U8)op] == EXACT) {
4556 /* Literal string, where present. */
4557 node += NODE_SZ_STR(node) - 1;
4558 node = NEXTOPER(node);
4561 node = NEXTOPER(node);
4562 node += regarglen[(U8)op];
4564 if (op == CURLYX || op == OPEN)
4566 else if (op == WHILEM)
4572 #endif /* DEBUGGING */
4575 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4578 Perl_regdump(pTHX_ regexp *r)
4581 SV *sv = sv_newmortal();
4583 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4585 /* Header fields of interest. */
4586 if (r->anchored_substr)
4587 PerlIO_printf(Perl_debug_log,
4588 "anchored `%s%.*s%s'%s at %"IVdf" ",
4590 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4591 SvPVX(r->anchored_substr),
4593 SvTAIL(r->anchored_substr) ? "$" : "",
4594 (IV)r->anchored_offset);
4595 else if (r->anchored_utf8)
4596 PerlIO_printf(Perl_debug_log,
4597 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4599 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4600 SvPVX(r->anchored_utf8),
4602 SvTAIL(r->anchored_utf8) ? "$" : "",
4603 (IV)r->anchored_offset);
4604 if (r->float_substr)
4605 PerlIO_printf(Perl_debug_log,
4606 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4608 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4609 SvPVX(r->float_substr),
4611 SvTAIL(r->float_substr) ? "$" : "",
4612 (IV)r->float_min_offset, (UV)r->float_max_offset);
4613 else if (r->float_utf8)
4614 PerlIO_printf(Perl_debug_log,
4615 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4617 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4618 SvPVX(r->float_utf8),
4620 SvTAIL(r->float_utf8) ? "$" : "",
4621 (IV)r->float_min_offset, (UV)r->float_max_offset);
4622 if (r->check_substr || r->check_utf8)
4623 PerlIO_printf(Perl_debug_log,
4624 r->check_substr == r->float_substr
4625 && r->check_utf8 == r->float_utf8
4626 ? "(checking floating" : "(checking anchored");
4627 if (r->reganch & ROPT_NOSCAN)
4628 PerlIO_printf(Perl_debug_log, " noscan");
4629 if (r->reganch & ROPT_CHECK_ALL)
4630 PerlIO_printf(Perl_debug_log, " isall");
4631 if (r->check_substr || r->check_utf8)
4632 PerlIO_printf(Perl_debug_log, ") ");
4634 if (r->regstclass) {
4635 regprop(sv, r->regstclass);
4636 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4638 if (r->reganch & ROPT_ANCH) {
4639 PerlIO_printf(Perl_debug_log, "anchored");
4640 if (r->reganch & ROPT_ANCH_BOL)
4641 PerlIO_printf(Perl_debug_log, "(BOL)");
4642 if (r->reganch & ROPT_ANCH_MBOL)
4643 PerlIO_printf(Perl_debug_log, "(MBOL)");
4644 if (r->reganch & ROPT_ANCH_SBOL)
4645 PerlIO_printf(Perl_debug_log, "(SBOL)");
4646 if (r->reganch & ROPT_ANCH_GPOS)
4647 PerlIO_printf(Perl_debug_log, "(GPOS)");
4648 PerlIO_putc(Perl_debug_log, ' ');
4650 if (r->reganch & ROPT_GPOS_SEEN)
4651 PerlIO_printf(Perl_debug_log, "GPOS ");
4652 if (r->reganch & ROPT_SKIP)
4653 PerlIO_printf(Perl_debug_log, "plus ");
4654 if (r->reganch & ROPT_IMPLICIT)
4655 PerlIO_printf(Perl_debug_log, "implicit ");
4656 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4657 if (r->reganch & ROPT_EVAL_SEEN)
4658 PerlIO_printf(Perl_debug_log, "with eval ");
4659 PerlIO_printf(Perl_debug_log, "\n");
4662 U32 len = r->offsets[0];
4663 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4664 for (i = 1; i <= len; i++)
4665 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4666 (UV)r->offsets[i*2-1],
4667 (UV)r->offsets[i*2]);
4668 PerlIO_printf(Perl_debug_log, "\n");
4670 #endif /* DEBUGGING */
4676 S_put_byte(pTHX_ SV *sv, int c)
4678 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4679 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4680 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4681 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4683 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4686 #endif /* DEBUGGING */
4689 - regprop - printable representation of opcode
4692 Perl_regprop(pTHX_ SV *sv, regnode *o)
4697 sv_setpvn(sv, "", 0);
4698 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4699 /* It would be nice to FAIL() here, but this may be called from
4700 regexec.c, and it would be hard to supply pRExC_state. */
4701 Perl_croak(aTHX_ "Corrupted regexp opcode");
4702 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4704 k = PL_regkind[(U8)OP(o)];
4707 SV *dsv = sv_2mortal(newSVpvn("", 0));
4708 /* Using is_utf8_string() is a crude hack but it may
4709 * be the best for now since we have no flag "this EXACTish
4710 * node was UTF-8" --jhi */
4711 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4713 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4714 UNI_DISPLAY_REGEX) :
4719 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4724 else if (k == CURLY) {
4725 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4726 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4727 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4729 else if (k == WHILEM && o->flags) /* Ordinal/of */
4730 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4731 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4732 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4733 else if (k == LOGICAL)
4734 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4735 else if (k == ANYOF) {
4736 int i, rangestart = -1;
4737 U8 flags = ANYOF_FLAGS(o);
4738 const char * const anyofs[] = { /* Should be synchronized with
4739 * ANYOF_ #xdefines in regcomp.h */
4772 if (flags & ANYOF_LOCALE)
4773 sv_catpv(sv, "{loc}");
4774 if (flags & ANYOF_FOLD)
4775 sv_catpv(sv, "{i}");
4776 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4777 if (flags & ANYOF_INVERT)
4779 for (i = 0; i <= 256; i++) {
4780 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4781 if (rangestart == -1)
4783 } else if (rangestart != -1) {
4784 if (i <= rangestart + 3)
4785 for (; rangestart < i; rangestart++)
4786 put_byte(sv, rangestart);
4788 put_byte(sv, rangestart);
4790 put_byte(sv, i - 1);
4796 if (o->flags & ANYOF_CLASS)
4797 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4798 if (ANYOF_CLASS_TEST(o,i))
4799 sv_catpv(sv, anyofs[i]);
4801 if (flags & ANYOF_UNICODE)
4802 sv_catpv(sv, "{unicode}");
4803 else if (flags & ANYOF_UNICODE_ALL)
4804 sv_catpv(sv, "{unicode_all}");
4808 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4812 U8 s[UTF8_MAXLEN+1];
4814 for (i = 0; i <= 256; i++) { /* just the first 256 */
4815 U8 *e = uvchr_to_utf8(s, i);
4817 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4818 if (rangestart == -1)
4820 } else if (rangestart != -1) {
4823 if (i <= rangestart + 3)
4824 for (; rangestart < i; rangestart++) {
4825 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4829 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4832 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4839 sv_catpv(sv, "..."); /* et cetera */
4843 char *s = savepv(SvPVX(lv));
4846 while(*s && *s != '\n') s++;
4867 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4869 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4870 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4871 #endif /* DEBUGGING */
4875 Perl_re_intuit_string(pTHX_ regexp *prog)
4876 { /* Assume that RE_INTUIT is set */
4879 char *s = SvPV(prog->check_substr
4880 ? prog->check_substr : prog->check_utf8, n_a);
4882 if (!PL_colorset) reginitcolors();
4883 PerlIO_printf(Perl_debug_log,
4884 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4886 prog->check_substr ? "" : "utf8 ",
4887 PL_colors[5],PL_colors[0],
4890 (strlen(s) > 60 ? "..." : ""));
4893 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4897 Perl_pregfree(pTHX_ struct regexp *r)
4900 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4903 if (!r || (--r->refcnt > 0))
4909 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4910 r->prelen, 60, UNI_DISPLAY_REGEX)
4911 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4915 PerlIO_printf(Perl_debug_log,
4916 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4917 PL_colors[4],PL_colors[5],PL_colors[0],
4920 len > 60 ? "..." : "");
4924 Safefree(r->precomp);
4925 if (r->offsets) /* 20010421 MJD */
4926 Safefree(r->offsets);
4927 if (RX_MATCH_COPIED(r))
4928 Safefree(r->subbeg);
4930 if (r->anchored_substr)
4931 SvREFCNT_dec(r->anchored_substr);
4932 if (r->anchored_utf8)
4933 SvREFCNT_dec(r->anchored_utf8);
4934 if (r->float_substr)
4935 SvREFCNT_dec(r->float_substr);
4937 SvREFCNT_dec(r->float_utf8);
4938 Safefree(r->substrs);
4941 int n = r->data->count;
4942 PAD* new_comppad = NULL;
4946 /* If you add a ->what type here, update the comment in regcomp.h */
4947 switch (r->data->what[n]) {
4949 SvREFCNT_dec((SV*)r->data->data[n]);
4952 Safefree(r->data->data[n]);
4955 new_comppad = (AV*)r->data->data[n];
4958 if (new_comppad == NULL)
4959 Perl_croak(aTHX_ "panic: pregfree comppad");
4960 PAD_SAVE_LOCAL(old_comppad,
4961 /* Watch out for global destruction's random ordering. */
4962 (SvTYPE(new_comppad) == SVt_PVAV) ?
4963 new_comppad : Null(PAD *)
4965 if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
4966 op_free((OP_4tree*)r->data->data[n]);
4969 PAD_RESTORE_LOCAL(old_comppad);
4970 SvREFCNT_dec((SV*)new_comppad);
4976 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
4979 Safefree(r->data->what);
4982 Safefree(r->startp);
4988 - regnext - dig the "next" pointer out of a node
4990 * [Note, when REGALIGN is defined there are two places in regmatch()
4991 * that bypass this code for speed.]
4994 Perl_regnext(pTHX_ register regnode *p)
4996 register I32 offset;
4998 if (p == &PL_regdummy)
5001 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5009 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5012 STRLEN l1 = strlen(pat1);
5013 STRLEN l2 = strlen(pat2);
5022 Copy(pat1, buf, l1 , char);
5023 Copy(pat2, buf + l1, l2 , char);
5024 buf[l1 + l2] = '\n';
5025 buf[l1 + l2 + 1] = '\0';
5027 /* ANSI variant takes additional second argument */
5028 va_start(args, pat2);
5032 msv = vmess(buf, &args);
5034 message = SvPV(msv,l1);
5037 Copy(message, buf, l1 , char);
5038 buf[l1] = '\0'; /* Overwrite \n */
5039 Perl_croak(aTHX_ "%s", buf);
5042 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5045 Perl_save_re_context(pTHX)
5047 SAVEI32(PL_reg_flags); /* from regexec.c */
5049 SAVEPPTR(PL_reginput); /* String-input pointer. */
5050 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5051 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5052 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5053 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5054 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5055 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5056 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5057 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5058 PL_reg_start_tmp = 0;
5059 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5060 PL_reg_start_tmpl = 0;
5061 SAVEVPTR(PL_regdata);
5062 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5063 SAVEI32(PL_regnarrate); /* from regexec.c */
5064 SAVEVPTR(PL_regprogram); /* from regexec.c */
5065 SAVEINT(PL_regindent); /* from regexec.c */
5066 SAVEVPTR(PL_regcc); /* from regexec.c */
5067 SAVEVPTR(PL_curcop);
5068 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5069 SAVEVPTR(PL_reg_re); /* from regexec.c */
5070 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5071 SAVESPTR(PL_reg_sv); /* from regexec.c */
5072 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5073 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5074 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5075 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5076 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5077 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5078 PL_reg_oldsaved = Nullch;
5079 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5080 PL_reg_oldsavedlen = 0;
5081 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5083 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5084 PL_reg_leftiter = 0;
5085 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5086 PL_reg_poscache = Nullch;
5087 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5088 PL_reg_poscache_size = 0;
5089 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5090 SAVEI32(PL_regnpar); /* () count. */
5091 SAVEI32(PL_regsize); /* from regexec.c */
5094 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5100 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5101 for (i = 1; i <= rx->nparens; i++) {
5102 sprintf(digits, "%lu", (long)i);
5103 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5110 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5115 clear_re(pTHX_ void *r)
5117 ReREFCNT_dec((regexp *)r);