5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
664 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665 * These need to be revisited when a newer toolchain becomes available.
667 #if defined(__sparc64__) && defined(__GNUC__)
668 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669 # undef SPARC64_GCC_WORKAROUND
670 # define SPARC64_GCC_WORKAROUND 1
674 /* REx optimizer. Converts nodes into quickier variants "in place".
675 Finds fixed substrings. */
677 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
678 to the position after last scanned or to NULL. */
681 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
682 /* scanp: Start here (read-write). */
683 /* deltap: Write maxlen-minlen here. */
684 /* last: Stop before this one. */
686 I32 min = 0, pars = 0, code;
687 regnode *scan = *scanp, *next;
689 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
690 int is_inf_internal = 0; /* The studied chunk is infinite */
691 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692 scan_data_t data_fake;
693 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
695 while (scan && OP(scan) != END && scan < last) {
696 /* Peephole optimizer: */
698 if (PL_regkind[(U8)OP(scan)] == EXACT) {
699 /* Merge several consecutive EXACTish nodes into one. */
700 regnode *n = regnext(scan);
703 regnode *stop = scan;
706 next = scan + NODE_SZ_STR(scan);
707 /* Skip NOTHING, merge EXACT*. */
709 ( PL_regkind[(U8)OP(n)] == NOTHING ||
710 (stringok && (OP(n) == OP(scan))))
712 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713 if (OP(n) == TAIL || n > next)
715 if (PL_regkind[(U8)OP(n)] == NOTHING) {
716 NEXT_OFF(scan) += NEXT_OFF(n);
717 next = n + NODE_STEP_REGNODE;
725 int oldl = STR_LEN(scan);
726 regnode *nnext = regnext(n);
728 if (oldl + STR_LEN(n) > U8_MAX)
730 NEXT_OFF(scan) += NEXT_OFF(n);
731 STR_LEN(scan) += STR_LEN(n);
732 next = n + NODE_SZ_STR(n);
733 /* Now we can overwrite *n : */
734 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
742 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
744 Two problematic code points in Unicode casefolding of EXACT nodes:
746 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
753 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
754 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
756 This means that in case-insensitive matching (or "loose matching",
757 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758 length of the above casefolded versions) can match a target string
759 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760 This would rather mess up the minimum length computation.
762 What we'll do is to look for the tail four bytes, and then peek
763 at the preceding two bytes to see whether we need to decrease
764 the minimum length by four (six minus two).
766 Thanks to the design of UTF-8, there cannot be false matches:
767 A sequence of valid UTF-8 bytes cannot be a subsequence of
768 another valid sequence of UTF-8 bytes.
771 char *s0 = STRING(scan), *s, *t;
772 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773 char *t0 = "\xcc\x88\xcc\x81";
777 s < s2 && (t = ninstr(s, s1, t0, t1));
779 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
787 n = scan + NODE_SZ_STR(scan);
789 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
797 /* Follow the next-chain of the current node and optimize
798 away all the NOTHINGs from it. */
799 if (OP(scan) != CURLYX) {
800 int max = (reg_off_by_arg[OP(scan)]
802 /* I32 may be smaller than U16 on CRAYs! */
803 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
804 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
808 /* Skip NOTHING and LONGJMP. */
809 while ((n = regnext(n))
810 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
811 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
814 if (reg_off_by_arg[OP(scan)])
817 NEXT_OFF(scan) = off;
819 /* The principal pseudo-switch. Cannot be a switch, since we
820 look into several different things. */
821 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
822 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823 next = regnext(scan);
826 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
827 I32 max1 = 0, min1 = I32_MAX, num = 0;
828 struct regnode_charclass_class accum;
830 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
831 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
832 if (flags & SCF_DO_STCLASS)
833 cl_init_zero(pRExC_state, &accum);
834 while (OP(scan) == code) {
835 I32 deltanext, minnext, f = 0, fake;
836 struct regnode_charclass_class this_class;
841 data_fake.whilem_c = data->whilem_c;
842 data_fake.last_closep = data->last_closep;
845 data_fake.last_closep = &fake;
846 next = regnext(scan);
847 scan = NEXTOPER(scan);
849 scan = NEXTOPER(scan);
850 if (flags & SCF_DO_STCLASS) {
851 cl_init(pRExC_state, &this_class);
852 data_fake.start_class = &this_class;
853 f = SCF_DO_STCLASS_AND;
855 if (flags & SCF_WHILEM_VISITED_POS)
856 f |= SCF_WHILEM_VISITED_POS;
857 /* we suppose the run is continuous, last=next...*/
858 minnext = study_chunk(pRExC_state, &scan, &deltanext,
859 next, &data_fake, f);
862 if (max1 < minnext + deltanext)
863 max1 = minnext + deltanext;
864 if (deltanext == I32_MAX)
865 is_inf = is_inf_internal = 1;
867 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
869 if (data && (data_fake.flags & SF_HAS_EVAL))
870 data->flags |= SF_HAS_EVAL;
872 data->whilem_c = data_fake.whilem_c;
873 if (flags & SCF_DO_STCLASS)
874 cl_or(pRExC_state, &accum, &this_class);
878 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
880 if (flags & SCF_DO_SUBSTR) {
881 data->pos_min += min1;
882 data->pos_delta += max1 - min1;
883 if (max1 != min1 || is_inf)
884 data->longest = &(data->longest_float);
887 delta += max1 - min1;
888 if (flags & SCF_DO_STCLASS_OR) {
889 cl_or(pRExC_state, data->start_class, &accum);
891 cl_and(data->start_class, &and_with);
892 flags &= ~SCF_DO_STCLASS;
895 else if (flags & SCF_DO_STCLASS_AND) {
897 cl_and(data->start_class, &accum);
898 flags &= ~SCF_DO_STCLASS;
901 /* Switch to OR mode: cache the old value of
902 * data->start_class */
903 StructCopy(data->start_class, &and_with,
904 struct regnode_charclass_class);
905 flags &= ~SCF_DO_STCLASS_AND;
906 StructCopy(&accum, data->start_class,
907 struct regnode_charclass_class);
908 flags |= SCF_DO_STCLASS_OR;
909 data->start_class->flags |= ANYOF_EOS;
913 else if (code == BRANCHJ) /* single branch is optimized. */
914 scan = NEXTOPER(NEXTOPER(scan));
915 else /* single branch is optimized. */
916 scan = NEXTOPER(scan);
919 else if (OP(scan) == EXACT) {
920 I32 l = STR_LEN(scan);
921 UV uc = *((U8*)STRING(scan));
923 U8 *s = (U8*)STRING(scan);
924 l = utf8_length(s, s + l);
925 uc = utf8_to_uvchr(s, NULL);
928 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
929 /* The code below prefers earlier match for fixed
930 offset, later match for variable offset. */
931 if (data->last_end == -1) { /* Update the start info. */
932 data->last_start_min = data->pos_min;
933 data->last_start_max = is_inf
934 ? I32_MAX : data->pos_min + data->pos_delta;
936 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
938 SV * sv = data->last_found;
939 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940 mg_find(sv, PERL_MAGIC_utf8) : NULL;
941 if (mg && mg->mg_len >= 0)
942 mg->mg_len += utf8_length((U8*)STRING(scan),
943 (U8*)STRING(scan)+STR_LEN(scan));
946 SvUTF8_on(data->last_found);
947 data->last_end = data->pos_min + l;
948 data->pos_min += l; /* As in the first entry. */
949 data->flags &= ~SF_BEFORE_EOL;
951 if (flags & SCF_DO_STCLASS_AND) {
952 /* Check whether it is compatible with what we know already! */
956 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
957 && !ANYOF_BITMAP_TEST(data->start_class, uc)
958 && (!(data->start_class->flags & ANYOF_FOLD)
959 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
962 ANYOF_CLASS_ZERO(data->start_class);
963 ANYOF_BITMAP_ZERO(data->start_class);
965 ANYOF_BITMAP_SET(data->start_class, uc);
966 data->start_class->flags &= ~ANYOF_EOS;
968 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
970 else if (flags & SCF_DO_STCLASS_OR) {
971 /* false positive possible if the class is case-folded */
973 ANYOF_BITMAP_SET(data->start_class, uc);
975 data->start_class->flags |= ANYOF_UNICODE_ALL;
976 data->start_class->flags &= ~ANYOF_EOS;
977 cl_and(data->start_class, &and_with);
979 flags &= ~SCF_DO_STCLASS;
981 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
982 I32 l = STR_LEN(scan);
983 UV uc = *((U8*)STRING(scan));
985 /* Search for fixed substrings supports EXACT only. */
986 if (flags & SCF_DO_SUBSTR)
987 scan_commit(pRExC_state, data);
989 U8 *s = (U8 *)STRING(scan);
990 l = utf8_length(s, s + l);
991 uc = utf8_to_uvchr(s, NULL);
994 if (data && (flags & SCF_DO_SUBSTR))
996 if (flags & SCF_DO_STCLASS_AND) {
997 /* Check whether it is compatible with what we know already! */
1001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
1003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
1005 ANYOF_CLASS_ZERO(data->start_class);
1006 ANYOF_BITMAP_ZERO(data->start_class);
1008 ANYOF_BITMAP_SET(data->start_class, uc);
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 data->start_class->flags |= ANYOF_FOLD;
1011 if (OP(scan) == EXACTFL)
1012 data->start_class->flags |= ANYOF_LOCALE;
1015 else if (flags & SCF_DO_STCLASS_OR) {
1016 if (data->start_class->flags & ANYOF_FOLD) {
1017 /* false positive possible if the class is case-folded.
1018 Assume that the locale settings are the same... */
1020 ANYOF_BITMAP_SET(data->start_class, uc);
1021 data->start_class->flags &= ~ANYOF_EOS;
1023 cl_and(data->start_class, &and_with);
1025 flags &= ~SCF_DO_STCLASS;
1027 else if (strchr((char*)PL_varies,OP(scan))) {
1028 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1029 I32 f = flags, pos_before = 0;
1030 regnode *oscan = scan;
1031 struct regnode_charclass_class this_class;
1032 struct regnode_charclass_class *oclass = NULL;
1033 I32 next_is_eval = 0;
1035 switch (PL_regkind[(U8)OP(scan)]) {
1036 case WHILEM: /* End of (?:...)* . */
1037 scan = NEXTOPER(scan);
1040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1041 next = NEXTOPER(scan);
1042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1044 maxcount = REG_INFTY;
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1050 if (flags & SCF_DO_SUBSTR)
1055 if (flags & SCF_DO_STCLASS) {
1057 maxcount = REG_INFTY;
1058 next = regnext(scan);
1059 scan = NEXTOPER(scan);
1062 is_inf = is_inf_internal = 1;
1063 scan = regnext(scan);
1064 if (flags & SCF_DO_SUBSTR) {
1065 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1066 data->longest = &(data->longest_float);
1068 goto optimize_curly_tail;
1070 mincount = ARG1(scan);
1071 maxcount = ARG2(scan);
1072 next = regnext(scan);
1073 if (OP(scan) == CURLYX) {
1074 I32 lp = (data ? *(data->last_closep) : 0);
1076 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1078 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1079 next_is_eval = (OP(scan) == EVAL);
1081 if (flags & SCF_DO_SUBSTR) {
1082 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1083 pos_before = data->pos_min;
1087 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1089 data->flags |= SF_IS_INF;
1091 if (flags & SCF_DO_STCLASS) {
1092 cl_init(pRExC_state, &this_class);
1093 oclass = data->start_class;
1094 data->start_class = &this_class;
1095 f |= SCF_DO_STCLASS_AND;
1096 f &= ~SCF_DO_STCLASS_OR;
1098 /* These are the cases when once a subexpression
1099 fails at a particular position, it cannot succeed
1100 even after backtracking at the enclosing scope.
1102 XXXX what if minimal match and we are at the
1103 initial run of {n,m}? */
1104 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105 f &= ~SCF_WHILEM_VISITED_POS;
1107 /* This will finish on WHILEM, setting scan, or on NULL: */
1108 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1110 ? (f & ~SCF_DO_SUBSTR) : f);
1112 if (flags & SCF_DO_STCLASS)
1113 data->start_class = oclass;
1114 if (mincount == 0 || minnext == 0) {
1115 if (flags & SCF_DO_STCLASS_OR) {
1116 cl_or(pRExC_state, data->start_class, &this_class);
1118 else if (flags & SCF_DO_STCLASS_AND) {
1119 /* Switch to OR mode: cache the old value of
1120 * data->start_class */
1121 StructCopy(data->start_class, &and_with,
1122 struct regnode_charclass_class);
1123 flags &= ~SCF_DO_STCLASS_AND;
1124 StructCopy(&this_class, data->start_class,
1125 struct regnode_charclass_class);
1126 flags |= SCF_DO_STCLASS_OR;
1127 data->start_class->flags |= ANYOF_EOS;
1129 } else { /* Non-zero len */
1130 if (flags & SCF_DO_STCLASS_OR) {
1131 cl_or(pRExC_state, data->start_class, &this_class);
1132 cl_and(data->start_class, &and_with);
1134 else if (flags & SCF_DO_STCLASS_AND)
1135 cl_and(data->start_class, &this_class);
1136 flags &= ~SCF_DO_STCLASS;
1138 if (!scan) /* It was not CURLYX, but CURLY. */
1140 if (ckWARN(WARN_REGEXP)
1141 /* ? quantifier ok, except for (?{ ... }) */
1142 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1143 && (minnext == 0) && (deltanext == 0)
1144 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1145 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1148 "Quantifier unexpected on zero-length expression");
1151 min += minnext * mincount;
1152 is_inf_internal |= ((maxcount == REG_INFTY
1153 && (minnext + deltanext) > 0)
1154 || deltanext == I32_MAX);
1155 is_inf |= is_inf_internal;
1156 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1158 /* Try powerful optimization CURLYX => CURLYN. */
1159 if ( OP(oscan) == CURLYX && data
1160 && data->flags & SF_IN_PAR
1161 && !(data->flags & SF_HAS_EVAL)
1162 && !deltanext && minnext == 1 ) {
1163 /* Try to optimize to CURLYN. */
1164 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1165 regnode *nxt1 = nxt;
1172 if (!strchr((char*)PL_simple,OP(nxt))
1173 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1174 && STR_LEN(nxt) == 1))
1180 if (OP(nxt) != CLOSE)
1182 /* Now we know that nxt2 is the only contents: */
1183 oscan->flags = (U8)ARG(nxt);
1185 OP(nxt1) = NOTHING; /* was OPEN. */
1187 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1191 OP(nxt + 1) = OPTIMIZED; /* was count. */
1192 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1197 /* Try optimization CURLYX => CURLYM. */
1198 if ( OP(oscan) == CURLYX && data
1199 && !(data->flags & SF_HAS_PAR)
1200 && !(data->flags & SF_HAS_EVAL)
1201 && !deltanext /* atom is fixed width */
1202 && minnext != 0 /* CURLYM can't handle zero width */
1204 /* XXXX How to optimize if data == 0? */
1205 /* Optimize to a simpler form. */
1206 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1210 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1211 && (OP(nxt2) != WHILEM))
1213 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1214 /* Need to optimize away parenths. */
1215 if (data->flags & SF_IN_PAR) {
1216 /* Set the parenth number. */
1217 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1219 if (OP(nxt) != CLOSE)
1220 FAIL("Panic opt close");
1221 oscan->flags = (U8)ARG(nxt);
1222 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1223 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1225 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226 OP(nxt + 1) = OPTIMIZED; /* was count. */
1227 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1231 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232 regnode *nnxt = regnext(nxt1);
1235 if (reg_off_by_arg[OP(nxt1)])
1236 ARG_SET(nxt1, nxt2 - nxt1);
1237 else if (nxt2 - nxt1 < U16_MAX)
1238 NEXT_OFF(nxt1) = nxt2 - nxt1;
1240 OP(nxt) = NOTHING; /* Cannot beautify */
1245 /* Optimize again: */
1246 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1252 else if ((OP(oscan) == CURLYX)
1253 && (flags & SCF_WHILEM_VISITED_POS)
1254 /* See the comment on a similar expression above.
1255 However, this time it not a subexpression
1256 we care about, but the expression itself. */
1257 && (maxcount == REG_INFTY)
1258 && data && ++data->whilem_c < 16) {
1259 /* This stays as CURLYX, we can put the count/of pair. */
1260 /* Find WHILEM (as in regexec.c) */
1261 regnode *nxt = oscan + NEXT_OFF(oscan);
1263 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1265 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266 | (RExC_whilem_seen << 4)); /* On WHILEM */
1268 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1270 if (flags & SCF_DO_SUBSTR) {
1271 SV *last_str = Nullsv;
1272 int counted = mincount != 0;
1274 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1275 #if defined(SPARC64_GCC_WORKAROUND)
1281 if (pos_before >= data->last_start_min)
1284 b = data->last_start_min;
1287 s = SvPV(data->last_found, l);
1288 old = b - data->last_start_min;
1291 I32 b = pos_before >= data->last_start_min
1292 ? pos_before : data->last_start_min;
1294 char *s = SvPV(data->last_found, l);
1295 I32 old = b - data->last_start_min;
1299 old = utf8_hop((U8*)s, old) - (U8*)s;
1302 /* Get the added string: */
1303 last_str = newSVpvn(s + old, l);
1305 SvUTF8_on(last_str);
1306 if (deltanext == 0 && pos_before == b) {
1307 /* What was added is a constant string */
1309 SvGROW(last_str, (mincount * l) + 1);
1310 repeatcpy(SvPVX(last_str) + l,
1311 SvPVX(last_str), l, mincount - 1);
1312 SvCUR(last_str) *= mincount;
1313 /* Add additional parts. */
1314 SvCUR_set(data->last_found,
1315 SvCUR(data->last_found) - l);
1316 sv_catsv(data->last_found, last_str);
1318 SV * sv = data->last_found;
1320 SvUTF8(sv) && SvMAGICAL(sv) ?
1321 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322 if (mg && mg->mg_len >= 0)
1323 mg->mg_len += CHR_SVLEN(last_str);
1325 data->last_end += l * (mincount - 1);
1328 /* start offset must point into the last copy */
1329 data->last_start_min += minnext * (mincount - 1);
1330 data->last_start_max += is_inf ? I32_MAX
1331 : (maxcount - 1) * (minnext + data->pos_delta);
1334 /* It is counted once already... */
1335 data->pos_min += minnext * (mincount - counted);
1336 data->pos_delta += - counted * deltanext +
1337 (minnext + deltanext) * maxcount - minnext * mincount;
1338 if (mincount != maxcount) {
1339 /* Cannot extend fixed substrings found inside
1341 scan_commit(pRExC_state,data);
1342 if (mincount && last_str) {
1343 sv_setsv(data->last_found, last_str);
1344 data->last_end = data->pos_min;
1345 data->last_start_min =
1346 data->pos_min - CHR_SVLEN(last_str);
1347 data->last_start_max = is_inf
1349 : data->pos_min + data->pos_delta
1350 - CHR_SVLEN(last_str);
1352 data->longest = &(data->longest_float);
1354 SvREFCNT_dec(last_str);
1356 if (data && (fl & SF_HAS_EVAL))
1357 data->flags |= SF_HAS_EVAL;
1358 optimize_curly_tail:
1359 if (OP(oscan) != CURLYX) {
1360 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1362 NEXT_OFF(oscan) += NEXT_OFF(next);
1365 default: /* REF and CLUMP only? */
1366 if (flags & SCF_DO_SUBSTR) {
1367 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1368 data->longest = &(data->longest_float);
1370 is_inf = is_inf_internal = 1;
1371 if (flags & SCF_DO_STCLASS_OR)
1372 cl_anything(pRExC_state, data->start_class);
1373 flags &= ~SCF_DO_STCLASS;
1377 else if (strchr((char*)PL_simple,OP(scan))) {
1380 if (flags & SCF_DO_SUBSTR) {
1381 scan_commit(pRExC_state,data);
1385 if (flags & SCF_DO_STCLASS) {
1386 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1388 /* Some of the logic below assumes that switching
1389 locale on will only add false positives. */
1390 switch (PL_regkind[(U8)OP(scan)]) {
1394 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1396 cl_anything(pRExC_state, data->start_class);
1399 if (OP(scan) == SANY)
1401 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403 || (data->start_class->flags & ANYOF_CLASS));
1404 cl_anything(pRExC_state, data->start_class);
1406 if (flags & SCF_DO_STCLASS_AND || !value)
1407 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1410 if (flags & SCF_DO_STCLASS_AND)
1411 cl_and(data->start_class,
1412 (struct regnode_charclass_class*)scan);
1414 cl_or(pRExC_state, data->start_class,
1415 (struct regnode_charclass_class*)scan);
1418 if (flags & SCF_DO_STCLASS_AND) {
1419 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421 for (value = 0; value < 256; value++)
1422 if (!isALNUM(value))
1423 ANYOF_BITMAP_CLEAR(data->start_class, value);
1427 if (data->start_class->flags & ANYOF_LOCALE)
1428 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1430 for (value = 0; value < 256; value++)
1432 ANYOF_BITMAP_SET(data->start_class, value);
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (data->start_class->flags & ANYOF_LOCALE)
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443 data->start_class->flags |= ANYOF_LOCALE;
1447 if (flags & SCF_DO_STCLASS_AND) {
1448 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450 for (value = 0; value < 256; value++)
1452 ANYOF_BITMAP_CLEAR(data->start_class, value);
1456 if (data->start_class->flags & ANYOF_LOCALE)
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1459 for (value = 0; value < 256; value++)
1460 if (!isALNUM(value))
1461 ANYOF_BITMAP_SET(data->start_class, value);
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (data->start_class->flags & ANYOF_LOCALE)
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1471 data->start_class->flags |= ANYOF_LOCALE;
1472 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1476 if (flags & SCF_DO_STCLASS_AND) {
1477 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479 for (value = 0; value < 256; value++)
1480 if (!isSPACE(value))
1481 ANYOF_BITMAP_CLEAR(data->start_class, value);
1485 if (data->start_class->flags & ANYOF_LOCALE)
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1488 for (value = 0; value < 256; value++)
1490 ANYOF_BITMAP_SET(data->start_class, value);
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (data->start_class->flags & ANYOF_LOCALE)
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1500 data->start_class->flags |= ANYOF_LOCALE;
1501 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1505 if (flags & SCF_DO_STCLASS_AND) {
1506 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508 for (value = 0; value < 256; value++)
1510 ANYOF_BITMAP_CLEAR(data->start_class, value);
1514 if (data->start_class->flags & ANYOF_LOCALE)
1515 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
1519 ANYOF_BITMAP_SET(data->start_class, value);
1524 if (flags & SCF_DO_STCLASS_AND) {
1525 if (data->start_class->flags & ANYOF_LOCALE) {
1526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527 for (value = 0; value < 256; value++)
1528 if (!isSPACE(value))
1529 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 data->start_class->flags |= ANYOF_LOCALE;
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1538 if (flags & SCF_DO_STCLASS_AND) {
1539 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
1542 ANYOF_BITMAP_CLEAR(data->start_class, value);
1545 if (data->start_class->flags & ANYOF_LOCALE)
1546 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1548 for (value = 0; value < 256; value++)
1550 ANYOF_BITMAP_SET(data->start_class, value);
1555 if (flags & SCF_DO_STCLASS_AND) {
1556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557 for (value = 0; value < 256; value++)
1559 ANYOF_BITMAP_CLEAR(data->start_class, value);
1562 if (data->start_class->flags & ANYOF_LOCALE)
1563 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1565 for (value = 0; value < 256; value++)
1566 if (!isDIGIT(value))
1567 ANYOF_BITMAP_SET(data->start_class, value);
1572 if (flags & SCF_DO_STCLASS_OR)
1573 cl_and(data->start_class, &and_with);
1574 flags &= ~SCF_DO_STCLASS;
1577 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1578 data->flags |= (OP(scan) == MEOL
1582 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1583 /* Lookbehind, or need to calculate parens/evals/stclass: */
1584 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1585 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1586 /* Lookahead/lookbehind */
1587 I32 deltanext, minnext, fake = 0;
1589 struct regnode_charclass_class intrnl;
1592 data_fake.flags = 0;
1594 data_fake.whilem_c = data->whilem_c;
1595 data_fake.last_closep = data->last_closep;
1598 data_fake.last_closep = &fake;
1599 if ( flags & SCF_DO_STCLASS && !scan->flags
1600 && OP(scan) == IFMATCH ) { /* Lookahead */
1601 cl_init(pRExC_state, &intrnl);
1602 data_fake.start_class = &intrnl;
1603 f |= SCF_DO_STCLASS_AND;
1605 if (flags & SCF_WHILEM_VISITED_POS)
1606 f |= SCF_WHILEM_VISITED_POS;
1607 next = regnext(scan);
1608 nscan = NEXTOPER(NEXTOPER(scan));
1609 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1612 vFAIL("Variable length lookbehind not implemented");
1614 else if (minnext > U8_MAX) {
1615 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1617 scan->flags = (U8)minnext;
1619 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1621 if (data && (data_fake.flags & SF_HAS_EVAL))
1622 data->flags |= SF_HAS_EVAL;
1624 data->whilem_c = data_fake.whilem_c;
1625 if (f & SCF_DO_STCLASS_AND) {
1626 int was = (data->start_class->flags & ANYOF_EOS);
1628 cl_and(data->start_class, &intrnl);
1630 data->start_class->flags |= ANYOF_EOS;
1633 else if (OP(scan) == OPEN) {
1636 else if (OP(scan) == CLOSE) {
1637 if ((I32)ARG(scan) == is_par) {
1638 next = regnext(scan);
1640 if ( next && (OP(next) != WHILEM) && next < last)
1641 is_par = 0; /* Disable optimization */
1644 *(data->last_closep) = ARG(scan);
1646 else if (OP(scan) == EVAL) {
1648 data->flags |= SF_HAS_EVAL;
1650 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1651 if (flags & SCF_DO_SUBSTR) {
1652 scan_commit(pRExC_state,data);
1653 data->longest = &(data->longest_float);
1655 is_inf = is_inf_internal = 1;
1656 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1657 cl_anything(pRExC_state, data->start_class);
1658 flags &= ~SCF_DO_STCLASS;
1660 /* Else: zero-length, ignore. */
1661 scan = regnext(scan);
1666 *deltap = is_inf_internal ? I32_MAX : delta;
1667 if (flags & SCF_DO_SUBSTR && is_inf)
1668 data->pos_delta = I32_MAX - data->pos_min;
1669 if (is_par > U8_MAX)
1671 if (is_par && pars==1 && data) {
1672 data->flags |= SF_IN_PAR;
1673 data->flags &= ~SF_HAS_PAR;
1675 else if (pars && data) {
1676 data->flags |= SF_HAS_PAR;
1677 data->flags &= ~SF_IN_PAR;
1679 if (flags & SCF_DO_STCLASS_OR)
1680 cl_and(data->start_class, &and_with);
1685 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1687 if (RExC_rx->data) {
1688 Renewc(RExC_rx->data,
1689 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1690 char, struct reg_data);
1691 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692 RExC_rx->data->count += n;
1695 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1696 char, struct reg_data);
1697 New(1208, RExC_rx->data->what, n, U8);
1698 RExC_rx->data->count = n;
1700 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701 return RExC_rx->data->count - n;
1705 Perl_reginitcolors(pTHX)
1708 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1711 PL_colors[0] = s = savepv(s);
1713 s = strchr(s, '\t');
1719 PL_colors[i] = s = "";
1723 PL_colors[i++] = "";
1730 - pregcomp - compile a regular expression into internal code
1732 * We can't allocate space until we know how big the compiled form will be,
1733 * but we can't compile it (and thus know how big it is) until we've got a
1734 * place to put the code. So we cheat: we compile it twice, once with code
1735 * generation turned off and size counting turned on, and once "for real".
1736 * This also means that we don't allocate space until we are sure that the
1737 * thing really will compile successfully, and we never have to move the
1738 * code and thus invalidate pointers into it. (Note that it has to be in
1739 * one piece because free() must be able to free it all.) [NB: not true in perl]
1741 * Beware that the optimization-preparation code in here knows about some
1742 * of the structure of the compiled regexp. [I'll say.]
1745 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1755 RExC_state_t RExC_state;
1756 RExC_state_t *pRExC_state = &RExC_state;
1759 FAIL("NULL regexp argument");
1761 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1765 if (!PL_colorset) reginitcolors();
1766 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1767 PL_colors[4],PL_colors[5],PL_colors[0],
1768 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1770 RExC_flags = pm->op_pmflags;
1774 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1775 RExC_seen_evals = 0;
1778 /* First pass: determine size, legality. */
1785 RExC_emit = &PL_regdummy;
1786 RExC_whilem_seen = 0;
1787 #if 0 /* REGC() is (currently) a NOP at the first pass.
1788 * Clever compilers notice this and complain. --jhi */
1789 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1791 if (reg(pRExC_state, 0, &flags) == NULL) {
1792 RExC_precomp = Nullch;
1795 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1797 /* Small enough for pointer-storage convention?
1798 If extralen==0, this means that we will not need long jumps. */
1799 if (RExC_size >= 0x10000L && RExC_extralen)
1800 RExC_size += RExC_extralen;
1803 if (RExC_whilem_seen > 15)
1804 RExC_whilem_seen = 15;
1806 /* Allocate space and initialize. */
1807 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1810 FAIL("Regexp out of space");
1813 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1814 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1817 r->prelen = xend - exp;
1818 r->precomp = savepvn(RExC_precomp, r->prelen);
1820 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1821 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1823 r->substrs = 0; /* Useful during FAIL. */
1824 r->startp = 0; /* Useful during FAIL. */
1825 r->endp = 0; /* Useful during FAIL. */
1827 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1829 r->offsets[0] = RExC_size;
1831 DEBUG_r(PerlIO_printf(Perl_debug_log,
1832 "%s %"UVuf" bytes for offset annotations.\n",
1833 r->offsets ? "Got" : "Couldn't get",
1834 (UV)((2*RExC_size+1) * sizeof(U32))));
1838 /* Second pass: emit code. */
1839 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1844 RExC_emit_start = r->program;
1845 RExC_emit = r->program;
1846 /* Store the count of eval-groups for security checks: */
1847 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1848 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1850 if (reg(pRExC_state, 0, &flags) == NULL)
1853 /* Dig out information for optimizations. */
1854 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1855 pm->op_pmflags = RExC_flags;
1857 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1858 r->regstclass = NULL;
1859 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1860 r->reganch |= ROPT_NAUGHTY;
1861 scan = r->program + 1; /* First BRANCH. */
1863 /* XXXX To minimize changes to RE engine we always allocate
1864 3-units-long substrs field. */
1865 Newz(1004, r->substrs, 1, struct reg_substr_data);
1867 StructCopy(&zero_scan_data, &data, scan_data_t);
1868 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1869 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1871 STRLEN longest_float_length, longest_fixed_length;
1872 struct regnode_charclass_class ch_class;
1877 /* Skip introductions and multiplicators >= 1. */
1878 while ((OP(first) == OPEN && (sawopen = 1)) ||
1879 /* An OR of *one* alternative - should not happen now. */
1880 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1881 (OP(first) == PLUS) ||
1882 (OP(first) == MINMOD) ||
1883 /* An {n,m} with n>0 */
1884 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1885 if (OP(first) == PLUS)
1888 first += regarglen[(U8)OP(first)];
1889 first = NEXTOPER(first);
1892 /* Starting-point info. */
1894 if (PL_regkind[(U8)OP(first)] == EXACT) {
1895 if (OP(first) == EXACT)
1896 ; /* Empty, get anchored substr later. */
1897 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1898 r->regstclass = first;
1900 else if (strchr((char*)PL_simple,OP(first)))
1901 r->regstclass = first;
1902 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1903 PL_regkind[(U8)OP(first)] == NBOUND)
1904 r->regstclass = first;
1905 else if (PL_regkind[(U8)OP(first)] == BOL) {
1906 r->reganch |= (OP(first) == MBOL
1908 : (OP(first) == SBOL
1911 first = NEXTOPER(first);
1914 else if (OP(first) == GPOS) {
1915 r->reganch |= ROPT_ANCH_GPOS;
1916 first = NEXTOPER(first);
1919 else if (!sawopen && (OP(first) == STAR &&
1920 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1921 !(r->reganch & ROPT_ANCH) )
1923 /* turn .* into ^.* with an implied $*=1 */
1924 int type = OP(NEXTOPER(first));
1926 if (type == REG_ANY)
1927 type = ROPT_ANCH_MBOL;
1929 type = ROPT_ANCH_SBOL;
1931 r->reganch |= type | ROPT_IMPLICIT;
1932 first = NEXTOPER(first);
1935 if (sawplus && (!sawopen || !RExC_sawback)
1936 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1937 /* x+ must match at the 1st pos of run of x's */
1938 r->reganch |= ROPT_SKIP;
1940 /* Scan is after the zeroth branch, first is atomic matcher. */
1941 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1942 (IV)(first - scan + 1)));
1944 * If there's something expensive in the r.e., find the
1945 * longest literal string that must appear and make it the
1946 * regmust. Resolve ties in favor of later strings, since
1947 * the regstart check works with the beginning of the r.e.
1948 * and avoiding duplication strengthens checking. Not a
1949 * strong reason, but sufficient in the absence of others.
1950 * [Now we resolve ties in favor of the earlier string if
1951 * it happens that c_offset_min has been invalidated, since the
1952 * earlier string may buy us something the later one won't.]
1956 data.longest_fixed = newSVpvn("",0);
1957 data.longest_float = newSVpvn("",0);
1958 data.last_found = newSVpvn("",0);
1959 data.longest = &(data.longest_fixed);
1961 if (!r->regstclass) {
1962 cl_init(pRExC_state, &ch_class);
1963 data.start_class = &ch_class;
1964 stclass_flag = SCF_DO_STCLASS_AND;
1965 } else /* XXXX Check for BOUND? */
1967 data.last_closep = &last_close;
1969 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1970 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1971 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1972 && data.last_start_min == 0 && data.last_end > 0
1973 && !RExC_seen_zerolen
1974 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1975 r->reganch |= ROPT_CHECK_ALL;
1976 scan_commit(pRExC_state, &data);
1977 SvREFCNT_dec(data.last_found);
1979 longest_float_length = CHR_SVLEN(data.longest_float);
1980 if (longest_float_length
1981 || (data.flags & SF_FL_BEFORE_EOL
1982 && (!(data.flags & SF_FL_BEFORE_MEOL)
1983 || (RExC_flags & PMf_MULTILINE)))) {
1986 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1987 && data.offset_fixed == data.offset_float_min
1988 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1989 goto remove_float; /* As in (a)+. */
1991 if (SvUTF8(data.longest_float)) {
1992 r->float_utf8 = data.longest_float;
1993 r->float_substr = Nullsv;
1995 r->float_substr = data.longest_float;
1996 r->float_utf8 = Nullsv;
1998 r->float_min_offset = data.offset_float_min;
1999 r->float_max_offset = data.offset_float_max;
2000 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2001 && (!(data.flags & SF_FL_BEFORE_MEOL)
2002 || (RExC_flags & PMf_MULTILINE)));
2003 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2007 r->float_substr = r->float_utf8 = Nullsv;
2008 SvREFCNT_dec(data.longest_float);
2009 longest_float_length = 0;
2012 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2013 if (longest_fixed_length
2014 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2015 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2016 || (RExC_flags & PMf_MULTILINE)))) {
2019 if (SvUTF8(data.longest_fixed)) {
2020 r->anchored_utf8 = data.longest_fixed;
2021 r->anchored_substr = Nullsv;
2023 r->anchored_substr = data.longest_fixed;
2024 r->anchored_utf8 = Nullsv;
2026 r->anchored_offset = data.offset_fixed;
2027 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2028 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2029 || (RExC_flags & PMf_MULTILINE)));
2030 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2033 r->anchored_substr = r->anchored_utf8 = Nullsv;
2034 SvREFCNT_dec(data.longest_fixed);
2035 longest_fixed_length = 0;
2038 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2039 r->regstclass = NULL;
2040 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2042 && !(data.start_class->flags & ANYOF_EOS)
2043 && !cl_is_anything(data.start_class))
2045 I32 n = add_data(pRExC_state, 1, "f");
2047 New(1006, RExC_rx->data->data[n], 1,
2048 struct regnode_charclass_class);
2049 StructCopy(data.start_class,
2050 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2051 struct regnode_charclass_class);
2052 r->regstclass = (regnode*)RExC_rx->data->data[n];
2053 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2054 PL_regdata = r->data; /* for regprop() */
2055 DEBUG_r({ SV *sv = sv_newmortal();
2056 regprop(sv, (regnode*)data.start_class);
2057 PerlIO_printf(Perl_debug_log,
2058 "synthetic stclass `%s'.\n",
2062 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2063 if (longest_fixed_length > longest_float_length) {
2064 r->check_substr = r->anchored_substr;
2065 r->check_utf8 = r->anchored_utf8;
2066 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2067 if (r->reganch & ROPT_ANCH_SINGLE)
2068 r->reganch |= ROPT_NOSCAN;
2071 r->check_substr = r->float_substr;
2072 r->check_utf8 = r->float_utf8;
2073 r->check_offset_min = data.offset_float_min;
2074 r->check_offset_max = data.offset_float_max;
2076 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2077 This should be changed ASAP! */
2078 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2079 r->reganch |= RE_USE_INTUIT;
2080 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2081 r->reganch |= RE_INTUIT_TAIL;
2085 /* Several toplevels. Best we can is to set minlen. */
2087 struct regnode_charclass_class ch_class;
2090 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2091 scan = r->program + 1;
2092 cl_init(pRExC_state, &ch_class);
2093 data.start_class = &ch_class;
2094 data.last_closep = &last_close;
2095 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2096 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2097 = r->float_substr = r->float_utf8 = Nullsv;
2098 if (!(data.start_class->flags & ANYOF_EOS)
2099 && !cl_is_anything(data.start_class))
2101 I32 n = add_data(pRExC_state, 1, "f");
2103 New(1006, RExC_rx->data->data[n], 1,
2104 struct regnode_charclass_class);
2105 StructCopy(data.start_class,
2106 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2107 struct regnode_charclass_class);
2108 r->regstclass = (regnode*)RExC_rx->data->data[n];
2109 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2110 DEBUG_r({ SV* sv = sv_newmortal();
2111 regprop(sv, (regnode*)data.start_class);
2112 PerlIO_printf(Perl_debug_log,
2113 "synthetic stclass `%s'.\n",
2119 if (RExC_seen & REG_SEEN_GPOS)
2120 r->reganch |= ROPT_GPOS_SEEN;
2121 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2122 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2123 if (RExC_seen & REG_SEEN_EVAL)
2124 r->reganch |= ROPT_EVAL_SEEN;
2125 if (RExC_seen & REG_SEEN_CANY)
2126 r->reganch |= ROPT_CANY_SEEN;
2127 Newz(1002, r->startp, RExC_npar, I32);
2128 Newz(1002, r->endp, RExC_npar, I32);
2129 PL_regdata = r->data; /* for regprop() */
2130 DEBUG_r(regdump(r));
2135 - reg - regular expression, i.e. main body or parenthesized thing
2137 * Caller must absorb opening parenthesis.
2139 * Combining parenthesis handling with the base level of regular expression
2140 * is a trifle forced, but the need to tie the tails of the branches to what
2141 * follows makes it hard to avoid.
2144 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2145 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2147 register regnode *ret; /* Will be the head of the group. */
2148 register regnode *br;
2149 register regnode *lastbr;
2150 register regnode *ender = 0;
2151 register I32 parno = 0;
2152 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2154 /* for (?g), (?gc), and (?o) warnings; warning
2155 about (?c) will warn about (?g) -- japhy */
2157 I32 wastedflags = 0x00,
2160 wasted_gc = 0x02 | 0x04,
2163 char * parse_start = RExC_parse; /* MJD */
2164 char *oregcomp_parse = RExC_parse;
2167 *flagp = 0; /* Tentatively. */
2170 /* Make an OPEN node, if parenthesized. */
2172 if (*RExC_parse == '?') { /* (?...) */
2173 U32 posflags = 0, negflags = 0;
2174 U32 *flagsp = &posflags;
2176 char *seqstart = RExC_parse;
2179 paren = *RExC_parse++;
2180 ret = NULL; /* For look-ahead/behind. */
2182 case '<': /* (?<...) */
2183 RExC_seen |= REG_SEEN_LOOKBEHIND;
2184 if (*RExC_parse == '!')
2186 if (*RExC_parse != '=' && *RExC_parse != '!')
2189 case '=': /* (?=...) */
2190 case '!': /* (?!...) */
2191 RExC_seen_zerolen++;
2192 case ':': /* (?:...) */
2193 case '>': /* (?>...) */
2195 case '$': /* (?$...) */
2196 case '@': /* (?@...) */
2197 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2199 case '#': /* (?#...) */
2200 while (*RExC_parse && *RExC_parse != ')')
2202 if (*RExC_parse != ')')
2203 FAIL("Sequence (?#... not terminated");
2204 nextchar(pRExC_state);
2207 case 'p': /* (?p...) */
2208 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2209 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2211 case '?': /* (??...) */
2213 if (*RExC_parse != '{')
2215 paren = *RExC_parse++;
2217 case '{': /* (?{...}) */
2219 I32 count = 1, n = 0;
2221 char *s = RExC_parse;
2223 OP_4tree *sop, *rop;
2225 RExC_seen_zerolen++;
2226 RExC_seen |= REG_SEEN_EVAL;
2227 while (count && (c = *RExC_parse)) {
2228 if (c == '\\' && RExC_parse[1])
2236 if (*RExC_parse != ')')
2239 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2244 if (RExC_parse - 1 - s)
2245 sv = newSVpvn(s, RExC_parse - 1 - s);
2247 sv = newSVpvn("", 0);
2250 Perl_save_re_context(aTHX);
2251 rop = sv_compile_2op(sv, &sop, "re", &pad);
2252 sop->op_private |= OPpREFCOUNTED;
2253 /* re_dup will OpREFCNT_inc */
2254 OpREFCNT_set(sop, 1);
2257 n = add_data(pRExC_state, 3, "nop");
2258 RExC_rx->data->data[n] = (void*)rop;
2259 RExC_rx->data->data[n+1] = (void*)sop;
2260 RExC_rx->data->data[n+2] = (void*)pad;
2263 else { /* First pass */
2264 if (PL_reginterp_cnt < ++RExC_seen_evals
2266 /* No compiled RE interpolated, has runtime
2267 components ===> unsafe. */
2268 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2269 if (PL_tainting && PL_tainted)
2270 FAIL("Eval-group in insecure regular expression");
2273 nextchar(pRExC_state);
2275 ret = reg_node(pRExC_state, LOGICAL);
2278 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2279 /* deal with the length of this later - MJD */
2282 ret = reganode(pRExC_state, EVAL, n);
2283 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2284 Set_Node_Offset(ret, parse_start);
2287 case '(': /* (?(?{...})...) and (?(?=...)...) */
2289 if (RExC_parse[0] == '?') { /* (?(?...)) */
2290 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2291 || RExC_parse[1] == '<'
2292 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2295 ret = reg_node(pRExC_state, LOGICAL);
2298 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2302 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2304 parno = atoi(RExC_parse++);
2306 while (isDIGIT(*RExC_parse))
2308 ret = reganode(pRExC_state, GROUPP, parno);
2310 if ((c = *nextchar(pRExC_state)) != ')')
2311 vFAIL("Switch condition not recognized");
2313 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2314 br = regbranch(pRExC_state, &flags, 1);
2316 br = reganode(pRExC_state, LONGJMP, 0);
2318 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2319 c = *nextchar(pRExC_state);
2323 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2324 regbranch(pRExC_state, &flags, 1);
2325 regtail(pRExC_state, ret, lastbr);
2328 c = *nextchar(pRExC_state);
2333 vFAIL("Switch (?(condition)... contains too many branches");
2334 ender = reg_node(pRExC_state, TAIL);
2335 regtail(pRExC_state, br, ender);
2337 regtail(pRExC_state, lastbr, ender);
2338 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2341 regtail(pRExC_state, ret, ender);
2345 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2349 RExC_parse--; /* for vFAIL to print correctly */
2350 vFAIL("Sequence (? incomplete");
2354 parse_flags: /* (?i) */
2355 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2356 /* (?g), (?gc) and (?o) are useless here
2357 and must be globally applied -- japhy */
2359 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2360 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2362 if (! (wastedflags & wflagbit) ) {
2363 wastedflags |= wflagbit;
2366 "Useless (%s%c) - %suse /%c modifier",
2367 flagsp == &negflags ? "?-" : "?",
2369 flagsp == &negflags ? "don't " : "",
2375 else if (*RExC_parse == 'c') {
2376 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2377 if (! (wastedflags & wasted_c) ) {
2378 wastedflags |= wasted_gc;
2381 "Useless (%sc) - %suse /gc modifier",
2382 flagsp == &negflags ? "?-" : "?",
2383 flagsp == &negflags ? "don't " : ""
2388 else { pmflag(flagsp, *RExC_parse); }
2392 if (*RExC_parse == '-') {
2394 wastedflags = 0; /* reset so (?g-c) warns twice */
2398 RExC_flags |= posflags;
2399 RExC_flags &= ~negflags;
2400 if (*RExC_parse == ':') {
2406 if (*RExC_parse != ')') {
2408 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2410 nextchar(pRExC_state);
2418 ret = reganode(pRExC_state, OPEN, parno);
2419 Set_Node_Length(ret, 1); /* MJD */
2420 Set_Node_Offset(ret, RExC_parse); /* MJD */
2427 /* Pick up the branches, linking them together. */
2428 parse_start = RExC_parse; /* MJD */
2429 br = regbranch(pRExC_state, &flags, 1);
2430 /* branch_len = (paren != 0); */
2434 if (*RExC_parse == '|') {
2435 if (!SIZE_ONLY && RExC_extralen) {
2436 reginsert(pRExC_state, BRANCHJ, br);
2439 reginsert(pRExC_state, BRANCH, br);
2440 Set_Node_Length(br, paren != 0);
2441 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2445 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2447 else if (paren == ':') {
2448 *flagp |= flags&SIMPLE;
2450 if (open) { /* Starts with OPEN. */
2451 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2453 else if (paren != '?') /* Not Conditional */
2455 *flagp |= flags & (SPSTART | HASWIDTH);
2457 while (*RExC_parse == '|') {
2458 if (!SIZE_ONLY && RExC_extralen) {
2459 ender = reganode(pRExC_state, LONGJMP,0);
2460 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2463 RExC_extralen += 2; /* Account for LONGJMP. */
2464 nextchar(pRExC_state);
2465 br = regbranch(pRExC_state, &flags, 0);
2469 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2473 *flagp |= flags&SPSTART;
2476 if (have_branch || paren != ':') {
2477 /* Make a closing node, and hook it on the end. */
2480 ender = reg_node(pRExC_state, TAIL);
2483 ender = reganode(pRExC_state, CLOSE, parno);
2484 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2485 Set_Node_Length(ender,1); /* MJD */
2491 *flagp &= ~HASWIDTH;
2494 ender = reg_node(pRExC_state, SUCCEED);
2497 ender = reg_node(pRExC_state, END);
2500 regtail(pRExC_state, lastbr, ender);
2503 /* Hook the tails of the branches to the closing node. */
2504 for (br = ret; br != NULL; br = regnext(br)) {
2505 regoptail(pRExC_state, br, ender);
2512 static char parens[] = "=!<,>";
2514 if (paren && (p = strchr(parens, paren))) {
2515 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2516 int flag = (p - parens) > 1;
2519 node = SUSPEND, flag = 0;
2520 reginsert(pRExC_state, node,ret);
2521 Set_Node_Cur_Length(ret);
2522 Set_Node_Offset(ret, parse_start + 1);
2524 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2528 /* Check for proper termination. */
2530 RExC_flags = oregflags;
2531 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2532 RExC_parse = oregcomp_parse;
2533 vFAIL("Unmatched (");
2536 else if (!paren && RExC_parse < RExC_end) {
2537 if (*RExC_parse == ')') {
2539 vFAIL("Unmatched )");
2542 FAIL("Junk on end of regexp"); /* "Can't happen". */
2550 - regbranch - one alternative of an | operator
2552 * Implements the concatenation operator.
2555 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2557 register regnode *ret;
2558 register regnode *chain = NULL;
2559 register regnode *latest;
2560 I32 flags = 0, c = 0;
2565 if (!SIZE_ONLY && RExC_extralen)
2566 ret = reganode(pRExC_state, BRANCHJ,0);
2568 ret = reg_node(pRExC_state, BRANCH);
2569 Set_Node_Length(ret, 1);
2573 if (!first && SIZE_ONLY)
2574 RExC_extralen += 1; /* BRANCHJ */
2576 *flagp = WORST; /* Tentatively. */
2579 nextchar(pRExC_state);
2580 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2582 latest = regpiece(pRExC_state, &flags);
2583 if (latest == NULL) {
2584 if (flags & TRYAGAIN)
2588 else if (ret == NULL)
2590 *flagp |= flags&HASWIDTH;
2591 if (chain == NULL) /* First piece. */
2592 *flagp |= flags&SPSTART;
2595 regtail(pRExC_state, chain, latest);
2600 if (chain == NULL) { /* Loop ran zero times. */
2601 chain = reg_node(pRExC_state, NOTHING);
2606 *flagp |= flags&SIMPLE;
2613 - regpiece - something followed by possible [*+?]
2615 * Note that the branching code sequences used for ? and the general cases
2616 * of * and + are somewhat optimized: they use the same NOTHING node as
2617 * both the endmarker for their branch list and the body of the last branch.
2618 * It might seem that this node could be dispensed with entirely, but the
2619 * endmarker role is not redundant.
2622 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2624 register regnode *ret;
2626 register char *next;
2628 char *origparse = RExC_parse;
2631 I32 max = REG_INFTY;
2634 ret = regatom(pRExC_state, &flags);
2636 if (flags & TRYAGAIN)
2643 if (op == '{' && regcurly(RExC_parse)) {
2644 parse_start = RExC_parse; /* MJD */
2645 next = RExC_parse + 1;
2647 while (isDIGIT(*next) || *next == ',') {
2656 if (*next == '}') { /* got one */
2660 min = atoi(RExC_parse);
2664 maxpos = RExC_parse;
2666 if (!max && *maxpos != '0')
2667 max = REG_INFTY; /* meaning "infinity" */
2668 else if (max >= REG_INFTY)
2669 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2671 nextchar(pRExC_state);
2674 if ((flags&SIMPLE)) {
2675 RExC_naughty += 2 + RExC_naughty / 2;
2676 reginsert(pRExC_state, CURLY, ret);
2677 Set_Node_Offset(ret, parse_start+1); /* MJD */
2678 Set_Node_Cur_Length(ret);
2681 regnode *w = reg_node(pRExC_state, WHILEM);
2684 regtail(pRExC_state, ret, w);
2685 if (!SIZE_ONLY && RExC_extralen) {
2686 reginsert(pRExC_state, LONGJMP,ret);
2687 reginsert(pRExC_state, NOTHING,ret);
2688 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2690 reginsert(pRExC_state, CURLYX,ret);
2692 Set_Node_Offset(ret, parse_start+1);
2693 Set_Node_Length(ret,
2694 op == '{' ? (RExC_parse - parse_start) : 1);
2696 if (!SIZE_ONLY && RExC_extralen)
2697 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2698 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2700 RExC_whilem_seen++, RExC_extralen += 3;
2701 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2709 if (max && max < min)
2710 vFAIL("Can't do {n,m} with n > m");
2712 ARG1_SET(ret, (U16)min);
2713 ARG2_SET(ret, (U16)max);
2725 #if 0 /* Now runtime fix should be reliable. */
2727 /* if this is reinstated, don't forget to put this back into perldiag:
2729 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2731 (F) The part of the regexp subject to either the * or + quantifier
2732 could match an empty string. The {#} shows in the regular
2733 expression about where the problem was discovered.
2737 if (!(flags&HASWIDTH) && op != '?')
2738 vFAIL("Regexp *+ operand could be empty");
2741 parse_start = RExC_parse;
2742 nextchar(pRExC_state);
2744 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2746 if (op == '*' && (flags&SIMPLE)) {
2747 reginsert(pRExC_state, STAR, ret);
2751 else if (op == '*') {
2755 else if (op == '+' && (flags&SIMPLE)) {
2756 reginsert(pRExC_state, PLUS, ret);
2760 else if (op == '+') {
2764 else if (op == '?') {
2769 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2771 "%.*s matches null string many times",
2772 RExC_parse - origparse,
2776 if (*RExC_parse == '?') {
2777 nextchar(pRExC_state);
2778 reginsert(pRExC_state, MINMOD, ret);
2779 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2781 if (ISMULT2(RExC_parse)) {
2783 vFAIL("Nested quantifiers");
2790 - regatom - the lowest level
2792 * Optimization: gobbles an entire sequence of ordinary characters so that
2793 * it can turn them into a single node, which is smaller to store and
2794 * faster to run. Backslashed characters are exceptions, each becoming a
2795 * separate node; the code is simpler that way and it's not worth fixing.
2797 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2799 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2801 register regnode *ret = 0;
2803 char *parse_start = RExC_parse;
2805 *flagp = WORST; /* Tentatively. */
2808 switch (*RExC_parse) {
2810 RExC_seen_zerolen++;
2811 nextchar(pRExC_state);
2812 if (RExC_flags & PMf_MULTILINE)
2813 ret = reg_node(pRExC_state, MBOL);
2814 else if (RExC_flags & PMf_SINGLELINE)
2815 ret = reg_node(pRExC_state, SBOL);
2817 ret = reg_node(pRExC_state, BOL);
2818 Set_Node_Length(ret, 1); /* MJD */
2821 nextchar(pRExC_state);
2823 RExC_seen_zerolen++;
2824 if (RExC_flags & PMf_MULTILINE)
2825 ret = reg_node(pRExC_state, MEOL);
2826 else if (RExC_flags & PMf_SINGLELINE)
2827 ret = reg_node(pRExC_state, SEOL);
2829 ret = reg_node(pRExC_state, EOL);
2830 Set_Node_Length(ret, 1); /* MJD */
2833 nextchar(pRExC_state);
2834 if (RExC_flags & PMf_SINGLELINE)
2835 ret = reg_node(pRExC_state, SANY);
2837 ret = reg_node(pRExC_state, REG_ANY);
2838 *flagp |= HASWIDTH|SIMPLE;
2840 Set_Node_Length(ret, 1); /* MJD */
2844 char *oregcomp_parse = ++RExC_parse;
2845 ret = regclass(pRExC_state);
2846 if (*RExC_parse != ']') {
2847 RExC_parse = oregcomp_parse;
2848 vFAIL("Unmatched [");
2850 nextchar(pRExC_state);
2851 *flagp |= HASWIDTH|SIMPLE;
2852 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2856 nextchar(pRExC_state);
2857 ret = reg(pRExC_state, 1, &flags);
2859 if (flags & TRYAGAIN) {
2860 if (RExC_parse == RExC_end) {
2861 /* Make parent create an empty node if needed. */
2869 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2873 if (flags & TRYAGAIN) {
2877 vFAIL("Internal urp");
2878 /* Supposed to be caught earlier. */
2881 if (!regcurly(RExC_parse)) {
2890 vFAIL("Quantifier follows nothing");
2893 switch (*++RExC_parse) {
2895 RExC_seen_zerolen++;
2896 ret = reg_node(pRExC_state, SBOL);
2898 nextchar(pRExC_state);
2899 Set_Node_Length(ret, 2); /* MJD */
2902 ret = reg_node(pRExC_state, GPOS);
2903 RExC_seen |= REG_SEEN_GPOS;
2905 nextchar(pRExC_state);
2906 Set_Node_Length(ret, 2); /* MJD */
2909 ret = reg_node(pRExC_state, SEOL);
2911 RExC_seen_zerolen++; /* Do not optimize RE away */
2912 nextchar(pRExC_state);
2915 ret = reg_node(pRExC_state, EOS);
2917 RExC_seen_zerolen++; /* Do not optimize RE away */
2918 nextchar(pRExC_state);
2919 Set_Node_Length(ret, 2); /* MJD */
2922 ret = reg_node(pRExC_state, CANY);
2923 RExC_seen |= REG_SEEN_CANY;
2924 *flagp |= HASWIDTH|SIMPLE;
2925 nextchar(pRExC_state);
2926 Set_Node_Length(ret, 2); /* MJD */
2929 ret = reg_node(pRExC_state, CLUMP);
2931 nextchar(pRExC_state);
2932 Set_Node_Length(ret, 2); /* MJD */
2935 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2936 *flagp |= HASWIDTH|SIMPLE;
2937 nextchar(pRExC_state);
2938 Set_Node_Length(ret, 2); /* MJD */
2941 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2942 *flagp |= HASWIDTH|SIMPLE;
2943 nextchar(pRExC_state);
2944 Set_Node_Length(ret, 2); /* MJD */
2947 RExC_seen_zerolen++;
2948 RExC_seen |= REG_SEEN_LOOKBEHIND;
2949 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2951 nextchar(pRExC_state);
2952 Set_Node_Length(ret, 2); /* MJD */
2955 RExC_seen_zerolen++;
2956 RExC_seen |= REG_SEEN_LOOKBEHIND;
2957 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2959 nextchar(pRExC_state);
2960 Set_Node_Length(ret, 2); /* MJD */
2963 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2964 *flagp |= HASWIDTH|SIMPLE;
2965 nextchar(pRExC_state);
2966 Set_Node_Length(ret, 2); /* MJD */
2969 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2970 *flagp |= HASWIDTH|SIMPLE;
2971 nextchar(pRExC_state);
2972 Set_Node_Length(ret, 2); /* MJD */
2975 ret = reg_node(pRExC_state, DIGIT);
2976 *flagp |= HASWIDTH|SIMPLE;
2977 nextchar(pRExC_state);
2978 Set_Node_Length(ret, 2); /* MJD */
2981 ret = reg_node(pRExC_state, NDIGIT);
2982 *flagp |= HASWIDTH|SIMPLE;
2983 nextchar(pRExC_state);
2984 Set_Node_Length(ret, 2); /* MJD */
2989 char* oldregxend = RExC_end;
2990 char* parse_start = RExC_parse - 2;
2992 if (RExC_parse[1] == '{') {
2993 /* a lovely hack--pretend we saw [\pX] instead */
2994 RExC_end = strchr(RExC_parse, '}');
2996 U8 c = (U8)*RExC_parse;
2998 RExC_end = oldregxend;
2999 vFAIL2("Missing right brace on \\%c{}", c);
3004 RExC_end = RExC_parse + 2;
3005 if (RExC_end > oldregxend)
3006 RExC_end = oldregxend;
3010 ret = regclass(pRExC_state);
3012 RExC_end = oldregxend;
3015 Set_Node_Offset(ret, parse_start + 2);
3016 Set_Node_Cur_Length(ret);
3017 nextchar(pRExC_state);
3018 *flagp |= HASWIDTH|SIMPLE;
3031 case '1': case '2': case '3': case '4':
3032 case '5': case '6': case '7': case '8': case '9':
3034 I32 num = atoi(RExC_parse);
3036 if (num > 9 && num >= RExC_npar)
3039 char * parse_start = RExC_parse - 1; /* MJD */
3040 while (isDIGIT(*RExC_parse))
3043 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3044 vFAIL("Reference to nonexistent group");
3046 ret = reganode(pRExC_state,
3047 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3051 /* override incorrect value set in reganode MJD */
3052 Set_Node_Offset(ret, parse_start+1);
3053 Set_Node_Cur_Length(ret); /* MJD */
3055 nextchar(pRExC_state);
3060 if (RExC_parse >= RExC_end)
3061 FAIL("Trailing \\");
3064 /* Do not generate `unrecognized' warnings here, we fall
3065 back into the quick-grab loop below */
3072 if (RExC_flags & PMf_EXTENDED) {
3073 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3074 if (RExC_parse < RExC_end)
3080 register STRLEN len;
3086 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3088 parse_start = RExC_parse - 1;
3094 ret = reg_node(pRExC_state,
3095 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3097 for (len = 0, p = RExC_parse - 1;
3098 len < 127 && p < RExC_end;
3103 if (RExC_flags & PMf_EXTENDED)
3104 p = regwhite(p, RExC_end);
3151 ender = ASCII_TO_NATIVE('\033');
3155 ender = ASCII_TO_NATIVE('\007');
3160 char* e = strchr(p, '}');
3164 vFAIL("Missing right brace on \\x{}");
3167 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3168 | PERL_SCAN_DISALLOW_PREFIX;
3170 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3177 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3179 ender = grok_hex(p, &numlen, &flags, NULL);
3185 ender = UCHARAT(p++);
3186 ender = toCTRL(ender);
3188 case '0': case '1': case '2': case '3':case '4':
3189 case '5': case '6': case '7': case '8':case '9':
3191 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3194 ender = grok_oct(p, &numlen, &flags, NULL);
3204 FAIL("Trailing \\");
3207 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3208 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3209 goto normal_default;
3214 if (UTF8_IS_START(*p) && UTF) {
3215 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3223 if (RExC_flags & PMf_EXTENDED)
3224 p = regwhite(p, RExC_end);
3226 /* Prime the casefolded buffer. */
3227 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3229 if (ISMULT2(p)) { /* Back off on ?+*. */
3236 /* Emit all the Unicode characters. */
3237 for (foldbuf = tmpbuf;
3239 foldlen -= numlen) {
3240 ender = utf8_to_uvchr(foldbuf, &numlen);
3242 reguni(pRExC_state, ender, s, &unilen);
3245 /* In EBCDIC the numlen
3246 * and unilen can differ. */
3248 if (numlen >= foldlen)
3252 break; /* "Can't happen." */
3256 reguni(pRExC_state, ender, s, &unilen);
3265 REGC((char)ender, s++);
3273 /* Emit all the Unicode characters. */
3274 for (foldbuf = tmpbuf;
3276 foldlen -= numlen) {
3277 ender = utf8_to_uvchr(foldbuf, &numlen);
3279 reguni(pRExC_state, ender, s, &unilen);
3282 /* In EBCDIC the numlen
3283 * and unilen can differ. */
3285 if (numlen >= foldlen)
3293 reguni(pRExC_state, ender, s, &unilen);
3302 REGC((char)ender, s++);
3306 Set_Node_Cur_Length(ret); /* MJD */
3307 nextchar(pRExC_state);
3309 /* len is STRLEN which is unsigned, need to copy to signed */
3312 vFAIL("Internal disaster");
3316 if (len == 1 && UNI_IS_INVARIANT(ender))
3321 RExC_size += STR_SZ(len);
3323 RExC_emit += STR_SZ(len);
3328 /* If the encoding pragma is in effect recode the text of
3329 * any EXACT-kind nodes. */
3330 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3331 STRLEN oldlen = STR_LEN(ret);
3332 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3336 if (sv_utf8_downgrade(sv, TRUE)) {
3337 char *s = sv_recode_to_utf8(sv, PL_encoding);
3338 STRLEN newlen = SvCUR(sv);
3343 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344 (int)oldlen, STRING(ret),
3346 Copy(s, STRING(ret), newlen, char);
3347 STR_LEN(ret) += newlen - oldlen;
3348 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3350 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3358 S_regwhite(pTHX_ char *p, char *e)
3363 else if (*p == '#') {
3366 } while (p < e && *p != '\n');
3374 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375 Character classes ([:foo:]) can also be negated ([:^foo:]).
3376 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3378 but trigger failures because they are currently unimplemented. */
3380 #define POSIXCC_DONE(c) ((c) == ':')
3381 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3385 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3388 I32 namedclass = OOB_NAMEDCLASS;
3390 if (value == '[' && RExC_parse + 1 < RExC_end &&
3391 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3392 POSIXCC(UCHARAT(RExC_parse))) {
3393 char c = UCHARAT(RExC_parse);
3394 char* s = RExC_parse++;
3396 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3398 if (RExC_parse == RExC_end)
3399 /* Grandfather lone [:, [=, [. */
3402 char* t = RExC_parse++; /* skip over the c */
3404 if (UCHARAT(RExC_parse) == ']') {
3405 RExC_parse++; /* skip over the ending ] */
3408 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3409 I32 skip = 5; /* the most common skip */
3413 if (strnEQ(posixcc, "alnum", 5))
3415 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3416 else if (strnEQ(posixcc, "alpha", 5))
3418 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3419 else if (strnEQ(posixcc, "ascii", 5))
3421 complement ? ANYOF_NASCII : ANYOF_ASCII;
3424 if (strnEQ(posixcc, "blank", 5))
3426 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3429 if (strnEQ(posixcc, "cntrl", 5))
3431 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3434 if (strnEQ(posixcc, "digit", 5))
3436 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3439 if (strnEQ(posixcc, "graph", 5))
3441 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3444 if (strnEQ(posixcc, "lower", 5))
3446 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3449 if (strnEQ(posixcc, "print", 5))
3451 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3452 else if (strnEQ(posixcc, "punct", 5))
3454 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3457 if (strnEQ(posixcc, "space", 5))
3459 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3462 if (strnEQ(posixcc, "upper", 5))
3464 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3466 case 'w': /* this is not POSIX, this is the Perl \w */
3467 if (strnEQ(posixcc, "word", 4)) {
3469 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3474 if (strnEQ(posixcc, "xdigit", 6)) {
3476 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3481 if (namedclass == OOB_NAMEDCLASS ||
3482 posixcc[skip] != ':' ||
3483 posixcc[skip+1] != ']')
3485 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3488 } else if (!SIZE_ONLY) {
3489 /* [[=foo=]] and [[.foo.]] are still future. */
3491 /* adjust RExC_parse so the warning shows after
3493 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3495 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3498 /* Maternal grandfather:
3499 * "[:" ending in ":" but not in ":]" */
3509 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3511 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3512 char *s = RExC_parse;
3515 while(*s && isALNUM(*s))
3517 if (*s && c == *s && s[1] == ']') {
3518 if (ckWARN(WARN_REGEXP))
3520 "POSIX syntax [%c %c] belongs inside character classes",
3523 /* [[=foo=]] and [[.foo.]] are still future. */
3524 if (POSIXCC_NOTYET(c)) {
3525 /* adjust RExC_parse so the error shows after
3527 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3529 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3536 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3539 register UV nextvalue;
3540 register IV prevvalue = OOB_UNICODE;
3541 register IV range = 0;
3542 register regnode *ret;
3545 char *rangebegin = 0;
3546 bool need_class = 0;
3547 SV *listsv = Nullsv;
3550 bool optimize_invert = TRUE;
3551 AV* unicode_alternate = 0;
3553 UV literal_endpoint = 0;
3556 ret = reganode(pRExC_state, ANYOF, 0);
3559 ANYOF_FLAGS(ret) = 0;
3561 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3565 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3569 RExC_size += ANYOF_SKIP;
3571 RExC_emit += ANYOF_SKIP;
3573 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3575 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3576 ANYOF_BITMAP_ZERO(ret);
3577 listsv = newSVpvn("# comment\n", 10);
3580 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3582 if (!SIZE_ONLY && POSIXCC(nextvalue))
3583 checkposixcc(pRExC_state);
3585 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3586 if (UCHARAT(RExC_parse) == ']')
3589 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3593 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3596 rangebegin = RExC_parse;
3598 value = utf8n_to_uvchr((U8*)RExC_parse,
3599 RExC_end - RExC_parse,
3601 RExC_parse += numlen;
3604 value = UCHARAT(RExC_parse++);
3605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606 if (value == '[' && POSIXCC(nextvalue))
3607 namedclass = regpposixcc(pRExC_state, value);
3608 else if (value == '\\') {
3610 value = utf8n_to_uvchr((U8*)RExC_parse,
3611 RExC_end - RExC_parse,
3613 RExC_parse += numlen;
3616 value = UCHARAT(RExC_parse++);
3617 /* Some compilers cannot handle switching on 64-bit integer
3618 * values, therefore value cannot be an UV. Yes, this will
3619 * be a problem later if we want switch on Unicode.
3620 * A similar issue a little bit later when switching on
3621 * namedclass. --jhi */
3622 switch ((I32)value) {
3623 case 'w': namedclass = ANYOF_ALNUM; break;
3624 case 'W': namedclass = ANYOF_NALNUM; break;
3625 case 's': namedclass = ANYOF_SPACE; break;
3626 case 'S': namedclass = ANYOF_NSPACE; break;
3627 case 'd': namedclass = ANYOF_DIGIT; break;
3628 case 'D': namedclass = ANYOF_NDIGIT; break;
3631 if (RExC_parse >= RExC_end)
3632 vFAIL2("Empty \\%c{}", (U8)value);
3633 if (*RExC_parse == '{') {
3635 e = strchr(RExC_parse++, '}');
3637 vFAIL2("Missing right brace on \\%c{}", c);
3638 while (isSPACE(UCHARAT(RExC_parse)))
3640 if (e == RExC_parse)
3641 vFAIL2("Empty \\%c{}", c);
3643 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3651 if (UCHARAT(RExC_parse) == '^') {
3654 value = value == 'p' ? 'P' : 'p'; /* toggle */
3655 while (isSPACE(UCHARAT(RExC_parse))) {
3661 Perl_sv_catpvf(aTHX_ listsv,
3662 "+utf8::%.*s\n", (int)n, RExC_parse);
3664 Perl_sv_catpvf(aTHX_ listsv,
3665 "!utf8::%.*s\n", (int)n, RExC_parse);
3668 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3669 namedclass = ANYOF_MAX; /* no official name, but it's named */
3671 case 'n': value = '\n'; break;
3672 case 'r': value = '\r'; break;
3673 case 't': value = '\t'; break;
3674 case 'f': value = '\f'; break;
3675 case 'b': value = '\b'; break;
3676 case 'e': value = ASCII_TO_NATIVE('\033');break;
3677 case 'a': value = ASCII_TO_NATIVE('\007');break;
3679 if (*RExC_parse == '{') {
3680 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3681 | PERL_SCAN_DISALLOW_PREFIX;
3682 e = strchr(RExC_parse++, '}');
3684 vFAIL("Missing right brace on \\x{}");
3686 numlen = e - RExC_parse;
3687 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3691 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3693 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3694 RExC_parse += numlen;
3698 value = UCHARAT(RExC_parse++);
3699 value = toCTRL(value);
3701 case '0': case '1': case '2': case '3': case '4':
3702 case '5': case '6': case '7': case '8': case '9':
3706 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3707 RExC_parse += numlen;
3711 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3713 "Unrecognized escape \\%c in character class passed through",
3717 } /* end of \blah */
3723 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3725 if (!SIZE_ONLY && !need_class)
3726 ANYOF_CLASS_ZERO(ret);
3730 /* a bad range like a-\d, a-[:digit:] ? */
3733 if (ckWARN(WARN_REGEXP))
3735 "False [] range \"%*.*s\"",
3736 RExC_parse - rangebegin,
3737 RExC_parse - rangebegin,
3739 if (prevvalue < 256) {
3740 ANYOF_BITMAP_SET(ret, prevvalue);
3741 ANYOF_BITMAP_SET(ret, '-');
3744 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3745 Perl_sv_catpvf(aTHX_ listsv,
3746 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3750 range = 0; /* this was not a true range */
3754 if (namedclass > OOB_NAMEDCLASS)
3755 optimize_invert = FALSE;
3756 /* Possible truncation here but in some 64-bit environments
3757 * the compiler gets heartburn about switch on 64-bit values.
3758 * A similar issue a little earlier when switching on value.
3760 switch ((I32)namedclass) {
3763 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3765 for (value = 0; value < 256; value++)
3767 ANYOF_BITMAP_SET(ret, value);
3769 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3773 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3775 for (value = 0; value < 256; value++)
3776 if (!isALNUM(value))
3777 ANYOF_BITMAP_SET(ret, value);
3779 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3783 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3785 for (value = 0; value < 256; value++)
3786 if (isALNUMC(value))
3787 ANYOF_BITMAP_SET(ret, value);
3789 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3793 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3795 for (value = 0; value < 256; value++)
3796 if (!isALNUMC(value))
3797 ANYOF_BITMAP_SET(ret, value);
3799 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3803 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3805 for (value = 0; value < 256; value++)
3807 ANYOF_BITMAP_SET(ret, value);
3809 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3813 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3815 for (value = 0; value < 256; value++)
3816 if (!isALPHA(value))
3817 ANYOF_BITMAP_SET(ret, value);
3819 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3823 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3826 for (value = 0; value < 128; value++)
3827 ANYOF_BITMAP_SET(ret, value);
3829 for (value = 0; value < 256; value++) {
3831 ANYOF_BITMAP_SET(ret, value);
3835 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3839 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3842 for (value = 128; value < 256; value++)
3843 ANYOF_BITMAP_SET(ret, value);
3845 for (value = 0; value < 256; value++) {
3846 if (!isASCII(value))
3847 ANYOF_BITMAP_SET(ret, value);
3851 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3855 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3857 for (value = 0; value < 256; value++)
3859 ANYOF_BITMAP_SET(ret, value);
3861 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3865 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3867 for (value = 0; value < 256; value++)
3868 if (!isBLANK(value))
3869 ANYOF_BITMAP_SET(ret, value);
3871 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3875 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3877 for (value = 0; value < 256; value++)
3879 ANYOF_BITMAP_SET(ret, value);
3881 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3885 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3887 for (value = 0; value < 256; value++)
3888 if (!isCNTRL(value))
3889 ANYOF_BITMAP_SET(ret, value);
3891 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3895 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3897 /* consecutive digits assumed */
3898 for (value = '0'; value <= '9'; value++)
3899 ANYOF_BITMAP_SET(ret, value);
3901 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3905 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3907 /* consecutive digits assumed */
3908 for (value = 0; value < '0'; value++)
3909 ANYOF_BITMAP_SET(ret, value);
3910 for (value = '9' + 1; value < 256; value++)
3911 ANYOF_BITMAP_SET(ret, value);
3913 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3917 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3919 for (value = 0; value < 256; value++)
3921 ANYOF_BITMAP_SET(ret, value);
3923 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
3927 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3929 for (value = 0; value < 256; value++)
3930 if (!isGRAPH(value))
3931 ANYOF_BITMAP_SET(ret, value);
3933 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
3937 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3939 for (value = 0; value < 256; value++)
3941 ANYOF_BITMAP_SET(ret, value);
3943 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
3947 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
3949 for (value = 0; value < 256; value++)
3950 if (!isLOWER(value))
3951 ANYOF_BITMAP_SET(ret, value);
3953 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
3957 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
3959 for (value = 0; value < 256; value++)
3961 ANYOF_BITMAP_SET(ret, value);
3963 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
3967 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
3969 for (value = 0; value < 256; value++)
3970 if (!isPRINT(value))
3971 ANYOF_BITMAP_SET(ret, value);
3973 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
3977 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
3979 for (value = 0; value < 256; value++)
3980 if (isPSXSPC(value))
3981 ANYOF_BITMAP_SET(ret, value);
3983 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
3987 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
3989 for (value = 0; value < 256; value++)
3990 if (!isPSXSPC(value))
3991 ANYOF_BITMAP_SET(ret, value);
3993 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
3997 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
3999 for (value = 0; value < 256; value++)
4001 ANYOF_BITMAP_SET(ret, value);
4003 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
4007 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4009 for (value = 0; value < 256; value++)
4010 if (!isPUNCT(value))
4011 ANYOF_BITMAP_SET(ret, value);
4013 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
4017 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4019 for (value = 0; value < 256; value++)
4021 ANYOF_BITMAP_SET(ret, value);
4023 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
4027 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4029 for (value = 0; value < 256; value++)
4030 if (!isSPACE(value))
4031 ANYOF_BITMAP_SET(ret, value);
4033 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
4037 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4039 for (value = 0; value < 256; value++)
4041 ANYOF_BITMAP_SET(ret, value);
4043 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
4047 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4049 for (value = 0; value < 256; value++)
4050 if (!isUPPER(value))
4051 ANYOF_BITMAP_SET(ret, value);
4053 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
4057 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4059 for (value = 0; value < 256; value++)
4060 if (isXDIGIT(value))
4061 ANYOF_BITMAP_SET(ret, value);
4063 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
4067 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4069 for (value = 0; value < 256; value++)
4070 if (!isXDIGIT(value))
4071 ANYOF_BITMAP_SET(ret, value);
4073 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
4076 /* this is to handle \p and \P */
4079 vFAIL("Invalid [::] class");
4083 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4086 } /* end of namedclass \blah */
4089 if (prevvalue > (IV)value) /* b-a */ {
4090 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4091 RExC_parse - rangebegin,
4092 RExC_parse - rangebegin,
4094 range = 0; /* not a valid range */
4098 prevvalue = value; /* save the beginning of the range */
4099 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4100 RExC_parse[1] != ']') {
4103 /* a bad range like \w-, [:word:]- ? */
4104 if (namedclass > OOB_NAMEDCLASS) {
4105 if (ckWARN(WARN_REGEXP))
4107 "False [] range \"%*.*s\"",
4108 RExC_parse - rangebegin,
4109 RExC_parse - rangebegin,
4112 ANYOF_BITMAP_SET(ret, '-');
4114 range = 1; /* yeah, it's a range! */
4115 continue; /* but do it the next time */
4119 /* now is the next time */
4123 if (prevvalue < 256) {
4124 IV ceilvalue = value < 256 ? value : 255;
4127 /* In EBCDIC [\x89-\x91] should include
4128 * the \x8e but [i-j] should not. */
4129 if (literal_endpoint == 2 &&
4130 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4131 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4133 if (isLOWER(prevvalue)) {
4134 for (i = prevvalue; i <= ceilvalue; i++)
4136 ANYOF_BITMAP_SET(ret, i);
4138 for (i = prevvalue; i <= ceilvalue; i++)
4140 ANYOF_BITMAP_SET(ret, i);
4145 for (i = prevvalue; i <= ceilvalue; i++)
4146 ANYOF_BITMAP_SET(ret, i);
4148 if (value > 255 || UTF) {
4149 UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4150 UV natvalue = NATIVE_TO_UNI(value);
4152 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4153 if (prevnatvalue < natvalue) { /* what about > ? */
4154 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4155 prevnatvalue, natvalue);
4157 else if (prevnatvalue == natvalue) {
4158 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4160 U8 foldbuf[UTF8_MAXLEN_FOLD+1];
4162 UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4164 /* If folding and foldable and a single
4165 * character, insert also the folded version
4166 * to the charclass. */
4168 if (foldlen == (STRLEN)UNISKIP(f))
4169 Perl_sv_catpvf(aTHX_ listsv,
4172 /* Any multicharacter foldings
4173 * require the following transform:
4174 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4175 * where E folds into "pq" and F folds
4176 * into "rst", all other characters
4177 * fold to single characters. We save
4178 * away these multicharacter foldings,
4179 * to be later saved as part of the
4180 * additional "s" data. */
4183 if (!unicode_alternate)
4184 unicode_alternate = newAV();
4185 sv = newSVpvn((char*)foldbuf, foldlen);
4187 av_push(unicode_alternate, sv);
4191 /* If folding and the value is one of the Greek
4192 * sigmas insert a few more sigmas to make the
4193 * folding rules of the sigmas to work right.
4194 * Note that not all the possible combinations
4195 * are handled here: some of them are handled
4196 * by the standard folding rules, and some of
4197 * them (literal or EXACTF cases) are handled
4198 * during runtime in regexec.c:S_find_byclass(). */
4199 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4200 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4201 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4202 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4203 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4205 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4206 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4207 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4212 literal_endpoint = 0;
4216 range = 0; /* this range (if it was one) is done now */
4220 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4222 RExC_size += ANYOF_CLASS_ADD_SKIP;
4224 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4227 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4229 /* If the only flag is folding (plus possibly inversion). */
4230 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4232 for (value = 0; value < 256; ++value) {
4233 if (ANYOF_BITMAP_TEST(ret, value)) {
4234 UV fold = PL_fold[value];
4237 ANYOF_BITMAP_SET(ret, fold);
4240 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4243 /* optimize inverted simple patterns (e.g. [^a-z]) */
4244 if (!SIZE_ONLY && optimize_invert &&
4245 /* If the only flag is inversion. */
4246 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4247 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4248 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4249 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4256 /* The 0th element stores the character class description
4257 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4258 * to initialize the appropriate swash (which gets stored in
4259 * the 1st element), and also useful for dumping the regnode.
4260 * The 2nd element stores the multicharacter foldings,
4261 * used later (regexec.c:S_reginclass()). */
4262 av_store(av, 0, listsv);
4263 av_store(av, 1, NULL);
4264 av_store(av, 2, (SV*)unicode_alternate);
4265 rv = newRV_noinc((SV*)av);
4266 n = add_data(pRExC_state, 1, "s");
4267 RExC_rx->data->data[n] = (void*)rv;
4275 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4277 char* retval = RExC_parse++;
4280 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4281 RExC_parse[2] == '#') {
4282 while (*RExC_parse != ')') {
4283 if (RExC_parse == RExC_end)
4284 FAIL("Sequence (?#... not terminated");
4290 if (RExC_flags & PMf_EXTENDED) {
4291 if (isSPACE(*RExC_parse)) {
4295 else if (*RExC_parse == '#') {
4296 while (RExC_parse < RExC_end)
4297 if (*RExC_parse++ == '\n') break;
4306 - reg_node - emit a node
4308 STATIC regnode * /* Location. */
4309 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4311 register regnode *ret;
4312 register regnode *ptr;
4316 SIZE_ALIGN(RExC_size);
4321 NODE_ALIGN_FILL(ret);
4323 FILL_ADVANCE_NODE(ptr, op);
4324 if (RExC_offsets) { /* MJD */
4325 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4326 "reg_node", __LINE__,
4328 RExC_emit - RExC_emit_start > RExC_offsets[0]
4329 ? "Overwriting end of array!\n" : "OK",
4330 RExC_emit - RExC_emit_start,
4331 RExC_parse - RExC_start,
4333 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4342 - reganode - emit a node with an argument
4344 STATIC regnode * /* Location. */
4345 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4347 register regnode *ret;
4348 register regnode *ptr;
4352 SIZE_ALIGN(RExC_size);
4357 NODE_ALIGN_FILL(ret);
4359 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4360 if (RExC_offsets) { /* MJD */
4361 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4365 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4366 "Overwriting end of array!\n" : "OK",
4367 RExC_emit - RExC_emit_start,
4368 RExC_parse - RExC_start,
4370 Set_Cur_Node_Offset;
4379 - reguni - emit (if appropriate) a Unicode character
4382 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4384 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4388 - reginsert - insert an operator in front of already-emitted operand
4390 * Means relocating the operand.
4393 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4395 register regnode *src;
4396 register regnode *dst;
4397 register regnode *place;
4398 register int offset = regarglen[(U8)op];
4400 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4403 RExC_size += NODE_STEP_REGNODE + offset;
4408 RExC_emit += NODE_STEP_REGNODE + offset;
4410 while (src > opnd) {
4411 StructCopy(--src, --dst, regnode);
4412 if (RExC_offsets) { /* MJD 20010112 */
4413 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4417 dst - RExC_emit_start > RExC_offsets[0]
4418 ? "Overwriting end of array!\n" : "OK",
4419 src - RExC_emit_start,
4420 dst - RExC_emit_start,
4422 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4423 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4428 place = opnd; /* Op node, where operand used to be. */
4429 if (RExC_offsets) { /* MJD */
4430 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4434 place - RExC_emit_start > RExC_offsets[0]
4435 ? "Overwriting end of array!\n" : "OK",
4436 place - RExC_emit_start,
4437 RExC_parse - RExC_start,
4439 Set_Node_Offset(place, RExC_parse);
4440 Set_Node_Length(place, 1);
4442 src = NEXTOPER(place);
4443 FILL_ADVANCE_NODE(place, op);
4444 Zero(src, offset, regnode);
4448 - regtail - set the next-pointer at the end of a node chain of p to val.
4451 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4453 register regnode *scan;
4454 register regnode *temp;
4459 /* Find last node. */
4462 temp = regnext(scan);
4468 if (reg_off_by_arg[OP(scan)]) {
4469 ARG_SET(scan, val - scan);
4472 NEXT_OFF(scan) = val - scan;
4477 - regoptail - regtail on operand of first argument; nop if operandless
4480 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4482 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4483 if (p == NULL || SIZE_ONLY)
4485 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4486 regtail(pRExC_state, NEXTOPER(p), val);
4488 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4489 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4496 - regcurly - a little FSA that accepts {\d+,?\d*}
4499 S_regcurly(pTHX_ register char *s)
4520 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4522 register U8 op = EXACT; /* Arbitrary non-END op. */
4523 register regnode *next;
4525 while (op != END && (!last || node < last)) {
4526 /* While that wasn't END last time... */
4532 next = regnext(node);
4534 if (OP(node) == OPTIMIZED)
4537 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4538 (int)(2*l + 1), "", SvPVX(sv));
4539 if (next == NULL) /* Next ptr. */
4540 PerlIO_printf(Perl_debug_log, "(0)");
4542 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4543 (void)PerlIO_putc(Perl_debug_log, '\n');
4545 if (PL_regkind[(U8)op] == BRANCHJ) {
4546 register regnode *nnode = (OP(next) == LONGJMP
4549 if (last && nnode > last)
4551 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4553 else if (PL_regkind[(U8)op] == BRANCH) {
4554 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4556 else if ( op == CURLY) { /* `next' might be very big: optimizer */
4557 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4558 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4560 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4561 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4564 else if ( op == PLUS || op == STAR) {
4565 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4567 else if (op == ANYOF) {
4568 /* arglen 1 + class block */
4569 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4570 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4571 node = NEXTOPER(node);
4573 else if (PL_regkind[(U8)op] == EXACT) {
4574 /* Literal string, where present. */
4575 node += NODE_SZ_STR(node) - 1;
4576 node = NEXTOPER(node);
4579 node = NEXTOPER(node);
4580 node += regarglen[(U8)op];
4582 if (op == CURLYX || op == OPEN)
4584 else if (op == WHILEM)
4590 #endif /* DEBUGGING */
4593 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4596 Perl_regdump(pTHX_ regexp *r)
4599 SV *sv = sv_newmortal();
4601 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4603 /* Header fields of interest. */
4604 if (r->anchored_substr)
4605 PerlIO_printf(Perl_debug_log,
4606 "anchored `%s%.*s%s'%s at %"IVdf" ",
4608 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4609 SvPVX(r->anchored_substr),
4611 SvTAIL(r->anchored_substr) ? "$" : "",
4612 (IV)r->anchored_offset);
4613 else if (r->anchored_utf8)
4614 PerlIO_printf(Perl_debug_log,
4615 "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
4617 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4618 SvPVX(r->anchored_utf8),
4620 SvTAIL(r->anchored_utf8) ? "$" : "",
4621 (IV)r->anchored_offset);
4622 if (r->float_substr)
4623 PerlIO_printf(Perl_debug_log,
4624 "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4626 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4627 SvPVX(r->float_substr),
4629 SvTAIL(r->float_substr) ? "$" : "",
4630 (IV)r->float_min_offset, (UV)r->float_max_offset);
4631 else if (r->float_utf8)
4632 PerlIO_printf(Perl_debug_log,
4633 "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
4635 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4636 SvPVX(r->float_utf8),
4638 SvTAIL(r->float_utf8) ? "$" : "",
4639 (IV)r->float_min_offset, (UV)r->float_max_offset);
4640 if (r->check_substr || r->check_utf8)
4641 PerlIO_printf(Perl_debug_log,
4642 r->check_substr == r->float_substr
4643 && r->check_utf8 == r->float_utf8
4644 ? "(checking floating" : "(checking anchored");
4645 if (r->reganch & ROPT_NOSCAN)
4646 PerlIO_printf(Perl_debug_log, " noscan");
4647 if (r->reganch & ROPT_CHECK_ALL)
4648 PerlIO_printf(Perl_debug_log, " isall");
4649 if (r->check_substr || r->check_utf8)
4650 PerlIO_printf(Perl_debug_log, ") ");
4652 if (r->regstclass) {
4653 regprop(sv, r->regstclass);
4654 PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv));
4656 if (r->reganch & ROPT_ANCH) {
4657 PerlIO_printf(Perl_debug_log, "anchored");
4658 if (r->reganch & ROPT_ANCH_BOL)
4659 PerlIO_printf(Perl_debug_log, "(BOL)");
4660 if (r->reganch & ROPT_ANCH_MBOL)
4661 PerlIO_printf(Perl_debug_log, "(MBOL)");
4662 if (r->reganch & ROPT_ANCH_SBOL)
4663 PerlIO_printf(Perl_debug_log, "(SBOL)");
4664 if (r->reganch & ROPT_ANCH_GPOS)
4665 PerlIO_printf(Perl_debug_log, "(GPOS)");
4666 PerlIO_putc(Perl_debug_log, ' ');
4668 if (r->reganch & ROPT_GPOS_SEEN)
4669 PerlIO_printf(Perl_debug_log, "GPOS ");
4670 if (r->reganch & ROPT_SKIP)
4671 PerlIO_printf(Perl_debug_log, "plus ");
4672 if (r->reganch & ROPT_IMPLICIT)
4673 PerlIO_printf(Perl_debug_log, "implicit ");
4674 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4675 if (r->reganch & ROPT_EVAL_SEEN)
4676 PerlIO_printf(Perl_debug_log, "with eval ");
4677 PerlIO_printf(Perl_debug_log, "\n");
4680 U32 len = r->offsets[0];
4681 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4682 for (i = 1; i <= len; i++)
4683 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4684 (UV)r->offsets[i*2-1],
4685 (UV)r->offsets[i*2]);
4686 PerlIO_printf(Perl_debug_log, "\n");
4688 #endif /* DEBUGGING */
4694 S_put_byte(pTHX_ SV *sv, int c)
4696 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4697 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4698 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4699 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4701 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4704 #endif /* DEBUGGING */
4707 - regprop - printable representation of opcode
4710 Perl_regprop(pTHX_ SV *sv, regnode *o)
4715 sv_setpvn(sv, "", 0);
4716 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4717 /* It would be nice to FAIL() here, but this may be called from
4718 regexec.c, and it would be hard to supply pRExC_state. */
4719 Perl_croak(aTHX_ "Corrupted regexp opcode");
4720 sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
4722 k = PL_regkind[(U8)OP(o)];
4725 SV *dsv = sv_2mortal(newSVpvn("", 0));
4726 /* Using is_utf8_string() is a crude hack but it may
4727 * be the best for now since we have no flag "this EXACTish
4728 * node was UTF-8" --jhi */
4729 bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4731 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4732 UNI_DISPLAY_REGEX) :
4737 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4742 else if (k == CURLY) {
4743 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4744 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4745 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4747 else if (k == WHILEM && o->flags) /* Ordinal/of */
4748 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4749 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4750 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4751 else if (k == LOGICAL)
4752 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4753 else if (k == ANYOF) {
4754 int i, rangestart = -1;
4755 U8 flags = ANYOF_FLAGS(o);
4756 const char * const anyofs[] = { /* Should be synchronized with
4757 * ANYOF_ #xdefines in regcomp.h */
4790 if (flags & ANYOF_LOCALE)
4791 sv_catpv(sv, "{loc}");
4792 if (flags & ANYOF_FOLD)
4793 sv_catpv(sv, "{i}");
4794 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4795 if (flags & ANYOF_INVERT)
4797 for (i = 0; i <= 256; i++) {
4798 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4799 if (rangestart == -1)
4801 } else if (rangestart != -1) {
4802 if (i <= rangestart + 3)
4803 for (; rangestart < i; rangestart++)
4804 put_byte(sv, rangestart);
4806 put_byte(sv, rangestart);
4808 put_byte(sv, i - 1);
4814 if (o->flags & ANYOF_CLASS)
4815 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4816 if (ANYOF_CLASS_TEST(o,i))
4817 sv_catpv(sv, anyofs[i]);
4819 if (flags & ANYOF_UNICODE)
4820 sv_catpv(sv, "{unicode}");
4821 else if (flags & ANYOF_UNICODE_ALL)
4822 sv_catpv(sv, "{unicode_all}");
4826 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4830 U8 s[UTF8_MAXLEN+1];
4832 for (i = 0; i <= 256; i++) { /* just the first 256 */
4833 U8 *e = uvchr_to_utf8(s, i);
4835 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4836 if (rangestart == -1)
4838 } else if (rangestart != -1) {
4841 if (i <= rangestart + 3)
4842 for (; rangestart < i; rangestart++) {
4843 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4847 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4850 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4857 sv_catpv(sv, "..."); /* et cetera */
4861 char *s = savepv(SvPVX(lv));
4864 while(*s && *s != '\n') s++;
4885 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4887 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4888 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4889 #endif /* DEBUGGING */
4893 Perl_re_intuit_string(pTHX_ regexp *prog)
4894 { /* Assume that RE_INTUIT is set */
4897 char *s = SvPV(prog->check_substr
4898 ? prog->check_substr : prog->check_utf8, n_a);
4900 if (!PL_colorset) reginitcolors();
4901 PerlIO_printf(Perl_debug_log,
4902 "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
4904 prog->check_substr ? "" : "utf8 ",
4905 PL_colors[5],PL_colors[0],
4908 (strlen(s) > 60 ? "..." : ""));
4911 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4915 Perl_pregfree(pTHX_ struct regexp *r)
4918 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4921 if (!r || (--r->refcnt > 0))
4927 s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
4928 r->prelen, 60, UNI_DISPLAY_REGEX)
4929 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4933 PerlIO_printf(Perl_debug_log,
4934 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
4935 PL_colors[4],PL_colors[5],PL_colors[0],
4938 len > 60 ? "..." : "");
4942 Safefree(r->precomp);
4943 if (r->offsets) /* 20010421 MJD */
4944 Safefree(r->offsets);
4945 if (RX_MATCH_COPIED(r))
4946 Safefree(r->subbeg);
4948 if (r->anchored_substr)
4949 SvREFCNT_dec(r->anchored_substr);
4950 if (r->anchored_utf8)
4951 SvREFCNT_dec(r->anchored_utf8);
4952 if (r->float_substr)
4953 SvREFCNT_dec(r->float_substr);
4955 SvREFCNT_dec(r->float_utf8);
4956 Safefree(r->substrs);
4959 int n = r->data->count;
4960 PAD* new_comppad = NULL;
4965 /* If you add a ->what type here, update the comment in regcomp.h */
4966 switch (r->data->what[n]) {
4968 SvREFCNT_dec((SV*)r->data->data[n]);
4971 Safefree(r->data->data[n]);
4974 new_comppad = (AV*)r->data->data[n];
4977 if (new_comppad == NULL)
4978 Perl_croak(aTHX_ "panic: pregfree comppad");
4979 PAD_SAVE_LOCAL(old_comppad,
4980 /* Watch out for global destruction's random ordering. */
4981 (SvTYPE(new_comppad) == SVt_PVAV) ?
4982 new_comppad : Null(PAD *)
4985 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
4988 op_free((OP_4tree*)r->data->data[n]);
4990 PAD_RESTORE_LOCAL(old_comppad);
4991 SvREFCNT_dec((SV*)new_comppad);
4997 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
5000 Safefree(r->data->what);
5003 Safefree(r->startp);
5009 - regnext - dig the "next" pointer out of a node
5011 * [Note, when REGALIGN is defined there are two places in regmatch()
5012 * that bypass this code for speed.]
5015 Perl_regnext(pTHX_ register regnode *p)
5017 register I32 offset;
5019 if (p == &PL_regdummy)
5022 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5030 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5033 STRLEN l1 = strlen(pat1);
5034 STRLEN l2 = strlen(pat2);
5043 Copy(pat1, buf, l1 , char);
5044 Copy(pat2, buf + l1, l2 , char);
5045 buf[l1 + l2] = '\n';
5046 buf[l1 + l2 + 1] = '\0';
5048 /* ANSI variant takes additional second argument */
5049 va_start(args, pat2);
5053 msv = vmess(buf, &args);
5055 message = SvPV(msv,l1);
5058 Copy(message, buf, l1 , char);
5059 buf[l1-1] = '\0'; /* Overwrite \n */
5060 Perl_croak(aTHX_ "%s", buf);
5063 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5066 Perl_save_re_context(pTHX)
5068 SAVEI32(PL_reg_flags); /* from regexec.c */
5070 SAVEPPTR(PL_reginput); /* String-input pointer. */
5071 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5072 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5073 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5074 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5075 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5076 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5077 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5078 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5079 PL_reg_start_tmp = 0;
5080 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5081 PL_reg_start_tmpl = 0;
5082 SAVEVPTR(PL_regdata);
5083 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5084 SAVEI32(PL_regnarrate); /* from regexec.c */
5085 SAVEVPTR(PL_regprogram); /* from regexec.c */
5086 SAVEINT(PL_regindent); /* from regexec.c */
5087 SAVEVPTR(PL_regcc); /* from regexec.c */
5088 SAVEVPTR(PL_curcop);
5089 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5090 SAVEVPTR(PL_reg_re); /* from regexec.c */
5091 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5092 SAVESPTR(PL_reg_sv); /* from regexec.c */
5093 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5094 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5095 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5096 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5097 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5098 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5099 PL_reg_oldsaved = Nullch;
5100 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5101 PL_reg_oldsavedlen = 0;
5102 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5104 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5105 PL_reg_leftiter = 0;
5106 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5107 PL_reg_poscache = Nullch;
5108 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5109 PL_reg_poscache_size = 0;
5110 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5111 SAVEI32(PL_regnpar); /* () count. */
5112 SAVEI32(PL_regsize); /* from regexec.c */
5115 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5119 char digits[TYPE_CHARS(long)];
5121 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5122 for (i = 1; i <= rx->nparens; i++) {
5123 sprintf(digits, "%lu", (long)i);
5124 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5131 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5136 clear_re(pTHX_ void *r)
5138 ReREFCNT_dec((regexp *)r);