5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, 2004, 2005, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 const char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 const char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 const STRLEN l = CHR_SVLEN(data->last_found);
482 const STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
664 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665 * These need to be revisited when a newer toolchain becomes available.
667 #if defined(__sparc64__) && defined(__GNUC__)
668 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669 # undef SPARC64_GCC_WORKAROUND
670 # define SPARC64_GCC_WORKAROUND 1
674 /* REx optimizer. Converts nodes into quickier variants "in place".
675 Finds fixed substrings. */
677 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
678 to the position after last scanned or to NULL. */
681 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
682 /* scanp: Start here (read-write). */
683 /* deltap: Write maxlen-minlen here. */
684 /* last: Stop before this one. */
686 I32 min = 0, pars = 0, code;
687 regnode *scan = *scanp, *next;
689 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
690 int is_inf_internal = 0; /* The studied chunk is infinite */
691 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692 scan_data_t data_fake;
693 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
695 while (scan && OP(scan) != END && scan < last) {
696 /* Peephole optimizer: */
698 if (PL_regkind[(U8)OP(scan)] == EXACT) {
699 /* Merge several consecutive EXACTish nodes into one. */
700 regnode *n = regnext(scan);
703 regnode *stop = scan;
706 next = scan + NODE_SZ_STR(scan);
707 /* Skip NOTHING, merge EXACT*. */
709 ( PL_regkind[(U8)OP(n)] == NOTHING ||
710 (stringok && (OP(n) == OP(scan))))
712 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713 if (OP(n) == TAIL || n > next)
715 if (PL_regkind[(U8)OP(n)] == NOTHING) {
716 NEXT_OFF(scan) += NEXT_OFF(n);
717 next = n + NODE_STEP_REGNODE;
725 const int oldl = STR_LEN(scan);
726 regnode *nnext = regnext(n);
728 if (oldl + STR_LEN(n) > U8_MAX)
730 NEXT_OFF(scan) += NEXT_OFF(n);
731 STR_LEN(scan) += STR_LEN(n);
732 next = n + NODE_SZ_STR(n);
733 /* Now we can overwrite *n : */
734 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
742 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
744 Two problematic code points in Unicode casefolding of EXACT nodes:
746 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
753 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
754 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
756 This means that in case-insensitive matching (or "loose matching",
757 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758 length of the above casefolded versions) can match a target string
759 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760 This would rather mess up the minimum length computation.
762 What we'll do is to look for the tail four bytes, and then peek
763 at the preceding two bytes to see whether we need to decrease
764 the minimum length by four (six minus two).
766 Thanks to the design of UTF-8, there cannot be false matches:
767 A sequence of valid UTF-8 bytes cannot be a subsequence of
768 another valid sequence of UTF-8 bytes.
771 char *s0 = STRING(scan), *s, *t;
772 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773 const char *t0 = "\xcc\x88\xcc\x81";
774 const char *t1 = t0 + 3;
777 s < s2 && (t = ninstr(s, s1, t0, t1));
779 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
787 n = scan + NODE_SZ_STR(scan);
789 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
797 /* Follow the next-chain of the current node and optimize
798 away all the NOTHINGs from it. */
799 if (OP(scan) != CURLYX) {
800 const int max = (reg_off_by_arg[OP(scan)]
802 /* I32 may be smaller than U16 on CRAYs! */
803 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
804 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
808 /* Skip NOTHING and LONGJMP. */
809 while ((n = regnext(n))
810 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
811 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
814 if (reg_off_by_arg[OP(scan)])
817 NEXT_OFF(scan) = off;
819 /* The principal pseudo-switch. Cannot be a switch, since we
820 look into several different things. */
821 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
822 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823 next = regnext(scan);
826 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
827 I32 max1 = 0, min1 = I32_MAX, num = 0;
828 struct regnode_charclass_class accum;
830 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
831 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
832 if (flags & SCF_DO_STCLASS)
833 cl_init_zero(pRExC_state, &accum);
834 while (OP(scan) == code) {
835 I32 deltanext, minnext, f = 0, fake;
836 struct regnode_charclass_class this_class;
841 data_fake.whilem_c = data->whilem_c;
842 data_fake.last_closep = data->last_closep;
845 data_fake.last_closep = &fake;
846 next = regnext(scan);
847 scan = NEXTOPER(scan);
849 scan = NEXTOPER(scan);
850 if (flags & SCF_DO_STCLASS) {
851 cl_init(pRExC_state, &this_class);
852 data_fake.start_class = &this_class;
853 f = SCF_DO_STCLASS_AND;
855 if (flags & SCF_WHILEM_VISITED_POS)
856 f |= SCF_WHILEM_VISITED_POS;
857 /* we suppose the run is continuous, last=next...*/
858 minnext = study_chunk(pRExC_state, &scan, &deltanext,
859 next, &data_fake, f);
862 if (max1 < minnext + deltanext)
863 max1 = minnext + deltanext;
864 if (deltanext == I32_MAX)
865 is_inf = is_inf_internal = 1;
867 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
869 if (data && (data_fake.flags & SF_HAS_EVAL))
870 data->flags |= SF_HAS_EVAL;
872 data->whilem_c = data_fake.whilem_c;
873 if (flags & SCF_DO_STCLASS)
874 cl_or(pRExC_state, &accum, &this_class);
878 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
880 if (flags & SCF_DO_SUBSTR) {
881 data->pos_min += min1;
882 data->pos_delta += max1 - min1;
883 if (max1 != min1 || is_inf)
884 data->longest = &(data->longest_float);
887 delta += max1 - min1;
888 if (flags & SCF_DO_STCLASS_OR) {
889 cl_or(pRExC_state, data->start_class, &accum);
891 cl_and(data->start_class, &and_with);
892 flags &= ~SCF_DO_STCLASS;
895 else if (flags & SCF_DO_STCLASS_AND) {
897 cl_and(data->start_class, &accum);
898 flags &= ~SCF_DO_STCLASS;
901 /* Switch to OR mode: cache the old value of
902 * data->start_class */
903 StructCopy(data->start_class, &and_with,
904 struct regnode_charclass_class);
905 flags &= ~SCF_DO_STCLASS_AND;
906 StructCopy(&accum, data->start_class,
907 struct regnode_charclass_class);
908 flags |= SCF_DO_STCLASS_OR;
909 data->start_class->flags |= ANYOF_EOS;
913 else if (code == BRANCHJ) /* single branch is optimized. */
914 scan = NEXTOPER(NEXTOPER(scan));
915 else /* single branch is optimized. */
916 scan = NEXTOPER(scan);
919 else if (OP(scan) == EXACT) {
920 I32 l = STR_LEN(scan);
921 UV uc = *((U8*)STRING(scan));
923 const U8 * const s = (U8*)STRING(scan);
924 l = utf8_length(s, s + l);
925 uc = utf8_to_uvchr(s, NULL);
928 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
929 /* The code below prefers earlier match for fixed
930 offset, later match for variable offset. */
931 if (data->last_end == -1) { /* Update the start info. */
932 data->last_start_min = data->pos_min;
933 data->last_start_max = is_inf
934 ? I32_MAX : data->pos_min + data->pos_delta;
936 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
938 SV * sv = data->last_found;
939 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940 mg_find(sv, PERL_MAGIC_utf8) : NULL;
941 if (mg && mg->mg_len >= 0)
942 mg->mg_len += utf8_length((U8*)STRING(scan),
943 (U8*)STRING(scan)+STR_LEN(scan));
946 SvUTF8_on(data->last_found);
947 data->last_end = data->pos_min + l;
948 data->pos_min += l; /* As in the first entry. */
949 data->flags &= ~SF_BEFORE_EOL;
951 if (flags & SCF_DO_STCLASS_AND) {
952 /* Check whether it is compatible with what we know already! */
956 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
957 && !ANYOF_BITMAP_TEST(data->start_class, uc)
958 && (!(data->start_class->flags & ANYOF_FOLD)
959 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
962 ANYOF_CLASS_ZERO(data->start_class);
963 ANYOF_BITMAP_ZERO(data->start_class);
965 ANYOF_BITMAP_SET(data->start_class, uc);
966 data->start_class->flags &= ~ANYOF_EOS;
968 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
970 else if (flags & SCF_DO_STCLASS_OR) {
971 /* false positive possible if the class is case-folded */
973 ANYOF_BITMAP_SET(data->start_class, uc);
975 data->start_class->flags |= ANYOF_UNICODE_ALL;
976 data->start_class->flags &= ~ANYOF_EOS;
977 cl_and(data->start_class, &and_with);
979 flags &= ~SCF_DO_STCLASS;
981 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
982 I32 l = STR_LEN(scan);
983 UV uc = *((U8*)STRING(scan));
985 /* Search for fixed substrings supports EXACT only. */
986 if (flags & SCF_DO_SUBSTR)
987 scan_commit(pRExC_state, data);
989 U8 *s = (U8 *)STRING(scan);
990 l = utf8_length(s, s + l);
991 uc = utf8_to_uvchr(s, NULL);
994 if (data && (flags & SCF_DO_SUBSTR))
996 if (flags & SCF_DO_STCLASS_AND) {
997 /* Check whether it is compatible with what we know already! */
1001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
1003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
1005 ANYOF_CLASS_ZERO(data->start_class);
1006 ANYOF_BITMAP_ZERO(data->start_class);
1008 ANYOF_BITMAP_SET(data->start_class, uc);
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 data->start_class->flags |= ANYOF_FOLD;
1011 if (OP(scan) == EXACTFL)
1012 data->start_class->flags |= ANYOF_LOCALE;
1015 else if (flags & SCF_DO_STCLASS_OR) {
1016 if (data->start_class->flags & ANYOF_FOLD) {
1017 /* false positive possible if the class is case-folded.
1018 Assume that the locale settings are the same... */
1020 ANYOF_BITMAP_SET(data->start_class, uc);
1021 data->start_class->flags &= ~ANYOF_EOS;
1023 cl_and(data->start_class, &and_with);
1025 flags &= ~SCF_DO_STCLASS;
1027 else if (strchr((const char*)PL_varies,OP(scan))) {
1028 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1029 I32 f = flags, pos_before = 0;
1030 regnode *oscan = scan;
1031 struct regnode_charclass_class this_class;
1032 struct regnode_charclass_class *oclass = NULL;
1033 I32 next_is_eval = 0;
1035 switch (PL_regkind[(U8)OP(scan)]) {
1036 case WHILEM: /* End of (?:...)* . */
1037 scan = NEXTOPER(scan);
1040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1041 next = NEXTOPER(scan);
1042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1044 maxcount = REG_INFTY;
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1050 if (flags & SCF_DO_SUBSTR)
1055 if (flags & SCF_DO_STCLASS) {
1057 maxcount = REG_INFTY;
1058 next = regnext(scan);
1059 scan = NEXTOPER(scan);
1062 is_inf = is_inf_internal = 1;
1063 scan = regnext(scan);
1064 if (flags & SCF_DO_SUBSTR) {
1065 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1066 data->longest = &(data->longest_float);
1068 goto optimize_curly_tail;
1070 mincount = ARG1(scan);
1071 maxcount = ARG2(scan);
1072 next = regnext(scan);
1073 if (OP(scan) == CURLYX) {
1074 I32 lp = (data ? *(data->last_closep) : 0);
1076 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1078 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1079 next_is_eval = (OP(scan) == EVAL);
1081 if (flags & SCF_DO_SUBSTR) {
1082 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1083 pos_before = data->pos_min;
1087 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1089 data->flags |= SF_IS_INF;
1091 if (flags & SCF_DO_STCLASS) {
1092 cl_init(pRExC_state, &this_class);
1093 oclass = data->start_class;
1094 data->start_class = &this_class;
1095 f |= SCF_DO_STCLASS_AND;
1096 f &= ~SCF_DO_STCLASS_OR;
1098 /* These are the cases when once a subexpression
1099 fails at a particular position, it cannot succeed
1100 even after backtracking at the enclosing scope.
1102 XXXX what if minimal match and we are at the
1103 initial run of {n,m}? */
1104 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105 f &= ~SCF_WHILEM_VISITED_POS;
1107 /* This will finish on WHILEM, setting scan, or on NULL: */
1108 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1110 ? (f & ~SCF_DO_SUBSTR) : f);
1112 if (flags & SCF_DO_STCLASS)
1113 data->start_class = oclass;
1114 if (mincount == 0 || minnext == 0) {
1115 if (flags & SCF_DO_STCLASS_OR) {
1116 cl_or(pRExC_state, data->start_class, &this_class);
1118 else if (flags & SCF_DO_STCLASS_AND) {
1119 /* Switch to OR mode: cache the old value of
1120 * data->start_class */
1121 StructCopy(data->start_class, &and_with,
1122 struct regnode_charclass_class);
1123 flags &= ~SCF_DO_STCLASS_AND;
1124 StructCopy(&this_class, data->start_class,
1125 struct regnode_charclass_class);
1126 flags |= SCF_DO_STCLASS_OR;
1127 data->start_class->flags |= ANYOF_EOS;
1129 } else { /* Non-zero len */
1130 if (flags & SCF_DO_STCLASS_OR) {
1131 cl_or(pRExC_state, data->start_class, &this_class);
1132 cl_and(data->start_class, &and_with);
1134 else if (flags & SCF_DO_STCLASS_AND)
1135 cl_and(data->start_class, &this_class);
1136 flags &= ~SCF_DO_STCLASS;
1138 if (!scan) /* It was not CURLYX, but CURLY. */
1140 if (ckWARN(WARN_REGEXP)
1141 /* ? quantifier ok, except for (?{ ... }) */
1142 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1143 && (minnext == 0) && (deltanext == 0)
1144 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1145 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1148 "Quantifier unexpected on zero-length expression");
1151 min += minnext * mincount;
1152 is_inf_internal |= ((maxcount == REG_INFTY
1153 && (minnext + deltanext) > 0)
1154 || deltanext == I32_MAX);
1155 is_inf |= is_inf_internal;
1156 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1158 /* Try powerful optimization CURLYX => CURLYN. */
1159 if ( OP(oscan) == CURLYX && data
1160 && data->flags & SF_IN_PAR
1161 && !(data->flags & SF_HAS_EVAL)
1162 && !deltanext && minnext == 1 ) {
1163 /* Try to optimize to CURLYN. */
1164 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1165 regnode *nxt1 = nxt;
1172 if (!strchr((const char*)PL_simple,OP(nxt))
1173 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1174 && STR_LEN(nxt) == 1))
1180 if (OP(nxt) != CLOSE)
1182 /* Now we know that nxt2 is the only contents: */
1183 oscan->flags = (U8)ARG(nxt);
1185 OP(nxt1) = NOTHING; /* was OPEN. */
1187 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1191 OP(nxt + 1) = OPTIMIZED; /* was count. */
1192 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1197 /* Try optimization CURLYX => CURLYM. */
1198 if ( OP(oscan) == CURLYX && data
1199 && !(data->flags & SF_HAS_PAR)
1200 && !(data->flags & SF_HAS_EVAL)
1201 && !deltanext /* atom is fixed width */
1202 && minnext != 0 /* CURLYM can't handle zero width */
1204 /* XXXX How to optimize if data == 0? */
1205 /* Optimize to a simpler form. */
1206 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1210 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1211 && (OP(nxt2) != WHILEM))
1213 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1214 /* Need to optimize away parenths. */
1215 if (data->flags & SF_IN_PAR) {
1216 /* Set the parenth number. */
1217 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1219 if (OP(nxt) != CLOSE)
1220 FAIL("Panic opt close");
1221 oscan->flags = (U8)ARG(nxt);
1222 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1223 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1225 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226 OP(nxt + 1) = OPTIMIZED; /* was count. */
1227 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1231 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232 regnode *nnxt = regnext(nxt1);
1235 if (reg_off_by_arg[OP(nxt1)])
1236 ARG_SET(nxt1, nxt2 - nxt1);
1237 else if (nxt2 - nxt1 < U16_MAX)
1238 NEXT_OFF(nxt1) = nxt2 - nxt1;
1240 OP(nxt) = NOTHING; /* Cannot beautify */
1245 /* Optimize again: */
1246 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1252 else if ((OP(oscan) == CURLYX)
1253 && (flags & SCF_WHILEM_VISITED_POS)
1254 /* See the comment on a similar expression above.
1255 However, this time it not a subexpression
1256 we care about, but the expression itself. */
1257 && (maxcount == REG_INFTY)
1258 && data && ++data->whilem_c < 16) {
1259 /* This stays as CURLYX, we can put the count/of pair. */
1260 /* Find WHILEM (as in regexec.c) */
1261 regnode *nxt = oscan + NEXT_OFF(oscan);
1263 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1265 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266 | (RExC_whilem_seen << 4)); /* On WHILEM */
1268 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1270 if (flags & SCF_DO_SUBSTR) {
1271 SV *last_str = Nullsv;
1272 int counted = mincount != 0;
1274 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1275 #if defined(SPARC64_GCC_WORKAROUND)
1281 if (pos_before >= data->last_start_min)
1284 b = data->last_start_min;
1287 s = SvPV(data->last_found, l);
1288 old = b - data->last_start_min;
1291 I32 b = pos_before >= data->last_start_min
1292 ? pos_before : data->last_start_min;
1294 char *s = SvPV(data->last_found, l);
1295 I32 old = b - data->last_start_min;
1299 old = utf8_hop((U8*)s, old) - (U8*)s;
1302 /* Get the added string: */
1303 last_str = newSVpvn(s + old, l);
1305 SvUTF8_on(last_str);
1306 if (deltanext == 0 && pos_before == b) {
1307 /* What was added is a constant string */
1309 SvGROW(last_str, (mincount * l) + 1);
1310 repeatcpy(SvPVX(last_str) + l,
1311 SvPVX_const(last_str), l, mincount - 1);
1312 SvCUR_set(last_str, SvCUR(last_str) * mincount);
1313 /* Add additional parts. */
1314 SvCUR_set(data->last_found,
1315 SvCUR(data->last_found) - l);
1316 sv_catsv(data->last_found, last_str);
1318 SV * sv = data->last_found;
1320 SvUTF8(sv) && SvMAGICAL(sv) ?
1321 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322 if (mg && mg->mg_len >= 0)
1323 mg->mg_len += CHR_SVLEN(last_str);
1325 data->last_end += l * (mincount - 1);
1328 /* start offset must point into the last copy */
1329 data->last_start_min += minnext * (mincount - 1);
1330 data->last_start_max += is_inf ? I32_MAX
1331 : (maxcount - 1) * (minnext + data->pos_delta);
1334 /* It is counted once already... */
1335 data->pos_min += minnext * (mincount - counted);
1336 data->pos_delta += - counted * deltanext +
1337 (minnext + deltanext) * maxcount - minnext * mincount;
1338 if (mincount != maxcount) {
1339 /* Cannot extend fixed substrings found inside
1341 scan_commit(pRExC_state,data);
1342 if (mincount && last_str) {
1343 sv_setsv(data->last_found, last_str);
1344 data->last_end = data->pos_min;
1345 data->last_start_min =
1346 data->pos_min - CHR_SVLEN(last_str);
1347 data->last_start_max = is_inf
1349 : data->pos_min + data->pos_delta
1350 - CHR_SVLEN(last_str);
1352 data->longest = &(data->longest_float);
1354 SvREFCNT_dec(last_str);
1356 if (data && (fl & SF_HAS_EVAL))
1357 data->flags |= SF_HAS_EVAL;
1358 optimize_curly_tail:
1359 if (OP(oscan) != CURLYX) {
1360 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1362 NEXT_OFF(oscan) += NEXT_OFF(next);
1365 default: /* REF and CLUMP only? */
1366 if (flags & SCF_DO_SUBSTR) {
1367 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1368 data->longest = &(data->longest_float);
1370 is_inf = is_inf_internal = 1;
1371 if (flags & SCF_DO_STCLASS_OR)
1372 cl_anything(pRExC_state, data->start_class);
1373 flags &= ~SCF_DO_STCLASS;
1377 else if (strchr((const char*)PL_simple,OP(scan))) {
1380 if (flags & SCF_DO_SUBSTR) {
1381 scan_commit(pRExC_state,data);
1385 if (flags & SCF_DO_STCLASS) {
1386 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1388 /* Some of the logic below assumes that switching
1389 locale on will only add false positives. */
1390 switch (PL_regkind[(U8)OP(scan)]) {
1394 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1396 cl_anything(pRExC_state, data->start_class);
1399 if (OP(scan) == SANY)
1401 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403 || (data->start_class->flags & ANYOF_CLASS));
1404 cl_anything(pRExC_state, data->start_class);
1406 if (flags & SCF_DO_STCLASS_AND || !value)
1407 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1410 if (flags & SCF_DO_STCLASS_AND)
1411 cl_and(data->start_class,
1412 (struct regnode_charclass_class*)scan);
1414 cl_or(pRExC_state, data->start_class,
1415 (struct regnode_charclass_class*)scan);
1418 if (flags & SCF_DO_STCLASS_AND) {
1419 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421 for (value = 0; value < 256; value++)
1422 if (!isALNUM(value))
1423 ANYOF_BITMAP_CLEAR(data->start_class, value);
1427 if (data->start_class->flags & ANYOF_LOCALE)
1428 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1430 for (value = 0; value < 256; value++)
1432 ANYOF_BITMAP_SET(data->start_class, value);
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (data->start_class->flags & ANYOF_LOCALE)
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443 data->start_class->flags |= ANYOF_LOCALE;
1447 if (flags & SCF_DO_STCLASS_AND) {
1448 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450 for (value = 0; value < 256; value++)
1452 ANYOF_BITMAP_CLEAR(data->start_class, value);
1456 if (data->start_class->flags & ANYOF_LOCALE)
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1459 for (value = 0; value < 256; value++)
1460 if (!isALNUM(value))
1461 ANYOF_BITMAP_SET(data->start_class, value);
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (data->start_class->flags & ANYOF_LOCALE)
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1471 data->start_class->flags |= ANYOF_LOCALE;
1472 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1476 if (flags & SCF_DO_STCLASS_AND) {
1477 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479 for (value = 0; value < 256; value++)
1480 if (!isSPACE(value))
1481 ANYOF_BITMAP_CLEAR(data->start_class, value);
1485 if (data->start_class->flags & ANYOF_LOCALE)
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1488 for (value = 0; value < 256; value++)
1490 ANYOF_BITMAP_SET(data->start_class, value);
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (data->start_class->flags & ANYOF_LOCALE)
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1500 data->start_class->flags |= ANYOF_LOCALE;
1501 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1505 if (flags & SCF_DO_STCLASS_AND) {
1506 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508 for (value = 0; value < 256; value++)
1510 ANYOF_BITMAP_CLEAR(data->start_class, value);
1514 if (data->start_class->flags & ANYOF_LOCALE)
1515 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
1519 ANYOF_BITMAP_SET(data->start_class, value);
1524 if (flags & SCF_DO_STCLASS_AND) {
1525 if (data->start_class->flags & ANYOF_LOCALE) {
1526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527 for (value = 0; value < 256; value++)
1528 if (!isSPACE(value))
1529 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 data->start_class->flags |= ANYOF_LOCALE;
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1538 if (flags & SCF_DO_STCLASS_AND) {
1539 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
1542 ANYOF_BITMAP_CLEAR(data->start_class, value);
1545 if (data->start_class->flags & ANYOF_LOCALE)
1546 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1548 for (value = 0; value < 256; value++)
1550 ANYOF_BITMAP_SET(data->start_class, value);
1555 if (flags & SCF_DO_STCLASS_AND) {
1556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557 for (value = 0; value < 256; value++)
1559 ANYOF_BITMAP_CLEAR(data->start_class, value);
1562 if (data->start_class->flags & ANYOF_LOCALE)
1563 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1565 for (value = 0; value < 256; value++)
1566 if (!isDIGIT(value))
1567 ANYOF_BITMAP_SET(data->start_class, value);
1572 if (flags & SCF_DO_STCLASS_OR)
1573 cl_and(data->start_class, &and_with);
1574 flags &= ~SCF_DO_STCLASS;
1577 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1578 data->flags |= (OP(scan) == MEOL
1582 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1583 /* Lookbehind, or need to calculate parens/evals/stclass: */
1584 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1585 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1586 /* Lookahead/lookbehind */
1587 I32 deltanext, minnext, fake = 0;
1589 struct regnode_charclass_class intrnl;
1592 data_fake.flags = 0;
1594 data_fake.whilem_c = data->whilem_c;
1595 data_fake.last_closep = data->last_closep;
1598 data_fake.last_closep = &fake;
1599 if ( flags & SCF_DO_STCLASS && !scan->flags
1600 && OP(scan) == IFMATCH ) { /* Lookahead */
1601 cl_init(pRExC_state, &intrnl);
1602 data_fake.start_class = &intrnl;
1603 f |= SCF_DO_STCLASS_AND;
1605 if (flags & SCF_WHILEM_VISITED_POS)
1606 f |= SCF_WHILEM_VISITED_POS;
1607 next = regnext(scan);
1608 nscan = NEXTOPER(NEXTOPER(scan));
1609 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1612 vFAIL("Variable length lookbehind not implemented");
1614 else if (minnext > U8_MAX) {
1615 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1617 scan->flags = (U8)minnext;
1619 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1621 if (data && (data_fake.flags & SF_HAS_EVAL))
1622 data->flags |= SF_HAS_EVAL;
1624 data->whilem_c = data_fake.whilem_c;
1625 if (f & SCF_DO_STCLASS_AND) {
1626 int was = (data->start_class->flags & ANYOF_EOS);
1628 cl_and(data->start_class, &intrnl);
1630 data->start_class->flags |= ANYOF_EOS;
1633 else if (OP(scan) == OPEN) {
1636 else if (OP(scan) == CLOSE) {
1637 if ((I32)ARG(scan) == is_par) {
1638 next = regnext(scan);
1640 if ( next && (OP(next) != WHILEM) && next < last)
1641 is_par = 0; /* Disable optimization */
1644 *(data->last_closep) = ARG(scan);
1646 else if (OP(scan) == EVAL) {
1648 data->flags |= SF_HAS_EVAL;
1650 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1651 if (flags & SCF_DO_SUBSTR) {
1652 scan_commit(pRExC_state,data);
1653 data->longest = &(data->longest_float);
1655 is_inf = is_inf_internal = 1;
1656 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1657 cl_anything(pRExC_state, data->start_class);
1658 flags &= ~SCF_DO_STCLASS;
1660 /* Else: zero-length, ignore. */
1661 scan = regnext(scan);
1666 *deltap = is_inf_internal ? I32_MAX : delta;
1667 if (flags & SCF_DO_SUBSTR && is_inf)
1668 data->pos_delta = I32_MAX - data->pos_min;
1669 if (is_par > U8_MAX)
1671 if (is_par && pars==1 && data) {
1672 data->flags |= SF_IN_PAR;
1673 data->flags &= ~SF_HAS_PAR;
1675 else if (pars && data) {
1676 data->flags |= SF_HAS_PAR;
1677 data->flags &= ~SF_IN_PAR;
1679 if (flags & SCF_DO_STCLASS_OR)
1680 cl_and(data->start_class, &and_with);
1685 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, const char *s)
1687 if (RExC_rx->data) {
1688 Renewc(RExC_rx->data,
1689 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1690 char, struct reg_data);
1691 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692 RExC_rx->data->count += n;
1695 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1696 char, struct reg_data);
1697 New(1208, RExC_rx->data->what, n, U8);
1698 RExC_rx->data->count = n;
1700 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701 return RExC_rx->data->count - n;
1705 Perl_reginitcolors(pTHX)
1707 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
1709 char *t = savepv(s);
1713 t = strchr(t, '\t');
1719 PL_colors[i] = t = (char *)"";
1724 PL_colors[i++] = (char *)"";
1731 - pregcomp - compile a regular expression into internal code
1733 * We can't allocate space until we know how big the compiled form will be,
1734 * but we can't compile it (and thus know how big it is) until we've got a
1735 * place to put the code. So we cheat: we compile it twice, once with code
1736 * generation turned off and size counting turned on, and once "for real".
1737 * This also means that we don't allocate space until we are sure that the
1738 * thing really will compile successfully, and we never have to move the
1739 * code and thus invalidate pointers into it. (Note that it has to be in
1740 * one piece because free() must be able to free it all.) [NB: not true in perl]
1742 * Beware that the optimization-preparation code in here knows about some
1743 * of the structure of the compiled regexp. [I'll say.]
1746 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1756 RExC_state_t RExC_state;
1757 RExC_state_t *pRExC_state = &RExC_state;
1760 FAIL("NULL regexp argument");
1762 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1766 if (!PL_colorset) reginitcolors();
1767 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1768 PL_colors[4],PL_colors[5],PL_colors[0],
1769 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1771 RExC_flags = pm->op_pmflags;
1775 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1776 RExC_seen_evals = 0;
1779 /* First pass: determine size, legality. */
1786 RExC_emit = &PL_regdummy;
1787 RExC_whilem_seen = 0;
1788 #if 0 /* REGC() is (currently) a NOP at the first pass.
1789 * Clever compilers notice this and complain. --jhi */
1790 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1792 if (reg(pRExC_state, 0, &flags) == NULL) {
1793 RExC_precomp = Nullch;
1796 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1798 /* Small enough for pointer-storage convention?
1799 If extralen==0, this means that we will not need long jumps. */
1800 if (RExC_size >= 0x10000L && RExC_extralen)
1801 RExC_size += RExC_extralen;
1804 if (RExC_whilem_seen > 15)
1805 RExC_whilem_seen = 15;
1807 /* Allocate space and initialize. */
1808 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1811 FAIL("Regexp out of space");
1814 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1815 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1818 r->prelen = xend - exp;
1819 r->precomp = savepvn(RExC_precomp, r->prelen);
1821 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1822 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1824 r->substrs = 0; /* Useful during FAIL. */
1825 r->startp = 0; /* Useful during FAIL. */
1826 r->endp = 0; /* Useful during FAIL. */
1828 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1830 r->offsets[0] = RExC_size;
1832 DEBUG_r(PerlIO_printf(Perl_debug_log,
1833 "%s %"UVuf" bytes for offset annotations.\n",
1834 r->offsets ? "Got" : "Couldn't get",
1835 (UV)((2*RExC_size+1) * sizeof(U32))));
1839 /* Second pass: emit code. */
1840 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1845 RExC_emit_start = r->program;
1846 RExC_emit = r->program;
1847 /* Store the count of eval-groups for security checks: */
1848 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1849 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1851 if (reg(pRExC_state, 0, &flags) == NULL)
1854 /* Dig out information for optimizations. */
1855 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1856 pm->op_pmflags = RExC_flags;
1858 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1859 r->regstclass = NULL;
1860 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1861 r->reganch |= ROPT_NAUGHTY;
1862 scan = r->program + 1; /* First BRANCH. */
1864 /* XXXX To minimize changes to RE engine we always allocate
1865 3-units-long substrs field. */
1866 Newz(1004, r->substrs, 1, struct reg_substr_data);
1868 StructCopy(&zero_scan_data, &data, scan_data_t);
1869 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1870 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1872 STRLEN longest_float_length, longest_fixed_length;
1873 struct regnode_charclass_class ch_class;
1878 /* Skip introductions and multiplicators >= 1. */
1879 while ((OP(first) == OPEN && (sawopen = 1)) ||
1880 /* An OR of *one* alternative - should not happen now. */
1881 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1882 (OP(first) == PLUS) ||
1883 (OP(first) == MINMOD) ||
1884 /* An {n,m} with n>0 */
1885 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1886 if (OP(first) == PLUS)
1889 first += regarglen[(U8)OP(first)];
1890 first = NEXTOPER(first);
1893 /* Starting-point info. */
1895 if (PL_regkind[(U8)OP(first)] == EXACT) {
1896 if (OP(first) == EXACT)
1897 ; /* Empty, get anchored substr later. */
1898 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1899 r->regstclass = first;
1901 else if (strchr((const char*)PL_simple,OP(first)))
1902 r->regstclass = first;
1903 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1904 PL_regkind[(U8)OP(first)] == NBOUND)
1905 r->regstclass = first;
1906 else if (PL_regkind[(U8)OP(first)] == BOL) {
1907 r->reganch |= (OP(first) == MBOL
1909 : (OP(first) == SBOL
1912 first = NEXTOPER(first);
1915 else if (OP(first) == GPOS) {
1916 r->reganch |= ROPT_ANCH_GPOS;
1917 first = NEXTOPER(first);
1920 else if (!sawopen && (OP(first) == STAR &&
1921 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1922 !(r->reganch & ROPT_ANCH) )
1924 /* turn .* into ^.* with an implied $*=1 */
1926 (OP(NEXTOPER(first)) == REG_ANY)
1929 r->reganch |= type | ROPT_IMPLICIT;
1930 first = NEXTOPER(first);
1933 if (sawplus && (!sawopen || !RExC_sawback)
1934 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1935 /* x+ must match at the 1st pos of run of x's */
1936 r->reganch |= ROPT_SKIP;
1938 /* Scan is after the zeroth branch, first is atomic matcher. */
1939 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1940 (IV)(first - scan + 1)));
1942 * If there's something expensive in the r.e., find the
1943 * longest literal string that must appear and make it the
1944 * regmust. Resolve ties in favor of later strings, since
1945 * the regstart check works with the beginning of the r.e.
1946 * and avoiding duplication strengthens checking. Not a
1947 * strong reason, but sufficient in the absence of others.
1948 * [Now we resolve ties in favor of the earlier string if
1949 * it happens that c_offset_min has been invalidated, since the
1950 * earlier string may buy us something the later one won't.]
1954 data.longest_fixed = newSVpvn("",0);
1955 data.longest_float = newSVpvn("",0);
1956 data.last_found = newSVpvn("",0);
1957 data.longest = &(data.longest_fixed);
1959 if (!r->regstclass) {
1960 cl_init(pRExC_state, &ch_class);
1961 data.start_class = &ch_class;
1962 stclass_flag = SCF_DO_STCLASS_AND;
1963 } else /* XXXX Check for BOUND? */
1965 data.last_closep = &last_close;
1967 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1968 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1969 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1970 && data.last_start_min == 0 && data.last_end > 0
1971 && !RExC_seen_zerolen
1972 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1973 r->reganch |= ROPT_CHECK_ALL;
1974 scan_commit(pRExC_state, &data);
1975 SvREFCNT_dec(data.last_found);
1977 longest_float_length = CHR_SVLEN(data.longest_float);
1978 if (longest_float_length
1979 || (data.flags & SF_FL_BEFORE_EOL
1980 && (!(data.flags & SF_FL_BEFORE_MEOL)
1981 || (RExC_flags & PMf_MULTILINE)))) {
1984 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1985 && data.offset_fixed == data.offset_float_min
1986 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1987 goto remove_float; /* As in (a)+. */
1989 if (SvUTF8(data.longest_float)) {
1990 r->float_utf8 = data.longest_float;
1991 r->float_substr = Nullsv;
1993 r->float_substr = data.longest_float;
1994 r->float_utf8 = Nullsv;
1996 r->float_min_offset = data.offset_float_min;
1997 r->float_max_offset = data.offset_float_max;
1998 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1999 && (!(data.flags & SF_FL_BEFORE_MEOL)
2000 || (RExC_flags & PMf_MULTILINE)));
2001 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2005 r->float_substr = r->float_utf8 = Nullsv;
2006 SvREFCNT_dec(data.longest_float);
2007 longest_float_length = 0;
2010 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2011 if (longest_fixed_length
2012 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2013 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2014 || (RExC_flags & PMf_MULTILINE)))) {
2017 if (SvUTF8(data.longest_fixed)) {
2018 r->anchored_utf8 = data.longest_fixed;
2019 r->anchored_substr = Nullsv;
2021 r->anchored_substr = data.longest_fixed;
2022 r->anchored_utf8 = Nullsv;
2024 r->anchored_offset = data.offset_fixed;
2025 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2026 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2027 || (RExC_flags & PMf_MULTILINE)));
2028 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2031 r->anchored_substr = r->anchored_utf8 = Nullsv;
2032 SvREFCNT_dec(data.longest_fixed);
2033 longest_fixed_length = 0;
2036 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2037 r->regstclass = NULL;
2038 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2040 && !(data.start_class->flags & ANYOF_EOS)
2041 && !cl_is_anything(data.start_class))
2043 const I32 n = add_data(pRExC_state, 1, "f");
2045 New(1006, RExC_rx->data->data[n], 1,
2046 struct regnode_charclass_class);
2047 StructCopy(data.start_class,
2048 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2049 struct regnode_charclass_class);
2050 r->regstclass = (regnode*)RExC_rx->data->data[n];
2051 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2052 PL_regdata = r->data; /* for regprop() */
2053 DEBUG_r({ SV *sv = sv_newmortal();
2054 regprop(sv, (regnode*)data.start_class);
2055 PerlIO_printf(Perl_debug_log,
2056 "synthetic stclass \"%s\".\n",
2057 SvPVX_const(sv));});
2060 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2061 if (longest_fixed_length > longest_float_length) {
2062 r->check_substr = r->anchored_substr;
2063 r->check_utf8 = r->anchored_utf8;
2064 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2065 if (r->reganch & ROPT_ANCH_SINGLE)
2066 r->reganch |= ROPT_NOSCAN;
2069 r->check_substr = r->float_substr;
2070 r->check_utf8 = r->float_utf8;
2071 r->check_offset_min = data.offset_float_min;
2072 r->check_offset_max = data.offset_float_max;
2074 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2075 This should be changed ASAP! */
2076 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2077 r->reganch |= RE_USE_INTUIT;
2078 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2079 r->reganch |= RE_INTUIT_TAIL;
2083 /* Several toplevels. Best we can is to set minlen. */
2085 struct regnode_charclass_class ch_class;
2088 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2089 scan = r->program + 1;
2090 cl_init(pRExC_state, &ch_class);
2091 data.start_class = &ch_class;
2092 data.last_closep = &last_close;
2093 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2094 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2095 = r->float_substr = r->float_utf8 = Nullsv;
2096 if (!(data.start_class->flags & ANYOF_EOS)
2097 && !cl_is_anything(data.start_class))
2099 const I32 n = add_data(pRExC_state, 1, "f");
2101 New(1006, RExC_rx->data->data[n], 1,
2102 struct regnode_charclass_class);
2103 StructCopy(data.start_class,
2104 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2105 struct regnode_charclass_class);
2106 r->regstclass = (regnode*)RExC_rx->data->data[n];
2107 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2108 DEBUG_r({ SV* sv = sv_newmortal();
2109 regprop(sv, (regnode*)data.start_class);
2110 PerlIO_printf(Perl_debug_log,
2111 "synthetic stclass \"%s\".\n",
2112 SvPVX_const(sv));});
2117 if (RExC_seen & REG_SEEN_GPOS)
2118 r->reganch |= ROPT_GPOS_SEEN;
2119 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2120 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2121 if (RExC_seen & REG_SEEN_EVAL)
2122 r->reganch |= ROPT_EVAL_SEEN;
2123 if (RExC_seen & REG_SEEN_CANY)
2124 r->reganch |= ROPT_CANY_SEEN;
2125 Newz(1002, r->startp, RExC_npar, I32);
2126 Newz(1002, r->endp, RExC_npar, I32);
2127 PL_regdata = r->data; /* for regprop() */
2128 DEBUG_r(regdump(r));
2133 - reg - regular expression, i.e. main body or parenthesized thing
2135 * Caller must absorb opening parenthesis.
2137 * Combining parenthesis handling with the base level of regular expression
2138 * is a trifle forced, but the need to tie the tails of the branches to what
2139 * follows makes it hard to avoid.
2142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2143 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2145 register regnode *ret; /* Will be the head of the group. */
2146 register regnode *br;
2147 register regnode *lastbr;
2148 register regnode *ender = 0;
2149 register I32 parno = 0;
2150 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2152 /* for (?g), (?gc), and (?o) warnings; warning
2153 about (?c) will warn about (?g) -- japhy */
2155 I32 wastedflags = 0x00,
2158 wasted_gc = 0x02 | 0x04,
2161 char * parse_start = RExC_parse; /* MJD */
2162 char *oregcomp_parse = RExC_parse;
2165 *flagp = 0; /* Tentatively. */
2168 /* Make an OPEN node, if parenthesized. */
2170 if (*RExC_parse == '?') { /* (?...) */
2171 U32 posflags = 0, negflags = 0;
2172 U32 *flagsp = &posflags;
2174 char *seqstart = RExC_parse;
2177 paren = *RExC_parse++;
2178 ret = NULL; /* For look-ahead/behind. */
2180 case '<': /* (?<...) */
2181 RExC_seen |= REG_SEEN_LOOKBEHIND;
2182 if (*RExC_parse == '!')
2184 if (*RExC_parse != '=' && *RExC_parse != '!')
2187 case '=': /* (?=...) */
2188 case '!': /* (?!...) */
2189 RExC_seen_zerolen++;
2190 case ':': /* (?:...) */
2191 case '>': /* (?>...) */
2193 case '$': /* (?$...) */
2194 case '@': /* (?@...) */
2195 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2197 case '#': /* (?#...) */
2198 while (*RExC_parse && *RExC_parse != ')')
2200 if (*RExC_parse != ')')
2201 FAIL("Sequence (?#... not terminated");
2202 nextchar(pRExC_state);
2205 case 'p': /* (?p...) */
2206 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2207 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2209 case '?': /* (??...) */
2211 if (*RExC_parse != '{')
2213 paren = *RExC_parse++;
2215 case '{': /* (?{...}) */
2217 I32 count = 1, n = 0;
2219 char *s = RExC_parse;
2221 OP_4tree *sop, *rop;
2223 RExC_seen_zerolen++;
2224 RExC_seen |= REG_SEEN_EVAL;
2225 while (count && (c = *RExC_parse)) {
2226 if (c == '\\' && RExC_parse[1])
2234 if (*RExC_parse != ')')
2237 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2242 if (RExC_parse - 1 - s)
2243 sv = newSVpvn(s, RExC_parse - 1 - s);
2245 sv = newSVpvn("", 0);
2248 Perl_save_re_context(aTHX);
2249 rop = sv_compile_2op(sv, &sop, "re", &pad);
2250 sop->op_private |= OPpREFCOUNTED;
2251 /* re_dup will OpREFCNT_inc */
2252 OpREFCNT_set(sop, 1);
2255 n = add_data(pRExC_state, 3, "nop");
2256 RExC_rx->data->data[n] = (void*)rop;
2257 RExC_rx->data->data[n+1] = (void*)sop;
2258 RExC_rx->data->data[n+2] = (void*)pad;
2261 else { /* First pass */
2262 if (PL_reginterp_cnt < ++RExC_seen_evals
2264 /* No compiled RE interpolated, has runtime
2265 components ===> unsafe. */
2266 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2267 if (PL_tainting && PL_tainted)
2268 FAIL("Eval-group in insecure regular expression");
2271 nextchar(pRExC_state);
2273 ret = reg_node(pRExC_state, LOGICAL);
2276 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2277 /* deal with the length of this later - MJD */
2280 ret = reganode(pRExC_state, EVAL, n);
2281 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2282 Set_Node_Offset(ret, parse_start);
2285 case '(': /* (?(?{...})...) and (?(?=...)...) */
2287 if (RExC_parse[0] == '?') { /* (?(?...)) */
2288 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2289 || RExC_parse[1] == '<'
2290 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2293 ret = reg_node(pRExC_state, LOGICAL);
2296 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2300 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2302 parno = atoi(RExC_parse++);
2304 while (isDIGIT(*RExC_parse))
2306 ret = reganode(pRExC_state, GROUPP, parno);
2308 if ((c = *nextchar(pRExC_state)) != ')')
2309 vFAIL("Switch condition not recognized");
2311 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2312 br = regbranch(pRExC_state, &flags, 1);
2314 br = reganode(pRExC_state, LONGJMP, 0);
2316 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2317 c = *nextchar(pRExC_state);
2321 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2322 regbranch(pRExC_state, &flags, 1);
2323 regtail(pRExC_state, ret, lastbr);
2326 c = *nextchar(pRExC_state);
2331 vFAIL("Switch (?(condition)... contains too many branches");
2332 ender = reg_node(pRExC_state, TAIL);
2333 regtail(pRExC_state, br, ender);
2335 regtail(pRExC_state, lastbr, ender);
2336 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2339 regtail(pRExC_state, ret, ender);
2343 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2347 RExC_parse--; /* for vFAIL to print correctly */
2348 vFAIL("Sequence (? incomplete");
2352 parse_flags: /* (?i) */
2353 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2354 /* (?g), (?gc) and (?o) are useless here
2355 and must be globally applied -- japhy */
2357 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2358 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2359 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2360 if (! (wastedflags & wflagbit) ) {
2361 wastedflags |= wflagbit;
2364 "Useless (%s%c) - %suse /%c modifier",
2365 flagsp == &negflags ? "?-" : "?",
2367 flagsp == &negflags ? "don't " : "",
2373 else if (*RExC_parse == 'c') {
2374 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2375 if (! (wastedflags & wasted_c) ) {
2376 wastedflags |= wasted_gc;
2379 "Useless (%sc) - %suse /gc modifier",
2380 flagsp == &negflags ? "?-" : "?",
2381 flagsp == &negflags ? "don't " : ""
2386 else { pmflag(flagsp, *RExC_parse); }
2390 if (*RExC_parse == '-') {
2392 wastedflags = 0; /* reset so (?g-c) warns twice */
2396 RExC_flags |= posflags;
2397 RExC_flags &= ~negflags;
2398 if (*RExC_parse == ':') {
2404 if (*RExC_parse != ')') {
2406 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2408 nextchar(pRExC_state);
2416 ret = reganode(pRExC_state, OPEN, parno);
2417 Set_Node_Length(ret, 1); /* MJD */
2418 Set_Node_Offset(ret, RExC_parse); /* MJD */
2425 /* Pick up the branches, linking them together. */
2426 parse_start = RExC_parse; /* MJD */
2427 br = regbranch(pRExC_state, &flags, 1);
2428 /* branch_len = (paren != 0); */
2432 if (*RExC_parse == '|') {
2433 if (!SIZE_ONLY && RExC_extralen) {
2434 reginsert(pRExC_state, BRANCHJ, br);
2437 reginsert(pRExC_state, BRANCH, br);
2438 Set_Node_Length(br, paren != 0);
2439 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2443 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2445 else if (paren == ':') {
2446 *flagp |= flags&SIMPLE;
2448 if (open) { /* Starts with OPEN. */
2449 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2451 else if (paren != '?') /* Not Conditional */
2453 *flagp |= flags & (SPSTART | HASWIDTH);
2455 while (*RExC_parse == '|') {
2456 if (!SIZE_ONLY && RExC_extralen) {
2457 ender = reganode(pRExC_state, LONGJMP,0);
2458 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2461 RExC_extralen += 2; /* Account for LONGJMP. */
2462 nextchar(pRExC_state);
2463 br = regbranch(pRExC_state, &flags, 0);
2467 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2471 *flagp |= flags&SPSTART;
2474 if (have_branch || paren != ':') {
2475 /* Make a closing node, and hook it on the end. */
2478 ender = reg_node(pRExC_state, TAIL);
2481 ender = reganode(pRExC_state, CLOSE, parno);
2482 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2483 Set_Node_Length(ender,1); /* MJD */
2489 *flagp &= ~HASWIDTH;
2492 ender = reg_node(pRExC_state, SUCCEED);
2495 ender = reg_node(pRExC_state, END);
2498 regtail(pRExC_state, lastbr, ender);
2501 /* Hook the tails of the branches to the closing node. */
2502 for (br = ret; br != NULL; br = regnext(br)) {
2503 regoptail(pRExC_state, br, ender);
2510 static const char parens[] = "=!<,>";
2512 if (paren && (p = strchr(parens, paren))) {
2513 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2514 int flag = (p - parens) > 1;
2517 node = SUSPEND, flag = 0;
2518 reginsert(pRExC_state, node,ret);
2519 Set_Node_Cur_Length(ret);
2520 Set_Node_Offset(ret, parse_start + 1);
2522 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2526 /* Check for proper termination. */
2528 RExC_flags = oregflags;
2529 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2530 RExC_parse = oregcomp_parse;
2531 vFAIL("Unmatched (");
2534 else if (!paren && RExC_parse < RExC_end) {
2535 if (*RExC_parse == ')') {
2537 vFAIL("Unmatched )");
2540 FAIL("Junk on end of regexp"); /* "Can't happen". */
2548 - regbranch - one alternative of an | operator
2550 * Implements the concatenation operator.
2553 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2555 register regnode *ret;
2556 register regnode *chain = NULL;
2557 register regnode *latest;
2558 I32 flags = 0, c = 0;
2563 if (!SIZE_ONLY && RExC_extralen)
2564 ret = reganode(pRExC_state, BRANCHJ,0);
2566 ret = reg_node(pRExC_state, BRANCH);
2567 Set_Node_Length(ret, 1);
2571 if (!first && SIZE_ONLY)
2572 RExC_extralen += 1; /* BRANCHJ */
2574 *flagp = WORST; /* Tentatively. */
2577 nextchar(pRExC_state);
2578 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2580 latest = regpiece(pRExC_state, &flags);
2581 if (latest == NULL) {
2582 if (flags & TRYAGAIN)
2586 else if (ret == NULL)
2588 *flagp |= flags&HASWIDTH;
2589 if (chain == NULL) /* First piece. */
2590 *flagp |= flags&SPSTART;
2593 regtail(pRExC_state, chain, latest);
2598 if (chain == NULL) { /* Loop ran zero times. */
2599 chain = reg_node(pRExC_state, NOTHING);
2604 *flagp |= flags&SIMPLE;
2611 - regpiece - something followed by possible [*+?]
2613 * Note that the branching code sequences used for ? and the general cases
2614 * of * and + are somewhat optimized: they use the same NOTHING node as
2615 * both the endmarker for their branch list and the body of the last branch.
2616 * It might seem that this node could be dispensed with entirely, but the
2617 * endmarker role is not redundant.
2620 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2622 register regnode *ret;
2624 register char *next;
2626 const char * const origparse = RExC_parse;
2629 I32 max = REG_INFTY;
2632 ret = regatom(pRExC_state, &flags);
2634 if (flags & TRYAGAIN)
2641 if (op == '{' && regcurly(RExC_parse)) {
2642 parse_start = RExC_parse; /* MJD */
2643 next = RExC_parse + 1;
2645 while (isDIGIT(*next) || *next == ',') {
2654 if (*next == '}') { /* got one */
2658 min = atoi(RExC_parse);
2662 maxpos = RExC_parse;
2664 if (!max && *maxpos != '0')
2665 max = REG_INFTY; /* meaning "infinity" */
2666 else if (max >= REG_INFTY)
2667 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2669 nextchar(pRExC_state);
2672 if ((flags&SIMPLE)) {
2673 RExC_naughty += 2 + RExC_naughty / 2;
2674 reginsert(pRExC_state, CURLY, ret);
2675 Set_Node_Offset(ret, parse_start+1); /* MJD */
2676 Set_Node_Cur_Length(ret);
2679 regnode *w = reg_node(pRExC_state, WHILEM);
2682 regtail(pRExC_state, ret, w);
2683 if (!SIZE_ONLY && RExC_extralen) {
2684 reginsert(pRExC_state, LONGJMP,ret);
2685 reginsert(pRExC_state, NOTHING,ret);
2686 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2688 reginsert(pRExC_state, CURLYX,ret);
2690 Set_Node_Offset(ret, parse_start+1);
2691 Set_Node_Length(ret,
2692 op == '{' ? (RExC_parse - parse_start) : 1);
2694 if (!SIZE_ONLY && RExC_extralen)
2695 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2696 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2698 RExC_whilem_seen++, RExC_extralen += 3;
2699 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2707 if (max && max < min)
2708 vFAIL("Can't do {n,m} with n > m");
2710 ARG1_SET(ret, (U16)min);
2711 ARG2_SET(ret, (U16)max);
2723 #if 0 /* Now runtime fix should be reliable. */
2725 /* if this is reinstated, don't forget to put this back into perldiag:
2727 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2729 (F) The part of the regexp subject to either the * or + quantifier
2730 could match an empty string. The {#} shows in the regular
2731 expression about where the problem was discovered.
2735 if (!(flags&HASWIDTH) && op != '?')
2736 vFAIL("Regexp *+ operand could be empty");
2739 parse_start = RExC_parse;
2740 nextchar(pRExC_state);
2742 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2744 if (op == '*' && (flags&SIMPLE)) {
2745 reginsert(pRExC_state, STAR, ret);
2749 else if (op == '*') {
2753 else if (op == '+' && (flags&SIMPLE)) {
2754 reginsert(pRExC_state, PLUS, ret);
2758 else if (op == '+') {
2762 else if (op == '?') {
2767 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2769 "%.*s matches null string many times",
2770 RExC_parse - origparse,
2774 if (*RExC_parse == '?') {
2775 nextchar(pRExC_state);
2776 reginsert(pRExC_state, MINMOD, ret);
2777 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2779 if (ISMULT2(RExC_parse)) {
2781 vFAIL("Nested quantifiers");
2788 - regatom - the lowest level
2790 * Optimization: gobbles an entire sequence of ordinary characters so that
2791 * it can turn them into a single node, which is smaller to store and
2792 * faster to run. Backslashed characters are exceptions, each becoming a
2793 * separate node; the code is simpler that way and it's not worth fixing.
2795 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2797 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2799 register regnode *ret = 0;
2801 char *parse_start = RExC_parse;
2803 *flagp = WORST; /* Tentatively. */
2806 switch (*RExC_parse) {
2808 RExC_seen_zerolen++;
2809 nextchar(pRExC_state);
2810 if (RExC_flags & PMf_MULTILINE)
2811 ret = reg_node(pRExC_state, MBOL);
2812 else if (RExC_flags & PMf_SINGLELINE)
2813 ret = reg_node(pRExC_state, SBOL);
2815 ret = reg_node(pRExC_state, BOL);
2816 Set_Node_Length(ret, 1); /* MJD */
2819 nextchar(pRExC_state);
2821 RExC_seen_zerolen++;
2822 if (RExC_flags & PMf_MULTILINE)
2823 ret = reg_node(pRExC_state, MEOL);
2824 else if (RExC_flags & PMf_SINGLELINE)
2825 ret = reg_node(pRExC_state, SEOL);
2827 ret = reg_node(pRExC_state, EOL);
2828 Set_Node_Length(ret, 1); /* MJD */
2831 nextchar(pRExC_state);
2832 if (RExC_flags & PMf_SINGLELINE)
2833 ret = reg_node(pRExC_state, SANY);
2835 ret = reg_node(pRExC_state, REG_ANY);
2836 *flagp |= HASWIDTH|SIMPLE;
2838 Set_Node_Length(ret, 1); /* MJD */
2842 char *oregcomp_parse = ++RExC_parse;
2843 ret = regclass(pRExC_state);
2844 if (*RExC_parse != ']') {
2845 RExC_parse = oregcomp_parse;
2846 vFAIL("Unmatched [");
2848 nextchar(pRExC_state);
2849 *flagp |= HASWIDTH|SIMPLE;
2850 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2854 nextchar(pRExC_state);
2855 ret = reg(pRExC_state, 1, &flags);
2857 if (flags & TRYAGAIN) {
2858 if (RExC_parse == RExC_end) {
2859 /* Make parent create an empty node if needed. */
2867 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2871 if (flags & TRYAGAIN) {
2875 vFAIL("Internal urp");
2876 /* Supposed to be caught earlier. */
2879 if (!regcurly(RExC_parse)) {
2888 vFAIL("Quantifier follows nothing");
2891 switch (*++RExC_parse) {
2893 RExC_seen_zerolen++;
2894 ret = reg_node(pRExC_state, SBOL);
2896 nextchar(pRExC_state);
2897 Set_Node_Length(ret, 2); /* MJD */
2900 ret = reg_node(pRExC_state, GPOS);
2901 RExC_seen |= REG_SEEN_GPOS;
2903 nextchar(pRExC_state);
2904 Set_Node_Length(ret, 2); /* MJD */
2907 ret = reg_node(pRExC_state, SEOL);
2909 RExC_seen_zerolen++; /* Do not optimize RE away */
2910 nextchar(pRExC_state);
2913 ret = reg_node(pRExC_state, EOS);
2915 RExC_seen_zerolen++; /* Do not optimize RE away */
2916 nextchar(pRExC_state);
2917 Set_Node_Length(ret, 2); /* MJD */
2920 ret = reg_node(pRExC_state, CANY);
2921 RExC_seen |= REG_SEEN_CANY;
2922 *flagp |= HASWIDTH|SIMPLE;
2923 nextchar(pRExC_state);
2924 Set_Node_Length(ret, 2); /* MJD */
2927 ret = reg_node(pRExC_state, CLUMP);
2929 nextchar(pRExC_state);
2930 Set_Node_Length(ret, 2); /* MJD */
2933 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2934 *flagp |= HASWIDTH|SIMPLE;
2935 nextchar(pRExC_state);
2936 Set_Node_Length(ret, 2); /* MJD */
2939 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2940 *flagp |= HASWIDTH|SIMPLE;
2941 nextchar(pRExC_state);
2942 Set_Node_Length(ret, 2); /* MJD */
2945 RExC_seen_zerolen++;
2946 RExC_seen |= REG_SEEN_LOOKBEHIND;
2947 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2949 nextchar(pRExC_state);
2950 Set_Node_Length(ret, 2); /* MJD */
2953 RExC_seen_zerolen++;
2954 RExC_seen |= REG_SEEN_LOOKBEHIND;
2955 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2957 nextchar(pRExC_state);
2958 Set_Node_Length(ret, 2); /* MJD */
2961 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2962 *flagp |= HASWIDTH|SIMPLE;
2963 nextchar(pRExC_state);
2964 Set_Node_Length(ret, 2); /* MJD */
2967 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2968 *flagp |= HASWIDTH|SIMPLE;
2969 nextchar(pRExC_state);
2970 Set_Node_Length(ret, 2); /* MJD */
2973 ret = reg_node(pRExC_state, DIGIT);
2974 *flagp |= HASWIDTH|SIMPLE;
2975 nextchar(pRExC_state);
2976 Set_Node_Length(ret, 2); /* MJD */
2979 ret = reg_node(pRExC_state, NDIGIT);
2980 *flagp |= HASWIDTH|SIMPLE;
2981 nextchar(pRExC_state);
2982 Set_Node_Length(ret, 2); /* MJD */
2987 char* oldregxend = RExC_end;
2988 char* parse_start = RExC_parse - 2;
2990 if (RExC_parse[1] == '{') {
2991 /* a lovely hack--pretend we saw [\pX] instead */
2992 RExC_end = strchr(RExC_parse, '}');
2994 U8 c = (U8)*RExC_parse;
2996 RExC_end = oldregxend;
2997 vFAIL2("Missing right brace on \\%c{}", c);
3002 RExC_end = RExC_parse + 2;
3003 if (RExC_end > oldregxend)
3004 RExC_end = oldregxend;
3008 ret = regclass(pRExC_state);
3010 RExC_end = oldregxend;
3013 Set_Node_Offset(ret, parse_start + 2);
3014 Set_Node_Cur_Length(ret);
3015 nextchar(pRExC_state);
3016 *flagp |= HASWIDTH|SIMPLE;
3029 case '1': case '2': case '3': case '4':
3030 case '5': case '6': case '7': case '8': case '9':
3032 const I32 num = atoi(RExC_parse);
3034 if (num > 9 && num >= RExC_npar)
3037 char * parse_start = RExC_parse - 1; /* MJD */
3038 while (isDIGIT(*RExC_parse))
3041 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3042 vFAIL("Reference to nonexistent group");
3044 ret = reganode(pRExC_state,
3045 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3049 /* override incorrect value set in reganode MJD */
3050 Set_Node_Offset(ret, parse_start+1);
3051 Set_Node_Cur_Length(ret); /* MJD */
3053 nextchar(pRExC_state);
3058 if (RExC_parse >= RExC_end)
3059 FAIL("Trailing \\");
3062 /* Do not generate "unrecognized" warnings here, we fall
3063 back into the quick-grab loop below */
3070 if (RExC_flags & PMf_EXTENDED) {
3071 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3072 if (RExC_parse < RExC_end)
3078 register STRLEN len;
3083 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
3085 parse_start = RExC_parse - 1;
3091 ret = reg_node(pRExC_state,
3092 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3094 for (len = 0, p = RExC_parse - 1;
3095 len < 127 && p < RExC_end;
3100 if (RExC_flags & PMf_EXTENDED)
3101 p = regwhite(p, RExC_end);
3148 ender = ASCII_TO_NATIVE('\033');
3152 ender = ASCII_TO_NATIVE('\007');
3157 char* const e = strchr(p, '}');
3161 vFAIL("Missing right brace on \\x{}");
3164 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3165 | PERL_SCAN_DISALLOW_PREFIX;
3166 STRLEN numlen = e - p - 1;
3167 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3174 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3176 ender = grok_hex(p, &numlen, &flags, NULL);
3182 ender = UCHARAT(p++);
3183 ender = toCTRL(ender);
3185 case '0': case '1': case '2': case '3':case '4':
3186 case '5': case '6': case '7': case '8':case '9':
3188 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3191 ender = grok_oct(p, &numlen, &flags, NULL);
3201 FAIL("Trailing \\");
3204 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3205 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3206 goto normal_default;
3211 if (UTF8_IS_START(*p) && UTF) {
3213 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3221 if (RExC_flags & PMf_EXTENDED)
3222 p = regwhite(p, RExC_end);
3224 /* Prime the casefolded buffer. */
3225 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3227 if (ISMULT2(p)) { /* Back off on ?+*. */
3234 /* Emit all the Unicode characters. */
3236 for (foldbuf = tmpbuf;
3238 foldlen -= numlen) {
3239 ender = utf8_to_uvchr(foldbuf, &numlen);
3241 reguni(pRExC_state, ender, s, &unilen);
3244 /* In EBCDIC the numlen
3245 * and unilen can differ. */
3247 if (numlen >= foldlen)
3251 break; /* "Can't happen." */
3255 reguni(pRExC_state, ender, s, &unilen);
3264 REGC((char)ender, s++);
3272 /* Emit all the Unicode characters. */
3274 for (foldbuf = tmpbuf;
3276 foldlen -= numlen) {
3277 ender = utf8_to_uvchr(foldbuf, &numlen);
3279 reguni(pRExC_state, ender, s, &unilen);
3282 /* In EBCDIC the numlen
3283 * and unilen can differ. */
3285 if (numlen >= foldlen)
3293 reguni(pRExC_state, ender, s, &unilen);
3302 REGC((char)ender, s++);
3306 Set_Node_Cur_Length(ret); /* MJD */
3307 nextchar(pRExC_state);
3309 /* len is STRLEN which is unsigned, need to copy to signed */
3312 vFAIL("Internal disaster");
3316 if (len == 1 && UNI_IS_INVARIANT(ender))
3321 RExC_size += STR_SZ(len);
3323 RExC_emit += STR_SZ(len);
3328 /* If the encoding pragma is in effect recode the text of
3329 * any EXACT-kind nodes. */
3330 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3331 STRLEN oldlen = STR_LEN(ret);
3332 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3336 if (sv_utf8_downgrade(sv, TRUE)) {
3337 const char * const s = sv_recode_to_utf8(sv, PL_encoding);
3338 const STRLEN newlen = SvCUR(sv);
3343 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344 (int)oldlen, STRING(ret),
3346 Copy(s, STRING(ret), newlen, char);
3347 STR_LEN(ret) += newlen - oldlen;
3348 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3350 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3358 S_regwhite(pTHX_ char *p, const char *e)
3363 else if (*p == '#') {
3366 } while (p < e && *p != '\n');
3374 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375 Character classes ([:foo:]) can also be negated ([:^foo:]).
3376 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3378 but trigger failures because they are currently unimplemented. */
3380 #define POSIXCC_DONE(c) ((c) == ':')
3381 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3385 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3388 I32 namedclass = OOB_NAMEDCLASS;
3390 if (value == '[' && RExC_parse + 1 < RExC_end &&
3391 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3392 POSIXCC(UCHARAT(RExC_parse))) {
3393 const char c = UCHARAT(RExC_parse);
3394 char* s = RExC_parse++;
3396 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3398 if (RExC_parse == RExC_end)
3399 /* Grandfather lone [:, [=, [. */
3402 const char* t = RExC_parse++; /* skip over the c */
3406 if (UCHARAT(RExC_parse) == ']') {
3407 RExC_parse++; /* skip over the ending ] */
3410 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3411 const I32 skip = t - posixcc;
3413 /* Initially switch on the length of the name. */
3416 if (memEQ(posixcc, "word", 4)) {
3417 /* this is not POSIX, this is the Perl \w */;
3419 = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3423 /* Names all of length 5. */
3424 /* alnum alpha ascii blank cntrl digit graph lower
3425 print punct space upper */
3426 /* Offset 4 gives the best switch position. */
3427 switch (posixcc[4]) {
3429 if (memEQ(posixcc, "alph", 4)) {
3432 = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3436 if (memEQ(posixcc, "spac", 4)) {
3439 = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3443 if (memEQ(posixcc, "grap", 4)) {
3446 = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3450 if (memEQ(posixcc, "asci", 4)) {
3453 = complement ? ANYOF_NASCII : ANYOF_ASCII;
3457 if (memEQ(posixcc, "blan", 4)) {
3460 = complement ? ANYOF_NBLANK : ANYOF_BLANK;
3464 if (memEQ(posixcc, "cntr", 4)) {
3467 = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3471 if (memEQ(posixcc, "alnu", 4)) {
3474 = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3478 if (memEQ(posixcc, "lowe", 4)) {
3481 = complement ? ANYOF_NLOWER : ANYOF_LOWER;
3483 if (memEQ(posixcc, "uppe", 4)) {
3486 = complement ? ANYOF_NUPPER : ANYOF_UPPER;
3490 if (memEQ(posixcc, "digi", 4)) {
3493 = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3495 if (memEQ(posixcc, "prin", 4)) {
3498 = complement ? ANYOF_NPRINT : ANYOF_PRINT;
3500 if (memEQ(posixcc, "punc", 4)) {
3503 = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3509 if (memEQ(posixcc, "xdigit", 6)) {
3511 = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3516 if (namedclass == OOB_NAMEDCLASS)
3518 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3521 assert (posixcc[skip] == ':');
3522 assert (posixcc[skip+1] == ']');
3523 } else if (!SIZE_ONLY) {
3524 /* [[=foo=]] and [[.foo.]] are still future. */
3526 /* adjust RExC_parse so the warning shows after
3528 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3530 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3533 /* Maternal grandfather:
3534 * "[:" ending in ":" but not in ":]" */
3544 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3546 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3547 const char *s = RExC_parse;
3548 const char c = *s++;
3550 while(*s && isALNUM(*s))
3552 if (*s && c == *s && s[1] == ']') {
3553 if (ckWARN(WARN_REGEXP))
3555 "POSIX syntax [%c %c] belongs inside character classes",
3558 /* [[=foo=]] and [[.foo.]] are still future. */
3559 if (POSIXCC_NOTYET(c)) {
3560 /* adjust RExC_parse so the error shows after
3562 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3564 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3571 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3574 register UV nextvalue;
3575 register IV prevvalue = OOB_UNICODE;
3576 register IV range = 0;
3577 register regnode *ret;
3580 char *rangebegin = 0;
3581 bool need_class = 0;
3582 SV *listsv = Nullsv;
3585 bool optimize_invert = TRUE;
3586 AV* unicode_alternate = 0;
3588 UV literal_endpoint = 0;
3591 ret = reganode(pRExC_state, ANYOF, 0);
3594 ANYOF_FLAGS(ret) = 0;
3596 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3600 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3604 RExC_size += ANYOF_SKIP;
3606 RExC_emit += ANYOF_SKIP;
3608 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3610 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3611 ANYOF_BITMAP_ZERO(ret);
3612 listsv = newSVpvn("# comment\n", 10);
3615 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3617 if (!SIZE_ONLY && POSIXCC(nextvalue))
3618 checkposixcc(pRExC_state);
3620 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3621 if (UCHARAT(RExC_parse) == ']')
3624 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3628 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3631 rangebegin = RExC_parse;
3633 value = utf8n_to_uvchr((U8*)RExC_parse,
3634 RExC_end - RExC_parse,
3636 RExC_parse += numlen;
3639 value = UCHARAT(RExC_parse++);
3640 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3641 if (value == '[' && POSIXCC(nextvalue))
3642 namedclass = regpposixcc(pRExC_state, value);
3643 else if (value == '\\') {
3645 value = utf8n_to_uvchr((U8*)RExC_parse,
3646 RExC_end - RExC_parse,
3648 RExC_parse += numlen;
3651 value = UCHARAT(RExC_parse++);
3652 /* Some compilers cannot handle switching on 64-bit integer
3653 * values, therefore value cannot be an UV. Yes, this will
3654 * be a problem later if we want switch on Unicode.
3655 * A similar issue a little bit later when switching on
3656 * namedclass. --jhi */
3657 switch ((I32)value) {
3658 case 'w': namedclass = ANYOF_ALNUM; break;
3659 case 'W': namedclass = ANYOF_NALNUM; break;
3660 case 's': namedclass = ANYOF_SPACE; break;
3661 case 'S': namedclass = ANYOF_NSPACE; break;
3662 case 'd': namedclass = ANYOF_DIGIT; break;
3663 case 'D': namedclass = ANYOF_NDIGIT; break;
3666 if (RExC_parse >= RExC_end)
3667 vFAIL2("Empty \\%c{}", (U8)value);
3668 if (*RExC_parse == '{') {
3669 const U8 c = (U8)value;
3670 e = strchr(RExC_parse++, '}');
3672 vFAIL2("Missing right brace on \\%c{}", c);
3673 while (isSPACE(UCHARAT(RExC_parse)))
3675 if (e == RExC_parse)
3676 vFAIL2("Empty \\%c{}", c);
3678 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3686 if (UCHARAT(RExC_parse) == '^') {
3689 value = value == 'p' ? 'P' : 'p'; /* toggle */
3690 while (isSPACE(UCHARAT(RExC_parse))) {
3696 Perl_sv_catpvf(aTHX_ listsv,
3697 "+utf8::%.*s\n", (int)n, RExC_parse);
3699 Perl_sv_catpvf(aTHX_ listsv,
3700 "!utf8::%.*s\n", (int)n, RExC_parse);
3703 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3704 namedclass = ANYOF_MAX; /* no official name, but it's named */
3706 case 'n': value = '\n'; break;
3707 case 'r': value = '\r'; break;
3708 case 't': value = '\t'; break;
3709 case 'f': value = '\f'; break;
3710 case 'b': value = '\b'; break;
3711 case 'e': value = ASCII_TO_NATIVE('\033');break;
3712 case 'a': value = ASCII_TO_NATIVE('\007');break;
3714 if (*RExC_parse == '{') {
3715 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3716 | PERL_SCAN_DISALLOW_PREFIX;
3717 e = strchr(RExC_parse++, '}');
3719 vFAIL("Missing right brace on \\x{}");
3721 numlen = e - RExC_parse;
3722 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3726 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3728 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3729 RExC_parse += numlen;
3733 value = UCHARAT(RExC_parse++);
3734 value = toCTRL(value);
3736 case '0': case '1': case '2': case '3': case '4':
3737 case '5': case '6': case '7': case '8': case '9':
3741 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3742 RExC_parse += numlen;
3746 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3748 "Unrecognized escape \\%c in character class passed through",
3752 } /* end of \blah */
3758 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3760 if (!SIZE_ONLY && !need_class)
3761 ANYOF_CLASS_ZERO(ret);
3765 /* a bad range like a-\d, a-[:digit:] ? */
3768 if (ckWARN(WARN_REGEXP))
3770 "False [] range \"%*.*s\"",
3771 RExC_parse - rangebegin,
3772 RExC_parse - rangebegin,
3774 if (prevvalue < 256) {
3775 ANYOF_BITMAP_SET(ret, prevvalue);
3776 ANYOF_BITMAP_SET(ret, '-');
3779 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3780 Perl_sv_catpvf(aTHX_ listsv,
3781 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3785 range = 0; /* this was not a true range */
3789 const char *what = NULL;
3792 if (namedclass > OOB_NAMEDCLASS)
3793 optimize_invert = FALSE;
3794 /* Possible truncation here but in some 64-bit environments
3795 * the compiler gets heartburn about switch on 64-bit values.
3796 * A similar issue a little earlier when switching on value.
3798 switch ((I32)namedclass) {
3801 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3803 for (value = 0; value < 256; value++)
3805 ANYOF_BITMAP_SET(ret, value);
3812 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3814 for (value = 0; value < 256; value++)
3815 if (!isALNUM(value))
3816 ANYOF_BITMAP_SET(ret, value);
3823 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3825 for (value = 0; value < 256; value++)
3826 if (isALNUMC(value))
3827 ANYOF_BITMAP_SET(ret, value);
3834 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3836 for (value = 0; value < 256; value++)
3837 if (!isALNUMC(value))
3838 ANYOF_BITMAP_SET(ret, value);
3845 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3847 for (value = 0; value < 256; value++)
3849 ANYOF_BITMAP_SET(ret, value);
3856 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3858 for (value = 0; value < 256; value++)
3859 if (!isALPHA(value))
3860 ANYOF_BITMAP_SET(ret, value);
3867 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3870 for (value = 0; value < 128; value++)
3871 ANYOF_BITMAP_SET(ret, value);
3873 for (value = 0; value < 256; value++) {
3875 ANYOF_BITMAP_SET(ret, value);
3884 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3887 for (value = 128; value < 256; value++)
3888 ANYOF_BITMAP_SET(ret, value);
3890 for (value = 0; value < 256; value++) {
3891 if (!isASCII(value))
3892 ANYOF_BITMAP_SET(ret, value);
3901 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3903 for (value = 0; value < 256; value++)
3905 ANYOF_BITMAP_SET(ret, value);
3912 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3914 for (value = 0; value < 256; value++)
3915 if (!isBLANK(value))
3916 ANYOF_BITMAP_SET(ret, value);
3923 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3925 for (value = 0; value < 256; value++)
3927 ANYOF_BITMAP_SET(ret, value);
3934 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3936 for (value = 0; value < 256; value++)
3937 if (!isCNTRL(value))
3938 ANYOF_BITMAP_SET(ret, value);
3945 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3947 /* consecutive digits assumed */
3948 for (value = '0'; value <= '9'; value++)
3949 ANYOF_BITMAP_SET(ret, value);
3956 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3958 /* consecutive digits assumed */
3959 for (value = 0; value < '0'; value++)
3960 ANYOF_BITMAP_SET(ret, value);
3961 for (value = '9' + 1; value < 256; value++)
3962 ANYOF_BITMAP_SET(ret, value);
3969 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3971 for (value = 0; value < 256; value++)
3973 ANYOF_BITMAP_SET(ret, value);
3980 ANYOF_CLASS_SET(ret, ANYOF_NGRAPH);
3982 for (value = 0; value < 256; value++)
3983 if (!isGRAPH(value))
3984 ANYOF_BITMAP_SET(ret, value);
3991 ANYOF_CLASS_SET(ret, ANYOF_LOWER);
3993 for (value = 0; value < 256; value++)
3995 ANYOF_BITMAP_SET(ret, value);
4002 ANYOF_CLASS_SET(ret, ANYOF_NLOWER);
4004 for (value = 0; value < 256; value++)
4005 if (!isLOWER(value))
4006 ANYOF_BITMAP_SET(ret, value);
4013 ANYOF_CLASS_SET(ret, ANYOF_PRINT);
4015 for (value = 0; value < 256; value++)
4017 ANYOF_BITMAP_SET(ret, value);
4024 ANYOF_CLASS_SET(ret, ANYOF_NPRINT);
4026 for (value = 0; value < 256; value++)
4027 if (!isPRINT(value))
4028 ANYOF_BITMAP_SET(ret, value);
4035 ANYOF_CLASS_SET(ret, ANYOF_PSXSPC);
4037 for (value = 0; value < 256; value++)
4038 if (isPSXSPC(value))
4039 ANYOF_BITMAP_SET(ret, value);
4046 ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC);
4048 for (value = 0; value < 256; value++)
4049 if (!isPSXSPC(value))
4050 ANYOF_BITMAP_SET(ret, value);
4057 ANYOF_CLASS_SET(ret, ANYOF_PUNCT);
4059 for (value = 0; value < 256; value++)
4061 ANYOF_BITMAP_SET(ret, value);
4068 ANYOF_CLASS_SET(ret, ANYOF_NPUNCT);
4070 for (value = 0; value < 256; value++)
4071 if (!isPUNCT(value))
4072 ANYOF_BITMAP_SET(ret, value);
4079 ANYOF_CLASS_SET(ret, ANYOF_SPACE);
4081 for (value = 0; value < 256; value++)
4083 ANYOF_BITMAP_SET(ret, value);
4090 ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
4092 for (value = 0; value < 256; value++)
4093 if (!isSPACE(value))
4094 ANYOF_BITMAP_SET(ret, value);
4101 ANYOF_CLASS_SET(ret, ANYOF_UPPER);
4103 for (value = 0; value < 256; value++)
4105 ANYOF_BITMAP_SET(ret, value);
4112 ANYOF_CLASS_SET(ret, ANYOF_NUPPER);
4114 for (value = 0; value < 256; value++)
4115 if (!isUPPER(value))
4116 ANYOF_BITMAP_SET(ret, value);
4123 ANYOF_CLASS_SET(ret, ANYOF_XDIGIT);
4125 for (value = 0; value < 256; value++)
4126 if (isXDIGIT(value))
4127 ANYOF_BITMAP_SET(ret, value);
4134 ANYOF_CLASS_SET(ret, ANYOF_NXDIGIT);
4136 for (value = 0; value < 256; value++)
4137 if (!isXDIGIT(value))
4138 ANYOF_BITMAP_SET(ret, value);
4144 /* this is to handle \p and \P */
4147 vFAIL("Invalid [::] class");
4151 /* Strings such as "+utf8::isWord\n" */
4152 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
4155 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
4158 } /* end of namedclass \blah */
4161 if (prevvalue > (IV)value) /* b-a */ {
4162 Simple_vFAIL4("Invalid [] range \"%*.*s\"",
4163 RExC_parse - rangebegin,
4164 RExC_parse - rangebegin,
4166 range = 0; /* not a valid range */
4170 prevvalue = value; /* save the beginning of the range */
4171 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
4172 RExC_parse[1] != ']') {
4175 /* a bad range like \w-, [:word:]- ? */
4176 if (namedclass > OOB_NAMEDCLASS) {
4177 if (ckWARN(WARN_REGEXP))
4179 "False [] range \"%*.*s\"",
4180 RExC_parse - rangebegin,
4181 RExC_parse - rangebegin,
4184 ANYOF_BITMAP_SET(ret, '-');
4186 range = 1; /* yeah, it's a range! */
4187 continue; /* but do it the next time */
4191 /* now is the next time */
4195 if (prevvalue < 256) {
4196 const IV ceilvalue = value < 256 ? value : 255;
4199 /* In EBCDIC [\x89-\x91] should include
4200 * the \x8e but [i-j] should not. */
4201 if (literal_endpoint == 2 &&
4202 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
4203 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
4205 if (isLOWER(prevvalue)) {
4206 for (i = prevvalue; i <= ceilvalue; i++)
4208 ANYOF_BITMAP_SET(ret, i);
4210 for (i = prevvalue; i <= ceilvalue; i++)
4212 ANYOF_BITMAP_SET(ret, i);
4217 for (i = prevvalue; i <= ceilvalue; i++)
4218 ANYOF_BITMAP_SET(ret, i);
4220 if (value > 255 || UTF) {
4221 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
4222 const UV natvalue = NATIVE_TO_UNI(value);
4224 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
4225 if (prevnatvalue < natvalue) { /* what about > ? */
4226 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
4227 prevnatvalue, natvalue);
4229 else if (prevnatvalue == natvalue) {
4230 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
4232 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
4234 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
4236 /* If folding and foldable and a single
4237 * character, insert also the folded version
4238 * to the charclass. */
4240 if (foldlen == (STRLEN)UNISKIP(f))
4241 Perl_sv_catpvf(aTHX_ listsv,
4244 /* Any multicharacter foldings
4245 * require the following transform:
4246 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
4247 * where E folds into "pq" and F folds
4248 * into "rst", all other characters
4249 * fold to single characters. We save
4250 * away these multicharacter foldings,
4251 * to be later saved as part of the
4252 * additional "s" data. */
4255 if (!unicode_alternate)
4256 unicode_alternate = newAV();
4257 sv = newSVpvn((char*)foldbuf, foldlen);
4259 av_push(unicode_alternate, sv);
4263 /* If folding and the value is one of the Greek
4264 * sigmas insert a few more sigmas to make the
4265 * folding rules of the sigmas to work right.
4266 * Note that not all the possible combinations
4267 * are handled here: some of them are handled
4268 * by the standard folding rules, and some of
4269 * them (literal or EXACTF cases) are handled
4270 * during runtime in regexec.c:S_find_byclass(). */
4271 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
4272 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4273 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
4274 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4275 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4277 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
4278 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
4279 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
4284 literal_endpoint = 0;
4288 range = 0; /* this range (if it was one) is done now */
4292 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
4294 RExC_size += ANYOF_CLASS_ADD_SKIP;
4296 RExC_emit += ANYOF_CLASS_ADD_SKIP;
4299 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
4301 /* If the only flag is folding (plus possibly inversion). */
4302 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
4304 for (value = 0; value < 256; ++value) {
4305 if (ANYOF_BITMAP_TEST(ret, value)) {
4306 UV fold = PL_fold[value];
4309 ANYOF_BITMAP_SET(ret, fold);
4312 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
4315 /* optimize inverted simple patterns (e.g. [^a-z]) */
4316 if (!SIZE_ONLY && optimize_invert &&
4317 /* If the only flag is inversion. */
4318 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
4319 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
4320 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
4321 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
4328 /* The 0th element stores the character class description
4329 * in its textual form: used later (regexec.c:Perl_regclass_swash())
4330 * to initialize the appropriate swash (which gets stored in
4331 * the 1st element), and also useful for dumping the regnode.
4332 * The 2nd element stores the multicharacter foldings,
4333 * used later (regexec.c:S_reginclass()). */
4334 av_store(av, 0, listsv);
4335 av_store(av, 1, NULL);
4336 av_store(av, 2, (SV*)unicode_alternate);
4337 rv = newRV_noinc((SV*)av);
4338 n = add_data(pRExC_state, 1, "s");
4339 RExC_rx->data->data[n] = (void*)rv;
4347 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
4349 char* retval = RExC_parse++;
4352 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
4353 RExC_parse[2] == '#') {
4354 while (*RExC_parse != ')') {
4355 if (RExC_parse == RExC_end)
4356 FAIL("Sequence (?#... not terminated");
4362 if (RExC_flags & PMf_EXTENDED) {
4363 if (isSPACE(*RExC_parse)) {
4367 else if (*RExC_parse == '#') {
4368 while (RExC_parse < RExC_end)
4369 if (*RExC_parse++ == '\n') break;
4378 - reg_node - emit a node
4380 STATIC regnode * /* Location. */
4381 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
4383 register regnode *ptr;
4384 regnode * const ret = RExC_emit;
4387 SIZE_ALIGN(RExC_size);
4392 NODE_ALIGN_FILL(ret);
4394 FILL_ADVANCE_NODE(ptr, op);
4395 if (RExC_offsets) { /* MJD */
4396 MJD_OFFSET_DEBUG(("%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
4397 "reg_node", __LINE__,
4399 RExC_emit - RExC_emit_start > RExC_offsets[0]
4400 ? "Overwriting end of array!\n" : "OK",
4401 RExC_emit - RExC_emit_start,
4402 RExC_parse - RExC_start,
4404 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
4413 - reganode - emit a node with an argument
4415 STATIC regnode * /* Location. */
4416 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
4418 register regnode *ptr;
4419 regnode * const ret = RExC_emit;
4422 SIZE_ALIGN(RExC_size);
4427 NODE_ALIGN_FILL(ret);
4429 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
4430 if (RExC_offsets) { /* MJD */
4431 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4435 RExC_emit - RExC_emit_start > RExC_offsets[0] ?
4436 "Overwriting end of array!\n" : "OK",
4437 RExC_emit - RExC_emit_start,
4438 RExC_parse - RExC_start,
4440 Set_Cur_Node_Offset;
4449 - reguni - emit (if appropriate) a Unicode character
4452 S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
4454 *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
4458 - reginsert - insert an operator in front of already-emitted operand
4460 * Means relocating the operand.
4463 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
4465 register regnode *src;
4466 register regnode *dst;
4467 register regnode *place;
4468 const int offset = regarglen[(U8)op];
4470 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
4473 RExC_size += NODE_STEP_REGNODE + offset;
4478 RExC_emit += NODE_STEP_REGNODE + offset;
4480 while (src > opnd) {
4481 StructCopy(--src, --dst, regnode);
4482 if (RExC_offsets) { /* MJD 20010112 */
4483 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %u -> %u (max %u).\n",
4487 dst - RExC_emit_start > RExC_offsets[0]
4488 ? "Overwriting end of array!\n" : "OK",
4489 src - RExC_emit_start,
4490 dst - RExC_emit_start,
4492 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
4493 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
4498 place = opnd; /* Op node, where operand used to be. */
4499 if (RExC_offsets) { /* MJD */
4500 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %u <- %u (max %u).\n",
4504 place - RExC_emit_start > RExC_offsets[0]
4505 ? "Overwriting end of array!\n" : "OK",
4506 place - RExC_emit_start,
4507 RExC_parse - RExC_start,
4509 Set_Node_Offset(place, RExC_parse);
4510 Set_Node_Length(place, 1);
4512 src = NEXTOPER(place);
4513 FILL_ADVANCE_NODE(place, op);
4514 Zero(src, offset, regnode);
4518 - regtail - set the next-pointer at the end of a node chain of p to val.
4521 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4523 register regnode *scan;
4528 /* Find last node. */
4531 regnode * const temp = regnext(scan);
4537 if (reg_off_by_arg[OP(scan)]) {
4538 ARG_SET(scan, val - scan);
4541 NEXT_OFF(scan) = val - scan;
4546 - regoptail - regtail on operand of first argument; nop if operandless
4549 S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
4551 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
4552 if (p == NULL || SIZE_ONLY)
4554 if (PL_regkind[(U8)OP(p)] == BRANCH) {
4555 regtail(pRExC_state, NEXTOPER(p), val);
4557 else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
4558 regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
4565 - regcurly - a little FSA that accepts {\d+,?\d*}
4568 S_regcurly(pTHX_ register const char *s)
4589 S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
4591 register U8 op = EXACT; /* Arbitrary non-END op. */
4592 register regnode *next;
4594 while (op != END && (!last || node < last)) {
4595 /* While that wasn't END last time... */
4601 next = regnext(node);
4603 if (OP(node) == OPTIMIZED)
4606 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
4607 (int)(2*l + 1), "", SvPVX_const(sv));
4608 if (next == NULL) /* Next ptr. */
4609 PerlIO_printf(Perl_debug_log, "(0)");
4611 PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
4612 (void)PerlIO_putc(Perl_debug_log, '\n');
4614 if (PL_regkind[(U8)op] == BRANCHJ) {
4615 register regnode *nnode = (OP(next) == LONGJMP
4618 if (last && nnode > last)
4620 node = dumpuntil(start, NEXTOPER(NEXTOPER(node)), nnode, sv, l + 1);
4622 else if (PL_regkind[(U8)op] == BRANCH) {
4623 node = dumpuntil(start, NEXTOPER(node), next, sv, l + 1);
4625 else if ( op == CURLY) { /* "next" might be very big: optimizer */
4626 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4627 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1, sv, l + 1);
4629 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
4630 node = dumpuntil(start, NEXTOPER(node) + EXTRA_STEP_2ARGS,
4633 else if ( op == PLUS || op == STAR) {
4634 node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
4636 else if (op == ANYOF) {
4637 /* arglen 1 + class block */
4638 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
4639 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
4640 node = NEXTOPER(node);
4642 else if (PL_regkind[(U8)op] == EXACT) {
4643 /* Literal string, where present. */
4644 node += NODE_SZ_STR(node) - 1;
4645 node = NEXTOPER(node);
4648 node = NEXTOPER(node);
4649 node += regarglen[(U8)op];
4651 if (op == CURLYX || op == OPEN)
4653 else if (op == WHILEM)
4659 #endif /* DEBUGGING */
4662 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
4665 Perl_regdump(pTHX_ regexp *r)
4668 SV *sv = sv_newmortal();
4670 (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
4672 /* Header fields of interest. */
4673 if (r->anchored_substr)
4674 PerlIO_printf(Perl_debug_log,
4675 "anchored \"%s%.*s%s\"%s at %"IVdf" ",
4677 (int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
4678 SvPVX_const(r->anchored_substr),
4680 SvTAIL(r->anchored_substr) ? "$" : "",
4681 (IV)r->anchored_offset);
4682 else if (r->anchored_utf8)
4683 PerlIO_printf(Perl_debug_log,
4684 "anchored utf8 \"%s%.*s%s\"%s at %"IVdf" ",
4686 (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
4687 SvPVX_const(r->anchored_utf8),
4689 SvTAIL(r->anchored_utf8) ? "$" : "",
4690 (IV)r->anchored_offset);
4691 if (r->float_substr)
4692 PerlIO_printf(Perl_debug_log,
4693 "floating \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4695 (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
4696 SvPVX_const(r->float_substr),
4698 SvTAIL(r->float_substr) ? "$" : "",
4699 (IV)r->float_min_offset, (UV)r->float_max_offset);
4700 else if (r->float_utf8)
4701 PerlIO_printf(Perl_debug_log,
4702 "floating utf8 \"%s%.*s%s\"%s at %"IVdf"..%"UVuf" ",
4704 (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
4705 SvPVX_const(r->float_utf8),
4707 SvTAIL(r->float_utf8) ? "$" : "",
4708 (IV)r->float_min_offset, (UV)r->float_max_offset);
4709 if (r->check_substr || r->check_utf8)
4710 PerlIO_printf(Perl_debug_log,
4711 r->check_substr == r->float_substr
4712 && r->check_utf8 == r->float_utf8
4713 ? "(checking floating" : "(checking anchored");
4714 if (r->reganch & ROPT_NOSCAN)
4715 PerlIO_printf(Perl_debug_log, " noscan");
4716 if (r->reganch & ROPT_CHECK_ALL)
4717 PerlIO_printf(Perl_debug_log, " isall");
4718 if (r->check_substr || r->check_utf8)
4719 PerlIO_printf(Perl_debug_log, ") ");
4721 if (r->regstclass) {
4722 regprop(sv, r->regstclass);
4723 PerlIO_printf(Perl_debug_log, "stclass \"%s\" ", SvPVX_const(sv));
4725 if (r->reganch & ROPT_ANCH) {
4726 PerlIO_printf(Perl_debug_log, "anchored");
4727 if (r->reganch & ROPT_ANCH_BOL)
4728 PerlIO_printf(Perl_debug_log, "(BOL)");
4729 if (r->reganch & ROPT_ANCH_MBOL)
4730 PerlIO_printf(Perl_debug_log, "(MBOL)");
4731 if (r->reganch & ROPT_ANCH_SBOL)
4732 PerlIO_printf(Perl_debug_log, "(SBOL)");
4733 if (r->reganch & ROPT_ANCH_GPOS)
4734 PerlIO_printf(Perl_debug_log, "(GPOS)");
4735 PerlIO_putc(Perl_debug_log, ' ');
4737 if (r->reganch & ROPT_GPOS_SEEN)
4738 PerlIO_printf(Perl_debug_log, "GPOS ");
4739 if (r->reganch & ROPT_SKIP)
4740 PerlIO_printf(Perl_debug_log, "plus ");
4741 if (r->reganch & ROPT_IMPLICIT)
4742 PerlIO_printf(Perl_debug_log, "implicit ");
4743 PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen);
4744 if (r->reganch & ROPT_EVAL_SEEN)
4745 PerlIO_printf(Perl_debug_log, "with eval ");
4746 PerlIO_printf(Perl_debug_log, "\n");
4749 const U32 len = r->offsets[0];
4750 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
4751 for (i = 1; i <= len; i++)
4752 PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
4753 (UV)r->offsets[i*2-1],
4754 (UV)r->offsets[i*2]);
4755 PerlIO_printf(Perl_debug_log, "\n");
4757 #endif /* DEBUGGING */
4763 S_put_byte(pTHX_ SV *sv, int c)
4765 if (isCNTRL(c) || c == 255 || !isPRINT(c))
4766 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
4767 else if (c == '-' || c == ']' || c == '\\' || c == '^')
4768 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
4770 Perl_sv_catpvf(aTHX_ sv, "%c", c);
4773 #endif /* DEBUGGING */
4776 - regprop - printable representation of opcode
4779 Perl_regprop(pTHX_ SV *sv, regnode *o)
4784 sv_setpvn(sv, "", 0);
4785 if (OP(o) >= reg_num) /* regnode.type is unsigned */
4786 /* It would be nice to FAIL() here, but this may be called from
4787 regexec.c, and it would be hard to supply pRExC_state. */
4788 Perl_croak(aTHX_ "Corrupted regexp opcode");
4789 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
4791 k = PL_regkind[(U8)OP(o)];
4794 SV *dsv = sv_2mortal(newSVpvn("", 0));
4795 /* Using is_utf8_string() is a crude hack but it may
4796 * be the best for now since we have no flag "this EXACTish
4797 * node was UTF-8" --jhi */
4798 const bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
4799 const char *s = do_utf8 ?
4800 pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
4801 UNI_DISPLAY_REGEX) :
4803 const int len = do_utf8 ?
4806 Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
4811 else if (k == CURLY) {
4812 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
4813 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
4814 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
4816 else if (k == WHILEM && o->flags) /* Ordinal/of */
4817 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
4818 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP )
4819 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
4820 else if (k == LOGICAL)
4821 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
4822 else if (k == ANYOF) {
4823 int i, rangestart = -1;
4824 U8 flags = ANYOF_FLAGS(o);
4825 const char * const anyofs[] = { /* Should be synchronized with
4826 * ANYOF_ #xdefines in regcomp.h */
4859 if (flags & ANYOF_LOCALE)
4860 sv_catpv(sv, "{loc}");
4861 if (flags & ANYOF_FOLD)
4862 sv_catpv(sv, "{i}");
4863 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
4864 if (flags & ANYOF_INVERT)
4866 for (i = 0; i <= 256; i++) {
4867 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
4868 if (rangestart == -1)
4870 } else if (rangestart != -1) {
4871 if (i <= rangestart + 3)
4872 for (; rangestart < i; rangestart++)
4873 put_byte(sv, rangestart);
4875 put_byte(sv, rangestart);
4877 put_byte(sv, i - 1);
4883 if (o->flags & ANYOF_CLASS)
4884 for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
4885 if (ANYOF_CLASS_TEST(o,i))
4886 sv_catpv(sv, anyofs[i]);
4888 if (flags & ANYOF_UNICODE)
4889 sv_catpv(sv, "{unicode}");
4890 else if (flags & ANYOF_UNICODE_ALL)
4891 sv_catpv(sv, "{unicode_all}");
4895 SV *sw = regclass_swash(o, FALSE, &lv, 0);
4899 U8 s[UTF8_MAXBYTES_CASE+1];
4901 for (i = 0; i <= 256; i++) { /* just the first 256 */
4902 uvchr_to_utf8(s, i);
4904 if (i < 256 && swash_fetch(sw, s, TRUE)) {
4905 if (rangestart == -1)
4907 } else if (rangestart != -1) {
4910 if (i <= rangestart + 3)
4911 for (; rangestart < i; rangestart++) {
4913 for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4918 for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
4921 for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
4928 sv_catpv(sv, "..."); /* et cetera */
4932 char *s = savesvpv(lv);
4935 while(*s && *s != '\n') s++;
4938 const char *t = ++s;
4956 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
4958 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
4959 Perl_sv_catpvf(aTHX_ sv, "[-%d]", o->flags);
4960 #endif /* DEBUGGING */
4964 Perl_re_intuit_string(pTHX_ regexp *prog)
4965 { /* Assume that RE_INTUIT is set */
4968 const char *s = SvPV_nolen_const(prog->check_substr
4969 ? prog->check_substr : prog->check_utf8);
4971 if (!PL_colorset) reginitcolors();
4972 PerlIO_printf(Perl_debug_log,
4973 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
4975 prog->check_substr ? "" : "utf8 ",
4976 PL_colors[5],PL_colors[0],
4979 (strlen(s) > 60 ? "..." : ""));
4982 return prog->check_substr ? prog->check_substr : prog->check_utf8;
4986 Perl_pregfree(pTHX_ struct regexp *r)
4989 SV *dsv = PERL_DEBUG_PAD_ZERO(0);
4992 if (!r || (--r->refcnt > 0))
4995 const char *s = (r->reganch & ROPT_UTF8)
4996 ? pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, UNI_DISPLAY_REGEX)
4997 : pv_display(dsv, r->precomp, r->prelen, 0, 60);
4998 const int len = SvCUR(dsv);
5001 PerlIO_printf(Perl_debug_log,
5002 "%sFreeing REx:%s `%s%*.*s%s%s'\n",
5003 PL_colors[4],PL_colors[5],PL_colors[0],
5006 len > 60 ? "..." : "");
5010 Safefree(r->precomp);
5011 if (r->offsets) /* 20010421 MJD */
5012 Safefree(r->offsets);
5013 if (RX_MATCH_COPIED(r))
5014 Safefree(r->subbeg);
5016 if (r->anchored_substr)
5017 SvREFCNT_dec(r->anchored_substr);
5018 if (r->anchored_utf8)
5019 SvREFCNT_dec(r->anchored_utf8);
5020 if (r->float_substr)
5021 SvREFCNT_dec(r->float_substr);
5023 SvREFCNT_dec(r->float_utf8);
5024 Safefree(r->substrs);
5027 int n = r->data->count;
5028 PAD* new_comppad = NULL;
5033 /* If you add a ->what type here, update the comment in regcomp.h */
5034 switch (r->data->what[n]) {
5036 SvREFCNT_dec((SV*)r->data->data[n]);
5039 Safefree(r->data->data[n]);
5042 new_comppad = (AV*)r->data->data[n];
5045 if (new_comppad == NULL)
5046 Perl_croak(aTHX_ "panic: pregfree comppad");
5047 PAD_SAVE_LOCAL(old_comppad,
5048 /* Watch out for global destruction's random ordering. */
5049 (SvTYPE(new_comppad) == SVt_PVAV) ?
5050 new_comppad : Null(PAD *)
5053 refcnt = OpREFCNT_dec((OP_4tree*)r->data->data[n]);
5056 op_free((OP_4tree*)r->data->data[n]);
5058 PAD_RESTORE_LOCAL(old_comppad);
5059 SvREFCNT_dec((SV*)new_comppad);
5065 Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
5068 Safefree(r->data->what);
5071 Safefree(r->startp);
5077 - regnext - dig the "next" pointer out of a node
5079 * [Note, when REGALIGN is defined there are two places in regmatch()
5080 * that bypass this code for speed.]
5083 Perl_regnext(pTHX_ register regnode *p)
5085 register I32 offset;
5087 if (p == &PL_regdummy)
5090 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
5098 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
5101 STRLEN l1 = strlen(pat1);
5102 STRLEN l2 = strlen(pat2);
5105 const char *message;
5111 Copy(pat1, buf, l1 , char);
5112 Copy(pat2, buf + l1, l2 , char);
5113 buf[l1 + l2] = '\n';
5114 buf[l1 + l2 + 1] = '\0';
5116 /* ANSI variant takes additional second argument */
5117 va_start(args, pat2);
5121 msv = vmess(buf, &args);
5123 message = SvPV(msv,l1);
5126 Copy(message, buf, l1 , char);
5127 buf[l1-1] = '\0'; /* Overwrite \n */
5128 Perl_croak(aTHX_ "%s", buf);
5131 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
5134 Perl_save_re_context(pTHX)
5136 SAVEI32(PL_reg_flags); /* from regexec.c */
5138 SAVEPPTR(PL_reginput); /* String-input pointer. */
5139 SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
5140 SAVEPPTR(PL_regeol); /* End of input, for $ check. */
5141 SAVEVPTR(PL_regstartp); /* Pointer to startp array. */
5142 SAVEVPTR(PL_regendp); /* Ditto for endp. */
5143 SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
5144 SAVEVPTR(PL_reglastcloseparen); /* Similarly for lastcloseparen. */
5145 SAVEPPTR(PL_regtill); /* How far we are required to go. */
5146 SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
5147 PL_reg_start_tmp = 0;
5148 SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
5149 PL_reg_start_tmpl = 0;
5150 SAVEVPTR(PL_regdata);
5151 SAVEI32(PL_reg_eval_set); /* from regexec.c */
5152 SAVEI32(PL_regnarrate); /* from regexec.c */
5153 SAVEVPTR(PL_regprogram); /* from regexec.c */
5154 SAVEINT(PL_regindent); /* from regexec.c */
5155 SAVEVPTR(PL_regcc); /* from regexec.c */
5156 SAVEVPTR(PL_curcop);
5157 SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
5158 SAVEVPTR(PL_reg_re); /* from regexec.c */
5159 SAVEPPTR(PL_reg_ganch); /* from regexec.c */
5160 SAVESPTR(PL_reg_sv); /* from regexec.c */
5161 SAVEBOOL(PL_reg_match_utf8); /* from regexec.c */
5162 SAVEVPTR(PL_reg_magic); /* from regexec.c */
5163 SAVEI32(PL_reg_oldpos); /* from regexec.c */
5164 SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
5165 SAVEVPTR(PL_reg_curpm); /* from regexec.c */
5166 SAVEPPTR(PL_reg_oldsaved); /* old saved substr during match */
5167 PL_reg_oldsaved = Nullch;
5168 SAVEI32(PL_reg_oldsavedlen); /* old length of saved substr during match */
5169 PL_reg_oldsavedlen = 0;
5170 SAVEI32(PL_reg_maxiter); /* max wait until caching pos */
5172 SAVEI32(PL_reg_leftiter); /* wait until caching pos */
5173 PL_reg_leftiter = 0;
5174 SAVEGENERICPV(PL_reg_poscache); /* cache of pos of WHILEM */
5175 PL_reg_poscache = Nullch;
5176 SAVEI32(PL_reg_poscache_size); /* size of pos cache of WHILEM */
5177 PL_reg_poscache_size = 0;
5178 SAVEPPTR(PL_regprecomp); /* uncompiled string. */
5179 SAVEI32(PL_regnpar); /* () count. */
5180 SAVEI32(PL_regsize); /* from regexec.c */
5183 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
5186 if (PL_curpm && (rx = PM_GETRE(PL_curpm))) {
5188 for (i = 1; i <= rx->nparens; i++) {
5190 char digits[TYPE_CHARS(long)];
5191 sprintf(digits, "%lu", (long)i);
5192 if ((mgv = gv_fetchpv(digits, FALSE, SVt_PV)))
5199 SAVEPPTR(PL_reg_starttry); /* from regexec.c */
5204 clear_re(pTHX_ void *r)
5206 ReREFCNT_dec((regexp *)r);
5211 * c-indentation-style: bsd
5213 * indent-tabs-mode: t
5216 * ex: set ts=8 sts=4 sw=4 noet: