5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
13 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
14 * confused with the original package (see point 3 below). Thanks, Henry!
17 /* Additional note: this code is very heavily munged from Henry's version
18 * in places. In some spots I've traded clarity for efficiency, so don't
19 * blame Henry for some of the lack of readability.
22 /* The names of the functions have been changed from regcomp and
23 * regexec to pregcomp and pregexec in order to avoid conflicts
24 * with the POSIX routines of the same names.
27 #ifdef PERL_EXT_RE_BUILD
28 /* need to replace pregcomp et al, so enable that */
29 # ifndef PERL_IN_XSUB_RE
30 # define PERL_IN_XSUB_RE
32 /* need access to debugger hooks */
33 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
38 #ifdef PERL_IN_XSUB_RE
39 /* We *really* need to overwrite these symbols: */
40 # define Perl_pregcomp my_regcomp
41 # define Perl_regdump my_regdump
42 # define Perl_regprop my_regprop
43 # define Perl_pregfree my_regfree
44 # define Perl_re_intuit_string my_re_intuit_string
45 /* *These* symbols are masked to allow static link. */
46 # define Perl_regnext my_regnext
47 # define Perl_save_re_context my_save_re_context
48 # define Perl_reginitcolors my_reginitcolors
50 # define PERL_NO_GET_CONTEXT
55 * pregcomp and pregexec -- regsub and regerror are not used in perl
57 * Copyright (c) 1986 by University of Toronto.
58 * Written by Henry Spencer. Not derived from licensed software.
60 * Permission is granted to anyone to use this software for any
61 * purpose on any computer system, and to redistribute it freely,
62 * subject to the following restrictions:
64 * 1. The author is not responsible for the consequences of use of
65 * this software, no matter how awful, even if they arise
68 * 2. The origin of this software must not be misrepresented, either
69 * by explicit claim or by omission.
71 * 3. Altered versions must be plainly marked as such, and must not
72 * be misrepresented as being the original software.
75 **** Alterations to Henry's code are...
77 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
78 **** 2000, 2001, 2002, 2003, by Larry Wall and others
80 **** You may distribute under the terms of either the GNU General Public
81 **** License or the Artistic License, as specified in the README file.
84 * Beware that some of this code is subtly aware of the way operator
85 * precedence is structured in regular expressions. Serious changes in
86 * regular-expression syntax might require a total rethink.
89 #define PERL_IN_REGCOMP_C
92 #ifndef PERL_IN_XSUB_RE
104 # if defined(BUGGY_MSC6)
105 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
106 # pragma optimize("a",off)
107 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
108 # pragma optimize("w",on )
109 # endif /* BUGGY_MSC6 */
113 #define STATIC static
116 typedef struct RExC_state_t {
117 U32 flags; /* are we folding, multilining? */
118 char *precomp; /* uncompiled string. */
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
126 I32 naughty; /* How bad is this pattern? */
127 I32 sawback; /* Did we see \1, ...? */
129 I32 size; /* Code size. */
130 I32 npar; /* () count. */
136 char *starttry; /* -Dr: where regtry was called. */
137 #define RExC_starttry (pRExC_state->starttry)
141 #define RExC_flags (pRExC_state->flags)
142 #define RExC_precomp (pRExC_state->precomp)
143 #define RExC_rx (pRExC_state->rx)
144 #define RExC_start (pRExC_state->start)
145 #define RExC_end (pRExC_state->end)
146 #define RExC_parse (pRExC_state->parse)
147 #define RExC_whilem_seen (pRExC_state->whilem_seen)
148 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
149 #define RExC_emit (pRExC_state->emit)
150 #define RExC_emit_start (pRExC_state->emit_start)
151 #define RExC_naughty (pRExC_state->naughty)
152 #define RExC_sawback (pRExC_state->sawback)
153 #define RExC_seen (pRExC_state->seen)
154 #define RExC_size (pRExC_state->size)
155 #define RExC_npar (pRExC_state->npar)
156 #define RExC_extralen (pRExC_state->extralen)
157 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
158 #define RExC_seen_evals (pRExC_state->seen_evals)
159 #define RExC_utf8 (pRExC_state->utf8)
161 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
162 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
163 ((*s) == '{' && regcurly(s)))
166 #undef SPSTART /* dratted cpp namespace... */
169 * Flags to be passed up and down.
171 #define WORST 0 /* Worst case. */
172 #define HASWIDTH 0x1 /* Known to match non-null strings. */
173 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
174 #define SPSTART 0x4 /* Starts with * or +. */
175 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
177 /* Length of a variant. */
179 typedef struct scan_data_t {
185 I32 last_end; /* min value, <0 unless valid. */
188 SV **longest; /* Either &l_fixed, or &l_float. */
192 I32 offset_float_min;
193 I32 offset_float_max;
197 struct regnode_charclass_class *start_class;
201 * Forward declarations for pregcomp()'s friends.
204 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
207 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
208 #define SF_BEFORE_SEOL 0x1
209 #define SF_BEFORE_MEOL 0x2
210 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
211 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
214 # define SF_FIX_SHIFT_EOL (0+2)
215 # define SF_FL_SHIFT_EOL (0+4)
217 # define SF_FIX_SHIFT_EOL (+2)
218 # define SF_FL_SHIFT_EOL (+4)
221 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
222 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
224 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
225 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
226 #define SF_IS_INF 0x40
227 #define SF_HAS_PAR 0x80
228 #define SF_IN_PAR 0x100
229 #define SF_HAS_EVAL 0x200
230 #define SCF_DO_SUBSTR 0x400
231 #define SCF_DO_STCLASS_AND 0x0800
232 #define SCF_DO_STCLASS_OR 0x1000
233 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
234 #define SCF_WHILEM_VISITED_POS 0x2000
236 #define UTF (RExC_utf8 != 0)
237 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
238 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
240 #define OOB_UNICODE 12345678
241 #define OOB_NAMEDCLASS -1
243 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
244 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
247 /* length of regex to show in messages that don't mark a position within */
248 #define RegexLengthToShowInErrorMessages 127
251 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
252 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
253 * op/pragma/warn/regcomp.
255 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
256 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
258 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
261 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
262 * arg. Show regex, up to a maximum length. If it's too long, chop and add
265 #define FAIL(msg) STMT_START { \
266 char *ellipses = ""; \
267 IV len = RExC_end - RExC_precomp; \
270 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
271 if (len > RegexLengthToShowInErrorMessages) { \
272 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
273 len = RegexLengthToShowInErrorMessages - 10; \
276 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
277 msg, (int)len, RExC_precomp, ellipses); \
281 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
282 * args. Show regex, up to a maximum length. If it's too long, chop and add
285 #define FAIL2(pat,msg) STMT_START { \
286 char *ellipses = ""; \
287 IV len = RExC_end - RExC_precomp; \
290 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
291 if (len > RegexLengthToShowInErrorMessages) { \
292 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
293 len = RegexLengthToShowInErrorMessages - 10; \
296 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
297 msg, (int)len, RExC_precomp, ellipses); \
302 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
304 #define Simple_vFAIL(m) STMT_START { \
305 IV offset = RExC_parse - RExC_precomp; \
306 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
307 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
311 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
313 #define vFAIL(m) STMT_START { \
315 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
320 * Like Simple_vFAIL(), but accepts two arguments.
322 #define Simple_vFAIL2(m,a1) STMT_START { \
323 IV offset = RExC_parse - RExC_precomp; \
324 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
325 (int)offset, RExC_precomp, RExC_precomp + offset); \
329 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
331 #define vFAIL2(m,a1) STMT_START { \
333 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
334 Simple_vFAIL2(m, a1); \
339 * Like Simple_vFAIL(), but accepts three arguments.
341 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
342 IV offset = RExC_parse - RExC_precomp; \
343 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
344 (int)offset, RExC_precomp, RExC_precomp + offset); \
348 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
350 #define vFAIL3(m,a1,a2) STMT_START { \
352 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
353 Simple_vFAIL3(m, a1, a2); \
357 * Like Simple_vFAIL(), but accepts four arguments.
359 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
360 IV offset = RExC_parse - RExC_precomp; \
361 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
362 (int)offset, RExC_precomp, RExC_precomp + offset); \
366 * Like Simple_vFAIL(), but accepts five arguments.
368 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
369 IV offset = RExC_parse - RExC_precomp; \
370 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
371 (int)offset, RExC_precomp, RExC_precomp + offset); \
375 #define vWARN(loc,m) STMT_START { \
376 IV offset = loc - RExC_precomp; \
377 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
378 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
381 #define vWARNdep(loc,m) STMT_START { \
382 IV offset = loc - RExC_precomp; \
383 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
384 "%s" REPORT_LOCATION, \
385 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
389 #define vWARN2(loc, m, a1) STMT_START { \
390 IV offset = loc - RExC_precomp; \
391 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
392 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
395 #define vWARN3(loc, m, a1, a2) STMT_START { \
396 IV offset = loc - RExC_precomp; \
397 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
398 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
401 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
402 IV offset = loc - RExC_precomp; \
403 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
404 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
407 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
408 IV offset = loc - RExC_precomp; \
409 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
410 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
414 /* Allow for side effects in s */
415 #define REGC(c,s) STMT_START { \
416 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
419 /* Macros for recording node offsets. 20001227 mjd@plover.com
420 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
421 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
422 * Element 0 holds the number n.
425 #define MJD_OFFSET_DEBUG(x)
426 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
429 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
431 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
432 __LINE__, (node), (byte))); \
434 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
436 RExC_offsets[2*(node)-1] = (byte); \
441 #define Set_Node_Offset(node,byte) \
442 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
443 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
445 #define Set_Node_Length_To_R(node,len) STMT_START { \
447 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
448 __LINE__, (node), (len))); \
450 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
452 RExC_offsets[2*(node)] = (len); \
457 #define Set_Node_Length(node,len) \
458 Set_Node_Length_To_R((node)-RExC_emit_start, len)
459 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
460 #define Set_Node_Cur_Length(node) \
461 Set_Node_Length(node, RExC_parse - parse_start)
463 /* Get offsets and lengths */
464 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
465 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
467 static void clear_re(pTHX_ void *r);
469 /* Mark that we cannot extend a found fixed substring at this point.
470 Updata the longest found anchored substring and the longest found
471 floating substrings if needed. */
474 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
476 STRLEN l = CHR_SVLEN(data->last_found);
477 STRLEN old_l = CHR_SVLEN(*data->longest);
479 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
480 SvSetMagicSV(*data->longest, data->last_found);
481 if (*data->longest == data->longest_fixed) {
482 data->offset_fixed = l ? data->last_start_min : data->pos_min;
483 if (data->flags & SF_BEFORE_EOL)
485 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
487 data->flags &= ~SF_FIX_BEFORE_EOL;
490 data->offset_float_min = l ? data->last_start_min : data->pos_min;
491 data->offset_float_max = (l
492 ? data->last_start_max
493 : data->pos_min + data->pos_delta);
494 if ((U32)data->offset_float_max > (U32)I32_MAX)
495 data->offset_float_max = I32_MAX;
496 if (data->flags & SF_BEFORE_EOL)
498 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
500 data->flags &= ~SF_FL_BEFORE_EOL;
503 SvCUR_set(data->last_found, 0);
505 SV * sv = data->last_found;
507 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
508 if (mg && mg->mg_len > 0)
512 data->flags &= ~SF_BEFORE_EOL;
515 /* Can match anything (initialization) */
517 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
519 ANYOF_CLASS_ZERO(cl);
520 ANYOF_BITMAP_SETALL(cl);
521 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
523 cl->flags |= ANYOF_LOCALE;
526 /* Can match anything (initialization) */
528 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
532 for (value = 0; value <= ANYOF_MAX; value += 2)
533 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
535 if (!(cl->flags & ANYOF_UNICODE_ALL))
537 if (!ANYOF_BITMAP_TESTALLSET(cl))
542 /* Can match anything (initialization) */
544 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
546 Zero(cl, 1, struct regnode_charclass_class);
548 cl_anything(pRExC_state, cl);
552 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
554 Zero(cl, 1, struct regnode_charclass_class);
556 cl_anything(pRExC_state, cl);
558 cl->flags |= ANYOF_LOCALE;
561 /* 'And' a given class with another one. Can create false positives */
562 /* We assume that cl is not inverted */
564 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
565 struct regnode_charclass_class *and_with)
567 if (!(and_with->flags & ANYOF_CLASS)
568 && !(cl->flags & ANYOF_CLASS)
569 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
570 && !(and_with->flags & ANYOF_FOLD)
571 && !(cl->flags & ANYOF_FOLD)) {
574 if (and_with->flags & ANYOF_INVERT)
575 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
576 cl->bitmap[i] &= ~and_with->bitmap[i];
578 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
579 cl->bitmap[i] &= and_with->bitmap[i];
580 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
581 if (!(and_with->flags & ANYOF_EOS))
582 cl->flags &= ~ANYOF_EOS;
584 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
585 !(and_with->flags & ANYOF_INVERT)) {
586 cl->flags &= ~ANYOF_UNICODE_ALL;
587 cl->flags |= ANYOF_UNICODE;
588 ARG_SET(cl, ARG(and_with));
590 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
591 !(and_with->flags & ANYOF_INVERT))
592 cl->flags &= ~ANYOF_UNICODE_ALL;
593 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
594 !(and_with->flags & ANYOF_INVERT))
595 cl->flags &= ~ANYOF_UNICODE;
598 /* 'OR' a given class with another one. Can create false positives */
599 /* We assume that cl is not inverted */
601 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
603 if (or_with->flags & ANYOF_INVERT) {
605 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
606 * <= (B1 | !B2) | (CL1 | !CL2)
607 * which is wasteful if CL2 is small, but we ignore CL2:
608 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
609 * XXXX Can we handle case-fold? Unclear:
610 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
611 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
613 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
614 && !(or_with->flags & ANYOF_FOLD)
615 && !(cl->flags & ANYOF_FOLD) ) {
618 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
619 cl->bitmap[i] |= ~or_with->bitmap[i];
620 } /* XXXX: logic is complicated otherwise */
622 cl_anything(pRExC_state, cl);
625 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
626 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
627 && (!(or_with->flags & ANYOF_FOLD)
628 || (cl->flags & ANYOF_FOLD)) ) {
631 /* OR char bitmap and class bitmap separately */
632 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
633 cl->bitmap[i] |= or_with->bitmap[i];
634 if (or_with->flags & ANYOF_CLASS) {
635 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
636 cl->classflags[i] |= or_with->classflags[i];
637 cl->flags |= ANYOF_CLASS;
640 else { /* XXXX: logic is complicated, leave it along for a moment. */
641 cl_anything(pRExC_state, cl);
644 if (or_with->flags & ANYOF_EOS)
645 cl->flags |= ANYOF_EOS;
647 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
648 ARG(cl) != ARG(or_with)) {
649 cl->flags |= ANYOF_UNICODE_ALL;
650 cl->flags &= ~ANYOF_UNICODE;
652 if (or_with->flags & ANYOF_UNICODE_ALL) {
653 cl->flags |= ANYOF_UNICODE_ALL;
654 cl->flags &= ~ANYOF_UNICODE;
659 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
660 * These need to be revisited when a newer toolchain becomes available.
662 #if defined(__sparc64__) && defined(__GNUC__)
663 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
664 # undef SPARC64_GCC_WORKAROUND
665 # define SPARC64_GCC_WORKAROUND 1
669 /* REx optimizer. Converts nodes into quickier variants "in place".
670 Finds fixed substrings. */
672 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
673 to the position after last scanned or to NULL. */
676 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
677 /* scanp: Start here (read-write). */
678 /* deltap: Write maxlen-minlen here. */
679 /* last: Stop before this one. */
681 I32 min = 0, pars = 0, code;
682 regnode *scan = *scanp, *next;
684 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
685 int is_inf_internal = 0; /* The studied chunk is infinite */
686 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
687 scan_data_t data_fake;
688 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
690 while (scan && OP(scan) != END && scan < last) {
691 /* Peephole optimizer: */
693 if (PL_regkind[(U8)OP(scan)] == EXACT) {
694 /* Merge several consecutive EXACTish nodes into one. */
695 regnode *n = regnext(scan);
698 regnode *stop = scan;
701 next = scan + NODE_SZ_STR(scan);
702 /* Skip NOTHING, merge EXACT*. */
704 ( PL_regkind[(U8)OP(n)] == NOTHING ||
705 (stringok && (OP(n) == OP(scan))))
707 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
708 if (OP(n) == TAIL || n > next)
710 if (PL_regkind[(U8)OP(n)] == NOTHING) {
711 NEXT_OFF(scan) += NEXT_OFF(n);
712 next = n + NODE_STEP_REGNODE;
720 int oldl = STR_LEN(scan);
721 regnode *nnext = regnext(n);
723 if (oldl + STR_LEN(n) > U8_MAX)
725 NEXT_OFF(scan) += NEXT_OFF(n);
726 STR_LEN(scan) += STR_LEN(n);
727 next = n + NODE_SZ_STR(n);
728 /* Now we can overwrite *n : */
729 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
737 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
739 Two problematic code points in Unicode casefolding of EXACT nodes:
741 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
742 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
748 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
749 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
751 This means that in case-insensitive matching (or "loose matching",
752 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
753 length of the above casefolded versions) can match a target string
754 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
755 This would rather mess up the minimum length computation.
757 What we'll do is to look for the tail four bytes, and then peek
758 at the preceding two bytes to see whether we need to decrease
759 the minimum length by four (six minus two).
761 Thanks to the design of UTF-8, there cannot be false matches:
762 A sequence of valid UTF-8 bytes cannot be a subsequence of
763 another valid sequence of UTF-8 bytes.
766 char *s0 = STRING(scan), *s, *t;
767 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
768 char *t0 = "\xcc\x88\xcc\x81";
772 s < s2 && (t = ninstr(s, s1, t0, t1));
774 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
775 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
782 n = scan + NODE_SZ_STR(scan);
784 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
792 /* Follow the next-chain of the current node and optimize
793 away all the NOTHINGs from it. */
794 if (OP(scan) != CURLYX) {
795 int max = (reg_off_by_arg[OP(scan)]
797 /* I32 may be smaller than U16 on CRAYs! */
798 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
799 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
803 /* Skip NOTHING and LONGJMP. */
804 while ((n = regnext(n))
805 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
806 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
809 if (reg_off_by_arg[OP(scan)])
812 NEXT_OFF(scan) = off;
814 /* The principal pseudo-switch. Cannot be a switch, since we
815 look into several different things. */
816 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
817 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
818 next = regnext(scan);
821 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
822 I32 max1 = 0, min1 = I32_MAX, num = 0;
823 struct regnode_charclass_class accum;
825 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
826 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
827 if (flags & SCF_DO_STCLASS)
828 cl_init_zero(pRExC_state, &accum);
829 while (OP(scan) == code) {
830 I32 deltanext, minnext, f = 0, fake;
831 struct regnode_charclass_class this_class;
836 data_fake.whilem_c = data->whilem_c;
837 data_fake.last_closep = data->last_closep;
840 data_fake.last_closep = &fake;
841 next = regnext(scan);
842 scan = NEXTOPER(scan);
844 scan = NEXTOPER(scan);
845 if (flags & SCF_DO_STCLASS) {
846 cl_init(pRExC_state, &this_class);
847 data_fake.start_class = &this_class;
848 f = SCF_DO_STCLASS_AND;
850 if (flags & SCF_WHILEM_VISITED_POS)
851 f |= SCF_WHILEM_VISITED_POS;
852 /* we suppose the run is continuous, last=next...*/
853 minnext = study_chunk(pRExC_state, &scan, &deltanext,
854 next, &data_fake, f);
857 if (max1 < minnext + deltanext)
858 max1 = minnext + deltanext;
859 if (deltanext == I32_MAX)
860 is_inf = is_inf_internal = 1;
862 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
864 if (data && (data_fake.flags & SF_HAS_EVAL))
865 data->flags |= SF_HAS_EVAL;
867 data->whilem_c = data_fake.whilem_c;
868 if (flags & SCF_DO_STCLASS)
869 cl_or(pRExC_state, &accum, &this_class);
873 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
875 if (flags & SCF_DO_SUBSTR) {
876 data->pos_min += min1;
877 data->pos_delta += max1 - min1;
878 if (max1 != min1 || is_inf)
879 data->longest = &(data->longest_float);
882 delta += max1 - min1;
883 if (flags & SCF_DO_STCLASS_OR) {
884 cl_or(pRExC_state, data->start_class, &accum);
886 cl_and(data->start_class, &and_with);
887 flags &= ~SCF_DO_STCLASS;
890 else if (flags & SCF_DO_STCLASS_AND) {
892 cl_and(data->start_class, &accum);
893 flags &= ~SCF_DO_STCLASS;
896 /* Switch to OR mode: cache the old value of
897 * data->start_class */
898 StructCopy(data->start_class, &and_with,
899 struct regnode_charclass_class);
900 flags &= ~SCF_DO_STCLASS_AND;
901 StructCopy(&accum, data->start_class,
902 struct regnode_charclass_class);
903 flags |= SCF_DO_STCLASS_OR;
904 data->start_class->flags |= ANYOF_EOS;
908 else if (code == BRANCHJ) /* single branch is optimized. */
909 scan = NEXTOPER(NEXTOPER(scan));
910 else /* single branch is optimized. */
911 scan = NEXTOPER(scan);
914 else if (OP(scan) == EXACT) {
915 I32 l = STR_LEN(scan);
916 UV uc = *((U8*)STRING(scan));
918 U8 *s = (U8*)STRING(scan);
919 l = utf8_length(s, s + l);
920 uc = utf8_to_uvchr(s, NULL);
923 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
924 /* The code below prefers earlier match for fixed
925 offset, later match for variable offset. */
926 if (data->last_end == -1) { /* Update the start info. */
927 data->last_start_min = data->pos_min;
928 data->last_start_max = is_inf
929 ? I32_MAX : data->pos_min + data->pos_delta;
931 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
933 SV * sv = data->last_found;
934 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
935 mg_find(sv, PERL_MAGIC_utf8) : NULL;
936 if (mg && mg->mg_len >= 0)
937 mg->mg_len += utf8_length((U8*)STRING(scan),
938 (U8*)STRING(scan)+STR_LEN(scan));
941 SvUTF8_on(data->last_found);
942 data->last_end = data->pos_min + l;
943 data->pos_min += l; /* As in the first entry. */
944 data->flags &= ~SF_BEFORE_EOL;
946 if (flags & SCF_DO_STCLASS_AND) {
947 /* Check whether it is compatible with what we know already! */
951 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
952 && !ANYOF_BITMAP_TEST(data->start_class, uc)
953 && (!(data->start_class->flags & ANYOF_FOLD)
954 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
957 ANYOF_CLASS_ZERO(data->start_class);
958 ANYOF_BITMAP_ZERO(data->start_class);
960 ANYOF_BITMAP_SET(data->start_class, uc);
961 data->start_class->flags &= ~ANYOF_EOS;
963 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
965 else if (flags & SCF_DO_STCLASS_OR) {
966 /* false positive possible if the class is case-folded */
968 ANYOF_BITMAP_SET(data->start_class, uc);
970 data->start_class->flags |= ANYOF_UNICODE_ALL;
971 data->start_class->flags &= ~ANYOF_EOS;
972 cl_and(data->start_class, &and_with);
974 flags &= ~SCF_DO_STCLASS;
976 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
977 I32 l = STR_LEN(scan);
978 UV uc = *((U8*)STRING(scan));
980 /* Search for fixed substrings supports EXACT only. */
981 if (flags & SCF_DO_SUBSTR)
982 scan_commit(pRExC_state, data);
984 U8 *s = (U8 *)STRING(scan);
985 l = utf8_length(s, s + l);
986 uc = utf8_to_uvchr(s, NULL);
989 if (data && (flags & SCF_DO_SUBSTR))
991 if (flags & SCF_DO_STCLASS_AND) {
992 /* Check whether it is compatible with what we know already! */
996 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
997 && !ANYOF_BITMAP_TEST(data->start_class, uc)
998 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
1000 ANYOF_CLASS_ZERO(data->start_class);
1001 ANYOF_BITMAP_ZERO(data->start_class);
1003 ANYOF_BITMAP_SET(data->start_class, uc);
1004 data->start_class->flags &= ~ANYOF_EOS;
1005 data->start_class->flags |= ANYOF_FOLD;
1006 if (OP(scan) == EXACTFL)
1007 data->start_class->flags |= ANYOF_LOCALE;
1010 else if (flags & SCF_DO_STCLASS_OR) {
1011 if (data->start_class->flags & ANYOF_FOLD) {
1012 /* false positive possible if the class is case-folded.
1013 Assume that the locale settings are the same... */
1015 ANYOF_BITMAP_SET(data->start_class, uc);
1016 data->start_class->flags &= ~ANYOF_EOS;
1018 cl_and(data->start_class, &and_with);
1020 flags &= ~SCF_DO_STCLASS;
1022 else if (strchr((char*)PL_varies,OP(scan))) {
1023 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1024 I32 f = flags, pos_before = 0;
1025 regnode *oscan = scan;
1026 struct regnode_charclass_class this_class;
1027 struct regnode_charclass_class *oclass = NULL;
1028 I32 next_is_eval = 0;
1030 switch (PL_regkind[(U8)OP(scan)]) {
1031 case WHILEM: /* End of (?:...)* . */
1032 scan = NEXTOPER(scan);
1035 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1036 next = NEXTOPER(scan);
1037 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1039 maxcount = REG_INFTY;
1040 next = regnext(scan);
1041 scan = NEXTOPER(scan);
1045 if (flags & SCF_DO_SUBSTR)
1050 if (flags & SCF_DO_STCLASS) {
1052 maxcount = REG_INFTY;
1053 next = regnext(scan);
1054 scan = NEXTOPER(scan);
1057 is_inf = is_inf_internal = 1;
1058 scan = regnext(scan);
1059 if (flags & SCF_DO_SUBSTR) {
1060 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1061 data->longest = &(data->longest_float);
1063 goto optimize_curly_tail;
1065 mincount = ARG1(scan);
1066 maxcount = ARG2(scan);
1067 next = regnext(scan);
1068 if (OP(scan) == CURLYX) {
1069 I32 lp = (data ? *(data->last_closep) : 0);
1071 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1073 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1074 next_is_eval = (OP(scan) == EVAL);
1076 if (flags & SCF_DO_SUBSTR) {
1077 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1078 pos_before = data->pos_min;
1082 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1084 data->flags |= SF_IS_INF;
1086 if (flags & SCF_DO_STCLASS) {
1087 cl_init(pRExC_state, &this_class);
1088 oclass = data->start_class;
1089 data->start_class = &this_class;
1090 f |= SCF_DO_STCLASS_AND;
1091 f &= ~SCF_DO_STCLASS_OR;
1093 /* These are the cases when once a subexpression
1094 fails at a particular position, it cannot succeed
1095 even after backtracking at the enclosing scope.
1097 XXXX what if minimal match and we are at the
1098 initial run of {n,m}? */
1099 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1100 f &= ~SCF_WHILEM_VISITED_POS;
1102 /* This will finish on WHILEM, setting scan, or on NULL: */
1103 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1105 ? (f & ~SCF_DO_SUBSTR) : f);
1107 if (flags & SCF_DO_STCLASS)
1108 data->start_class = oclass;
1109 if (mincount == 0 || minnext == 0) {
1110 if (flags & SCF_DO_STCLASS_OR) {
1111 cl_or(pRExC_state, data->start_class, &this_class);
1113 else if (flags & SCF_DO_STCLASS_AND) {
1114 /* Switch to OR mode: cache the old value of
1115 * data->start_class */
1116 StructCopy(data->start_class, &and_with,
1117 struct regnode_charclass_class);
1118 flags &= ~SCF_DO_STCLASS_AND;
1119 StructCopy(&this_class, data->start_class,
1120 struct regnode_charclass_class);
1121 flags |= SCF_DO_STCLASS_OR;
1122 data->start_class->flags |= ANYOF_EOS;
1124 } else { /* Non-zero len */
1125 if (flags & SCF_DO_STCLASS_OR) {
1126 cl_or(pRExC_state, data->start_class, &this_class);
1127 cl_and(data->start_class, &and_with);
1129 else if (flags & SCF_DO_STCLASS_AND)
1130 cl_and(data->start_class, &this_class);
1131 flags &= ~SCF_DO_STCLASS;
1133 if (!scan) /* It was not CURLYX, but CURLY. */
1135 if (ckWARN(WARN_REGEXP)
1136 /* ? quantifier ok, except for (?{ ... }) */
1137 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1138 && (minnext == 0) && (deltanext == 0)
1139 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1140 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1143 "Quantifier unexpected on zero-length expression");
1146 min += minnext * mincount;
1147 is_inf_internal |= ((maxcount == REG_INFTY
1148 && (minnext + deltanext) > 0)
1149 || deltanext == I32_MAX);
1150 is_inf |= is_inf_internal;
1151 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1153 /* Try powerful optimization CURLYX => CURLYN. */
1154 if ( OP(oscan) == CURLYX && data
1155 && data->flags & SF_IN_PAR
1156 && !(data->flags & SF_HAS_EVAL)
1157 && !deltanext && minnext == 1 ) {
1158 /* Try to optimize to CURLYN. */
1159 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1160 regnode *nxt1 = nxt;
1167 if (!strchr((char*)PL_simple,OP(nxt))
1168 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1169 && STR_LEN(nxt) == 1))
1175 if (OP(nxt) != CLOSE)
1177 /* Now we know that nxt2 is the only contents: */
1178 oscan->flags = (U8)ARG(nxt);
1180 OP(nxt1) = NOTHING; /* was OPEN. */
1182 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1183 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1184 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1185 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1186 OP(nxt + 1) = OPTIMIZED; /* was count. */
1187 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1192 /* Try optimization CURLYX => CURLYM. */
1193 if ( OP(oscan) == CURLYX && data
1194 && !(data->flags & SF_HAS_PAR)
1195 && !(data->flags & SF_HAS_EVAL)
1196 && !deltanext /* atom is fixed width */
1197 && minnext != 0 /* CURLYM can't handle zero width */
1199 /* XXXX How to optimize if data == 0? */
1200 /* Optimize to a simpler form. */
1201 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1205 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1206 && (OP(nxt2) != WHILEM))
1208 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1209 /* Need to optimize away parenths. */
1210 if (data->flags & SF_IN_PAR) {
1211 /* Set the parenth number. */
1212 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1214 if (OP(nxt) != CLOSE)
1215 FAIL("Panic opt close");
1216 oscan->flags = (U8)ARG(nxt);
1217 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1218 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1220 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1221 OP(nxt + 1) = OPTIMIZED; /* was count. */
1222 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1223 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1226 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1227 regnode *nnxt = regnext(nxt1);
1230 if (reg_off_by_arg[OP(nxt1)])
1231 ARG_SET(nxt1, nxt2 - nxt1);
1232 else if (nxt2 - nxt1 < U16_MAX)
1233 NEXT_OFF(nxt1) = nxt2 - nxt1;
1235 OP(nxt) = NOTHING; /* Cannot beautify */
1240 /* Optimize again: */
1241 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1247 else if ((OP(oscan) == CURLYX)
1248 && (flags & SCF_WHILEM_VISITED_POS)
1249 /* See the comment on a similar expression above.
1250 However, this time it not a subexpression
1251 we care about, but the expression itself. */
1252 && (maxcount == REG_INFTY)
1253 && data && ++data->whilem_c < 16) {
1254 /* This stays as CURLYX, we can put the count/of pair. */
1255 /* Find WHILEM (as in regexec.c) */
1256 regnode *nxt = oscan + NEXT_OFF(oscan);
1258 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1260 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1261 | (RExC_whilem_seen << 4)); /* On WHILEM */
1263 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1265 if (flags & SCF_DO_SUBSTR) {
1266 SV *last_str = Nullsv;
1267 int counted = mincount != 0;
1269 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1270 #if defined(SPARC64_GCC_WORKAROUND)
1276 if (pos_before >= data->last_start_min)
1279 b = data->last_start_min;
1282 s = SvPV(data->last_found, l);
1283 old = b - data->last_start_min;
1286 I32 b = pos_before >= data->last_start_min
1287 ? pos_before : data->last_start_min;
1289 char *s = SvPV(data->last_found, l);
1290 I32 old = b - data->last_start_min;
1294 old = utf8_hop((U8*)s, old) - (U8*)s;
1297 /* Get the added string: */
1298 last_str = newSVpvn(s + old, l);
1300 SvUTF8_on(last_str);
1301 if (deltanext == 0 && pos_before == b) {
1302 /* What was added is a constant string */
1304 SvGROW(last_str, (mincount * l) + 1);
1305 repeatcpy(SvPVX(last_str) + l,
1306 SvPVX(last_str), l, mincount - 1);
1307 SvCUR(last_str) *= mincount;
1308 /* Add additional parts. */
1309 SvCUR_set(data->last_found,
1310 SvCUR(data->last_found) - l);
1311 sv_catsv(data->last_found, last_str);
1313 SV * sv = data->last_found;
1315 SvUTF8(sv) && SvMAGICAL(sv) ?
1316 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1317 if (mg && mg->mg_len >= 0)
1318 mg->mg_len += CHR_SVLEN(last_str);
1320 data->last_end += l * (mincount - 1);
1323 /* start offset must point into the last copy */
1324 data->last_start_min += minnext * (mincount - 1);
1325 data->last_start_max += is_inf ? I32_MAX
1326 : (maxcount - 1) * (minnext + data->pos_delta);
1329 /* It is counted once already... */
1330 data->pos_min += minnext * (mincount - counted);
1331 data->pos_delta += - counted * deltanext +
1332 (minnext + deltanext) * maxcount - minnext * mincount;
1333 if (mincount != maxcount) {
1334 /* Cannot extend fixed substrings found inside
1336 scan_commit(pRExC_state,data);
1337 if (mincount && last_str) {
1338 sv_setsv(data->last_found, last_str);
1339 data->last_end = data->pos_min;
1340 data->last_start_min =
1341 data->pos_min - CHR_SVLEN(last_str);
1342 data->last_start_max = is_inf
1344 : data->pos_min + data->pos_delta
1345 - CHR_SVLEN(last_str);
1347 data->longest = &(data->longest_float);
1349 SvREFCNT_dec(last_str);
1351 if (data && (fl & SF_HAS_EVAL))
1352 data->flags |= SF_HAS_EVAL;
1353 optimize_curly_tail:
1354 if (OP(oscan) != CURLYX) {
1355 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1357 NEXT_OFF(oscan) += NEXT_OFF(next);
1360 default: /* REF and CLUMP only? */
1361 if (flags & SCF_DO_SUBSTR) {
1362 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1363 data->longest = &(data->longest_float);
1365 is_inf = is_inf_internal = 1;
1366 if (flags & SCF_DO_STCLASS_OR)
1367 cl_anything(pRExC_state, data->start_class);
1368 flags &= ~SCF_DO_STCLASS;
1372 else if (strchr((char*)PL_simple,OP(scan))) {
1375 if (flags & SCF_DO_SUBSTR) {
1376 scan_commit(pRExC_state,data);
1380 if (flags & SCF_DO_STCLASS) {
1381 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1383 /* Some of the logic below assumes that switching
1384 locale on will only add false positives. */
1385 switch (PL_regkind[(U8)OP(scan)]) {
1389 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1390 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1391 cl_anything(pRExC_state, data->start_class);
1394 if (OP(scan) == SANY)
1396 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1397 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1398 || (data->start_class->flags & ANYOF_CLASS));
1399 cl_anything(pRExC_state, data->start_class);
1401 if (flags & SCF_DO_STCLASS_AND || !value)
1402 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1405 if (flags & SCF_DO_STCLASS_AND)
1406 cl_and(data->start_class,
1407 (struct regnode_charclass_class*)scan);
1409 cl_or(pRExC_state, data->start_class,
1410 (struct regnode_charclass_class*)scan);
1413 if (flags & SCF_DO_STCLASS_AND) {
1414 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1415 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1416 for (value = 0; value < 256; value++)
1417 if (!isALNUM(value))
1418 ANYOF_BITMAP_CLEAR(data->start_class, value);
1422 if (data->start_class->flags & ANYOF_LOCALE)
1423 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1425 for (value = 0; value < 256; value++)
1427 ANYOF_BITMAP_SET(data->start_class, value);
1432 if (flags & SCF_DO_STCLASS_AND) {
1433 if (data->start_class->flags & ANYOF_LOCALE)
1434 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1437 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1438 data->start_class->flags |= ANYOF_LOCALE;
1442 if (flags & SCF_DO_STCLASS_AND) {
1443 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1444 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1445 for (value = 0; value < 256; value++)
1447 ANYOF_BITMAP_CLEAR(data->start_class, value);
1451 if (data->start_class->flags & ANYOF_LOCALE)
1452 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1454 for (value = 0; value < 256; value++)
1455 if (!isALNUM(value))
1456 ANYOF_BITMAP_SET(data->start_class, value);
1461 if (flags & SCF_DO_STCLASS_AND) {
1462 if (data->start_class->flags & ANYOF_LOCALE)
1463 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1466 data->start_class->flags |= ANYOF_LOCALE;
1467 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1471 if (flags & SCF_DO_STCLASS_AND) {
1472 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1473 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1474 for (value = 0; value < 256; value++)
1475 if (!isSPACE(value))
1476 ANYOF_BITMAP_CLEAR(data->start_class, value);
1480 if (data->start_class->flags & ANYOF_LOCALE)
1481 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1483 for (value = 0; value < 256; value++)
1485 ANYOF_BITMAP_SET(data->start_class, value);
1490 if (flags & SCF_DO_STCLASS_AND) {
1491 if (data->start_class->flags & ANYOF_LOCALE)
1492 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1495 data->start_class->flags |= ANYOF_LOCALE;
1496 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1500 if (flags & SCF_DO_STCLASS_AND) {
1501 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1502 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1503 for (value = 0; value < 256; value++)
1505 ANYOF_BITMAP_CLEAR(data->start_class, value);
1509 if (data->start_class->flags & ANYOF_LOCALE)
1510 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1512 for (value = 0; value < 256; value++)
1513 if (!isSPACE(value))
1514 ANYOF_BITMAP_SET(data->start_class, value);
1519 if (flags & SCF_DO_STCLASS_AND) {
1520 if (data->start_class->flags & ANYOF_LOCALE) {
1521 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1522 for (value = 0; value < 256; value++)
1523 if (!isSPACE(value))
1524 ANYOF_BITMAP_CLEAR(data->start_class, value);
1528 data->start_class->flags |= ANYOF_LOCALE;
1529 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1533 if (flags & SCF_DO_STCLASS_AND) {
1534 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1535 for (value = 0; value < 256; value++)
1536 if (!isDIGIT(value))
1537 ANYOF_BITMAP_CLEAR(data->start_class, value);
1540 if (data->start_class->flags & ANYOF_LOCALE)
1541 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1543 for (value = 0; value < 256; value++)
1545 ANYOF_BITMAP_SET(data->start_class, value);
1550 if (flags & SCF_DO_STCLASS_AND) {
1551 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1552 for (value = 0; value < 256; value++)
1554 ANYOF_BITMAP_CLEAR(data->start_class, value);
1557 if (data->start_class->flags & ANYOF_LOCALE)
1558 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1560 for (value = 0; value < 256; value++)
1561 if (!isDIGIT(value))
1562 ANYOF_BITMAP_SET(data->start_class, value);
1567 if (flags & SCF_DO_STCLASS_OR)
1568 cl_and(data->start_class, &and_with);
1569 flags &= ~SCF_DO_STCLASS;
1572 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1573 data->flags |= (OP(scan) == MEOL
1577 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1578 /* Lookbehind, or need to calculate parens/evals/stclass: */
1579 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1580 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1581 /* Lookahead/lookbehind */
1582 I32 deltanext, minnext, fake = 0;
1584 struct regnode_charclass_class intrnl;
1587 data_fake.flags = 0;
1589 data_fake.whilem_c = data->whilem_c;
1590 data_fake.last_closep = data->last_closep;
1593 data_fake.last_closep = &fake;
1594 if ( flags & SCF_DO_STCLASS && !scan->flags
1595 && OP(scan) == IFMATCH ) { /* Lookahead */
1596 cl_init(pRExC_state, &intrnl);
1597 data_fake.start_class = &intrnl;
1598 f |= SCF_DO_STCLASS_AND;
1600 if (flags & SCF_WHILEM_VISITED_POS)
1601 f |= SCF_WHILEM_VISITED_POS;
1602 next = regnext(scan);
1603 nscan = NEXTOPER(NEXTOPER(scan));
1604 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1607 vFAIL("Variable length lookbehind not implemented");
1609 else if (minnext > U8_MAX) {
1610 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1612 scan->flags = (U8)minnext;
1614 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1616 if (data && (data_fake.flags & SF_HAS_EVAL))
1617 data->flags |= SF_HAS_EVAL;
1619 data->whilem_c = data_fake.whilem_c;
1620 if (f & SCF_DO_STCLASS_AND) {
1621 int was = (data->start_class->flags & ANYOF_EOS);
1623 cl_and(data->start_class, &intrnl);
1625 data->start_class->flags |= ANYOF_EOS;
1628 else if (OP(scan) == OPEN) {
1631 else if (OP(scan) == CLOSE) {
1632 if ((I32)ARG(scan) == is_par) {
1633 next = regnext(scan);
1635 if ( next && (OP(next) != WHILEM) && next < last)
1636 is_par = 0; /* Disable optimization */
1639 *(data->last_closep) = ARG(scan);
1641 else if (OP(scan) == EVAL) {
1643 data->flags |= SF_HAS_EVAL;
1645 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1646 if (flags & SCF_DO_SUBSTR) {
1647 scan_commit(pRExC_state,data);
1648 data->longest = &(data->longest_float);
1650 is_inf = is_inf_internal = 1;
1651 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1652 cl_anything(pRExC_state, data->start_class);
1653 flags &= ~SCF_DO_STCLASS;
1655 /* Else: zero-length, ignore. */
1656 scan = regnext(scan);
1661 *deltap = is_inf_internal ? I32_MAX : delta;
1662 if (flags & SCF_DO_SUBSTR && is_inf)
1663 data->pos_delta = I32_MAX - data->pos_min;
1664 if (is_par > U8_MAX)
1666 if (is_par && pars==1 && data) {
1667 data->flags |= SF_IN_PAR;
1668 data->flags &= ~SF_HAS_PAR;
1670 else if (pars && data) {
1671 data->flags |= SF_HAS_PAR;
1672 data->flags &= ~SF_IN_PAR;
1674 if (flags & SCF_DO_STCLASS_OR)
1675 cl_and(data->start_class, &and_with);
1680 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1682 if (RExC_rx->data) {
1683 Renewc(RExC_rx->data,
1684 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1685 char, struct reg_data);
1686 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1687 RExC_rx->data->count += n;
1690 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1691 char, struct reg_data);
1692 New(1208, RExC_rx->data->what, n, U8);
1693 RExC_rx->data->count = n;
1695 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1696 return RExC_rx->data->count - n;
1700 Perl_reginitcolors(pTHX)
1703 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1706 PL_colors[0] = s = savepv(s);
1708 s = strchr(s, '\t');
1714 PL_colors[i] = s = "";
1718 PL_colors[i++] = "";
1725 - pregcomp - compile a regular expression into internal code
1727 * We can't allocate space until we know how big the compiled form will be,
1728 * but we can't compile it (and thus know how big it is) until we've got a
1729 * place to put the code. So we cheat: we compile it twice, once with code
1730 * generation turned off and size counting turned on, and once "for real".
1731 * This also means that we don't allocate space until we are sure that the
1732 * thing really will compile successfully, and we never have to move the
1733 * code and thus invalidate pointers into it. (Note that it has to be in
1734 * one piece because free() must be able to free it all.) [NB: not true in perl]
1736 * Beware that the optimization-preparation code in here knows about some
1737 * of the structure of the compiled regexp. [I'll say.]
1740 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1750 RExC_state_t RExC_state;
1751 RExC_state_t *pRExC_state = &RExC_state;
1754 FAIL("NULL regexp argument");
1756 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1760 if (!PL_colorset) reginitcolors();
1761 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1762 PL_colors[4],PL_colors[5],PL_colors[0],
1763 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1765 RExC_flags = pm->op_pmflags;
1769 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1770 RExC_seen_evals = 0;
1773 /* First pass: determine size, legality. */
1780 RExC_emit = &PL_regdummy;
1781 RExC_whilem_seen = 0;
1782 #if 0 /* REGC() is (currently) a NOP at the first pass.
1783 * Clever compilers notice this and complain. --jhi */
1784 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1786 if (reg(pRExC_state, 0, &flags) == NULL) {
1787 RExC_precomp = Nullch;
1790 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1792 /* Small enough for pointer-storage convention?
1793 If extralen==0, this means that we will not need long jumps. */
1794 if (RExC_size >= 0x10000L && RExC_extralen)
1795 RExC_size += RExC_extralen;
1798 if (RExC_whilem_seen > 15)
1799 RExC_whilem_seen = 15;
1801 /* Allocate space and initialize. */
1802 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1805 FAIL("Regexp out of space");
1808 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1809 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1812 r->prelen = xend - exp;
1813 r->precomp = savepvn(RExC_precomp, r->prelen);
1815 #ifdef PERL_COPY_ON_WRITE
1816 r->saved_copy = Nullsv;
1818 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1819 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1821 r->substrs = 0; /* Useful during FAIL. */
1822 r->startp = 0; /* Useful during FAIL. */
1823 r->endp = 0; /* Useful during FAIL. */
1825 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1827 r->offsets[0] = RExC_size;
1829 DEBUG_r(PerlIO_printf(Perl_debug_log,
1830 "%s %"UVuf" bytes for offset annotations.\n",
1831 r->offsets ? "Got" : "Couldn't get",
1832 (UV)((2*RExC_size+1) * sizeof(U32))));
1836 /* Second pass: emit code. */
1837 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1842 RExC_emit_start = r->program;
1843 RExC_emit = r->program;
1844 /* Store the count of eval-groups for security checks: */
1845 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1846 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1848 if (reg(pRExC_state, 0, &flags) == NULL)
1851 /* Dig out information for optimizations. */
1852 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1853 pm->op_pmflags = RExC_flags;
1855 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1856 r->regstclass = NULL;
1857 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1858 r->reganch |= ROPT_NAUGHTY;
1859 scan = r->program + 1; /* First BRANCH. */
1861 /* XXXX To minimize changes to RE engine we always allocate
1862 3-units-long substrs field. */
1863 Newz(1004, r->substrs, 1, struct reg_substr_data);
1865 StructCopy(&zero_scan_data, &data, scan_data_t);
1866 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1867 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1869 STRLEN longest_float_length, longest_fixed_length;
1870 struct regnode_charclass_class ch_class;
1875 /* Skip introductions and multiplicators >= 1. */
1876 while ((OP(first) == OPEN && (sawopen = 1)) ||
1877 /* An OR of *one* alternative - should not happen now. */
1878 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1879 (OP(first) == PLUS) ||
1880 (OP(first) == MINMOD) ||
1881 /* An {n,m} with n>0 */
1882 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1883 if (OP(first) == PLUS)
1886 first += regarglen[(U8)OP(first)];
1887 first = NEXTOPER(first);
1890 /* Starting-point info. */
1892 if (PL_regkind[(U8)OP(first)] == EXACT) {
1893 if (OP(first) == EXACT)
1894 ; /* Empty, get anchored substr later. */
1895 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1896 r->regstclass = first;
1898 else if (strchr((char*)PL_simple,OP(first)))
1899 r->regstclass = first;
1900 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1901 PL_regkind[(U8)OP(first)] == NBOUND)
1902 r->regstclass = first;
1903 else if (PL_regkind[(U8)OP(first)] == BOL) {
1904 r->reganch |= (OP(first) == MBOL
1906 : (OP(first) == SBOL
1909 first = NEXTOPER(first);
1912 else if (OP(first) == GPOS) {
1913 r->reganch |= ROPT_ANCH_GPOS;
1914 first = NEXTOPER(first);
1917 else if (!sawopen && (OP(first) == STAR &&
1918 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1919 !(r->reganch & ROPT_ANCH) )
1921 /* turn .* into ^.* with an implied $*=1 */
1922 int type = OP(NEXTOPER(first));
1924 if (type == REG_ANY)
1925 type = ROPT_ANCH_MBOL;
1927 type = ROPT_ANCH_SBOL;
1929 r->reganch |= type | ROPT_IMPLICIT;
1930 first = NEXTOPER(first);
1933 if (sawplus && (!sawopen || !RExC_sawback)
1934 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1935 /* x+ must match at the 1st pos of run of x's */
1936 r->reganch |= ROPT_SKIP;
1938 /* Scan is after the zeroth branch, first is atomic matcher. */
1939 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1940 (IV)(first - scan + 1)));
1942 * If there's something expensive in the r.e., find the
1943 * longest literal string that must appear and make it the
1944 * regmust. Resolve ties in favor of later strings, since
1945 * the regstart check works with the beginning of the r.e.
1946 * and avoiding duplication strengthens checking. Not a
1947 * strong reason, but sufficient in the absence of others.
1948 * [Now we resolve ties in favor of the earlier string if
1949 * it happens that c_offset_min has been invalidated, since the
1950 * earlier string may buy us something the later one won't.]
1954 data.longest_fixed = newSVpvn("",0);
1955 data.longest_float = newSVpvn("",0);
1956 data.last_found = newSVpvn("",0);
1957 data.longest = &(data.longest_fixed);
1959 if (!r->regstclass) {
1960 cl_init(pRExC_state, &ch_class);
1961 data.start_class = &ch_class;
1962 stclass_flag = SCF_DO_STCLASS_AND;
1963 } else /* XXXX Check for BOUND? */
1965 data.last_closep = &last_close;
1967 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1968 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1969 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1970 && data.last_start_min == 0 && data.last_end > 0
1971 && !RExC_seen_zerolen
1972 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1973 r->reganch |= ROPT_CHECK_ALL;
1974 scan_commit(pRExC_state, &data);
1975 SvREFCNT_dec(data.last_found);
1977 longest_float_length = CHR_SVLEN(data.longest_float);
1978 if (longest_float_length
1979 || (data.flags & SF_FL_BEFORE_EOL
1980 && (!(data.flags & SF_FL_BEFORE_MEOL)
1981 || (RExC_flags & PMf_MULTILINE)))) {
1984 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1985 && data.offset_fixed == data.offset_float_min
1986 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1987 goto remove_float; /* As in (a)+. */
1989 if (SvUTF8(data.longest_float)) {
1990 r->float_utf8 = data.longest_float;
1991 r->float_substr = Nullsv;
1993 r->float_substr = data.longest_float;
1994 r->float_utf8 = Nullsv;
1996 r->float_min_offset = data.offset_float_min;
1997 r->float_max_offset = data.offset_float_max;
1998 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
1999 && (!(data.flags & SF_FL_BEFORE_MEOL)
2000 || (RExC_flags & PMf_MULTILINE)));
2001 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2005 r->float_substr = r->float_utf8 = Nullsv;
2006 SvREFCNT_dec(data.longest_float);
2007 longest_float_length = 0;
2010 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2011 if (longest_fixed_length
2012 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2013 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2014 || (RExC_flags & PMf_MULTILINE)))) {
2017 if (SvUTF8(data.longest_fixed)) {
2018 r->anchored_utf8 = data.longest_fixed;
2019 r->anchored_substr = Nullsv;
2021 r->anchored_substr = data.longest_fixed;
2022 r->anchored_utf8 = Nullsv;
2024 r->anchored_offset = data.offset_fixed;
2025 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2026 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2027 || (RExC_flags & PMf_MULTILINE)));
2028 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2031 r->anchored_substr = r->anchored_utf8 = Nullsv;
2032 SvREFCNT_dec(data.longest_fixed);
2033 longest_fixed_length = 0;
2036 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2037 r->regstclass = NULL;
2038 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2040 && !(data.start_class->flags & ANYOF_EOS)
2041 && !cl_is_anything(data.start_class))
2043 I32 n = add_data(pRExC_state, 1, "f");
2045 New(1006, RExC_rx->data->data[n], 1,
2046 struct regnode_charclass_class);
2047 StructCopy(data.start_class,
2048 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2049 struct regnode_charclass_class);
2050 r->regstclass = (regnode*)RExC_rx->data->data[n];
2051 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2052 PL_regdata = r->data; /* for regprop() */
2053 DEBUG_r({ SV *sv = sv_newmortal();
2054 regprop(sv, (regnode*)data.start_class);
2055 PerlIO_printf(Perl_debug_log,
2056 "synthetic stclass `%s'.\n",
2060 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2061 if (longest_fixed_length > longest_float_length) {
2062 r->check_substr = r->anchored_substr;
2063 r->check_utf8 = r->anchored_utf8;
2064 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2065 if (r->reganch & ROPT_ANCH_SINGLE)
2066 r->reganch |= ROPT_NOSCAN;
2069 r->check_substr = r->float_substr;
2070 r->check_utf8 = r->float_utf8;
2071 r->check_offset_min = data.offset_float_min;
2072 r->check_offset_max = data.offset_float_max;
2074 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2075 This should be changed ASAP! */
2076 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2077 r->reganch |= RE_USE_INTUIT;
2078 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2079 r->reganch |= RE_INTUIT_TAIL;
2083 /* Several toplevels. Best we can is to set minlen. */
2085 struct regnode_charclass_class ch_class;
2088 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2089 scan = r->program + 1;
2090 cl_init(pRExC_state, &ch_class);
2091 data.start_class = &ch_class;
2092 data.last_closep = &last_close;
2093 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2094 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2095 = r->float_substr = r->float_utf8 = Nullsv;
2096 if (!(data.start_class->flags & ANYOF_EOS)
2097 && !cl_is_anything(data.start_class))
2099 I32 n = add_data(pRExC_state, 1, "f");
2101 New(1006, RExC_rx->data->data[n], 1,
2102 struct regnode_charclass_class);
2103 StructCopy(data.start_class,
2104 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2105 struct regnode_charclass_class);
2106 r->regstclass = (regnode*)RExC_rx->data->data[n];
2107 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2108 DEBUG_r({ SV* sv = sv_newmortal();
2109 regprop(sv, (regnode*)data.start_class);
2110 PerlIO_printf(Perl_debug_log,
2111 "synthetic stclass `%s'.\n",
2117 if (RExC_seen & REG_SEEN_GPOS)
2118 r->reganch |= ROPT_GPOS_SEEN;
2119 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2120 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2121 if (RExC_seen & REG_SEEN_EVAL)
2122 r->reganch |= ROPT_EVAL_SEEN;
2123 if (RExC_seen & REG_SEEN_CANY)
2124 r->reganch |= ROPT_CANY_SEEN;
2125 Newz(1002, r->startp, RExC_npar, I32);
2126 Newz(1002, r->endp, RExC_npar, I32);
2127 PL_regdata = r->data; /* for regprop() */
2128 DEBUG_r(regdump(r));
2133 - reg - regular expression, i.e. main body or parenthesized thing
2135 * Caller must absorb opening parenthesis.
2137 * Combining parenthesis handling with the base level of regular expression
2138 * is a trifle forced, but the need to tie the tails of the branches to what
2139 * follows makes it hard to avoid.
2142 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2143 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2145 register regnode *ret; /* Will be the head of the group. */
2146 register regnode *br;
2147 register regnode *lastbr;
2148 register regnode *ender = 0;
2149 register I32 parno = 0;
2150 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2152 /* for (?g), (?gc), and (?o) warnings; warning
2153 about (?c) will warn about (?g) -- japhy */
2155 I32 wastedflags = 0x00,
2158 wasted_gc = 0x02 | 0x04,
2161 char * parse_start = RExC_parse; /* MJD */
2162 char *oregcomp_parse = RExC_parse;
2165 *flagp = 0; /* Tentatively. */
2168 /* Make an OPEN node, if parenthesized. */
2170 if (*RExC_parse == '?') { /* (?...) */
2171 U32 posflags = 0, negflags = 0;
2172 U32 *flagsp = &posflags;
2174 char *seqstart = RExC_parse;
2177 paren = *RExC_parse++;
2178 ret = NULL; /* For look-ahead/behind. */
2180 case '<': /* (?<...) */
2181 RExC_seen |= REG_SEEN_LOOKBEHIND;
2182 if (*RExC_parse == '!')
2184 if (*RExC_parse != '=' && *RExC_parse != '!')
2187 case '=': /* (?=...) */
2188 case '!': /* (?!...) */
2189 RExC_seen_zerolen++;
2190 case ':': /* (?:...) */
2191 case '>': /* (?>...) */
2193 case '$': /* (?$...) */
2194 case '@': /* (?@...) */
2195 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2197 case '#': /* (?#...) */
2198 while (*RExC_parse && *RExC_parse != ')')
2200 if (*RExC_parse != ')')
2201 FAIL("Sequence (?#... not terminated");
2202 nextchar(pRExC_state);
2205 case 'p': /* (?p...) */
2206 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2207 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2209 case '?': /* (??...) */
2211 if (*RExC_parse != '{')
2213 paren = *RExC_parse++;
2215 case '{': /* (?{...}) */
2217 I32 count = 1, n = 0;
2219 char *s = RExC_parse;
2221 OP_4tree *sop, *rop;
2223 RExC_seen_zerolen++;
2224 RExC_seen |= REG_SEEN_EVAL;
2225 while (count && (c = *RExC_parse)) {
2226 if (c == '\\' && RExC_parse[1])
2234 if (*RExC_parse != ')')
2237 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2242 if (RExC_parse - 1 - s)
2243 sv = newSVpvn(s, RExC_parse - 1 - s);
2245 sv = newSVpvn("", 0);
2248 Perl_save_re_context(aTHX);
2249 rop = sv_compile_2op(sv, &sop, "re", &pad);
2250 sop->op_private |= OPpREFCOUNTED;
2251 /* re_dup will OpREFCNT_inc */
2252 OpREFCNT_set(sop, 1);
2255 n = add_data(pRExC_state, 3, "nop");
2256 RExC_rx->data->data[n] = (void*)rop;
2257 RExC_rx->data->data[n+1] = (void*)sop;
2258 RExC_rx->data->data[n+2] = (void*)pad;
2261 else { /* First pass */
2262 if (PL_reginterp_cnt < ++RExC_seen_evals
2264 /* No compiled RE interpolated, has runtime
2265 components ===> unsafe. */
2266 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2267 if (PL_tainting && PL_tainted)
2268 FAIL("Eval-group in insecure regular expression");
2269 if (IN_PERL_COMPILETIME)
2273 nextchar(pRExC_state);
2275 ret = reg_node(pRExC_state, LOGICAL);
2278 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2279 /* deal with the length of this later - MJD */
2282 ret = reganode(pRExC_state, EVAL, n);
2283 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2284 Set_Node_Offset(ret, parse_start);
2287 case '(': /* (?(?{...})...) and (?(?=...)...) */
2289 if (RExC_parse[0] == '?') { /* (?(?...)) */
2290 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2291 || RExC_parse[1] == '<'
2292 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2295 ret = reg_node(pRExC_state, LOGICAL);
2298 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2302 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2304 parno = atoi(RExC_parse++);
2306 while (isDIGIT(*RExC_parse))
2308 ret = reganode(pRExC_state, GROUPP, parno);
2310 if ((c = *nextchar(pRExC_state)) != ')')
2311 vFAIL("Switch condition not recognized");
2313 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2314 br = regbranch(pRExC_state, &flags, 1);
2316 br = reganode(pRExC_state, LONGJMP, 0);
2318 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2319 c = *nextchar(pRExC_state);
2323 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2324 regbranch(pRExC_state, &flags, 1);
2325 regtail(pRExC_state, ret, lastbr);
2328 c = *nextchar(pRExC_state);
2333 vFAIL("Switch (?(condition)... contains too many branches");
2334 ender = reg_node(pRExC_state, TAIL);
2335 regtail(pRExC_state, br, ender);
2337 regtail(pRExC_state, lastbr, ender);
2338 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2341 regtail(pRExC_state, ret, ender);
2345 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2349 RExC_parse--; /* for vFAIL to print correctly */
2350 vFAIL("Sequence (? incomplete");
2354 parse_flags: /* (?i) */
2355 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2356 /* (?g), (?gc) and (?o) are useless here
2357 and must be globally applied -- japhy */
2359 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2360 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2361 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2362 if (! (wastedflags & wflagbit) ) {
2363 wastedflags |= wflagbit;
2366 "Useless (%s%c) - %suse /%c modifier",
2367 flagsp == &negflags ? "?-" : "?",
2369 flagsp == &negflags ? "don't " : "",
2375 else if (*RExC_parse == 'c') {
2376 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2377 if (! (wastedflags & wasted_c) ) {
2378 wastedflags |= wasted_gc;
2381 "Useless (%sc) - %suse /gc modifier",
2382 flagsp == &negflags ? "?-" : "?",
2383 flagsp == &negflags ? "don't " : ""
2388 else { pmflag(flagsp, *RExC_parse); }
2392 if (*RExC_parse == '-') {
2394 wastedflags = 0; /* reset so (?g-c) warns twice */
2398 RExC_flags |= posflags;
2399 RExC_flags &= ~negflags;
2400 if (*RExC_parse == ':') {
2406 if (*RExC_parse != ')') {
2408 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2410 nextchar(pRExC_state);
2418 ret = reganode(pRExC_state, OPEN, parno);
2419 Set_Node_Length(ret, 1); /* MJD */
2420 Set_Node_Offset(ret, RExC_parse); /* MJD */
2427 /* Pick up the branches, linking them together. */
2428 parse_start = RExC_parse; /* MJD */
2429 br = regbranch(pRExC_state, &flags, 1);
2430 /* branch_len = (paren != 0); */
2434 if (*RExC_parse == '|') {
2435 if (!SIZE_ONLY && RExC_extralen) {
2436 reginsert(pRExC_state, BRANCHJ, br);
2439 reginsert(pRExC_state, BRANCH, br);
2440 Set_Node_Length(br, paren != 0);
2441 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2445 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2447 else if (paren == ':') {
2448 *flagp |= flags&SIMPLE;
2450 if (open) { /* Starts with OPEN. */
2451 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2453 else if (paren != '?') /* Not Conditional */
2455 *flagp |= flags & (SPSTART | HASWIDTH);
2457 while (*RExC_parse == '|') {
2458 if (!SIZE_ONLY && RExC_extralen) {
2459 ender = reganode(pRExC_state, LONGJMP,0);
2460 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2463 RExC_extralen += 2; /* Account for LONGJMP. */
2464 nextchar(pRExC_state);
2465 br = regbranch(pRExC_state, &flags, 0);
2469 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2473 *flagp |= flags&SPSTART;
2476 if (have_branch || paren != ':') {
2477 /* Make a closing node, and hook it on the end. */
2480 ender = reg_node(pRExC_state, TAIL);
2483 ender = reganode(pRExC_state, CLOSE, parno);
2484 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2485 Set_Node_Length(ender,1); /* MJD */
2491 *flagp &= ~HASWIDTH;
2494 ender = reg_node(pRExC_state, SUCCEED);
2497 ender = reg_node(pRExC_state, END);
2500 regtail(pRExC_state, lastbr, ender);
2503 /* Hook the tails of the branches to the closing node. */
2504 for (br = ret; br != NULL; br = regnext(br)) {
2505 regoptail(pRExC_state, br, ender);
2512 static char parens[] = "=!<,>";
2514 if (paren && (p = strchr(parens, paren))) {
2515 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2516 int flag = (p - parens) > 1;
2519 node = SUSPEND, flag = 0;
2520 reginsert(pRExC_state, node,ret);
2521 Set_Node_Cur_Length(ret);
2522 Set_Node_Offset(ret, parse_start + 1);
2524 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2528 /* Check for proper termination. */
2530 RExC_flags = oregflags;
2531 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2532 RExC_parse = oregcomp_parse;
2533 vFAIL("Unmatched (");
2536 else if (!paren && RExC_parse < RExC_end) {
2537 if (*RExC_parse == ')') {
2539 vFAIL("Unmatched )");
2542 FAIL("Junk on end of regexp"); /* "Can't happen". */
2550 - regbranch - one alternative of an | operator
2552 * Implements the concatenation operator.
2555 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2557 register regnode *ret;
2558 register regnode *chain = NULL;
2559 register regnode *latest;
2560 I32 flags = 0, c = 0;
2565 if (!SIZE_ONLY && RExC_extralen)
2566 ret = reganode(pRExC_state, BRANCHJ,0);
2568 ret = reg_node(pRExC_state, BRANCH);
2569 Set_Node_Length(ret, 1);
2573 if (!first && SIZE_ONLY)
2574 RExC_extralen += 1; /* BRANCHJ */
2576 *flagp = WORST; /* Tentatively. */
2579 nextchar(pRExC_state);
2580 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2582 latest = regpiece(pRExC_state, &flags);
2583 if (latest == NULL) {
2584 if (flags & TRYAGAIN)
2588 else if (ret == NULL)
2590 *flagp |= flags&HASWIDTH;
2591 if (chain == NULL) /* First piece. */
2592 *flagp |= flags&SPSTART;
2595 regtail(pRExC_state, chain, latest);
2600 if (chain == NULL) { /* Loop ran zero times. */
2601 chain = reg_node(pRExC_state, NOTHING);
2606 *flagp |= flags&SIMPLE;
2613 - regpiece - something followed by possible [*+?]
2615 * Note that the branching code sequences used for ? and the general cases
2616 * of * and + are somewhat optimized: they use the same NOTHING node as
2617 * both the endmarker for their branch list and the body of the last branch.
2618 * It might seem that this node could be dispensed with entirely, but the
2619 * endmarker role is not redundant.
2622 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2624 register regnode *ret;
2626 register char *next;
2628 char *origparse = RExC_parse;
2631 I32 max = REG_INFTY;
2634 ret = regatom(pRExC_state, &flags);
2636 if (flags & TRYAGAIN)
2643 if (op == '{' && regcurly(RExC_parse)) {
2644 parse_start = RExC_parse; /* MJD */
2645 next = RExC_parse + 1;
2647 while (isDIGIT(*next) || *next == ',') {
2656 if (*next == '}') { /* got one */
2660 min = atoi(RExC_parse);
2664 maxpos = RExC_parse;
2666 if (!max && *maxpos != '0')
2667 max = REG_INFTY; /* meaning "infinity" */
2668 else if (max >= REG_INFTY)
2669 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2671 nextchar(pRExC_state);
2674 if ((flags&SIMPLE)) {
2675 RExC_naughty += 2 + RExC_naughty / 2;
2676 reginsert(pRExC_state, CURLY, ret);
2677 Set_Node_Offset(ret, parse_start+1); /* MJD */
2678 Set_Node_Cur_Length(ret);
2681 regnode *w = reg_node(pRExC_state, WHILEM);
2684 regtail(pRExC_state, ret, w);
2685 if (!SIZE_ONLY && RExC_extralen) {
2686 reginsert(pRExC_state, LONGJMP,ret);
2687 reginsert(pRExC_state, NOTHING,ret);
2688 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2690 reginsert(pRExC_state, CURLYX,ret);
2692 Set_Node_Offset(ret, parse_start+1);
2693 Set_Node_Length(ret,
2694 op == '{' ? (RExC_parse - parse_start) : 1);
2696 if (!SIZE_ONLY && RExC_extralen)
2697 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2698 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2700 RExC_whilem_seen++, RExC_extralen += 3;
2701 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2709 if (max && max < min)
2710 vFAIL("Can't do {n,m} with n > m");
2712 ARG1_SET(ret, (U16)min);
2713 ARG2_SET(ret, (U16)max);
2725 #if 0 /* Now runtime fix should be reliable. */
2727 /* if this is reinstated, don't forget to put this back into perldiag:
2729 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2731 (F) The part of the regexp subject to either the * or + quantifier
2732 could match an empty string. The {#} shows in the regular
2733 expression about where the problem was discovered.
2737 if (!(flags&HASWIDTH) && op != '?')
2738 vFAIL("Regexp *+ operand could be empty");
2741 parse_start = RExC_parse;
2742 nextchar(pRExC_state);
2744 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2746 if (op == '*' && (flags&SIMPLE)) {
2747 reginsert(pRExC_state, STAR, ret);
2751 else if (op == '*') {
2755 else if (op == '+' && (flags&SIMPLE)) {
2756 reginsert(pRExC_state, PLUS, ret);
2760 else if (op == '+') {
2764 else if (op == '?') {
2769 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2771 "%.*s matches null string many times",
2772 RExC_parse - origparse,
2776 if (*RExC_parse == '?') {
2777 nextchar(pRExC_state);
2778 reginsert(pRExC_state, MINMOD, ret);
2779 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2781 if (ISMULT2(RExC_parse)) {
2783 vFAIL("Nested quantifiers");
2790 - regatom - the lowest level
2792 * Optimization: gobbles an entire sequence of ordinary characters so that
2793 * it can turn them into a single node, which is smaller to store and
2794 * faster to run. Backslashed characters are exceptions, each becoming a
2795 * separate node; the code is simpler that way and it's not worth fixing.
2797 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2799 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2801 register regnode *ret = 0;
2803 char *parse_start = RExC_parse;
2805 *flagp = WORST; /* Tentatively. */
2808 switch (*RExC_parse) {
2810 RExC_seen_zerolen++;
2811 nextchar(pRExC_state);
2812 if (RExC_flags & PMf_MULTILINE)
2813 ret = reg_node(pRExC_state, MBOL);
2814 else if (RExC_flags & PMf_SINGLELINE)
2815 ret = reg_node(pRExC_state, SBOL);
2817 ret = reg_node(pRExC_state, BOL);
2818 Set_Node_Length(ret, 1); /* MJD */
2821 nextchar(pRExC_state);
2823 RExC_seen_zerolen++;
2824 if (RExC_flags & PMf_MULTILINE)
2825 ret = reg_node(pRExC_state, MEOL);
2826 else if (RExC_flags & PMf_SINGLELINE)
2827 ret = reg_node(pRExC_state, SEOL);
2829 ret = reg_node(pRExC_state, EOL);
2830 Set_Node_Length(ret, 1); /* MJD */
2833 nextchar(pRExC_state);
2834 if (RExC_flags & PMf_SINGLELINE)
2835 ret = reg_node(pRExC_state, SANY);
2837 ret = reg_node(pRExC_state, REG_ANY);
2838 *flagp |= HASWIDTH|SIMPLE;
2840 Set_Node_Length(ret, 1); /* MJD */
2844 char *oregcomp_parse = ++RExC_parse;
2845 ret = regclass(pRExC_state);
2846 if (*RExC_parse != ']') {
2847 RExC_parse = oregcomp_parse;
2848 vFAIL("Unmatched [");
2850 nextchar(pRExC_state);
2851 *flagp |= HASWIDTH|SIMPLE;
2852 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2856 nextchar(pRExC_state);
2857 ret = reg(pRExC_state, 1, &flags);
2859 if (flags & TRYAGAIN) {
2860 if (RExC_parse == RExC_end) {
2861 /* Make parent create an empty node if needed. */
2869 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2873 if (flags & TRYAGAIN) {
2877 vFAIL("Internal urp");
2878 /* Supposed to be caught earlier. */
2881 if (!regcurly(RExC_parse)) {
2890 vFAIL("Quantifier follows nothing");
2893 switch (*++RExC_parse) {
2895 RExC_seen_zerolen++;
2896 ret = reg_node(pRExC_state, SBOL);
2898 nextchar(pRExC_state);
2899 Set_Node_Length(ret, 2); /* MJD */
2902 ret = reg_node(pRExC_state, GPOS);
2903 RExC_seen |= REG_SEEN_GPOS;
2905 nextchar(pRExC_state);
2906 Set_Node_Length(ret, 2); /* MJD */
2909 ret = reg_node(pRExC_state, SEOL);
2911 RExC_seen_zerolen++; /* Do not optimize RE away */
2912 nextchar(pRExC_state);
2915 ret = reg_node(pRExC_state, EOS);
2917 RExC_seen_zerolen++; /* Do not optimize RE away */
2918 nextchar(pRExC_state);
2919 Set_Node_Length(ret, 2); /* MJD */
2922 ret = reg_node(pRExC_state, CANY);
2923 RExC_seen |= REG_SEEN_CANY;
2924 *flagp |= HASWIDTH|SIMPLE;
2925 nextchar(pRExC_state);
2926 Set_Node_Length(ret, 2); /* MJD */
2929 ret = reg_node(pRExC_state, CLUMP);
2931 nextchar(pRExC_state);
2932 Set_Node_Length(ret, 2); /* MJD */
2935 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2936 *flagp |= HASWIDTH|SIMPLE;
2937 nextchar(pRExC_state);
2938 Set_Node_Length(ret, 2); /* MJD */
2941 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2942 *flagp |= HASWIDTH|SIMPLE;
2943 nextchar(pRExC_state);
2944 Set_Node_Length(ret, 2); /* MJD */
2947 RExC_seen_zerolen++;
2948 RExC_seen |= REG_SEEN_LOOKBEHIND;
2949 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2951 nextchar(pRExC_state);
2952 Set_Node_Length(ret, 2); /* MJD */
2955 RExC_seen_zerolen++;
2956 RExC_seen |= REG_SEEN_LOOKBEHIND;
2957 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2959 nextchar(pRExC_state);
2960 Set_Node_Length(ret, 2); /* MJD */
2963 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2964 *flagp |= HASWIDTH|SIMPLE;
2965 nextchar(pRExC_state);
2966 Set_Node_Length(ret, 2); /* MJD */
2969 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2970 *flagp |= HASWIDTH|SIMPLE;
2971 nextchar(pRExC_state);
2972 Set_Node_Length(ret, 2); /* MJD */
2975 ret = reg_node(pRExC_state, DIGIT);
2976 *flagp |= HASWIDTH|SIMPLE;
2977 nextchar(pRExC_state);
2978 Set_Node_Length(ret, 2); /* MJD */
2981 ret = reg_node(pRExC_state, NDIGIT);
2982 *flagp |= HASWIDTH|SIMPLE;
2983 nextchar(pRExC_state);
2984 Set_Node_Length(ret, 2); /* MJD */
2989 char* oldregxend = RExC_end;
2990 char* parse_start = RExC_parse - 2;
2992 if (RExC_parse[1] == '{') {
2993 /* a lovely hack--pretend we saw [\pX] instead */
2994 RExC_end = strchr(RExC_parse, '}');
2996 U8 c = (U8)*RExC_parse;
2998 RExC_end = oldregxend;
2999 vFAIL2("Missing right brace on \\%c{}", c);
3004 RExC_end = RExC_parse + 2;
3005 if (RExC_end > oldregxend)
3006 RExC_end = oldregxend;
3010 ret = regclass(pRExC_state);
3012 RExC_end = oldregxend;
3015 Set_Node_Offset(ret, parse_start + 2);
3016 Set_Node_Cur_Length(ret);
3017 nextchar(pRExC_state);
3018 *flagp |= HASWIDTH|SIMPLE;
3031 case '1': case '2': case '3': case '4':
3032 case '5': case '6': case '7': case '8': case '9':
3034 I32 num = atoi(RExC_parse);
3036 if (num > 9 && num >= RExC_npar)
3039 char * parse_start = RExC_parse - 1; /* MJD */
3040 while (isDIGIT(*RExC_parse))
3043 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3044 vFAIL("Reference to nonexistent group");
3046 ret = reganode(pRExC_state,
3047 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3051 /* override incorrect value set in reganode MJD */
3052 Set_Node_Offset(ret, parse_start+1);
3053 Set_Node_Cur_Length(ret); /* MJD */
3055 nextchar(pRExC_state);
3060 if (RExC_parse >= RExC_end)
3061 FAIL("Trailing \\");
3064 /* Do not generate `unrecognized' warnings here, we fall
3065 back into the quick-grab loop below */
3072 if (RExC_flags & PMf_EXTENDED) {
3073 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3074 if (RExC_parse < RExC_end)
3080 register STRLEN len;
3086 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3088 parse_start = RExC_parse - 1;
3094 ret = reg_node(pRExC_state,
3095 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3097 for (len = 0, p = RExC_parse - 1;
3098 len < 127 && p < RExC_end;
3103 if (RExC_flags & PMf_EXTENDED)
3104 p = regwhite(p, RExC_end);
3151 ender = ASCII_TO_NATIVE('\033');
3155 ender = ASCII_TO_NATIVE('\007');
3160 char* e = strchr(p, '}');
3164 vFAIL("Missing right brace on \\x{}");
3167 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3168 | PERL_SCAN_DISALLOW_PREFIX;
3170 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3177 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3179 ender = grok_hex(p, &numlen, &flags, NULL);
3185 ender = UCHARAT(p++);
3186 ender = toCTRL(ender);
3188 case '0': case '1': case '2': case '3':case '4':
3189 case '5': case '6': case '7': case '8':case '9':
3191 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3194 ender = grok_oct(p, &numlen, &flags, NULL);
3204 FAIL("Trailing \\");
3207 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3208 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3209 goto normal_default;
3214 if (UTF8_IS_START(*p) && UTF) {
3215 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3223 if (RExC_flags & PMf_EXTENDED)
3224 p = regwhite(p, RExC_end);
3226 /* Prime the casefolded buffer. */
3227 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3229 if (ISMULT2(p)) { /* Back off on ?+*. */
3236 /* Emit all the Unicode characters. */
3237 for (foldbuf = tmpbuf;
3239 foldlen -= numlen) {
3240 ender = utf8_to_uvchr(foldbuf, &numlen);
3242 reguni(pRExC_state, ender, s, &unilen);
3245 /* In EBCDIC the numlen
3246 * and unilen can differ. */
3248 if (numlen >= foldlen)
3252 break; /* "Can't happen." */
3256 reguni(pRExC_state, ender, s, &unilen);
3265 REGC((char)ender, s++);
3273 /* Emit all the Unicode characters. */
3274 for (foldbuf = tmpbuf;
3276 foldlen -= numlen) {
3277 ender = utf8_to_uvchr(foldbuf, &numlen);
3279 reguni(pRExC_state, ender, s, &unilen);
3282 /* In EBCDIC the numlen
3283 * and unilen can differ. */
3285 if (numlen >= foldlen)
3293 reguni(pRExC_state, ender, s, &unilen);
3302 REGC((char)ender, s++);
3306 Set_Node_Cur_Length(ret); /* MJD */
3307 nextchar(pRExC_state);
3309 /* len is STRLEN which is unsigned, need to copy to signed */
3312 vFAIL("Internal disaster");
3316 if (len == 1 && UNI_IS_INVARIANT(ender))
3321 RExC_size += STR_SZ(len);
3323 RExC_emit += STR_SZ(len);
3328 /* If the encoding pragma is in effect recode the text of
3329 * any EXACT-kind nodes. */
3330 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3331 STRLEN oldlen = STR_LEN(ret);
3332 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3336 if (sv_utf8_downgrade(sv, TRUE)) {
3337 char *s = sv_recode_to_utf8(sv, PL_encoding);
3338 STRLEN newlen = SvCUR(sv);
3343 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3344 (int)oldlen, STRING(ret),
3346 Copy(s, STRING(ret), newlen, char);
3347 STR_LEN(ret) += newlen - oldlen;
3348 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3350 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3358 S_regwhite(pTHX_ char *p, char *e)
3363 else if (*p == '#') {
3366 } while (p < e && *p != '\n');
3374 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3375 Character classes ([:foo:]) can also be negated ([:^foo:]).
3376 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3377 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3378 but trigger failures because they are currently unimplemented. */
3380 #define POSIXCC_DONE(c) ((c) == ':')
3381 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3382 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3385 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3388 I32 namedclass = OOB_NAMEDCLASS;
3390 if (value == '[' && RExC_parse + 1 < RExC_end &&
3391 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3392 POSIXCC(UCHARAT(RExC_parse))) {
3393 char c = UCHARAT(RExC_parse);
3394 char* s = RExC_parse++;
3396 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3398 if (RExC_parse == RExC_end)
3399 /* Grandfather lone [:, [=, [. */
3402 char* t = RExC_parse++; /* skip over the c */
3404 if (UCHARAT(RExC_parse) == ']') {
3405 RExC_parse++; /* skip over the ending ] */
3408 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3409 I32 skip = 5; /* the most common skip */
3413 if (strnEQ(posixcc, "alnum", 5))
3415 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3416 else if (strnEQ(posixcc, "alpha", 5))
3418 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3419 else if (strnEQ(posixcc, "ascii", 5))
3421 complement ? ANYOF_NASCII : ANYOF_ASCII;
3424 if (strnEQ(posixcc, "blank", 5))
3426 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3429 if (strnEQ(posixcc, "cntrl", 5))
3431 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3434 if (strnEQ(posixcc, "digit", 5))
3436 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3439 if (strnEQ(posixcc, "graph", 5))
3441 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3444 if (strnEQ(posixcc, "lower", 5))
3446 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3449 if (strnEQ(posixcc, "print", 5))
3451 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3452 else if (strnEQ(posixcc, "punct", 5))
3454 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3457 if (strnEQ(posixcc, "space", 5))
3459 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3462 if (strnEQ(posixcc, "upper", 5))
3464 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3466 case 'w': /* this is not POSIX, this is the Perl \w */
3467 if (strnEQ(posixcc, "word", 4)) {
3469 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3474 if (strnEQ(posixcc, "xdigit", 6)) {
3476 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3481 if (namedclass == OOB_NAMEDCLASS ||
3482 posixcc[skip] != ':' ||
3483 posixcc[skip+1] != ']')
3485 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3488 } else if (!SIZE_ONLY) {
3489 /* [[=foo=]] and [[.foo.]] are still future. */
3491 /* adjust RExC_parse so the warning shows after
3493 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3495 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3498 /* Maternal grandfather:
3499 * "[:" ending in ":" but not in ":]" */
3509 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3511 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3512 char *s = RExC_parse;
3515 while(*s && isALNUM(*s))
3517 if (*s && c == *s && s[1] == ']') {
3518 if (ckWARN(WARN_REGEXP))
3520 "POSIX syntax [%c %c] belongs inside character classes",
3523 /* [[=foo=]] and [[.foo.]] are still future. */
3524 if (POSIXCC_NOTYET(c)) {
3525 /* adjust RExC_parse so the error shows after
3527 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3529 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3536 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3539 register UV nextvalue;
3540 register IV prevvalue = OOB_UNICODE;
3541 register IV range = 0;
3542 register regnode *ret;
3545 char *rangebegin = 0;
3546 bool need_class = 0;
3547 SV *listsv = Nullsv;
3550 bool optimize_invert = TRUE;
3551 AV* unicode_alternate = 0;
3553 UV literal_endpoint = 0;
3556 ret = reganode(pRExC_state, ANYOF, 0);
3559 ANYOF_FLAGS(ret) = 0;
3561 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3565 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3569 RExC_size += ANYOF_SKIP;
3571 RExC_emit += ANYOF_SKIP;
3573 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3575 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3576 ANYOF_BITMAP_ZERO(ret);
3577 listsv = newSVpvn("# comment\n", 10);
3580 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3582 if (!SIZE_ONLY && POSIXCC(nextvalue))
3583 checkposixcc(pRExC_state);
3585 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3586 if (UCHARAT(RExC_parse) == ']')
3589 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3593 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3596 rangebegin = RExC_parse;
3598 value = utf8n_to_uvchr((U8*)RExC_parse,
3599 RExC_end - RExC_parse,
3601 RExC_parse += numlen;
3604 value = UCHARAT(RExC_parse++);
3605 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3606 if (value == '[' && POSIXCC(nextvalue))
3607 namedclass = regpposixcc(pRExC_state, value);
3608 else if (value == '\\') {
3610 value = utf8n_to_uvchr((U8*)RExC_parse,
3611 RExC_end - RExC_parse,
3613 RExC_parse += numlen;
3616 value = UCHARAT(RExC_parse++);
3617 /* Some compilers cannot handle switching on 64-bit integer
3618 * values, therefore value cannot be an UV. Yes, this will
3619 * be a problem later if we want switch on Unicode.
3620 * A similar issue a little bit later when switching on
3621 * namedclass. --jhi */
3622 switch ((I32)value) {
3623 case 'w': namedclass = ANYOF_ALNUM; break;
3624 case 'W': namedclass = ANYOF_NALNUM; break;
3625 case 's': namedclass = ANYOF_SPACE; break;
3626 case 'S': namedclass = ANYOF_NSPACE; break;
3627 case 'd': namedclass = ANYOF_DIGIT; break;
3628 case 'D': namedclass = ANYOF_NDIGIT; break;
3631 if (RExC_parse >= RExC_end)
3632 vFAIL2("Empty \\%c{}", (U8)value);
3633 if (*RExC_parse == '{') {
3635 e = strchr(RExC_parse++, '}');
3637 vFAIL2("Missing right brace on \\%c{}", c);
3638 while (isSPACE(UCHARAT(RExC_parse)))
3640 if (e == RExC_parse)
3641 vFAIL2("Empty \\%c{}", c);
3643 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3651 if (UCHARAT(RExC_parse) == '^') {
3654 value = value == 'p' ? 'P' : 'p'; /* toggle */
3655 while (isSPACE(UCHARAT(RExC_parse))) {
3661 Perl_sv_catpvf(aTHX_ listsv,
3662 "+utf8::%.*s\n", (int)n, RExC_parse);
3664 Perl_sv_catpvf(aTHX_ listsv,
3665 "!utf8::%.*s\n", (int)n, RExC_parse);
3668 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3669 namedclass = ANYOF_MAX; /* no official name, but it's named */
3671 case 'n': value = '\n'; break;
3672 case 'r': value = '\r'; break;
3673 case 't': value = '\t'; break;
3674 case 'f': value = '\f'; break;
3675 case 'b': value = '\b'; break;
3676 case 'e': value = ASCII_TO_NATIVE('\033');break;
3677 case 'a': value = ASCII_TO_NATIVE('\007');break;
3679 if (*RExC_parse == '{') {
3680 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3681 | PERL_SCAN_DISALLOW_PREFIX;
3682 e = strchr(RExC_parse++, '}');
3684 vFAIL("Missing right brace on \\x{}");
3686 numlen = e - RExC_parse;
3687 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3691 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3693 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3694 RExC_parse += numlen;
3698 value = UCHARAT(RExC_parse++);
3699 value = toCTRL(value);
3701 case '0': case '1': case '2': case '3': case '4':
3702 case '5': case '6': case '7': case '8': case '9':
3706 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3707 RExC_parse += numlen;
3711 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3713 "Unrecognized escape \\%c in character class passed through",
3717 } /* end of \blah */
3723 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3725 if (!SIZE_ONLY && !need_class)
3726 ANYOF_CLASS_ZERO(ret);
3730 /* a bad range like a-\d, a-[:digit:] ? */
3733 if (ckWARN(WARN_REGEXP))
3735 "False [] range \"%*.*s\"",
3736 RExC_parse - rangebegin,
3737 RExC_parse - rangebegin,
3739 if (prevvalue < 256) {
3740 ANYOF_BITMAP_SET(ret, prevvalue);
3741 ANYOF_BITMAP_SET(ret, '-');
3744 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3745 Perl_sv_catpvf(aTHX_ listsv,
3746 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3750 range = 0; /* this was not a true range */
3754 if (namedclass > OOB_NAMEDCLASS)
3755 optimize_invert = FALSE;
3756 /* Possible truncation here but in some 64-bit environments
3757 * the compiler gets heartburn about switch on 64-bit values.
3758 * A similar issue a little earlier when switching on value.
3760 switch ((I32)namedclass) {
3763 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3765 for (value = 0; value < 256; value++)
3767 ANYOF_BITMAP_SET(ret, value);
3769 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3773 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3775 for (value = 0; value < 256; value++)
3776 if (!isALNUM(value))
3777 ANYOF_BITMAP_SET(ret, value);
3779 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3783 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3785 for (value = 0; value < 256; value++)
3786 if (isALNUMC(value))
3787 ANYOF_BITMAP_SET(ret, value);
3789 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3793 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3795 for (value = 0; value < 256; value++)
3796 if (!isALNUMC(value))
3797 ANYOF_BITMAP_SET(ret, value);
3799 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3803 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3805 for (value = 0; value < 256; value++)
3807 ANYOF_BITMAP_SET(ret, value);
3809 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3813 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3815 for (value = 0; value < 256; value++)
3816 if (!isALPHA(value))
3817 ANYOF_BITMAP_SET(ret, value);
3819 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3823 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3826 for (value = 0; value < 128; value++)
3827 ANYOF_BITMAP_SET(ret, value);
3829 for (value = 0; value < 256; value++) {
3831 ANYOF_BITMAP_SET(ret, value);
3835 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3839 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3842 for (value = 128; value < 256; value++)
3843 ANYOF_BITMAP_SET(ret, value);
3845 for (value = 0; value < 256; value++) {
3846 if (!isASCII(value))
3847 ANYOF_BITMAP_SET(ret, value);
3851 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3855 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3857 for (value = 0; value < 256; value++)
3859 ANYOF_BITMAP_SET(ret, value);
3861 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3865 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3867 for (value = 0; value < 256; value++)
3868 if (!isBLANK(value))
3869 ANYOF_BITMAP_SET(ret, value);
3871 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3875 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3877 for (value = 0; value < 256; value++)
3879 ANYOF_BITMAP_SET(ret, value);
3881 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3885 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3887 for (value = 0; value < 256; value++)
3888 if (!isCNTRL(value))
3889 ANYOF_BITMAP_SET(ret, value);
3891 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3895 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3897 /* consecutive digits assumed */
3898 for (value = '0'; value <= '9'; value++)
3899 ANYOF_BITMAP_SET(ret, value);
3901 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3905 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3907 /* consecutive digits assumed */
3908 for (value = 0; value < '0'; value++)
3909 ANYOF_BITMAP_SET(ret, value);
3910 for (value = '9' + 1; value < 256; value++)
3911 ANYOF_BITMAP_SET(ret, value);
3913 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");