5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
8 /* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
10 * a regular expression.
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
18 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
22 /* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
27 /* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
32 #ifdef PERL_EXT_RE_BUILD
33 /* need to replace pregcomp et al, so enable that */
34 # ifndef PERL_IN_XSUB_RE
35 # define PERL_IN_XSUB_RE
37 /* need access to debugger hooks */
38 # if defined(PERL_EXT_RE_DEBUG) && !defined(DEBUGGING)
43 #ifdef PERL_IN_XSUB_RE
44 /* We *really* need to overwrite these symbols: */
45 # define Perl_pregcomp my_regcomp
46 # define Perl_regdump my_regdump
47 # define Perl_regprop my_regprop
48 # define Perl_pregfree my_regfree
49 # define Perl_re_intuit_string my_re_intuit_string
50 /* *These* symbols are masked to allow static link. */
51 # define Perl_regnext my_regnext
52 # define Perl_save_re_context my_save_re_context
53 # define Perl_reginitcolors my_reginitcolors
55 # define PERL_NO_GET_CONTEXT
60 * pregcomp and pregexec -- regsub and regerror are not used in perl
62 * Copyright (c) 1986 by University of Toronto.
63 * Written by Henry Spencer. Not derived from licensed software.
65 * Permission is granted to anyone to use this software for any
66 * purpose on any computer system, and to redistribute it freely,
67 * subject to the following restrictions:
69 * 1. The author is not responsible for the consequences of use of
70 * this software, no matter how awful, even if they arise
73 * 2. The origin of this software must not be misrepresented, either
74 * by explicit claim or by omission.
76 * 3. Altered versions must be plainly marked as such, and must not
77 * be misrepresented as being the original software.
80 **** Alterations to Henry's code are...
82 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
83 **** 2000, 2001, 2002, 2003, by Larry Wall and others
85 **** You may distribute under the terms of either the GNU General Public
86 **** License or the Artistic License, as specified in the README file.
89 * Beware that some of this code is subtly aware of the way operator
90 * precedence is structured in regular expressions. Serious changes in
91 * regular-expression syntax might require a total rethink.
94 #define PERL_IN_REGCOMP_C
97 #ifndef PERL_IN_XSUB_RE
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
121 typedef struct RExC_state_t {
122 U32 flags; /* are we folding, multilining? */
123 char *precomp; /* uncompiled string. */
125 char *start; /* Start of input for compile */
126 char *end; /* End of input for compile */
127 char *parse; /* Input-scan pointer. */
128 I32 whilem_seen; /* number of WHILEM in this expr */
129 regnode *emit_start; /* Start of emitted-code area */
130 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
131 I32 naughty; /* How bad is this pattern? */
132 I32 sawback; /* Did we see \1, ...? */
134 I32 size; /* Code size. */
135 I32 npar; /* () count. */
141 char *starttry; /* -Dr: where regtry was called. */
142 #define RExC_starttry (pRExC_state->starttry)
146 #define RExC_flags (pRExC_state->flags)
147 #define RExC_precomp (pRExC_state->precomp)
148 #define RExC_rx (pRExC_state->rx)
149 #define RExC_start (pRExC_state->start)
150 #define RExC_end (pRExC_state->end)
151 #define RExC_parse (pRExC_state->parse)
152 #define RExC_whilem_seen (pRExC_state->whilem_seen)
153 #define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
154 #define RExC_emit (pRExC_state->emit)
155 #define RExC_emit_start (pRExC_state->emit_start)
156 #define RExC_naughty (pRExC_state->naughty)
157 #define RExC_sawback (pRExC_state->sawback)
158 #define RExC_seen (pRExC_state->seen)
159 #define RExC_size (pRExC_state->size)
160 #define RExC_npar (pRExC_state->npar)
161 #define RExC_extralen (pRExC_state->extralen)
162 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
163 #define RExC_seen_evals (pRExC_state->seen_evals)
164 #define RExC_utf8 (pRExC_state->utf8)
166 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
167 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
168 ((*s) == '{' && regcurly(s)))
171 #undef SPSTART /* dratted cpp namespace... */
174 * Flags to be passed up and down.
176 #define WORST 0 /* Worst case. */
177 #define HASWIDTH 0x1 /* Known to match non-null strings. */
178 #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */
179 #define SPSTART 0x4 /* Starts with * or +. */
180 #define TRYAGAIN 0x8 /* Weeded out a declaration. */
182 /* Length of a variant. */
184 typedef struct scan_data_t {
190 I32 last_end; /* min value, <0 unless valid. */
193 SV **longest; /* Either &l_fixed, or &l_float. */
197 I32 offset_float_min;
198 I32 offset_float_max;
202 struct regnode_charclass_class *start_class;
206 * Forward declarations for pregcomp()'s friends.
209 static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
212 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
213 #define SF_BEFORE_SEOL 0x1
214 #define SF_BEFORE_MEOL 0x2
215 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
216 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
219 # define SF_FIX_SHIFT_EOL (0+2)
220 # define SF_FL_SHIFT_EOL (0+4)
222 # define SF_FIX_SHIFT_EOL (+2)
223 # define SF_FL_SHIFT_EOL (+4)
226 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
227 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
229 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
230 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
231 #define SF_IS_INF 0x40
232 #define SF_HAS_PAR 0x80
233 #define SF_IN_PAR 0x100
234 #define SF_HAS_EVAL 0x200
235 #define SCF_DO_SUBSTR 0x400
236 #define SCF_DO_STCLASS_AND 0x0800
237 #define SCF_DO_STCLASS_OR 0x1000
238 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
239 #define SCF_WHILEM_VISITED_POS 0x2000
241 #define UTF (RExC_utf8 != 0)
242 #define LOC ((RExC_flags & PMf_LOCALE) != 0)
243 #define FOLD ((RExC_flags & PMf_FOLD) != 0)
245 #define OOB_UNICODE 12345678
246 #define OOB_NAMEDCLASS -1
248 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
249 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
252 /* length of regex to show in messages that don't mark a position within */
253 #define RegexLengthToShowInErrorMessages 127
256 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
257 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
258 * op/pragma/warn/regcomp.
260 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
261 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
263 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
266 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
267 * arg. Show regex, up to a maximum length. If it's too long, chop and add
270 #define FAIL(msg) STMT_START { \
271 char *ellipses = ""; \
272 IV len = RExC_end - RExC_precomp; \
275 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
276 if (len > RegexLengthToShowInErrorMessages) { \
277 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
278 len = RegexLengthToShowInErrorMessages - 10; \
281 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
282 msg, (int)len, RExC_precomp, ellipses); \
286 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
287 * args. Show regex, up to a maximum length. If it's too long, chop and add
290 #define FAIL2(pat,msg) STMT_START { \
291 char *ellipses = ""; \
292 IV len = RExC_end - RExC_precomp; \
295 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
296 if (len > RegexLengthToShowInErrorMessages) { \
297 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
298 len = RegexLengthToShowInErrorMessages - 10; \
301 S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
302 msg, (int)len, RExC_precomp, ellipses); \
307 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
309 #define Simple_vFAIL(m) STMT_START { \
310 IV offset = RExC_parse - RExC_precomp; \
311 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
312 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
316 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
318 #define vFAIL(m) STMT_START { \
320 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
325 * Like Simple_vFAIL(), but accepts two arguments.
327 #define Simple_vFAIL2(m,a1) STMT_START { \
328 IV offset = RExC_parse - RExC_precomp; \
329 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
330 (int)offset, RExC_precomp, RExC_precomp + offset); \
334 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
336 #define vFAIL2(m,a1) STMT_START { \
338 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
339 Simple_vFAIL2(m, a1); \
344 * Like Simple_vFAIL(), but accepts three arguments.
346 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
347 IV offset = RExC_parse - RExC_precomp; \
348 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
349 (int)offset, RExC_precomp, RExC_precomp + offset); \
353 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
355 #define vFAIL3(m,a1,a2) STMT_START { \
357 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
358 Simple_vFAIL3(m, a1, a2); \
362 * Like Simple_vFAIL(), but accepts four arguments.
364 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
365 IV offset = RExC_parse - RExC_precomp; \
366 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
367 (int)offset, RExC_precomp, RExC_precomp + offset); \
371 * Like Simple_vFAIL(), but accepts five arguments.
373 #define Simple_vFAIL5(m, a1, a2, a3, a4) STMT_START { \
374 IV offset = RExC_parse - RExC_precomp; \
375 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4, \
376 (int)offset, RExC_precomp, RExC_precomp + offset); \
380 #define vWARN(loc,m) STMT_START { \
381 IV offset = loc - RExC_precomp; \
382 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
383 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
386 #define vWARNdep(loc,m) STMT_START { \
387 IV offset = loc - RExC_precomp; \
388 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
389 "%s" REPORT_LOCATION, \
390 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
394 #define vWARN2(loc, m, a1) STMT_START { \
395 IV offset = loc - RExC_precomp; \
396 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
397 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
400 #define vWARN3(loc, m, a1, a2) STMT_START { \
401 IV offset = loc - RExC_precomp; \
402 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
403 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
406 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
407 IV offset = loc - RExC_precomp; \
408 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
409 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
412 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
413 IV offset = loc - RExC_precomp; \
414 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
415 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
419 /* Allow for side effects in s */
420 #define REGC(c,s) STMT_START { \
421 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
424 /* Macros for recording node offsets. 20001227 mjd@plover.com
425 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
426 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
427 * Element 0 holds the number n.
430 #define MJD_OFFSET_DEBUG(x)
431 /* #define MJD_OFFSET_DEBUG(x) Perl_warn_nocontext x */
434 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
436 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
437 __LINE__, (node), (byte))); \
439 Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
441 RExC_offsets[2*(node)-1] = (byte); \
446 #define Set_Node_Offset(node,byte) \
447 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
448 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
450 #define Set_Node_Length_To_R(node,len) STMT_START { \
452 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
453 __LINE__, (node), (len))); \
455 Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
457 RExC_offsets[2*(node)] = (len); \
462 #define Set_Node_Length(node,len) \
463 Set_Node_Length_To_R((node)-RExC_emit_start, len)
464 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
465 #define Set_Node_Cur_Length(node) \
466 Set_Node_Length(node, RExC_parse - parse_start)
468 /* Get offsets and lengths */
469 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
470 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
472 static void clear_re(pTHX_ void *r);
474 /* Mark that we cannot extend a found fixed substring at this point.
475 Updata the longest found anchored substring and the longest found
476 floating substrings if needed. */
479 S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
481 STRLEN l = CHR_SVLEN(data->last_found);
482 STRLEN old_l = CHR_SVLEN(*data->longest);
484 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
485 SvSetMagicSV(*data->longest, data->last_found);
486 if (*data->longest == data->longest_fixed) {
487 data->offset_fixed = l ? data->last_start_min : data->pos_min;
488 if (data->flags & SF_BEFORE_EOL)
490 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
492 data->flags &= ~SF_FIX_BEFORE_EOL;
495 data->offset_float_min = l ? data->last_start_min : data->pos_min;
496 data->offset_float_max = (l
497 ? data->last_start_max
498 : data->pos_min + data->pos_delta);
499 if ((U32)data->offset_float_max > (U32)I32_MAX)
500 data->offset_float_max = I32_MAX;
501 if (data->flags & SF_BEFORE_EOL)
503 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
505 data->flags &= ~SF_FL_BEFORE_EOL;
508 SvCUR_set(data->last_found, 0);
510 SV * sv = data->last_found;
512 SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL;
513 if (mg && mg->mg_len > 0)
517 data->flags &= ~SF_BEFORE_EOL;
520 /* Can match anything (initialization) */
522 S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
524 ANYOF_CLASS_ZERO(cl);
525 ANYOF_BITMAP_SETALL(cl);
526 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
528 cl->flags |= ANYOF_LOCALE;
531 /* Can match anything (initialization) */
533 S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
537 for (value = 0; value <= ANYOF_MAX; value += 2)
538 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
540 if (!(cl->flags & ANYOF_UNICODE_ALL))
542 if (!ANYOF_BITMAP_TESTALLSET(cl))
547 /* Can match anything (initialization) */
549 S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
551 Zero(cl, 1, struct regnode_charclass_class);
553 cl_anything(pRExC_state, cl);
557 S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
559 Zero(cl, 1, struct regnode_charclass_class);
561 cl_anything(pRExC_state, cl);
563 cl->flags |= ANYOF_LOCALE;
566 /* 'And' a given class with another one. Can create false positives */
567 /* We assume that cl is not inverted */
569 S_cl_and(pTHX_ struct regnode_charclass_class *cl,
570 struct regnode_charclass_class *and_with)
572 if (!(and_with->flags & ANYOF_CLASS)
573 && !(cl->flags & ANYOF_CLASS)
574 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
575 && !(and_with->flags & ANYOF_FOLD)
576 && !(cl->flags & ANYOF_FOLD)) {
579 if (and_with->flags & ANYOF_INVERT)
580 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
581 cl->bitmap[i] &= ~and_with->bitmap[i];
583 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
584 cl->bitmap[i] &= and_with->bitmap[i];
585 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
586 if (!(and_with->flags & ANYOF_EOS))
587 cl->flags &= ~ANYOF_EOS;
589 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
590 !(and_with->flags & ANYOF_INVERT)) {
591 cl->flags &= ~ANYOF_UNICODE_ALL;
592 cl->flags |= ANYOF_UNICODE;
593 ARG_SET(cl, ARG(and_with));
595 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
596 !(and_with->flags & ANYOF_INVERT))
597 cl->flags &= ~ANYOF_UNICODE_ALL;
598 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
599 !(and_with->flags & ANYOF_INVERT))
600 cl->flags &= ~ANYOF_UNICODE;
603 /* 'OR' a given class with another one. Can create false positives */
604 /* We assume that cl is not inverted */
606 S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
608 if (or_with->flags & ANYOF_INVERT) {
610 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
611 * <= (B1 | !B2) | (CL1 | !CL2)
612 * which is wasteful if CL2 is small, but we ignore CL2:
613 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
614 * XXXX Can we handle case-fold? Unclear:
615 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
616 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
618 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
619 && !(or_with->flags & ANYOF_FOLD)
620 && !(cl->flags & ANYOF_FOLD) ) {
623 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
624 cl->bitmap[i] |= ~or_with->bitmap[i];
625 } /* XXXX: logic is complicated otherwise */
627 cl_anything(pRExC_state, cl);
630 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
631 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
632 && (!(or_with->flags & ANYOF_FOLD)
633 || (cl->flags & ANYOF_FOLD)) ) {
636 /* OR char bitmap and class bitmap separately */
637 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
638 cl->bitmap[i] |= or_with->bitmap[i];
639 if (or_with->flags & ANYOF_CLASS) {
640 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
641 cl->classflags[i] |= or_with->classflags[i];
642 cl->flags |= ANYOF_CLASS;
645 else { /* XXXX: logic is complicated, leave it along for a moment. */
646 cl_anything(pRExC_state, cl);
649 if (or_with->flags & ANYOF_EOS)
650 cl->flags |= ANYOF_EOS;
652 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
653 ARG(cl) != ARG(or_with)) {
654 cl->flags |= ANYOF_UNICODE_ALL;
655 cl->flags &= ~ANYOF_UNICODE;
657 if (or_with->flags & ANYOF_UNICODE_ALL) {
658 cl->flags |= ANYOF_UNICODE_ALL;
659 cl->flags &= ~ANYOF_UNICODE;
664 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
665 * These need to be revisited when a newer toolchain becomes available.
667 #if defined(__sparc64__) && defined(__GNUC__)
668 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
669 # undef SPARC64_GCC_WORKAROUND
670 # define SPARC64_GCC_WORKAROUND 1
674 /* REx optimizer. Converts nodes into quickier variants "in place".
675 Finds fixed substrings. */
677 /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set
678 to the position after last scanned or to NULL. */
681 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
682 /* scanp: Start here (read-write). */
683 /* deltap: Write maxlen-minlen here. */
684 /* last: Stop before this one. */
686 I32 min = 0, pars = 0, code;
687 regnode *scan = *scanp, *next;
689 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
690 int is_inf_internal = 0; /* The studied chunk is infinite */
691 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
692 scan_data_t data_fake;
693 struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
695 while (scan && OP(scan) != END && scan < last) {
696 /* Peephole optimizer: */
698 if (PL_regkind[(U8)OP(scan)] == EXACT) {
699 /* Merge several consecutive EXACTish nodes into one. */
700 regnode *n = regnext(scan);
703 regnode *stop = scan;
706 next = scan + NODE_SZ_STR(scan);
707 /* Skip NOTHING, merge EXACT*. */
709 ( PL_regkind[(U8)OP(n)] == NOTHING ||
710 (stringok && (OP(n) == OP(scan))))
712 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
713 if (OP(n) == TAIL || n > next)
715 if (PL_regkind[(U8)OP(n)] == NOTHING) {
716 NEXT_OFF(scan) += NEXT_OFF(n);
717 next = n + NODE_STEP_REGNODE;
725 int oldl = STR_LEN(scan);
726 regnode *nnext = regnext(n);
728 if (oldl + STR_LEN(n) > U8_MAX)
730 NEXT_OFF(scan) += NEXT_OFF(n);
731 STR_LEN(scan) += STR_LEN(n);
732 next = n + NODE_SZ_STR(n);
733 /* Now we can overwrite *n : */
734 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
742 if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
744 Two problematic code points in Unicode casefolding of EXACT nodes:
746 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
747 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
753 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
754 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
756 This means that in case-insensitive matching (or "loose matching",
757 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
758 length of the above casefolded versions) can match a target string
759 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
760 This would rather mess up the minimum length computation.
762 What we'll do is to look for the tail four bytes, and then peek
763 at the preceding two bytes to see whether we need to decrease
764 the minimum length by four (six minus two).
766 Thanks to the design of UTF-8, there cannot be false matches:
767 A sequence of valid UTF-8 bytes cannot be a subsequence of
768 another valid sequence of UTF-8 bytes.
771 char *s0 = STRING(scan), *s, *t;
772 char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
773 char *t0 = "\xcc\x88\xcc\x81";
777 s < s2 && (t = ninstr(s, s1, t0, t1));
779 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
780 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
787 n = scan + NODE_SZ_STR(scan);
789 if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) {
797 /* Follow the next-chain of the current node and optimize
798 away all the NOTHINGs from it. */
799 if (OP(scan) != CURLYX) {
800 int max = (reg_off_by_arg[OP(scan)]
802 /* I32 may be smaller than U16 on CRAYs! */
803 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
804 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
808 /* Skip NOTHING and LONGJMP. */
809 while ((n = regnext(n))
810 && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
811 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
814 if (reg_off_by_arg[OP(scan)])
817 NEXT_OFF(scan) = off;
819 /* The principal pseudo-switch. Cannot be a switch, since we
820 look into several different things. */
821 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
822 || OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
823 next = regnext(scan);
826 if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
827 I32 max1 = 0, min1 = I32_MAX, num = 0;
828 struct regnode_charclass_class accum;
830 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
831 scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
832 if (flags & SCF_DO_STCLASS)
833 cl_init_zero(pRExC_state, &accum);
834 while (OP(scan) == code) {
835 I32 deltanext, minnext, f = 0, fake;
836 struct regnode_charclass_class this_class;
841 data_fake.whilem_c = data->whilem_c;
842 data_fake.last_closep = data->last_closep;
845 data_fake.last_closep = &fake;
846 next = regnext(scan);
847 scan = NEXTOPER(scan);
849 scan = NEXTOPER(scan);
850 if (flags & SCF_DO_STCLASS) {
851 cl_init(pRExC_state, &this_class);
852 data_fake.start_class = &this_class;
853 f = SCF_DO_STCLASS_AND;
855 if (flags & SCF_WHILEM_VISITED_POS)
856 f |= SCF_WHILEM_VISITED_POS;
857 /* we suppose the run is continuous, last=next...*/
858 minnext = study_chunk(pRExC_state, &scan, &deltanext,
859 next, &data_fake, f);
862 if (max1 < minnext + deltanext)
863 max1 = minnext + deltanext;
864 if (deltanext == I32_MAX)
865 is_inf = is_inf_internal = 1;
867 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
869 if (data && (data_fake.flags & SF_HAS_EVAL))
870 data->flags |= SF_HAS_EVAL;
872 data->whilem_c = data_fake.whilem_c;
873 if (flags & SCF_DO_STCLASS)
874 cl_or(pRExC_state, &accum, &this_class);
878 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
880 if (flags & SCF_DO_SUBSTR) {
881 data->pos_min += min1;
882 data->pos_delta += max1 - min1;
883 if (max1 != min1 || is_inf)
884 data->longest = &(data->longest_float);
887 delta += max1 - min1;
888 if (flags & SCF_DO_STCLASS_OR) {
889 cl_or(pRExC_state, data->start_class, &accum);
891 cl_and(data->start_class, &and_with);
892 flags &= ~SCF_DO_STCLASS;
895 else if (flags & SCF_DO_STCLASS_AND) {
897 cl_and(data->start_class, &accum);
898 flags &= ~SCF_DO_STCLASS;
901 /* Switch to OR mode: cache the old value of
902 * data->start_class */
903 StructCopy(data->start_class, &and_with,
904 struct regnode_charclass_class);
905 flags &= ~SCF_DO_STCLASS_AND;
906 StructCopy(&accum, data->start_class,
907 struct regnode_charclass_class);
908 flags |= SCF_DO_STCLASS_OR;
909 data->start_class->flags |= ANYOF_EOS;
913 else if (code == BRANCHJ) /* single branch is optimized. */
914 scan = NEXTOPER(NEXTOPER(scan));
915 else /* single branch is optimized. */
916 scan = NEXTOPER(scan);
919 else if (OP(scan) == EXACT) {
920 I32 l = STR_LEN(scan);
921 UV uc = *((U8*)STRING(scan));
923 U8 *s = (U8*)STRING(scan);
924 l = utf8_length(s, s + l);
925 uc = utf8_to_uvchr(s, NULL);
928 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
929 /* The code below prefers earlier match for fixed
930 offset, later match for variable offset. */
931 if (data->last_end == -1) { /* Update the start info. */
932 data->last_start_min = data->pos_min;
933 data->last_start_max = is_inf
934 ? I32_MAX : data->pos_min + data->pos_delta;
936 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
938 SV * sv = data->last_found;
939 MAGIC *mg = SvUTF8(sv) && SvMAGICAL(sv) ?
940 mg_find(sv, PERL_MAGIC_utf8) : NULL;
941 if (mg && mg->mg_len >= 0)
942 mg->mg_len += utf8_length((U8*)STRING(scan),
943 (U8*)STRING(scan)+STR_LEN(scan));
946 SvUTF8_on(data->last_found);
947 data->last_end = data->pos_min + l;
948 data->pos_min += l; /* As in the first entry. */
949 data->flags &= ~SF_BEFORE_EOL;
951 if (flags & SCF_DO_STCLASS_AND) {
952 /* Check whether it is compatible with what we know already! */
956 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
957 && !ANYOF_BITMAP_TEST(data->start_class, uc)
958 && (!(data->start_class->flags & ANYOF_FOLD)
959 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
962 ANYOF_CLASS_ZERO(data->start_class);
963 ANYOF_BITMAP_ZERO(data->start_class);
965 ANYOF_BITMAP_SET(data->start_class, uc);
966 data->start_class->flags &= ~ANYOF_EOS;
968 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
970 else if (flags & SCF_DO_STCLASS_OR) {
971 /* false positive possible if the class is case-folded */
973 ANYOF_BITMAP_SET(data->start_class, uc);
975 data->start_class->flags |= ANYOF_UNICODE_ALL;
976 data->start_class->flags &= ~ANYOF_EOS;
977 cl_and(data->start_class, &and_with);
979 flags &= ~SCF_DO_STCLASS;
981 else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
982 I32 l = STR_LEN(scan);
983 UV uc = *((U8*)STRING(scan));
985 /* Search for fixed substrings supports EXACT only. */
986 if (flags & SCF_DO_SUBSTR)
987 scan_commit(pRExC_state, data);
989 U8 *s = (U8 *)STRING(scan);
990 l = utf8_length(s, s + l);
991 uc = utf8_to_uvchr(s, NULL);
994 if (data && (flags & SCF_DO_SUBSTR))
996 if (flags & SCF_DO_STCLASS_AND) {
997 /* Check whether it is compatible with what we know already! */
1001 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
1002 && !ANYOF_BITMAP_TEST(data->start_class, uc)
1003 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
1005 ANYOF_CLASS_ZERO(data->start_class);
1006 ANYOF_BITMAP_ZERO(data->start_class);
1008 ANYOF_BITMAP_SET(data->start_class, uc);
1009 data->start_class->flags &= ~ANYOF_EOS;
1010 data->start_class->flags |= ANYOF_FOLD;
1011 if (OP(scan) == EXACTFL)
1012 data->start_class->flags |= ANYOF_LOCALE;
1015 else if (flags & SCF_DO_STCLASS_OR) {
1016 if (data->start_class->flags & ANYOF_FOLD) {
1017 /* false positive possible if the class is case-folded.
1018 Assume that the locale settings are the same... */
1020 ANYOF_BITMAP_SET(data->start_class, uc);
1021 data->start_class->flags &= ~ANYOF_EOS;
1023 cl_and(data->start_class, &and_with);
1025 flags &= ~SCF_DO_STCLASS;
1027 else if (strchr((char*)PL_varies,OP(scan))) {
1028 I32 mincount, maxcount, minnext, deltanext, fl = 0;
1029 I32 f = flags, pos_before = 0;
1030 regnode *oscan = scan;
1031 struct regnode_charclass_class this_class;
1032 struct regnode_charclass_class *oclass = NULL;
1033 I32 next_is_eval = 0;
1035 switch (PL_regkind[(U8)OP(scan)]) {
1036 case WHILEM: /* End of (?:...)* . */
1037 scan = NEXTOPER(scan);
1040 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
1041 next = NEXTOPER(scan);
1042 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
1044 maxcount = REG_INFTY;
1045 next = regnext(scan);
1046 scan = NEXTOPER(scan);
1050 if (flags & SCF_DO_SUBSTR)
1055 if (flags & SCF_DO_STCLASS) {
1057 maxcount = REG_INFTY;
1058 next = regnext(scan);
1059 scan = NEXTOPER(scan);
1062 is_inf = is_inf_internal = 1;
1063 scan = regnext(scan);
1064 if (flags & SCF_DO_SUBSTR) {
1065 scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
1066 data->longest = &(data->longest_float);
1068 goto optimize_curly_tail;
1070 mincount = ARG1(scan);
1071 maxcount = ARG2(scan);
1072 next = regnext(scan);
1073 if (OP(scan) == CURLYX) {
1074 I32 lp = (data ? *(data->last_closep) : 0);
1076 scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
1078 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
1079 next_is_eval = (OP(scan) == EVAL);
1081 if (flags & SCF_DO_SUBSTR) {
1082 if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
1083 pos_before = data->pos_min;
1087 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
1089 data->flags |= SF_IS_INF;
1091 if (flags & SCF_DO_STCLASS) {
1092 cl_init(pRExC_state, &this_class);
1093 oclass = data->start_class;
1094 data->start_class = &this_class;
1095 f |= SCF_DO_STCLASS_AND;
1096 f &= ~SCF_DO_STCLASS_OR;
1098 /* These are the cases when once a subexpression
1099 fails at a particular position, it cannot succeed
1100 even after backtracking at the enclosing scope.
1102 XXXX what if minimal match and we are at the
1103 initial run of {n,m}? */
1104 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
1105 f &= ~SCF_WHILEM_VISITED_POS;
1107 /* This will finish on WHILEM, setting scan, or on NULL: */
1108 minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
1110 ? (f & ~SCF_DO_SUBSTR) : f);
1112 if (flags & SCF_DO_STCLASS)
1113 data->start_class = oclass;
1114 if (mincount == 0 || minnext == 0) {
1115 if (flags & SCF_DO_STCLASS_OR) {
1116 cl_or(pRExC_state, data->start_class, &this_class);
1118 else if (flags & SCF_DO_STCLASS_AND) {
1119 /* Switch to OR mode: cache the old value of
1120 * data->start_class */
1121 StructCopy(data->start_class, &and_with,
1122 struct regnode_charclass_class);
1123 flags &= ~SCF_DO_STCLASS_AND;
1124 StructCopy(&this_class, data->start_class,
1125 struct regnode_charclass_class);
1126 flags |= SCF_DO_STCLASS_OR;
1127 data->start_class->flags |= ANYOF_EOS;
1129 } else { /* Non-zero len */
1130 if (flags & SCF_DO_STCLASS_OR) {
1131 cl_or(pRExC_state, data->start_class, &this_class);
1132 cl_and(data->start_class, &and_with);
1134 else if (flags & SCF_DO_STCLASS_AND)
1135 cl_and(data->start_class, &this_class);
1136 flags &= ~SCF_DO_STCLASS;
1138 if (!scan) /* It was not CURLYX, but CURLY. */
1140 if (ckWARN(WARN_REGEXP)
1141 /* ? quantifier ok, except for (?{ ... }) */
1142 && (next_is_eval || !(mincount == 0 && maxcount == 1))
1143 && (minnext == 0) && (deltanext == 0)
1144 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
1145 && maxcount <= REG_INFTY/3) /* Complement check for big count */
1148 "Quantifier unexpected on zero-length expression");
1151 min += minnext * mincount;
1152 is_inf_internal |= ((maxcount == REG_INFTY
1153 && (minnext + deltanext) > 0)
1154 || deltanext == I32_MAX);
1155 is_inf |= is_inf_internal;
1156 delta += (minnext + deltanext) * maxcount - minnext * mincount;
1158 /* Try powerful optimization CURLYX => CURLYN. */
1159 if ( OP(oscan) == CURLYX && data
1160 && data->flags & SF_IN_PAR
1161 && !(data->flags & SF_HAS_EVAL)
1162 && !deltanext && minnext == 1 ) {
1163 /* Try to optimize to CURLYN. */
1164 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
1165 regnode *nxt1 = nxt;
1172 if (!strchr((char*)PL_simple,OP(nxt))
1173 && !(PL_regkind[(U8)OP(nxt)] == EXACT
1174 && STR_LEN(nxt) == 1))
1180 if (OP(nxt) != CLOSE)
1182 /* Now we know that nxt2 is the only contents: */
1183 oscan->flags = (U8)ARG(nxt);
1185 OP(nxt1) = NOTHING; /* was OPEN. */
1187 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1188 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
1189 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
1190 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1191 OP(nxt + 1) = OPTIMIZED; /* was count. */
1192 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
1197 /* Try optimization CURLYX => CURLYM. */
1198 if ( OP(oscan) == CURLYX && data
1199 && !(data->flags & SF_HAS_PAR)
1200 && !(data->flags & SF_HAS_EVAL)
1201 && !deltanext /* atom is fixed width */
1202 && minnext != 0 /* CURLYM can't handle zero width */
1204 /* XXXX How to optimize if data == 0? */
1205 /* Optimize to a simpler form. */
1206 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
1210 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
1211 && (OP(nxt2) != WHILEM))
1213 OP(nxt2) = SUCCEED; /* Whas WHILEM */
1214 /* Need to optimize away parenths. */
1215 if (data->flags & SF_IN_PAR) {
1216 /* Set the parenth number. */
1217 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
1219 if (OP(nxt) != CLOSE)
1220 FAIL("Panic opt close");
1221 oscan->flags = (U8)ARG(nxt);
1222 OP(nxt1) = OPTIMIZED; /* was OPEN. */
1223 OP(nxt) = OPTIMIZED; /* was CLOSE. */
1225 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
1226 OP(nxt + 1) = OPTIMIZED; /* was count. */
1227 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
1228 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
1231 while ( nxt1 && (OP(nxt1) != WHILEM)) {
1232 regnode *nnxt = regnext(nxt1);
1235 if (reg_off_by_arg[OP(nxt1)])
1236 ARG_SET(nxt1, nxt2 - nxt1);
1237 else if (nxt2 - nxt1 < U16_MAX)
1238 NEXT_OFF(nxt1) = nxt2 - nxt1;
1240 OP(nxt) = NOTHING; /* Cannot beautify */
1245 /* Optimize again: */
1246 study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
1252 else if ((OP(oscan) == CURLYX)
1253 && (flags & SCF_WHILEM_VISITED_POS)
1254 /* See the comment on a similar expression above.
1255 However, this time it not a subexpression
1256 we care about, but the expression itself. */
1257 && (maxcount == REG_INFTY)
1258 && data && ++data->whilem_c < 16) {
1259 /* This stays as CURLYX, we can put the count/of pair. */
1260 /* Find WHILEM (as in regexec.c) */
1261 regnode *nxt = oscan + NEXT_OFF(oscan);
1263 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
1265 PREVOPER(nxt)->flags = (U8)(data->whilem_c
1266 | (RExC_whilem_seen << 4)); /* On WHILEM */
1268 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
1270 if (flags & SCF_DO_SUBSTR) {
1271 SV *last_str = Nullsv;
1272 int counted = mincount != 0;
1274 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
1275 #if defined(SPARC64_GCC_WORKAROUND)
1281 if (pos_before >= data->last_start_min)
1284 b = data->last_start_min;
1287 s = SvPV(data->last_found, l);
1288 old = b - data->last_start_min;
1291 I32 b = pos_before >= data->last_start_min
1292 ? pos_before : data->last_start_min;
1294 char *s = SvPV(data->last_found, l);
1295 I32 old = b - data->last_start_min;
1299 old = utf8_hop((U8*)s, old) - (U8*)s;
1302 /* Get the added string: */
1303 last_str = newSVpvn(s + old, l);
1305 SvUTF8_on(last_str);
1306 if (deltanext == 0 && pos_before == b) {
1307 /* What was added is a constant string */
1309 SvGROW(last_str, (mincount * l) + 1);
1310 repeatcpy(SvPVX(last_str) + l,
1311 SvPVX(last_str), l, mincount - 1);
1312 SvCUR(last_str) *= mincount;
1313 /* Add additional parts. */
1314 SvCUR_set(data->last_found,
1315 SvCUR(data->last_found) - l);
1316 sv_catsv(data->last_found, last_str);
1318 SV * sv = data->last_found;
1320 SvUTF8(sv) && SvMAGICAL(sv) ?
1321 mg_find(sv, PERL_MAGIC_utf8) : NULL;
1322 if (mg && mg->mg_len >= 0)
1323 mg->mg_len += CHR_SVLEN(last_str);
1325 data->last_end += l * (mincount - 1);
1328 /* start offset must point into the last copy */
1329 data->last_start_min += minnext * (mincount - 1);
1330 data->last_start_max += is_inf ? I32_MAX
1331 : (maxcount - 1) * (minnext + data->pos_delta);
1334 /* It is counted once already... */
1335 data->pos_min += minnext * (mincount - counted);
1336 data->pos_delta += - counted * deltanext +
1337 (minnext + deltanext) * maxcount - minnext * mincount;
1338 if (mincount != maxcount) {
1339 /* Cannot extend fixed substrings found inside
1341 scan_commit(pRExC_state,data);
1342 if (mincount && last_str) {
1343 sv_setsv(data->last_found, last_str);
1344 data->last_end = data->pos_min;
1345 data->last_start_min =
1346 data->pos_min - CHR_SVLEN(last_str);
1347 data->last_start_max = is_inf
1349 : data->pos_min + data->pos_delta
1350 - CHR_SVLEN(last_str);
1352 data->longest = &(data->longest_float);
1354 SvREFCNT_dec(last_str);
1356 if (data && (fl & SF_HAS_EVAL))
1357 data->flags |= SF_HAS_EVAL;
1358 optimize_curly_tail:
1359 if (OP(oscan) != CURLYX) {
1360 while (PL_regkind[(U8)OP(next = regnext(oscan))] == NOTHING
1362 NEXT_OFF(oscan) += NEXT_OFF(next);
1365 default: /* REF and CLUMP only? */
1366 if (flags & SCF_DO_SUBSTR) {
1367 scan_commit(pRExC_state,data); /* Cannot expect anything... */
1368 data->longest = &(data->longest_float);
1370 is_inf = is_inf_internal = 1;
1371 if (flags & SCF_DO_STCLASS_OR)
1372 cl_anything(pRExC_state, data->start_class);
1373 flags &= ~SCF_DO_STCLASS;
1377 else if (strchr((char*)PL_simple,OP(scan))) {
1380 if (flags & SCF_DO_SUBSTR) {
1381 scan_commit(pRExC_state,data);
1385 if (flags & SCF_DO_STCLASS) {
1386 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
1388 /* Some of the logic below assumes that switching
1389 locale on will only add false positives. */
1390 switch (PL_regkind[(U8)OP(scan)]) {
1394 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
1395 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1396 cl_anything(pRExC_state, data->start_class);
1399 if (OP(scan) == SANY)
1401 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
1402 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
1403 || (data->start_class->flags & ANYOF_CLASS));
1404 cl_anything(pRExC_state, data->start_class);
1406 if (flags & SCF_DO_STCLASS_AND || !value)
1407 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
1410 if (flags & SCF_DO_STCLASS_AND)
1411 cl_and(data->start_class,
1412 (struct regnode_charclass_class*)scan);
1414 cl_or(pRExC_state, data->start_class,
1415 (struct regnode_charclass_class*)scan);
1418 if (flags & SCF_DO_STCLASS_AND) {
1419 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1421 for (value = 0; value < 256; value++)
1422 if (!isALNUM(value))
1423 ANYOF_BITMAP_CLEAR(data->start_class, value);
1427 if (data->start_class->flags & ANYOF_LOCALE)
1428 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1430 for (value = 0; value < 256; value++)
1432 ANYOF_BITMAP_SET(data->start_class, value);
1437 if (flags & SCF_DO_STCLASS_AND) {
1438 if (data->start_class->flags & ANYOF_LOCALE)
1439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
1442 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
1443 data->start_class->flags |= ANYOF_LOCALE;
1447 if (flags & SCF_DO_STCLASS_AND) {
1448 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1450 for (value = 0; value < 256; value++)
1452 ANYOF_BITMAP_CLEAR(data->start_class, value);
1456 if (data->start_class->flags & ANYOF_LOCALE)
1457 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1459 for (value = 0; value < 256; value++)
1460 if (!isALNUM(value))
1461 ANYOF_BITMAP_SET(data->start_class, value);
1466 if (flags & SCF_DO_STCLASS_AND) {
1467 if (data->start_class->flags & ANYOF_LOCALE)
1468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
1471 data->start_class->flags |= ANYOF_LOCALE;
1472 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
1476 if (flags & SCF_DO_STCLASS_AND) {
1477 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1478 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1479 for (value = 0; value < 256; value++)
1480 if (!isSPACE(value))
1481 ANYOF_BITMAP_CLEAR(data->start_class, value);
1485 if (data->start_class->flags & ANYOF_LOCALE)
1486 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1488 for (value = 0; value < 256; value++)
1490 ANYOF_BITMAP_SET(data->start_class, value);
1495 if (flags & SCF_DO_STCLASS_AND) {
1496 if (data->start_class->flags & ANYOF_LOCALE)
1497 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
1500 data->start_class->flags |= ANYOF_LOCALE;
1501 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
1505 if (flags & SCF_DO_STCLASS_AND) {
1506 if (!(data->start_class->flags & ANYOF_LOCALE)) {
1507 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1508 for (value = 0; value < 256; value++)
1510 ANYOF_BITMAP_CLEAR(data->start_class, value);
1514 if (data->start_class->flags & ANYOF_LOCALE)
1515 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1517 for (value = 0; value < 256; value++)
1518 if (!isSPACE(value))
1519 ANYOF_BITMAP_SET(data->start_class, value);
1524 if (flags & SCF_DO_STCLASS_AND) {
1525 if (data->start_class->flags & ANYOF_LOCALE) {
1526 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
1527 for (value = 0; value < 256; value++)
1528 if (!isSPACE(value))
1529 ANYOF_BITMAP_CLEAR(data->start_class, value);
1533 data->start_class->flags |= ANYOF_LOCALE;
1534 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
1538 if (flags & SCF_DO_STCLASS_AND) {
1539 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
1540 for (value = 0; value < 256; value++)
1541 if (!isDIGIT(value))
1542 ANYOF_BITMAP_CLEAR(data->start_class, value);
1545 if (data->start_class->flags & ANYOF_LOCALE)
1546 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
1548 for (value = 0; value < 256; value++)
1550 ANYOF_BITMAP_SET(data->start_class, value);
1555 if (flags & SCF_DO_STCLASS_AND) {
1556 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
1557 for (value = 0; value < 256; value++)
1559 ANYOF_BITMAP_CLEAR(data->start_class, value);
1562 if (data->start_class->flags & ANYOF_LOCALE)
1563 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
1565 for (value = 0; value < 256; value++)
1566 if (!isDIGIT(value))
1567 ANYOF_BITMAP_SET(data->start_class, value);
1572 if (flags & SCF_DO_STCLASS_OR)
1573 cl_and(data->start_class, &and_with);
1574 flags &= ~SCF_DO_STCLASS;
1577 else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
1578 data->flags |= (OP(scan) == MEOL
1582 else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ
1583 /* Lookbehind, or need to calculate parens/evals/stclass: */
1584 && (scan->flags || data || (flags & SCF_DO_STCLASS))
1585 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
1586 /* Lookahead/lookbehind */
1587 I32 deltanext, minnext, fake = 0;
1589 struct regnode_charclass_class intrnl;
1592 data_fake.flags = 0;
1594 data_fake.whilem_c = data->whilem_c;
1595 data_fake.last_closep = data->last_closep;
1598 data_fake.last_closep = &fake;
1599 if ( flags & SCF_DO_STCLASS && !scan->flags
1600 && OP(scan) == IFMATCH ) { /* Lookahead */
1601 cl_init(pRExC_state, &intrnl);
1602 data_fake.start_class = &intrnl;
1603 f |= SCF_DO_STCLASS_AND;
1605 if (flags & SCF_WHILEM_VISITED_POS)
1606 f |= SCF_WHILEM_VISITED_POS;
1607 next = regnext(scan);
1608 nscan = NEXTOPER(NEXTOPER(scan));
1609 minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
1612 vFAIL("Variable length lookbehind not implemented");
1614 else if (minnext > U8_MAX) {
1615 vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
1617 scan->flags = (U8)minnext;
1619 if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1621 if (data && (data_fake.flags & SF_HAS_EVAL))
1622 data->flags |= SF_HAS_EVAL;
1624 data->whilem_c = data_fake.whilem_c;
1625 if (f & SCF_DO_STCLASS_AND) {
1626 int was = (data->start_class->flags & ANYOF_EOS);
1628 cl_and(data->start_class, &intrnl);
1630 data->start_class->flags |= ANYOF_EOS;
1633 else if (OP(scan) == OPEN) {
1636 else if (OP(scan) == CLOSE) {
1637 if ((I32)ARG(scan) == is_par) {
1638 next = regnext(scan);
1640 if ( next && (OP(next) != WHILEM) && next < last)
1641 is_par = 0; /* Disable optimization */
1644 *(data->last_closep) = ARG(scan);
1646 else if (OP(scan) == EVAL) {
1648 data->flags |= SF_HAS_EVAL;
1650 else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
1651 if (flags & SCF_DO_SUBSTR) {
1652 scan_commit(pRExC_state,data);
1653 data->longest = &(data->longest_float);
1655 is_inf = is_inf_internal = 1;
1656 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
1657 cl_anything(pRExC_state, data->start_class);
1658 flags &= ~SCF_DO_STCLASS;
1660 /* Else: zero-length, ignore. */
1661 scan = regnext(scan);
1666 *deltap = is_inf_internal ? I32_MAX : delta;
1667 if (flags & SCF_DO_SUBSTR && is_inf)
1668 data->pos_delta = I32_MAX - data->pos_min;
1669 if (is_par > U8_MAX)
1671 if (is_par && pars==1 && data) {
1672 data->flags |= SF_IN_PAR;
1673 data->flags &= ~SF_HAS_PAR;
1675 else if (pars && data) {
1676 data->flags |= SF_HAS_PAR;
1677 data->flags &= ~SF_IN_PAR;
1679 if (flags & SCF_DO_STCLASS_OR)
1680 cl_and(data->start_class, &and_with);
1685 S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
1687 if (RExC_rx->data) {
1688 Renewc(RExC_rx->data,
1689 sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
1690 char, struct reg_data);
1691 Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
1692 RExC_rx->data->count += n;
1695 Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
1696 char, struct reg_data);
1697 New(1208, RExC_rx->data->what, n, U8);
1698 RExC_rx->data->count = n;
1700 Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
1701 return RExC_rx->data->count - n;
1705 Perl_reginitcolors(pTHX)
1708 char *s = PerlEnv_getenv("PERL_RE_COLORS");
1711 PL_colors[0] = s = savepv(s);
1713 s = strchr(s, '\t');
1719 PL_colors[i] = s = "";
1723 PL_colors[i++] = "";
1730 - pregcomp - compile a regular expression into internal code
1732 * We can't allocate space until we know how big the compiled form will be,
1733 * but we can't compile it (and thus know how big it is) until we've got a
1734 * place to put the code. So we cheat: we compile it twice, once with code
1735 * generation turned off and size counting turned on, and once "for real".
1736 * This also means that we don't allocate space until we are sure that the
1737 * thing really will compile successfully, and we never have to move the
1738 * code and thus invalidate pointers into it. (Note that it has to be in
1739 * one piece because free() must be able to free it all.) [NB: not true in perl]
1741 * Beware that the optimization-preparation code in here knows about some
1742 * of the structure of the compiled regexp. [I'll say.]
1745 Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
1755 RExC_state_t RExC_state;
1756 RExC_state_t *pRExC_state = &RExC_state;
1759 FAIL("NULL regexp argument");
1761 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
1765 if (!PL_colorset) reginitcolors();
1766 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
1767 PL_colors[4],PL_colors[5],PL_colors[0],
1768 (int)(xend - exp), RExC_precomp, PL_colors[1]);
1770 RExC_flags = pm->op_pmflags;
1774 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
1775 RExC_seen_evals = 0;
1778 /* First pass: determine size, legality. */
1785 RExC_emit = &PL_regdummy;
1786 RExC_whilem_seen = 0;
1787 #if 0 /* REGC() is (currently) a NOP at the first pass.
1788 * Clever compilers notice this and complain. --jhi */
1789 REGC((U8)REG_MAGIC, (char*)RExC_emit);
1791 if (reg(pRExC_state, 0, &flags) == NULL) {
1792 RExC_precomp = Nullch;
1795 DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
1797 /* Small enough for pointer-storage convention?
1798 If extralen==0, this means that we will not need long jumps. */
1799 if (RExC_size >= 0x10000L && RExC_extralen)
1800 RExC_size += RExC_extralen;
1803 if (RExC_whilem_seen > 15)
1804 RExC_whilem_seen = 15;
1806 /* Allocate space and initialize. */
1807 Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
1810 FAIL("Regexp out of space");
1813 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
1814 Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
1817 r->prelen = xend - exp;
1818 r->precomp = savepvn(RExC_precomp, r->prelen);
1820 #ifdef PERL_COPY_ON_WRITE
1821 r->saved_copy = Nullsv;
1823 r->reganch = pm->op_pmflags & PMf_COMPILETIME;
1824 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
1826 r->substrs = 0; /* Useful during FAIL. */
1827 r->startp = 0; /* Useful during FAIL. */
1828 r->endp = 0; /* Useful during FAIL. */
1830 Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
1832 r->offsets[0] = RExC_size;
1834 DEBUG_r(PerlIO_printf(Perl_debug_log,
1835 "%s %"UVuf" bytes for offset annotations.\n",
1836 r->offsets ? "Got" : "Couldn't get",
1837 (UV)((2*RExC_size+1) * sizeof(U32))));
1841 /* Second pass: emit code. */
1842 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
1847 RExC_emit_start = r->program;
1848 RExC_emit = r->program;
1849 /* Store the count of eval-groups for security checks: */
1850 RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
1851 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
1853 if (reg(pRExC_state, 0, &flags) == NULL)
1856 /* Dig out information for optimizations. */
1857 r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
1858 pm->op_pmflags = RExC_flags;
1860 r->reganch |= ROPT_UTF8; /* Unicode in it? */
1861 r->regstclass = NULL;
1862 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
1863 r->reganch |= ROPT_NAUGHTY;
1864 scan = r->program + 1; /* First BRANCH. */
1866 /* XXXX To minimize changes to RE engine we always allocate
1867 3-units-long substrs field. */
1868 Newz(1004, r->substrs, 1, struct reg_substr_data);
1870 StructCopy(&zero_scan_data, &data, scan_data_t);
1871 /* XXXX Should not we check for something else? Usually it is OPEN1... */
1872 if (OP(scan) != BRANCH) { /* Only one top-level choice. */
1874 STRLEN longest_float_length, longest_fixed_length;
1875 struct regnode_charclass_class ch_class;
1880 /* Skip introductions and multiplicators >= 1. */
1881 while ((OP(first) == OPEN && (sawopen = 1)) ||
1882 /* An OR of *one* alternative - should not happen now. */
1883 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
1884 (OP(first) == PLUS) ||
1885 (OP(first) == MINMOD) ||
1886 /* An {n,m} with n>0 */
1887 (PL_regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) {
1888 if (OP(first) == PLUS)
1891 first += regarglen[(U8)OP(first)];
1892 first = NEXTOPER(first);
1895 /* Starting-point info. */
1897 if (PL_regkind[(U8)OP(first)] == EXACT) {
1898 if (OP(first) == EXACT)
1899 ; /* Empty, get anchored substr later. */
1900 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
1901 r->regstclass = first;
1903 else if (strchr((char*)PL_simple,OP(first)))
1904 r->regstclass = first;
1905 else if (PL_regkind[(U8)OP(first)] == BOUND ||
1906 PL_regkind[(U8)OP(first)] == NBOUND)
1907 r->regstclass = first;
1908 else if (PL_regkind[(U8)OP(first)] == BOL) {
1909 r->reganch |= (OP(first) == MBOL
1911 : (OP(first) == SBOL
1914 first = NEXTOPER(first);
1917 else if (OP(first) == GPOS) {
1918 r->reganch |= ROPT_ANCH_GPOS;
1919 first = NEXTOPER(first);
1922 else if (!sawopen && (OP(first) == STAR &&
1923 PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
1924 !(r->reganch & ROPT_ANCH) )
1926 /* turn .* into ^.* with an implied $*=1 */
1927 int type = OP(NEXTOPER(first));
1929 if (type == REG_ANY)
1930 type = ROPT_ANCH_MBOL;
1932 type = ROPT_ANCH_SBOL;
1934 r->reganch |= type | ROPT_IMPLICIT;
1935 first = NEXTOPER(first);
1938 if (sawplus && (!sawopen || !RExC_sawback)
1939 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
1940 /* x+ must match at the 1st pos of run of x's */
1941 r->reganch |= ROPT_SKIP;
1943 /* Scan is after the zeroth branch, first is atomic matcher. */
1944 DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
1945 (IV)(first - scan + 1)));
1947 * If there's something expensive in the r.e., find the
1948 * longest literal string that must appear and make it the
1949 * regmust. Resolve ties in favor of later strings, since
1950 * the regstart check works with the beginning of the r.e.
1951 * and avoiding duplication strengthens checking. Not a
1952 * strong reason, but sufficient in the absence of others.
1953 * [Now we resolve ties in favor of the earlier string if
1954 * it happens that c_offset_min has been invalidated, since the
1955 * earlier string may buy us something the later one won't.]
1959 data.longest_fixed = newSVpvn("",0);
1960 data.longest_float = newSVpvn("",0);
1961 data.last_found = newSVpvn("",0);
1962 data.longest = &(data.longest_fixed);
1964 if (!r->regstclass) {
1965 cl_init(pRExC_state, &ch_class);
1966 data.start_class = &ch_class;
1967 stclass_flag = SCF_DO_STCLASS_AND;
1968 } else /* XXXX Check for BOUND? */
1970 data.last_closep = &last_close;
1972 minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
1973 &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
1974 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
1975 && data.last_start_min == 0 && data.last_end > 0
1976 && !RExC_seen_zerolen
1977 && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
1978 r->reganch |= ROPT_CHECK_ALL;
1979 scan_commit(pRExC_state, &data);
1980 SvREFCNT_dec(data.last_found);
1982 longest_float_length = CHR_SVLEN(data.longest_float);
1983 if (longest_float_length
1984 || (data.flags & SF_FL_BEFORE_EOL
1985 && (!(data.flags & SF_FL_BEFORE_MEOL)
1986 || (RExC_flags & PMf_MULTILINE)))) {
1989 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
1990 && data.offset_fixed == data.offset_float_min
1991 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
1992 goto remove_float; /* As in (a)+. */
1994 if (SvUTF8(data.longest_float)) {
1995 r->float_utf8 = data.longest_float;
1996 r->float_substr = Nullsv;
1998 r->float_substr = data.longest_float;
1999 r->float_utf8 = Nullsv;
2001 r->float_min_offset = data.offset_float_min;
2002 r->float_max_offset = data.offset_float_max;
2003 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
2004 && (!(data.flags & SF_FL_BEFORE_MEOL)
2005 || (RExC_flags & PMf_MULTILINE)));
2006 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
2010 r->float_substr = r->float_utf8 = Nullsv;
2011 SvREFCNT_dec(data.longest_float);
2012 longest_float_length = 0;
2015 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
2016 if (longest_fixed_length
2017 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
2018 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2019 || (RExC_flags & PMf_MULTILINE)))) {
2022 if (SvUTF8(data.longest_fixed)) {
2023 r->anchored_utf8 = data.longest_fixed;
2024 r->anchored_substr = Nullsv;
2026 r->anchored_substr = data.longest_fixed;
2027 r->anchored_utf8 = Nullsv;
2029 r->anchored_offset = data.offset_fixed;
2030 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
2031 && (!(data.flags & SF_FIX_BEFORE_MEOL)
2032 || (RExC_flags & PMf_MULTILINE)));
2033 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
2036 r->anchored_substr = r->anchored_utf8 = Nullsv;
2037 SvREFCNT_dec(data.longest_fixed);
2038 longest_fixed_length = 0;
2041 && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
2042 r->regstclass = NULL;
2043 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
2045 && !(data.start_class->flags & ANYOF_EOS)
2046 && !cl_is_anything(data.start_class))
2048 I32 n = add_data(pRExC_state, 1, "f");
2050 New(1006, RExC_rx->data->data[n], 1,
2051 struct regnode_charclass_class);
2052 StructCopy(data.start_class,
2053 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2054 struct regnode_charclass_class);
2055 r->regstclass = (regnode*)RExC_rx->data->data[n];
2056 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2057 PL_regdata = r->data; /* for regprop() */
2058 DEBUG_r({ SV *sv = sv_newmortal();
2059 regprop(sv, (regnode*)data.start_class);
2060 PerlIO_printf(Perl_debug_log,
2061 "synthetic stclass `%s'.\n",
2065 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
2066 if (longest_fixed_length > longest_float_length) {
2067 r->check_substr = r->anchored_substr;
2068 r->check_utf8 = r->anchored_utf8;
2069 r->check_offset_min = r->check_offset_max = r->anchored_offset;
2070 if (r->reganch & ROPT_ANCH_SINGLE)
2071 r->reganch |= ROPT_NOSCAN;
2074 r->check_substr = r->float_substr;
2075 r->check_utf8 = r->float_utf8;
2076 r->check_offset_min = data.offset_float_min;
2077 r->check_offset_max = data.offset_float_max;
2079 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
2080 This should be changed ASAP! */
2081 if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
2082 r->reganch |= RE_USE_INTUIT;
2083 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
2084 r->reganch |= RE_INTUIT_TAIL;
2088 /* Several toplevels. Best we can is to set minlen. */
2090 struct regnode_charclass_class ch_class;
2093 DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
2094 scan = r->program + 1;
2095 cl_init(pRExC_state, &ch_class);
2096 data.start_class = &ch_class;
2097 data.last_closep = &last_close;
2098 minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
2099 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
2100 = r->float_substr = r->float_utf8 = Nullsv;
2101 if (!(data.start_class->flags & ANYOF_EOS)
2102 && !cl_is_anything(data.start_class))
2104 I32 n = add_data(pRExC_state, 1, "f");
2106 New(1006, RExC_rx->data->data[n], 1,
2107 struct regnode_charclass_class);
2108 StructCopy(data.start_class,
2109 (struct regnode_charclass_class*)RExC_rx->data->data[n],
2110 struct regnode_charclass_class);
2111 r->regstclass = (regnode*)RExC_rx->data->data[n];
2112 r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
2113 DEBUG_r({ SV* sv = sv_newmortal();
2114 regprop(sv, (regnode*)data.start_class);
2115 PerlIO_printf(Perl_debug_log,
2116 "synthetic stclass `%s'.\n",
2122 if (RExC_seen & REG_SEEN_GPOS)
2123 r->reganch |= ROPT_GPOS_SEEN;
2124 if (RExC_seen & REG_SEEN_LOOKBEHIND)
2125 r->reganch |= ROPT_LOOKBEHIND_SEEN;
2126 if (RExC_seen & REG_SEEN_EVAL)
2127 r->reganch |= ROPT_EVAL_SEEN;
2128 if (RExC_seen & REG_SEEN_CANY)
2129 r->reganch |= ROPT_CANY_SEEN;
2130 Newz(1002, r->startp, RExC_npar, I32);
2131 Newz(1002, r->endp, RExC_npar, I32);
2132 PL_regdata = r->data; /* for regprop() */
2133 DEBUG_r(regdump(r));
2138 - reg - regular expression, i.e. main body or parenthesized thing
2140 * Caller must absorb opening parenthesis.
2142 * Combining parenthesis handling with the base level of regular expression
2143 * is a trifle forced, but the need to tie the tails of the branches to what
2144 * follows makes it hard to avoid.
2147 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
2148 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
2150 register regnode *ret; /* Will be the head of the group. */
2151 register regnode *br;
2152 register regnode *lastbr;
2153 register regnode *ender = 0;
2154 register I32 parno = 0;
2155 I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
2157 /* for (?g), (?gc), and (?o) warnings; warning
2158 about (?c) will warn about (?g) -- japhy */
2160 I32 wastedflags = 0x00,
2163 wasted_gc = 0x02 | 0x04,
2166 char * parse_start = RExC_parse; /* MJD */
2167 char *oregcomp_parse = RExC_parse;
2170 *flagp = 0; /* Tentatively. */
2173 /* Make an OPEN node, if parenthesized. */
2175 if (*RExC_parse == '?') { /* (?...) */
2176 U32 posflags = 0, negflags = 0;
2177 U32 *flagsp = &posflags;
2179 char *seqstart = RExC_parse;
2182 paren = *RExC_parse++;
2183 ret = NULL; /* For look-ahead/behind. */
2185 case '<': /* (?<...) */
2186 RExC_seen |= REG_SEEN_LOOKBEHIND;
2187 if (*RExC_parse == '!')
2189 if (*RExC_parse != '=' && *RExC_parse != '!')
2192 case '=': /* (?=...) */
2193 case '!': /* (?!...) */
2194 RExC_seen_zerolen++;
2195 case ':': /* (?:...) */
2196 case '>': /* (?>...) */
2198 case '$': /* (?$...) */
2199 case '@': /* (?@...) */
2200 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
2202 case '#': /* (?#...) */
2203 while (*RExC_parse && *RExC_parse != ')')
2205 if (*RExC_parse != ')')
2206 FAIL("Sequence (?#... not terminated");
2207 nextchar(pRExC_state);
2210 case 'p': /* (?p...) */
2211 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
2212 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
2214 case '?': /* (??...) */
2216 if (*RExC_parse != '{')
2218 paren = *RExC_parse++;
2220 case '{': /* (?{...}) */
2222 I32 count = 1, n = 0;
2224 char *s = RExC_parse;
2226 OP_4tree *sop, *rop;
2228 RExC_seen_zerolen++;
2229 RExC_seen |= REG_SEEN_EVAL;
2230 while (count && (c = *RExC_parse)) {
2231 if (c == '\\' && RExC_parse[1])
2239 if (*RExC_parse != ')')
2242 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
2247 if (RExC_parse - 1 - s)
2248 sv = newSVpvn(s, RExC_parse - 1 - s);
2250 sv = newSVpvn("", 0);
2253 Perl_save_re_context(aTHX);
2254 rop = sv_compile_2op(sv, &sop, "re", &pad);
2255 sop->op_private |= OPpREFCOUNTED;
2256 /* re_dup will OpREFCNT_inc */
2257 OpREFCNT_set(sop, 1);
2260 n = add_data(pRExC_state, 3, "nop");
2261 RExC_rx->data->data[n] = (void*)rop;
2262 RExC_rx->data->data[n+1] = (void*)sop;
2263 RExC_rx->data->data[n+2] = (void*)pad;
2266 else { /* First pass */
2267 if (PL_reginterp_cnt < ++RExC_seen_evals
2269 /* No compiled RE interpolated, has runtime
2270 components ===> unsafe. */
2271 FAIL("Eval-group not allowed at runtime, use re 'eval'");
2272 if (PL_tainting && PL_tainted)
2273 FAIL("Eval-group in insecure regular expression");
2274 if (IN_PERL_COMPILETIME)
2278 nextchar(pRExC_state);
2280 ret = reg_node(pRExC_state, LOGICAL);
2283 regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
2284 /* deal with the length of this later - MJD */
2287 ret = reganode(pRExC_state, EVAL, n);
2288 Set_Node_Length(ret, RExC_parse - parse_start + 1);
2289 Set_Node_Offset(ret, parse_start);
2292 case '(': /* (?(?{...})...) and (?(?=...)...) */
2294 if (RExC_parse[0] == '?') { /* (?(?...)) */
2295 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
2296 || RExC_parse[1] == '<'
2297 || RExC_parse[1] == '{') { /* Lookahead or eval. */
2300 ret = reg_node(pRExC_state, LOGICAL);
2303 regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
2307 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
2309 parno = atoi(RExC_parse++);
2311 while (isDIGIT(*RExC_parse))
2313 ret = reganode(pRExC_state, GROUPP, parno);
2315 if ((c = *nextchar(pRExC_state)) != ')')
2316 vFAIL("Switch condition not recognized");
2318 regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
2319 br = regbranch(pRExC_state, &flags, 1);
2321 br = reganode(pRExC_state, LONGJMP, 0);
2323 regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
2324 c = *nextchar(pRExC_state);
2328 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
2329 regbranch(pRExC_state, &flags, 1);
2330 regtail(pRExC_state, ret, lastbr);
2333 c = *nextchar(pRExC_state);
2338 vFAIL("Switch (?(condition)... contains too many branches");
2339 ender = reg_node(pRExC_state, TAIL);
2340 regtail(pRExC_state, br, ender);
2342 regtail(pRExC_state, lastbr, ender);
2343 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
2346 regtail(pRExC_state, ret, ender);
2350 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
2354 RExC_parse--; /* for vFAIL to print correctly */
2355 vFAIL("Sequence (? incomplete");
2359 parse_flags: /* (?i) */
2360 while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
2361 /* (?g), (?gc) and (?o) are useless here
2362 and must be globally applied -- japhy */
2364 if (*RExC_parse == 'o' || *RExC_parse == 'g') {
2365 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2366 I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
2367 if (! (wastedflags & wflagbit) ) {
2368 wastedflags |= wflagbit;
2371 "Useless (%s%c) - %suse /%c modifier",
2372 flagsp == &negflags ? "?-" : "?",
2374 flagsp == &negflags ? "don't " : "",
2380 else if (*RExC_parse == 'c') {
2381 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
2382 if (! (wastedflags & wasted_c) ) {
2383 wastedflags |= wasted_gc;
2386 "Useless (%sc) - %suse /gc modifier",
2387 flagsp == &negflags ? "?-" : "?",
2388 flagsp == &negflags ? "don't " : ""
2393 else { pmflag(flagsp, *RExC_parse); }
2397 if (*RExC_parse == '-') {
2399 wastedflags = 0; /* reset so (?g-c) warns twice */
2403 RExC_flags |= posflags;
2404 RExC_flags &= ~negflags;
2405 if (*RExC_parse == ':') {
2411 if (*RExC_parse != ')') {
2413 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
2415 nextchar(pRExC_state);
2423 ret = reganode(pRExC_state, OPEN, parno);
2424 Set_Node_Length(ret, 1); /* MJD */
2425 Set_Node_Offset(ret, RExC_parse); /* MJD */
2432 /* Pick up the branches, linking them together. */
2433 parse_start = RExC_parse; /* MJD */
2434 br = regbranch(pRExC_state, &flags, 1);
2435 /* branch_len = (paren != 0); */
2439 if (*RExC_parse == '|') {
2440 if (!SIZE_ONLY && RExC_extralen) {
2441 reginsert(pRExC_state, BRANCHJ, br);
2444 reginsert(pRExC_state, BRANCH, br);
2445 Set_Node_Length(br, paren != 0);
2446 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
2450 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
2452 else if (paren == ':') {
2453 *flagp |= flags&SIMPLE;
2455 if (open) { /* Starts with OPEN. */
2456 regtail(pRExC_state, ret, br); /* OPEN -> first. */
2458 else if (paren != '?') /* Not Conditional */
2460 *flagp |= flags & (SPSTART | HASWIDTH);
2462 while (*RExC_parse == '|') {
2463 if (!SIZE_ONLY && RExC_extralen) {
2464 ender = reganode(pRExC_state, LONGJMP,0);
2465 regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
2468 RExC_extralen += 2; /* Account for LONGJMP. */
2469 nextchar(pRExC_state);
2470 br = regbranch(pRExC_state, &flags, 0);
2474 regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
2478 *flagp |= flags&SPSTART;
2481 if (have_branch || paren != ':') {
2482 /* Make a closing node, and hook it on the end. */
2485 ender = reg_node(pRExC_state, TAIL);
2488 ender = reganode(pRExC_state, CLOSE, parno);
2489 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
2490 Set_Node_Length(ender,1); /* MJD */
2496 *flagp &= ~HASWIDTH;
2499 ender = reg_node(pRExC_state, SUCCEED);
2502 ender = reg_node(pRExC_state, END);
2505 regtail(pRExC_state, lastbr, ender);
2508 /* Hook the tails of the branches to the closing node. */
2509 for (br = ret; br != NULL; br = regnext(br)) {
2510 regoptail(pRExC_state, br, ender);
2517 static char parens[] = "=!<,>";
2519 if (paren && (p = strchr(parens, paren))) {
2520 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
2521 int flag = (p - parens) > 1;
2524 node = SUSPEND, flag = 0;
2525 reginsert(pRExC_state, node,ret);
2526 Set_Node_Cur_Length(ret);
2527 Set_Node_Offset(ret, parse_start + 1);
2529 regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
2533 /* Check for proper termination. */
2535 RExC_flags = oregflags;
2536 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
2537 RExC_parse = oregcomp_parse;
2538 vFAIL("Unmatched (");
2541 else if (!paren && RExC_parse < RExC_end) {
2542 if (*RExC_parse == ')') {
2544 vFAIL("Unmatched )");
2547 FAIL("Junk on end of regexp"); /* "Can't happen". */
2555 - regbranch - one alternative of an | operator
2557 * Implements the concatenation operator.
2560 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
2562 register regnode *ret;
2563 register regnode *chain = NULL;
2564 register regnode *latest;
2565 I32 flags = 0, c = 0;
2570 if (!SIZE_ONLY && RExC_extralen)
2571 ret = reganode(pRExC_state, BRANCHJ,0);
2573 ret = reg_node(pRExC_state, BRANCH);
2574 Set_Node_Length(ret, 1);
2578 if (!first && SIZE_ONLY)
2579 RExC_extralen += 1; /* BRANCHJ */
2581 *flagp = WORST; /* Tentatively. */
2584 nextchar(pRExC_state);
2585 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
2587 latest = regpiece(pRExC_state, &flags);
2588 if (latest == NULL) {
2589 if (flags & TRYAGAIN)
2593 else if (ret == NULL)
2595 *flagp |= flags&HASWIDTH;
2596 if (chain == NULL) /* First piece. */
2597 *flagp |= flags&SPSTART;
2600 regtail(pRExC_state, chain, latest);
2605 if (chain == NULL) { /* Loop ran zero times. */
2606 chain = reg_node(pRExC_state, NOTHING);
2611 *flagp |= flags&SIMPLE;
2618 - regpiece - something followed by possible [*+?]
2620 * Note that the branching code sequences used for ? and the general cases
2621 * of * and + are somewhat optimized: they use the same NOTHING node as
2622 * both the endmarker for their branch list and the body of the last branch.
2623 * It might seem that this node could be dispensed with entirely, but the
2624 * endmarker role is not redundant.
2627 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2629 register regnode *ret;
2631 register char *next;
2633 char *origparse = RExC_parse;
2636 I32 max = REG_INFTY;
2639 ret = regatom(pRExC_state, &flags);
2641 if (flags & TRYAGAIN)
2648 if (op == '{' && regcurly(RExC_parse)) {
2649 parse_start = RExC_parse; /* MJD */
2650 next = RExC_parse + 1;
2652 while (isDIGIT(*next) || *next == ',') {
2661 if (*next == '}') { /* got one */
2665 min = atoi(RExC_parse);
2669 maxpos = RExC_parse;
2671 if (!max && *maxpos != '0')
2672 max = REG_INFTY; /* meaning "infinity" */
2673 else if (max >= REG_INFTY)
2674 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
2676 nextchar(pRExC_state);
2679 if ((flags&SIMPLE)) {
2680 RExC_naughty += 2 + RExC_naughty / 2;
2681 reginsert(pRExC_state, CURLY, ret);
2682 Set_Node_Offset(ret, parse_start+1); /* MJD */
2683 Set_Node_Cur_Length(ret);
2686 regnode *w = reg_node(pRExC_state, WHILEM);
2689 regtail(pRExC_state, ret, w);
2690 if (!SIZE_ONLY && RExC_extralen) {
2691 reginsert(pRExC_state, LONGJMP,ret);
2692 reginsert(pRExC_state, NOTHING,ret);
2693 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
2695 reginsert(pRExC_state, CURLYX,ret);
2697 Set_Node_Offset(ret, parse_start+1);
2698 Set_Node_Length(ret,
2699 op == '{' ? (RExC_parse - parse_start) : 1);
2701 if (!SIZE_ONLY && RExC_extralen)
2702 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
2703 regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
2705 RExC_whilem_seen++, RExC_extralen += 3;
2706 RExC_naughty += 4 + RExC_naughty; /* compound interest */
2714 if (max && max < min)
2715 vFAIL("Can't do {n,m} with n > m");
2717 ARG1_SET(ret, (U16)min);
2718 ARG2_SET(ret, (U16)max);
2730 #if 0 /* Now runtime fix should be reliable. */
2732 /* if this is reinstated, don't forget to put this back into perldiag:
2734 =item Regexp *+ operand could be empty at {#} in regex m/%s/
2736 (F) The part of the regexp subject to either the * or + quantifier
2737 could match an empty string. The {#} shows in the regular
2738 expression about where the problem was discovered.
2742 if (!(flags&HASWIDTH) && op != '?')
2743 vFAIL("Regexp *+ operand could be empty");
2746 parse_start = RExC_parse;
2747 nextchar(pRExC_state);
2749 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
2751 if (op == '*' && (flags&SIMPLE)) {
2752 reginsert(pRExC_state, STAR, ret);
2756 else if (op == '*') {
2760 else if (op == '+' && (flags&SIMPLE)) {
2761 reginsert(pRExC_state, PLUS, ret);
2765 else if (op == '+') {
2769 else if (op == '?') {
2774 if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
2776 "%.*s matches null string many times",
2777 RExC_parse - origparse,
2781 if (*RExC_parse == '?') {
2782 nextchar(pRExC_state);
2783 reginsert(pRExC_state, MINMOD, ret);
2784 regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
2786 if (ISMULT2(RExC_parse)) {
2788 vFAIL("Nested quantifiers");
2795 - regatom - the lowest level
2797 * Optimization: gobbles an entire sequence of ordinary characters so that
2798 * it can turn them into a single node, which is smaller to store and
2799 * faster to run. Backslashed characters are exceptions, each becoming a
2800 * separate node; the code is simpler that way and it's not worth fixing.
2802 * [Yes, it is worth fixing, some scripts can run twice the speed.] */
2804 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
2806 register regnode *ret = 0;
2808 char *parse_start = RExC_parse;
2810 *flagp = WORST; /* Tentatively. */
2813 switch (*RExC_parse) {
2815 RExC_seen_zerolen++;
2816 nextchar(pRExC_state);
2817 if (RExC_flags & PMf_MULTILINE)
2818 ret = reg_node(pRExC_state, MBOL);
2819 else if (RExC_flags & PMf_SINGLELINE)
2820 ret = reg_node(pRExC_state, SBOL);
2822 ret = reg_node(pRExC_state, BOL);
2823 Set_Node_Length(ret, 1); /* MJD */
2826 nextchar(pRExC_state);
2828 RExC_seen_zerolen++;
2829 if (RExC_flags & PMf_MULTILINE)
2830 ret = reg_node(pRExC_state, MEOL);
2831 else if (RExC_flags & PMf_SINGLELINE)
2832 ret = reg_node(pRExC_state, SEOL);
2834 ret = reg_node(pRExC_state, EOL);
2835 Set_Node_Length(ret, 1); /* MJD */
2838 nextchar(pRExC_state);
2839 if (RExC_flags & PMf_SINGLELINE)
2840 ret = reg_node(pRExC_state, SANY);
2842 ret = reg_node(pRExC_state, REG_ANY);
2843 *flagp |= HASWIDTH|SIMPLE;
2845 Set_Node_Length(ret, 1); /* MJD */
2849 char *oregcomp_parse = ++RExC_parse;
2850 ret = regclass(pRExC_state);
2851 if (*RExC_parse != ']') {
2852 RExC_parse = oregcomp_parse;
2853 vFAIL("Unmatched [");
2855 nextchar(pRExC_state);
2856 *flagp |= HASWIDTH|SIMPLE;
2857 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
2861 nextchar(pRExC_state);
2862 ret = reg(pRExC_state, 1, &flags);
2864 if (flags & TRYAGAIN) {
2865 if (RExC_parse == RExC_end) {
2866 /* Make parent create an empty node if needed. */
2874 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE);
2878 if (flags & TRYAGAIN) {
2882 vFAIL("Internal urp");
2883 /* Supposed to be caught earlier. */
2886 if (!regcurly(RExC_parse)) {
2895 vFAIL("Quantifier follows nothing");
2898 switch (*++RExC_parse) {
2900 RExC_seen_zerolen++;
2901 ret = reg_node(pRExC_state, SBOL);
2903 nextchar(pRExC_state);
2904 Set_Node_Length(ret, 2); /* MJD */
2907 ret = reg_node(pRExC_state, GPOS);
2908 RExC_seen |= REG_SEEN_GPOS;
2910 nextchar(pRExC_state);
2911 Set_Node_Length(ret, 2); /* MJD */
2914 ret = reg_node(pRExC_state, SEOL);
2916 RExC_seen_zerolen++; /* Do not optimize RE away */
2917 nextchar(pRExC_state);
2920 ret = reg_node(pRExC_state, EOS);
2922 RExC_seen_zerolen++; /* Do not optimize RE away */
2923 nextchar(pRExC_state);
2924 Set_Node_Length(ret, 2); /* MJD */
2927 ret = reg_node(pRExC_state, CANY);
2928 RExC_seen |= REG_SEEN_CANY;
2929 *flagp |= HASWIDTH|SIMPLE;
2930 nextchar(pRExC_state);
2931 Set_Node_Length(ret, 2); /* MJD */
2934 ret = reg_node(pRExC_state, CLUMP);
2936 nextchar(pRExC_state);
2937 Set_Node_Length(ret, 2); /* MJD */
2940 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
2941 *flagp |= HASWIDTH|SIMPLE;
2942 nextchar(pRExC_state);
2943 Set_Node_Length(ret, 2); /* MJD */
2946 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
2947 *flagp |= HASWIDTH|SIMPLE;
2948 nextchar(pRExC_state);
2949 Set_Node_Length(ret, 2); /* MJD */
2952 RExC_seen_zerolen++;
2953 RExC_seen |= REG_SEEN_LOOKBEHIND;
2954 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
2956 nextchar(pRExC_state);
2957 Set_Node_Length(ret, 2); /* MJD */
2960 RExC_seen_zerolen++;
2961 RExC_seen |= REG_SEEN_LOOKBEHIND;
2962 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
2964 nextchar(pRExC_state);
2965 Set_Node_Length(ret, 2); /* MJD */
2968 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
2969 *flagp |= HASWIDTH|SIMPLE;
2970 nextchar(pRExC_state);
2971 Set_Node_Length(ret, 2); /* MJD */
2974 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
2975 *flagp |= HASWIDTH|SIMPLE;
2976 nextchar(pRExC_state);
2977 Set_Node_Length(ret, 2); /* MJD */
2980 ret = reg_node(pRExC_state, DIGIT);
2981 *flagp |= HASWIDTH|SIMPLE;
2982 nextchar(pRExC_state);
2983 Set_Node_Length(ret, 2); /* MJD */
2986 ret = reg_node(pRExC_state, NDIGIT);
2987 *flagp |= HASWIDTH|SIMPLE;
2988 nextchar(pRExC_state);
2989 Set_Node_Length(ret, 2); /* MJD */
2994 char* oldregxend = RExC_end;
2995 char* parse_start = RExC_parse - 2;
2997 if (RExC_parse[1] == '{') {
2998 /* a lovely hack--pretend we saw [\pX] instead */
2999 RExC_end = strchr(RExC_parse, '}');
3001 U8 c = (U8)*RExC_parse;
3003 RExC_end = oldregxend;
3004 vFAIL2("Missing right brace on \\%c{}", c);
3009 RExC_end = RExC_parse + 2;
3010 if (RExC_end > oldregxend)
3011 RExC_end = oldregxend;
3015 ret = regclass(pRExC_state);
3017 RExC_end = oldregxend;
3020 Set_Node_Offset(ret, parse_start + 2);
3021 Set_Node_Cur_Length(ret);
3022 nextchar(pRExC_state);
3023 *flagp |= HASWIDTH|SIMPLE;
3036 case '1': case '2': case '3': case '4':
3037 case '5': case '6': case '7': case '8': case '9':
3039 I32 num = atoi(RExC_parse);
3041 if (num > 9 && num >= RExC_npar)
3044 char * parse_start = RExC_parse - 1; /* MJD */
3045 while (isDIGIT(*RExC_parse))
3048 if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
3049 vFAIL("Reference to nonexistent group");
3051 ret = reganode(pRExC_state,
3052 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
3056 /* override incorrect value set in reganode MJD */
3057 Set_Node_Offset(ret, parse_start+1);
3058 Set_Node_Cur_Length(ret); /* MJD */
3060 nextchar(pRExC_state);
3065 if (RExC_parse >= RExC_end)
3066 FAIL("Trailing \\");
3069 /* Do not generate `unrecognized' warnings here, we fall
3070 back into the quick-grab loop below */
3077 if (RExC_flags & PMf_EXTENDED) {
3078 while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
3079 if (RExC_parse < RExC_end)
3085 register STRLEN len;
3091 U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
3093 parse_start = RExC_parse - 1;
3099 ret = reg_node(pRExC_state,
3100 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
3102 for (len = 0, p = RExC_parse - 1;
3103 len < 127 && p < RExC_end;
3108 if (RExC_flags & PMf_EXTENDED)
3109 p = regwhite(p, RExC_end);
3156 ender = ASCII_TO_NATIVE('\033');
3160 ender = ASCII_TO_NATIVE('\007');
3165 char* e = strchr(p, '}');
3169 vFAIL("Missing right brace on \\x{}");
3172 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3173 | PERL_SCAN_DISALLOW_PREFIX;
3175 ender = grok_hex(p + 1, &numlen, &flags, NULL);
3182 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3184 ender = grok_hex(p, &numlen, &flags, NULL);
3190 ender = UCHARAT(p++);
3191 ender = toCTRL(ender);
3193 case '0': case '1': case '2': case '3':case '4':
3194 case '5': case '6': case '7': case '8':case '9':
3196 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
3199 ender = grok_oct(p, &numlen, &flags, NULL);
3209 FAIL("Trailing \\");
3212 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
3213 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
3214 goto normal_default;
3219 if (UTF8_IS_START(*p) && UTF) {
3220 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
3228 if (RExC_flags & PMf_EXTENDED)
3229 p = regwhite(p, RExC_end);
3231 /* Prime the casefolded buffer. */
3232 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
3234 if (ISMULT2(p)) { /* Back off on ?+*. */
3241 /* Emit all the Unicode characters. */
3242 for (foldbuf = tmpbuf;
3244 foldlen -= numlen) {
3245 ender = utf8_to_uvchr(foldbuf, &numlen);
3247 reguni(pRExC_state, ender, s, &unilen);
3250 /* In EBCDIC the numlen
3251 * and unilen can differ. */
3253 if (numlen >= foldlen)
3257 break; /* "Can't happen." */
3261 reguni(pRExC_state, ender, s, &unilen);
3270 REGC((char)ender, s++);
3278 /* Emit all the Unicode characters. */
3279 for (foldbuf = tmpbuf;
3281 foldlen -= numlen) {
3282 ender = utf8_to_uvchr(foldbuf, &numlen);
3284 reguni(pRExC_state, ender, s, &unilen);
3287 /* In EBCDIC the numlen
3288 * and unilen can differ. */
3290 if (numlen >= foldlen)
3298 reguni(pRExC_state, ender, s, &unilen);
3307 REGC((char)ender, s++);
3311 Set_Node_Cur_Length(ret); /* MJD */
3312 nextchar(pRExC_state);
3314 /* len is STRLEN which is unsigned, need to copy to signed */
3317 vFAIL("Internal disaster");
3321 if (len == 1 && UNI_IS_INVARIANT(ender))
3326 RExC_size += STR_SZ(len);
3328 RExC_emit += STR_SZ(len);
3333 /* If the encoding pragma is in effect recode the text of
3334 * any EXACT-kind nodes. */
3335 if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
3336 STRLEN oldlen = STR_LEN(ret);
3337 SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
3341 if (sv_utf8_downgrade(sv, TRUE)) {
3342 char *s = sv_recode_to_utf8(sv, PL_encoding);
3343 STRLEN newlen = SvCUR(sv);
3348 DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
3349 (int)oldlen, STRING(ret),
3351 Copy(s, STRING(ret), newlen, char);
3352 STR_LEN(ret) += newlen - oldlen;
3353 RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
3355 RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
3363 S_regwhite(pTHX_ char *p, char *e)
3368 else if (*p == '#') {
3371 } while (p < e && *p != '\n');
3379 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
3380 Character classes ([:foo:]) can also be negated ([:^foo:]).
3381 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
3382 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
3383 but trigger failures because they are currently unimplemented. */
3385 #define POSIXCC_DONE(c) ((c) == ':')
3386 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
3387 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
3390 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
3393 I32 namedclass = OOB_NAMEDCLASS;
3395 if (value == '[' && RExC_parse + 1 < RExC_end &&
3396 /* I smell either [: or [= or [. -- POSIX has been here, right? */
3397 POSIXCC(UCHARAT(RExC_parse))) {
3398 char c = UCHARAT(RExC_parse);
3399 char* s = RExC_parse++;
3401 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
3403 if (RExC_parse == RExC_end)
3404 /* Grandfather lone [:, [=, [. */
3407 char* t = RExC_parse++; /* skip over the c */
3409 if (UCHARAT(RExC_parse) == ']') {
3410 RExC_parse++; /* skip over the ending ] */
3413 I32 complement = *posixcc == '^' ? *posixcc++ : 0;
3414 I32 skip = 5; /* the most common skip */
3418 if (strnEQ(posixcc, "alnum", 5))
3420 complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
3421 else if (strnEQ(posixcc, "alpha", 5))
3423 complement ? ANYOF_NALPHA : ANYOF_ALPHA;
3424 else if (strnEQ(posixcc, "ascii", 5))
3426 complement ? ANYOF_NASCII : ANYOF_ASCII;
3429 if (strnEQ(posixcc, "blank", 5))
3431 complement ? ANYOF_NBLANK : ANYOF_BLANK;
3434 if (strnEQ(posixcc, "cntrl", 5))
3436 complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
3439 if (strnEQ(posixcc, "digit", 5))
3441 complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
3444 if (strnEQ(posixcc, "graph", 5))
3446 complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
3449 if (strnEQ(posixcc, "lower", 5))
3451 complement ? ANYOF_NLOWER : ANYOF_LOWER;
3454 if (strnEQ(posixcc, "print", 5))
3456 complement ? ANYOF_NPRINT : ANYOF_PRINT;
3457 else if (strnEQ(posixcc, "punct", 5))
3459 complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
3462 if (strnEQ(posixcc, "space", 5))
3464 complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
3467 if (strnEQ(posixcc, "upper", 5))
3469 complement ? ANYOF_NUPPER : ANYOF_UPPER;
3471 case 'w': /* this is not POSIX, this is the Perl \w */
3472 if (strnEQ(posixcc, "word", 4)) {
3474 complement ? ANYOF_NALNUM : ANYOF_ALNUM;
3479 if (strnEQ(posixcc, "xdigit", 6)) {
3481 complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
3486 if (namedclass == OOB_NAMEDCLASS ||
3487 posixcc[skip] != ':' ||
3488 posixcc[skip+1] != ']')
3490 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
3493 } else if (!SIZE_ONLY) {
3494 /* [[=foo=]] and [[.foo.]] are still future. */
3496 /* adjust RExC_parse so the warning shows after
3498 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
3500 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3503 /* Maternal grandfather:
3504 * "[:" ending in ":" but not in ":]" */
3514 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
3516 if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
3517 char *s = RExC_parse;
3520 while(*s && isALNUM(*s))
3522 if (*s && c == *s && s[1] == ']') {
3523 if (ckWARN(WARN_REGEXP))
3525 "POSIX syntax [%c %c] belongs inside character classes",
3528 /* [[=foo=]] and [[.foo.]] are still future. */
3529 if (POSIXCC_NOTYET(c)) {
3530 /* adjust RExC_parse so the error shows after
3532 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3534 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
3541 S_regclass(pTHX_ RExC_state_t *pRExC_state)
3544 register UV nextvalue;
3545 register IV prevvalue = OOB_UNICODE;
3546 register IV range = 0;
3547 register regnode *ret;
3550 char *rangebegin = 0;
3551 bool need_class = 0;
3552 SV *listsv = Nullsv;
3555 bool optimize_invert = TRUE;
3556 AV* unicode_alternate = 0;
3558 UV literal_endpoint = 0;
3561 ret = reganode(pRExC_state, ANYOF, 0);
3564 ANYOF_FLAGS(ret) = 0;
3566 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
3570 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
3574 RExC_size += ANYOF_SKIP;
3576 RExC_emit += ANYOF_SKIP;
3578 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
3580 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3581 ANYOF_BITMAP_ZERO(ret);
3582 listsv = newSVpvn("# comment\n", 10);
3585 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3587 if (!SIZE_ONLY && POSIXCC(nextvalue))
3588 checkposixcc(pRExC_state);
3590 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
3591 if (UCHARAT(RExC_parse) == ']')
3594 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
3598 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
3601 rangebegin = RExC_parse;
3603 value = utf8n_to_uvchr((U8*)RExC_parse,
3604 RExC_end - RExC_parse,
3606 RExC_parse += numlen;
3609 value = UCHARAT(RExC_parse++);
3610 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
3611 if (value == '[' && POSIXCC(nextvalue))
3612 namedclass = regpposixcc(pRExC_state, value);
3613 else if (value == '\\') {
3615 value = utf8n_to_uvchr((U8*)RExC_parse,
3616 RExC_end - RExC_parse,
3618 RExC_parse += numlen;
3621 value = UCHARAT(RExC_parse++);
3622 /* Some compilers cannot handle switching on 64-bit integer
3623 * values, therefore value cannot be an UV. Yes, this will
3624 * be a problem later if we want switch on Unicode.
3625 * A similar issue a little bit later when switching on
3626 * namedclass. --jhi */
3627 switch ((I32)value) {
3628 case 'w': namedclass = ANYOF_ALNUM; break;
3629 case 'W': namedclass = ANYOF_NALNUM; break;
3630 case 's': namedclass = ANYOF_SPACE; break;
3631 case 'S': namedclass = ANYOF_NSPACE; break;
3632 case 'd': namedclass = ANYOF_DIGIT; break;
3633 case 'D': namedclass = ANYOF_NDIGIT; break;
3636 if (RExC_parse >= RExC_end)
3637 vFAIL2("Empty \\%c{}", (U8)value);
3638 if (*RExC_parse == '{') {
3640 e = strchr(RExC_parse++, '}');
3642 vFAIL2("Missing right brace on \\%c{}", c);
3643 while (isSPACE(UCHARAT(RExC_parse)))
3645 if (e == RExC_parse)
3646 vFAIL2("Empty \\%c{}", c);
3648 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
3656 if (UCHARAT(RExC_parse) == '^') {
3659 value = value == 'p' ? 'P' : 'p'; /* toggle */
3660 while (isSPACE(UCHARAT(RExC_parse))) {
3666 Perl_sv_catpvf(aTHX_ listsv,
3667 "+utf8::%.*s\n", (int)n, RExC_parse);
3669 Perl_sv_catpvf(aTHX_ listsv,
3670 "!utf8::%.*s\n", (int)n, RExC_parse);
3673 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3674 namedclass = ANYOF_MAX; /* no official name, but it's named */
3676 case 'n': value = '\n'; break;
3677 case 'r': value = '\r'; break;
3678 case 't': value = '\t'; break;
3679 case 'f': value = '\f'; break;
3680 case 'b': value = '\b'; break;
3681 case 'e': value = ASCII_TO_NATIVE('\033');break;
3682 case 'a': value = ASCII_TO_NATIVE('\007');break;
3684 if (*RExC_parse == '{') {
3685 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
3686 | PERL_SCAN_DISALLOW_PREFIX;
3687 e = strchr(RExC_parse++, '}');
3689 vFAIL("Missing right brace on \\x{}");
3691 numlen = e - RExC_parse;
3692 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3696 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
3698 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
3699 RExC_parse += numlen;
3703 value = UCHARAT(RExC_parse++);
3704 value = toCTRL(value);
3706 case '0': case '1': case '2': case '3': case '4':
3707 case '5': case '6': case '7': case '8': case '9':
3711 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
3712 RExC_parse += numlen;
3716 if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
3718 "Unrecognized escape \\%c in character class passed through",
3722 } /* end of \blah */
3728 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
3730 if (!SIZE_ONLY && !need_class)
3731 ANYOF_CLASS_ZERO(ret);
3735 /* a bad range like a-\d, a-[:digit:] ? */
3738 if (ckWARN(WARN_REGEXP))
3740 "False [] range \"%*.*s\"",
3741 RExC_parse - rangebegin,
3742 RExC_parse - rangebegin,
3744 if (prevvalue < 256) {
3745 ANYOF_BITMAP_SET(ret, prevvalue);
3746 ANYOF_BITMAP_SET(ret, '-');
3749 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
3750 Perl_sv_catpvf(aTHX_ listsv,
3751 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
3755 range = 0; /* this was not a true range */
3759 if (namedclass > OOB_NAMEDCLASS)
3760 optimize_invert = FALSE;
3761 /* Possible truncation here but in some 64-bit environments
3762 * the compiler gets heartburn about switch on 64-bit values.
3763 * A similar issue a little earlier when switching on value.
3765 switch ((I32)namedclass) {
3768 ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
3770 for (value = 0; value < 256; value++)
3772 ANYOF_BITMAP_SET(ret, value);
3774 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
3778 ANYOF_CLASS_SET(ret, ANYOF_NALNUM);
3780 for (value = 0; value < 256; value++)
3781 if (!isALNUM(value))
3782 ANYOF_BITMAP_SET(ret, value);
3784 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
3788 ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
3790 for (value = 0; value < 256; value++)
3791 if (isALNUMC(value))
3792 ANYOF_BITMAP_SET(ret, value);
3794 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
3798 ANYOF_CLASS_SET(ret, ANYOF_NALNUMC);
3800 for (value = 0; value < 256; value++)
3801 if (!isALNUMC(value))
3802 ANYOF_BITMAP_SET(ret, value);
3804 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
3808 ANYOF_CLASS_SET(ret, ANYOF_ALPHA);
3810 for (value = 0; value < 256; value++)
3812 ANYOF_BITMAP_SET(ret, value);
3814 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
3818 ANYOF_CLASS_SET(ret, ANYOF_NALPHA);
3820 for (value = 0; value < 256; value++)
3821 if (!isALPHA(value))
3822 ANYOF_BITMAP_SET(ret, value);
3824 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
3828 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
3831 for (value = 0; value < 128; value++)
3832 ANYOF_BITMAP_SET(ret, value);
3834 for (value = 0; value < 256; value++) {
3836 ANYOF_BITMAP_SET(ret, value);
3840 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
3844 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
3847 for (value = 128; value < 256; value++)
3848 ANYOF_BITMAP_SET(ret, value);
3850 for (value = 0; value < 256; value++) {
3851 if (!isASCII(value))
3852 ANYOF_BITMAP_SET(ret, value);
3856 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
3860 ANYOF_CLASS_SET(ret, ANYOF_BLANK);
3862 for (value = 0; value < 256; value++)
3864 ANYOF_BITMAP_SET(ret, value);
3866 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
3870 ANYOF_CLASS_SET(ret, ANYOF_NBLANK);
3872 for (value = 0; value < 256; value++)
3873 if (!isBLANK(value))
3874 ANYOF_BITMAP_SET(ret, value);
3876 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
3880 ANYOF_CLASS_SET(ret, ANYOF_CNTRL);
3882 for (value = 0; value < 256; value++)
3884 ANYOF_BITMAP_SET(ret, value);
3886 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
3890 ANYOF_CLASS_SET(ret, ANYOF_NCNTRL);
3892 for (value = 0; value < 256; value++)
3893 if (!isCNTRL(value))
3894 ANYOF_BITMAP_SET(ret, value);
3896 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
3900 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
3902 /* consecutive digits assumed */
3903 for (value = '0'; value <= '9'; value++)
3904 ANYOF_BITMAP_SET(ret, value);
3906 Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
3910 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
3912 /* consecutive digits assumed */
3913 for (value = 0; value < '0'; value++)
3914 ANYOF_BITMAP_SET(ret, value);
3915 for (value = '9' + 1; value < 256; value++)
3916 ANYOF_BITMAP_SET(ret, value);
3918 Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
3922 ANYOF_CLASS_SET(ret, ANYOF_GRAPH);
3924 for (value = 0; value < 256; value++)