5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *paren_names; /* Paren names */
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
142 char *starttry; /* -Dr: where regtry was called. */
143 #define RExC_starttry (pRExC_state->starttry)
146 const char *lastparse;
148 AV *paren_name_list; /* idx -> name */
149 #define RExC_lastparse (pRExC_state->lastparse)
150 #define RExC_lastnum (pRExC_state->lastnum)
151 #define RExC_paren_name_list (pRExC_state->paren_name_list)
155 #define RExC_flags (pRExC_state->flags)
156 #define RExC_precomp (pRExC_state->precomp)
157 #define RExC_rx_sv (pRExC_state->rx_sv)
158 #define RExC_rx (pRExC_state->rx)
159 #define RExC_rxi (pRExC_state->rxi)
160 #define RExC_start (pRExC_state->start)
161 #define RExC_end (pRExC_state->end)
162 #define RExC_parse (pRExC_state->parse)
163 #define RExC_whilem_seen (pRExC_state->whilem_seen)
164 #ifdef RE_TRACK_PATTERN_OFFSETS
165 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
167 #define RExC_emit (pRExC_state->emit)
168 #define RExC_emit_start (pRExC_state->emit_start)
169 #define RExC_emit_bound (pRExC_state->emit_bound)
170 #define RExC_naughty (pRExC_state->naughty)
171 #define RExC_sawback (pRExC_state->sawback)
172 #define RExC_seen (pRExC_state->seen)
173 #define RExC_size (pRExC_state->size)
174 #define RExC_npar (pRExC_state->npar)
175 #define RExC_nestroot (pRExC_state->nestroot)
176 #define RExC_extralen (pRExC_state->extralen)
177 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178 #define RExC_seen_evals (pRExC_state->seen_evals)
179 #define RExC_utf8 (pRExC_state->utf8)
180 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
205 #define SPSTART 0x04 /* Starts with * or +. */
206 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
207 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
211 /* whether trie related optimizations are enabled */
212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213 #define TRIE_STUDY_OPT
214 #define FULL_TRIE_STUDY
220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221 #define PBITVAL(paren) (1 << ((paren) & 7))
222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
226 /* If not already in utf8, do a longjmp back to the beginning */
227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228 #define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
232 /* About scan_data_t.
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
248 The strings can be composites, for instance
252 will result in a composite fixed substring 'foo'.
254 For each string some basic information is maintained:
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
261 tells us how many characters must appear after the string we have
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. If set to I32 max it indicates that the
267 string can occur infinitely far to the right.
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
307 typedef struct scan_data_t {
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
313 I32 last_end; /* min value, <0 unless valid. */
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
329 struct regnode_charclass_class *start_class;
333 * Forward declarations for pregcomp()'s friends.
336 static const scan_data_t zero_scan_data =
337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
339 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
340 #define SF_BEFORE_SEOL 0x0001
341 #define SF_BEFORE_MEOL 0x0002
342 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
346 # define SF_FIX_SHIFT_EOL (0+2)
347 # define SF_FL_SHIFT_EOL (0+4)
349 # define SF_FIX_SHIFT_EOL (+2)
350 # define SF_FL_SHIFT_EOL (+4)
353 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
356 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
358 #define SF_IS_INF 0x0040
359 #define SF_HAS_PAR 0x0080
360 #define SF_IN_PAR 0x0100
361 #define SF_HAS_EVAL 0x0200
362 #define SCF_DO_SUBSTR 0x0400
363 #define SCF_DO_STCLASS_AND 0x0800
364 #define SCF_DO_STCLASS_OR 0x1000
365 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
366 #define SCF_WHILEM_VISITED_POS 0x2000
368 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
369 #define SCF_SEEN_ACCEPT 0x8000
371 #define UTF cBOOL(RExC_utf8)
372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
376 #define OOB_UNICODE 12345678
377 #define OOB_NAMEDCLASS -1
379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
383 /* length of regex to show in messages that don't mark a position within */
384 #define RegexLengthToShowInErrorMessages 127
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
391 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
392 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
401 #define _FAIL(code) STMT_START { \
402 const char *ellipses = ""; \
403 IV len = RExC_end - RExC_precomp; \
406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
415 #define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
419 #define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
426 #define Simple_vFAIL(m) STMT_START { \
427 const IV offset = RExC_parse - RExC_precomp; \
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
435 #define vFAIL(m) STMT_START { \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
442 * Like Simple_vFAIL(), but accepts two arguments.
444 #define Simple_vFAIL2(m,a1) STMT_START { \
445 const IV offset = RExC_parse - RExC_precomp; \
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
453 #define vFAIL2(m,a1) STMT_START { \
455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
456 Simple_vFAIL2(m, a1); \
461 * Like Simple_vFAIL(), but accepts three arguments.
463 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
464 const IV offset = RExC_parse - RExC_precomp; \
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
472 #define vFAIL3(m,a1,a2) STMT_START { \
474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
475 Simple_vFAIL3(m, a1, a2); \
479 * Like Simple_vFAIL(), but accepts four arguments.
481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
482 const IV offset = RExC_parse - RExC_precomp; \
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define ckWARNreg(loc,m) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define ckWARNregdep(loc,m) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 #define ckWARN2reg(loc, m, a1) STMT_START { \
501 const IV offset = loc - RExC_precomp; \
502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
506 #define vWARN3(loc, m, a1, a2) STMT_START { \
507 const IV offset = loc - RExC_precomp; \
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
518 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
519 const IV offset = loc - RExC_precomp; \
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 /* Allow for side effects in s */
538 #define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
542 /* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
546 * Position is 1 indexed.
548 #ifndef RE_TRACK_PATTERN_OFFSETS
549 #define Set_Node_Offset_To_R(node,byte)
550 #define Set_Node_Offset(node,byte)
551 #define Set_Cur_Node_Offset
552 #define Set_Node_Length_To_R(node,len)
553 #define Set_Node_Length(node,len)
554 #define Set_Node_Cur_Length(node)
555 #define Node_Offset(n)
556 #define Node_Length(n)
557 #define Set_Node_Offset_Length(node,offset,len)
558 #define ProgLen(ri) ri->u.proglen
559 #define SetProgLen(ri,x) ri->u.proglen = x
561 #define ProgLen(ri) ri->u.offsets[0]
562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
563 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
566 __LINE__, (int)(node), (int)(byte))); \
568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
570 RExC_offsets[2*(node)-1] = (byte); \
575 #define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
579 #define Set_Node_Length_To_R(node,len) STMT_START { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
582 __LINE__, (int)(node), (int)(len))); \
584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
586 RExC_offsets[2*(node)] = (len); \
591 #define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594 #define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
597 /* Get offsets and lengths */
598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
601 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608 #define EXPERIMENTAL_INPLACESCAN
609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
611 #define DEBUG_STUDYDATA(str,data,depth) \
612 DEBUG_OPTIMISE_MORE_r(if(data){ \
613 PerlIO_printf(Perl_debug_log, \
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
619 (UV)((data)->flags), \
620 (IV)((data)->whilem_c), \
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
642 PerlIO_printf(Perl_debug_log,"\n"); \
645 static void clear_re(pTHX_ void *r);
647 /* Mark that we cannot extend a found fixed substring at this point.
648 Update the longest found anchored substring and the longest found
649 floating substrings if needed. */
652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
656 GET_RE_DEBUG_FLAGS_DECL;
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
661 SvSetMagicSV(*data->longest, data->last_found);
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
668 data->flags &= ~SF_FIX_BEFORE_EOL;
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
672 else { /* *data->longest == data->longest_float */
673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
674 data->offset_float_max = (l
675 ? data->last_start_max
676 : data->pos_min + data->pos_delta);
677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
678 data->offset_float_max = I32_MAX;
679 if (data->flags & SF_BEFORE_EOL)
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
683 data->flags &= ~SF_FL_BEFORE_EOL;
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
688 SvCUR_set(data->last_found, 0);
690 SV * const sv = data->last_found;
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
698 data->flags &= ~SF_BEFORE_EOL;
699 DEBUG_STUDYDATA("commit: ",data,0);
702 /* Can match anything (initialization) */
704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 PERL_ARGS_ASSERT_CL_ANYTHING;
708 ANYOF_CLASS_ZERO(cl);
709 ANYOF_BITMAP_SETALL(cl);
710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
712 cl->flags |= ANYOF_LOCALE;
713 cl->flags |= ANYOF_FOLD;
716 /* Can match anything (initialization) */
718 S_cl_is_anything(const struct regnode_charclass_class *cl)
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
724 for (value = 0; value <= ANYOF_MAX; value += 2)
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
734 /* Can match anything (initialization) */
736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
738 PERL_ARGS_ASSERT_CL_INIT;
740 Zero(cl, 1, struct regnode_charclass_class);
742 cl_anything(pRExC_state, cl);
746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
750 Zero(cl, 1, struct regnode_charclass_class);
752 cl_anything(pRExC_state, cl);
754 cl->flags |= ANYOF_LOCALE;
757 /* 'And' a given class with another one. Can create false positives */
758 /* We assume that cl is not inverted */
760 S_cl_and(struct regnode_charclass_class *cl,
761 const struct regnode_charclass_class *and_with)
763 PERL_ARGS_ASSERT_CL_AND;
765 assert(and_with->type == ANYOF);
766 if (!(and_with->flags & ANYOF_CLASS)
767 && !(cl->flags & ANYOF_CLASS)
768 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(and_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD)) {
773 if (and_with->flags & ANYOF_INVERT)
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] &= ~and_with->bitmap[i];
777 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
778 cl->bitmap[i] &= and_with->bitmap[i];
779 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
780 if (!(and_with->flags & ANYOF_EOS))
781 cl->flags &= ~ANYOF_EOS;
783 if (!(and_with->flags & ANYOF_FOLD))
784 cl->flags &= ~ANYOF_FOLD;
786 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
787 !(and_with->flags & ANYOF_INVERT)) {
788 cl->flags &= ~ANYOF_UNICODE_ALL;
789 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
792 ARG_SET(cl, ARG(and_with));
794 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
795 !(and_with->flags & ANYOF_INVERT))
796 cl->flags &= ~ANYOF_UNICODE_ALL;
797 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
798 !(and_with->flags & ANYOF_INVERT))
799 cl->flags &= ~ANYOF_NONBITMAP;
802 /* 'OR' a given class with another one. Can create false positives */
803 /* We assume that cl is not inverted */
805 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
807 PERL_ARGS_ASSERT_CL_OR;
809 if (or_with->flags & ANYOF_INVERT) {
811 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
812 * <= (B1 | !B2) | (CL1 | !CL2)
813 * which is wasteful if CL2 is small, but we ignore CL2:
814 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
815 * XXXX Can we handle case-fold? Unclear:
816 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
817 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
819 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
820 && !(or_with->flags & ANYOF_FOLD)
821 && !(cl->flags & ANYOF_FOLD) ) {
824 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
825 cl->bitmap[i] |= ~or_with->bitmap[i];
826 } /* XXXX: logic is complicated otherwise */
828 cl_anything(pRExC_state, cl);
831 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
832 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
833 && (!(or_with->flags & ANYOF_FOLD)
834 || (cl->flags & ANYOF_FOLD)) ) {
837 /* OR char bitmap and class bitmap separately */
838 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
839 cl->bitmap[i] |= or_with->bitmap[i];
840 if (or_with->flags & ANYOF_CLASS) {
841 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
842 cl->classflags[i] |= or_with->classflags[i];
843 cl->flags |= ANYOF_CLASS;
846 else { /* XXXX: logic is complicated, leave it along for a moment. */
847 cl_anything(pRExC_state, cl);
850 if (or_with->flags & ANYOF_EOS)
851 cl->flags |= ANYOF_EOS;
853 if (or_with->flags & ANYOF_FOLD)
854 cl->flags |= ANYOF_FOLD;
856 /* If both nodes match something outside the bitmap, but what they match
857 * outside is not the same pointer, and hence not easily compared, give up
858 * and allow the start class to match everything outside the bitmap */
859 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
860 ARG(cl) != ARG(or_with)) {
861 cl->flags |= ANYOF_UNICODE_ALL;
864 if (or_with->flags & ANYOF_UNICODE_ALL) {
865 cl->flags |= ANYOF_UNICODE_ALL;
869 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
870 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
871 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
872 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
877 dump_trie(trie,widecharmap,revcharmap)
878 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
879 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
881 These routines dump out a trie in a somewhat readable format.
882 The _interim_ variants are used for debugging the interim
883 tables that are used to generate the final compressed
884 representation which is what dump_trie expects.
886 Part of the reason for their existance is to provide a form
887 of documentation as to how the different representations function.
892 Dumps the final compressed table form of the trie to Perl_debug_log.
893 Used for debugging make_trie().
897 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
898 AV *revcharmap, U32 depth)
901 SV *sv=sv_newmortal();
902 int colwidth= widecharmap ? 6 : 4;
904 GET_RE_DEBUG_FLAGS_DECL;
906 PERL_ARGS_ASSERT_DUMP_TRIE;
908 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
909 (int)depth * 2 + 2,"",
910 "Match","Base","Ofs" );
912 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
913 SV ** const tmp = av_fetch( revcharmap, state, 0);
915 PerlIO_printf( Perl_debug_log, "%*s",
917 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
918 PL_colors[0], PL_colors[1],
919 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
920 PERL_PV_ESCAPE_FIRSTCHAR
925 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
926 (int)depth * 2 + 2,"");
928 for( state = 0 ; state < trie->uniquecharcount ; state++ )
929 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
930 PerlIO_printf( Perl_debug_log, "\n");
932 for( state = 1 ; state < trie->statecount ; state++ ) {
933 const U32 base = trie->states[ state ].trans.base;
935 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
937 if ( trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
940 PerlIO_printf( Perl_debug_log, "%6s", "" );
943 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
948 while( ( base + ofs < trie->uniquecharcount ) ||
949 ( base + ofs - trie->uniquecharcount < trie->lasttrans
950 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
953 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
955 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
956 if ( ( base + ofs >= trie->uniquecharcount ) &&
957 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
958 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
960 PerlIO_printf( Perl_debug_log, "%*"UVXf,
962 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
964 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
968 PerlIO_printf( Perl_debug_log, "]");
971 PerlIO_printf( Perl_debug_log, "\n" );
973 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
974 for (word=1; word <= trie->wordcount; word++) {
975 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
976 (int)word, (int)(trie->wordinfo[word].prev),
977 (int)(trie->wordinfo[word].len));
979 PerlIO_printf(Perl_debug_log, "\n" );
982 Dumps a fully constructed but uncompressed trie in list form.
983 List tries normally only are used for construction when the number of
984 possible chars (trie->uniquecharcount) is very high.
985 Used for debugging make_trie().
988 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
989 HV *widecharmap, AV *revcharmap, U32 next_alloc,
993 SV *sv=sv_newmortal();
994 int colwidth= widecharmap ? 6 : 4;
995 GET_RE_DEBUG_FLAGS_DECL;
997 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
999 /* print out the table precompression. */
1000 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1001 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1002 "------:-----+-----------------\n" );
1004 for( state=1 ; state < next_alloc ; state ++ ) {
1007 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1008 (int)depth * 2 + 2,"", (UV)state );
1009 if ( ! trie->states[ state ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, "%5s| ","");
1012 PerlIO_printf( Perl_debug_log, "W%4x| ",
1013 trie->states[ state ].wordnum
1016 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1017 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1019 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1022 PL_colors[0], PL_colors[1],
1023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1024 PERL_PV_ESCAPE_FIRSTCHAR
1026 TRIE_LIST_ITEM(state,charid).forid,
1027 (UV)TRIE_LIST_ITEM(state,charid).newstate
1030 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1031 (int)((depth * 2) + 14), "");
1034 PerlIO_printf( Perl_debug_log, "\n");
1039 Dumps a fully constructed but uncompressed trie in table form.
1040 This is the normal DFA style state transition table, with a few
1041 twists to facilitate compression later.
1042 Used for debugging make_trie().
1045 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1046 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1051 SV *sv=sv_newmortal();
1052 int colwidth= widecharmap ? 6 : 4;
1053 GET_RE_DEBUG_FLAGS_DECL;
1055 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1058 print out the table precompression so that we can do a visual check
1059 that they are identical.
1062 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1065 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1067 PerlIO_printf( Perl_debug_log, "%*s",
1069 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1070 PL_colors[0], PL_colors[1],
1071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1072 PERL_PV_ESCAPE_FIRSTCHAR
1078 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1080 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1081 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1084 PerlIO_printf( Perl_debug_log, "\n" );
1086 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1088 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1089 (int)depth * 2 + 2,"",
1090 (UV)TRIE_NODENUM( state ) );
1092 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1093 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1095 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1097 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1099 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1100 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1102 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1103 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1111 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1112 startbranch: the first branch in the whole branch sequence
1113 first : start branch of sequence of branch-exact nodes.
1114 May be the same as startbranch
1115 last : Thing following the last branch.
1116 May be the same as tail.
1117 tail : item following the branch sequence
1118 count : words in the sequence
1119 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1120 depth : indent depth
1122 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1124 A trie is an N'ary tree where the branches are determined by digital
1125 decomposition of the key. IE, at the root node you look up the 1st character and
1126 follow that branch repeat until you find the end of the branches. Nodes can be
1127 marked as "accepting" meaning they represent a complete word. Eg:
1131 would convert into the following structure. Numbers represent states, letters
1132 following numbers represent valid transitions on the letter from that state, if
1133 the number is in square brackets it represents an accepting state, otherwise it
1134 will be in parenthesis.
1136 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1140 (1) +-i->(6)-+-s->[7]
1142 +-s->(3)-+-h->(4)-+-e->[5]
1144 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1146 This shows that when matching against the string 'hers' we will begin at state 1
1147 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1148 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1149 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1150 single traverse. We store a mapping from accepting to state to which word was
1151 matched, and then when we have multiple possibilities we try to complete the
1152 rest of the regex in the order in which they occured in the alternation.
1154 The only prior NFA like behaviour that would be changed by the TRIE support is
1155 the silent ignoring of duplicate alternations which are of the form:
1157 / (DUPE|DUPE) X? (?{ ... }) Y /x
1159 Thus EVAL blocks following a trie may be called a different number of times with
1160 and without the optimisation. With the optimisations dupes will be silently
1161 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1162 the following demonstrates:
1164 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1166 which prints out 'word' three times, but
1168 'words'=~/(word|word|word)(?{ print $1 })S/
1170 which doesnt print it out at all. This is due to other optimisations kicking in.
1172 Example of what happens on a structural level:
1174 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1176 1: CURLYM[1] {1,32767}(18)
1187 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1188 and should turn into:
1190 1: CURLYM[1] {1,32767}(18)
1192 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1200 Cases where tail != last would be like /(?foo|bar)baz/:
1210 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1211 and would end up looking like:
1214 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1221 d = uvuni_to_utf8_flags(d, uv, 0);
1223 is the recommended Unicode-aware way of saying
1228 #define TRIE_STORE_REVCHAR \
1231 SV *zlopp = newSV(2); \
1232 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1233 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1234 SvCUR_set(zlopp, kapow - flrbbbbb); \
1237 av_push(revcharmap, zlopp); \
1239 char ooooff = (char)uvc; \
1240 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1244 #define TRIE_READ_CHAR STMT_START { \
1248 if ( foldlen > 0 ) { \
1249 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1254 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1255 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1256 foldlen -= UNISKIP( uvc ); \
1257 scan = foldbuf + UNISKIP( uvc ); \
1260 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1270 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1271 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1272 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1273 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1275 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1276 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1277 TRIE_LIST_CUR( state )++; \
1280 #define TRIE_LIST_NEW(state) STMT_START { \
1281 Newxz( trie->states[ state ].trans.list, \
1282 4, reg_trie_trans_le ); \
1283 TRIE_LIST_CUR( state ) = 1; \
1284 TRIE_LIST_LEN( state ) = 4; \
1287 #define TRIE_HANDLE_WORD(state) STMT_START { \
1288 U16 dupe= trie->states[ state ].wordnum; \
1289 regnode * const noper_next = regnext( noper ); \
1292 /* store the word for dumping */ \
1294 if (OP(noper) != NOTHING) \
1295 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1297 tmp = newSVpvn_utf8( "", 0, UTF ); \
1298 av_push( trie_words, tmp ); \
1302 trie->wordinfo[curword].prev = 0; \
1303 trie->wordinfo[curword].len = wordlen; \
1304 trie->wordinfo[curword].accept = state; \
1306 if ( noper_next < tail ) { \
1308 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1309 trie->jump[curword] = (U16)(noper_next - convert); \
1311 jumper = noper_next; \
1313 nextbranch= regnext(cur); \
1317 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1318 /* chain, so that when the bits of chain are later */\
1319 /* linked together, the dups appear in the chain */\
1320 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1321 trie->wordinfo[dupe].prev = curword; \
1323 /* we haven't inserted this word yet. */ \
1324 trie->states[ state ].wordnum = curword; \
1329 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1330 ( ( base + charid >= ucharcount \
1331 && base + charid < ubound \
1332 && state == trie->trans[ base - ucharcount + charid ].check \
1333 && trie->trans[ base - ucharcount + charid ].next ) \
1334 ? trie->trans[ base - ucharcount + charid ].next \
1335 : ( state==1 ? special : 0 ) \
1339 #define MADE_JUMP_TRIE 2
1340 #define MADE_EXACT_TRIE 4
1343 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1346 /* first pass, loop through and scan words */
1347 reg_trie_data *trie;
1348 HV *widecharmap = NULL;
1349 AV *revcharmap = newAV();
1351 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1356 regnode *jumper = NULL;
1357 regnode *nextbranch = NULL;
1358 regnode *convert = NULL;
1359 U32 *prev_states; /* temp array mapping each state to previous one */
1360 /* we just use folder as a flag in utf8 */
1361 const U8 * folder = NULL;
1364 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1365 AV *trie_words = NULL;
1366 /* along with revcharmap, this only used during construction but both are
1367 * useful during debugging so we store them in the struct when debugging.
1370 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1371 STRLEN trie_charcount=0;
1373 SV *re_trie_maxbuff;
1374 GET_RE_DEBUG_FLAGS_DECL;
1376 PERL_ARGS_ASSERT_MAKE_TRIE;
1378 PERL_UNUSED_ARG(depth);
1382 case EXACTFU: folder = PL_fold_latin1; break;
1383 case EXACTF: folder = PL_fold; break;
1384 case EXACTFL: folder = PL_fold_locale; break;
1387 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1389 trie->startstate = 1;
1390 trie->wordcount = word_count;
1391 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1392 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1393 if (!(UTF && folder))
1394 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1395 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1396 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1399 trie_words = newAV();
1402 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1403 if (!SvIOK(re_trie_maxbuff)) {
1404 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1407 PerlIO_printf( Perl_debug_log,
1408 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1409 (int)depth * 2 + 2, "",
1410 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1411 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1415 /* Find the node we are going to overwrite */
1416 if ( first == startbranch && OP( last ) != BRANCH ) {
1417 /* whole branch chain */
1420 /* branch sub-chain */
1421 convert = NEXTOPER( first );
1424 /* -- First loop and Setup --
1426 We first traverse the branches and scan each word to determine if it
1427 contains widechars, and how many unique chars there are, this is
1428 important as we have to build a table with at least as many columns as we
1431 We use an array of integers to represent the character codes 0..255
1432 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1433 native representation of the character value as the key and IV's for the
1436 *TODO* If we keep track of how many times each character is used we can
1437 remap the columns so that the table compression later on is more
1438 efficient in terms of memory by ensuring the most common value is in the
1439 middle and the least common are on the outside. IMO this would be better
1440 than a most to least common mapping as theres a decent chance the most
1441 common letter will share a node with the least common, meaning the node
1442 will not be compressable. With a middle is most common approach the worst
1443 case is when we have the least common nodes twice.
1447 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1448 regnode * const noper = NEXTOPER( cur );
1449 const U8 *uc = (U8*)STRING( noper );
1450 const U8 * const e = uc + STR_LEN( noper );
1452 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1453 const U8 *scan = (U8*)NULL;
1454 U32 wordlen = 0; /* required init */
1456 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1458 if (OP(noper) == NOTHING) {
1462 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1463 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1464 regardless of encoding */
1466 for ( ; uc < e ; uc += len ) {
1467 TRIE_CHARCOUNT(trie)++;
1471 if ( !trie->charmap[ uvc ] ) {
1472 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1474 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1478 /* store the codepoint in the bitmap, and its folded
1480 TRIE_BITMAP_SET(trie,uvc);
1482 /* store the folded codepoint */
1483 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1486 /* store first byte of utf8 representation of
1487 codepoints in the 127 < uvc < 256 range */
1488 if (127 < uvc && uvc < 192) {
1489 TRIE_BITMAP_SET(trie,194);
1490 } else if (191 < uvc ) {
1491 TRIE_BITMAP_SET(trie,195);
1492 /* && uvc < 256 -- we know uvc is < 256 already */
1495 set_bit = 0; /* We've done our bit :-) */
1500 widecharmap = newHV();
1502 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1505 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1507 if ( !SvTRUE( *svpp ) ) {
1508 sv_setiv( *svpp, ++trie->uniquecharcount );
1513 if( cur == first ) {
1516 } else if (chars < trie->minlen) {
1518 } else if (chars > trie->maxlen) {
1522 } /* end first pass */
1523 DEBUG_TRIE_COMPILE_r(
1524 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1525 (int)depth * 2 + 2,"",
1526 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1527 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1528 (int)trie->minlen, (int)trie->maxlen )
1532 We now know what we are dealing with in terms of unique chars and
1533 string sizes so we can calculate how much memory a naive
1534 representation using a flat table will take. If it's over a reasonable
1535 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1536 conservative but potentially much slower representation using an array
1539 At the end we convert both representations into the same compressed
1540 form that will be used in regexec.c for matching with. The latter
1541 is a form that cannot be used to construct with but has memory
1542 properties similar to the list form and access properties similar
1543 to the table form making it both suitable for fast searches and
1544 small enough that its feasable to store for the duration of a program.
1546 See the comment in the code where the compressed table is produced
1547 inplace from the flat tabe representation for an explanation of how
1548 the compression works.
1553 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1556 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1558 Second Pass -- Array Of Lists Representation
1560 Each state will be represented by a list of charid:state records
1561 (reg_trie_trans_le) the first such element holds the CUR and LEN
1562 points of the allocated array. (See defines above).
1564 We build the initial structure using the lists, and then convert
1565 it into the compressed table form which allows faster lookups
1566 (but cant be modified once converted).
1569 STRLEN transcount = 1;
1571 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1572 "%*sCompiling trie using list compiler\n",
1573 (int)depth * 2 + 2, ""));
1575 trie->states = (reg_trie_state *)
1576 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1577 sizeof(reg_trie_state) );
1581 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1583 regnode * const noper = NEXTOPER( cur );
1584 U8 *uc = (U8*)STRING( noper );
1585 const U8 * const e = uc + STR_LEN( noper );
1586 U32 state = 1; /* required init */
1587 U16 charid = 0; /* sanity init */
1588 U8 *scan = (U8*)NULL; /* sanity init */
1589 STRLEN foldlen = 0; /* required init */
1590 U32 wordlen = 0; /* required init */
1591 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1593 if (OP(noper) != NOTHING) {
1594 for ( ; uc < e ; uc += len ) {
1599 charid = trie->charmap[ uvc ];
1601 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1605 charid=(U16)SvIV( *svpp );
1608 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1615 if ( !trie->states[ state ].trans.list ) {
1616 TRIE_LIST_NEW( state );
1618 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1619 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1620 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1625 newstate = next_alloc++;
1626 prev_states[newstate] = state;
1627 TRIE_LIST_PUSH( state, charid, newstate );
1632 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1636 TRIE_HANDLE_WORD(state);
1638 } /* end second pass */
1640 /* next alloc is the NEXT state to be allocated */
1641 trie->statecount = next_alloc;
1642 trie->states = (reg_trie_state *)
1643 PerlMemShared_realloc( trie->states,
1645 * sizeof(reg_trie_state) );
1647 /* and now dump it out before we compress it */
1648 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1649 revcharmap, next_alloc,
1653 trie->trans = (reg_trie_trans *)
1654 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1661 for( state=1 ; state < next_alloc ; state ++ ) {
1665 DEBUG_TRIE_COMPILE_MORE_r(
1666 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1670 if (trie->states[state].trans.list) {
1671 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1675 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1676 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1677 if ( forid < minid ) {
1679 } else if ( forid > maxid ) {
1683 if ( transcount < tp + maxid - minid + 1) {
1685 trie->trans = (reg_trie_trans *)
1686 PerlMemShared_realloc( trie->trans,
1688 * sizeof(reg_trie_trans) );
1689 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1691 base = trie->uniquecharcount + tp - minid;
1692 if ( maxid == minid ) {
1694 for ( ; zp < tp ; zp++ ) {
1695 if ( ! trie->trans[ zp ].next ) {
1696 base = trie->uniquecharcount + zp - minid;
1697 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1698 trie->trans[ zp ].check = state;
1704 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1705 trie->trans[ tp ].check = state;
1710 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1711 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1712 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1713 trie->trans[ tid ].check = state;
1715 tp += ( maxid - minid + 1 );
1717 Safefree(trie->states[ state ].trans.list);
1720 DEBUG_TRIE_COMPILE_MORE_r(
1721 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1724 trie->states[ state ].trans.base=base;
1726 trie->lasttrans = tp + 1;
1730 Second Pass -- Flat Table Representation.
1732 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1733 We know that we will need Charcount+1 trans at most to store the data
1734 (one row per char at worst case) So we preallocate both structures
1735 assuming worst case.
1737 We then construct the trie using only the .next slots of the entry
1740 We use the .check field of the first entry of the node temporarily to
1741 make compression both faster and easier by keeping track of how many non
1742 zero fields are in the node.
1744 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1747 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1748 number representing the first entry of the node, and state as a
1749 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1750 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1751 are 2 entrys per node. eg:
1759 The table is internally in the right hand, idx form. However as we also
1760 have to deal with the states array which is indexed by nodenum we have to
1761 use TRIE_NODENUM() to convert.
1764 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1765 "%*sCompiling trie using table compiler\n",
1766 (int)depth * 2 + 2, ""));
1768 trie->trans = (reg_trie_trans *)
1769 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1770 * trie->uniquecharcount + 1,
1771 sizeof(reg_trie_trans) );
1772 trie->states = (reg_trie_state *)
1773 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1774 sizeof(reg_trie_state) );
1775 next_alloc = trie->uniquecharcount + 1;
1778 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1780 regnode * const noper = NEXTOPER( cur );
1781 const U8 *uc = (U8*)STRING( noper );
1782 const U8 * const e = uc + STR_LEN( noper );
1784 U32 state = 1; /* required init */
1786 U16 charid = 0; /* sanity init */
1787 U32 accept_state = 0; /* sanity init */
1788 U8 *scan = (U8*)NULL; /* sanity init */
1790 STRLEN foldlen = 0; /* required init */
1791 U32 wordlen = 0; /* required init */
1792 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1794 if ( OP(noper) != NOTHING ) {
1795 for ( ; uc < e ; uc += len ) {
1800 charid = trie->charmap[ uvc ];
1802 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1803 charid = svpp ? (U16)SvIV(*svpp) : 0;
1807 if ( !trie->trans[ state + charid ].next ) {
1808 trie->trans[ state + charid ].next = next_alloc;
1809 trie->trans[ state ].check++;
1810 prev_states[TRIE_NODENUM(next_alloc)]
1811 = TRIE_NODENUM(state);
1812 next_alloc += trie->uniquecharcount;
1814 state = trie->trans[ state + charid ].next;
1816 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1818 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1821 accept_state = TRIE_NODENUM( state );
1822 TRIE_HANDLE_WORD(accept_state);
1824 } /* end second pass */
1826 /* and now dump it out before we compress it */
1827 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1829 next_alloc, depth+1));
1833 * Inplace compress the table.*
1835 For sparse data sets the table constructed by the trie algorithm will
1836 be mostly 0/FAIL transitions or to put it another way mostly empty.
1837 (Note that leaf nodes will not contain any transitions.)
1839 This algorithm compresses the tables by eliminating most such
1840 transitions, at the cost of a modest bit of extra work during lookup:
1842 - Each states[] entry contains a .base field which indicates the
1843 index in the state[] array wheres its transition data is stored.
1845 - If .base is 0 there are no valid transitions from that node.
1847 - If .base is nonzero then charid is added to it to find an entry in
1850 -If trans[states[state].base+charid].check!=state then the
1851 transition is taken to be a 0/Fail transition. Thus if there are fail
1852 transitions at the front of the node then the .base offset will point
1853 somewhere inside the previous nodes data (or maybe even into a node
1854 even earlier), but the .check field determines if the transition is
1858 The following process inplace converts the table to the compressed
1859 table: We first do not compress the root node 1,and mark all its
1860 .check pointers as 1 and set its .base pointer as 1 as well. This
1861 allows us to do a DFA construction from the compressed table later,
1862 and ensures that any .base pointers we calculate later are greater
1865 - We set 'pos' to indicate the first entry of the second node.
1867 - We then iterate over the columns of the node, finding the first and
1868 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1869 and set the .check pointers accordingly, and advance pos
1870 appropriately and repreat for the next node. Note that when we copy
1871 the next pointers we have to convert them from the original
1872 NODEIDX form to NODENUM form as the former is not valid post
1875 - If a node has no transitions used we mark its base as 0 and do not
1876 advance the pos pointer.
1878 - If a node only has one transition we use a second pointer into the
1879 structure to fill in allocated fail transitions from other states.
1880 This pointer is independent of the main pointer and scans forward
1881 looking for null transitions that are allocated to a state. When it
1882 finds one it writes the single transition into the "hole". If the
1883 pointer doesnt find one the single transition is appended as normal.
1885 - Once compressed we can Renew/realloc the structures to release the
1888 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1889 specifically Fig 3.47 and the associated pseudocode.
1893 const U32 laststate = TRIE_NODENUM( next_alloc );
1896 trie->statecount = laststate;
1898 for ( state = 1 ; state < laststate ; state++ ) {
1900 const U32 stateidx = TRIE_NODEIDX( state );
1901 const U32 o_used = trie->trans[ stateidx ].check;
1902 U32 used = trie->trans[ stateidx ].check;
1903 trie->trans[ stateidx ].check = 0;
1905 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1906 if ( flag || trie->trans[ stateidx + charid ].next ) {
1907 if ( trie->trans[ stateidx + charid ].next ) {
1909 for ( ; zp < pos ; zp++ ) {
1910 if ( ! trie->trans[ zp ].next ) {
1914 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1915 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1916 trie->trans[ zp ].check = state;
1917 if ( ++zp > pos ) pos = zp;
1924 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1926 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1927 trie->trans[ pos ].check = state;
1932 trie->lasttrans = pos + 1;
1933 trie->states = (reg_trie_state *)
1934 PerlMemShared_realloc( trie->states, laststate
1935 * sizeof(reg_trie_state) );
1936 DEBUG_TRIE_COMPILE_MORE_r(
1937 PerlIO_printf( Perl_debug_log,
1938 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1939 (int)depth * 2 + 2,"",
1940 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1943 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1946 } /* end table compress */
1948 DEBUG_TRIE_COMPILE_MORE_r(
1949 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1950 (int)depth * 2 + 2, "",
1951 (UV)trie->statecount,
1952 (UV)trie->lasttrans)
1954 /* resize the trans array to remove unused space */
1955 trie->trans = (reg_trie_trans *)
1956 PerlMemShared_realloc( trie->trans, trie->lasttrans
1957 * sizeof(reg_trie_trans) );
1959 { /* Modify the program and insert the new TRIE node */
1960 U8 nodetype =(U8)(flags & 0xFF);
1964 regnode *optimize = NULL;
1965 #ifdef RE_TRACK_PATTERN_OFFSETS
1968 U32 mjd_nodelen = 0;
1969 #endif /* RE_TRACK_PATTERN_OFFSETS */
1970 #endif /* DEBUGGING */
1972 This means we convert either the first branch or the first Exact,
1973 depending on whether the thing following (in 'last') is a branch
1974 or not and whther first is the startbranch (ie is it a sub part of
1975 the alternation or is it the whole thing.)
1976 Assuming its a sub part we convert the EXACT otherwise we convert
1977 the whole branch sequence, including the first.
1979 /* Find the node we are going to overwrite */
1980 if ( first != startbranch || OP( last ) == BRANCH ) {
1981 /* branch sub-chain */
1982 NEXT_OFF( first ) = (U16)(last - first);
1983 #ifdef RE_TRACK_PATTERN_OFFSETS
1985 mjd_offset= Node_Offset((convert));
1986 mjd_nodelen= Node_Length((convert));
1989 /* whole branch chain */
1991 #ifdef RE_TRACK_PATTERN_OFFSETS
1994 const regnode *nop = NEXTOPER( convert );
1995 mjd_offset= Node_Offset((nop));
1996 mjd_nodelen= Node_Length((nop));
2000 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2001 (int)depth * 2 + 2, "",
2002 (UV)mjd_offset, (UV)mjd_nodelen)
2005 /* But first we check to see if there is a common prefix we can
2006 split out as an EXACT and put in front of the TRIE node. */
2007 trie->startstate= 1;
2008 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2010 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2014 const U32 base = trie->states[ state ].trans.base;
2016 if ( trie->states[state].wordnum )
2019 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2020 if ( ( base + ofs >= trie->uniquecharcount ) &&
2021 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2022 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2024 if ( ++count > 1 ) {
2025 SV **tmp = av_fetch( revcharmap, ofs, 0);
2026 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2027 if ( state == 1 ) break;
2029 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2031 PerlIO_printf(Perl_debug_log,
2032 "%*sNew Start State=%"UVuf" Class: [",
2033 (int)depth * 2 + 2, "",
2036 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2037 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2039 TRIE_BITMAP_SET(trie,*ch);
2041 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2043 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2047 TRIE_BITMAP_SET(trie,*ch);
2049 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2050 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2056 SV **tmp = av_fetch( revcharmap, idx, 0);
2058 char *ch = SvPV( *tmp, len );
2060 SV *sv=sv_newmortal();
2061 PerlIO_printf( Perl_debug_log,
2062 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2063 (int)depth * 2 + 2, "",
2065 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2066 PL_colors[0], PL_colors[1],
2067 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2068 PERL_PV_ESCAPE_FIRSTCHAR
2073 OP( convert ) = nodetype;
2074 str=STRING(convert);
2077 STR_LEN(convert) += len;
2083 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2088 trie->prefixlen = (state-1);
2090 regnode *n = convert+NODE_SZ_STR(convert);
2091 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2092 trie->startstate = state;
2093 trie->minlen -= (state - 1);
2094 trie->maxlen -= (state - 1);
2096 /* At least the UNICOS C compiler choked on this
2097 * being argument to DEBUG_r(), so let's just have
2100 #ifdef PERL_EXT_RE_BUILD
2106 regnode *fix = convert;
2107 U32 word = trie->wordcount;
2109 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2110 while( ++fix < n ) {
2111 Set_Node_Offset_Length(fix, 0, 0);
2114 SV ** const tmp = av_fetch( trie_words, word, 0 );
2116 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2117 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2119 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2127 NEXT_OFF(convert) = (U16)(tail - convert);
2128 DEBUG_r(optimize= n);
2134 if ( trie->maxlen ) {
2135 NEXT_OFF( convert ) = (U16)(tail - convert);
2136 ARG_SET( convert, data_slot );
2137 /* Store the offset to the first unabsorbed branch in
2138 jump[0], which is otherwise unused by the jump logic.
2139 We use this when dumping a trie and during optimisation. */
2141 trie->jump[0] = (U16)(nextbranch - convert);
2143 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2144 * and there is a bitmap
2145 * and the first "jump target" node we found leaves enough room
2146 * then convert the TRIE node into a TRIEC node, with the bitmap
2147 * embedded inline in the opcode - this is hypothetically faster.
2149 if ( !trie->states[trie->startstate].wordnum
2151 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2153 OP( convert ) = TRIEC;
2154 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2155 PerlMemShared_free(trie->bitmap);
2158 OP( convert ) = TRIE;
2160 /* store the type in the flags */
2161 convert->flags = nodetype;
2165 + regarglen[ OP( convert ) ];
2167 /* XXX We really should free up the resource in trie now,
2168 as we won't use them - (which resources?) dmq */
2170 /* needed for dumping*/
2171 DEBUG_r(if (optimize) {
2172 regnode *opt = convert;
2174 while ( ++opt < optimize) {
2175 Set_Node_Offset_Length(opt,0,0);
2178 Try to clean up some of the debris left after the
2181 while( optimize < jumper ) {
2182 mjd_nodelen += Node_Length((optimize));
2183 OP( optimize ) = OPTIMIZED;
2184 Set_Node_Offset_Length(optimize,0,0);
2187 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2189 } /* end node insert */
2191 /* Finish populating the prev field of the wordinfo array. Walk back
2192 * from each accept state until we find another accept state, and if
2193 * so, point the first word's .prev field at the second word. If the
2194 * second already has a .prev field set, stop now. This will be the
2195 * case either if we've already processed that word's accept state,
2196 * or that state had multiple words, and the overspill words were
2197 * already linked up earlier.
2204 for (word=1; word <= trie->wordcount; word++) {
2206 if (trie->wordinfo[word].prev)
2208 state = trie->wordinfo[word].accept;
2210 state = prev_states[state];
2213 prev = trie->states[state].wordnum;
2217 trie->wordinfo[word].prev = prev;
2219 Safefree(prev_states);
2223 /* and now dump out the compressed format */
2224 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2226 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2228 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2229 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2231 SvREFCNT_dec(revcharmap);
2235 : trie->startstate>1
2241 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2243 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2245 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2246 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2249 We find the fail state for each state in the trie, this state is the longest proper
2250 suffix of the current state's 'word' that is also a proper prefix of another word in our
2251 trie. State 1 represents the word '' and is thus the default fail state. This allows
2252 the DFA not to have to restart after its tried and failed a word at a given point, it
2253 simply continues as though it had been matching the other word in the first place.
2255 'abcdgu'=~/abcdefg|cdgu/
2256 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2257 fail, which would bring us to the state representing 'd' in the second word where we would
2258 try 'g' and succeed, proceeding to match 'cdgu'.
2260 /* add a fail transition */
2261 const U32 trie_offset = ARG(source);
2262 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2264 const U32 ucharcount = trie->uniquecharcount;
2265 const U32 numstates = trie->statecount;
2266 const U32 ubound = trie->lasttrans + ucharcount;
2270 U32 base = trie->states[ 1 ].trans.base;
2273 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2274 GET_RE_DEBUG_FLAGS_DECL;
2276 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2278 PERL_UNUSED_ARG(depth);
2282 ARG_SET( stclass, data_slot );
2283 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2284 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2285 aho->trie=trie_offset;
2286 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2287 Copy( trie->states, aho->states, numstates, reg_trie_state );
2288 Newxz( q, numstates, U32);
2289 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2292 /* initialize fail[0..1] to be 1 so that we always have
2293 a valid final fail state */
2294 fail[ 0 ] = fail[ 1 ] = 1;
2296 for ( charid = 0; charid < ucharcount ; charid++ ) {
2297 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2299 q[ q_write ] = newstate;
2300 /* set to point at the root */
2301 fail[ q[ q_write++ ] ]=1;
2304 while ( q_read < q_write) {
2305 const U32 cur = q[ q_read++ % numstates ];
2306 base = trie->states[ cur ].trans.base;
2308 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2309 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2311 U32 fail_state = cur;
2314 fail_state = fail[ fail_state ];
2315 fail_base = aho->states[ fail_state ].trans.base;
2316 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2318 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2319 fail[ ch_state ] = fail_state;
2320 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2322 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2324 q[ q_write++ % numstates] = ch_state;
2328 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2329 when we fail in state 1, this allows us to use the
2330 charclass scan to find a valid start char. This is based on the principle
2331 that theres a good chance the string being searched contains lots of stuff
2332 that cant be a start char.
2334 fail[ 0 ] = fail[ 1 ] = 0;
2335 DEBUG_TRIE_COMPILE_r({
2336 PerlIO_printf(Perl_debug_log,
2337 "%*sStclass Failtable (%"UVuf" states): 0",
2338 (int)(depth * 2), "", (UV)numstates
2340 for( q_read=1; q_read<numstates; q_read++ ) {
2341 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2343 PerlIO_printf(Perl_debug_log, "\n");
2346 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2351 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2352 * These need to be revisited when a newer toolchain becomes available.
2354 #if defined(__sparc64__) && defined(__GNUC__)
2355 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2356 # undef SPARC64_GCC_WORKAROUND
2357 # define SPARC64_GCC_WORKAROUND 1
2361 #define DEBUG_PEEP(str,scan,depth) \
2362 DEBUG_OPTIMISE_r({if (scan){ \
2363 SV * const mysv=sv_newmortal(); \
2364 regnode *Next = regnext(scan); \
2365 regprop(RExC_rx, mysv, scan); \
2366 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2367 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2368 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2375 #define JOIN_EXACT(scan,min,flags) \
2376 if (PL_regkind[OP(scan)] == EXACT) \
2377 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2380 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2381 /* Merge several consecutive EXACTish nodes into one. */
2382 regnode *n = regnext(scan);
2384 regnode *next = scan + NODE_SZ_STR(scan);
2388 regnode *stop = scan;
2389 GET_RE_DEBUG_FLAGS_DECL;
2391 PERL_UNUSED_ARG(depth);
2394 PERL_ARGS_ASSERT_JOIN_EXACT;
2395 #ifndef EXPERIMENTAL_INPLACESCAN
2396 PERL_UNUSED_ARG(flags);
2397 PERL_UNUSED_ARG(val);
2399 DEBUG_PEEP("join",scan,depth);
2401 /* Skip NOTHING, merge EXACT*. */
2403 ( PL_regkind[OP(n)] == NOTHING ||
2404 (stringok && (OP(n) == OP(scan))))
2406 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2408 if (OP(n) == TAIL || n > next)
2410 if (PL_regkind[OP(n)] == NOTHING) {
2411 DEBUG_PEEP("skip:",n,depth);
2412 NEXT_OFF(scan) += NEXT_OFF(n);
2413 next = n + NODE_STEP_REGNODE;
2420 else if (stringok) {
2421 const unsigned int oldl = STR_LEN(scan);
2422 regnode * const nnext = regnext(n);
2424 DEBUG_PEEP("merg",n,depth);
2427 if (oldl + STR_LEN(n) > U8_MAX)
2429 NEXT_OFF(scan) += NEXT_OFF(n);
2430 STR_LEN(scan) += STR_LEN(n);
2431 next = n + NODE_SZ_STR(n);
2432 /* Now we can overwrite *n : */
2433 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2441 #ifdef EXPERIMENTAL_INPLACESCAN
2442 if (flags && !NEXT_OFF(n)) {
2443 DEBUG_PEEP("atch", val, depth);
2444 if (reg_off_by_arg[OP(n)]) {
2445 ARG_SET(n, val - n);
2448 NEXT_OFF(n) = val - n;
2456 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2457 && ( STR_LEN(scan) >= 6 ) )
2460 Two problematic code points in Unicode casefolding of EXACT nodes:
2462 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2463 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2469 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2470 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2472 This means that in case-insensitive matching (or "loose matching",
2473 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2474 length of the above casefolded versions) can match a target string
2475 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2476 This would rather mess up the minimum length computation.
2478 What we'll do is to look for the tail four bytes, and then peek
2479 at the preceding two bytes to see whether we need to decrease
2480 the minimum length by four (six minus two).
2482 Thanks to the design of UTF-8, there cannot be false matches:
2483 A sequence of valid UTF-8 bytes cannot be a subsequence of
2484 another valid sequence of UTF-8 bytes.
2487 char * const s0 = STRING(scan), *s, *t;
2488 char * const s1 = s0 + STR_LEN(scan) - 1;
2489 char * const s2 = s1 - 4;
2490 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2491 const char t0[] = "\xaf\x49\xaf\x42";
2493 const char t0[] = "\xcc\x88\xcc\x81";
2495 const char * const t1 = t0 + 3;
2498 s < s2 && (t = ninstr(s, s1, t0, t1));
2501 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2502 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2504 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2505 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2513 n = scan + NODE_SZ_STR(scan);
2515 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2522 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2526 /* REx optimizer. Converts nodes into quickier variants "in place".
2527 Finds fixed substrings. */
2529 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2530 to the position after last scanned or to NULL. */
2532 #define INIT_AND_WITHP \
2533 assert(!and_withp); \
2534 Newx(and_withp,1,struct regnode_charclass_class); \
2535 SAVEFREEPV(and_withp)
2537 /* this is a chain of data about sub patterns we are processing that
2538 need to be handled seperately/specially in study_chunk. Its so
2539 we can simulate recursion without losing state. */
2541 typedef struct scan_frame {
2542 regnode *last; /* last node to process in this frame */
2543 regnode *next; /* next node to process when last is reached */
2544 struct scan_frame *prev; /*previous frame*/
2545 I32 stop; /* what stopparen do we use */
2549 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2551 #define CASE_SYNST_FNC(nAmE) \
2553 if (flags & SCF_DO_STCLASS_AND) { \
2554 for (value = 0; value < 256; value++) \
2555 if (!is_ ## nAmE ## _cp(value)) \
2556 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2559 for (value = 0; value < 256; value++) \
2560 if (is_ ## nAmE ## _cp(value)) \
2561 ANYOF_BITMAP_SET(data->start_class, value); \
2565 if (flags & SCF_DO_STCLASS_AND) { \
2566 for (value = 0; value < 256; value++) \
2567 if (is_ ## nAmE ## _cp(value)) \
2568 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2571 for (value = 0; value < 256; value++) \
2572 if (!is_ ## nAmE ## _cp(value)) \
2573 ANYOF_BITMAP_SET(data->start_class, value); \
2580 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2581 I32 *minlenp, I32 *deltap,
2586 struct regnode_charclass_class *and_withp,
2587 U32 flags, U32 depth)
2588 /* scanp: Start here (read-write). */
2589 /* deltap: Write maxlen-minlen here. */
2590 /* last: Stop before this one. */
2591 /* data: string data about the pattern */
2592 /* stopparen: treat close N as END */
2593 /* recursed: which subroutines have we recursed into */
2594 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2597 I32 min = 0, pars = 0, code;
2598 regnode *scan = *scanp, *next;
2600 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2601 int is_inf_internal = 0; /* The studied chunk is infinite */
2602 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2603 scan_data_t data_fake;
2604 SV *re_trie_maxbuff = NULL;
2605 regnode *first_non_open = scan;
2606 I32 stopmin = I32_MAX;
2607 scan_frame *frame = NULL;
2608 GET_RE_DEBUG_FLAGS_DECL;
2610 PERL_ARGS_ASSERT_STUDY_CHUNK;
2613 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2617 while (first_non_open && OP(first_non_open) == OPEN)
2618 first_non_open=regnext(first_non_open);
2623 while ( scan && OP(scan) != END && scan < last ){
2624 /* Peephole optimizer: */
2625 DEBUG_STUDYDATA("Peep:", data,depth);
2626 DEBUG_PEEP("Peep",scan,depth);
2627 JOIN_EXACT(scan,&min,0);
2629 /* Follow the next-chain of the current node and optimize
2630 away all the NOTHINGs from it. */
2631 if (OP(scan) != CURLYX) {
2632 const int max = (reg_off_by_arg[OP(scan)]
2634 /* I32 may be smaller than U16 on CRAYs! */
2635 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2636 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2640 /* Skip NOTHING and LONGJMP. */
2641 while ((n = regnext(n))
2642 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2643 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2644 && off + noff < max)
2646 if (reg_off_by_arg[OP(scan)])
2649 NEXT_OFF(scan) = off;
2654 /* The principal pseudo-switch. Cannot be a switch, since we
2655 look into several different things. */
2656 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2657 || OP(scan) == IFTHEN) {
2658 next = regnext(scan);
2660 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2662 if (OP(next) == code || code == IFTHEN) {
2663 /* NOTE - There is similar code to this block below for handling
2664 TRIE nodes on a re-study. If you change stuff here check there
2666 I32 max1 = 0, min1 = I32_MAX, num = 0;
2667 struct regnode_charclass_class accum;
2668 regnode * const startbranch=scan;
2670 if (flags & SCF_DO_SUBSTR)
2671 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2672 if (flags & SCF_DO_STCLASS)
2673 cl_init_zero(pRExC_state, &accum);
2675 while (OP(scan) == code) {
2676 I32 deltanext, minnext, f = 0, fake;
2677 struct regnode_charclass_class this_class;
2680 data_fake.flags = 0;
2682 data_fake.whilem_c = data->whilem_c;
2683 data_fake.last_closep = data->last_closep;
2686 data_fake.last_closep = &fake;
2688 data_fake.pos_delta = delta;
2689 next = regnext(scan);
2690 scan = NEXTOPER(scan);
2692 scan = NEXTOPER(scan);
2693 if (flags & SCF_DO_STCLASS) {
2694 cl_init(pRExC_state, &this_class);
2695 data_fake.start_class = &this_class;
2696 f = SCF_DO_STCLASS_AND;
2698 if (flags & SCF_WHILEM_VISITED_POS)
2699 f |= SCF_WHILEM_VISITED_POS;
2701 /* we suppose the run is continuous, last=next...*/
2702 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2704 stopparen, recursed, NULL, f,depth+1);
2707 if (max1 < minnext + deltanext)
2708 max1 = minnext + deltanext;
2709 if (deltanext == I32_MAX)
2710 is_inf = is_inf_internal = 1;
2712 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2714 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2715 if ( stopmin > minnext)
2716 stopmin = min + min1;
2717 flags &= ~SCF_DO_SUBSTR;
2719 data->flags |= SCF_SEEN_ACCEPT;
2722 if (data_fake.flags & SF_HAS_EVAL)
2723 data->flags |= SF_HAS_EVAL;
2724 data->whilem_c = data_fake.whilem_c;
2726 if (flags & SCF_DO_STCLASS)
2727 cl_or(pRExC_state, &accum, &this_class);
2729 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2731 if (flags & SCF_DO_SUBSTR) {
2732 data->pos_min += min1;
2733 data->pos_delta += max1 - min1;
2734 if (max1 != min1 || is_inf)
2735 data->longest = &(data->longest_float);
2738 delta += max1 - min1;
2739 if (flags & SCF_DO_STCLASS_OR) {
2740 cl_or(pRExC_state, data->start_class, &accum);
2742 cl_and(data->start_class, and_withp);
2743 flags &= ~SCF_DO_STCLASS;
2746 else if (flags & SCF_DO_STCLASS_AND) {
2748 cl_and(data->start_class, &accum);
2749 flags &= ~SCF_DO_STCLASS;
2752 /* Switch to OR mode: cache the old value of
2753 * data->start_class */
2755 StructCopy(data->start_class, and_withp,
2756 struct regnode_charclass_class);
2757 flags &= ~SCF_DO_STCLASS_AND;
2758 StructCopy(&accum, data->start_class,
2759 struct regnode_charclass_class);
2760 flags |= SCF_DO_STCLASS_OR;
2761 data->start_class->flags |= ANYOF_EOS;
2765 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2768 Assuming this was/is a branch we are dealing with: 'scan' now
2769 points at the item that follows the branch sequence, whatever
2770 it is. We now start at the beginning of the sequence and look
2777 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2779 If we can find such a subseqence we need to turn the first
2780 element into a trie and then add the subsequent branch exact
2781 strings to the trie.
2785 1. patterns where the whole set of branches can be converted.
2787 2. patterns where only a subset can be converted.
2789 In case 1 we can replace the whole set with a single regop
2790 for the trie. In case 2 we need to keep the start and end
2793 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2794 becomes BRANCH TRIE; BRANCH X;
2796 There is an additional case, that being where there is a
2797 common prefix, which gets split out into an EXACT like node
2798 preceding the TRIE node.
2800 If x(1..n)==tail then we can do a simple trie, if not we make
2801 a "jump" trie, such that when we match the appropriate word
2802 we "jump" to the appopriate tail node. Essentailly we turn
2803 a nested if into a case structure of sorts.
2808 if (!re_trie_maxbuff) {
2809 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2810 if (!SvIOK(re_trie_maxbuff))
2811 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2813 if ( SvIV(re_trie_maxbuff)>=0 ) {
2815 regnode *first = (regnode *)NULL;
2816 regnode *last = (regnode *)NULL;
2817 regnode *tail = scan;
2822 SV * const mysv = sv_newmortal(); /* for dumping */
2824 /* var tail is used because there may be a TAIL
2825 regop in the way. Ie, the exacts will point to the
2826 thing following the TAIL, but the last branch will
2827 point at the TAIL. So we advance tail. If we
2828 have nested (?:) we may have to move through several
2832 while ( OP( tail ) == TAIL ) {
2833 /* this is the TAIL generated by (?:) */
2834 tail = regnext( tail );
2839 regprop(RExC_rx, mysv, tail );
2840 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2841 (int)depth * 2 + 2, "",
2842 "Looking for TRIE'able sequences. Tail node is: ",
2843 SvPV_nolen_const( mysv )
2849 step through the branches, cur represents each
2850 branch, noper is the first thing to be matched
2851 as part of that branch and noper_next is the
2852 regnext() of that node. if noper is an EXACT
2853 and noper_next is the same as scan (our current
2854 position in the regex) then the EXACT branch is
2855 a possible optimization target. Once we have
2856 two or more consequetive such branches we can
2857 create a trie of the EXACT's contents and stich
2858 it in place. If the sequence represents all of
2859 the branches we eliminate the whole thing and
2860 replace it with a single TRIE. If it is a
2861 subsequence then we need to stitch it in. This
2862 means the first branch has to remain, and needs
2863 to be repointed at the item on the branch chain
2864 following the last branch optimized. This could
2865 be either a BRANCH, in which case the
2866 subsequence is internal, or it could be the
2867 item following the branch sequence in which
2868 case the subsequence is at the end.
2872 /* dont use tail as the end marker for this traverse */
2873 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2874 regnode * const noper = NEXTOPER( cur );
2875 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2876 regnode * const noper_next = regnext( noper );
2880 regprop(RExC_rx, mysv, cur);
2881 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2882 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2884 regprop(RExC_rx, mysv, noper);
2885 PerlIO_printf( Perl_debug_log, " -> %s",
2886 SvPV_nolen_const(mysv));
2889 regprop(RExC_rx, mysv, noper_next );
2890 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2891 SvPV_nolen_const(mysv));
2893 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2894 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2896 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2897 : PL_regkind[ OP( noper ) ] == EXACT )
2898 || OP(noper) == NOTHING )
2900 && noper_next == tail
2905 if ( !first || optype == NOTHING ) {
2906 if (!first) first = cur;
2907 optype = OP( noper );
2913 Currently we do not believe that the trie logic can
2914 handle case insensitive matching properly when the
2915 pattern is not unicode (thus forcing unicode semantics).
2917 If/when this is fixed the following define can be swapped
2918 in below to fully enable trie logic.
2920 #define TRIE_TYPE_IS_SAFE 1
2923 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2925 if ( last && TRIE_TYPE_IS_SAFE ) {
2926 make_trie( pRExC_state,
2927 startbranch, first, cur, tail, count,
2930 if ( PL_regkind[ OP( noper ) ] == EXACT
2932 && noper_next == tail
2937 optype = OP( noper );
2947 regprop(RExC_rx, mysv, cur);
2948 PerlIO_printf( Perl_debug_log,
2949 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2950 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2954 if ( last && TRIE_TYPE_IS_SAFE ) {
2955 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2956 #ifdef TRIE_STUDY_OPT
2957 if ( ((made == MADE_EXACT_TRIE &&
2958 startbranch == first)
2959 || ( first_non_open == first )) &&
2961 flags |= SCF_TRIE_RESTUDY;
2962 if ( startbranch == first
2965 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2975 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2976 scan = NEXTOPER(NEXTOPER(scan));
2977 } else /* single branch is optimized. */
2978 scan = NEXTOPER(scan);
2980 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2981 scan_frame *newframe = NULL;
2986 if (OP(scan) != SUSPEND) {
2987 /* set the pointer */
2988 if (OP(scan) == GOSUB) {
2990 RExC_recurse[ARG2L(scan)] = scan;
2991 start = RExC_open_parens[paren-1];
2992 end = RExC_close_parens[paren-1];
2995 start = RExC_rxi->program + 1;
2999 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3000 SAVEFREEPV(recursed);
3002 if (!PAREN_TEST(recursed,paren+1)) {
3003 PAREN_SET(recursed,paren+1);
3004 Newx(newframe,1,scan_frame);
3006 if (flags & SCF_DO_SUBSTR) {
3007 SCAN_COMMIT(pRExC_state,data,minlenp);
3008 data->longest = &(data->longest_float);
3010 is_inf = is_inf_internal = 1;
3011 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3012 cl_anything(pRExC_state, data->start_class);
3013 flags &= ~SCF_DO_STCLASS;
3016 Newx(newframe,1,scan_frame);
3019 end = regnext(scan);
3024 SAVEFREEPV(newframe);
3025 newframe->next = regnext(scan);
3026 newframe->last = last;
3027 newframe->stop = stopparen;
3028 newframe->prev = frame;
3038 else if (OP(scan) == EXACT) {
3039 I32 l = STR_LEN(scan);
3042 const U8 * const s = (U8*)STRING(scan);
3043 l = utf8_length(s, s + l);
3044 uc = utf8_to_uvchr(s, NULL);
3046 uc = *((U8*)STRING(scan));
3049 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3050 /* The code below prefers earlier match for fixed
3051 offset, later match for variable offset. */
3052 if (data->last_end == -1) { /* Update the start info. */
3053 data->last_start_min = data->pos_min;
3054 data->last_start_max = is_inf
3055 ? I32_MAX : data->pos_min + data->pos_delta;
3057 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3059 SvUTF8_on(data->last_found);
3061 SV * const sv = data->last_found;
3062 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3063 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3064 if (mg && mg->mg_len >= 0)
3065 mg->mg_len += utf8_length((U8*)STRING(scan),
3066 (U8*)STRING(scan)+STR_LEN(scan));
3068 data->last_end = data->pos_min + l;
3069 data->pos_min += l; /* As in the first entry. */
3070 data->flags &= ~SF_BEFORE_EOL;
3072 if (flags & SCF_DO_STCLASS_AND) {
3073 /* Check whether it is compatible with what we know already! */
3077 /* If compatibile, we or it in below. It is compatible if is
3078 * in the bitmp and either 1) its bit or its fold is set, or 2)
3079 * it's for a locale. Even if there isn't unicode semantics
3080 * here, at runtime there may be because of matching against a
3081 * utf8 string, so accept a possible false positive for
3082 * latin1-range folds */
3084 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3085 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3086 && (!(data->start_class->flags & ANYOF_FOLD)
3087 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3090 ANYOF_CLASS_ZERO(data->start_class);
3091 ANYOF_BITMAP_ZERO(data->start_class);
3093 ANYOF_BITMAP_SET(data->start_class, uc);
3094 data->start_class->flags &= ~ANYOF_EOS;
3096 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3098 else if (flags & SCF_DO_STCLASS_OR) {
3099 /* false positive possible if the class is case-folded */
3101 ANYOF_BITMAP_SET(data->start_class, uc);
3103 data->start_class->flags |= ANYOF_UNICODE_ALL;
3104 data->start_class->flags &= ~ANYOF_EOS;
3105 cl_and(data->start_class, and_withp);
3107 flags &= ~SCF_DO_STCLASS;
3109 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3110 I32 l = STR_LEN(scan);
3111 UV uc = *((U8*)STRING(scan));
3113 /* Search for fixed substrings supports EXACT only. */
3114 if (flags & SCF_DO_SUBSTR) {
3116 SCAN_COMMIT(pRExC_state, data, minlenp);
3119 const U8 * const s = (U8 *)STRING(scan);
3120 l = utf8_length(s, s + l);
3121 uc = utf8_to_uvchr(s, NULL);
3124 if (flags & SCF_DO_SUBSTR)
3126 if (flags & SCF_DO_STCLASS_AND) {
3127 /* Check whether it is compatible with what we know already! */
3130 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3131 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3132 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3136 ANYOF_CLASS_ZERO(data->start_class);
3137 ANYOF_BITMAP_ZERO(data->start_class);
3139 ANYOF_BITMAP_SET(data->start_class, uc);
3140 data->start_class->flags &= ~ANYOF_EOS;
3141 data->start_class->flags |= ANYOF_FOLD;
3142 if (OP(scan) == EXACTFL) {
3143 data->start_class->flags |= ANYOF_LOCALE;
3147 /* Also set the other member of the fold pair. In case
3148 * that unicode semantics is called for at runtime, use
3149 * the full latin1 fold. (Can't do this for locale,
3150 * because not known until runtime */
3151 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3155 else if (flags & SCF_DO_STCLASS_OR) {
3156 if (data->start_class->flags & ANYOF_FOLD) {
3157 /* false positive possible if the class is case-folded.
3158 Assume that the locale settings are the same... */
3160 ANYOF_BITMAP_SET(data->start_class, uc);
3161 if (OP(scan) != EXACTFL) {
3163 /* And set the other member of the fold pair, but
3164 * can't do that in locale because not known until
3166 ANYOF_BITMAP_SET(data->start_class,
3167 PL_fold_latin1[uc]);
3170 data->start_class->flags &= ~ANYOF_EOS;
3172 cl_and(data->start_class, and_withp);
3174 flags &= ~SCF_DO_STCLASS;
3176 else if (REGNODE_VARIES(OP(scan))) {
3177 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3178 I32 f = flags, pos_before = 0;
3179 regnode * const oscan = scan;
3180 struct regnode_charclass_class this_class;
3181 struct regnode_charclass_class *oclass = NULL;
3182 I32 next_is_eval = 0;
3184 switch (PL_regkind[OP(scan)]) {
3185 case WHILEM: /* End of (?:...)* . */
3186 scan = NEXTOPER(scan);
3189 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3190 next = NEXTOPER(scan);
3191 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3193 maxcount = REG_INFTY;
3194 next = regnext(scan);
3195 scan = NEXTOPER(scan);
3199 if (flags & SCF_DO_SUBSTR)
3204 if (flags & SCF_DO_STCLASS) {
3206 maxcount = REG_INFTY;
3207 next = regnext(scan);
3208 scan = NEXTOPER(scan);
3211 is_inf = is_inf_internal = 1;
3212 scan = regnext(scan);
3213 if (flags & SCF_DO_SUBSTR) {
3214 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3215 data->longest = &(data->longest_float);
3217 goto optimize_curly_tail;
3219 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3220 && (scan->flags == stopparen))
3225 mincount = ARG1(scan);
3226 maxcount = ARG2(scan);
3228 next = regnext(scan);
3229 if (OP(scan) == CURLYX) {
3230 I32 lp = (data ? *(data->last_closep) : 0);
3231 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3233 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3234 next_is_eval = (OP(scan) == EVAL);
3236 if (flags & SCF_DO_SUBSTR) {
3237 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3238 pos_before = data->pos_min;
3242 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3244 data->flags |= SF_IS_INF;
3246 if (flags & SCF_DO_STCLASS) {
3247 cl_init(pRExC_state, &this_class);
3248 oclass = data->start_class;
3249 data->start_class = &this_class;
3250 f |= SCF_DO_STCLASS_AND;
3251 f &= ~SCF_DO_STCLASS_OR;
3253 /* Exclude from super-linear cache processing any {n,m}
3254 regops for which the combination of input pos and regex
3255 pos is not enough information to determine if a match
3258 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3259 regex pos at the \s*, the prospects for a match depend not
3260 only on the input position but also on how many (bar\s*)
3261 repeats into the {4,8} we are. */
3262 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3263 f &= ~SCF_WHILEM_VISITED_POS;
3265 /* This will finish on WHILEM, setting scan, or on NULL: */
3266 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3267 last, data, stopparen, recursed, NULL,
3269 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3271 if (flags & SCF_DO_STCLASS)
3272 data->start_class = oclass;
3273 if (mincount == 0 || minnext == 0) {
3274 if (flags & SCF_DO_STCLASS_OR) {
3275 cl_or(pRExC_state, data->start_class, &this_class);
3277 else if (flags & SCF_DO_STCLASS_AND) {
3278 /* Switch to OR mode: cache the old value of
3279 * data->start_class */
3281 StructCopy(data->start_class, and_withp,
3282 struct regnode_charclass_class);
3283 flags &= ~SCF_DO_STCLASS_AND;
3284 StructCopy(&this_class, data->start_class,
3285 struct regnode_charclass_class);
3286 flags |= SCF_DO_STCLASS_OR;
3287 data->start_class->flags |= ANYOF_EOS;
3289 } else { /* Non-zero len */
3290 if (flags & SCF_DO_STCLASS_OR) {
3291 cl_or(pRExC_state, data->start_class, &this_class);
3292 cl_and(data->start_class, and_withp);
3294 else if (flags & SCF_DO_STCLASS_AND)
3295 cl_and(data->start_class, &this_class);
3296 flags &= ~SCF_DO_STCLASS;
3298 if (!scan) /* It was not CURLYX, but CURLY. */
3300 if ( /* ? quantifier ok, except for (?{ ... }) */
3301 (next_is_eval || !(mincount == 0 && maxcount == 1))
3302 && (minnext == 0) && (deltanext == 0)
3303 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3304 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3306 ckWARNreg(RExC_parse,
3307 "Quantifier unexpected on zero-length expression");
3310 min += minnext * mincount;
3311 is_inf_internal |= ((maxcount == REG_INFTY
3312 && (minnext + deltanext) > 0)
3313 || deltanext == I32_MAX);
3314 is_inf |= is_inf_internal;
3315 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3317 /* Try powerful optimization CURLYX => CURLYN. */
3318 if ( OP(oscan) == CURLYX && data
3319 && data->flags & SF_IN_PAR
3320 && !(data->flags & SF_HAS_EVAL)
3321 && !deltanext && minnext == 1 ) {
3322 /* Try to optimize to CURLYN. */
3323 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3324 regnode * const nxt1 = nxt;
3331 if (!REGNODE_SIMPLE(OP(nxt))
3332 && !(PL_regkind[OP(nxt)] == EXACT
3333 && STR_LEN(nxt) == 1))
3339 if (OP(nxt) != CLOSE)
3341 if (RExC_open_parens) {
3342 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3343 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3345 /* Now we know that nxt2 is the only contents: */
3346 oscan->flags = (U8)ARG(nxt);
3348 OP(nxt1) = NOTHING; /* was OPEN. */
3351 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3352 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3353 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3354 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3355 OP(nxt + 1) = OPTIMIZED; /* was count. */
3356 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3361 /* Try optimization CURLYX => CURLYM. */
3362 if ( OP(oscan) == CURLYX && data
3363 && !(data->flags & SF_HAS_PAR)
3364 && !(data->flags & SF_HAS_EVAL)
3365 && !deltanext /* atom is fixed width */
3366 && minnext != 0 /* CURLYM can't handle zero width */
3368 /* XXXX How to optimize if data == 0? */
3369 /* Optimize to a simpler form. */
3370 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3374 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3375 && (OP(nxt2) != WHILEM))
3377 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3378 /* Need to optimize away parenths. */
3379 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3380 /* Set the parenth number. */
3381 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3383 oscan->flags = (U8)ARG(nxt);
3384 if (RExC_open_parens) {
3385 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3386 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3388 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3389 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3392 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3393 OP(nxt + 1) = OPTIMIZED; /* was count. */
3394 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3395 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3398 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3399 regnode *nnxt = regnext(nxt1);
3401 if (reg_off_by_arg[OP(nxt1)])
3402 ARG_SET(nxt1, nxt2 - nxt1);
3403 else if (nxt2 - nxt1 < U16_MAX)
3404 NEXT_OFF(nxt1) = nxt2 - nxt1;
3406 OP(nxt) = NOTHING; /* Cannot beautify */
3411 /* Optimize again: */
3412 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3413 NULL, stopparen, recursed, NULL, 0,depth+1);
3418 else if ((OP(oscan) == CURLYX)
3419 && (flags & SCF_WHILEM_VISITED_POS)
3420 /* See the comment on a similar expression above.
3421 However, this time it's not a subexpression
3422 we care about, but the expression itself. */
3423 && (maxcount == REG_INFTY)
3424 && data && ++data->whilem_c < 16) {
3425 /* This stays as CURLYX, we can put the count/of pair. */
3426 /* Find WHILEM (as in regexec.c) */
3427 regnode *nxt = oscan + NEXT_OFF(oscan);
3429 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3431 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3432 | (RExC_whilem_seen << 4)); /* On WHILEM */
3434 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3436 if (flags & SCF_DO_SUBSTR) {
3437 SV *last_str = NULL;
3438 int counted = mincount != 0;
3440 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3441 #if defined(SPARC64_GCC_WORKAROUND)
3444 const char *s = NULL;
3447 if (pos_before >= data->last_start_min)
3450 b = data->last_start_min;
3453 s = SvPV_const(data->last_found, l);
3454 old = b - data->last_start_min;
3457 I32 b = pos_before >= data->last_start_min
3458 ? pos_before : data->last_start_min;
3460 const char * const s = SvPV_const(data->last_found, l);
3461 I32 old = b - data->last_start_min;
3465 old = utf8_hop((U8*)s, old) - (U8*)s;
3467 /* Get the added string: */
3468 last_str = newSVpvn_utf8(s + old, l, UTF);
3469 if (deltanext == 0 && pos_before == b) {
3470 /* What was added is a constant string */
3472 SvGROW(last_str, (mincount * l) + 1);
3473 repeatcpy(SvPVX(last_str) + l,
3474 SvPVX_const(last_str), l, mincount - 1);
3475 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3476 /* Add additional parts. */
3477 SvCUR_set(data->last_found,
3478 SvCUR(data->last_found) - l);
3479 sv_catsv(data->last_found, last_str);
3481 SV * sv = data->last_found;
3483 SvUTF8(sv) && SvMAGICAL(sv) ?
3484 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3485 if (mg && mg->mg_len >= 0)
3486 mg->mg_len += CHR_SVLEN(last_str) - l;
3488 data->last_end += l * (mincount - 1);
3491 /* start offset must point into the last copy */
3492 data->last_start_min += minnext * (mincount - 1);
3493 data->last_start_max += is_inf ? I32_MAX
3494 : (maxcount - 1) * (minnext + data->pos_delta);
3497 /* It is counted once already... */
3498 data->pos_min += minnext * (mincount - counted);
3499 data->pos_delta += - counted * deltanext +
3500 (minnext + deltanext) * maxcount - minnext * mincount;
3501 if (mincount != maxcount) {
3502 /* Cannot extend fixed substrings found inside
3504 SCAN_COMMIT(pRExC_state,data,minlenp);
3505 if (mincount && last_str) {
3506 SV * const sv = data->last_found;
3507 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3508 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3512 sv_setsv(sv, last_str);
3513 data->last_end = data->pos_min;
3514 data->last_start_min =
3515 data->pos_min - CHR_SVLEN(last_str);
3516 data->last_start_max = is_inf
3518 : data->pos_min + data->pos_delta
3519 - CHR_SVLEN(last_str);
3521 data->longest = &(data->longest_float);
3523 SvREFCNT_dec(last_str);
3525 if (data && (fl & SF_HAS_EVAL))
3526 data->flags |= SF_HAS_EVAL;
3527 optimize_curly_tail:
3528 if (OP(oscan) != CURLYX) {
3529 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3531 NEXT_OFF(oscan) += NEXT_OFF(next);
3534 default: /* REF and CLUMP only? */
3535 if (flags & SCF_DO_SUBSTR) {
3536 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3537 data->longest = &(data->longest_float);
3539 is_inf = is_inf_internal = 1;
3540 if (flags & SCF_DO_STCLASS_OR)
3541 cl_anything(pRExC_state, data->start_class);
3542 flags &= ~SCF_DO_STCLASS;
3546 else if (OP(scan) == LNBREAK) {
3547 if (flags & SCF_DO_STCLASS) {
3549 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3550 if (flags & SCF_DO_STCLASS_AND) {
3551 for (value = 0; value < 256; value++)
3552 if (!is_VERTWS_cp(value))
3553 ANYOF_BITMAP_CLEAR(data->start_class, value);
3556 for (value = 0; value < 256; value++)
3557 if (is_VERTWS_cp(value))
3558 ANYOF_BITMAP_SET(data->start_class, value);
3560 if (flags & SCF_DO_STCLASS_OR)
3561 cl_and(data->start_class, and_withp);
3562 flags &= ~SCF_DO_STCLASS;
3566 if (flags & SCF_DO_SUBSTR) {
3567 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3569 data->pos_delta += 1;
3570 data->longest = &(data->longest_float);
3573 else if (OP(scan) == FOLDCHAR) {
3574 int d = ARG(scan)==0xDF ? 1 : 2;
3575 flags &= ~SCF_DO_STCLASS;
3578 if (flags & SCF_DO_SUBSTR) {
3579 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3581 data->pos_delta += d;
3582 data->longest = &(data->longest_float);
3585 else if (REGNODE_SIMPLE(OP(scan))) {
3588 if (flags & SCF_DO_SUBSTR) {
3589 SCAN_COMMIT(pRExC_state,data,minlenp);
3593 if (flags & SCF_DO_STCLASS) {
3594 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3596 /* Some of the logic below assumes that switching
3597 locale on will only add false positives. */
3598 switch (PL_regkind[OP(scan)]) {
3602 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3603 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3604 cl_anything(pRExC_state, data->start_class);
3607 if (OP(scan) == SANY)
3609 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3610 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3611 || ((data->start_class->flags & ANYOF_CLASS)
3612 && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
3613 cl_anything(pRExC_state, data->start_class);
3615 if (flags & SCF_DO_STCLASS_AND || !value)
3616 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3619 if (flags & SCF_DO_STCLASS_AND)
3620 cl_and(data->start_class,
3621 (struct regnode_charclass_class*)scan);
3623 cl_or(pRExC_state, data->start_class,
3624 (struct regnode_charclass_class*)scan);
3627 if (flags & SCF_DO_STCLASS_AND) {
3628 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3630 if (FLAGS(scan) & USE_UNI) {
3631 for (value = 0; value < 256; value++) {
3632 if (!isWORDCHAR_L1(value)) {
3633 ANYOF_BITMAP_CLEAR(data->start_class, value);
3637 for (value = 0; value < 256; value++) {
3638 if (!isALNUM(value)) {
3639 ANYOF_BITMAP_CLEAR(data->start_class, value);
3646 if (data->start_class->flags & ANYOF_LOCALE)
3647 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3648 else if (FLAGS(scan) & USE_UNI) {
3649 for (value = 0; value < 256; value++) {
3650 if (isWORDCHAR_L1(value)) {
3651 ANYOF_BITMAP_SET(data->start_class, value);
3655 for (value = 0; value < 256; value++) {
3656 if (isALNUM(value)) {
3657 ANYOF_BITMAP_SET(data->start_class, value);
3664 if (flags & SCF_DO_STCLASS_AND) {
3665 if (data->start_class->flags & ANYOF_LOCALE)
3666 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3669 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3670 data->start_class->flags |= ANYOF_LOCALE;
3674 if (flags & SCF_DO_STCLASS_AND) {
3675 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3677 if (FLAGS(scan) & USE_UNI) {
3678 for (value = 0; value < 256; value++) {
3679 if (isWORDCHAR_L1(value)) {
3680 ANYOF_BITMAP_CLEAR(data->start_class, value);
3684 for (value = 0; value < 256; value++) {
3685 if (isALNUM(value)) {
3686 ANYOF_BITMAP_CLEAR(data->start_class, value);
3693 if (data->start_class->flags & ANYOF_LOCALE)
3694 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3696 for (value = 0; value < 256; value++)
3697 if (!isALNUM(value))
3698 ANYOF_BITMAP_SET(data->start_class, value);
3703 if (flags & SCF_DO_STCLASS_AND) {
3704 if (data->start_class->flags & ANYOF_LOCALE)
3705 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3708 data->start_class->flags |= ANYOF_LOCALE;
3709 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3713 if (flags & SCF_DO_STCLASS_AND) {
3714 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3715 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3716 if (FLAGS(scan) & USE_UNI) {
3717 for (value = 0; value < 256; value++) {
3718 if (!isSPACE_L1(value)) {
3719 ANYOF_BITMAP_CLEAR(data->start_class, value);
3723 for (value = 0; value < 256; value++) {
3724 if (!isSPACE(value)) {
3725 ANYOF_BITMAP_CLEAR(data->start_class, value);
3732 if (data->start_class->flags & ANYOF_LOCALE) {
3733 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3735 else if (FLAGS(scan) & USE_UNI) {
3736 for (value = 0; value < 256; value++) {
3737 if (isSPACE_L1(value)) {
3738 ANYOF_BITMAP_SET(data->start_class, value);
3742 for (value = 0; value < 256; value++) {
3743 if (isSPACE(value)) {
3744 ANYOF_BITMAP_SET(data->start_class, value);
3751 if (flags & SCF_DO_STCLASS_AND) {
3752 if (data->start_class->flags & ANYOF_LOCALE)
3753 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3756 data->start_class->flags |= ANYOF_LOCALE;
3757 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3761 if (flags & SCF_DO_STCLASS_AND) {
3762 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3763 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3764 if (FLAGS(scan) & USE_UNI) {
3765 for (value = 0; value < 256; value++) {
3766 if (isSPACE_L1(value)) {
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3771 for (value = 0; value < 256; value++) {
3772 if (isSPACE(value)) {
3773 ANYOF_BITMAP_CLEAR(data->start_class, value);
3780 if (data->start_class->flags & ANYOF_LOCALE)
3781 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3782 else if (FLAGS(scan) & USE_UNI) {
3783 for (value = 0; value < 256; value++) {
3784 if (!isSPACE_L1(value)) {
3785 ANYOF_BITMAP_SET(data->start_class, value);
3790 for (value = 0; value < 256; value++) {
3791 if (!isSPACE(value)) {
3792 ANYOF_BITMAP_SET(data->start_class, value);
3799 if (flags & SCF_DO_STCLASS_AND) {
3800 if (data->start_class->flags & ANYOF_LOCALE) {
3801 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3802 for (value = 0; value < 256; value++)
3803 if (!isSPACE(value))
3804 ANYOF_BITMAP_CLEAR(data->start_class, value);
3808 data->start_class->flags |= ANYOF_LOCALE;
3809 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3813 if (flags & SCF_DO_STCLASS_AND) {
3814 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3815 for (value = 0; value < 256; value++)
3816 if (!isDIGIT(value))
3817 ANYOF_BITMAP_CLEAR(data->start_class, value);
3820 if (data->start_class->flags & ANYOF_LOCALE)
3821 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3823 for (value = 0; value < 256; value++)
3825 ANYOF_BITMAP_SET(data->start_class, value);
3830 if (flags & SCF_DO_STCLASS_AND) {
3831 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3832 for (value = 0; value < 256; value++)
3834 ANYOF_BITMAP_CLEAR(data->start_class, value);
3837 if (data->start_class->flags & ANYOF_LOCALE)
3838 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3840 for (value = 0; value < 256; value++)
3841 if (!isDIGIT(value))
3842 ANYOF_BITMAP_SET(data->start_class, value);
3846 CASE_SYNST_FNC(VERTWS);
3847 CASE_SYNST_FNC(HORIZWS);
3850 if (flags & SCF_DO_STCLASS_OR)
3851 cl_and(data->start_class, and_withp);
3852 flags &= ~SCF_DO_STCLASS;
3855 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3856 data->flags |= (OP(scan) == MEOL
3860 else if ( PL_regkind[OP(scan)] == BRANCHJ
3861 /* Lookbehind, or need to calculate parens/evals/stclass: */
3862 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3863 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3864 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3865 || OP(scan) == UNLESSM )
3867 /* Negative Lookahead/lookbehind
3868 In this case we can't do fixed string optimisation.
3871 I32 deltanext, minnext, fake = 0;
3873 struct regnode_charclass_class intrnl;
3876 data_fake.flags = 0;
3878 data_fake.whilem_c = data->whilem_c;
3879 data_fake.last_closep = data->last_closep;
3882 data_fake.last_closep = &fake;
3883 data_fake.pos_delta = delta;
3884 if ( flags & SCF_DO_STCLASS && !scan->flags
3885 && OP(scan) == IFMATCH ) { /* Lookahead */
3886 cl_init(pRExC_state, &intrnl);
3887 data_fake.start_class = &intrnl;
3888 f |= SCF_DO_STCLASS_AND;
3890 if (flags & SCF_WHILEM_VISITED_POS)
3891 f |= SCF_WHILEM_VISITED_POS;
3892 next = regnext(scan);
3893 nscan = NEXTOPER(NEXTOPER(scan));
3894 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3895 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3898 FAIL("Variable length lookbehind not implemented");
3900 else if (minnext > (I32)U8_MAX) {
3901 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3903 scan->flags = (U8)minnext;
3906 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3908 if (data_fake.flags & SF_HAS_EVAL)
3909 data->flags |= SF_HAS_EVAL;
3910 data->whilem_c = data_fake.whilem_c;
3912 if (f & SCF_DO_STCLASS_AND) {
3913 if (flags & SCF_DO_STCLASS_OR) {
3914 /* OR before, AND after: ideally we would recurse with
3915 * data_fake to get the AND applied by study of the
3916 * remainder of the pattern, and then derecurse;
3917 * *** HACK *** for now just treat as "no information".
3918 * See [perl #56690].
3920 cl_init(pRExC_state, data->start_class);
3922 /* AND before and after: combine and continue */
3923 const int was = (data->start_class->flags & ANYOF_EOS);
3925 cl_and(data->start_class, &intrnl);
3927 data->start_class->flags |= ANYOF_EOS;
3931 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3933 /* Positive Lookahead/lookbehind
3934 In this case we can do fixed string optimisation,
3935 but we must be careful about it. Note in the case of
3936 lookbehind the positions will be offset by the minimum
3937 length of the pattern, something we won't know about
3938 until after the recurse.
3940 I32 deltanext, fake = 0;
3942 struct regnode_charclass_class intrnl;
3944 /* We use SAVEFREEPV so that when the full compile
3945 is finished perl will clean up the allocated
3946 minlens when it's all done. This way we don't
3947 have to worry about freeing them when we know
3948 they wont be used, which would be a pain.
3951 Newx( minnextp, 1, I32 );
3952 SAVEFREEPV(minnextp);
3955 StructCopy(data, &data_fake, scan_data_t);
3956 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3959 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3960 data_fake.last_found=newSVsv(data->last_found);
3964 data_fake.last_closep = &fake;
3965 data_fake.flags = 0;
3966 data_fake.pos_delta = delta;
3968 data_fake.flags |= SF_IS_INF;
3969 if ( flags & SCF_DO_STCLASS && !scan->flags
3970 && OP(scan) == IFMATCH ) { /* Lookahead */
3971 cl_init(pRExC_state, &intrnl);
3972 data_fake.start_class = &intrnl;
3973 f |= SCF_DO_STCLASS_AND;
3975 if (flags & SCF_WHILEM_VISITED_POS)
3976 f |= SCF_WHILEM_VISITED_POS;
3977 next = regnext(scan);
3978 nscan = NEXTOPER(NEXTOPER(scan));
3980 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3981 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3984 FAIL("Variable length lookbehind not implemented");
3986 else if (*minnextp > (I32)U8_MAX) {
3987 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3989 scan->flags = (U8)*minnextp;
3994 if (f & SCF_DO_STCLASS_AND) {
3995 const int was = (data->start_class->flags & ANYOF_EOS);
3997 cl_and(data->start_class, &intrnl);
3999 data->start_class->flags |= ANYOF_EOS;
4002 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4004 if (data_fake.flags & SF_HAS_EVAL)
4005 data->flags |= SF_HAS_EVAL;
4006 data->whilem_c = data_fake.whilem_c;
4007 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4008 if (RExC_rx->minlen<*minnextp)
4009 RExC_rx->minlen=*minnextp;
4010 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4011 SvREFCNT_dec(data_fake.last_found);
4013 if ( data_fake.minlen_fixed != minlenp )
4015 data->offset_fixed= data_fake.offset_fixed;
4016 data->minlen_fixed= data_fake.minlen_fixed;
4017 data->lookbehind_fixed+= scan->flags;
4019 if ( data_fake.minlen_float != minlenp )
4021 data->minlen_float= data_fake.minlen_float;
4022 data->offset_float_min=data_fake.offset_float_min;
4023 data->offset_float_max=data_fake.offset_float_max;
4024 data->lookbehind_float+= scan->flags;
4033 else if (OP(scan) == OPEN) {
4034 if (stopparen != (I32)ARG(scan))
4037 else if (OP(scan) == CLOSE) {
4038 if (stopparen == (I32)ARG(scan)) {
4041 if ((I32)ARG(scan) == is_par) {
4042 next = regnext(scan);
4044 if ( next && (OP(next) != WHILEM) && next < last)
4045 is_par = 0; /* Disable optimization */
4048 *(data->last_closep) = ARG(scan);
4050 else if (OP(scan) == EVAL) {
4052 data->flags |= SF_HAS_EVAL;
4054 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4055 if (flags & SCF_DO_SUBSTR) {
4056 SCAN_COMMIT(pRExC_state,data,minlenp);
4057 flags &= ~SCF_DO_SUBSTR;
4059 if (data && OP(scan)==ACCEPT) {
4060 data->flags |= SCF_SEEN_ACCEPT;
4065 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4067 if (flags & SCF_DO_SUBSTR) {
4068 SCAN_COMMIT(pRExC_state,data,minlenp);
4069 data->longest = &(data->longest_float);
4071 is_inf = is_inf_internal = 1;
4072 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4073 cl_anything(pRExC_state, data->start_class);
4074 flags &= ~SCF_DO_STCLASS;
4076 else if (OP(scan) == GPOS) {
4077 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4078 !(delta || is_inf || (data && data->pos_delta)))
4080 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4081 RExC_rx->extflags |= RXf_ANCH_GPOS;
4082 if (RExC_rx->gofs < (U32)min)
4083 RExC_rx->gofs = min;
4085 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4089 #ifdef TRIE_STUDY_OPT
4090 #ifdef FULL_TRIE_STUDY
4091 else if (PL_regkind[OP(scan)] == TRIE) {
4092 /* NOTE - There is similar code to this block above for handling
4093 BRANCH nodes on the initial study. If you change stuff here
4095 regnode *trie_node= scan;
4096 regnode *tail= regnext(scan);
4097 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4098 I32 max1 = 0, min1 = I32_MAX;
4099 struct regnode_charclass_class accum;
4101 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4102 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4103 if (flags & SCF_DO_STCLASS)
4104 cl_init_zero(pRExC_state, &accum);
4110 const regnode *nextbranch= NULL;
4113 for ( word=1 ; word <= trie->wordcount ; word++)
4115 I32 deltanext=0, minnext=0, f = 0, fake;
4116 struct regnode_charclass_class this_class;
4118 data_fake.flags = 0;
4120 data_fake.whilem_c = data->whilem_c;
4121 data_fake.last_closep = data->last_closep;
4124 data_fake.last_closep = &fake;
4125 data_fake.pos_delta = delta;
4126 if (flags & SCF_DO_STCLASS) {
4127 cl_init(pRExC_state, &this_class);
4128 data_fake.start_class = &this_class;
4129 f = SCF_DO_STCLASS_AND;
4131 if (flags & SCF_WHILEM_VISITED_POS)
4132 f |= SCF_WHILEM_VISITED_POS;
4134 if (trie->jump[word]) {
4136 nextbranch = trie_node + trie->jump[0];
4137 scan= trie_node + trie->jump[word];
4138 /* We go from the jump point to the branch that follows
4139 it. Note this means we need the vestigal unused branches
4140 even though they arent otherwise used.
4142 minnext = study_chunk(pRExC_state, &scan, minlenp,
4143 &deltanext, (regnode *)nextbranch, &data_fake,
4144 stopparen, recursed, NULL, f,depth+1);
4146 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4147 nextbranch= regnext((regnode*)nextbranch);
4149 if (min1 > (I32)(minnext + trie->minlen))
4150 min1 = minnext + trie->minlen;
4151 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4152 max1 = minnext + deltanext + trie->maxlen;
4153 if (deltanext == I32_MAX)
4154 is_inf = is_inf_internal = 1;
4156 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4158 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4159 if ( stopmin > min + min1)
4160 stopmin = min + min1;
4161 flags &= ~SCF_DO_SUBSTR;
4163 data->flags |= SCF_SEEN_ACCEPT;
4166 if (data_fake.flags & SF_HAS_EVAL)
4167 data->flags |= SF_HAS_EVAL;
4168 data->whilem_c = data_fake.whilem_c;
4170 if (flags & SCF_DO_STCLASS)
4171 cl_or(pRExC_state, &accum, &this_class);
4174 if (flags & SCF_DO_SUBSTR) {
4175 data->pos_min += min1;
4176 data->pos_delta += max1 - min1;
4177 if (max1 != min1 || is_inf)
4178 data->longest = &(data->longest_float);
4181 delta += max1 - min1;
4182 if (flags & SCF_DO_STCLASS_OR) {
4183 cl_or(pRExC_state, data->start_class, &accum);
4185 cl_and(data->start_class, and_withp);
4186 flags &= ~SCF_DO_STCLASS;
4189 else if (flags & SCF_DO_STCLASS_AND) {
4191 cl_and(data->start_class, &accum);
4192 flags &= ~SCF_DO_STCLASS;
4195 /* Switch to OR mode: cache the old value of
4196 * data->start_class */
4198 StructCopy(data->start_class, and_withp,
4199 struct regnode_charclass_class);
4200 flags &= ~SCF_DO_STCLASS_AND;
4201 StructCopy(&accum, data->start_class,
4202 struct regnode_charclass_class);
4203 flags |= SCF_DO_STCLASS_OR;
4204 data->start_class->flags |= ANYOF_EOS;
4211 else if (PL_regkind[OP(scan)] == TRIE) {
4212 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4215 min += trie->minlen;
4216 delta += (trie->maxlen - trie->minlen);
4217 flags &= ~SCF_DO_STCLASS; /* xxx */
4218 if (flags & SCF_DO_SUBSTR) {
4219 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4220 data->pos_min += trie->minlen;
4221 data->pos_delta += (trie->maxlen - trie->minlen);
4222 if (trie->maxlen != trie->minlen)
4223 data->longest = &(data->longest_float);
4225 if (trie->jump) /* no more substrings -- for now /grr*/
4226 flags &= ~SCF_DO_SUBSTR;
4228 #endif /* old or new */
4229 #endif /* TRIE_STUDY_OPT */
4231 /* Else: zero-length, ignore. */
4232 scan = regnext(scan);
4237 stopparen = frame->stop;
4238 frame = frame->prev;
4239 goto fake_study_recurse;
4244 DEBUG_STUDYDATA("pre-fin:",data,depth);
4247 *deltap = is_inf_internal ? I32_MAX : delta;
4248 if (flags & SCF_DO_SUBSTR && is_inf)
4249 data->pos_delta = I32_MAX - data->pos_min;
4250 if (is_par > (I32)U8_MAX)
4252 if (is_par && pars==1 && data) {
4253 data->flags |= SF_IN_PAR;
4254 data->flags &= ~SF_HAS_PAR;
4256 else if (pars && data) {
4257 data->flags |= SF_HAS_PAR;
4258 data->flags &= ~SF_IN_PAR;
4260 if (flags & SCF_DO_STCLASS_OR)
4261 cl_and(data->start_class, and_withp);
4262 if (flags & SCF_TRIE_RESTUDY)
4263 data->flags |= SCF_TRIE_RESTUDY;
4265 DEBUG_STUDYDATA("post-fin:",data,depth);
4267 return min < stopmin ? min : stopmin;
4271 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4273 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4275 PERL_ARGS_ASSERT_ADD_DATA;
4277 Renewc(RExC_rxi->data,
4278 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4279 char, struct reg_data);
4281 Renew(RExC_rxi->data->what, count + n, U8);
4283 Newx(RExC_rxi->data->what, n, U8);
4284 RExC_rxi->data->count = count + n;
4285 Copy(s, RExC_rxi->data->what + count, n, U8);
4289 /*XXX: todo make this not included in a non debugging perl */
4290 #ifndef PERL_IN_XSUB_RE
4292 Perl_reginitcolors(pTHX)
4295 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4297 char *t = savepv(s);
4301 t = strchr(t, '\t');
4307 PL_colors[i] = t = (char *)"";
4312 PL_colors[i++] = (char *)"";
4319 #ifdef TRIE_STUDY_OPT
4320 #define CHECK_RESTUDY_GOTO \
4322 (data.flags & SCF_TRIE_RESTUDY) \
4326 #define CHECK_RESTUDY_GOTO
4330 - pregcomp - compile a regular expression into internal code
4332 * We can't allocate space until we know how big the compiled form will be,
4333 * but we can't compile it (and thus know how big it is) until we've got a
4334 * place to put the code. So we cheat: we compile it twice, once with code
4335 * generation turned off and size counting turned on, and once "for real".
4336 * This also means that we don't allocate space until we are sure that the
4337 * thing really will compile successfully, and we never have to move the
4338 * code and thus invalidate pointers into it. (Note that it has to be in
4339 * one piece because free() must be able to free it all.) [NB: not true in perl]
4341 * Beware that the optimization-preparation code in here knows about some
4342 * of the structure of the compiled regexp. [I'll say.]
4347 #ifndef PERL_IN_XSUB_RE
4348 #define RE_ENGINE_PTR &PL_core_reg_engine
4350 extern const struct regexp_engine my_reg_engine;
4351 #define RE_ENGINE_PTR &my_reg_engine
4354 #ifndef PERL_IN_XSUB_RE
4356 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4359 HV * const table = GvHV(PL_hintgv);
4361 PERL_ARGS_ASSERT_PREGCOMP;
4363 /* Dispatch a request to compile a regexp to correct
4366 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4367 GET_RE_DEBUG_FLAGS_DECL;
4368 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4369 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4371 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4374 return CALLREGCOMP_ENG(eng, pattern, flags);
4377 return Perl_re_compile(aTHX_ pattern, flags);
4382 Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
4387 register regexp_internal *ri;
4395 /* these are all flags - maybe they should be turned
4396 * into a single int with different bit masks */
4397 I32 sawlookahead = 0;
4404 RExC_state_t RExC_state;
4405 RExC_state_t * const pRExC_state = &RExC_state;
4406 #ifdef TRIE_STUDY_OPT
4408 RExC_state_t copyRExC_state;
4410 GET_RE_DEBUG_FLAGS_DECL;
4412 PERL_ARGS_ASSERT_RE_COMPILE;
4414 DEBUG_r(if (!PL_colorset) reginitcolors());
4416 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4419 /* Longjmp back to here if have to switch in midstream to utf8 */
4420 if (! RExC_orig_utf8) {
4421 JMPENV_PUSH(jump_ret);
4424 if (jump_ret == 0) { /* First time through */
4425 exp = SvPV(pattern, plen);
4429 SV *dsv= sv_newmortal();
4430 RE_PV_QUOTED_DECL(s, RExC_utf8,
4431 dsv, exp, plen, 60);
4432 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4433 PL_colors[4],PL_colors[5],s);
4436 else { /* longjumped back */
4439 /* If the cause for the longjmp was other than changing to utf8, pop
4440 * our own setjmp, and longjmp to the correct handler */
4441 if (jump_ret != UTF8_LONGJMP) {
4443 JMPENV_JUMP(jump_ret);
4448 /* It's possible to write a regexp in ascii that represents Unicode
4449 codepoints outside of the byte range, such as via \x{100}. If we
4450 detect such a sequence we have to convert the entire pattern to utf8
4451 and then recompile, as our sizing calculation will have been based
4452 on 1 byte == 1 character, but we will need to use utf8 to encode
4453 at least some part of the pattern, and therefore must convert the whole
4456 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4457 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4458 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4460 RExC_orig_utf8 = RExC_utf8 = 1;
4464 #ifdef TRIE_STUDY_OPT
4468 /* Set to use unicode semantics if the pattern is in utf8 and has the
4469 * 'dual' charset specified, as it means unicode when utf8 */
4470 if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4471 pm_flags |= RXf_PMf_UNICODE;
4475 RExC_flags = pm_flags;
4479 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4480 RExC_seen_evals = 0;
4483 /* First pass: determine size, legality. */
4491 RExC_emit = &PL_regdummy;
4492 RExC_whilem_seen = 0;
4493 RExC_open_parens = NULL;
4494 RExC_close_parens = NULL;
4496 RExC_paren_names = NULL;
4498 RExC_paren_name_list = NULL;
4500 RExC_recurse = NULL;
4501 RExC_recurse_count = 0;
4503 #if 0 /* REGC() is (currently) a NOP at the first pass.
4504 * Clever compilers notice this and complain. --jhi */
4505 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4507 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4508 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4509 RExC_precomp = NULL;
4513 /* Here, finished first pass. Get rid of our setjmp, which we added for
4514 * efficiency only if the passed-in string wasn't in utf8, as shown by
4515 * RExC_orig_utf8. But if the first pass was redone, that variable will be
4516 * 1 here even though the original string wasn't utf8, but in this case
4517 * there will have been a long jump */
4518 if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
4522 PerlIO_printf(Perl_debug_log,
4523 "Required size %"IVdf" nodes\n"
4524 "Starting second pass (creation)\n",
4527 RExC_lastparse=NULL;
4529 /* Small enough for pointer-storage convention?
4530 If extralen==0, this means that we will not need long jumps. */
4531 if (RExC_size >= 0x10000L && RExC_extralen)
4532 RExC_size += RExC_extralen;
4535 if (RExC_whilem_seen > 15)
4536 RExC_whilem_seen = 15;
4538 /* Allocate space and zero-initialize. Note, the two step process
4539 of zeroing when in debug mode, thus anything assigned has to
4540 happen after that */
4541 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4542 r = (struct regexp*)SvANY(rx);
4543 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4544 char, regexp_internal);
4545 if ( r == NULL || ri == NULL )
4546 FAIL("Regexp out of space");
4548 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4549 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4551 /* bulk initialize base fields with 0. */
4552 Zero(ri, sizeof(regexp_internal), char);
4555 /* non-zero initialization begins here */
4557 r->engine= RE_ENGINE_PTR;
4558 r->extflags = pm_flags;
4560 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4561 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4563 /* The caret is output if there are any defaults: if not all the STD
4564 * flags are set, or if no character set specifier is needed */
4566 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4568 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4569 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4570 >> RXf_PMf_STD_PMMOD_SHIFT);
4571 const char *fptr = STD_PAT_MODS; /*"msix"*/
4573 /* Allocate for the worst case, which is all the std flags are turned
4574 * on. If more precision is desired, we could do a population count of
4575 * the flags set. This could be done with a small lookup table, or by
4576 * shifting, masking and adding, or even, when available, assembly
4577 * language for a machine-language population count.
4578 * We never output a minus, as all those are defaults, so are
4579 * covered by the caret */
4580 const STRLEN wraplen = plen + has_p + has_runon
4581 + has_default /* If needs a caret */
4582 + has_charset /* If needs a character set specifier */
4583 + (sizeof(STD_PAT_MODS) - 1)
4584 + (sizeof("(?:)") - 1);
4586 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4588 SvFLAGS(rx) |= SvUTF8(pattern);
4591 /* If a default, cover it using the caret */
4593 *p++= DEFAULT_PAT_MOD;
4596 if (r->extflags & RXf_PMf_LOCALE) {
4597 *p++ = LOCALE_PAT_MOD;
4599 *p++ = UNICODE_PAT_MOD;
4603 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4606 while((ch = *fptr++)) {
4614 Copy(RExC_precomp, p, plen, char);
4615 assert ((RX_WRAPPED(rx) - p) < 16);
4616 r->pre_prefix = p - RX_WRAPPED(rx);
4622 SvCUR_set(rx, p - SvPVX_const(rx));
4626 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4628 if (RExC_seen & REG_SEEN_RECURSE) {
4629 Newxz(RExC_open_parens, RExC_npar,regnode *);
4630 SAVEFREEPV(RExC_open_parens);
4631 Newxz(RExC_close_parens,RExC_npar,regnode *);
4632 SAVEFREEPV(RExC_close_parens);
4635 /* Useful during FAIL. */
4636 #ifdef RE_TRACK_PATTERN_OFFSETS
4637 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4638 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4639 "%s %"UVuf" bytes for offset annotations.\n",
4640 ri->u.offsets ? "Got" : "Couldn't get",
4641 (UV)((2*RExC_size+1) * sizeof(U32))));
4643 SetProgLen(ri,RExC_size);
4648 /* Second pass: emit code. */
4649 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4654 RExC_emit_start = ri->program;
4655 RExC_emit = ri->program;
4656 RExC_emit_bound = ri->program + RExC_size + 1;
4658 /* Store the count of eval-groups for security checks: */
4659 RExC_rx->seen_evals = RExC_seen_evals;
4660 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4661 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4665 /* XXXX To minimize changes to RE engine we always allocate
4666 3-units-long substrs field. */
4667 Newx(r->substrs, 1, struct reg_substr_data);
4668 if (RExC_recurse_count) {
4669 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4670 SAVEFREEPV(RExC_recurse);
4674 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4675 Zero(r->substrs, 1, struct reg_substr_data);
4677 #ifdef TRIE_STUDY_OPT
4679 StructCopy(&zero_scan_data, &data, scan_data_t);
4680 copyRExC_state = RExC_state;
4683 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4685 RExC_state = copyRExC_state;
4686 if (seen & REG_TOP_LEVEL_BRANCHES)
4687 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4689 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4690 if (data.last_found) {
4691 SvREFCNT_dec(data.longest_fixed);
4692 SvREFCNT_dec(data.longest_float);
4693 SvREFCNT_dec(data.last_found);
4695 StructCopy(&zero_scan_data, &data, scan_data_t);
4698 StructCopy(&zero_scan_data, &data, scan_data_t);
4701 /* Dig out information for optimizations. */
4702 r->extflags = RExC_flags; /* was pm_op */
4703 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4706 SvUTF8_on(rx); /* Unicode in it? */
4707 ri->regstclass = NULL;
4708 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4709 r->intflags |= PREGf_NAUGHTY;
4710 scan = ri->program + 1; /* First BRANCH. */
4712 /* testing for BRANCH here tells us whether there is "must appear"
4713 data in the pattern. If there is then we can use it for optimisations */
4714 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4716 STRLEN longest_float_length, longest_fixed_length;
4717 struct regnode_charclass_class ch_class; /* pointed to by data */
4719 I32 last_close = 0; /* pointed to by data */
4720 regnode *first= scan;
4721 regnode *first_next= regnext(first);
4723 * Skip introductions and multiplicators >= 1
4724 * so that we can extract the 'meat' of the pattern that must
4725 * match in the large if() sequence following.
4726 * NOTE that EXACT is NOT covered here, as it is normally
4727 * picked up by the optimiser separately.
4729 * This is unfortunate as the optimiser isnt handling lookahead
4730 * properly currently.
4733 while ((OP(first) == OPEN && (sawopen = 1)) ||
4734 /* An OR of *one* alternative - should not happen now. */
4735 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4736 /* for now we can't handle lookbehind IFMATCH*/
4737 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4738 (OP(first) == PLUS) ||
4739 (OP(first) == MINMOD) ||
4740 /* An {n,m} with n>0 */
4741 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4742 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4745 * the only op that could be a regnode is PLUS, all the rest
4746 * will be regnode_1 or regnode_2.
4749 if (OP(first) == PLUS)
4752 first += regarglen[OP(first)];
4754 first = NEXTOPER(first);
4755 first_next= regnext(first);
4758 /* Starting-point info. */
4760 DEBUG_PEEP("first:",first,0);
4761 /* Ignore EXACT as we deal with it later. */
4762 if (PL_regkind[OP(first)] == EXACT) {
4763 if (OP(first) == EXACT)
4764 NOOP; /* Empty, get anchored substr later. */
4766 ri->regstclass = first;
4769 else if (PL_regkind[OP(first)] == TRIE &&
4770 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4773 /* this can happen only on restudy */
4774 if ( OP(first) == TRIE ) {
4775 struct regnode_1 *trieop = (struct regnode_1 *)
4776 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4777 StructCopy(first,trieop,struct regnode_1);
4778 trie_op=(regnode *)trieop;
4780 struct regnode_charclass *trieop = (struct regnode_charclass *)
4781 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4782 StructCopy(first,trieop,struct regnode_charclass);
4783 trie_op=(regnode *)trieop;
4786 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4787 ri->regstclass = trie_op;
4790 else if (REGNODE_SIMPLE(OP(first)))
4791 ri->regstclass = first;
4792 else if (PL_regkind[OP(first)] == BOUND ||
4793 PL_regkind[OP(first)] == NBOUND)
4794 ri->regstclass = first;
4795 else if (PL_regkind[OP(first)] == BOL) {
4796 r->extflags |= (OP(first) == MBOL
4798 : (OP(first) == SBOL
4801 first = NEXTOPER(first);
4804 else if (OP(first) == GPOS) {
4805 r->extflags |= RXf_ANCH_GPOS;
4806 first = NEXTOPER(first);
4809 else if ((!sawopen || !RExC_sawback) &&
4810 (OP(first) == STAR &&
4811 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4812 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4814 /* turn .* into ^.* with an implied $*=1 */
4816 (OP(NEXTOPER(first)) == REG_ANY)
4819 r->extflags |= type;
4820 r->intflags |= PREGf_IMPLICIT;
4821 first = NEXTOPER(first);
4824 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4825 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4826 /* x+ must match at the 1st pos of run of x's */
4827 r->intflags |= PREGf_SKIP;
4829 /* Scan is after the zeroth branch, first is atomic matcher. */
4830 #ifdef TRIE_STUDY_OPT
4833 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4834 (IV)(first - scan + 1))
4838 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4839 (IV)(first - scan + 1))
4845 * If there's something expensive in the r.e., find the
4846 * longest literal string that must appear and make it the
4847 * regmust. Resolve ties in favor of later strings, since
4848 * the regstart check works with the beginning of the r.e.
4849 * and avoiding duplication strengthens checking. Not a
4850 * strong reason, but sufficient in the absence of others.
4851 * [Now we resolve ties in favor of the earlier string if
4852 * it happens that c_offset_min has been invalidated, since the
4853 * earlier string may buy us something the later one won't.]
4856 data.longest_fixed = newSVpvs("");
4857 data.longest_float = newSVpvs("");
4858 data.last_found = newSVpvs("");
4859 data.longest = &(data.longest_fixed);
4861 if (!ri->regstclass) {
4862 cl_init(pRExC_state, &ch_class);
4863 data.start_class = &ch_class;
4864 stclass_flag = SCF_DO_STCLASS_AND;
4865 } else /* XXXX Check for BOUND? */
4867 data.last_closep = &last_close;
4869 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4870 &data, -1, NULL, NULL,
4871 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4877 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4878 && data.last_start_min == 0 && data.last_end > 0
4879 && !RExC_seen_zerolen
4880 && !(RExC_seen & REG_SEEN_VERBARG)
4881 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4882 r->extflags |= RXf_CHECK_ALL;
4883 scan_commit(pRExC_state, &data,&minlen,0);
4884 SvREFCNT_dec(data.last_found);
4886 /* Note that code very similar to this but for anchored string
4887 follows immediately below, changes may need to be made to both.
4890 longest_float_length = CHR_SVLEN(data.longest_float);
4891 if (longest_float_length
4892 || (data.flags & SF_FL_BEFORE_EOL
4893 && (!(data.flags & SF_FL_BEFORE_MEOL)
4894 || (RExC_flags & RXf_PMf_MULTILINE))))
4898 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4899 && data.offset_fixed == data.offset_float_min
4900 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4901 goto remove_float; /* As in (a)+. */
4903 /* copy the information about the longest float from the reg_scan_data
4904 over to the program. */
4905 if (SvUTF8(data.longest_float)) {
4906 r->float_utf8 = data.longest_float;
4907 r->float_substr = NULL;
4909 r->float_substr = data.longest_float;
4910 r->float_utf8 = NULL;
4912 /* float_end_shift is how many chars that must be matched that
4913 follow this item. We calculate it ahead of time as once the
4914 lookbehind offset is added in we lose the ability to correctly
4916 ml = data.minlen_float ? *(data.minlen_float)
4917 : (I32)longest_float_length;
4918 r->float_end_shift = ml - data.offset_float_min
4919 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4920 + data.lookbehind_float;
4921 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4922 r->float_max_offset = data.offset_float_max;
4923 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4924 r->float_max_offset -= data.lookbehind_float;
4926 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4927 && (!(data.flags & SF_FL_BEFORE_MEOL)
4928 || (RExC_flags & RXf_PMf_MULTILINE)));
4929 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4933 r->float_substr = r->float_utf8 = NULL;
4934 SvREFCNT_dec(data.longest_float);
4935 longest_float_length = 0;
4938 /* Note that code very similar to this but for floating string
4939 is immediately above, changes may need to be made to both.
4942 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4943 if (longest_fixed_length
4944 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4945 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4946 || (RExC_flags & RXf_PMf_MULTILINE))))
4950 /* copy the information about the longest fixed
4951 from the reg_scan_data over to the program. */
4952 if (SvUTF8(data.longest_fixed)) {
4953 r->anchored_utf8 = data.longest_fixed;
4954 r->anchored_substr = NULL;
4956 r->anchored_substr = data.longest_fixed;
4957 r->anchored_utf8 = NULL;
4959 /* fixed_end_shift is how many chars that must be matched that
4960 follow this item. We calculate it ahead of time as once the
4961 lookbehind offset is added in we lose the ability to correctly
4963 ml = data.minlen_fixed ? *(data.minlen_fixed)
4964 : (I32)longest_fixed_length;
4965 r->anchored_end_shift = ml - data.offset_fixed
4966 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4967 + data.lookbehind_fixed;
4968 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4970 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4971 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4972 || (RExC_flags & RXf_PMf_MULTILINE)));
4973 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4976 r->anchored_substr = r->anchored_utf8 = NULL;
4977 SvREFCNT_dec(data.longest_fixed);
4978 longest_fixed_length = 0;
4981 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4982 ri->regstclass = NULL;
4983 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4985 && !(data.start_class->flags & ANYOF_EOS)
4986 && !cl_is_anything(data.start_class))
4988 const U32 n = add_data(pRExC_state, 1, "f");
4990 Newx(RExC_rxi->data->data[n], 1,
4991 struct regnode_charclass_class);
4992 StructCopy(data.start_class,
4993 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4994 struct regnode_charclass_class);
4995 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
4996 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
4997 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
4998 regprop(r, sv, (regnode*)data.start_class);
4999 PerlIO_printf(Perl_debug_log,
5000 "synthetic stclass \"%s\".\n",
5001 SvPVX_const(sv));});
5004 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5005 if (longest_fixed_length > longest_float_length) {
5006 r->check_end_shift = r->anchored_end_shift;
5007 r->check_substr = r->anchored_substr;
5008 r->check_utf8 = r->anchored_utf8;
5009 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5010 if (r->extflags & RXf_ANCH_SINGLE)
5011 r->extflags |= RXf_NOSCAN;
5014 r->check_end_shift = r->float_end_shift;
5015 r->check_substr = r->float_substr;
5016 r->check_utf8 = r->float_utf8;
5017 r->check_offset_min = r->float_min_offset;
5018 r->check_offset_max = r->float_max_offset;
5020 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5021 This should be changed ASAP! */
5022 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5023 r->extflags |= RXf_USE_INTUIT;
5024 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5025 r->extflags |= RXf_INTUIT_TAIL;
5027 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5028 if ( (STRLEN)minlen < longest_float_length )
5029 minlen= longest_float_length;
5030 if ( (STRLEN)minlen < longest_fixed_length )
5031 minlen= longest_fixed_length;
5035 /* Several toplevels. Best we can is to set minlen. */
5037 struct regnode_charclass_class ch_class;
5040 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5042 scan = ri->program + 1;
5043 cl_init(pRExC_state, &ch_class);
5044 data.start_class = &ch_class;
5045 data.last_closep = &last_close;
5048 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5049 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5053 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5054 = r->float_substr = r->float_utf8 = NULL;
5055 if (!(data.start_class->flags & ANYOF_EOS)
5056 && !cl_is_anything(data.start_class))
5058 const U32 n = add_data(pRExC_state, 1, "f");
5060 Newx(RExC_rxi->data->data[n], 1,
5061 struct regnode_charclass_class);
5062 StructCopy(data.start_class,
5063 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5064 struct regnode_charclass_class);
5065 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5066 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5067 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5068 regprop(r, sv, (regnode*)data.start_class);
5069 PerlIO_printf(Perl_debug_log,
5070 "synthetic stclass \"%s\".\n",
5071 SvPVX_const(sv));});
5075 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5076 the "real" pattern. */
5078 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5079 (IV)minlen, (IV)r->minlen);
5081 r->minlenret = minlen;
5082 if (r->minlen < minlen)
5085 if (RExC_seen & REG_SEEN_GPOS)
5086 r->extflags |= RXf_GPOS_SEEN;
5087 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5088 r->extflags |= RXf_LOOKBEHIND_SEEN;
5089 if (RExC_seen & REG_SEEN_EVAL)
5090 r->extflags |= RXf_EVAL_SEEN;
5091 if (RExC_seen & REG_SEEN_CANY)
5092 r->extflags |= RXf_CANY_SEEN;
5093 if (RExC_seen & REG_SEEN_VERBARG)
5094 r->intflags |= PREGf_VERBARG_SEEN;
5095 if (RExC_seen & REG_SEEN_CUTGROUP)
5096 r->intflags |= PREGf_CUTGROUP_SEEN;
5097 if (RExC_paren_names)
5098 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5100 RXp_PAREN_NAMES(r) = NULL;
5102 #ifdef STUPID_PATTERN_CHECKS
5103 if (RX_PRELEN(rx) == 0)
5104 r->extflags |= RXf_NULL;
5105 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5106 /* XXX: this should happen BEFORE we compile */
5107 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5108 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5109 r->extflags |= RXf_WHITE;
5110 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5111 r->extflags |= RXf_START_ONLY;
5113 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5114 /* XXX: this should happen BEFORE we compile */
5115 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5117 regnode *first = ri->program + 1;
5119 U8 nop = OP(NEXTOPER(first));
5121 if (PL_regkind[fop] == NOTHING && nop == END)
5122 r->extflags |= RXf_NULL;
5123 else if (PL_regkind[fop] == BOL && nop == END)
5124 r->extflags |= RXf_START_ONLY;
5125 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5126 r->extflags |= RXf_WHITE;
5130 if (RExC_paren_names) {
5131 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5132 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5135 ri->name_list_idx = 0;
5137 if (RExC_recurse_count) {
5138 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5139 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5140 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5143 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5144 /* assume we don't need to swap parens around before we match */
5147 PerlIO_printf(Perl_debug_log,"Final program:\n");
5150 #ifdef RE_TRACK_PATTERN_OFFSETS
5151 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5152 const U32 len = ri->u.offsets[0];
5154 GET_RE_DEBUG_FLAGS_DECL;
5155 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5156 for (i = 1; i <= len; i++) {
5157 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5158 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5159 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5161 PerlIO_printf(Perl_debug_log, "\n");
5167 #undef RE_ENGINE_PTR
5171 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5174 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5176 PERL_UNUSED_ARG(value);
5178 if (flags & RXapif_FETCH) {
5179 return reg_named_buff_fetch(rx, key, flags);
5180 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5181 Perl_croak_no_modify(aTHX);
5183 } else if (flags & RXapif_EXISTS) {
5184 return reg_named_buff_exists(rx, key, flags)
5187 } else if (flags & RXapif_REGNAMES) {
5188 return reg_named_buff_all(rx, flags);
5189 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5190 return reg_named_buff_scalar(rx, flags);
5192 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5198 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5201 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5202 PERL_UNUSED_ARG(lastkey);
5204 if (flags & RXapif_FIRSTKEY)
5205 return reg_named_buff_firstkey(rx, flags);
5206 else if (flags & RXapif_NEXTKEY)
5207 return reg_named_buff_nextkey(rx, flags);
5209 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5215 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5218 AV *retarray = NULL;
5220 struct regexp *const rx = (struct regexp *)SvANY(r);
5222 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5224 if (flags & RXapif_ALL)
5227 if (rx && RXp_PAREN_NAMES(rx)) {
5228 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5231 SV* sv_dat=HeVAL(he_str);
5232 I32 *nums=(I32*)SvPVX(sv_dat);
5233 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5234 if ((I32)(rx->nparens) >= nums[i]
5235 && rx->offs[nums[i]].start != -1
5236 && rx->offs[nums[i]].end != -1)
5239 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5243 ret = newSVsv(&PL_sv_undef);
5246 av_push(retarray, ret);
5249 return newRV_noinc(MUTABLE_SV(retarray));
5256 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5259 struct regexp *const rx = (struct regexp *)SvANY(r);
5261 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5263 if (rx && RXp_PAREN_NAMES(rx)) {
5264 if (flags & RXapif_ALL) {
5265 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5267 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5281 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5283 struct regexp *const rx = (struct regexp *)SvANY(r);
5285 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5287 if ( rx && RXp_PAREN_NAMES(rx) ) {
5288 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5290 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5297 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5299 struct regexp *const rx = (struct regexp *)SvANY(r);
5300 GET_RE_DEBUG_FLAGS_DECL;
5302 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5304 if (rx && RXp_PAREN_NAMES(rx)) {
5305 HV *hv = RXp_PAREN_NAMES(rx);
5307 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5310 SV* sv_dat = HeVAL(temphe);
5311 I32 *nums = (I32*)SvPVX(sv_dat);
5312 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5313 if ((I32)(rx->lastparen) >= nums[i] &&
5314 rx->offs[nums[i]].start != -1 &&
5315 rx->offs[nums[i]].end != -1)
5321 if (parno || flags & RXapif_ALL) {
5322 return newSVhek(HeKEY_hek(temphe));
5330 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5335 struct regexp *const rx = (struct regexp *)SvANY(r);
5337 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5339 if (rx && RXp_PAREN_NAMES(rx)) {
5340 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5341 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5342 } else if (flags & RXapif_ONE) {
5343 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5344 av = MUTABLE_AV(SvRV(ret));
5345 length = av_len(av);
5347 return newSViv(length + 1);
5349 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5353 return &PL_sv_undef;
5357 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5359 struct regexp *const rx = (struct regexp *)SvANY(r);
5362 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5364 if (rx && RXp_PAREN_NAMES(rx)) {
5365 HV *hv= RXp_PAREN_NAMES(rx);
5367 (void)hv_iterinit(hv);
5368 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5371 SV* sv_dat = HeVAL(temphe);
5372 I32 *nums = (I32*)SvPVX(sv_dat);
5373 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5374 if ((I32)(rx->lastparen) >= nums[i] &&
5375 rx->offs[nums[i]].start != -1 &&
5376 rx->offs[nums[i]].end != -1)
5382 if (parno || flags & RXapif_ALL) {
5383 av_push(av, newSVhek(HeKEY_hek(temphe)));
5388 return newRV_noinc(MUTABLE_SV(av));
5392 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5395 struct regexp *const rx = (struct regexp *)SvANY(r);
5400 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5403 sv_setsv(sv,&PL_sv_undef);
5407 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5409 i = rx->offs[0].start;
5413 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5415 s = rx->subbeg + rx->offs[0].end;
5416 i = rx->sublen - rx->offs[0].end;
5419 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5420 (s1 = rx->offs[paren].start) != -1 &&
5421 (t1 = rx->offs[paren].end) != -1)
5425 s = rx->subbeg + s1;
5427 sv_setsv(sv,&PL_sv_undef);
5430 assert(rx->sublen >= (s - rx->subbeg) + i );
5432 const int oldtainted = PL_tainted;
5434 sv_setpvn(sv, s, i);
5435 PL_tainted = oldtainted;
5436 if ( (rx->extflags & RXf_CANY_SEEN)
5437 ? (RXp_MATCH_UTF8(rx)
5438 && (!i || is_utf8_string((U8*)s, i)))
5439 : (RXp_MATCH_UTF8(rx)) )
5446 if (RXp_MATCH_TAINTED(rx)) {
5447 if (SvTYPE(sv) >= SVt_PVMG) {
5448 MAGIC* const mg = SvMAGIC(sv);
5451 SvMAGIC_set(sv, mg->mg_moremagic);
5453 if ((mgt = SvMAGIC(sv))) {
5454 mg->mg_moremagic = mgt;
5455 SvMAGIC_set(sv, mg);
5465 sv_setsv(sv,&PL_sv_undef);
5471 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5472 SV const * const value)
5474 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5476 PERL_UNUSED_ARG(rx);
5477 PERL_UNUSED_ARG(paren);
5478 PERL_UNUSED_ARG(value);
5481 Perl_croak_no_modify(aTHX);
5485 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5488 struct regexp *const rx = (struct regexp *)SvANY(r);
5492 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5494 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5496 /* $` / ${^PREMATCH} */
5497 case RX_BUFF_IDX_PREMATCH:
5498 if (rx->offs[0].start != -1) {
5499 i = rx->offs[0].start;
5507 /* $' / ${^POSTMATCH} */
5508 case RX_BUFF_IDX_POSTMATCH:
5509 if (rx->offs[0].end != -1) {
5510 i = rx->sublen - rx->offs[0].end;
5512 s1 = rx->offs[0].end;
5518 /* $& / ${^MATCH}, $1, $2, ... */
5520 if (paren <= (I32)rx->nparens &&
5521 (s1 = rx->offs[paren].start) != -1 &&
5522 (t1 = rx->offs[paren].end) != -1)
5527 if (ckWARN(WARN_UNINITIALIZED))
5528 report_uninit((const SV *)sv);
5533 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5534 const char * const s = rx->subbeg + s1;
5539 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5546 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5548 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5549 PERL_UNUSED_ARG(rx);
5553 return newSVpvs("Regexp");
5556 /* Scans the name of a named buffer from the pattern.
5557 * If flags is REG_RSN_RETURN_NULL returns null.
5558 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5559 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5560 * to the parsed name as looked up in the RExC_paren_names hash.
5561 * If there is an error throws a vFAIL().. type exception.
5564 #define REG_RSN_RETURN_NULL 0
5565 #define REG_RSN_RETURN_NAME 1
5566 #define REG_RSN_RETURN_DATA 2
5569 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5571 char *name_start = RExC_parse;
5573 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5575 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5576 /* skip IDFIRST by using do...while */
5579 RExC_parse += UTF8SKIP(RExC_parse);
5580 } while (isALNUM_utf8((U8*)RExC_parse));
5584 } while (isALNUM(*RExC_parse));
5589 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5590 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5591 if ( flags == REG_RSN_RETURN_NAME)
5593 else if (flags==REG_RSN_RETURN_DATA) {
5596 if ( ! sv_name ) /* should not happen*/
5597 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5598 if (RExC_paren_names)
5599 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5601 sv_dat = HeVAL(he_str);
5603 vFAIL("Reference to nonexistent named group");
5607 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5614 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5615 int rem=(int)(RExC_end - RExC_parse); \
5624 if (RExC_lastparse!=RExC_parse) \
5625 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5628 iscut ? "..." : "<" \
5631 PerlIO_printf(Perl_debug_log,"%16s",""); \
5634 num = RExC_size + 1; \
5636 num=REG_NODE_NUM(RExC_emit); \
5637 if (RExC_lastnum!=num) \
5638 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5640 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5641 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5642 (int)((depth*2)), "", \
5646 RExC_lastparse=RExC_parse; \
5651 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5652 DEBUG_PARSE_MSG((funcname)); \
5653 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5655 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5656 DEBUG_PARSE_MSG((funcname)); \
5657 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5660 - reg - regular expression, i.e. main body or parenthesized thing
5662 * Caller must absorb opening parenthesis.
5664 * Combining parenthesis handling with the base level of regular expression
5665 * is a trifle forced, but the need to tie the tails of the branches to what
5666 * follows makes it hard to avoid.
5668 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5670 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5672 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5676 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5677 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5680 register regnode *ret; /* Will be the head of the group. */
5681 register regnode *br;
5682 register regnode *lastbr;
5683 register regnode *ender = NULL;
5684 register I32 parno = 0;
5686 U32 oregflags = RExC_flags;
5687 bool have_branch = 0;
5689 I32 freeze_paren = 0;
5690 I32 after_freeze = 0;
5692 /* for (?g), (?gc), and (?o) warnings; warning
5693 about (?c) will warn about (?g) -- japhy */
5695 #define WASTED_O 0x01
5696 #define WASTED_G 0x02
5697 #define WASTED_C 0x04
5698 #define WASTED_GC (0x02|0x04)
5699 I32 wastedflags = 0x00;
5701 char * parse_start = RExC_parse; /* MJD */
5702 char * const oregcomp_parse = RExC_parse;
5704 GET_RE_DEBUG_FLAGS_DECL;
5706 PERL_ARGS_ASSERT_REG;
5707 DEBUG_PARSE("reg ");
5709 *flagp = 0; /* Tentatively. */
5712 /* Make an OPEN node, if parenthesized. */
5714 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5715 char *start_verb = RExC_parse;
5716 STRLEN verb_len = 0;
5717 char *start_arg = NULL;
5718 unsigned char op = 0;
5720 int internal_argval = 0; /* internal_argval is only useful if !argok */
5721 while ( *RExC_parse && *RExC_parse != ')' ) {
5722 if ( *RExC_parse == ':' ) {
5723 start_arg = RExC_parse + 1;
5729 verb_len = RExC_parse - start_verb;
5732 while ( *RExC_parse && *RExC_parse != ')' )
5734 if ( *RExC_parse != ')' )
5735 vFAIL("Unterminated verb pattern argument");
5736 if ( RExC_parse == start_arg )
5739 if ( *RExC_parse != ')' )
5740 vFAIL("Unterminated verb pattern");
5743 switch ( *start_verb ) {
5744 case 'A': /* (*ACCEPT) */
5745 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5747 internal_argval = RExC_nestroot;
5750 case 'C': /* (*COMMIT) */
5751 if ( memEQs(start_verb,verb_len,"COMMIT") )
5754 case 'F': /* (*FAIL) */
5755 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5760 case ':': /* (*:NAME) */
5761 case 'M': /* (*MARK:NAME) */
5762 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5767 case 'P': /* (*PRUNE) */
5768 if ( memEQs(start_verb,verb_len,"PRUNE") )
5771 case 'S': /* (*SKIP) */
5772 if ( memEQs(start_verb,verb_len,"SKIP") )
5775 case 'T': /* (*THEN) */
5776 /* [19:06] <TimToady> :: is then */
5777 if ( memEQs(start_verb,verb_len,"THEN") ) {
5779 RExC_seen |= REG_SEEN_CUTGROUP;
5785 vFAIL3("Unknown verb pattern '%.*s'",
5786 verb_len, start_verb);
5789 if ( start_arg && internal_argval ) {
5790 vFAIL3("Verb pattern '%.*s' may not have an argument",
5791 verb_len, start_verb);
5792 } else if ( argok < 0 && !start_arg ) {
5793 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5794 verb_len, start_verb);
5796 ret = reganode(pRExC_state, op, internal_argval);
5797 if ( ! internal_argval && ! SIZE_ONLY ) {
5799 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5800 ARG(ret) = add_data( pRExC_state, 1, "S" );
5801 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5808 if (!internal_argval)
5809 RExC_seen |= REG_SEEN_VERBARG;
5810 } else if ( start_arg ) {
5811 vFAIL3("Verb pattern '%.*s' may not have an argument",
5812 verb_len, start_verb);
5814 ret = reg_node(pRExC_state, op);
5816 nextchar(pRExC_state);
5819 if (*RExC_parse == '?') { /* (?...) */
5820 bool is_logical = 0;
5821 const char * const seqstart = RExC_parse;
5822 bool has_use_defaults = FALSE;
5825 paren = *RExC_parse++;
5826 ret = NULL; /* For look-ahead/behind. */
5829 case 'P': /* (?P...) variants for those used to PCRE/Python */
5830 paren = *RExC_parse++;
5831 if ( paren == '<') /* (?P<...>) named capture */
5833 else if (paren == '>') { /* (?P>name) named recursion */
5834 goto named_recursion;
5836 else if (paren == '=') { /* (?P=...) named backref */
5837 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5838 you change this make sure you change that */
5839 char* name_start = RExC_parse;
5841 SV *sv_dat = reg_scan_name(pRExC_state,
5842 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5843 if (RExC_parse == name_start || *RExC_parse != ')')
5844 vFAIL2("Sequence %.3s... not terminated",parse_start);
5847 num = add_data( pRExC_state, 1, "S" );
5848 RExC_rxi->data->data[num]=(void*)sv_dat;
5849 SvREFCNT_inc_simple_void(sv_dat);
5852 ret = reganode(pRExC_state,
5853 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5857 Set_Node_Offset(ret, parse_start+1);
5858 Set_Node_Cur_Length(ret); /* MJD */
5860 nextchar(pRExC_state);
5864 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5866 case '<': /* (?<...) */
5867 if (*RExC_parse == '!')
5869 else if (*RExC_parse != '=')
5875 case '\'': /* (?'...') */
5876 name_start= RExC_parse;
5877 svname = reg_scan_name(pRExC_state,
5878 SIZE_ONLY ? /* reverse test from the others */
5879 REG_RSN_RETURN_NAME :
5880 REG_RSN_RETURN_NULL);
5881 if (RExC_parse == name_start) {
5883 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5886 if (*RExC_parse != paren)
5887 vFAIL2("Sequence (?%c... not terminated",
5888 paren=='>' ? '<' : paren);
5892 if (!svname) /* shouldnt happen */
5894 "panic: reg_scan_name returned NULL");
5895 if (!RExC_paren_names) {
5896 RExC_paren_names= newHV();
5897 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5899 RExC_paren_name_list= newAV();
5900 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5903 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5905 sv_dat = HeVAL(he_str);
5907 /* croak baby croak */
5909 "panic: paren_name hash element allocation failed");
5910 } else if ( SvPOK(sv_dat) ) {
5911 /* (?|...) can mean we have dupes so scan to check
5912 its already been stored. Maybe a flag indicating
5913 we are inside such a construct would be useful,
5914 but the arrays are likely to be quite small, so
5915 for now we punt -- dmq */
5916 IV count = SvIV(sv_dat);
5917 I32 *pv = (I32*)SvPVX(sv_dat);
5919 for ( i = 0 ; i < count ; i++ ) {
5920 if ( pv[i] == RExC_npar ) {
5926 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5927 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5928 pv[count] = RExC_npar;
5929 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5932 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5933 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5935 SvIV_set(sv_dat, 1);
5938 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5939 SvREFCNT_dec(svname);
5942 /*sv_dump(sv_dat);*/
5944 nextchar(pRExC_state);
5946 goto capturing_parens;
5948 RExC_seen |= REG_SEEN_LOOKBEHIND;
5950 case '=': /* (?=...) */
5951 RExC_seen_zerolen++;
5953 case '!': /* (?!...) */
5954 RExC_seen_zerolen++;
5955 if (*RExC_parse == ')') {
5956 ret=reg_node(pRExC_state, OPFAIL);
5957 nextchar(pRExC_state);
5961 case '|': /* (?|...) */
5962 /* branch reset, behave like a (?:...) except that
5963 buffers in alternations share the same numbers */
5965 after_freeze = freeze_paren = RExC_npar;
5967 case ':': /* (?:...) */
5968 case '>': /* (?>...) */
5970 case '$': /* (?$...) */
5971 case '@': /* (?@...) */
5972 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5974 case '#': /* (?#...) */
5975 while (*RExC_parse && *RExC_parse != ')')
5977 if (*RExC_parse != ')')
5978 FAIL("Sequence (?#... not terminated");
5979 nextchar(pRExC_state);
5982 case '0' : /* (?0) */
5983 case 'R' : /* (?R) */
5984 if (*RExC_parse != ')')
5985 FAIL("Sequence (?R) not terminated");
5986 ret = reg_node(pRExC_state, GOSTART);
5987 *flagp |= POSTPONED;
5988 nextchar(pRExC_state);
5991 { /* named and numeric backreferences */
5993 case '&': /* (?&NAME) */
5994 parse_start = RExC_parse - 1;
5997 SV *sv_dat = reg_scan_name(pRExC_state,
5998 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5999 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6001 goto gen_recurse_regop;
6004 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6006 vFAIL("Illegal pattern");
6008 goto parse_recursion;
6010 case '-': /* (?-1) */
6011 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6012 RExC_parse--; /* rewind to let it be handled later */
6016 case '1': case '2': case '3': case '4': /* (?1) */
6017 case '5': case '6': case '7': case '8': case '9':
6020 num = atoi(RExC_parse);
6021 parse_start = RExC_parse - 1; /* MJD */
6022 if (*RExC_parse == '-')
6024 while (isDIGIT(*RExC_parse))
6026 if (*RExC_parse!=')')
6027 vFAIL("Expecting close bracket");
6030 if ( paren == '-' ) {
6032 Diagram of capture buffer numbering.
6033 Top line is the normal capture buffer numbers
6034 Bottom line is the negative indexing as from
6038 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6042 num = RExC_npar + num;
6045 vFAIL("Reference to nonexistent group");
6047 } else if ( paren == '+' ) {
6048 num = RExC_npar + num - 1;
6051 ret = reganode(pRExC_state, GOSUB, num);
6053 if (num > (I32)RExC_rx->nparens) {
6055 vFAIL("Reference to nonexistent group");
6057 ARG2L_SET( ret, RExC_recurse_count++);
6059 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6060 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6064 RExC_seen |= REG_SEEN_RECURSE;
6065 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6066 Set_Node_Offset(ret, parse_start); /* MJD */
6068 *flagp |= POSTPONED;
6069 nextchar(pRExC_state);
6071 } /* named and numeric backreferences */
6074 case '?': /* (??...) */
6076 if (*RExC_parse != '{') {
6078 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6081 *flagp |= POSTPONED;
6082 paren = *RExC_parse++;
6084 case '{': /* (?{...}) */
6089 char *s = RExC_parse;
6091 RExC_seen_zerolen++;
6092 RExC_seen |= REG_SEEN_EVAL;
6093 while (count && (c = *RExC_parse)) {
6104 if (*RExC_parse != ')') {
6106 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6110 OP_4tree *sop, *rop;
6111 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6114 Perl_save_re_context(aTHX);
6115 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6116 sop->op_private |= OPpREFCOUNTED;
6117 /* re_dup will OpREFCNT_inc */
6118 OpREFCNT_set(sop, 1);
6121 n = add_data(pRExC_state, 3, "nop");
6122 RExC_rxi->data->data[n] = (void*)rop;
6123 RExC_rxi->data->data[n+1] = (void*)sop;
6124 RExC_rxi->data->data[n+2] = (void*)pad;
6127 else { /* First pass */
6128 if (PL_reginterp_cnt < ++RExC_seen_evals
6130 /* No compiled RE interpolated, has runtime
6131 components ===> unsafe. */
6132 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6133 if (PL_tainting && PL_tainted)
6134 FAIL("Eval-group in insecure regular expression");
6135 #if PERL_VERSION > 8
6136 if (IN_PERL_COMPILETIME)
6141 nextchar(pRExC_state);
6143 ret = reg_node(pRExC_state, LOGICAL);
6146 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6147 /* deal with the length of this later - MJD */
6150 ret = reganode(pRExC_state, EVAL, n);
6151 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6152 Set_Node_Offset(ret, parse_start);
6155 case '(': /* (?(?{...})...) and (?(?=...)...) */
6158 if (RExC_parse[0] == '?') { /* (?(?...)) */
6159 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6160 || RExC_parse[1] == '<'
6161 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6164 ret = reg_node(pRExC_state, LOGICAL);
6167 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6171 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6172 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6174 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6175 char *name_start= RExC_parse++;
6177 SV *sv_dat=reg_scan_name(pRExC_state,
6178 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6179 if (RExC_parse == name_start || *RExC_parse != ch)
6180 vFAIL2("Sequence (?(%c... not terminated",
6181 (ch == '>' ? '<' : ch));
6184 num = add_data( pRExC_state, 1, "S" );
6185 RExC_rxi->data->data[num]=(void*)sv_dat;
6186 SvREFCNT_inc_simple_void(sv_dat);
6188 ret = reganode(pRExC_state,NGROUPP,num);
6189 goto insert_if_check_paren;
6191 else if (RExC_parse[0] == 'D' &&
6192 RExC_parse[1] == 'E' &&
6193 RExC_parse[2] == 'F' &&
6194 RExC_parse[3] == 'I' &&
6195 RExC_parse[4] == 'N' &&
6196 RExC_parse[5] == 'E')
6198 ret = reganode(pRExC_state,DEFINEP,0);
6201 goto insert_if_check_paren;
6203 else if (RExC_parse[0] == 'R') {
6206 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6207 parno = atoi(RExC_parse++);
6208 while (isDIGIT(*RExC_parse))
6210 } else if (RExC_parse[0] == '&') {
6213 sv_dat = reg_scan_name(pRExC_state,
6214 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6215 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6217 ret = reganode(pRExC_state,INSUBP,parno);
6218 goto insert_if_check_paren;
6220 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6223 parno = atoi(RExC_parse++);
6225 while (isDIGIT(*RExC_parse))
6227 ret = reganode(pRExC_state, GROUPP, parno);
6229 insert_if_check_paren:
6230 if ((c = *nextchar(pRExC_state)) != ')')
6231 vFAIL("Switch condition not recognized");
6233 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6234 br = regbranch(pRExC_state, &flags, 1,depth+1);
6236 br = reganode(pRExC_state, LONGJMP, 0);
6238 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6239 c = *nextchar(pRExC_state);
6244 vFAIL("(?(DEFINE)....) does not allow branches");
6245 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6246 regbranch(pRExC_state, &flags, 1,depth+1);
6247 REGTAIL(pRExC_state, ret, lastbr);
6250 c = *nextchar(pRExC_state);
6255 vFAIL("Switch (?(condition)... contains too many branches");
6256 ender = reg_node(pRExC_state, TAIL);
6257 REGTAIL(pRExC_state, br, ender);
6259 REGTAIL(pRExC_state, lastbr, ender);
6260 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6263 REGTAIL(pRExC_state, ret, ender);
6264 RExC_size++; /* XXX WHY do we need this?!!
6265 For large programs it seems to be required
6266 but I can't figure out why. -- dmq*/
6270 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6274 RExC_parse--; /* for vFAIL to print correctly */
6275 vFAIL("Sequence (? incomplete");
6277 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6279 has_use_defaults = TRUE;
6280 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6281 if (RExC_utf8) { /* But the default for a utf8 pattern is
6282 unicode semantics */
6283 RExC_flags |= RXf_PMf_UNICODE;
6288 parse_flags: /* (?i) */
6290 U32 posflags = 0, negflags = 0;
6291 U32 *flagsp = &posflags;
6292 bool has_charset_modifier = 0;
6294 while (*RExC_parse) {
6295 /* && strchr("iogcmsx", *RExC_parse) */
6296 /* (?g), (?gc) and (?o) are useless here
6297 and must be globally applied -- japhy */
6298 switch (*RExC_parse) {
6299 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6300 case LOCALE_PAT_MOD:
6301 if (has_charset_modifier || flagsp == &negflags) {
6302 goto fail_modifiers;
6304 posflags |= RXf_PMf_LOCALE;
6305 negflags |= RXf_PMf_UNICODE;
6306 has_charset_modifier = 1;
6308 case UNICODE_PAT_MOD:
6309 if (has_charset_modifier || flagsp == &negflags) {
6310 goto fail_modifiers;
6312 posflags |= RXf_PMf_UNICODE;
6313 negflags |= RXf_PMf_LOCALE;
6314 has_charset_modifier = 1;
6317 if (has_use_defaults
6318 || has_charset_modifier
6319 || flagsp == &negflags)
6321 goto fail_modifiers;
6324 /* The dual charset means unicode semantics if the
6325 * pattern (or target, not known until runtime) are
6328 posflags |= RXf_PMf_UNICODE;
6329 negflags |= RXf_PMf_LOCALE;
6332 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6334 has_charset_modifier = 1;
6336 case ONCE_PAT_MOD: /* 'o' */
6337 case GLOBAL_PAT_MOD: /* 'g' */
6338 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6339 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6340 if (! (wastedflags & wflagbit) ) {
6341 wastedflags |= wflagbit;
6344 "Useless (%s%c) - %suse /%c modifier",
6345 flagsp == &negflags ? "?-" : "?",
6347 flagsp == &negflags ? "don't " : "",
6354 case CONTINUE_PAT_MOD: /* 'c' */
6355 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6356 if (! (wastedflags & WASTED_C) ) {
6357 wastedflags |= WASTED_GC;
6360 "Useless (%sc) - %suse /gc modifier",
6361 flagsp == &negflags ? "?-" : "?",
6362 flagsp == &negflags ? "don't " : ""
6367 case KEEPCOPY_PAT_MOD: /* 'p' */
6368 if (flagsp == &negflags) {
6370 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6372 *flagsp |= RXf_PMf_KEEPCOPY;
6376 /* A flag is a default iff it is following a minus, so
6377 * if there is a minus, it means will be trying to
6378 * re-specify a default which is an error */
6379 if (has_use_defaults || flagsp == &negflags) {
6382 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6386 wastedflags = 0; /* reset so (?g-c) warns twice */
6392 RExC_flags |= posflags;
6393 RExC_flags &= ~negflags;
6395 oregflags |= posflags;
6396 oregflags &= ~negflags;
6398 nextchar(pRExC_state);
6409 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6414 }} /* one for the default block, one for the switch */
6421 ret = reganode(pRExC_state, OPEN, parno);
6424 RExC_nestroot = parno;
6425 if (RExC_seen & REG_SEEN_RECURSE
6426 && !RExC_open_parens[parno-1])
6428 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6429 "Setting open paren #%"IVdf" to %d\n",
6430 (IV)parno, REG_NODE_NUM(ret)));
6431 RExC_open_parens[parno-1]= ret;
6434 Set_Node_Length(ret, 1); /* MJD */
6435 Set_Node_Offset(ret, RExC_parse); /* MJD */
6443 /* Pick up the branches, linking them together. */
6444 parse_start = RExC_parse; /* MJD */
6445 br = regbranch(pRExC_state, &flags, 1,depth+1);
6448 if (RExC_npar > after_freeze)
6449 after_freeze = RExC_npar;
6450 RExC_npar = freeze_paren;
6453 /* branch_len = (paren != 0); */
6457 if (*RExC_parse == '|') {
6458 if (!SIZE_ONLY && RExC_extralen) {
6459 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6462 reginsert(pRExC_state, BRANCH, br, depth+1);
6463 Set_Node_Length(br, paren != 0);
6464 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6468 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6470 else if (paren == ':') {
6471 *flagp |= flags&SIMPLE;
6473 if (is_open) { /* Starts with OPEN. */
6474 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6476 else if (paren != '?') /* Not Conditional */
6478 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6480 while (*RExC_parse == '|') {
6481 if (!SIZE_ONLY && RExC_extralen) {
6482 ender = reganode(pRExC_state, LONGJMP,0);
6483 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6486 RExC_extralen += 2; /* Account for LONGJMP. */
6487 nextchar(pRExC_state);
6489 if (RExC_npar > after_freeze)
6490 after_freeze = RExC_npar;
6491 RExC_npar = freeze_paren;
6493 br = regbranch(pRExC_state, &flags, 0, depth+1);
6497 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6499 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6502 if (have_branch || paren != ':') {
6503 /* Make a closing node, and hook it on the end. */
6506 ender = reg_node(pRExC_state, TAIL);
6509 ender = reganode(pRExC_state, CLOSE, parno);
6510 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6511 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6512 "Setting close paren #%"IVdf" to %d\n",
6513 (IV)parno, REG_NODE_NUM(ender)));
6514 RExC_close_parens[parno-1]= ender;
6515 if (RExC_nestroot == parno)
6518 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6519 Set_Node_Length(ender,1); /* MJD */
6525 *flagp &= ~HASWIDTH;
6528 ender = reg_node(pRExC_state, SUCCEED);
6531 ender = reg_node(pRExC_state, END);
6533 assert(!RExC_opend); /* there can only be one! */
6538 REGTAIL(pRExC_state, lastbr, ender);
6540 if (have_branch && !SIZE_ONLY) {
6542 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6544 /* Hook the tails of the branches to the closing node. */
6545 for (br = ret; br; br = regnext(br)) {
6546 const U8 op = PL_regkind[OP(br)];
6548 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6550 else if (op == BRANCHJ) {
6551 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6559 static const char parens[] = "=!<,>";
6561 if (paren && (p = strchr(parens, paren))) {
6562 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6563 int flag = (p - parens) > 1;
6566 node = SUSPEND, flag = 0;
6567 reginsert(pRExC_state, node,ret, depth+1);
6568 Set_Node_Cur_Length(ret);
6569 Set_Node_Offset(ret, parse_start + 1);
6571 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6575 /* Check for proper termination. */
6577 RExC_flags = oregflags;
6578 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6579 RExC_parse = oregcomp_parse;
6580 vFAIL("Unmatched (");
6583 else if (!paren && RExC_parse < RExC_end) {
6584 if (*RExC_parse == ')') {
6586 vFAIL("Unmatched )");
6589 FAIL("Junk on end of regexp"); /* "Can't happen". */
6593 RExC_npar = after_freeze;
6598 - regbranch - one alternative of an | operator
6600 * Implements the concatenation operator.
6603 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6606 register regnode *ret;
6607 register regnode *chain = NULL;
6608 register regnode *latest;
6609 I32 flags = 0, c = 0;
6610 GET_RE_DEBUG_FLAGS_DECL;
6612 PERL_ARGS_ASSERT_REGBRANCH;
6614 DEBUG_PARSE("brnc");
6619 if (!SIZE_ONLY && RExC_extralen)
6620 ret = reganode(pRExC_state, BRANCHJ,0);
6622 ret = reg_node(pRExC_state, BRANCH);
6623 Set_Node_Length(ret, 1);
6627 if (!first && SIZE_ONLY)
6628 RExC_extralen += 1; /* BRANCHJ */
6630 *flagp = WORST; /* Tentatively. */
6633 nextchar(pRExC_state);
6634 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6636 latest = regpiece(pRExC_state, &flags,depth+1);
6637 if (latest == NULL) {
6638 if (flags & TRYAGAIN)
6642 else if (ret == NULL)
6644 *flagp |= flags&(HASWIDTH|POSTPONED);
6645 if (chain == NULL) /* First piece. */
6646 *flagp |= flags&SPSTART;
6649 REGTAIL(pRExC_state, chain, latest);
6654 if (chain == NULL) { /* Loop ran zero times. */
6655 chain = reg_node(pRExC_state, NOTHING);
6660 *flagp |= flags&SIMPLE;
6667 - regpiece - something followed by possible [*+?]
6669 * Note that the branching code sequences used for ? and the general cases
6670 * of * and + are somewhat optimized: they use the same NOTHING node as
6671 * both the endmarker for their branch list and the body of the last branch.
6672 * It might seem that this node could be dispensed with entirely, but the
6673 * endmarker role is not redundant.
6676 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6679 register regnode *ret;
6681 register char *next;
6683 const char * const origparse = RExC_parse;
6685 I32 max = REG_INFTY;
6687 const char *maxpos = NULL;
6688 GET_RE_DEBUG_FLAGS_DECL;
6690 PERL_ARGS_ASSERT_REGPIECE;
6692 DEBUG_PARSE("piec");
6694 ret = regatom(pRExC_state, &flags,depth+1);
6696 if (flags & TRYAGAIN)
6703 if (op == '{' && regcurly(RExC_parse)) {
6705 parse_start = RExC_parse; /* MJD */
6706 next = RExC_parse + 1;
6707 while (isDIGIT(*next) || *next == ',') {
6716 if (*next == '}') { /* got one */
6720 min = atoi(RExC_parse);
6724 maxpos = RExC_parse;
6726 if (!max && *maxpos != '0')
6727 max = REG_INFTY; /* meaning "infinity" */
6728 else if (max >= REG_INFTY)
6729 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6731 nextchar(pRExC_state);
6734 if ((flags&SIMPLE)) {
6735 RExC_naughty += 2 + RExC_naughty / 2;
6736 reginsert(pRExC_state, CURLY, ret, depth+1);
6737 Set_Node_Offset(ret, parse_start+1); /* MJD */
6738 Set_Node_Cur_Length(ret);
6741 regnode * const w = reg_node(pRExC_state, WHILEM);
6744 REGTAIL(pRExC_state, ret, w);
6745 if (!SIZE_ONLY && RExC_extralen) {
6746 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6747 reginsert(pRExC_state, NOTHING,ret, depth+1);
6748 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6750 reginsert(pRExC_state, CURLYX,ret, depth+1);
6752 Set_Node_Offset(ret, parse_start+1);
6753 Set_Node_Length(ret,
6754 op == '{' ? (RExC_parse - parse_start) : 1);
6756 if (!SIZE_ONLY && RExC_extralen)
6757 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6758 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6760 RExC_whilem_seen++, RExC_extralen += 3;
6761 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6770 vFAIL("Can't do {n,m} with n > m");
6772 ARG1_SET(ret, (U16)min);
6773 ARG2_SET(ret, (U16)max);
6785 #if 0 /* Now runtime fix should be reliable. */
6787 /* if this is reinstated, don't forget to put this back into perldiag:
6789 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6791 (F) The part of the regexp subject to either the * or + quantifier
6792 could match an empty string. The {#} shows in the regular
6793 expression about where the problem was discovered.
6797 if (!(flags&HASWIDTH) && op != '?')
6798 vFAIL("Regexp *+ operand could be empty");
6801 parse_start = RExC_parse;
6802 nextchar(pRExC_state);
6804 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6806 if (op == '*' && (flags&SIMPLE)) {
6807 reginsert(pRExC_state, STAR, ret, depth+1);
6811 else if (op == '*') {
6815 else if (op == '+' && (flags&SIMPLE)) {
6816 reginsert(pRExC_state, PLUS, ret, depth+1);
6820 else if (op == '+') {
6824 else if (op == '?') {
6829 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6830 ckWARN3reg(RExC_parse,
6831 "%.*s matches null string many times",
6832 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6836 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6837 nextchar(pRExC_state);
6838 reginsert(pRExC_state, MINMOD, ret, depth+1);
6839 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6841 #ifndef REG_ALLOW_MINMOD_SUSPEND
6844 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6846 nextchar(pRExC_state);
6847 ender = reg_node(pRExC_state, SUCCEED);
6848 REGTAIL(pRExC_state, ret, ender);
6849 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6851 ender = reg_node(pRExC_state, TAIL);
6852 REGTAIL(pRExC_state, ret, ender);
6856 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6858 vFAIL("Nested quantifiers");
6865 /* reg_namedseq(pRExC_state,UVp)
6867 This is expected to be called by a parser routine that has
6868 recognized '\N' and needs to handle the rest. RExC_parse is
6869 expected to point at the first char following the N at the time
6872 The \N may be inside (indicated by valuep not being NULL) or outside a
6875 \N may begin either a named sequence, or if outside a character class, mean
6876 to match a non-newline. For non single-quoted regexes, the tokenizer has
6877 attempted to decide which, and in the case of a named sequence converted it
6878 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6879 where c1... are the characters in the sequence. For single-quoted regexes,
6880 the tokenizer passes the \N sequence through unchanged; this code will not
6881 attempt to determine this nor expand those. The net effect is that if the
6882 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6883 signals that this \N occurrence means to match a non-newline.
6885 Only the \N{U+...} form should occur in a character class, for the same
6886 reason that '.' inside a character class means to just match a period: it
6887 just doesn't make sense.
6889 If valuep is non-null then it is assumed that we are parsing inside
6890 of a charclass definition and the first codepoint in the resolved
6891 string is returned via *valuep and the routine will return NULL.
6892 In this mode if a multichar string is returned from the charnames
6893 handler, a warning will be issued, and only the first char in the
6894 sequence will be examined. If the string returned is zero length
6895 then the value of *valuep is undefined and NON-NULL will
6896 be returned to indicate failure. (This will NOT be a valid pointer
6899 If valuep is null then it is assumed that we are parsing normal text and a
6900 new EXACT node is inserted into the program containing the resolved string,
6901 and a pointer to the new node is returned. But if the string is zero length
6902 a NOTHING node is emitted instead.
6904 On success RExC_parse is set to the char following the endbrace.
6905 Parsing failures will generate a fatal error via vFAIL(...)
6908 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6910 char * endbrace; /* '}' following the name */
6911 regnode *ret = NULL;
6913 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6917 GET_RE_DEBUG_FLAGS_DECL;
6919 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6923 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6924 * modifier. The other meaning does not */
6925 p = (RExC_flags & RXf_PMf_EXTENDED)
6926 ? regwhite( pRExC_state, RExC_parse )
6929 /* Disambiguate between \N meaning a named character versus \N meaning
6930 * [^\n]. The former is assumed when it can't be the latter. */
6931 if (*p != '{' || regcurly(p)) {
6934 /* no bare \N in a charclass */
6935 vFAIL("\\N in a character class must be a named character: \\N{...}");
6937 nextchar(pRExC_state);
6938 ret = reg_node(pRExC_state, REG_ANY);
6939 *flagp |= HASWIDTH|SIMPLE;
6942 Set_Node_Length(ret, 1); /* MJD */
6946 /* Here, we have decided it should be a named sequence */
6948 /* The test above made sure that the next real character is a '{', but
6949 * under the /x modifier, it could be separated by space (or a comment and
6950 * \n) and this is not allowed (for consistency with \x{...} and the
6951 * tokenizer handling of \N{NAME}). */
6952 if (*RExC_parse != '{') {
6953 vFAIL("Missing braces on \\N{}");
6956 RExC_parse++; /* Skip past the '{' */
6958 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6959 || ! (endbrace == RExC_parse /* nothing between the {} */
6960 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6961 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6963 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6964 vFAIL("\\N{NAME} must be resolved by the lexer");
6967 if (endbrace == RExC_parse) { /* empty: \N{} */
6969 RExC_parse = endbrace + 1;
6970 return reg_node(pRExC_state,NOTHING);
6974 ckWARNreg(RExC_parse,
6975 "Ignoring zero length \\N{} in character class"
6977 RExC_parse = endbrace + 1;
6980 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6983 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
6984 RExC_parse += 2; /* Skip past the 'U+' */
6986 if (valuep) { /* In a bracketed char class */
6987 /* We only pay attention to the first char of
6988 multichar strings being returned. I kinda wonder
6989 if this makes sense as it does change the behaviour
6990 from earlier versions, OTOH that behaviour was broken
6991 as well. XXX Solution is to recharacterize as
6992 [rest-of-class]|multi1|multi2... */
6994 STRLEN length_of_hex;
6995 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6996 | PERL_SCAN_DISALLOW_PREFIX
6997 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6999 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7000 if (endchar < endbrace) {
7001 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7004 length_of_hex = (STRLEN)(endchar - RExC_parse);
7005 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7007 /* The tokenizer should have guaranteed validity, but it's possible to
7008 * bypass it by using single quoting, so check */
7009 if (length_of_hex == 0
7010 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7012 RExC_parse += length_of_hex; /* Includes all the valid */
7013 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7014 ? UTF8SKIP(RExC_parse)
7016 /* Guard against malformed utf8 */
7017 if (RExC_parse >= endchar) RExC_parse = endchar;
7018 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7021 RExC_parse = endbrace + 1;
7022 if (endchar == endbrace) return NULL;
7024 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7026 else { /* Not a char class */
7027 char *s; /* String to put in generated EXACT node */
7028 STRLEN len = 0; /* Its current byte length */
7029 char *endchar; /* Points to '.' or '}' ending cur char in the input
7032 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7040 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7041 * the input which is of the form now 'c1.c2.c3...}' until find the
7042 * ending brace or exceed length 255. The characters that exceed this
7043 * limit are dropped. The limit could be relaxed should it become
7044 * desirable by reparsing this as (?:\N{NAME}), so could generate
7045 * multiple EXACT nodes, as is done for just regular input. But this
7046 * is primarily a named character, and not intended to be a huge long
7047 * string, so 255 bytes should be good enough */
7049 STRLEN length_of_hex;
7050 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7051 | PERL_SCAN_DISALLOW_PREFIX
7052 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7053 UV cp; /* Ord of current character */
7055 /* Code points are separated by dots. If none, there is only one
7056 * code point, and is terminated by the brace */
7057 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7059 /* The values are Unicode even on EBCDIC machines */
7060 length_of_hex = (STRLEN)(endchar - RExC_parse);
7061 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7062 if ( length_of_hex == 0
7063 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7065 RExC_parse += length_of_hex; /* Includes all the valid */
7066 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7067 ? UTF8SKIP(RExC_parse)
7069 /* Guard against malformed utf8 */
7070 if (RExC_parse >= endchar) RExC_parse = endchar;
7071 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7074 if (! FOLD) { /* Not folding, just append to the string */
7077 /* Quit before adding this character if would exceed limit */
7078 if (len + UNISKIP(cp) > U8_MAX) break;
7080 unilen = reguni(pRExC_state, cp, s);
7085 } else { /* Folding, output the folded equivalent */
7086 STRLEN foldlen,numlen;
7087 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7088 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7090 /* Quit before exceeding size limit */
7091 if (len + foldlen > U8_MAX) break;
7093 for (foldbuf = tmpbuf;
7097 cp = utf8_to_uvchr(foldbuf, &numlen);
7099 const STRLEN unilen = reguni(pRExC_state, cp, s);
7102 /* In EBCDIC the numlen and unilen can differ. */
7104 if (numlen >= foldlen)
7108 break; /* "Can't happen." */
7112 /* Point to the beginning of the next character in the sequence. */
7113 RExC_parse = endchar + 1;
7115 /* Quit if no more characters */
7116 if (RExC_parse >= endbrace) break;
7121 if (RExC_parse < endbrace) {
7122 ckWARNreg(RExC_parse - 1,
7123 "Using just the first characters returned by \\N{}");
7126 RExC_size += STR_SZ(len);
7129 RExC_emit += STR_SZ(len);
7132 RExC_parse = endbrace + 1;
7134 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7135 with malformed in t/re/pat_advanced.t */
7137 Set_Node_Cur_Length(ret); /* MJD */
7138 nextchar(pRExC_state);
7148 * It returns the code point in utf8 for the value in *encp.
7149 * value: a code value in the source encoding
7150 * encp: a pointer to an Encode object
7152 * If the result from Encode is not a single character,
7153 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7156 S_reg_recode(pTHX_ const char value, SV **encp)
7159 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7160 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7161 const STRLEN newlen = SvCUR(sv);
7162 UV uv = UNICODE_REPLACEMENT;
7164 PERL_ARGS_ASSERT_REG_RECODE;
7168 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7171 if (!newlen || numlen != newlen) {
7172 uv = UNICODE_REPLACEMENT;
7180 - regatom - the lowest level
7182 Try to identify anything special at the start of the pattern. If there
7183 is, then handle it as required. This may involve generating a single regop,
7184 such as for an assertion; or it may involve recursing, such as to
7185 handle a () structure.
7187 If the string doesn't start with something special then we gobble up
7188 as much literal text as we can.
7190 Once we have been able to handle whatever type of thing started the
7191 sequence, we return.
7193 Note: we have to be careful with escapes, as they can be both literal
7194 and special, and in the case of \10 and friends can either, depending
7195 on context. Specifically there are two seperate switches for handling
7196 escape sequences, with the one for handling literal escapes requiring
7197 a dummy entry for all of the special escapes that are actually handled
7202 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7205 register regnode *ret = NULL;
7207 char *parse_start = RExC_parse;
7208 GET_RE_DEBUG_FLAGS_DECL;
7209 DEBUG_PARSE("atom");
7210 *flagp = WORST; /* Tentatively. */
7212 PERL_ARGS_ASSERT_REGATOM;
7215 switch ((U8)*RExC_parse) {
7217 RExC_seen_zerolen++;
7218 nextchar(pRExC_state);
7219 if (RExC_flags & RXf_PMf_MULTILINE)
7220 ret = reg_node(pRExC_state, MBOL);
7221 else if (RExC_flags & RXf_PMf_SINGLELINE)
7222 ret = reg_node(pRExC_state, SBOL);
7224 ret = reg_node(pRExC_state, BOL);
7225 Set_Node_Length(ret, 1); /* MJD */
7228 nextchar(pRExC_state);
7230 RExC_seen_zerolen++;
7231 if (RExC_flags & RXf_PMf_MULTILINE)
7232 ret = reg_node(pRExC_state, MEOL);
7233 else if (RExC_flags & RXf_PMf_SINGLELINE)
7234 ret = reg_node(pRExC_state, SEOL);
7236 ret = reg_node(pRExC_state, EOL);
7237 Set_Node_Length(ret, 1); /* MJD */
7240 nextchar(pRExC_state);
7241 if (RExC_flags & RXf_PMf_SINGLELINE)
7242 ret = reg_node(pRExC_state, SANY);
7244 ret = reg_node(pRExC_state, REG_ANY);
7245 *flagp |= HASWIDTH|SIMPLE;
7247 Set_Node_Length(ret, 1); /* MJD */
7251 char * const oregcomp_parse = ++RExC_parse;
7252 ret = regclass(pRExC_state,depth+1);
7253 if (*RExC_parse != ']') {
7254 RExC_parse = oregcomp_parse;
7255 vFAIL("Unmatched [");
7257 nextchar(pRExC_state);
7258 *flagp |= HASWIDTH|SIMPLE;
7259 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7263 nextchar(pRExC_state);
7264 ret = reg(pRExC_state, 1, &flags,depth+1);
7266 if (flags & TRYAGAIN) {
7267 if (RExC_parse == RExC_end) {
7268 /* Make parent create an empty node if needed. */
7276 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7280 if (flags & TRYAGAIN) {
7284 vFAIL("Internal urp");
7285 /* Supposed to be caught earlier. */
7288 if (!regcurly(RExC_parse)) {
7297 vFAIL("Quantifier follows nothing");
7305 len=0; /* silence a spurious compiler warning */
7306 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7307 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7308 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7309 ret = reganode(pRExC_state, FOLDCHAR, cp);
7310 Set_Node_Length(ret, 1); /* MJD */
7311 nextchar(pRExC_state); /* kill whitespace under /x */
7319 This switch handles escape sequences that resolve to some kind
7320 of special regop and not to literal text. Escape sequnces that
7321 resolve to literal text are handled below in the switch marked
7324 Every entry in this switch *must* have a corresponding entry
7325 in the literal escape switch. However, the opposite is not
7326 required, as the default for this switch is to jump to the
7327 literal text handling code.
7329 switch ((U8)*++RExC_parse) {
7334 /* Special Escapes */
7336 RExC_seen_zerolen++;
7337 ret = reg_node(pRExC_state, SBOL);
7339 goto finish_meta_pat;
7341 ret = reg_node(pRExC_state, GPOS);
7342 RExC_seen |= REG_SEEN_GPOS;
7344 goto finish_meta_pat;
7346 RExC_seen_zerolen++;
7347 ret = reg_node(pRExC_state, KEEPS);
7349 /* XXX:dmq : disabling in-place substitution seems to
7350 * be necessary here to avoid cases of memory corruption, as
7351 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7353 RExC_seen |= REG_SEEN_LOOKBEHIND;
7354 goto finish_meta_pat;
7356 ret = reg_node(pRExC_state, SEOL);
7358 RExC_seen_zerolen++; /* Do not optimize RE away */
7359 goto finish_meta_pat;
7361 ret = reg_node(pRExC_state, EOS);
7363 RExC_seen_zerolen++; /* Do not optimize RE away */
7364 goto finish_meta_pat;
7366 ret = reg_node(pRExC_state, CANY);
7367 RExC_seen |= REG_SEEN_CANY;
7368 *flagp |= HASWIDTH|SIMPLE;
7369 goto finish_meta_pat;
7371 ret = reg_node(pRExC_state, CLUMP);
7373 goto finish_meta_pat;
7376 ret = reg_node(pRExC_state, (U8)(ALNUML));
7378 ret = reg_node(pRExC_state, (U8)(ALNUM));
7379 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7381 *flagp |= HASWIDTH|SIMPLE;
7382 goto finish_meta_pat;
7385 ret = reg_node(pRExC_state, (U8)(NALNUML));
7387 ret = reg_node(pRExC_state, (U8)(NALNUM));
7388 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7390 *flagp |= HASWIDTH|SIMPLE;
7391 goto finish_meta_pat;
7393 RExC_seen_zerolen++;
7394 RExC_seen |= REG_SEEN_LOOKBEHIND;
7396 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7398 ret = reg_node(pRExC_state, (U8)(BOUND));
7399 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7402 goto finish_meta_pat;
7404 RExC_seen_zerolen++;
7405 RExC_seen |= REG_SEEN_LOOKBEHIND;
7407 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7409 ret = reg_node(pRExC_state, (U8)(NBOUND));
7410 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7413 goto finish_meta_pat;
7416 ret = reg_node(pRExC_state, (U8)(SPACEL));
7418 ret = reg_node(pRExC_state, (U8)(SPACE));
7419 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7421 *flagp |= HASWIDTH|SIMPLE;
7422 goto finish_meta_pat;
7425 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7427 ret = reg_node(pRExC_state, (U8)(NSPACE));
7428 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7430 *flagp |= HASWIDTH|SIMPLE;
7431 goto finish_meta_pat;
7433 ret = reg_node(pRExC_state, DIGIT);
7434 *flagp |= HASWIDTH|SIMPLE;
7435 goto finish_meta_pat;
7437 ret = reg_node(pRExC_state, NDIGIT);
7438 *flagp |= HASWIDTH|SIMPLE;
7439 goto finish_meta_pat;
7441 ret = reg_node(pRExC_state, LNBREAK);
7442 *flagp |= HASWIDTH|SIMPLE;
7443 goto finish_meta_pat;
7445 ret = reg_node(pRExC_state, HORIZWS);
7446 *flagp |= HASWIDTH|SIMPLE;
7447 goto finish_meta_pat;
7449 ret = reg_node(pRExC_state, NHORIZWS);
7450 *flagp |= HASWIDTH|SIMPLE;
7451 goto finish_meta_pat;
7453 ret = reg_node(pRExC_state, VERTWS);
7454 *flagp |= HASWIDTH|SIMPLE;
7455 goto finish_meta_pat;
7457 ret = reg_node(pRExC_state, NVERTWS);
7458 *flagp |= HASWIDTH|SIMPLE;
7460 nextchar(pRExC_state);
7461 Set_Node_Length(ret, 2); /* MJD */
7466 char* const oldregxend = RExC_end;
7468 char* parse_start = RExC_parse - 2;
7471 if (RExC_parse[1] == '{') {
7472 /* a lovely hack--pretend we saw [\pX] instead */
7473 RExC_end = strchr(RExC_parse, '}');
7475 const U8 c = (U8)*RExC_parse;
7477 RExC_end = oldregxend;
7478 vFAIL2("Missing right brace on \\%c{}", c);
7483 RExC_end = RExC_parse + 2;
7484 if (RExC_end > oldregxend)
7485 RExC_end = oldregxend;
7489 ret = regclass(pRExC_state,depth+1);
7491 RExC_end = oldregxend;
7494 Set_Node_Offset(ret, parse_start + 2);
7495 Set_Node_Cur_Length(ret);
7496 nextchar(pRExC_state);
7497 *flagp |= HASWIDTH|SIMPLE;
7501 /* Handle \N and \N{NAME} here and not below because it can be
7502 multicharacter. join_exact() will join them up later on.
7503 Also this makes sure that things like /\N{BLAH}+/ and
7504 \N{BLAH} being multi char Just Happen. dmq*/
7506 ret= reg_namedseq(pRExC_state, NULL, flagp);
7508 case 'k': /* Handle \k<NAME> and \k'NAME' */
7511 char ch= RExC_parse[1];
7512 if (ch != '<' && ch != '\'' && ch != '{') {
7514 vFAIL2("Sequence %.2s... not terminated",parse_start);
7516 /* this pretty much dupes the code for (?P=...) in reg(), if
7517 you change this make sure you change that */
7518 char* name_start = (RExC_parse += 2);
7520 SV *sv_dat = reg_scan_name(pRExC_state,
7521 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7522 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7523 if (RExC_parse == name_start || *RExC_parse != ch)
7524 vFAIL2("Sequence %.3s... not terminated",parse_start);
7527 num = add_data( pRExC_state, 1, "S" );
7528 RExC_rxi->data->data[num]=(void*)sv_dat;
7529 SvREFCNT_inc_simple_void(sv_dat);
7533 ret = reganode(pRExC_state,
7534 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
7538 /* override incorrect value set in reganode MJD */
7539 Set_Node_Offset(ret, parse_start+1);
7540 Set_Node_Cur_Length(ret); /* MJD */
7541 nextchar(pRExC_state);
7547 case '1': case '2': case '3': case '4':
7548 case '5': case '6': case '7': case '8': case '9':
7551 bool isg = *RExC_parse == 'g';
7556 if (*RExC_parse == '{') {
7560 if (*RExC_parse == '-') {
7564 if (hasbrace && !isDIGIT(*RExC_parse)) {
7565 if (isrel) RExC_parse--;
7567 goto parse_named_seq;
7569 num = atoi(RExC_parse);
7570 if (isg && num == 0)
7571 vFAIL("Reference to invalid group 0");
7573 num = RExC_npar - num;
7575 vFAIL("Reference to nonexistent or unclosed group");
7577 if (!isg && num > 9 && num >= RExC_npar)
7580 char * const parse_start = RExC_parse - 1; /* MJD */
7581 while (isDIGIT(*RExC_parse))
7583 if (parse_start == RExC_parse - 1)
7584 vFAIL("Unterminated \\g... pattern");
7586 if (*RExC_parse != '}')
7587 vFAIL("Unterminated \\g{...} pattern");
7591 if (num > (I32)RExC_rx->nparens)
7592 vFAIL("Reference to nonexistent group");
7595 ret = reganode(pRExC_state,
7596 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
7600 /* override incorrect value set in reganode MJD */
7601 Set_Node_Offset(ret, parse_start+1);
7602 Set_Node_Cur_Length(ret); /* MJD */
7604 nextchar(pRExC_state);
7609 if (RExC_parse >= RExC_end)
7610 FAIL("Trailing \\");
7613 /* Do not generate "unrecognized" warnings here, we fall
7614 back into the quick-grab loop below */
7621 if (RExC_flags & RXf_PMf_EXTENDED) {
7622 if ( reg_skipcomment( pRExC_state ) )
7629 register STRLEN len;
7634 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7636 parse_start = RExC_parse - 1;
7642 ret = reg_node(pRExC_state,
7643 (U8) ((! FOLD) ? EXACT
7651 for (len = 0, p = RExC_parse - 1;
7652 len < 127 && p < RExC_end;
7655 char * const oldp = p;
7657 if (RExC_flags & RXf_PMf_EXTENDED)
7658 p = regwhite( pRExC_state, p );
7663 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7664 goto normal_default;
7674 /* Literal Escapes Switch
7676 This switch is meant to handle escape sequences that
7677 resolve to a literal character.
7679 Every escape sequence that represents something
7680 else, like an assertion or a char class, is handled
7681 in the switch marked 'Special Escapes' above in this
7682 routine, but also has an entry here as anything that
7683 isn't explicitly mentioned here will be treated as
7684 an unescaped equivalent literal.
7688 /* These are all the special escapes. */
7692 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7693 goto normal_default;
7694 case 'A': /* Start assertion */
7695 case 'b': case 'B': /* Word-boundary assertion*/
7696 case 'C': /* Single char !DANGEROUS! */
7697 case 'd': case 'D': /* digit class */
7698 case 'g': case 'G': /* generic-backref, pos assertion */
7699 case 'h': case 'H': /* HORIZWS */
7700 case 'k': case 'K': /* named backref, keep marker */
7701 case 'N': /* named char sequence */
7702 case 'p': case 'P': /* Unicode property */
7703 case 'R': /* LNBREAK */
7704 case 's': case 'S': /* space class */
7705 case 'v': case 'V': /* VERTWS */
7706 case 'w': case 'W': /* word class */
7707 case 'X': /* eXtended Unicode "combining character sequence" */
7708 case 'z': case 'Z': /* End of line/string assertion */
7712 /* Anything after here is an escape that resolves to a
7713 literal. (Except digits, which may or may not)
7732 ender = ASCII_TO_NATIVE('\033');
7736 ender = ASCII_TO_NATIVE('\007');
7741 STRLEN brace_len = len;
7743 const char* error_msg;
7745 bool valid = grok_bslash_o(p,
7752 RExC_parse = p; /* going to die anyway; point
7753 to exact spot of failure */
7760 if (PL_encoding && ender < 0x100) {
7761 goto recode_encoding;
7770 char* const e = strchr(p, '}');
7774 vFAIL("Missing right brace on \\x{}");
7777 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7778 | PERL_SCAN_DISALLOW_PREFIX;
7779 STRLEN numlen = e - p - 1;
7780 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7787 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7789 ender = grok_hex(p, &numlen, &flags, NULL);
7792 if (PL_encoding && ender < 0x100)
7793 goto recode_encoding;
7797 ender = grok_bslash_c(*p++, SIZE_ONLY);
7799 case '0': case '1': case '2': case '3':case '4':
7800 case '5': case '6': case '7': case '8':case '9':
7802 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7804 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7806 ender = grok_oct(p, &numlen, &flags, NULL);
7816 if (PL_encoding && ender < 0x100)
7817 goto recode_encoding;
7821 SV* enc = PL_encoding;
7822 ender = reg_recode((const char)(U8)ender, &enc);
7823 if (!enc && SIZE_ONLY)
7824 ckWARNreg(p, "Invalid escape in the specified encoding");
7830 FAIL("Trailing \\");
7833 if (!SIZE_ONLY&& isALPHA(*p))
7834 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7835 goto normal_default;
7840 if (UTF8_IS_START(*p) && UTF) {
7842 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7843 &numlen, UTF8_ALLOW_DEFAULT);
7850 if ( RExC_flags & RXf_PMf_EXTENDED)
7851 p = regwhite( pRExC_state, p );
7853 /* Prime the casefolded buffer. */
7854 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7856 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7861 /* Emit all the Unicode characters. */
7863 for (foldbuf = tmpbuf;
7865 foldlen -= numlen) {
7866 ender = utf8_to_uvchr(foldbuf, &numlen);
7868 const STRLEN unilen = reguni(pRExC_state, ender, s);
7871 /* In EBCDIC the numlen
7872 * and unilen can differ. */
7874 if (numlen >= foldlen)
7878 break; /* "Can't happen." */
7882 const STRLEN unilen = reguni(pRExC_state, ender, s);
7891 REGC((char)ender, s++);
7897 /* Emit all the Unicode characters. */
7899 for (foldbuf = tmpbuf;
7901 foldlen -= numlen) {
7902 ender = utf8_to_uvchr(foldbuf, &numlen);
7904 const STRLEN unilen = reguni(pRExC_state, ender, s);
7907 /* In EBCDIC the numlen
7908 * and unilen can differ. */
7910 if (numlen >= foldlen)
7918 const STRLEN unilen = reguni(pRExC_state, ender, s);
7927 REGC((char)ender, s++);
7931 Set_Node_Cur_Length(ret); /* MJD */
7932 nextchar(pRExC_state);
7934 /* len is STRLEN which is unsigned, need to copy to signed */
7937 vFAIL("Internal disaster");
7941 if (len == 1 && UNI_IS_INVARIANT(ender))
7945 RExC_size += STR_SZ(len);
7948 RExC_emit += STR_SZ(len);
7958 S_regwhite( RExC_state_t *pRExC_state, char *p )
7960 const char *e = RExC_end;
7962 PERL_ARGS_ASSERT_REGWHITE;
7967 else if (*p == '#') {
7976 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7984 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7985 Character classes ([:foo:]) can also be negated ([:^foo:]).
7986 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7987 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
7988 but trigger failures because they are currently unimplemented. */
7990 #define POSIXCC_DONE(c) ((c) == ':')
7991 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7992 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7995 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
7998 I32 namedclass = OOB_NAMEDCLASS;
8000 PERL_ARGS_ASSERT_REGPPOSIXCC;
8002 if (value == '[' && RExC_parse + 1 < RExC_end &&
8003 /* I smell either [: or [= or [. -- POSIX has been here, right? */
8004 POSIXCC(UCHARAT(RExC_parse))) {
8005 const char c = UCHARAT(RExC_parse);
8006 char* const s = RExC_parse++;
8008 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8010 if (RExC_parse == RExC_end)
8011 /* Grandfather lone [:, [=, [. */
8014 const char* const t = RExC_parse++; /* skip over the c */
8017 if (UCHARAT(RExC_parse) == ']') {
8018 const char *posixcc = s + 1;
8019 RExC_parse++; /* skip over the ending ] */
8022 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8023 const I32 skip = t - posixcc;
8025 /* Initially switch on the length of the name. */
8028 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8029 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8032 /* Names all of length 5. */
8033 /* alnum alpha ascii blank cntrl digit graph lower
8034 print punct space upper */
8035 /* Offset 4 gives the best switch position. */
8036 switch (posixcc[4]) {
8038 if (memEQ(posixcc, "alph", 4)) /* alpha */
8039 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8042 if (memEQ(posixcc, "spac", 4)) /* space */
8043 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8046 if (memEQ(posixcc, "grap", 4)) /* graph */
8047 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8050 if (memEQ(posixcc, "asci", 4)) /* ascii */
8051 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8054 if (memEQ(posixcc, "blan", 4)) /* blank */
8055 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8058 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8059 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8062 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8063 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8066 if (memEQ(posixcc, "lowe", 4)) /* lower */
8067 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8068 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8069 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8072 if (memEQ(posixcc, "digi", 4)) /* digit */
8073 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8074 else if (memEQ(posixcc, "prin", 4)) /* print */
8075 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8076 else if (memEQ(posixcc, "punc", 4)) /* punct */
8077 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8082 if (memEQ(posixcc, "xdigit", 6))
8083 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8087 if (namedclass == OOB_NAMEDCLASS)
8088 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8090 assert (posixcc[skip] == ':');
8091 assert (posixcc[skip+1] == ']');
8092 } else if (!SIZE_ONLY) {
8093 /* [[=foo=]] and [[.foo.]] are still future. */
8095 /* adjust RExC_parse so the warning shows after
8097 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8099 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8102 /* Maternal grandfather:
8103 * "[:" ending in ":" but not in ":]" */
8113 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8117 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8119 if (POSIXCC(UCHARAT(RExC_parse))) {
8120 const char *s = RExC_parse;
8121 const char c = *s++;
8125 if (*s && c == *s && s[1] == ']') {
8127 "POSIX syntax [%c %c] belongs inside character classes",
8130 /* [[=foo=]] and [[.foo.]] are still future. */
8131 if (POSIXCC_NOTYET(c)) {
8132 /* adjust RExC_parse so the error shows after
8134 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8136 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8142 /* No locale test */
8143 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8145 for (value = 0; value < 256; value++) \
8147 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8151 case ANYOF_N##NAME: \
8152 for (value = 0; value < 256; value++) \
8154 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8159 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8160 * there are two tests passed in, to use depending on that. There aren't any
8161 * cases where the label is different from the name, so no need for that
8163 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
8165 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8166 else if (UNI_SEMANTICS) { \
8167 for (value = 0; value < 256; value++) { \
8168 if (TEST_8) stored += \
8169 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8173 for (value = 0; value < 256; value++) { \
8174 if (TEST_7) stored += \
8175 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8181 case ANYOF_N##NAME: \
8182 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8183 else if (UNI_SEMANTICS) { \
8184 for (value = 0; value < 256; value++) { \
8185 if (! TEST_8) stored += \
8186 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8190 for (value = 0; value < 256; value++) { \
8191 if (! TEST_7) stored += \
8192 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8200 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8201 so that it is possible to override the option here without having to
8202 rebuild the entire core. as we are required to do if we change regcomp.h
8203 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8205 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8206 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8209 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8210 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8212 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8216 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8219 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8220 * Locale folding is done at run-time, so this function should not be
8221 * called for nodes that are for locales.
8223 * This function simply sets the bit corresponding to the fold of the input
8224 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8227 * It also sets any necessary flags, and returns the number of bits that
8228 * actually changed from 0 to 1 */
8233 fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8236 /* It assumes the bit for 'value' has already been set */
8237 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8238 ANYOF_BITMAP_SET(node, fold);
8242 /* The fold of the German sharp s is two ASCII characters, so isn't in the
8243 * bitmap and doesn't have to be in utf8, but we only process it if unicode
8244 * semantics are called for */
8245 if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8246 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8248 else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8251 && PL_fold_latin1[value] != value))
8252 { /* A character that has a fold outside of Latin1 matches outside the
8253 bitmap, but only when the target string is utf8. Similarly when we
8254 don't have unicode semantics for the above ASCII Latin-1 characters,
8255 and they have a fold, they should match if the target is utf8, and
8257 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8264 PERL_STATIC_INLINE U8
8265 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
8267 /* This inline function sets a bit in the bitmap if not already set, and if
8268 * appropriate, its fold, returning the number of bits that actually
8269 * changed from 0 to 1 */
8273 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8277 ANYOF_BITMAP_SET(node, value);
8280 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8281 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8288 parse a class specification and produce either an ANYOF node that
8289 matches the pattern or if the pattern matches a single char only and
8290 that char is < 256 and we are case insensitive then we produce an
8295 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8298 register UV nextvalue;
8299 register IV prevvalue = OOB_UNICODE;
8300 register IV range = 0;
8301 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8302 register regnode *ret;
8305 char *rangebegin = NULL;
8306 bool need_class = 0;
8309 bool optimize_invert = TRUE;
8310 AV* unicode_alternate = NULL;
8312 UV literal_endpoint = 0;
8314 UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */
8316 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8317 case we need to change the emitted regop to an EXACT. */
8318 const char * orig_parse = RExC_parse;
8319 GET_RE_DEBUG_FLAGS_DECL;
8321 PERL_ARGS_ASSERT_REGCLASS;
8323 PERL_UNUSED_ARG(depth);
8326 DEBUG_PARSE("clas");
8328 /* Assume we are going to generate an ANYOF node. */
8329 ret = reganode(pRExC_state, ANYOF, 0);
8332 ANYOF_FLAGS(ret) = 0;
8334 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
8338 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8342 RExC_size += ANYOF_SKIP;
8343 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8346 RExC_emit += ANYOF_SKIP;
8348 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8350 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8351 ANYOF_BITMAP_ZERO(ret);
8352 listsv = newSVpvs("# comment\n");
8355 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8357 if (!SIZE_ONLY && POSIXCC(nextvalue))
8358 checkposixcc(pRExC_state);
8360 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8361 if (UCHARAT(RExC_parse) == ']')
8365 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8369 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8372 rangebegin = RExC_parse;
8374 value = utf8n_to_uvchr((U8*)RExC_parse,
8375 RExC_end - RExC_parse,
8376 &numlen, UTF8_ALLOW_DEFAULT);
8377 RExC_parse += numlen;
8380 value = UCHARAT(RExC_parse++);
8382 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8383 if (value == '[' && POSIXCC(nextvalue))
8384 namedclass = regpposixcc(pRExC_state, value);
8385 else if (value == '\\') {
8387 value = utf8n_to_uvchr((U8*)RExC_parse,
8388 RExC_end - RExC_parse,
8389 &numlen, UTF8_ALLOW_DEFAULT);
8390 RExC_parse += numlen;
8393 value = UCHARAT(RExC_parse++);
8394 /* Some compilers cannot handle switching on 64-bit integer
8395 * values, therefore value cannot be an UV. Yes, this will
8396 * be a problem later if we want switch on Unicode.
8397 * A similar issue a little bit later when switching on
8398 * namedclass. --jhi */
8399 switch ((I32)value) {
8400 case 'w': namedclass = ANYOF_ALNUM; break;
8401 case 'W': namedclass = ANYOF_NALNUM; break;
8402 case 's': namedclass = ANYOF_SPACE; break;
8403 case 'S': namedclass = ANYOF_NSPACE; break;
8404 case 'd': namedclass = ANYOF_DIGIT; break;
8405 case 'D': namedclass = ANYOF_NDIGIT; break;
8406 case 'v': namedclass = ANYOF_VERTWS; break;
8407 case 'V': namedclass = ANYOF_NVERTWS; break;
8408 case 'h': namedclass = ANYOF_HORIZWS; break;
8409 case 'H': namedclass = ANYOF_NHORIZWS; break;
8410 case 'N': /* Handle \N{NAME} in class */
8412 /* We only pay attention to the first char of
8413 multichar strings being returned. I kinda wonder
8414 if this makes sense as it does change the behaviour
8415 from earlier versions, OTOH that behaviour was broken
8417 UV v; /* value is register so we cant & it /grrr */
8418 if (reg_namedseq(pRExC_state, &v, NULL)) {
8428 if (RExC_parse >= RExC_end)
8429 vFAIL2("Empty \\%c{}", (U8)value);
8430 if (*RExC_parse == '{') {
8431 const U8 c = (U8)value;
8432 e = strchr(RExC_parse++, '}');
8434 vFAIL2("Missing right brace on \\%c{}", c);
8435 while (isSPACE(UCHARAT(RExC_parse)))
8437 if (e == RExC_parse)
8438 vFAIL2("Empty \\%c{}", c);
8440 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8448 if (UCHARAT(RExC_parse) == '^') {
8451 value = value == 'p' ? 'P' : 'p'; /* toggle */
8452 while (isSPACE(UCHARAT(RExC_parse))) {
8457 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8458 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8462 /* The \p could match something in the Latin1 range, hence
8463 * something that isn't utf8 */
8464 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8465 namedclass = ANYOF_MAX; /* no official name, but it's named */
8468 case 'n': value = '\n'; break;
8469 case 'r': value = '\r'; break;
8470 case 't': value = '\t'; break;
8471 case 'f': value = '\f'; break;
8472 case 'b': value = '\b'; break;
8473 case 'e': value = ASCII_TO_NATIVE('\033');break;
8474 case 'a': value = ASCII_TO_NATIVE('\007');break;
8476 RExC_parse--; /* function expects to be pointed at the 'o' */
8478 const char* error_msg;
8479 bool valid = grok_bslash_o(RExC_parse,
8484 RExC_parse += numlen;
8489 if (PL_encoding && value < 0x100) {
8490 goto recode_encoding;
8494 if (*RExC_parse == '{') {
8495 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8496 | PERL_SCAN_DISALLOW_PREFIX;
8497 char * const e = strchr(RExC_parse++, '}');
8499 vFAIL("Missing right brace on \\x{}");
8501 numlen = e - RExC_parse;
8502 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8506 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8508 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8509 RExC_parse += numlen;
8511 if (PL_encoding && value < 0x100)
8512 goto recode_encoding;
8515 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8517 case '0': case '1': case '2': case '3': case '4':
8518 case '5': case '6': case '7':
8520 /* Take 1-3 octal digits */
8521 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8523 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8524 RExC_parse += numlen;
8525 if (PL_encoding && value < 0x100)
8526 goto recode_encoding;
8531 SV* enc = PL_encoding;
8532 value = reg_recode((const char)(U8)value, &enc);
8533 if (!enc && SIZE_ONLY)
8534 ckWARNreg(RExC_parse,
8535 "Invalid escape in the specified encoding");
8539 /* Allow \_ to not give an error */
8540 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8541 ckWARN2reg(RExC_parse,
8542 "Unrecognized escape \\%c in character class passed through",
8547 } /* end of \blah */
8553 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8555 /* What matches in a locale is not known until runtime, so need to
8556 * (one time per class) allocate extra space to pass to regexec.
8557 * The space will contain a bit for each named class that is to be
8558 * matched against. This isn't needed for \p{} and pseudo-classes,
8559 * as they are not affected by locale, and hence are dealt with
8561 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8564 RExC_size += ANYOF_CLASS_ADD_SKIP;
8567 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8568 ANYOF_CLASS_ZERO(ret);
8570 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8573 /* a bad range like a-\d, a-[:digit:] ? */
8577 RExC_parse >= rangebegin ?
8578 RExC_parse - rangebegin : 0;
8579 ckWARN4reg(RExC_parse,
8580 "False [] range \"%*.*s\"",
8583 if (prevvalue < 256) {
8585 S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
8587 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8590 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8591 Perl_sv_catpvf(aTHX_ listsv,
8592 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8596 range = 0; /* this was not a true range */
8602 const char *what = NULL;
8605 if (namedclass > OOB_NAMEDCLASS)
8606 optimize_invert = FALSE;
8607 /* Possible truncation here but in some 64-bit environments
8608 * the compiler gets heartburn about switch on 64-bit values.
8609 * A similar issue a little earlier when switching on value.
8611 switch ((I32)namedclass) {
8613 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8614 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8615 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8616 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8617 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8618 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8619 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8620 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8621 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8622 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8623 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8624 /* \s, \w match all unicode if utf8. */
8625 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8626 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8628 /* \s, \w match ascii and locale only */
8629 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8630 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8632 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8633 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8634 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8637 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8640 for (value = 0; value < 128; value++)
8642 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8644 for (value = 0; value < 256; value++) {
8646 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8655 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8658 for (value = 128; value < 256; value++)
8660 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8662 for (value = 0; value < 256; value++) {
8663 if (!isASCII(value))
8664 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8673 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8675 /* consecutive digits assumed */
8676 for (value = '0'; value <= '9'; value++)
8678 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8681 what = POSIX_CC_UNI_NAME("Digit");
8685 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8687 /* consecutive digits assumed */
8688 for (value = 0; value < '0'; value++)
8690 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8691 for (value = '9' + 1; value < 256; value++)
8693 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8696 what = POSIX_CC_UNI_NAME("Digit");
8699 /* this is to handle \p and \P */
8702 vFAIL("Invalid [::] class");
8706 /* Strings such as "+utf8::isWord\n" */
8707 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8709 stored+=2; /* can't optimize this class */
8711 /* All but ASCII can match Unicode characters, but all the ones
8712 * that aren't in utf8 are in the bitmap */
8713 if (namedclass != ANYOF_ASCII) {
8714 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8718 } /* end of namedclass \blah */
8721 if (prevvalue > (IV)value) /* b-a */ {
8722 const int w = RExC_parse - rangebegin;
8723 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8724 range = 0; /* not a valid range */
8728 prevvalue = value; /* save the beginning of the range */
8729 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8730 RExC_parse[1] != ']') {
8733 /* a bad range like \w-, [:word:]- ? */
8734 if (namedclass > OOB_NAMEDCLASS) {
8735 if (ckWARN(WARN_REGEXP)) {
8737 RExC_parse >= rangebegin ?
8738 RExC_parse - rangebegin : 0;
8740 "False [] range \"%*.*s\"",
8745 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8747 range = 1; /* yeah, it's a range! */
8748 continue; /* but do it the next time */
8752 /* now is the next time */
8753 /*stored += (value - prevvalue + 1);*/
8755 if (prevvalue < 256) {
8756 const IV ceilvalue = value < 256 ? value : 255;
8759 /* In EBCDIC [\x89-\x91] should include
8760 * the \x8e but [i-j] should not. */
8761 if (literal_endpoint == 2 &&
8762 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8763 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8765 if (isLOWER(prevvalue)) {
8766 for (i = prevvalue; i <= ceilvalue; i++)
8767 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8769 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8772 for (i = prevvalue; i <= ceilvalue; i++)
8773 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8775 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8781 for (i = prevvalue; i <= ceilvalue; i++) {
8782 if (!ANYOF_BITMAP_TEST(ret,i)) {
8784 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8788 if (value > 255 || UTF) {
8789 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8790 const UV natvalue = NATIVE_TO_UNI(value);
8791 stored+=2; /* can't optimize this class */
8793 /* If the code point requires utf8 to represent, and we are not
8794 * folding, it can't match unless the target is in utf8. Only
8795 * a few code points above 255 fold to below it, so XXX an
8796 * optimization would be to know which ones and set the flag
8798 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8801 if (prevnatvalue < natvalue) { /* what about > ? */
8802 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8803 prevnatvalue, natvalue);
8805 else if (prevnatvalue == natvalue) {
8806 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8808 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8810 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8812 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8813 if (RExC_precomp[0] == ':' &&
8814 RExC_precomp[1] == '[' &&
8815 (f == 0xDF || f == 0x92)) {
8816 f = NATIVE_TO_UNI(f);
8819 /* If folding and foldable and a single
8820 * character, insert also the folded version
8821 * to the charclass. */
8823 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8824 if ((RExC_precomp[0] == ':' &&
8825 RExC_precomp[1] == '[' &&
8827 (value == 0xFB05 || value == 0xFB06))) ?
8828 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8829 foldlen == (STRLEN)UNISKIP(f) )
8831 if (foldlen == (STRLEN)UNISKIP(f))
8833 Perl_sv_catpvf(aTHX_ listsv,
8836 /* Any multicharacter foldings
8837 * require the following transform:
8838 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8839 * where E folds into "pq" and F folds
8840 * into "rst", all other characters
8841 * fold to single characters. We save
8842 * away these multicharacter foldings,
8843 * to be later saved as part of the
8844 * additional "s" data. */
8847 if (!unicode_alternate)
8848 unicode_alternate = newAV();
8849 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8851 av_push(unicode_alternate, sv);
8855 /* If folding and the value is one of the Greek
8856 * sigmas insert a few more sigmas to make the
8857 * folding rules of the sigmas to work right.
8858 * Note that not all the possible combinations
8859 * are handled here: some of them are handled
8860 * by the standard folding rules, and some of
8861 * them (literal or EXACTF cases) are handled
8862 * during runtime in regexec.c:S_find_byclass(). */
8863 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8864 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8865 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8866 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8867 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8869 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8870 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8871 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8876 literal_endpoint = 0;
8880 range = 0; /* this range (if it was one) is done now */
8887 /****** !SIZE_ONLY AFTER HERE *********/
8889 if( stored == 1 && (value < 128 || (value < 256 && !UTF))
8890 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
8892 /* optimize single char class to an EXACT node but *only* when its not
8893 * a UTF/high char. Note that the information needed to decide to do
8894 * this optimization is not currently available until the 2nd pass, and
8895 * that the actually used EXACT node takes less space than the
8896 * calculated ANYOF node, and hence the amount of space calculated in
8897 * the first pass is larger than actually used. Currently we don't
8898 * keep track of enough information to do this for nodes which contain
8899 * matches outside the bitmap */
8900 const char * cur_parse= RExC_parse;
8901 RExC_emit = (regnode *)orig_emit;
8902 RExC_parse = (char *)orig_parse;
8903 ret = reg_node(pRExC_state,
8904 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
8905 RExC_parse = (char *)cur_parse;
8906 *STRING(ret)= (char)value;
8908 RExC_emit += STR_SZ(1);
8909 SvREFCNT_dec(listsv);
8912 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
8913 if ( /* If the only flag is folding (plus possibly inversion). */
8914 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
8916 for (value = 0; value < 256; ++value) {
8917 if (ANYOF_BITMAP_TEST(ret, value)) {
8918 UV fold = PL_fold[value];
8921 ANYOF_BITMAP_SET(ret, fold);
8924 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
8927 /* optimize inverted simple patterns (e.g. [^a-z]) */
8928 if (optimize_invert &&
8929 /* If the only flag is inversion. */
8930 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8931 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8932 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
8933 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
8936 AV * const av = newAV();
8938 /* The 0th element stores the character class description
8939 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8940 * to initialize the appropriate swash (which gets stored in
8941 * the 1st element), and also useful for dumping the regnode.
8942 * The 2nd element stores the multicharacter foldings,
8943 * used later (regexec.c:S_reginclass()). */
8944 av_store(av, 0, listsv);
8945 av_store(av, 1, NULL);
8946 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8947 rv = newRV_noinc(MUTABLE_SV(av));
8948 n = add_data(pRExC_state, 1, "s");
8949 RExC_rxi->data->data[n] = (void*)rv;
8957 /* reg_skipcomment()
8959 Absorbs an /x style # comments from the input stream.
8960 Returns true if there is more text remaining in the stream.
8961 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
8962 terminates the pattern without including a newline.
8964 Note its the callers responsibility to ensure that we are
8970 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
8974 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
8976 while (RExC_parse < RExC_end)
8977 if (*RExC_parse++ == '\n') {
8982 /* we ran off the end of the pattern without ending
8983 the comment, so we have to add an \n when wrapping */
8984 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8992 Advances the parse position, and optionally absorbs
8993 "whitespace" from the inputstream.
8995 Without /x "whitespace" means (?#...) style comments only,
8996 with /x this means (?#...) and # comments and whitespace proper.
8998 Returns the RExC_parse point from BEFORE the scan occurs.
9000 This is the /x friendly way of saying RExC_parse++.
9004 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9006 char* const retval = RExC_parse++;
9008 PERL_ARGS_ASSERT_NEXTCHAR;
9011 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9012 RExC_parse[2] == '#') {
9013 while (*RExC_parse != ')') {
9014 if (RExC_parse == RExC_end)
9015 FAIL("Sequence (?#... not terminated");
9021 if (RExC_flags & RXf_PMf_EXTENDED) {
9022 if (isSPACE(*RExC_parse)) {
9026 else if (*RExC_parse == '#') {
9027 if ( reg_skipcomment( pRExC_state ) )
9036 - reg_node - emit a node
9038 STATIC regnode * /* Location. */
9039 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9042 register regnode *ptr;
9043 regnode * const ret = RExC_emit;
9044 GET_RE_DEBUG_FLAGS_DECL;
9046 PERL_ARGS_ASSERT_REG_NODE;
9049 SIZE_ALIGN(RExC_size);
9053 if (RExC_emit >= RExC_emit_bound)
9054 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9056 NODE_ALIGN_FILL(ret);
9058 FILL_ADVANCE_NODE(ptr, op);
9059 #ifdef RE_TRACK_PATTERN_OFFSETS
9060 if (RExC_offsets) { /* MJD */
9061 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9062 "reg_node", __LINE__,
9064 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9065 ? "Overwriting end of array!\n" : "OK",
9066 (UV)(RExC_emit - RExC_emit_start),
9067 (UV)(RExC_parse - RExC_start),
9068 (UV)RExC_offsets[0]));
9069 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9077 - reganode - emit a node with an argument
9079 STATIC regnode * /* Location. */
9080 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9083 register regnode *ptr;
9084 regnode * const ret = RExC_emit;
9085 GET_RE_DEBUG_FLAGS_DECL;
9087 PERL_ARGS_ASSERT_REGANODE;
9090 SIZE_ALIGN(RExC_size);
9095 assert(2==regarglen[op]+1);
9097 Anything larger than this has to allocate the extra amount.
9098 If we changed this to be:
9100 RExC_size += (1 + regarglen[op]);
9102 then it wouldn't matter. Its not clear what side effect
9103 might come from that so its not done so far.
9108 if (RExC_emit >= RExC_emit_bound)
9109 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9111 NODE_ALIGN_FILL(ret);
9113 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9114 #ifdef RE_TRACK_PATTERN_OFFSETS
9115 if (RExC_offsets) { /* MJD */
9116 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9120 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9121 "Overwriting end of array!\n" : "OK",
9122 (UV)(RExC_emit - RExC_emit_start),
9123 (UV)(RExC_parse - RExC_start),
9124 (UV)RExC_offsets[0]));
9125 Set_Cur_Node_Offset;
9133 - reguni - emit (if appropriate) a Unicode character
9136 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9140 PERL_ARGS_ASSERT_REGUNI;
9142 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9146 - reginsert - insert an operator in front of already-emitted operand
9148 * Means relocating the operand.
9151 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9154 register regnode *src;
9155 register regnode *dst;
9156 register regnode *place;
9157 const int offset = regarglen[(U8)op];
9158 const int size = NODE_STEP_REGNODE + offset;
9159 GET_RE_DEBUG_FLAGS_DECL;
9161 PERL_ARGS_ASSERT_REGINSERT;
9162 PERL_UNUSED_ARG(depth);
9163 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9164 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9173 if (RExC_open_parens) {
9175 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9176 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9177 if ( RExC_open_parens[paren] >= opnd ) {
9178 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9179 RExC_open_parens[paren] += size;
9181 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9183 if ( RExC_close_parens[paren] >= opnd ) {
9184 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9185 RExC_close_parens[paren] += size;
9187 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9192 while (src > opnd) {
9193 StructCopy(--src, --dst, regnode);
9194 #ifdef RE_TRACK_PATTERN_OFFSETS
9195 if (RExC_offsets) { /* MJD 20010112 */
9196 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9200 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9201 ? "Overwriting end of array!\n" : "OK",
9202 (UV)(src - RExC_emit_start),
9203 (UV)(dst - RExC_emit_start),
9204 (UV)RExC_offsets[0]));
9205 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9206 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9212 place = opnd; /* Op node, where operand used to be. */
9213 #ifdef RE_TRACK_PATTERN_OFFSETS
9214 if (RExC_offsets) { /* MJD */
9215 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9219 (UV)(place - RExC_emit_start) > RExC_offsets[0]
9220 ? "Overwriting end of array!\n" : "OK",
9221 (UV)(place - RExC_emit_start),
9222 (UV)(RExC_parse - RExC_start),
9223 (UV)RExC_offsets[0]));
9224 Set_Node_Offset(place, RExC_parse);
9225 Set_Node_Length(place, 1);
9228 src = NEXTOPER(place);
9229 FILL_ADVANCE_NODE(place, op);
9230 Zero(src, offset, regnode);
9234 - regtail - set the next-pointer at the end of a node chain of p to val.
9235 - SEE ALSO: regtail_study
9237 /* TODO: All three parms should be const */
9239 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9242 register regnode *scan;
9243 GET_RE_DEBUG_FLAGS_DECL;
9245 PERL_ARGS_ASSERT_REGTAIL;
9247 PERL_UNUSED_ARG(depth);
9253 /* Find last node. */
9256 regnode * const temp = regnext(scan);
9258 SV * const mysv=sv_newmortal();
9259 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9260 regprop(RExC_rx, mysv, scan);
9261 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9262 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9263 (temp == NULL ? "->" : ""),
9264 (temp == NULL ? PL_reg_name[OP(val)] : "")
9272 if (reg_off_by_arg[OP(scan)]) {
9273 ARG_SET(scan, val - scan);
9276 NEXT_OFF(scan) = val - scan;
9282 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9283 - Look for optimizable sequences at the same time.
9284 - currently only looks for EXACT chains.
9286 This is expermental code. The idea is to use this routine to perform
9287 in place optimizations on branches and groups as they are constructed,
9288 with the long term intention of removing optimization from study_chunk so
9289 that it is purely analytical.
9291 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9292 to control which is which.
9295 /* TODO: All four parms should be const */
9298 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9301 register regnode *scan;
9303 #ifdef EXPERIMENTAL_INPLACESCAN
9306 GET_RE_DEBUG_FLAGS_DECL;
9308 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9314 /* Find last node. */
9318 regnode * const temp = regnext(scan);
9319 #ifdef EXPERIMENTAL_INPLACESCAN
9320 if (PL_regkind[OP(scan)] == EXACT)
9321 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9330 if( exact == PSEUDO )
9332 else if ( exact != OP(scan) )
9341 SV * const mysv=sv_newmortal();
9342 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9343 regprop(RExC_rx, mysv, scan);
9344 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9345 SvPV_nolen_const(mysv),
9347 PL_reg_name[exact]);
9354 SV * const mysv_val=sv_newmortal();
9355 DEBUG_PARSE_MSG("");
9356 regprop(RExC_rx, mysv_val, val);
9357 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9358 SvPV_nolen_const(mysv_val),
9359 (IV)REG_NODE_NUM(val),
9363 if (reg_off_by_arg[OP(scan)]) {
9364 ARG_SET(scan, val - scan);
9367 NEXT_OFF(scan) = val - scan;
9375 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9379 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9384 for (bit=0; bit<32; bit++) {
9385 if (flags & (1<<bit)) {
9387 PerlIO_printf(Perl_debug_log, "%s",lead);
9388 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9393 PerlIO_printf(Perl_debug_log, "\n");
9395 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9401 Perl_regdump(pTHX_ const regexp *r)
9405 SV * const sv = sv_newmortal();
9406 SV *dsv= sv_newmortal();
9408 GET_RE_DEBUG_FLAGS_DECL;
9410 PERL_ARGS_ASSERT_REGDUMP;
9412 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9414 /* Header fields of interest. */
9415 if (r->anchored_substr) {
9416 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9417 RE_SV_DUMPLEN(r->anchored_substr), 30);
9418 PerlIO_printf(Perl_debug_log,
9419 "anchored %s%s at %"IVdf" ",
9420 s, RE_SV_TAIL(r->anchored_substr),
9421 (IV)r->anchored_offset);
9422 } else if (r->anchored_utf8) {
9423 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9424 RE_SV_DUMPLEN(r->anchored_utf8), 30);
9425 PerlIO_printf(Perl_debug_log,
9426 "anchored utf8 %s%s at %"IVdf" ",
9427 s, RE_SV_TAIL(r->anchored_utf8),
9428 (IV)r->anchored_offset);
9430 if (r->float_substr) {
9431 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9432 RE_SV_DUMPLEN(r->float_substr), 30);
9433 PerlIO_printf(Perl_debug_log,
9434 "floating %s%s at %"IVdf"..%"UVuf" ",
9435 s, RE_SV_TAIL(r->float_substr),
9436 (IV)r->float_min_offset, (UV)r->float_max_offset);
9437 } else if (r->float_utf8) {
9438 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9439 RE_SV_DUMPLEN(r->float_utf8), 30);
9440 PerlIO_printf(Perl_debug_log,
9441 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9442 s, RE_SV_TAIL(r->float_utf8),
9443 (IV)r->float_min_offset, (UV)r->float_max_offset);
9445 if (r->check_substr || r->check_utf8)
9446 PerlIO_printf(Perl_debug_log,
9448 (r->check_substr == r->float_substr
9449 && r->check_utf8 == r->float_utf8
9450 ? "(checking floating" : "(checking anchored"));
9451 if (r->extflags & RXf_NOSCAN)
9452 PerlIO_printf(Perl_debug_log, " noscan");
9453 if (r->extflags & RXf_CHECK_ALL)
9454 PerlIO_printf(Perl_debug_log, " isall");
9455 if (r->check_substr || r->check_utf8)
9456 PerlIO_printf(Perl_debug_log, ") ");
9458 if (ri->regstclass) {
9459 regprop(r, sv, ri->regstclass);
9460 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9462 if (r->extflags & RXf_ANCH) {
9463 PerlIO_printf(Perl_debug_log, "anchored");
9464 if (r->extflags & RXf_ANCH_BOL)
9465 PerlIO_printf(Perl_debug_log, "(BOL)");
9466 if (r->extflags & RXf_ANCH_MBOL)
9467 PerlIO_printf(Perl_debug_log, "(MBOL)");
9468 if (r->extflags & RXf_ANCH_SBOL)
9469 PerlIO_printf(Perl_debug_log, "(SBOL)");
9470 if (r->extflags & RXf_ANCH_GPOS)
9471 PerlIO_printf(Perl_debug_log, "(GPOS)");
9472 PerlIO_putc(Perl_debug_log, ' ');
9474 if (r->extflags & RXf_GPOS_SEEN)
9475 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9476 if (r->intflags & PREGf_SKIP)
9477 PerlIO_printf(Perl_debug_log, "plus ");
9478 if (r->intflags & PREGf_IMPLICIT)
9479 PerlIO_printf(Perl_debug_log, "implicit ");
9480 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9481 if (r->extflags & RXf_EVAL_SEEN)
9482 PerlIO_printf(Perl_debug_log, "with eval ");
9483 PerlIO_printf(Perl_debug_log, "\n");
9484 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9486 PERL_ARGS_ASSERT_REGDUMP;
9487 PERL_UNUSED_CONTEXT;
9489 #endif /* DEBUGGING */
9493 - regprop - printable representation of opcode
9495 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9498 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9499 if (flags & ANYOF_INVERT) \
9500 /*make sure the invert info is in each */ \
9501 sv_catpvs(sv, "^"); \
9507 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9512 RXi_GET_DECL(prog,progi);
9513 GET_RE_DEBUG_FLAGS_DECL;
9515 PERL_ARGS_ASSERT_REGPROP;
9519 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9520 /* It would be nice to FAIL() here, but this may be called from
9521 regexec.c, and it would be hard to supply pRExC_state. */
9522 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9523 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9525 k = PL_regkind[OP(o)];
9529 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9530 * is a crude hack but it may be the best for now since
9531 * we have no flag "this EXACTish node was UTF-8"
9533 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9534 PERL_PV_ESCAPE_UNI_DETECT |
9535 PERL_PV_PRETTY_ELLIPSES |
9536 PERL_PV_PRETTY_LTGT |
9537 PERL_PV_PRETTY_NOCLEAR
9539 } else if (k == TRIE) {
9540 /* print the details of the trie in dumpuntil instead, as
9541 * progi->data isn't available here */
9542 const char op = OP(o);
9543 const U32 n = ARG(o);
9544 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9545 (reg_ac_data *)progi->data->data[n] :
9547 const reg_trie_data * const trie
9548 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9550 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9551 DEBUG_TRIE_COMPILE_r(
9552 Perl_sv_catpvf(aTHX_ sv,
9553 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9554 (UV)trie->startstate,
9555 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9556 (UV)trie->wordcount,
9559 (UV)TRIE_CHARCOUNT(trie),
9560 (UV)trie->uniquecharcount
9563 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9565 int rangestart = -1;
9566 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9568 for (i = 0; i <= 256; i++) {
9569 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9570 if (rangestart == -1)
9572 } else if (rangestart != -1) {
9573 if (i <= rangestart + 3)
9574 for (; rangestart < i; rangestart++)
9575 put_byte(sv, rangestart);
9577 put_byte(sv, rangestart);
9579 put_byte(sv, i - 1);
9587 } else if (k == CURLY) {
9588 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9589 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9590 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9592 else if (k == WHILEM && o->flags) /* Ordinal/of */
9593 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9594 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9595 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9596 if ( RXp_PAREN_NAMES(prog) ) {
9597 if ( k != REF || OP(o) < NREF) {
9598 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9599 SV **name= av_fetch(list, ARG(o), 0 );
9601 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9604 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9605 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9606 I32 *nums=(I32*)SvPVX(sv_dat);
9607 SV **name= av_fetch(list, nums[0], 0 );
9610 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9611 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9612 (n ? "," : ""), (IV)nums[n]);
9614 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9618 } else if (k == GOSUB)
9619 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9620 else if (k == VERB) {
9622 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9623 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9624 } else if (k == LOGICAL)
9625 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9626 else if (k == FOLDCHAR)
9627 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9628 else if (k == ANYOF) {
9629 int i, rangestart = -1;
9630 const U8 flags = ANYOF_FLAGS(o);
9633 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9634 static const char * const anyofs[] = {
9667 if (flags & ANYOF_LOCALE)
9668 sv_catpvs(sv, "{loc}");
9669 if (flags & ANYOF_FOLD)
9670 sv_catpvs(sv, "{i}");
9671 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9672 if (flags & ANYOF_INVERT)
9675 /* output what the standard cp 0-255 bitmap matches */
9676 for (i = 0; i <= 256; i++) {
9677 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9678 if (rangestart == -1)
9680 } else if (rangestart != -1) {
9681 if (i <= rangestart + 3)
9682 for (; rangestart < i; rangestart++)
9683 put_byte(sv, rangestart);
9685 put_byte(sv, rangestart);
9687 put_byte(sv, i - 1);
9694 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9695 /* output any special charclass tests (used mostly under use locale) */
9696 if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
9697 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9698 if (ANYOF_CLASS_TEST(o,i)) {
9699 sv_catpv(sv, anyofs[i]);
9703 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9705 /* output information about the unicode matching */
9706 if (flags & ANYOF_UNICODE_ALL)
9707 sv_catpvs(sv, "{unicode_all}");
9708 else if (flags & ANYOF_UTF8)
9709 sv_catpvs(sv, "{unicode}");
9710 if (flags & ANYOF_NONBITMAP_NON_UTF8)
9711 sv_catpvs(sv, "{outside bitmap}");
9715 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9719 U8 s[UTF8_MAXBYTES_CASE+1];
9721 for (i = 0; i <= 256; i++) { /* just the first 256 */
9722 uvchr_to_utf8(s, i);
9724 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9725 if (rangestart == -1)
9727 } else if (rangestart != -1) {
9728 if (i <= rangestart + 3)
9729 for (; rangestart < i; rangestart++) {
9730 const U8 * const e = uvchr_to_utf8(s,rangestart);
9732 for(p = s; p < e; p++)
9736 const U8 *e = uvchr_to_utf8(s,rangestart);
9738 for (p = s; p < e; p++)
9741 e = uvchr_to_utf8(s, i-1);
9742 for (p = s; p < e; p++)
9749 sv_catpvs(sv, "..."); /* et cetera */
9753 char *s = savesvpv(lv);
9754 char * const origs = s;
9756 while (*s && *s != '\n')
9760 const char * const t = ++s;
9778 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9780 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9781 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9783 PERL_UNUSED_CONTEXT;
9784 PERL_UNUSED_ARG(sv);
9786 PERL_UNUSED_ARG(prog);
9787 #endif /* DEBUGGING */
9791 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9792 { /* Assume that RE_INTUIT is set */
9794 struct regexp *const prog = (struct regexp *)SvANY(r);
9795 GET_RE_DEBUG_FLAGS_DECL;
9797 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9798 PERL_UNUSED_CONTEXT;
9802 const char * const s = SvPV_nolen_const(prog->check_substr
9803 ? prog->check_substr : prog->check_utf8);
9805 if (!PL_colorset) reginitcolors();
9806 PerlIO_printf(Perl_debug_log,
9807 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9809 prog->check_substr ? "" : "utf8 ",
9810 PL_colors[5],PL_colors[0],
9813 (strlen(s) > 60 ? "..." : ""));
9816 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9822 handles refcounting and freeing the perl core regexp structure. When
9823 it is necessary to actually free the structure the first thing it
9824 does is call the 'free' method of the regexp_engine associated to
9825 the regexp, allowing the handling of the void *pprivate; member
9826 first. (This routine is not overridable by extensions, which is why
9827 the extensions free is called first.)
9829 See regdupe and regdupe_internal if you change anything here.
9831 #ifndef PERL_IN_XSUB_RE
9833 Perl_pregfree(pTHX_ REGEXP *r)
9839 Perl_pregfree2(pTHX_ REGEXP *rx)
9842 struct regexp *const r = (struct regexp *)SvANY(rx);
9843 GET_RE_DEBUG_FLAGS_DECL;
9845 PERL_ARGS_ASSERT_PREGFREE2;
9848 ReREFCNT_dec(r->mother_re);
9850 CALLREGFREE_PVT(rx); /* free the private data */
9851 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9854 SvREFCNT_dec(r->anchored_substr);
9855 SvREFCNT_dec(r->anchored_utf8);
9856 SvREFCNT_dec(r->float_substr);
9857 SvREFCNT_dec(r->float_utf8);
9858 Safefree(r->substrs);
9860 RX_MATCH_COPY_FREE(rx);
9861 #ifdef PERL_OLD_COPY_ON_WRITE
9862 SvREFCNT_dec(r->saved_copy);
9869 This is a hacky workaround to the structural issue of match results
9870 being stored in the regexp structure which is in turn stored in
9871 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9872 could be PL_curpm in multiple contexts, and could require multiple
9873 result sets being associated with the pattern simultaneously, such
9874 as when doing a recursive match with (??{$qr})
9876 The solution is to make a lightweight copy of the regexp structure
9877 when a qr// is returned from the code executed by (??{$qr}) this
9878 lightweight copy doesnt actually own any of its data except for
9879 the starp/end and the actual regexp structure itself.
9885 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9888 struct regexp *const r = (struct regexp *)SvANY(rx);
9889 register const I32 npar = r->nparens+1;
9891 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9894 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9895 ret = (struct regexp *)SvANY(ret_x);
9897 (void)ReREFCNT_inc(rx);
9898 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9899 by pointing directly at the buffer, but flagging that the allocated
9900 space in the copy is zero. As we've just done a struct copy, it's now
9901 a case of zero-ing that, rather than copying the current length. */
9902 SvPV_set(ret_x, RX_WRAPPED(rx));
9903 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9904 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9905 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9906 SvLEN_set(ret_x, 0);
9907 SvSTASH_set(ret_x, NULL);
9908 SvMAGIC_set(ret_x, NULL);
9909 Newx(ret->offs, npar, regexp_paren_pair);
9910 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9912 Newx(ret->substrs, 1, struct reg_substr_data);
9913 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9915 SvREFCNT_inc_void(ret->anchored_substr);
9916 SvREFCNT_inc_void(ret->anchored_utf8);
9917 SvREFCNT_inc_void(ret->float_substr);
9918 SvREFCNT_inc_void(ret->float_utf8);
9920 /* check_substr and check_utf8, if non-NULL, point to either their
9921 anchored or float namesakes, and don't hold a second reference. */
9923 RX_MATCH_COPIED_off(ret_x);
9924 #ifdef PERL_OLD_COPY_ON_WRITE
9925 ret->saved_copy = NULL;
9927 ret->mother_re = rx;
9933 /* regfree_internal()
9935 Free the private data in a regexp. This is overloadable by
9936 extensions. Perl takes care of the regexp structure in pregfree(),
9937 this covers the *pprivate pointer which technically perl doesn't
9938 know about, however of course we have to handle the
9939 regexp_internal structure when no extension is in use.
9941 Note this is called before freeing anything in the regexp
9946 Perl_regfree_internal(pTHX_ REGEXP * const rx)
9949 struct regexp *const r = (struct regexp *)SvANY(rx);
9951 GET_RE_DEBUG_FLAGS_DECL;
9953 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
9959 SV *dsv= sv_newmortal();
9960 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
9961 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
9962 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
9963 PL_colors[4],PL_colors[5],s);
9966 #ifdef RE_TRACK_PATTERN_OFFSETS
9968 Safefree(ri->u.offsets); /* 20010421 MJD */
9971 int n = ri->data->count;
9972 PAD* new_comppad = NULL;
9977 /* If you add a ->what type here, update the comment in regcomp.h */
9978 switch (ri->data->what[n]) {
9983 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
9986 Safefree(ri->data->data[n]);
9989 new_comppad = MUTABLE_AV(ri->data->data[n]);
9992 if (new_comppad == NULL)
9993 Perl_croak(aTHX_ "panic: pregfree comppad");
9994 PAD_SAVE_LOCAL(old_comppad,
9995 /* Watch out for global destruction's random ordering. */
9996 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
9999 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10002 op_free((OP_4tree*)ri->data->data[n]);
10004 PAD_RESTORE_LOCAL(old_comppad);
10005 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10006 new_comppad = NULL;
10011 { /* Aho Corasick add-on structure for a trie node.
10012 Used in stclass optimization only */
10014 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10016 refcount = --aho->refcount;
10019 PerlMemShared_free(aho->states);
10020 PerlMemShared_free(aho->fail);
10021 /* do this last!!!! */
10022 PerlMemShared_free(ri->data->data[n]);
10023 PerlMemShared_free(ri->regstclass);
10029 /* trie structure. */
10031 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10033 refcount = --trie->refcount;
10036 PerlMemShared_free(trie->charmap);
10037 PerlMemShared_free(trie->states);
10038 PerlMemShared_free(trie->trans);
10040 PerlMemShared_free(trie->bitmap);
10042 PerlMemShared_free(trie->jump);
10043 PerlMemShared_free(trie->wordinfo);
10044 /* do this last!!!! */
10045 PerlMemShared_free(ri->data->data[n]);
10050 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10053 Safefree(ri->data->what);
10054 Safefree(ri->data);
10060 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10061 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10062 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10065 re_dup - duplicate a regexp.
10067 This routine is expected to clone a given regexp structure. It is only
10068 compiled under USE_ITHREADS.
10070 After all of the core data stored in struct regexp is duplicated
10071 the regexp_engine.dupe method is used to copy any private data
10072 stored in the *pprivate pointer. This allows extensions to handle
10073 any duplication it needs to do.
10075 See pregfree() and regfree_internal() if you change anything here.
10077 #if defined(USE_ITHREADS)
10078 #ifndef PERL_IN_XSUB_RE
10080 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10084 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10085 struct regexp *ret = (struct regexp *)SvANY(dstr);
10087 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10089 npar = r->nparens+1;
10090 Newx(ret->offs, npar, regexp_paren_pair);
10091 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10093 /* no need to copy these */
10094 Newx(ret->swap, npar, regexp_paren_pair);
10097 if (ret->substrs) {
10098 /* Do it this way to avoid reading from *r after the StructCopy().
10099 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10100 cache, it doesn't matter. */
10101 const bool anchored = r->check_substr
10102 ? r->check_substr == r->anchored_substr
10103 : r->check_utf8 == r->anchored_utf8;
10104 Newx(ret->substrs, 1, struct reg_substr_data);
10105 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10107 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10108 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10109 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10110 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10112 /* check_substr and check_utf8, if non-NULL, point to either their
10113 anchored or float namesakes, and don't hold a second reference. */
10115 if (ret->check_substr) {
10117 assert(r->check_utf8 == r->anchored_utf8);
10118 ret->check_substr = ret->anchored_substr;
10119 ret->check_utf8 = ret->anchored_utf8;
10121 assert(r->check_substr == r->float_substr);
10122 assert(r->check_utf8 == r->float_utf8);
10123 ret->check_substr = ret->float_substr;
10124 ret->check_utf8 = ret->float_utf8;
10126 } else if (ret->check_utf8) {
10128 ret->check_utf8 = ret->anchored_utf8;
10130 ret->check_utf8 = ret->float_utf8;
10135 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10138 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10140 if (RX_MATCH_COPIED(dstr))
10141 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
10143 ret->subbeg = NULL;
10144 #ifdef PERL_OLD_COPY_ON_WRITE
10145 ret->saved_copy = NULL;
10148 if (ret->mother_re) {
10149 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10150 /* Our storage points directly to our mother regexp, but that's
10151 1: a buffer in a different thread
10152 2: something we no longer hold a reference on
10153 so we need to copy it locally. */
10154 /* Note we need to sue SvCUR() on our mother_re, because it, in
10155 turn, may well be pointing to its own mother_re. */
10156 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10157 SvCUR(ret->mother_re)+1));
10158 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10160 ret->mother_re = NULL;
10164 #endif /* PERL_IN_XSUB_RE */
10169 This is the internal complement to regdupe() which is used to copy
10170 the structure pointed to by the *pprivate pointer in the regexp.
10171 This is the core version of the extension overridable cloning hook.
10172 The regexp structure being duplicated will be copied by perl prior
10173 to this and will be provided as the regexp *r argument, however
10174 with the /old/ structures pprivate pointer value. Thus this routine
10175 may override any copying normally done by perl.
10177 It returns a pointer to the new regexp_internal structure.
10181 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10184 struct regexp *const r = (struct regexp *)SvANY(rx);
10185 regexp_internal *reti;
10187 RXi_GET_DECL(r,ri);
10189 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10191 npar = r->nparens+1;
10194 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10195 Copy(ri->program, reti->program, len+1, regnode);
10198 reti->regstclass = NULL;
10201 struct reg_data *d;
10202 const int count = ri->data->count;
10205 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10206 char, struct reg_data);
10207 Newx(d->what, count, U8);
10210 for (i = 0; i < count; i++) {
10211 d->what[i] = ri->data->what[i];
10212 switch (d->what[i]) {
10213 /* legal options are one of: sSfpontTua
10214 see also regcomp.h and pregfree() */
10215 case 'a': /* actually an AV, but the dup function is identical. */
10218 case 'p': /* actually an AV, but the dup function is identical. */
10219 case 'u': /* actually an HV, but the dup function is identical. */
10220 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10223 /* This is cheating. */
10224 Newx(d->data[i], 1, struct regnode_charclass_class);
10225 StructCopy(ri->data->data[i], d->data[i],
10226 struct regnode_charclass_class);
10227 reti->regstclass = (regnode*)d->data[i];
10230 /* Compiled op trees are readonly and in shared memory,
10231 and can thus be shared without duplication. */
10233 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10237 /* Trie stclasses are readonly and can thus be shared
10238 * without duplication. We free the stclass in pregfree
10239 * when the corresponding reg_ac_data struct is freed.
10241 reti->regstclass= ri->regstclass;
10245 ((reg_trie_data*)ri->data->data[i])->refcount++;
10249 d->data[i] = ri->data->data[i];
10252 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10261 reti->name_list_idx = ri->name_list_idx;
10263 #ifdef RE_TRACK_PATTERN_OFFSETS
10264 if (ri->u.offsets) {
10265 Newx(reti->u.offsets, 2*len+1, U32);
10266 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10269 SetProgLen(reti,len);
10272 return (void*)reti;
10275 #endif /* USE_ITHREADS */
10277 #ifndef PERL_IN_XSUB_RE
10280 - regnext - dig the "next" pointer out of a node
10283 Perl_regnext(pTHX_ register regnode *p)
10286 register I32 offset;
10291 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10292 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10295 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10304 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10307 STRLEN l1 = strlen(pat1);
10308 STRLEN l2 = strlen(pat2);
10311 const char *message;
10313 PERL_ARGS_ASSERT_RE_CROAK2;
10319 Copy(pat1, buf, l1 , char);
10320 Copy(pat2, buf + l1, l2 , char);
10321 buf[l1 + l2] = '\n';
10322 buf[l1 + l2 + 1] = '\0';
10324 /* ANSI variant takes additional second argument */
10325 va_start(args, pat2);
10329 msv = vmess(buf, &args);
10331 message = SvPV_const(msv,l1);
10334 Copy(message, buf, l1 , char);
10335 buf[l1-1] = '\0'; /* Overwrite \n */
10336 Perl_croak(aTHX_ "%s", buf);
10339 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10341 #ifndef PERL_IN_XSUB_RE
10343 Perl_save_re_context(pTHX)
10347 struct re_save_state *state;
10349 SAVEVPTR(PL_curcop);
10350 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10352 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10353 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10354 SSPUSHUV(SAVEt_RE_STATE);
10356 Copy(&PL_reg_state, state, 1, struct re_save_state);
10358 PL_reg_start_tmp = 0;
10359 PL_reg_start_tmpl = 0;
10360 PL_reg_oldsaved = NULL;
10361 PL_reg_oldsavedlen = 0;
10362 PL_reg_maxiter = 0;
10363 PL_reg_leftiter = 0;
10364 PL_reg_poscache = NULL;
10365 PL_reg_poscache_size = 0;
10366 #ifdef PERL_OLD_COPY_ON_WRITE
10370 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10372 const REGEXP * const rx = PM_GETRE(PL_curpm);
10375 for (i = 1; i <= RX_NPARENS(rx); i++) {
10376 char digits[TYPE_CHARS(long)];
10377 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10378 GV *const *const gvp
10379 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10382 GV * const gv = *gvp;
10383 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10393 clear_re(pTHX_ void *r)
10396 ReREFCNT_dec((REGEXP *)r);
10402 S_put_byte(pTHX_ SV *sv, int c)
10404 PERL_ARGS_ASSERT_PUT_BYTE;
10406 /* Our definition of isPRINT() ignores locales, so only bytes that are
10407 not part of UTF-8 are considered printable. I assume that the same
10408 holds for UTF-EBCDIC.
10409 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10410 which Wikipedia says:
10412 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10413 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10414 identical, to the ASCII delete (DEL) or rubout control character.
10415 ) So the old condition can be simplified to !isPRINT(c) */
10418 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10421 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10425 const char string = c;
10426 if (c == '-' || c == ']' || c == '\\' || c == '^')
10427 sv_catpvs(sv, "\\");
10428 sv_catpvn(sv, &string, 1);
10433 #define CLEAR_OPTSTART \
10434 if (optstart) STMT_START { \
10435 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10439 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10441 STATIC const regnode *
10442 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10443 const regnode *last, const regnode *plast,
10444 SV* sv, I32 indent, U32 depth)
10447 register U8 op = PSEUDO; /* Arbitrary non-END op. */
10448 register const regnode *next;
10449 const regnode *optstart= NULL;
10451 RXi_GET_DECL(r,ri);
10452 GET_RE_DEBUG_FLAGS_DECL;
10454 PERL_ARGS_ASSERT_DUMPUNTIL;
10456 #ifdef DEBUG_DUMPUNTIL
10457 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10458 last ? last-start : 0,plast ? plast-start : 0);
10461 if (plast && plast < last)
10464 while (PL_regkind[op] != END && (!last || node < last)) {
10465 /* While that wasn't END last time... */
10468 if (op == CLOSE || op == WHILEM)
10470 next = regnext((regnode *)node);
10473 if (OP(node) == OPTIMIZED) {
10474 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10481 regprop(r, sv, node);
10482 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10483 (int)(2*indent + 1), "", SvPVX_const(sv));
10485 if (OP(node) != OPTIMIZED) {
10486 if (next == NULL) /* Next ptr. */
10487 PerlIO_printf(Perl_debug_log, " (0)");
10488 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10489 PerlIO_printf(Perl_debug_log, " (FAIL)");
10491 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10492 (void)PerlIO_putc(Perl_debug_log, '\n');
10496 if (PL_regkind[(U8)op] == BRANCHJ) {
10499 register const regnode *nnode = (OP(next) == LONGJMP
10500 ? regnext((regnode *)next)
10502 if (last && nnode > last)
10504 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10507 else if (PL_regkind[(U8)op] == BRANCH) {
10509 DUMPUNTIL(NEXTOPER(node), next);
10511 else if ( PL_regkind[(U8)op] == TRIE ) {
10512 const regnode *this_trie = node;
10513 const char op = OP(node);
10514 const U32 n = ARG(node);
10515 const reg_ac_data * const ac = op>=AHOCORASICK ?
10516 (reg_ac_data *)ri->data->data[n] :
10518 const reg_trie_data * const trie =
10519 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10521 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10523 const regnode *nextbranch= NULL;
10526 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10527 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10529 PerlIO_printf(Perl_debug_log, "%*s%s ",
10530 (int)(2*(indent+3)), "",
10531 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10532 PL_colors[0], PL_colors[1],
10533 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10534 PERL_PV_PRETTY_ELLIPSES |
10535 PERL_PV_PRETTY_LTGT
10540 U16 dist= trie->jump[word_idx+1];
10541 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10542 (UV)((dist ? this_trie + dist : next) - start));
10545 nextbranch= this_trie + trie->jump[0];
10546 DUMPUNTIL(this_trie + dist, nextbranch);
10548 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10549 nextbranch= regnext((regnode *)nextbranch);
10551 PerlIO_printf(Perl_debug_log, "\n");
10554 if (last && next > last)
10559 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10560 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10561 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10563 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10565 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10567 else if ( op == PLUS || op == STAR) {
10568 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10570 else if (op == ANYOF) {
10571 /* arglen 1 + class block */
10572 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10573 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10574 node = NEXTOPER(node);
10576 else if (PL_regkind[(U8)op] == EXACT) {
10577 /* Literal string, where present. */
10578 node += NODE_SZ_STR(node) - 1;
10579 node = NEXTOPER(node);
10582 node = NEXTOPER(node);
10583 node += regarglen[(U8)op];
10585 if (op == CURLYX || op == OPEN)
10589 #ifdef DEBUG_DUMPUNTIL
10590 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10595 #endif /* DEBUGGING */
10599 * c-indentation-style: bsd
10600 * c-basic-offset: 4
10601 * indent-tabs-mode: t
10604 * ex: set ts=8 sts=4 sw=4 noet: