5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
95 # if defined(BUGGY_MSC6)
96 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
97 # pragma optimize("a",off)
98 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
99 # pragma optimize("w",on )
100 # endif /* BUGGY_MSC6 */
104 #define STATIC static
107 typedef struct RExC_state_t {
108 U32 flags; /* are we folding, multilining? */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object pprivate field */
113 char *start; /* Start of input for compile */
114 char *end; /* End of input for compile */
115 char *parse; /* Input-scan pointer. */
116 I32 whilem_seen; /* number of WHILEM in this expr */
117 regnode *emit_start; /* Start of emitted-code area */
118 regnode *emit_bound; /* First regnode outside of the allocated space */
119 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
120 I32 naughty; /* How bad is this pattern? */
121 I32 sawback; /* Did we see \1, ...? */
123 I32 size; /* Code size. */
124 I32 npar; /* Capture buffer count, (OPEN). */
125 I32 cpar; /* Capture buffer count, (CLOSE). */
126 I32 nestroot; /* root parens we are in - used by accept */
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
137 HV *paren_names; /* Paren names */
139 regnode **recurse; /* Recurse regops */
140 I32 recurse_count; /* Number of recurse regops */
142 char *starttry; /* -Dr: where regtry was called. */
143 #define RExC_starttry (pRExC_state->starttry)
146 const char *lastparse;
148 AV *paren_name_list; /* idx -> name */
149 #define RExC_lastparse (pRExC_state->lastparse)
150 #define RExC_lastnum (pRExC_state->lastnum)
151 #define RExC_paren_name_list (pRExC_state->paren_name_list)
155 #define RExC_flags (pRExC_state->flags)
156 #define RExC_precomp (pRExC_state->precomp)
157 #define RExC_rx_sv (pRExC_state->rx_sv)
158 #define RExC_rx (pRExC_state->rx)
159 #define RExC_rxi (pRExC_state->rxi)
160 #define RExC_start (pRExC_state->start)
161 #define RExC_end (pRExC_state->end)
162 #define RExC_parse (pRExC_state->parse)
163 #define RExC_whilem_seen (pRExC_state->whilem_seen)
164 #ifdef RE_TRACK_PATTERN_OFFSETS
165 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
167 #define RExC_emit (pRExC_state->emit)
168 #define RExC_emit_start (pRExC_state->emit_start)
169 #define RExC_emit_bound (pRExC_state->emit_bound)
170 #define RExC_naughty (pRExC_state->naughty)
171 #define RExC_sawback (pRExC_state->sawback)
172 #define RExC_seen (pRExC_state->seen)
173 #define RExC_size (pRExC_state->size)
174 #define RExC_npar (pRExC_state->npar)
175 #define RExC_nestroot (pRExC_state->nestroot)
176 #define RExC_extralen (pRExC_state->extralen)
177 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
178 #define RExC_seen_evals (pRExC_state->seen_evals)
179 #define RExC_utf8 (pRExC_state->utf8)
180 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
181 #define RExC_open_parens (pRExC_state->open_parens)
182 #define RExC_close_parens (pRExC_state->close_parens)
183 #define RExC_opend (pRExC_state->opend)
184 #define RExC_paren_names (pRExC_state->paren_names)
185 #define RExC_recurse (pRExC_state->recurse)
186 #define RExC_recurse_count (pRExC_state->recurse_count)
189 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
190 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
191 ((*s) == '{' && regcurly(s)))
194 #undef SPSTART /* dratted cpp namespace... */
197 * Flags to be passed up and down.
199 #define WORST 0 /* Worst case. */
200 #define HASWIDTH 0x01 /* Known to match non-null strings. */
202 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
203 * character, and if utf8, must be invariant. */
205 #define SPSTART 0x04 /* Starts with * or +. */
206 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
207 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
209 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
211 /* whether trie related optimizations are enabled */
212 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
213 #define TRIE_STUDY_OPT
214 #define FULL_TRIE_STUDY
220 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
221 #define PBITVAL(paren) (1 << ((paren) & 7))
222 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
223 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
224 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
226 /* If not already in utf8, do a longjmp back to the beginning */
227 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
228 #define REQUIRE_UTF8 STMT_START { \
229 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
232 /* About scan_data_t.
234 During optimisation we recurse through the regexp program performing
235 various inplace (keyhole style) optimisations. In addition study_chunk
236 and scan_commit populate this data structure with information about
237 what strings MUST appear in the pattern. We look for the longest
238 string that must appear at a fixed location, and we look for the
239 longest string that may appear at a floating location. So for instance
244 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
245 strings (because they follow a .* construct). study_chunk will identify
246 both FOO and BAR as being the longest fixed and floating strings respectively.
248 The strings can be composites, for instance
252 will result in a composite fixed substring 'foo'.
254 For each string some basic information is maintained:
256 - offset or min_offset
257 This is the position the string must appear at, or not before.
258 It also implicitly (when combined with minlenp) tells us how many
259 characters must match before the string we are searching for.
260 Likewise when combined with minlenp and the length of the string it
261 tells us how many characters must appear after the string we have
265 Only used for floating strings. This is the rightmost point that
266 the string can appear at. If set to I32 max it indicates that the
267 string can occur infinitely far to the right.
270 A pointer to the minimum length of the pattern that the string
271 was found inside. This is important as in the case of positive
272 lookahead or positive lookbehind we can have multiple patterns
277 The minimum length of the pattern overall is 3, the minimum length
278 of the lookahead part is 3, but the minimum length of the part that
279 will actually match is 1. So 'FOO's minimum length is 3, but the
280 minimum length for the F is 1. This is important as the minimum length
281 is used to determine offsets in front of and behind the string being
282 looked for. Since strings can be composites this is the length of the
283 pattern at the time it was commited with a scan_commit. Note that
284 the length is calculated by study_chunk, so that the minimum lengths
285 are not known until the full pattern has been compiled, thus the
286 pointer to the value.
290 In the case of lookbehind the string being searched for can be
291 offset past the start point of the final matching string.
292 If this value was just blithely removed from the min_offset it would
293 invalidate some of the calculations for how many chars must match
294 before or after (as they are derived from min_offset and minlen and
295 the length of the string being searched for).
296 When the final pattern is compiled and the data is moved from the
297 scan_data_t structure into the regexp structure the information
298 about lookbehind is factored in, with the information that would
299 have been lost precalculated in the end_shift field for the
302 The fields pos_min and pos_delta are used to store the minimum offset
303 and the delta to the maximum offset at the current point in the pattern.
307 typedef struct scan_data_t {
308 /*I32 len_min; unused */
309 /*I32 len_delta; unused */
313 I32 last_end; /* min value, <0 unless valid. */
316 SV **longest; /* Either &l_fixed, or &l_float. */
317 SV *longest_fixed; /* longest fixed string found in pattern */
318 I32 offset_fixed; /* offset where it starts */
319 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
320 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
321 SV *longest_float; /* longest floating string found in pattern */
322 I32 offset_float_min; /* earliest point in string it can appear */
323 I32 offset_float_max; /* latest point in string it can appear */
324 I32 *minlen_float; /* pointer to the minlen relevent to the string */
325 I32 lookbehind_float; /* is the position of the string modified by LB */
329 struct regnode_charclass_class *start_class;
333 * Forward declarations for pregcomp()'s friends.
336 static const scan_data_t zero_scan_data =
337 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
339 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
340 #define SF_BEFORE_SEOL 0x0001
341 #define SF_BEFORE_MEOL 0x0002
342 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
343 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
346 # define SF_FIX_SHIFT_EOL (0+2)
347 # define SF_FL_SHIFT_EOL (0+4)
349 # define SF_FIX_SHIFT_EOL (+2)
350 # define SF_FL_SHIFT_EOL (+4)
353 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
354 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
356 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
357 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
358 #define SF_IS_INF 0x0040
359 #define SF_HAS_PAR 0x0080
360 #define SF_IN_PAR 0x0100
361 #define SF_HAS_EVAL 0x0200
362 #define SCF_DO_SUBSTR 0x0400
363 #define SCF_DO_STCLASS_AND 0x0800
364 #define SCF_DO_STCLASS_OR 0x1000
365 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
366 #define SCF_WHILEM_VISITED_POS 0x2000
368 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
369 #define SCF_SEEN_ACCEPT 0x8000
371 #define UTF cBOOL(RExC_utf8)
372 #define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE)
373 #define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE)
374 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
376 #define OOB_UNICODE 12345678
377 #define OOB_NAMEDCLASS -1
379 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
380 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
383 /* length of regex to show in messages that don't mark a position within */
384 #define RegexLengthToShowInErrorMessages 127
387 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
388 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
389 * op/pragma/warn/regcomp.
391 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
392 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
394 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
397 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
398 * arg. Show regex, up to a maximum length. If it's too long, chop and add
401 #define _FAIL(code) STMT_START { \
402 const char *ellipses = ""; \
403 IV len = RExC_end - RExC_precomp; \
406 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
407 if (len > RegexLengthToShowInErrorMessages) { \
408 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
409 len = RegexLengthToShowInErrorMessages - 10; \
415 #define FAIL(msg) _FAIL( \
416 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
417 msg, (int)len, RExC_precomp, ellipses))
419 #define FAIL2(msg,arg) _FAIL( \
420 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
421 arg, (int)len, RExC_precomp, ellipses))
424 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
426 #define Simple_vFAIL(m) STMT_START { \
427 const IV offset = RExC_parse - RExC_precomp; \
428 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
429 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
433 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
435 #define vFAIL(m) STMT_START { \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
442 * Like Simple_vFAIL(), but accepts two arguments.
444 #define Simple_vFAIL2(m,a1) STMT_START { \
445 const IV offset = RExC_parse - RExC_precomp; \
446 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
447 (int)offset, RExC_precomp, RExC_precomp + offset); \
451 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
453 #define vFAIL2(m,a1) STMT_START { \
455 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
456 Simple_vFAIL2(m, a1); \
461 * Like Simple_vFAIL(), but accepts three arguments.
463 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
464 const IV offset = RExC_parse - RExC_precomp; \
465 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
466 (int)offset, RExC_precomp, RExC_precomp + offset); \
470 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
472 #define vFAIL3(m,a1,a2) STMT_START { \
474 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
475 Simple_vFAIL3(m, a1, a2); \
479 * Like Simple_vFAIL(), but accepts four arguments.
481 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
482 const IV offset = RExC_parse - RExC_precomp; \
483 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
484 (int)offset, RExC_precomp, RExC_precomp + offset); \
487 #define ckWARNreg(loc,m) STMT_START { \
488 const IV offset = loc - RExC_precomp; \
489 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 (int)offset, RExC_precomp, RExC_precomp + offset); \
493 #define ckWARNregdep(loc,m) STMT_START { \
494 const IV offset = loc - RExC_precomp; \
495 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
500 #define ckWARN2reg(loc, m, a1) STMT_START { \
501 const IV offset = loc - RExC_precomp; \
502 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
503 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
506 #define vWARN3(loc, m, a1, a2) STMT_START { \
507 const IV offset = loc - RExC_precomp; \
508 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
509 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
512 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
513 const IV offset = loc - RExC_precomp; \
514 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
515 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
518 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
519 const IV offset = loc - RExC_precomp; \
520 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
524 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
527 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
530 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
531 const IV offset = loc - RExC_precomp; \
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
537 /* Allow for side effects in s */
538 #define REGC(c,s) STMT_START { \
539 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
542 /* Macros for recording node offsets. 20001227 mjd@plover.com
543 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
544 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
545 * Element 0 holds the number n.
546 * Position is 1 indexed.
548 #ifndef RE_TRACK_PATTERN_OFFSETS
549 #define Set_Node_Offset_To_R(node,byte)
550 #define Set_Node_Offset(node,byte)
551 #define Set_Cur_Node_Offset
552 #define Set_Node_Length_To_R(node,len)
553 #define Set_Node_Length(node,len)
554 #define Set_Node_Cur_Length(node)
555 #define Node_Offset(n)
556 #define Node_Length(n)
557 #define Set_Node_Offset_Length(node,offset,len)
558 #define ProgLen(ri) ri->u.proglen
559 #define SetProgLen(ri,x) ri->u.proglen = x
561 #define ProgLen(ri) ri->u.offsets[0]
562 #define SetProgLen(ri,x) ri->u.offsets[0] = x
563 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
565 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
566 __LINE__, (int)(node), (int)(byte))); \
568 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
570 RExC_offsets[2*(node)-1] = (byte); \
575 #define Set_Node_Offset(node,byte) \
576 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
577 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
579 #define Set_Node_Length_To_R(node,len) STMT_START { \
581 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
582 __LINE__, (int)(node), (int)(len))); \
584 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
586 RExC_offsets[2*(node)] = (len); \
591 #define Set_Node_Length(node,len) \
592 Set_Node_Length_To_R((node)-RExC_emit_start, len)
593 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
594 #define Set_Node_Cur_Length(node) \
595 Set_Node_Length(node, RExC_parse - parse_start)
597 /* Get offsets and lengths */
598 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
599 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
601 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
602 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
603 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
607 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
608 #define EXPERIMENTAL_INPLACESCAN
609 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
611 #define DEBUG_STUDYDATA(str,data,depth) \
612 DEBUG_OPTIMISE_MORE_r(if(data){ \
613 PerlIO_printf(Perl_debug_log, \
614 "%*s" str "Pos:%"IVdf"/%"IVdf \
615 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
616 (int)(depth)*2, "", \
617 (IV)((data)->pos_min), \
618 (IV)((data)->pos_delta), \
619 (UV)((data)->flags), \
620 (IV)((data)->whilem_c), \
621 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
622 is_inf ? "INF " : "" \
624 if ((data)->last_found) \
625 PerlIO_printf(Perl_debug_log, \
626 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
627 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
628 SvPVX_const((data)->last_found), \
629 (IV)((data)->last_end), \
630 (IV)((data)->last_start_min), \
631 (IV)((data)->last_start_max), \
632 ((data)->longest && \
633 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
634 SvPVX_const((data)->longest_fixed), \
635 (IV)((data)->offset_fixed), \
636 ((data)->longest && \
637 (data)->longest==&((data)->longest_float)) ? "*" : "", \
638 SvPVX_const((data)->longest_float), \
639 (IV)((data)->offset_float_min), \
640 (IV)((data)->offset_float_max) \
642 PerlIO_printf(Perl_debug_log,"\n"); \
645 static void clear_re(pTHX_ void *r);
647 /* Mark that we cannot extend a found fixed substring at this point.
648 Update the longest found anchored substring and the longest found
649 floating substrings if needed. */
652 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
654 const STRLEN l = CHR_SVLEN(data->last_found);
655 const STRLEN old_l = CHR_SVLEN(*data->longest);
656 GET_RE_DEBUG_FLAGS_DECL;
658 PERL_ARGS_ASSERT_SCAN_COMMIT;
660 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
661 SvSetMagicSV(*data->longest, data->last_found);
662 if (*data->longest == data->longest_fixed) {
663 data->offset_fixed = l ? data->last_start_min : data->pos_min;
664 if (data->flags & SF_BEFORE_EOL)
666 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
668 data->flags &= ~SF_FIX_BEFORE_EOL;
669 data->minlen_fixed=minlenp;
670 data->lookbehind_fixed=0;
672 else { /* *data->longest == data->longest_float */
673 data->offset_float_min = l ? data->last_start_min : data->pos_min;
674 data->offset_float_max = (l
675 ? data->last_start_max
676 : data->pos_min + data->pos_delta);
677 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
678 data->offset_float_max = I32_MAX;
679 if (data->flags & SF_BEFORE_EOL)
681 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
683 data->flags &= ~SF_FL_BEFORE_EOL;
684 data->minlen_float=minlenp;
685 data->lookbehind_float=0;
688 SvCUR_set(data->last_found, 0);
690 SV * const sv = data->last_found;
691 if (SvUTF8(sv) && SvMAGICAL(sv)) {
692 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
698 data->flags &= ~SF_BEFORE_EOL;
699 DEBUG_STUDYDATA("commit: ",data,0);
702 /* Can match anything (initialization) */
704 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
706 PERL_ARGS_ASSERT_CL_ANYTHING;
708 ANYOF_CLASS_ZERO(cl);
709 ANYOF_BITMAP_SETALL(cl);
710 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
712 cl->flags |= ANYOF_LOCALE;
713 cl->flags |= ANYOF_FOLD;
716 /* Can match anything (initialization) */
718 S_cl_is_anything(const struct regnode_charclass_class *cl)
722 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
724 for (value = 0; value <= ANYOF_MAX; value += 2)
725 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
727 if (!(cl->flags & ANYOF_UNICODE_ALL))
729 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
734 /* Can match anything (initialization) */
736 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
738 PERL_ARGS_ASSERT_CL_INIT;
740 Zero(cl, 1, struct regnode_charclass_class);
742 cl_anything(pRExC_state, cl);
746 S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
748 PERL_ARGS_ASSERT_CL_INIT_ZERO;
750 Zero(cl, 1, struct regnode_charclass_class);
752 cl_anything(pRExC_state, cl);
754 cl->flags |= ANYOF_LOCALE;
757 /* 'And' a given class with another one. Can create false positives */
758 /* We assume that cl is not inverted */
760 S_cl_and(struct regnode_charclass_class *cl,
761 const struct regnode_charclass_class *and_with)
763 PERL_ARGS_ASSERT_CL_AND;
765 assert(and_with->type == ANYOF);
766 if (!(and_with->flags & ANYOF_CLASS)
767 && !(cl->flags & ANYOF_CLASS)
768 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
769 && !(and_with->flags & ANYOF_FOLD)
770 && !(cl->flags & ANYOF_FOLD)) {
773 if (and_with->flags & ANYOF_INVERT)
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] &= ~and_with->bitmap[i];
777 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
778 cl->bitmap[i] &= and_with->bitmap[i];
779 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
780 if (!(and_with->flags & ANYOF_EOS))
781 cl->flags &= ~ANYOF_EOS;
783 if (!(and_with->flags & ANYOF_FOLD))
784 cl->flags &= ~ANYOF_FOLD;
786 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_NONBITMAP &&
787 !(and_with->flags & ANYOF_INVERT)) {
788 cl->flags &= ~ANYOF_UNICODE_ALL;
789 cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
792 ARG_SET(cl, ARG(and_with));
794 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
795 !(and_with->flags & ANYOF_INVERT))
796 cl->flags &= ~ANYOF_UNICODE_ALL;
797 if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
798 !(and_with->flags & ANYOF_INVERT))
799 cl->flags &= ~ANYOF_NONBITMAP;
802 /* 'OR' a given class with another one. Can create false positives */
803 /* We assume that cl is not inverted */
805 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
807 PERL_ARGS_ASSERT_CL_OR;
809 if (or_with->flags & ANYOF_INVERT) {
811 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
812 * <= (B1 | !B2) | (CL1 | !CL2)
813 * which is wasteful if CL2 is small, but we ignore CL2:
814 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
815 * XXXX Can we handle case-fold? Unclear:
816 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
817 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
819 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
820 && !(or_with->flags & ANYOF_FOLD)
821 && !(cl->flags & ANYOF_FOLD) ) {
824 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
825 cl->bitmap[i] |= ~or_with->bitmap[i];
826 } /* XXXX: logic is complicated otherwise */
828 cl_anything(pRExC_state, cl);
831 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
832 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
833 && (!(or_with->flags & ANYOF_FOLD)
834 || (cl->flags & ANYOF_FOLD)) ) {
837 /* OR char bitmap and class bitmap separately */
838 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
839 cl->bitmap[i] |= or_with->bitmap[i];
840 if (or_with->flags & ANYOF_CLASS) {
841 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
842 cl->classflags[i] |= or_with->classflags[i];
843 cl->flags |= ANYOF_CLASS;
846 else { /* XXXX: logic is complicated, leave it along for a moment. */
847 cl_anything(pRExC_state, cl);
850 if (or_with->flags & ANYOF_EOS)
851 cl->flags |= ANYOF_EOS;
853 if (or_with->flags & ANYOF_FOLD)
854 cl->flags |= ANYOF_FOLD;
856 /* If both nodes match something outside the bitmap, but what they match
857 * outside is not the same pointer, and hence not easily compared, give up
858 * and allow the start class to match everything outside the bitmap */
859 if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
860 ARG(cl) != ARG(or_with)) {
861 cl->flags |= ANYOF_UNICODE_ALL;
864 if (or_with->flags & ANYOF_UNICODE_ALL) {
865 cl->flags |= ANYOF_UNICODE_ALL;
869 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
870 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
871 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
872 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
877 dump_trie(trie,widecharmap,revcharmap)
878 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
879 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
881 These routines dump out a trie in a somewhat readable format.
882 The _interim_ variants are used for debugging the interim
883 tables that are used to generate the final compressed
884 representation which is what dump_trie expects.
886 Part of the reason for their existance is to provide a form
887 of documentation as to how the different representations function.
892 Dumps the final compressed table form of the trie to Perl_debug_log.
893 Used for debugging make_trie().
897 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
898 AV *revcharmap, U32 depth)
901 SV *sv=sv_newmortal();
902 int colwidth= widecharmap ? 6 : 4;
904 GET_RE_DEBUG_FLAGS_DECL;
906 PERL_ARGS_ASSERT_DUMP_TRIE;
908 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
909 (int)depth * 2 + 2,"",
910 "Match","Base","Ofs" );
912 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
913 SV ** const tmp = av_fetch( revcharmap, state, 0);
915 PerlIO_printf( Perl_debug_log, "%*s",
917 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
918 PL_colors[0], PL_colors[1],
919 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
920 PERL_PV_ESCAPE_FIRSTCHAR
925 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
926 (int)depth * 2 + 2,"");
928 for( state = 0 ; state < trie->uniquecharcount ; state++ )
929 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
930 PerlIO_printf( Perl_debug_log, "\n");
932 for( state = 1 ; state < trie->statecount ; state++ ) {
933 const U32 base = trie->states[ state ].trans.base;
935 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
937 if ( trie->states[ state ].wordnum ) {
938 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
940 PerlIO_printf( Perl_debug_log, "%6s", "" );
943 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
948 while( ( base + ofs < trie->uniquecharcount ) ||
949 ( base + ofs - trie->uniquecharcount < trie->lasttrans
950 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
953 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
955 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
956 if ( ( base + ofs >= trie->uniquecharcount ) &&
957 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
958 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
960 PerlIO_printf( Perl_debug_log, "%*"UVXf,
962 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
964 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
968 PerlIO_printf( Perl_debug_log, "]");
971 PerlIO_printf( Perl_debug_log, "\n" );
973 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
974 for (word=1; word <= trie->wordcount; word++) {
975 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
976 (int)word, (int)(trie->wordinfo[word].prev),
977 (int)(trie->wordinfo[word].len));
979 PerlIO_printf(Perl_debug_log, "\n" );
982 Dumps a fully constructed but uncompressed trie in list form.
983 List tries normally only are used for construction when the number of
984 possible chars (trie->uniquecharcount) is very high.
985 Used for debugging make_trie().
988 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
989 HV *widecharmap, AV *revcharmap, U32 next_alloc,
993 SV *sv=sv_newmortal();
994 int colwidth= widecharmap ? 6 : 4;
995 GET_RE_DEBUG_FLAGS_DECL;
997 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
999 /* print out the table precompression. */
1000 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1001 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1002 "------:-----+-----------------\n" );
1004 for( state=1 ; state < next_alloc ; state ++ ) {
1007 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1008 (int)depth * 2 + 2,"", (UV)state );
1009 if ( ! trie->states[ state ].wordnum ) {
1010 PerlIO_printf( Perl_debug_log, "%5s| ","");
1012 PerlIO_printf( Perl_debug_log, "W%4x| ",
1013 trie->states[ state ].wordnum
1016 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1017 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1019 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1021 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1022 PL_colors[0], PL_colors[1],
1023 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1024 PERL_PV_ESCAPE_FIRSTCHAR
1026 TRIE_LIST_ITEM(state,charid).forid,
1027 (UV)TRIE_LIST_ITEM(state,charid).newstate
1030 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1031 (int)((depth * 2) + 14), "");
1034 PerlIO_printf( Perl_debug_log, "\n");
1039 Dumps a fully constructed but uncompressed trie in table form.
1040 This is the normal DFA style state transition table, with a few
1041 twists to facilitate compression later.
1042 Used for debugging make_trie().
1045 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1046 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1051 SV *sv=sv_newmortal();
1052 int colwidth= widecharmap ? 6 : 4;
1053 GET_RE_DEBUG_FLAGS_DECL;
1055 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1058 print out the table precompression so that we can do a visual check
1059 that they are identical.
1062 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1064 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1065 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1067 PerlIO_printf( Perl_debug_log, "%*s",
1069 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1070 PL_colors[0], PL_colors[1],
1071 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1072 PERL_PV_ESCAPE_FIRSTCHAR
1078 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1080 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1081 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1084 PerlIO_printf( Perl_debug_log, "\n" );
1086 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1088 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1089 (int)depth * 2 + 2,"",
1090 (UV)TRIE_NODENUM( state ) );
1092 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1093 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1095 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1097 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1099 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1100 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1102 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1103 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1111 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1112 startbranch: the first branch in the whole branch sequence
1113 first : start branch of sequence of branch-exact nodes.
1114 May be the same as startbranch
1115 last : Thing following the last branch.
1116 May be the same as tail.
1117 tail : item following the branch sequence
1118 count : words in the sequence
1119 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1120 depth : indent depth
1122 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1124 A trie is an N'ary tree where the branches are determined by digital
1125 decomposition of the key. IE, at the root node you look up the 1st character and
1126 follow that branch repeat until you find the end of the branches. Nodes can be
1127 marked as "accepting" meaning they represent a complete word. Eg:
1131 would convert into the following structure. Numbers represent states, letters
1132 following numbers represent valid transitions on the letter from that state, if
1133 the number is in square brackets it represents an accepting state, otherwise it
1134 will be in parenthesis.
1136 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1140 (1) +-i->(6)-+-s->[7]
1142 +-s->(3)-+-h->(4)-+-e->[5]
1144 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1146 This shows that when matching against the string 'hers' we will begin at state 1
1147 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1148 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1149 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1150 single traverse. We store a mapping from accepting to state to which word was
1151 matched, and then when we have multiple possibilities we try to complete the
1152 rest of the regex in the order in which they occured in the alternation.
1154 The only prior NFA like behaviour that would be changed by the TRIE support is
1155 the silent ignoring of duplicate alternations which are of the form:
1157 / (DUPE|DUPE) X? (?{ ... }) Y /x
1159 Thus EVAL blocks following a trie may be called a different number of times with
1160 and without the optimisation. With the optimisations dupes will be silently
1161 ignored. This inconsistant behaviour of EVAL type nodes is well established as
1162 the following demonstrates:
1164 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1166 which prints out 'word' three times, but
1168 'words'=~/(word|word|word)(?{ print $1 })S/
1170 which doesnt print it out at all. This is due to other optimisations kicking in.
1172 Example of what happens on a structural level:
1174 The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1176 1: CURLYM[1] {1,32767}(18)
1187 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1188 and should turn into:
1190 1: CURLYM[1] {1,32767}(18)
1192 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1200 Cases where tail != last would be like /(?foo|bar)baz/:
1210 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1211 and would end up looking like:
1214 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1221 d = uvuni_to_utf8_flags(d, uv, 0);
1223 is the recommended Unicode-aware way of saying
1228 #define TRIE_STORE_REVCHAR \
1231 SV *zlopp = newSV(2); \
1232 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1233 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
1234 SvCUR_set(zlopp, kapow - flrbbbbb); \
1237 av_push(revcharmap, zlopp); \
1239 char ooooff = (char)uvc; \
1240 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1244 #define TRIE_READ_CHAR STMT_START { \
1248 if ( foldlen > 0 ) { \
1249 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1254 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1255 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1256 foldlen -= UNISKIP( uvc ); \
1257 scan = foldbuf + UNISKIP( uvc ); \
1260 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1270 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1271 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1272 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1273 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1275 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1276 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1277 TRIE_LIST_CUR( state )++; \
1280 #define TRIE_LIST_NEW(state) STMT_START { \
1281 Newxz( trie->states[ state ].trans.list, \
1282 4, reg_trie_trans_le ); \
1283 TRIE_LIST_CUR( state ) = 1; \
1284 TRIE_LIST_LEN( state ) = 4; \
1287 #define TRIE_HANDLE_WORD(state) STMT_START { \
1288 U16 dupe= trie->states[ state ].wordnum; \
1289 regnode * const noper_next = regnext( noper ); \
1292 /* store the word for dumping */ \
1294 if (OP(noper) != NOTHING) \
1295 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1297 tmp = newSVpvn_utf8( "", 0, UTF ); \
1298 av_push( trie_words, tmp ); \
1302 trie->wordinfo[curword].prev = 0; \
1303 trie->wordinfo[curword].len = wordlen; \
1304 trie->wordinfo[curword].accept = state; \
1306 if ( noper_next < tail ) { \
1308 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1309 trie->jump[curword] = (U16)(noper_next - convert); \
1311 jumper = noper_next; \
1313 nextbranch= regnext(cur); \
1317 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1318 /* chain, so that when the bits of chain are later */\
1319 /* linked together, the dups appear in the chain */\
1320 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1321 trie->wordinfo[dupe].prev = curword; \
1323 /* we haven't inserted this word yet. */ \
1324 trie->states[ state ].wordnum = curword; \
1329 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1330 ( ( base + charid >= ucharcount \
1331 && base + charid < ubound \
1332 && state == trie->trans[ base - ucharcount + charid ].check \
1333 && trie->trans[ base - ucharcount + charid ].next ) \
1334 ? trie->trans[ base - ucharcount + charid ].next \
1335 : ( state==1 ? special : 0 ) \
1339 #define MADE_JUMP_TRIE 2
1340 #define MADE_EXACT_TRIE 4
1343 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1346 /* first pass, loop through and scan words */
1347 reg_trie_data *trie;
1348 HV *widecharmap = NULL;
1349 AV *revcharmap = newAV();
1351 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1356 regnode *jumper = NULL;
1357 regnode *nextbranch = NULL;
1358 regnode *convert = NULL;
1359 U32 *prev_states; /* temp array mapping each state to previous one */
1360 /* we just use folder as a flag in utf8 */
1361 const U8 * folder = NULL;
1364 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1365 AV *trie_words = NULL;
1366 /* along with revcharmap, this only used during construction but both are
1367 * useful during debugging so we store them in the struct when debugging.
1370 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1371 STRLEN trie_charcount=0;
1373 SV *re_trie_maxbuff;
1374 GET_RE_DEBUG_FLAGS_DECL;
1376 PERL_ARGS_ASSERT_MAKE_TRIE;
1378 PERL_UNUSED_ARG(depth);
1382 case EXACTFU: folder = PL_fold_latin1; break;
1383 case EXACTF: folder = PL_fold; break;
1384 case EXACTFL: folder = PL_fold_locale; break;
1387 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1389 trie->startstate = 1;
1390 trie->wordcount = word_count;
1391 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1392 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1393 if (!(UTF && folder))
1394 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1395 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1396 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1399 trie_words = newAV();
1402 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1403 if (!SvIOK(re_trie_maxbuff)) {
1404 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1407 PerlIO_printf( Perl_debug_log,
1408 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1409 (int)depth * 2 + 2, "",
1410 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1411 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1415 /* Find the node we are going to overwrite */
1416 if ( first == startbranch && OP( last ) != BRANCH ) {
1417 /* whole branch chain */
1420 /* branch sub-chain */
1421 convert = NEXTOPER( first );
1424 /* -- First loop and Setup --
1426 We first traverse the branches and scan each word to determine if it
1427 contains widechars, and how many unique chars there are, this is
1428 important as we have to build a table with at least as many columns as we
1431 We use an array of integers to represent the character codes 0..255
1432 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1433 native representation of the character value as the key and IV's for the
1436 *TODO* If we keep track of how many times each character is used we can
1437 remap the columns so that the table compression later on is more
1438 efficient in terms of memory by ensuring the most common value is in the
1439 middle and the least common are on the outside. IMO this would be better
1440 than a most to least common mapping as theres a decent chance the most
1441 common letter will share a node with the least common, meaning the node
1442 will not be compressable. With a middle is most common approach the worst
1443 case is when we have the least common nodes twice.
1447 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1448 regnode * const noper = NEXTOPER( cur );
1449 const U8 *uc = (U8*)STRING( noper );
1450 const U8 * const e = uc + STR_LEN( noper );
1452 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1453 const U8 *scan = (U8*)NULL;
1454 U32 wordlen = 0; /* required init */
1456 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1458 if (OP(noper) == NOTHING) {
1462 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1463 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1464 regardless of encoding */
1466 for ( ; uc < e ; uc += len ) {
1467 TRIE_CHARCOUNT(trie)++;
1471 if ( !trie->charmap[ uvc ] ) {
1472 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1474 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
1478 /* store the codepoint in the bitmap, and its folded
1480 TRIE_BITMAP_SET(trie,uvc);
1482 /* store the folded codepoint */
1483 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1486 /* store first byte of utf8 representation of
1487 variant codepoints */
1488 if (! UNI_IS_INVARIANT(uvc)) {
1489 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1492 set_bit = 0; /* We've done our bit :-) */
1497 widecharmap = newHV();
1499 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1502 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1504 if ( !SvTRUE( *svpp ) ) {
1505 sv_setiv( *svpp, ++trie->uniquecharcount );
1510 if( cur == first ) {
1513 } else if (chars < trie->minlen) {
1515 } else if (chars > trie->maxlen) {
1519 } /* end first pass */
1520 DEBUG_TRIE_COMPILE_r(
1521 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1522 (int)depth * 2 + 2,"",
1523 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1524 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1525 (int)trie->minlen, (int)trie->maxlen )
1529 We now know what we are dealing with in terms of unique chars and
1530 string sizes so we can calculate how much memory a naive
1531 representation using a flat table will take. If it's over a reasonable
1532 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1533 conservative but potentially much slower representation using an array
1536 At the end we convert both representations into the same compressed
1537 form that will be used in regexec.c for matching with. The latter
1538 is a form that cannot be used to construct with but has memory
1539 properties similar to the list form and access properties similar
1540 to the table form making it both suitable for fast searches and
1541 small enough that its feasable to store for the duration of a program.
1543 See the comment in the code where the compressed table is produced
1544 inplace from the flat tabe representation for an explanation of how
1545 the compression works.
1550 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1553 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1555 Second Pass -- Array Of Lists Representation
1557 Each state will be represented by a list of charid:state records
1558 (reg_trie_trans_le) the first such element holds the CUR and LEN
1559 points of the allocated array. (See defines above).
1561 We build the initial structure using the lists, and then convert
1562 it into the compressed table form which allows faster lookups
1563 (but cant be modified once converted).
1566 STRLEN transcount = 1;
1568 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1569 "%*sCompiling trie using list compiler\n",
1570 (int)depth * 2 + 2, ""));
1572 trie->states = (reg_trie_state *)
1573 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1574 sizeof(reg_trie_state) );
1578 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1580 regnode * const noper = NEXTOPER( cur );
1581 U8 *uc = (U8*)STRING( noper );
1582 const U8 * const e = uc + STR_LEN( noper );
1583 U32 state = 1; /* required init */
1584 U16 charid = 0; /* sanity init */
1585 U8 *scan = (U8*)NULL; /* sanity init */
1586 STRLEN foldlen = 0; /* required init */
1587 U32 wordlen = 0; /* required init */
1588 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1590 if (OP(noper) != NOTHING) {
1591 for ( ; uc < e ; uc += len ) {
1596 charid = trie->charmap[ uvc ];
1598 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1602 charid=(U16)SvIV( *svpp );
1605 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1612 if ( !trie->states[ state ].trans.list ) {
1613 TRIE_LIST_NEW( state );
1615 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1616 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1617 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1622 newstate = next_alloc++;
1623 prev_states[newstate] = state;
1624 TRIE_LIST_PUSH( state, charid, newstate );
1629 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1633 TRIE_HANDLE_WORD(state);
1635 } /* end second pass */
1637 /* next alloc is the NEXT state to be allocated */
1638 trie->statecount = next_alloc;
1639 trie->states = (reg_trie_state *)
1640 PerlMemShared_realloc( trie->states,
1642 * sizeof(reg_trie_state) );
1644 /* and now dump it out before we compress it */
1645 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1646 revcharmap, next_alloc,
1650 trie->trans = (reg_trie_trans *)
1651 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1658 for( state=1 ; state < next_alloc ; state ++ ) {
1662 DEBUG_TRIE_COMPILE_MORE_r(
1663 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1667 if (trie->states[state].trans.list) {
1668 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1672 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1673 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1674 if ( forid < minid ) {
1676 } else if ( forid > maxid ) {
1680 if ( transcount < tp + maxid - minid + 1) {
1682 trie->trans = (reg_trie_trans *)
1683 PerlMemShared_realloc( trie->trans,
1685 * sizeof(reg_trie_trans) );
1686 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1688 base = trie->uniquecharcount + tp - minid;
1689 if ( maxid == minid ) {
1691 for ( ; zp < tp ; zp++ ) {
1692 if ( ! trie->trans[ zp ].next ) {
1693 base = trie->uniquecharcount + zp - minid;
1694 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1695 trie->trans[ zp ].check = state;
1701 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1702 trie->trans[ tp ].check = state;
1707 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1708 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1709 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1710 trie->trans[ tid ].check = state;
1712 tp += ( maxid - minid + 1 );
1714 Safefree(trie->states[ state ].trans.list);
1717 DEBUG_TRIE_COMPILE_MORE_r(
1718 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1721 trie->states[ state ].trans.base=base;
1723 trie->lasttrans = tp + 1;
1727 Second Pass -- Flat Table Representation.
1729 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1730 We know that we will need Charcount+1 trans at most to store the data
1731 (one row per char at worst case) So we preallocate both structures
1732 assuming worst case.
1734 We then construct the trie using only the .next slots of the entry
1737 We use the .check field of the first entry of the node temporarily to
1738 make compression both faster and easier by keeping track of how many non
1739 zero fields are in the node.
1741 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1744 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1745 number representing the first entry of the node, and state as a
1746 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1747 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1748 are 2 entrys per node. eg:
1756 The table is internally in the right hand, idx form. However as we also
1757 have to deal with the states array which is indexed by nodenum we have to
1758 use TRIE_NODENUM() to convert.
1761 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1762 "%*sCompiling trie using table compiler\n",
1763 (int)depth * 2 + 2, ""));
1765 trie->trans = (reg_trie_trans *)
1766 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1767 * trie->uniquecharcount + 1,
1768 sizeof(reg_trie_trans) );
1769 trie->states = (reg_trie_state *)
1770 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1771 sizeof(reg_trie_state) );
1772 next_alloc = trie->uniquecharcount + 1;
1775 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1777 regnode * const noper = NEXTOPER( cur );
1778 const U8 *uc = (U8*)STRING( noper );
1779 const U8 * const e = uc + STR_LEN( noper );
1781 U32 state = 1; /* required init */
1783 U16 charid = 0; /* sanity init */
1784 U32 accept_state = 0; /* sanity init */
1785 U8 *scan = (U8*)NULL; /* sanity init */
1787 STRLEN foldlen = 0; /* required init */
1788 U32 wordlen = 0; /* required init */
1789 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1791 if ( OP(noper) != NOTHING ) {
1792 for ( ; uc < e ; uc += len ) {
1797 charid = trie->charmap[ uvc ];
1799 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1800 charid = svpp ? (U16)SvIV(*svpp) : 0;
1804 if ( !trie->trans[ state + charid ].next ) {
1805 trie->trans[ state + charid ].next = next_alloc;
1806 trie->trans[ state ].check++;
1807 prev_states[TRIE_NODENUM(next_alloc)]
1808 = TRIE_NODENUM(state);
1809 next_alloc += trie->uniquecharcount;
1811 state = trie->trans[ state + charid ].next;
1813 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1815 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1818 accept_state = TRIE_NODENUM( state );
1819 TRIE_HANDLE_WORD(accept_state);
1821 } /* end second pass */
1823 /* and now dump it out before we compress it */
1824 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1826 next_alloc, depth+1));
1830 * Inplace compress the table.*
1832 For sparse data sets the table constructed by the trie algorithm will
1833 be mostly 0/FAIL transitions or to put it another way mostly empty.
1834 (Note that leaf nodes will not contain any transitions.)
1836 This algorithm compresses the tables by eliminating most such
1837 transitions, at the cost of a modest bit of extra work during lookup:
1839 - Each states[] entry contains a .base field which indicates the
1840 index in the state[] array wheres its transition data is stored.
1842 - If .base is 0 there are no valid transitions from that node.
1844 - If .base is nonzero then charid is added to it to find an entry in
1847 -If trans[states[state].base+charid].check!=state then the
1848 transition is taken to be a 0/Fail transition. Thus if there are fail
1849 transitions at the front of the node then the .base offset will point
1850 somewhere inside the previous nodes data (or maybe even into a node
1851 even earlier), but the .check field determines if the transition is
1855 The following process inplace converts the table to the compressed
1856 table: We first do not compress the root node 1,and mark all its
1857 .check pointers as 1 and set its .base pointer as 1 as well. This
1858 allows us to do a DFA construction from the compressed table later,
1859 and ensures that any .base pointers we calculate later are greater
1862 - We set 'pos' to indicate the first entry of the second node.
1864 - We then iterate over the columns of the node, finding the first and
1865 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1866 and set the .check pointers accordingly, and advance pos
1867 appropriately and repreat for the next node. Note that when we copy
1868 the next pointers we have to convert them from the original
1869 NODEIDX form to NODENUM form as the former is not valid post
1872 - If a node has no transitions used we mark its base as 0 and do not
1873 advance the pos pointer.
1875 - If a node only has one transition we use a second pointer into the
1876 structure to fill in allocated fail transitions from other states.
1877 This pointer is independent of the main pointer and scans forward
1878 looking for null transitions that are allocated to a state. When it
1879 finds one it writes the single transition into the "hole". If the
1880 pointer doesnt find one the single transition is appended as normal.
1882 - Once compressed we can Renew/realloc the structures to release the
1885 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1886 specifically Fig 3.47 and the associated pseudocode.
1890 const U32 laststate = TRIE_NODENUM( next_alloc );
1893 trie->statecount = laststate;
1895 for ( state = 1 ; state < laststate ; state++ ) {
1897 const U32 stateidx = TRIE_NODEIDX( state );
1898 const U32 o_used = trie->trans[ stateidx ].check;
1899 U32 used = trie->trans[ stateidx ].check;
1900 trie->trans[ stateidx ].check = 0;
1902 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1903 if ( flag || trie->trans[ stateidx + charid ].next ) {
1904 if ( trie->trans[ stateidx + charid ].next ) {
1906 for ( ; zp < pos ; zp++ ) {
1907 if ( ! trie->trans[ zp ].next ) {
1911 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1912 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1913 trie->trans[ zp ].check = state;
1914 if ( ++zp > pos ) pos = zp;
1921 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1923 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1924 trie->trans[ pos ].check = state;
1929 trie->lasttrans = pos + 1;
1930 trie->states = (reg_trie_state *)
1931 PerlMemShared_realloc( trie->states, laststate
1932 * sizeof(reg_trie_state) );
1933 DEBUG_TRIE_COMPILE_MORE_r(
1934 PerlIO_printf( Perl_debug_log,
1935 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1936 (int)depth * 2 + 2,"",
1937 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
1940 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1943 } /* end table compress */
1945 DEBUG_TRIE_COMPILE_MORE_r(
1946 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1947 (int)depth * 2 + 2, "",
1948 (UV)trie->statecount,
1949 (UV)trie->lasttrans)
1951 /* resize the trans array to remove unused space */
1952 trie->trans = (reg_trie_trans *)
1953 PerlMemShared_realloc( trie->trans, trie->lasttrans
1954 * sizeof(reg_trie_trans) );
1956 { /* Modify the program and insert the new TRIE node */
1957 U8 nodetype =(U8)(flags & 0xFF);
1961 regnode *optimize = NULL;
1962 #ifdef RE_TRACK_PATTERN_OFFSETS
1965 U32 mjd_nodelen = 0;
1966 #endif /* RE_TRACK_PATTERN_OFFSETS */
1967 #endif /* DEBUGGING */
1969 This means we convert either the first branch or the first Exact,
1970 depending on whether the thing following (in 'last') is a branch
1971 or not and whther first is the startbranch (ie is it a sub part of
1972 the alternation or is it the whole thing.)
1973 Assuming its a sub part we convert the EXACT otherwise we convert
1974 the whole branch sequence, including the first.
1976 /* Find the node we are going to overwrite */
1977 if ( first != startbranch || OP( last ) == BRANCH ) {
1978 /* branch sub-chain */
1979 NEXT_OFF( first ) = (U16)(last - first);
1980 #ifdef RE_TRACK_PATTERN_OFFSETS
1982 mjd_offset= Node_Offset((convert));
1983 mjd_nodelen= Node_Length((convert));
1986 /* whole branch chain */
1988 #ifdef RE_TRACK_PATTERN_OFFSETS
1991 const regnode *nop = NEXTOPER( convert );
1992 mjd_offset= Node_Offset((nop));
1993 mjd_nodelen= Node_Length((nop));
1997 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1998 (int)depth * 2 + 2, "",
1999 (UV)mjd_offset, (UV)mjd_nodelen)
2002 /* But first we check to see if there is a common prefix we can
2003 split out as an EXACT and put in front of the TRIE node. */
2004 trie->startstate= 1;
2005 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2007 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2011 const U32 base = trie->states[ state ].trans.base;
2013 if ( trie->states[state].wordnum )
2016 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2017 if ( ( base + ofs >= trie->uniquecharcount ) &&
2018 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2019 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2021 if ( ++count > 1 ) {
2022 SV **tmp = av_fetch( revcharmap, ofs, 0);
2023 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2024 if ( state == 1 ) break;
2026 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2028 PerlIO_printf(Perl_debug_log,
2029 "%*sNew Start State=%"UVuf" Class: [",
2030 (int)depth * 2 + 2, "",
2033 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2034 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2036 TRIE_BITMAP_SET(trie,*ch);
2038 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2040 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2044 TRIE_BITMAP_SET(trie,*ch);
2046 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2047 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2053 SV **tmp = av_fetch( revcharmap, idx, 0);
2055 char *ch = SvPV( *tmp, len );
2057 SV *sv=sv_newmortal();
2058 PerlIO_printf( Perl_debug_log,
2059 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2060 (int)depth * 2 + 2, "",
2062 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2063 PL_colors[0], PL_colors[1],
2064 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2065 PERL_PV_ESCAPE_FIRSTCHAR
2070 OP( convert ) = nodetype;
2071 str=STRING(convert);
2074 STR_LEN(convert) += len;
2080 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2085 trie->prefixlen = (state-1);
2087 regnode *n = convert+NODE_SZ_STR(convert);
2088 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2089 trie->startstate = state;
2090 trie->minlen -= (state - 1);
2091 trie->maxlen -= (state - 1);
2093 /* At least the UNICOS C compiler choked on this
2094 * being argument to DEBUG_r(), so let's just have
2097 #ifdef PERL_EXT_RE_BUILD
2103 regnode *fix = convert;
2104 U32 word = trie->wordcount;
2106 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2107 while( ++fix < n ) {
2108 Set_Node_Offset_Length(fix, 0, 0);
2111 SV ** const tmp = av_fetch( trie_words, word, 0 );
2113 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2114 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2116 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2124 NEXT_OFF(convert) = (U16)(tail - convert);
2125 DEBUG_r(optimize= n);
2131 if ( trie->maxlen ) {
2132 NEXT_OFF( convert ) = (U16)(tail - convert);
2133 ARG_SET( convert, data_slot );
2134 /* Store the offset to the first unabsorbed branch in
2135 jump[0], which is otherwise unused by the jump logic.
2136 We use this when dumping a trie and during optimisation. */
2138 trie->jump[0] = (U16)(nextbranch - convert);
2140 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2141 * and there is a bitmap
2142 * and the first "jump target" node we found leaves enough room
2143 * then convert the TRIE node into a TRIEC node, with the bitmap
2144 * embedded inline in the opcode - this is hypothetically faster.
2146 if ( !trie->states[trie->startstate].wordnum
2148 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2150 OP( convert ) = TRIEC;
2151 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2152 PerlMemShared_free(trie->bitmap);
2155 OP( convert ) = TRIE;
2157 /* store the type in the flags */
2158 convert->flags = nodetype;
2162 + regarglen[ OP( convert ) ];
2164 /* XXX We really should free up the resource in trie now,
2165 as we won't use them - (which resources?) dmq */
2167 /* needed for dumping*/
2168 DEBUG_r(if (optimize) {
2169 regnode *opt = convert;
2171 while ( ++opt < optimize) {
2172 Set_Node_Offset_Length(opt,0,0);
2175 Try to clean up some of the debris left after the
2178 while( optimize < jumper ) {
2179 mjd_nodelen += Node_Length((optimize));
2180 OP( optimize ) = OPTIMIZED;
2181 Set_Node_Offset_Length(optimize,0,0);
2184 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2186 } /* end node insert */
2188 /* Finish populating the prev field of the wordinfo array. Walk back
2189 * from each accept state until we find another accept state, and if
2190 * so, point the first word's .prev field at the second word. If the
2191 * second already has a .prev field set, stop now. This will be the
2192 * case either if we've already processed that word's accept state,
2193 * or that state had multiple words, and the overspill words were
2194 * already linked up earlier.
2201 for (word=1; word <= trie->wordcount; word++) {
2203 if (trie->wordinfo[word].prev)
2205 state = trie->wordinfo[word].accept;
2207 state = prev_states[state];
2210 prev = trie->states[state].wordnum;
2214 trie->wordinfo[word].prev = prev;
2216 Safefree(prev_states);
2220 /* and now dump out the compressed format */
2221 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2223 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2225 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2226 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2228 SvREFCNT_dec(revcharmap);
2232 : trie->startstate>1
2238 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2240 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2242 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2243 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2246 We find the fail state for each state in the trie, this state is the longest proper
2247 suffix of the current state's 'word' that is also a proper prefix of another word in our
2248 trie. State 1 represents the word '' and is thus the default fail state. This allows
2249 the DFA not to have to restart after its tried and failed a word at a given point, it
2250 simply continues as though it had been matching the other word in the first place.
2252 'abcdgu'=~/abcdefg|cdgu/
2253 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2254 fail, which would bring us to the state representing 'd' in the second word where we would
2255 try 'g' and succeed, proceeding to match 'cdgu'.
2257 /* add a fail transition */
2258 const U32 trie_offset = ARG(source);
2259 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2261 const U32 ucharcount = trie->uniquecharcount;
2262 const U32 numstates = trie->statecount;
2263 const U32 ubound = trie->lasttrans + ucharcount;
2267 U32 base = trie->states[ 1 ].trans.base;
2270 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2271 GET_RE_DEBUG_FLAGS_DECL;
2273 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2275 PERL_UNUSED_ARG(depth);
2279 ARG_SET( stclass, data_slot );
2280 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2281 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2282 aho->trie=trie_offset;
2283 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2284 Copy( trie->states, aho->states, numstates, reg_trie_state );
2285 Newxz( q, numstates, U32);
2286 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2289 /* initialize fail[0..1] to be 1 so that we always have
2290 a valid final fail state */
2291 fail[ 0 ] = fail[ 1 ] = 1;
2293 for ( charid = 0; charid < ucharcount ; charid++ ) {
2294 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2296 q[ q_write ] = newstate;
2297 /* set to point at the root */
2298 fail[ q[ q_write++ ] ]=1;
2301 while ( q_read < q_write) {
2302 const U32 cur = q[ q_read++ % numstates ];
2303 base = trie->states[ cur ].trans.base;
2305 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2306 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2308 U32 fail_state = cur;
2311 fail_state = fail[ fail_state ];
2312 fail_base = aho->states[ fail_state ].trans.base;
2313 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2315 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2316 fail[ ch_state ] = fail_state;
2317 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2319 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2321 q[ q_write++ % numstates] = ch_state;
2325 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2326 when we fail in state 1, this allows us to use the
2327 charclass scan to find a valid start char. This is based on the principle
2328 that theres a good chance the string being searched contains lots of stuff
2329 that cant be a start char.
2331 fail[ 0 ] = fail[ 1 ] = 0;
2332 DEBUG_TRIE_COMPILE_r({
2333 PerlIO_printf(Perl_debug_log,
2334 "%*sStclass Failtable (%"UVuf" states): 0",
2335 (int)(depth * 2), "", (UV)numstates
2337 for( q_read=1; q_read<numstates; q_read++ ) {
2338 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2340 PerlIO_printf(Perl_debug_log, "\n");
2343 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2348 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2349 * These need to be revisited when a newer toolchain becomes available.
2351 #if defined(__sparc64__) && defined(__GNUC__)
2352 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2353 # undef SPARC64_GCC_WORKAROUND
2354 # define SPARC64_GCC_WORKAROUND 1
2358 #define DEBUG_PEEP(str,scan,depth) \
2359 DEBUG_OPTIMISE_r({if (scan){ \
2360 SV * const mysv=sv_newmortal(); \
2361 regnode *Next = regnext(scan); \
2362 regprop(RExC_rx, mysv, scan); \
2363 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2364 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2365 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2372 #define JOIN_EXACT(scan,min,flags) \
2373 if (PL_regkind[OP(scan)] == EXACT) \
2374 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2377 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2378 /* Merge several consecutive EXACTish nodes into one. */
2379 regnode *n = regnext(scan);
2381 regnode *next = scan + NODE_SZ_STR(scan);
2385 regnode *stop = scan;
2386 GET_RE_DEBUG_FLAGS_DECL;
2388 PERL_UNUSED_ARG(depth);
2391 PERL_ARGS_ASSERT_JOIN_EXACT;
2392 #ifndef EXPERIMENTAL_INPLACESCAN
2393 PERL_UNUSED_ARG(flags);
2394 PERL_UNUSED_ARG(val);
2396 DEBUG_PEEP("join",scan,depth);
2398 /* Skip NOTHING, merge EXACT*. */
2400 ( PL_regkind[OP(n)] == NOTHING ||
2401 (stringok && (OP(n) == OP(scan))))
2403 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2405 if (OP(n) == TAIL || n > next)
2407 if (PL_regkind[OP(n)] == NOTHING) {
2408 DEBUG_PEEP("skip:",n,depth);
2409 NEXT_OFF(scan) += NEXT_OFF(n);
2410 next = n + NODE_STEP_REGNODE;
2417 else if (stringok) {
2418 const unsigned int oldl = STR_LEN(scan);
2419 regnode * const nnext = regnext(n);
2421 DEBUG_PEEP("merg",n,depth);
2424 if (oldl + STR_LEN(n) > U8_MAX)
2426 NEXT_OFF(scan) += NEXT_OFF(n);
2427 STR_LEN(scan) += STR_LEN(n);
2428 next = n + NODE_SZ_STR(n);
2429 /* Now we can overwrite *n : */
2430 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2438 #ifdef EXPERIMENTAL_INPLACESCAN
2439 if (flags && !NEXT_OFF(n)) {
2440 DEBUG_PEEP("atch", val, depth);
2441 if (reg_off_by_arg[OP(n)]) {
2442 ARG_SET(n, val - n);
2445 NEXT_OFF(n) = val - n;
2451 #define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
2452 #define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
2453 #define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
2454 #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
2457 && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
2458 && ( STR_LEN(scan) >= 6 ) )
2461 Two problematic code points in Unicode casefolding of EXACT nodes:
2463 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2464 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2470 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2471 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2473 This means that in case-insensitive matching (or "loose matching",
2474 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2475 length of the above casefolded versions) can match a target string
2476 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2477 This would rather mess up the minimum length computation.
2479 What we'll do is to look for the tail four bytes, and then peek
2480 at the preceding two bytes to see whether we need to decrease
2481 the minimum length by four (six minus two).
2483 Thanks to the design of UTF-8, there cannot be false matches:
2484 A sequence of valid UTF-8 bytes cannot be a subsequence of
2485 another valid sequence of UTF-8 bytes.
2488 char * const s0 = STRING(scan), *s, *t;
2489 char * const s1 = s0 + STR_LEN(scan) - 1;
2490 char * const s2 = s1 - 4;
2491 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2492 const char t0[] = "\xaf\x49\xaf\x42";
2494 const char t0[] = "\xcc\x88\xcc\x81";
2496 const char * const t1 = t0 + 3;
2499 s < s2 && (t = ninstr(s, s1, t0, t1));
2502 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2503 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2505 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2506 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
2514 n = scan + NODE_SZ_STR(scan);
2516 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2523 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2527 /* REx optimizer. Converts nodes into quickier variants "in place".
2528 Finds fixed substrings. */
2530 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2531 to the position after last scanned or to NULL. */
2533 #define INIT_AND_WITHP \
2534 assert(!and_withp); \
2535 Newx(and_withp,1,struct regnode_charclass_class); \
2536 SAVEFREEPV(and_withp)
2538 /* this is a chain of data about sub patterns we are processing that
2539 need to be handled seperately/specially in study_chunk. Its so
2540 we can simulate recursion without losing state. */
2542 typedef struct scan_frame {
2543 regnode *last; /* last node to process in this frame */
2544 regnode *next; /* next node to process when last is reached */
2545 struct scan_frame *prev; /*previous frame*/
2546 I32 stop; /* what stopparen do we use */
2550 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2552 #define CASE_SYNST_FNC(nAmE) \
2554 if (flags & SCF_DO_STCLASS_AND) { \
2555 for (value = 0; value < 256; value++) \
2556 if (!is_ ## nAmE ## _cp(value)) \
2557 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2560 for (value = 0; value < 256; value++) \
2561 if (is_ ## nAmE ## _cp(value)) \
2562 ANYOF_BITMAP_SET(data->start_class, value); \
2566 if (flags & SCF_DO_STCLASS_AND) { \
2567 for (value = 0; value < 256; value++) \
2568 if (is_ ## nAmE ## _cp(value)) \
2569 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2572 for (value = 0; value < 256; value++) \
2573 if (!is_ ## nAmE ## _cp(value)) \
2574 ANYOF_BITMAP_SET(data->start_class, value); \
2581 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2582 I32 *minlenp, I32 *deltap,
2587 struct regnode_charclass_class *and_withp,
2588 U32 flags, U32 depth)
2589 /* scanp: Start here (read-write). */
2590 /* deltap: Write maxlen-minlen here. */
2591 /* last: Stop before this one. */
2592 /* data: string data about the pattern */
2593 /* stopparen: treat close N as END */
2594 /* recursed: which subroutines have we recursed into */
2595 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
2598 I32 min = 0, pars = 0, code;
2599 regnode *scan = *scanp, *next;
2601 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
2602 int is_inf_internal = 0; /* The studied chunk is infinite */
2603 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2604 scan_data_t data_fake;
2605 SV *re_trie_maxbuff = NULL;
2606 regnode *first_non_open = scan;
2607 I32 stopmin = I32_MAX;
2608 scan_frame *frame = NULL;
2609 GET_RE_DEBUG_FLAGS_DECL;
2611 PERL_ARGS_ASSERT_STUDY_CHUNK;
2614 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
2618 while (first_non_open && OP(first_non_open) == OPEN)
2619 first_non_open=regnext(first_non_open);
2624 while ( scan && OP(scan) != END && scan < last ){
2625 /* Peephole optimizer: */
2626 DEBUG_STUDYDATA("Peep:", data,depth);
2627 DEBUG_PEEP("Peep",scan,depth);
2628 JOIN_EXACT(scan,&min,0);
2630 /* Follow the next-chain of the current node and optimize
2631 away all the NOTHINGs from it. */
2632 if (OP(scan) != CURLYX) {
2633 const int max = (reg_off_by_arg[OP(scan)]
2635 /* I32 may be smaller than U16 on CRAYs! */
2636 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2637 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2641 /* Skip NOTHING and LONGJMP. */
2642 while ((n = regnext(n))
2643 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2644 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2645 && off + noff < max)
2647 if (reg_off_by_arg[OP(scan)])
2650 NEXT_OFF(scan) = off;
2655 /* The principal pseudo-switch. Cannot be a switch, since we
2656 look into several different things. */
2657 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2658 || OP(scan) == IFTHEN) {
2659 next = regnext(scan);
2661 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2663 if (OP(next) == code || code == IFTHEN) {
2664 /* NOTE - There is similar code to this block below for handling
2665 TRIE nodes on a re-study. If you change stuff here check there
2667 I32 max1 = 0, min1 = I32_MAX, num = 0;
2668 struct regnode_charclass_class accum;
2669 regnode * const startbranch=scan;
2671 if (flags & SCF_DO_SUBSTR)
2672 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
2673 if (flags & SCF_DO_STCLASS)
2674 cl_init_zero(pRExC_state, &accum);
2676 while (OP(scan) == code) {
2677 I32 deltanext, minnext, f = 0, fake;
2678 struct regnode_charclass_class this_class;
2681 data_fake.flags = 0;
2683 data_fake.whilem_c = data->whilem_c;
2684 data_fake.last_closep = data->last_closep;
2687 data_fake.last_closep = &fake;
2689 data_fake.pos_delta = delta;
2690 next = regnext(scan);
2691 scan = NEXTOPER(scan);
2693 scan = NEXTOPER(scan);
2694 if (flags & SCF_DO_STCLASS) {
2695 cl_init(pRExC_state, &this_class);
2696 data_fake.start_class = &this_class;
2697 f = SCF_DO_STCLASS_AND;
2699 if (flags & SCF_WHILEM_VISITED_POS)
2700 f |= SCF_WHILEM_VISITED_POS;
2702 /* we suppose the run is continuous, last=next...*/
2703 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2705 stopparen, recursed, NULL, f,depth+1);
2708 if (max1 < minnext + deltanext)
2709 max1 = minnext + deltanext;
2710 if (deltanext == I32_MAX)
2711 is_inf = is_inf_internal = 1;
2713 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2715 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2716 if ( stopmin > minnext)
2717 stopmin = min + min1;
2718 flags &= ~SCF_DO_SUBSTR;
2720 data->flags |= SCF_SEEN_ACCEPT;
2723 if (data_fake.flags & SF_HAS_EVAL)
2724 data->flags |= SF_HAS_EVAL;
2725 data->whilem_c = data_fake.whilem_c;
2727 if (flags & SCF_DO_STCLASS)
2728 cl_or(pRExC_state, &accum, &this_class);
2730 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2732 if (flags & SCF_DO_SUBSTR) {
2733 data->pos_min += min1;
2734 data->pos_delta += max1 - min1;
2735 if (max1 != min1 || is_inf)
2736 data->longest = &(data->longest_float);
2739 delta += max1 - min1;
2740 if (flags & SCF_DO_STCLASS_OR) {
2741 cl_or(pRExC_state, data->start_class, &accum);
2743 cl_and(data->start_class, and_withp);
2744 flags &= ~SCF_DO_STCLASS;
2747 else if (flags & SCF_DO_STCLASS_AND) {
2749 cl_and(data->start_class, &accum);
2750 flags &= ~SCF_DO_STCLASS;
2753 /* Switch to OR mode: cache the old value of
2754 * data->start_class */
2756 StructCopy(data->start_class, and_withp,
2757 struct regnode_charclass_class);
2758 flags &= ~SCF_DO_STCLASS_AND;
2759 StructCopy(&accum, data->start_class,
2760 struct regnode_charclass_class);
2761 flags |= SCF_DO_STCLASS_OR;
2762 data->start_class->flags |= ANYOF_EOS;
2766 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2769 Assuming this was/is a branch we are dealing with: 'scan' now
2770 points at the item that follows the branch sequence, whatever
2771 it is. We now start at the beginning of the sequence and look
2778 which would be constructed from a pattern like /A|LIST|OF|WORDS/
2780 If we can find such a subseqence we need to turn the first
2781 element into a trie and then add the subsequent branch exact
2782 strings to the trie.
2786 1. patterns where the whole set of branches can be converted.
2788 2. patterns where only a subset can be converted.
2790 In case 1 we can replace the whole set with a single regop
2791 for the trie. In case 2 we need to keep the start and end
2794 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2795 becomes BRANCH TRIE; BRANCH X;
2797 There is an additional case, that being where there is a
2798 common prefix, which gets split out into an EXACT like node
2799 preceding the TRIE node.
2801 If x(1..n)==tail then we can do a simple trie, if not we make
2802 a "jump" trie, such that when we match the appropriate word
2803 we "jump" to the appopriate tail node. Essentailly we turn
2804 a nested if into a case structure of sorts.
2809 if (!re_trie_maxbuff) {
2810 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2811 if (!SvIOK(re_trie_maxbuff))
2812 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2814 if ( SvIV(re_trie_maxbuff)>=0 ) {
2816 regnode *first = (regnode *)NULL;
2817 regnode *last = (regnode *)NULL;
2818 regnode *tail = scan;
2823 SV * const mysv = sv_newmortal(); /* for dumping */
2825 /* var tail is used because there may be a TAIL
2826 regop in the way. Ie, the exacts will point to the
2827 thing following the TAIL, but the last branch will
2828 point at the TAIL. So we advance tail. If we
2829 have nested (?:) we may have to move through several
2833 while ( OP( tail ) == TAIL ) {
2834 /* this is the TAIL generated by (?:) */
2835 tail = regnext( tail );
2840 regprop(RExC_rx, mysv, tail );
2841 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2842 (int)depth * 2 + 2, "",
2843 "Looking for TRIE'able sequences. Tail node is: ",
2844 SvPV_nolen_const( mysv )
2850 step through the branches, cur represents each
2851 branch, noper is the first thing to be matched
2852 as part of that branch and noper_next is the
2853 regnext() of that node. if noper is an EXACT
2854 and noper_next is the same as scan (our current
2855 position in the regex) then the EXACT branch is
2856 a possible optimization target. Once we have
2857 two or more consequetive such branches we can
2858 create a trie of the EXACT's contents and stich
2859 it in place. If the sequence represents all of
2860 the branches we eliminate the whole thing and
2861 replace it with a single TRIE. If it is a
2862 subsequence then we need to stitch it in. This
2863 means the first branch has to remain, and needs
2864 to be repointed at the item on the branch chain
2865 following the last branch optimized. This could
2866 be either a BRANCH, in which case the
2867 subsequence is internal, or it could be the
2868 item following the branch sequence in which
2869 case the subsequence is at the end.
2873 /* dont use tail as the end marker for this traverse */
2874 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2875 regnode * const noper = NEXTOPER( cur );
2876 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2877 regnode * const noper_next = regnext( noper );
2881 regprop(RExC_rx, mysv, cur);
2882 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2883 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2885 regprop(RExC_rx, mysv, noper);
2886 PerlIO_printf( Perl_debug_log, " -> %s",
2887 SvPV_nolen_const(mysv));
2890 regprop(RExC_rx, mysv, noper_next );
2891 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2892 SvPV_nolen_const(mysv));
2894 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2895 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2897 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2898 : PL_regkind[ OP( noper ) ] == EXACT )
2899 || OP(noper) == NOTHING )
2901 && noper_next == tail
2906 if ( !first || optype == NOTHING ) {
2907 if (!first) first = cur;
2908 optype = OP( noper );
2914 Currently we do not believe that the trie logic can
2915 handle case insensitive matching properly when the
2916 pattern is not unicode (thus forcing unicode semantics).
2918 If/when this is fixed the following define can be swapped
2919 in below to fully enable trie logic.
2921 #define TRIE_TYPE_IS_SAFE 1
2924 #define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2926 if ( last && TRIE_TYPE_IS_SAFE ) {
2927 make_trie( pRExC_state,
2928 startbranch, first, cur, tail, count,
2931 if ( PL_regkind[ OP( noper ) ] == EXACT
2933 && noper_next == tail
2938 optype = OP( noper );
2948 regprop(RExC_rx, mysv, cur);
2949 PerlIO_printf( Perl_debug_log,
2950 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2951 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2955 if ( last && TRIE_TYPE_IS_SAFE ) {
2956 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
2957 #ifdef TRIE_STUDY_OPT
2958 if ( ((made == MADE_EXACT_TRIE &&
2959 startbranch == first)
2960 || ( first_non_open == first )) &&
2962 flags |= SCF_TRIE_RESTUDY;
2963 if ( startbranch == first
2966 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2976 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2977 scan = NEXTOPER(NEXTOPER(scan));
2978 } else /* single branch is optimized. */
2979 scan = NEXTOPER(scan);
2981 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2982 scan_frame *newframe = NULL;
2987 if (OP(scan) != SUSPEND) {
2988 /* set the pointer */
2989 if (OP(scan) == GOSUB) {
2991 RExC_recurse[ARG2L(scan)] = scan;
2992 start = RExC_open_parens[paren-1];
2993 end = RExC_close_parens[paren-1];
2996 start = RExC_rxi->program + 1;
3000 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3001 SAVEFREEPV(recursed);
3003 if (!PAREN_TEST(recursed,paren+1)) {
3004 PAREN_SET(recursed,paren+1);
3005 Newx(newframe,1,scan_frame);
3007 if (flags & SCF_DO_SUBSTR) {
3008 SCAN_COMMIT(pRExC_state,data,minlenp);
3009 data->longest = &(data->longest_float);
3011 is_inf = is_inf_internal = 1;
3012 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3013 cl_anything(pRExC_state, data->start_class);
3014 flags &= ~SCF_DO_STCLASS;
3017 Newx(newframe,1,scan_frame);
3020 end = regnext(scan);
3025 SAVEFREEPV(newframe);
3026 newframe->next = regnext(scan);
3027 newframe->last = last;
3028 newframe->stop = stopparen;
3029 newframe->prev = frame;
3039 else if (OP(scan) == EXACT) {
3040 I32 l = STR_LEN(scan);
3043 const U8 * const s = (U8*)STRING(scan);
3044 l = utf8_length(s, s + l);
3045 uc = utf8_to_uvchr(s, NULL);
3047 uc = *((U8*)STRING(scan));
3050 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3051 /* The code below prefers earlier match for fixed
3052 offset, later match for variable offset. */
3053 if (data->last_end == -1) { /* Update the start info. */
3054 data->last_start_min = data->pos_min;
3055 data->last_start_max = is_inf
3056 ? I32_MAX : data->pos_min + data->pos_delta;
3058 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3060 SvUTF8_on(data->last_found);
3062 SV * const sv = data->last_found;
3063 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3064 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3065 if (mg && mg->mg_len >= 0)
3066 mg->mg_len += utf8_length((U8*)STRING(scan),
3067 (U8*)STRING(scan)+STR_LEN(scan));
3069 data->last_end = data->pos_min + l;
3070 data->pos_min += l; /* As in the first entry. */
3071 data->flags &= ~SF_BEFORE_EOL;
3073 if (flags & SCF_DO_STCLASS_AND) {
3074 /* Check whether it is compatible with what we know already! */
3078 /* If compatibile, we or it in below. It is compatible if is
3079 * in the bitmp and either 1) its bit or its fold is set, or 2)
3080 * it's for a locale. Even if there isn't unicode semantics
3081 * here, at runtime there may be because of matching against a
3082 * utf8 string, so accept a possible false positive for
3083 * latin1-range folds */
3085 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3086 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3087 && (!(data->start_class->flags & ANYOF_FOLD)
3088 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3091 ANYOF_CLASS_ZERO(data->start_class);
3092 ANYOF_BITMAP_ZERO(data->start_class);
3094 ANYOF_BITMAP_SET(data->start_class, uc);
3095 data->start_class->flags &= ~ANYOF_EOS;
3097 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3099 else if (flags & SCF_DO_STCLASS_OR) {
3100 /* false positive possible if the class is case-folded */
3102 ANYOF_BITMAP_SET(data->start_class, uc);
3104 data->start_class->flags |= ANYOF_UNICODE_ALL;
3105 data->start_class->flags &= ~ANYOF_EOS;
3106 cl_and(data->start_class, and_withp);
3108 flags &= ~SCF_DO_STCLASS;
3110 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3111 I32 l = STR_LEN(scan);
3112 UV uc = *((U8*)STRING(scan));
3114 /* Search for fixed substrings supports EXACT only. */
3115 if (flags & SCF_DO_SUBSTR) {
3117 SCAN_COMMIT(pRExC_state, data, minlenp);
3120 const U8 * const s = (U8 *)STRING(scan);
3121 l = utf8_length(s, s + l);
3122 uc = utf8_to_uvchr(s, NULL);
3125 if (flags & SCF_DO_SUBSTR)
3127 if (flags & SCF_DO_STCLASS_AND) {
3128 /* Check whether it is compatible with what we know already! */
3131 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3132 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3133 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3137 ANYOF_CLASS_ZERO(data->start_class);
3138 ANYOF_BITMAP_ZERO(data->start_class);
3140 ANYOF_BITMAP_SET(data->start_class, uc);
3141 data->start_class->flags &= ~ANYOF_EOS;
3142 data->start_class->flags |= ANYOF_FOLD;
3143 if (OP(scan) == EXACTFL) {
3144 data->start_class->flags |= ANYOF_LOCALE;
3148 /* Also set the other member of the fold pair. In case
3149 * that unicode semantics is called for at runtime, use
3150 * the full latin1 fold. (Can't do this for locale,
3151 * because not known until runtime */
3152 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3156 else if (flags & SCF_DO_STCLASS_OR) {
3157 if (data->start_class->flags & ANYOF_FOLD) {
3158 /* false positive possible if the class is case-folded.
3159 Assume that the locale settings are the same... */
3161 ANYOF_BITMAP_SET(data->start_class, uc);
3162 if (OP(scan) != EXACTFL) {
3164 /* And set the other member of the fold pair, but
3165 * can't do that in locale because not known until
3167 ANYOF_BITMAP_SET(data->start_class,
3168 PL_fold_latin1[uc]);
3171 data->start_class->flags &= ~ANYOF_EOS;
3173 cl_and(data->start_class, and_withp);
3175 flags &= ~SCF_DO_STCLASS;
3177 else if (REGNODE_VARIES(OP(scan))) {
3178 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3179 I32 f = flags, pos_before = 0;
3180 regnode * const oscan = scan;
3181 struct regnode_charclass_class this_class;
3182 struct regnode_charclass_class *oclass = NULL;
3183 I32 next_is_eval = 0;
3185 switch (PL_regkind[OP(scan)]) {
3186 case WHILEM: /* End of (?:...)* . */
3187 scan = NEXTOPER(scan);
3190 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3191 next = NEXTOPER(scan);
3192 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3194 maxcount = REG_INFTY;
3195 next = regnext(scan);
3196 scan = NEXTOPER(scan);
3200 if (flags & SCF_DO_SUBSTR)
3205 if (flags & SCF_DO_STCLASS) {
3207 maxcount = REG_INFTY;
3208 next = regnext(scan);
3209 scan = NEXTOPER(scan);
3212 is_inf = is_inf_internal = 1;
3213 scan = regnext(scan);
3214 if (flags & SCF_DO_SUBSTR) {
3215 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3216 data->longest = &(data->longest_float);
3218 goto optimize_curly_tail;
3220 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3221 && (scan->flags == stopparen))
3226 mincount = ARG1(scan);
3227 maxcount = ARG2(scan);
3229 next = regnext(scan);
3230 if (OP(scan) == CURLYX) {
3231 I32 lp = (data ? *(data->last_closep) : 0);
3232 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3234 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3235 next_is_eval = (OP(scan) == EVAL);
3237 if (flags & SCF_DO_SUBSTR) {
3238 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3239 pos_before = data->pos_min;
3243 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3245 data->flags |= SF_IS_INF;
3247 if (flags & SCF_DO_STCLASS) {
3248 cl_init(pRExC_state, &this_class);
3249 oclass = data->start_class;
3250 data->start_class = &this_class;
3251 f |= SCF_DO_STCLASS_AND;
3252 f &= ~SCF_DO_STCLASS_OR;
3254 /* Exclude from super-linear cache processing any {n,m}
3255 regops for which the combination of input pos and regex
3256 pos is not enough information to determine if a match
3259 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3260 regex pos at the \s*, the prospects for a match depend not
3261 only on the input position but also on how many (bar\s*)
3262 repeats into the {4,8} we are. */
3263 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3264 f &= ~SCF_WHILEM_VISITED_POS;
3266 /* This will finish on WHILEM, setting scan, or on NULL: */
3267 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3268 last, data, stopparen, recursed, NULL,
3270 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3272 if (flags & SCF_DO_STCLASS)
3273 data->start_class = oclass;
3274 if (mincount == 0 || minnext == 0) {
3275 if (flags & SCF_DO_STCLASS_OR) {
3276 cl_or(pRExC_state, data->start_class, &this_class);
3278 else if (flags & SCF_DO_STCLASS_AND) {
3279 /* Switch to OR mode: cache the old value of
3280 * data->start_class */
3282 StructCopy(data->start_class, and_withp,
3283 struct regnode_charclass_class);
3284 flags &= ~SCF_DO_STCLASS_AND;
3285 StructCopy(&this_class, data->start_class,
3286 struct regnode_charclass_class);
3287 flags |= SCF_DO_STCLASS_OR;
3288 data->start_class->flags |= ANYOF_EOS;
3290 } else { /* Non-zero len */
3291 if (flags & SCF_DO_STCLASS_OR) {
3292 cl_or(pRExC_state, data->start_class, &this_class);
3293 cl_and(data->start_class, and_withp);
3295 else if (flags & SCF_DO_STCLASS_AND)
3296 cl_and(data->start_class, &this_class);
3297 flags &= ~SCF_DO_STCLASS;
3299 if (!scan) /* It was not CURLYX, but CURLY. */
3301 if ( /* ? quantifier ok, except for (?{ ... }) */
3302 (next_is_eval || !(mincount == 0 && maxcount == 1))
3303 && (minnext == 0) && (deltanext == 0)
3304 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3305 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3307 ckWARNreg(RExC_parse,
3308 "Quantifier unexpected on zero-length expression");
3311 min += minnext * mincount;
3312 is_inf_internal |= ((maxcount == REG_INFTY
3313 && (minnext + deltanext) > 0)
3314 || deltanext == I32_MAX);
3315 is_inf |= is_inf_internal;
3316 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3318 /* Try powerful optimization CURLYX => CURLYN. */
3319 if ( OP(oscan) == CURLYX && data
3320 && data->flags & SF_IN_PAR
3321 && !(data->flags & SF_HAS_EVAL)
3322 && !deltanext && minnext == 1 ) {
3323 /* Try to optimize to CURLYN. */
3324 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3325 regnode * const nxt1 = nxt;
3332 if (!REGNODE_SIMPLE(OP(nxt))
3333 && !(PL_regkind[OP(nxt)] == EXACT
3334 && STR_LEN(nxt) == 1))
3340 if (OP(nxt) != CLOSE)
3342 if (RExC_open_parens) {
3343 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3344 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3346 /* Now we know that nxt2 is the only contents: */
3347 oscan->flags = (U8)ARG(nxt);
3349 OP(nxt1) = NOTHING; /* was OPEN. */
3352 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3353 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3354 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3355 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3356 OP(nxt + 1) = OPTIMIZED; /* was count. */
3357 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3362 /* Try optimization CURLYX => CURLYM. */
3363 if ( OP(oscan) == CURLYX && data
3364 && !(data->flags & SF_HAS_PAR)
3365 && !(data->flags & SF_HAS_EVAL)
3366 && !deltanext /* atom is fixed width */
3367 && minnext != 0 /* CURLYM can't handle zero width */
3369 /* XXXX How to optimize if data == 0? */
3370 /* Optimize to a simpler form. */
3371 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3375 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3376 && (OP(nxt2) != WHILEM))
3378 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3379 /* Need to optimize away parenths. */
3380 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3381 /* Set the parenth number. */
3382 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3384 oscan->flags = (U8)ARG(nxt);
3385 if (RExC_open_parens) {
3386 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3387 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3389 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3390 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3393 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3394 OP(nxt + 1) = OPTIMIZED; /* was count. */
3395 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3396 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
3399 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3400 regnode *nnxt = regnext(nxt1);
3402 if (reg_off_by_arg[OP(nxt1)])
3403 ARG_SET(nxt1, nxt2 - nxt1);
3404 else if (nxt2 - nxt1 < U16_MAX)
3405 NEXT_OFF(nxt1) = nxt2 - nxt1;
3407 OP(nxt) = NOTHING; /* Cannot beautify */
3412 /* Optimize again: */
3413 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3414 NULL, stopparen, recursed, NULL, 0,depth+1);
3419 else if ((OP(oscan) == CURLYX)
3420 && (flags & SCF_WHILEM_VISITED_POS)
3421 /* See the comment on a similar expression above.
3422 However, this time it's not a subexpression
3423 we care about, but the expression itself. */
3424 && (maxcount == REG_INFTY)
3425 && data && ++data->whilem_c < 16) {
3426 /* This stays as CURLYX, we can put the count/of pair. */
3427 /* Find WHILEM (as in regexec.c) */
3428 regnode *nxt = oscan + NEXT_OFF(oscan);
3430 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3432 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3433 | (RExC_whilem_seen << 4)); /* On WHILEM */
3435 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3437 if (flags & SCF_DO_SUBSTR) {
3438 SV *last_str = NULL;
3439 int counted = mincount != 0;
3441 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3442 #if defined(SPARC64_GCC_WORKAROUND)
3445 const char *s = NULL;
3448 if (pos_before >= data->last_start_min)
3451 b = data->last_start_min;
3454 s = SvPV_const(data->last_found, l);
3455 old = b - data->last_start_min;
3458 I32 b = pos_before >= data->last_start_min
3459 ? pos_before : data->last_start_min;
3461 const char * const s = SvPV_const(data->last_found, l);
3462 I32 old = b - data->last_start_min;
3466 old = utf8_hop((U8*)s, old) - (U8*)s;
3468 /* Get the added string: */
3469 last_str = newSVpvn_utf8(s + old, l, UTF);
3470 if (deltanext == 0 && pos_before == b) {
3471 /* What was added is a constant string */
3473 SvGROW(last_str, (mincount * l) + 1);
3474 repeatcpy(SvPVX(last_str) + l,
3475 SvPVX_const(last_str), l, mincount - 1);
3476 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3477 /* Add additional parts. */
3478 SvCUR_set(data->last_found,
3479 SvCUR(data->last_found) - l);
3480 sv_catsv(data->last_found, last_str);
3482 SV * sv = data->last_found;
3484 SvUTF8(sv) && SvMAGICAL(sv) ?
3485 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3486 if (mg && mg->mg_len >= 0)
3487 mg->mg_len += CHR_SVLEN(last_str) - l;
3489 data->last_end += l * (mincount - 1);
3492 /* start offset must point into the last copy */
3493 data->last_start_min += minnext * (mincount - 1);
3494 data->last_start_max += is_inf ? I32_MAX
3495 : (maxcount - 1) * (minnext + data->pos_delta);
3498 /* It is counted once already... */
3499 data->pos_min += minnext * (mincount - counted);
3500 data->pos_delta += - counted * deltanext +
3501 (minnext + deltanext) * maxcount - minnext * mincount;
3502 if (mincount != maxcount) {
3503 /* Cannot extend fixed substrings found inside
3505 SCAN_COMMIT(pRExC_state,data,minlenp);
3506 if (mincount && last_str) {
3507 SV * const sv = data->last_found;
3508 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3509 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3513 sv_setsv(sv, last_str);
3514 data->last_end = data->pos_min;
3515 data->last_start_min =
3516 data->pos_min - CHR_SVLEN(last_str);
3517 data->last_start_max = is_inf
3519 : data->pos_min + data->pos_delta
3520 - CHR_SVLEN(last_str);
3522 data->longest = &(data->longest_float);
3524 SvREFCNT_dec(last_str);
3526 if (data && (fl & SF_HAS_EVAL))
3527 data->flags |= SF_HAS_EVAL;
3528 optimize_curly_tail:
3529 if (OP(oscan) != CURLYX) {
3530 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3532 NEXT_OFF(oscan) += NEXT_OFF(next);
3535 default: /* REF and CLUMP only? */
3536 if (flags & SCF_DO_SUBSTR) {
3537 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3538 data->longest = &(data->longest_float);
3540 is_inf = is_inf_internal = 1;
3541 if (flags & SCF_DO_STCLASS_OR)
3542 cl_anything(pRExC_state, data->start_class);
3543 flags &= ~SCF_DO_STCLASS;
3547 else if (OP(scan) == LNBREAK) {
3548 if (flags & SCF_DO_STCLASS) {
3550 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3551 if (flags & SCF_DO_STCLASS_AND) {
3552 for (value = 0; value < 256; value++)
3553 if (!is_VERTWS_cp(value))
3554 ANYOF_BITMAP_CLEAR(data->start_class, value);
3557 for (value = 0; value < 256; value++)
3558 if (is_VERTWS_cp(value))
3559 ANYOF_BITMAP_SET(data->start_class, value);
3561 if (flags & SCF_DO_STCLASS_OR)
3562 cl_and(data->start_class, and_withp);
3563 flags &= ~SCF_DO_STCLASS;
3567 if (flags & SCF_DO_SUBSTR) {
3568 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3570 data->pos_delta += 1;
3571 data->longest = &(data->longest_float);
3574 else if (OP(scan) == FOLDCHAR) {
3575 int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
3576 flags &= ~SCF_DO_STCLASS;
3579 if (flags & SCF_DO_SUBSTR) {
3580 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
3582 data->pos_delta += d;
3583 data->longest = &(data->longest_float);
3586 else if (REGNODE_SIMPLE(OP(scan))) {
3589 if (flags & SCF_DO_SUBSTR) {
3590 SCAN_COMMIT(pRExC_state,data,minlenp);
3594 if (flags & SCF_DO_STCLASS) {
3595 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3597 /* Some of the logic below assumes that switching
3598 locale on will only add false positives. */
3599 switch (PL_regkind[OP(scan)]) {
3603 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3604 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3605 cl_anything(pRExC_state, data->start_class);
3608 if (OP(scan) == SANY)
3610 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3611 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3612 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3613 cl_anything(pRExC_state, data->start_class);
3615 if (flags & SCF_DO_STCLASS_AND || !value)
3616 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3619 if (flags & SCF_DO_STCLASS_AND)
3620 cl_and(data->start_class,
3621 (struct regnode_charclass_class*)scan);
3623 cl_or(pRExC_state, data->start_class,
3624 (struct regnode_charclass_class*)scan);
3627 if (flags & SCF_DO_STCLASS_AND) {
3628 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3629 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3630 if (FLAGS(scan) & USE_UNI) {
3631 for (value = 0; value < 256; value++) {
3632 if (!isWORDCHAR_L1(value)) {
3633 ANYOF_BITMAP_CLEAR(data->start_class, value);
3637 for (value = 0; value < 256; value++) {
3638 if (!isALNUM(value)) {
3639 ANYOF_BITMAP_CLEAR(data->start_class, value);
3646 if (data->start_class->flags & ANYOF_LOCALE)
3647 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3648 else if (FLAGS(scan) & USE_UNI) {
3649 for (value = 0; value < 256; value++) {
3650 if (isWORDCHAR_L1(value)) {
3651 ANYOF_BITMAP_SET(data->start_class, value);
3655 for (value = 0; value < 256; value++) {
3656 if (isALNUM(value)) {
3657 ANYOF_BITMAP_SET(data->start_class, value);
3664 if (flags & SCF_DO_STCLASS_AND) {
3665 if (data->start_class->flags & ANYOF_LOCALE)
3666 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3669 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3670 data->start_class->flags |= ANYOF_LOCALE;
3674 if (flags & SCF_DO_STCLASS_AND) {
3675 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3676 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3677 if (FLAGS(scan) & USE_UNI) {
3678 for (value = 0; value < 256; value++) {
3679 if (isWORDCHAR_L1(value)) {
3680 ANYOF_BITMAP_CLEAR(data->start_class, value);
3684 for (value = 0; value < 256; value++) {
3685 if (isALNUM(value)) {
3686 ANYOF_BITMAP_CLEAR(data->start_class, value);
3693 if (data->start_class->flags & ANYOF_LOCALE)
3694 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3696 for (value = 0; value < 256; value++)
3697 if (!isALNUM(value))
3698 ANYOF_BITMAP_SET(data->start_class, value);
3703 if (flags & SCF_DO_STCLASS_AND) {
3704 if (data->start_class->flags & ANYOF_LOCALE)
3705 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3708 data->start_class->flags |= ANYOF_LOCALE;
3709 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3713 if (flags & SCF_DO_STCLASS_AND) {
3714 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3715 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3716 if (FLAGS(scan) & USE_UNI) {
3717 for (value = 0; value < 256; value++) {
3718 if (!isSPACE_L1(value)) {
3719 ANYOF_BITMAP_CLEAR(data->start_class, value);
3723 for (value = 0; value < 256; value++) {
3724 if (!isSPACE(value)) {
3725 ANYOF_BITMAP_CLEAR(data->start_class, value);
3732 if (data->start_class->flags & ANYOF_LOCALE) {
3733 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3735 else if (FLAGS(scan) & USE_UNI) {
3736 for (value = 0; value < 256; value++) {
3737 if (isSPACE_L1(value)) {
3738 ANYOF_BITMAP_SET(data->start_class, value);
3742 for (value = 0; value < 256; value++) {
3743 if (isSPACE(value)) {
3744 ANYOF_BITMAP_SET(data->start_class, value);
3751 if (flags & SCF_DO_STCLASS_AND) {
3752 if (data->start_class->flags & ANYOF_LOCALE)
3753 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3756 data->start_class->flags |= ANYOF_LOCALE;
3757 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3761 if (flags & SCF_DO_STCLASS_AND) {
3762 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3763 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3764 if (FLAGS(scan) & USE_UNI) {
3765 for (value = 0; value < 256; value++) {
3766 if (isSPACE_L1(value)) {
3767 ANYOF_BITMAP_CLEAR(data->start_class, value);
3771 for (value = 0; value < 256; value++) {
3772 if (isSPACE(value)) {
3773 ANYOF_BITMAP_CLEAR(data->start_class, value);
3780 if (data->start_class->flags & ANYOF_LOCALE)
3781 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3782 else if (FLAGS(scan) & USE_UNI) {
3783 for (value = 0; value < 256; value++) {
3784 if (!isSPACE_L1(value)) {
3785 ANYOF_BITMAP_SET(data->start_class, value);
3790 for (value = 0; value < 256; value++) {
3791 if (!isSPACE(value)) {
3792 ANYOF_BITMAP_SET(data->start_class, value);
3799 if (flags & SCF_DO_STCLASS_AND) {
3800 if (data->start_class->flags & ANYOF_LOCALE) {
3801 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3802 for (value = 0; value < 256; value++)
3803 if (!isSPACE(value))
3804 ANYOF_BITMAP_CLEAR(data->start_class, value);
3808 data->start_class->flags |= ANYOF_LOCALE;
3809 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3813 if (flags & SCF_DO_STCLASS_AND) {
3814 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3815 for (value = 0; value < 256; value++)
3816 if (!isDIGIT(value))
3817 ANYOF_BITMAP_CLEAR(data->start_class, value);
3820 if (data->start_class->flags & ANYOF_LOCALE)
3821 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3823 for (value = 0; value < 256; value++)
3825 ANYOF_BITMAP_SET(data->start_class, value);
3830 if (flags & SCF_DO_STCLASS_AND) {
3831 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3832 for (value = 0; value < 256; value++)
3834 ANYOF_BITMAP_CLEAR(data->start_class, value);
3837 if (data->start_class->flags & ANYOF_LOCALE)
3838 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3840 for (value = 0; value < 256; value++)
3841 if (!isDIGIT(value))
3842 ANYOF_BITMAP_SET(data->start_class, value);
3846 CASE_SYNST_FNC(VERTWS);
3847 CASE_SYNST_FNC(HORIZWS);
3850 if (flags & SCF_DO_STCLASS_OR)
3851 cl_and(data->start_class, and_withp);
3852 flags &= ~SCF_DO_STCLASS;
3855 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3856 data->flags |= (OP(scan) == MEOL
3860 else if ( PL_regkind[OP(scan)] == BRANCHJ
3861 /* Lookbehind, or need to calculate parens/evals/stclass: */
3862 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3863 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3864 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3865 || OP(scan) == UNLESSM )
3867 /* Negative Lookahead/lookbehind
3868 In this case we can't do fixed string optimisation.
3871 I32 deltanext, minnext, fake = 0;
3873 struct regnode_charclass_class intrnl;
3876 data_fake.flags = 0;
3878 data_fake.whilem_c = data->whilem_c;
3879 data_fake.last_closep = data->last_closep;
3882 data_fake.last_closep = &fake;
3883 data_fake.pos_delta = delta;
3884 if ( flags & SCF_DO_STCLASS && !scan->flags
3885 && OP(scan) == IFMATCH ) { /* Lookahead */
3886 cl_init(pRExC_state, &intrnl);
3887 data_fake.start_class = &intrnl;
3888 f |= SCF_DO_STCLASS_AND;
3890 if (flags & SCF_WHILEM_VISITED_POS)
3891 f |= SCF_WHILEM_VISITED_POS;
3892 next = regnext(scan);
3893 nscan = NEXTOPER(NEXTOPER(scan));
3894 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3895 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3898 FAIL("Variable length lookbehind not implemented");
3900 else if (minnext > (I32)U8_MAX) {
3901 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3903 scan->flags = (U8)minnext;
3906 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3908 if (data_fake.flags & SF_HAS_EVAL)
3909 data->flags |= SF_HAS_EVAL;
3910 data->whilem_c = data_fake.whilem_c;
3912 if (f & SCF_DO_STCLASS_AND) {
3913 if (flags & SCF_DO_STCLASS_OR) {
3914 /* OR before, AND after: ideally we would recurse with
3915 * data_fake to get the AND applied by study of the
3916 * remainder of the pattern, and then derecurse;
3917 * *** HACK *** for now just treat as "no information".
3918 * See [perl #56690].
3920 cl_init(pRExC_state, data->start_class);
3922 /* AND before and after: combine and continue */
3923 const int was = (data->start_class->flags & ANYOF_EOS);
3925 cl_and(data->start_class, &intrnl);
3927 data->start_class->flags |= ANYOF_EOS;
3931 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3933 /* Positive Lookahead/lookbehind
3934 In this case we can do fixed string optimisation,
3935 but we must be careful about it. Note in the case of
3936 lookbehind the positions will be offset by the minimum
3937 length of the pattern, something we won't know about
3938 until after the recurse.
3940 I32 deltanext, fake = 0;
3942 struct regnode_charclass_class intrnl;
3944 /* We use SAVEFREEPV so that when the full compile
3945 is finished perl will clean up the allocated
3946 minlens when it's all done. This way we don't
3947 have to worry about freeing them when we know
3948 they wont be used, which would be a pain.
3951 Newx( minnextp, 1, I32 );
3952 SAVEFREEPV(minnextp);
3955 StructCopy(data, &data_fake, scan_data_t);
3956 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3959 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
3960 data_fake.last_found=newSVsv(data->last_found);
3964 data_fake.last_closep = &fake;
3965 data_fake.flags = 0;
3966 data_fake.pos_delta = delta;
3968 data_fake.flags |= SF_IS_INF;
3969 if ( flags & SCF_DO_STCLASS && !scan->flags
3970 && OP(scan) == IFMATCH ) { /* Lookahead */
3971 cl_init(pRExC_state, &intrnl);
3972 data_fake.start_class = &intrnl;
3973 f |= SCF_DO_STCLASS_AND;
3975 if (flags & SCF_WHILEM_VISITED_POS)
3976 f |= SCF_WHILEM_VISITED_POS;
3977 next = regnext(scan);
3978 nscan = NEXTOPER(NEXTOPER(scan));
3980 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3981 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3984 FAIL("Variable length lookbehind not implemented");
3986 else if (*minnextp > (I32)U8_MAX) {
3987 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
3989 scan->flags = (U8)*minnextp;
3994 if (f & SCF_DO_STCLASS_AND) {
3995 const int was = (data->start_class->flags & ANYOF_EOS);
3997 cl_and(data->start_class, &intrnl);
3999 data->start_class->flags |= ANYOF_EOS;
4002 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4004 if (data_fake.flags & SF_HAS_EVAL)
4005 data->flags |= SF_HAS_EVAL;
4006 data->whilem_c = data_fake.whilem_c;
4007 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4008 if (RExC_rx->minlen<*minnextp)
4009 RExC_rx->minlen=*minnextp;
4010 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4011 SvREFCNT_dec(data_fake.last_found);
4013 if ( data_fake.minlen_fixed != minlenp )
4015 data->offset_fixed= data_fake.offset_fixed;
4016 data->minlen_fixed= data_fake.minlen_fixed;
4017 data->lookbehind_fixed+= scan->flags;
4019 if ( data_fake.minlen_float != minlenp )
4021 data->minlen_float= data_fake.minlen_float;
4022 data->offset_float_min=data_fake.offset_float_min;
4023 data->offset_float_max=data_fake.offset_float_max;
4024 data->lookbehind_float+= scan->flags;
4033 else if (OP(scan) == OPEN) {
4034 if (stopparen != (I32)ARG(scan))
4037 else if (OP(scan) == CLOSE) {
4038 if (stopparen == (I32)ARG(scan)) {
4041 if ((I32)ARG(scan) == is_par) {
4042 next = regnext(scan);
4044 if ( next && (OP(next) != WHILEM) && next < last)
4045 is_par = 0; /* Disable optimization */
4048 *(data->last_closep) = ARG(scan);
4050 else if (OP(scan) == EVAL) {
4052 data->flags |= SF_HAS_EVAL;
4054 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4055 if (flags & SCF_DO_SUBSTR) {
4056 SCAN_COMMIT(pRExC_state,data,minlenp);
4057 flags &= ~SCF_DO_SUBSTR;
4059 if (data && OP(scan)==ACCEPT) {
4060 data->flags |= SCF_SEEN_ACCEPT;
4065 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4067 if (flags & SCF_DO_SUBSTR) {
4068 SCAN_COMMIT(pRExC_state,data,minlenp);
4069 data->longest = &(data->longest_float);
4071 is_inf = is_inf_internal = 1;
4072 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4073 cl_anything(pRExC_state, data->start_class);
4074 flags &= ~SCF_DO_STCLASS;
4076 else if (OP(scan) == GPOS) {
4077 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4078 !(delta || is_inf || (data && data->pos_delta)))
4080 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4081 RExC_rx->extflags |= RXf_ANCH_GPOS;
4082 if (RExC_rx->gofs < (U32)min)
4083 RExC_rx->gofs = min;
4085 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4089 #ifdef TRIE_STUDY_OPT
4090 #ifdef FULL_TRIE_STUDY
4091 else if (PL_regkind[OP(scan)] == TRIE) {
4092 /* NOTE - There is similar code to this block above for handling
4093 BRANCH nodes on the initial study. If you change stuff here
4095 regnode *trie_node= scan;
4096 regnode *tail= regnext(scan);
4097 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4098 I32 max1 = 0, min1 = I32_MAX;
4099 struct regnode_charclass_class accum;
4101 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4102 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4103 if (flags & SCF_DO_STCLASS)
4104 cl_init_zero(pRExC_state, &accum);
4110 const regnode *nextbranch= NULL;
4113 for ( word=1 ; word <= trie->wordcount ; word++)
4115 I32 deltanext=0, minnext=0, f = 0, fake;
4116 struct regnode_charclass_class this_class;
4118 data_fake.flags = 0;
4120 data_fake.whilem_c = data->whilem_c;
4121 data_fake.last_closep = data->last_closep;
4124 data_fake.last_closep = &fake;
4125 data_fake.pos_delta = delta;
4126 if (flags & SCF_DO_STCLASS) {
4127 cl_init(pRExC_state, &this_class);
4128 data_fake.start_class = &this_class;
4129 f = SCF_DO_STCLASS_AND;
4131 if (flags & SCF_WHILEM_VISITED_POS)
4132 f |= SCF_WHILEM_VISITED_POS;
4134 if (trie->jump[word]) {
4136 nextbranch = trie_node + trie->jump[0];
4137 scan= trie_node + trie->jump[word];
4138 /* We go from the jump point to the branch that follows
4139 it. Note this means we need the vestigal unused branches
4140 even though they arent otherwise used.
4142 minnext = study_chunk(pRExC_state, &scan, minlenp,
4143 &deltanext, (regnode *)nextbranch, &data_fake,
4144 stopparen, recursed, NULL, f,depth+1);
4146 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4147 nextbranch= regnext((regnode*)nextbranch);
4149 if (min1 > (I32)(minnext + trie->minlen))
4150 min1 = minnext + trie->minlen;
4151 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4152 max1 = minnext + deltanext + trie->maxlen;
4153 if (deltanext == I32_MAX)
4154 is_inf = is_inf_internal = 1;
4156 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4158 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4159 if ( stopmin > min + min1)
4160 stopmin = min + min1;
4161 flags &= ~SCF_DO_SUBSTR;
4163 data->flags |= SCF_SEEN_ACCEPT;
4166 if (data_fake.flags & SF_HAS_EVAL)
4167 data->flags |= SF_HAS_EVAL;
4168 data->whilem_c = data_fake.whilem_c;
4170 if (flags & SCF_DO_STCLASS)
4171 cl_or(pRExC_state, &accum, &this_class);
4174 if (flags & SCF_DO_SUBSTR) {
4175 data->pos_min += min1;
4176 data->pos_delta += max1 - min1;
4177 if (max1 != min1 || is_inf)
4178 data->longest = &(data->longest_float);
4181 delta += max1 - min1;
4182 if (flags & SCF_DO_STCLASS_OR) {
4183 cl_or(pRExC_state, data->start_class, &accum);
4185 cl_and(data->start_class, and_withp);
4186 flags &= ~SCF_DO_STCLASS;
4189 else if (flags & SCF_DO_STCLASS_AND) {
4191 cl_and(data->start_class, &accum);
4192 flags &= ~SCF_DO_STCLASS;
4195 /* Switch to OR mode: cache the old value of
4196 * data->start_class */
4198 StructCopy(data->start_class, and_withp,
4199 struct regnode_charclass_class);
4200 flags &= ~SCF_DO_STCLASS_AND;
4201 StructCopy(&accum, data->start_class,
4202 struct regnode_charclass_class);
4203 flags |= SCF_DO_STCLASS_OR;
4204 data->start_class->flags |= ANYOF_EOS;
4211 else if (PL_regkind[OP(scan)] == TRIE) {
4212 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4215 min += trie->minlen;
4216 delta += (trie->maxlen - trie->minlen);
4217 flags &= ~SCF_DO_STCLASS; /* xxx */
4218 if (flags & SCF_DO_SUBSTR) {
4219 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4220 data->pos_min += trie->minlen;
4221 data->pos_delta += (trie->maxlen - trie->minlen);
4222 if (trie->maxlen != trie->minlen)
4223 data->longest = &(data->longest_float);
4225 if (trie->jump) /* no more substrings -- for now /grr*/
4226 flags &= ~SCF_DO_SUBSTR;
4228 #endif /* old or new */
4229 #endif /* TRIE_STUDY_OPT */
4231 /* Else: zero-length, ignore. */
4232 scan = regnext(scan);
4237 stopparen = frame->stop;
4238 frame = frame->prev;
4239 goto fake_study_recurse;
4244 DEBUG_STUDYDATA("pre-fin:",data,depth);
4247 *deltap = is_inf_internal ? I32_MAX : delta;
4248 if (flags & SCF_DO_SUBSTR && is_inf)
4249 data->pos_delta = I32_MAX - data->pos_min;
4250 if (is_par > (I32)U8_MAX)
4252 if (is_par && pars==1 && data) {
4253 data->flags |= SF_IN_PAR;
4254 data->flags &= ~SF_HAS_PAR;
4256 else if (pars && data) {
4257 data->flags |= SF_HAS_PAR;
4258 data->flags &= ~SF_IN_PAR;
4260 if (flags & SCF_DO_STCLASS_OR)
4261 cl_and(data->start_class, and_withp);
4262 if (flags & SCF_TRIE_RESTUDY)
4263 data->flags |= SCF_TRIE_RESTUDY;
4265 DEBUG_STUDYDATA("post-fin:",data,depth);
4267 return min < stopmin ? min : stopmin;
4271 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4273 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4275 PERL_ARGS_ASSERT_ADD_DATA;
4277 Renewc(RExC_rxi->data,
4278 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4279 char, struct reg_data);
4281 Renew(RExC_rxi->data->what, count + n, U8);
4283 Newx(RExC_rxi->data->what, n, U8);
4284 RExC_rxi->data->count = count + n;
4285 Copy(s, RExC_rxi->data->what + count, n, U8);
4289 /*XXX: todo make this not included in a non debugging perl */
4290 #ifndef PERL_IN_XSUB_RE
4292 Perl_reginitcolors(pTHX)
4295 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4297 char *t = savepv(s);
4301 t = strchr(t, '\t');
4307 PL_colors[i] = t = (char *)"";
4312 PL_colors[i++] = (char *)"";
4319 #ifdef TRIE_STUDY_OPT
4320 #define CHECK_RESTUDY_GOTO \
4322 (data.flags & SCF_TRIE_RESTUDY) \
4326 #define CHECK_RESTUDY_GOTO
4330 - pregcomp - compile a regular expression into internal code
4332 * We can't allocate space until we know how big the compiled form will be,
4333 * but we can't compile it (and thus know how big it is) until we've got a
4334 * place to put the code. So we cheat: we compile it twice, once with code
4335 * generation turned off and size counting turned on, and once "for real".
4336 * This also means that we don't allocate space until we are sure that the
4337 * thing really will compile successfully, and we never have to move the
4338 * code and thus invalidate pointers into it. (Note that it has to be in
4339 * one piece because free() must be able to free it all.) [NB: not true in perl]
4341 * Beware that the optimization-preparation code in here knows about some
4342 * of the structure of the compiled regexp. [I'll say.]
4347 #ifndef PERL_IN_XSUB_RE
4348 #define RE_ENGINE_PTR &PL_core_reg_engine
4350 extern const struct regexp_engine my_reg_engine;
4351 #define RE_ENGINE_PTR &my_reg_engine
4354 #ifndef PERL_IN_XSUB_RE
4356 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4359 HV * const table = GvHV(PL_hintgv);
4361 PERL_ARGS_ASSERT_PREGCOMP;
4363 /* Dispatch a request to compile a regexp to correct
4366 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
4367 GET_RE_DEBUG_FLAGS_DECL;
4368 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
4369 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4371 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4374 return CALLREGCOMP_ENG(eng, pattern, flags);
4377 return Perl_re_compile(aTHX_ pattern, flags);
4382 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4387 register regexp_internal *ri;
4396 /* these are all flags - maybe they should be turned
4397 * into a single int with different bit masks */
4398 I32 sawlookahead = 0;
4401 bool used_setjump = FALSE;
4406 RExC_state_t RExC_state;
4407 RExC_state_t * const pRExC_state = &RExC_state;
4408 #ifdef TRIE_STUDY_OPT
4410 RExC_state_t copyRExC_state;
4412 GET_RE_DEBUG_FLAGS_DECL;
4414 PERL_ARGS_ASSERT_RE_COMPILE;
4416 DEBUG_r(if (!PL_colorset) reginitcolors());
4418 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4420 /****************** LONG JUMP TARGET HERE***********************/
4421 /* Longjmp back to here if have to switch in midstream to utf8 */
4422 if (! RExC_orig_utf8) {
4423 JMPENV_PUSH(jump_ret);
4424 used_setjump = TRUE;
4427 if (jump_ret == 0) { /* First time through */
4428 exp = SvPV(pattern, plen);
4430 /* ignore the utf8ness if the pattern is 0 length */
4432 RExC_utf8 = RExC_orig_utf8 = 0;
4436 SV *dsv= sv_newmortal();
4437 RE_PV_QUOTED_DECL(s, RExC_utf8,
4438 dsv, exp, plen, 60);
4439 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4440 PL_colors[4],PL_colors[5],s);
4443 else { /* longjumped back */
4446 /* If the cause for the longjmp was other than changing to utf8, pop
4447 * our own setjmp, and longjmp to the correct handler */
4448 if (jump_ret != UTF8_LONGJMP) {
4450 JMPENV_JUMP(jump_ret);
4455 /* It's possible to write a regexp in ascii that represents Unicode
4456 codepoints outside of the byte range, such as via \x{100}. If we
4457 detect such a sequence we have to convert the entire pattern to utf8
4458 and then recompile, as our sizing calculation will have been based
4459 on 1 byte == 1 character, but we will need to use utf8 to encode
4460 at least some part of the pattern, and therefore must convert the whole
4463 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4464 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
4465 exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len);
4467 RExC_orig_utf8 = RExC_utf8 = 1;
4471 #ifdef TRIE_STUDY_OPT
4475 /* Set to use unicode semantics if the pattern is in utf8 and has the
4476 * 'dual' charset specified, as it means unicode when utf8 */
4477 pm_flags = orig_pm_flags;
4478 if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
4479 pm_flags |= RXf_PMf_UNICODE;
4483 RExC_flags = pm_flags;
4487 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4488 RExC_seen_evals = 0;
4491 /* First pass: determine size, legality. */
4499 RExC_emit = &PL_regdummy;
4500 RExC_whilem_seen = 0;
4501 RExC_open_parens = NULL;
4502 RExC_close_parens = NULL;
4504 RExC_paren_names = NULL;
4506 RExC_paren_name_list = NULL;
4508 RExC_recurse = NULL;
4509 RExC_recurse_count = 0;
4511 #if 0 /* REGC() is (currently) a NOP at the first pass.
4512 * Clever compilers notice this and complain. --jhi */
4513 REGC((U8)REG_MAGIC, (char*)RExC_emit);
4515 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4516 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4517 RExC_precomp = NULL;
4521 /* Here, finished first pass. Get rid of any added setjmp */
4526 PerlIO_printf(Perl_debug_log,
4527 "Required size %"IVdf" nodes\n"
4528 "Starting second pass (creation)\n",
4531 RExC_lastparse=NULL;
4533 /* Small enough for pointer-storage convention?
4534 If extralen==0, this means that we will not need long jumps. */
4535 if (RExC_size >= 0x10000L && RExC_extralen)
4536 RExC_size += RExC_extralen;
4539 if (RExC_whilem_seen > 15)
4540 RExC_whilem_seen = 15;
4542 /* Allocate space and zero-initialize. Note, the two step process
4543 of zeroing when in debug mode, thus anything assigned has to
4544 happen after that */
4545 rx = (REGEXP*) newSV_type(SVt_REGEXP);
4546 r = (struct regexp*)SvANY(rx);
4547 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4548 char, regexp_internal);
4549 if ( r == NULL || ri == NULL )
4550 FAIL("Regexp out of space");
4552 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
4553 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
4555 /* bulk initialize base fields with 0. */
4556 Zero(ri, sizeof(regexp_internal), char);
4559 /* non-zero initialization begins here */
4561 r->engine= RE_ENGINE_PTR;
4562 r->extflags = pm_flags;
4564 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4565 bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE));
4567 /* The caret is output if there are any defaults: if not all the STD
4568 * flags are set, or if no character set specifier is needed */
4570 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
4572 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4573 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
4574 >> RXf_PMf_STD_PMMOD_SHIFT);
4575 const char *fptr = STD_PAT_MODS; /*"msix"*/
4577 /* Allocate for the worst case, which is all the std flags are turned
4578 * on. If more precision is desired, we could do a population count of
4579 * the flags set. This could be done with a small lookup table, or by
4580 * shifting, masking and adding, or even, when available, assembly
4581 * language for a machine-language population count.
4582 * We never output a minus, as all those are defaults, so are
4583 * covered by the caret */
4584 const STRLEN wraplen = plen + has_p + has_runon
4585 + has_default /* If needs a caret */
4586 + has_charset /* If needs a character set specifier */
4587 + (sizeof(STD_PAT_MODS) - 1)
4588 + (sizeof("(?:)") - 1);
4590 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
4592 SvFLAGS(rx) |= SvUTF8(pattern);
4595 /* If a default, cover it using the caret */
4597 *p++= DEFAULT_PAT_MOD;
4600 if (r->extflags & RXf_PMf_LOCALE) {
4601 *p++ = LOCALE_PAT_MOD;
4603 *p++ = UNICODE_PAT_MOD;
4607 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
4610 while((ch = *fptr++)) {
4618 Copy(RExC_precomp, p, plen, char);
4619 assert ((RX_WRAPPED(rx) - p) < 16);
4620 r->pre_prefix = p - RX_WRAPPED(rx);
4626 SvCUR_set(rx, p - SvPVX_const(rx));
4630 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
4632 if (RExC_seen & REG_SEEN_RECURSE) {
4633 Newxz(RExC_open_parens, RExC_npar,regnode *);
4634 SAVEFREEPV(RExC_open_parens);
4635 Newxz(RExC_close_parens,RExC_npar,regnode *);
4636 SAVEFREEPV(RExC_close_parens);
4639 /* Useful during FAIL. */
4640 #ifdef RE_TRACK_PATTERN_OFFSETS
4641 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
4642 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
4643 "%s %"UVuf" bytes for offset annotations.\n",
4644 ri->u.offsets ? "Got" : "Couldn't get",
4645 (UV)((2*RExC_size+1) * sizeof(U32))));
4647 SetProgLen(ri,RExC_size);
4652 /* Second pass: emit code. */
4653 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
4658 RExC_emit_start = ri->program;
4659 RExC_emit = ri->program;
4660 RExC_emit_bound = ri->program + RExC_size + 1;
4662 /* Store the count of eval-groups for security checks: */
4663 RExC_rx->seen_evals = RExC_seen_evals;
4664 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
4665 if (reg(pRExC_state, 0, &flags,1) == NULL) {
4669 /* XXXX To minimize changes to RE engine we always allocate
4670 3-units-long substrs field. */
4671 Newx(r->substrs, 1, struct reg_substr_data);
4672 if (RExC_recurse_count) {
4673 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4674 SAVEFREEPV(RExC_recurse);
4678 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
4679 Zero(r->substrs, 1, struct reg_substr_data);
4681 #ifdef TRIE_STUDY_OPT
4683 StructCopy(&zero_scan_data, &data, scan_data_t);
4684 copyRExC_state = RExC_state;
4687 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
4689 RExC_state = copyRExC_state;
4690 if (seen & REG_TOP_LEVEL_BRANCHES)
4691 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4693 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
4694 if (data.last_found) {
4695 SvREFCNT_dec(data.longest_fixed);
4696 SvREFCNT_dec(data.longest_float);
4697 SvREFCNT_dec(data.last_found);
4699 StructCopy(&zero_scan_data, &data, scan_data_t);
4702 StructCopy(&zero_scan_data, &data, scan_data_t);
4705 /* Dig out information for optimizations. */
4706 r->extflags = RExC_flags; /* was pm_op */
4707 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
4710 SvUTF8_on(rx); /* Unicode in it? */
4711 ri->regstclass = NULL;
4712 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
4713 r->intflags |= PREGf_NAUGHTY;
4714 scan = ri->program + 1; /* First BRANCH. */
4716 /* testing for BRANCH here tells us whether there is "must appear"
4717 data in the pattern. If there is then we can use it for optimisations */
4718 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
4720 STRLEN longest_float_length, longest_fixed_length;
4721 struct regnode_charclass_class ch_class; /* pointed to by data */
4723 I32 last_close = 0; /* pointed to by data */
4724 regnode *first= scan;
4725 regnode *first_next= regnext(first);
4727 * Skip introductions and multiplicators >= 1
4728 * so that we can extract the 'meat' of the pattern that must
4729 * match in the large if() sequence following.
4730 * NOTE that EXACT is NOT covered here, as it is normally
4731 * picked up by the optimiser separately.
4733 * This is unfortunate as the optimiser isnt handling lookahead
4734 * properly currently.
4737 while ((OP(first) == OPEN && (sawopen = 1)) ||
4738 /* An OR of *one* alternative - should not happen now. */
4739 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
4740 /* for now we can't handle lookbehind IFMATCH*/
4741 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
4742 (OP(first) == PLUS) ||
4743 (OP(first) == MINMOD) ||
4744 /* An {n,m} with n>0 */
4745 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
4746 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
4749 * the only op that could be a regnode is PLUS, all the rest
4750 * will be regnode_1 or regnode_2.
4753 if (OP(first) == PLUS)
4756 first += regarglen[OP(first)];
4758 first = NEXTOPER(first);
4759 first_next= regnext(first);
4762 /* Starting-point info. */
4764 DEBUG_PEEP("first:",first,0);
4765 /* Ignore EXACT as we deal with it later. */
4766 if (PL_regkind[OP(first)] == EXACT) {
4767 if (OP(first) == EXACT)
4768 NOOP; /* Empty, get anchored substr later. */
4770 ri->regstclass = first;
4773 else if (PL_regkind[OP(first)] == TRIE &&
4774 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
4777 /* this can happen only on restudy */
4778 if ( OP(first) == TRIE ) {
4779 struct regnode_1 *trieop = (struct regnode_1 *)
4780 PerlMemShared_calloc(1, sizeof(struct regnode_1));
4781 StructCopy(first,trieop,struct regnode_1);
4782 trie_op=(regnode *)trieop;
4784 struct regnode_charclass *trieop = (struct regnode_charclass *)
4785 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
4786 StructCopy(first,trieop,struct regnode_charclass);
4787 trie_op=(regnode *)trieop;
4790 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
4791 ri->regstclass = trie_op;
4794 else if (REGNODE_SIMPLE(OP(first)))
4795 ri->regstclass = first;
4796 else if (PL_regkind[OP(first)] == BOUND ||
4797 PL_regkind[OP(first)] == NBOUND)
4798 ri->regstclass = first;
4799 else if (PL_regkind[OP(first)] == BOL) {
4800 r->extflags |= (OP(first) == MBOL
4802 : (OP(first) == SBOL
4805 first = NEXTOPER(first);
4808 else if (OP(first) == GPOS) {
4809 r->extflags |= RXf_ANCH_GPOS;
4810 first = NEXTOPER(first);
4813 else if ((!sawopen || !RExC_sawback) &&
4814 (OP(first) == STAR &&
4815 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
4816 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
4818 /* turn .* into ^.* with an implied $*=1 */
4820 (OP(NEXTOPER(first)) == REG_ANY)
4823 r->extflags |= type;
4824 r->intflags |= PREGf_IMPLICIT;
4825 first = NEXTOPER(first);
4828 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
4829 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
4830 /* x+ must match at the 1st pos of run of x's */
4831 r->intflags |= PREGf_SKIP;
4833 /* Scan is after the zeroth branch, first is atomic matcher. */
4834 #ifdef TRIE_STUDY_OPT
4837 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4838 (IV)(first - scan + 1))
4842 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4843 (IV)(first - scan + 1))
4849 * If there's something expensive in the r.e., find the
4850 * longest literal string that must appear and make it the
4851 * regmust. Resolve ties in favor of later strings, since
4852 * the regstart check works with the beginning of the r.e.
4853 * and avoiding duplication strengthens checking. Not a
4854 * strong reason, but sufficient in the absence of others.
4855 * [Now we resolve ties in favor of the earlier string if
4856 * it happens that c_offset_min has been invalidated, since the
4857 * earlier string may buy us something the later one won't.]
4860 data.longest_fixed = newSVpvs("");
4861 data.longest_float = newSVpvs("");
4862 data.last_found = newSVpvs("");
4863 data.longest = &(data.longest_fixed);
4865 if (!ri->regstclass) {
4866 cl_init(pRExC_state, &ch_class);
4867 data.start_class = &ch_class;
4868 stclass_flag = SCF_DO_STCLASS_AND;
4869 } else /* XXXX Check for BOUND? */
4871 data.last_closep = &last_close;
4873 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
4874 &data, -1, NULL, NULL,
4875 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
4881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
4882 && data.last_start_min == 0 && data.last_end > 0
4883 && !RExC_seen_zerolen
4884 && !(RExC_seen & REG_SEEN_VERBARG)
4885 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4886 r->extflags |= RXf_CHECK_ALL;
4887 scan_commit(pRExC_state, &data,&minlen,0);
4888 SvREFCNT_dec(data.last_found);
4890 /* Note that code very similar to this but for anchored string
4891 follows immediately below, changes may need to be made to both.
4894 longest_float_length = CHR_SVLEN(data.longest_float);
4895 if (longest_float_length
4896 || (data.flags & SF_FL_BEFORE_EOL
4897 && (!(data.flags & SF_FL_BEFORE_MEOL)
4898 || (RExC_flags & RXf_PMf_MULTILINE))))
4902 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
4903 && data.offset_fixed == data.offset_float_min
4904 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4905 goto remove_float; /* As in (a)+. */
4907 /* copy the information about the longest float from the reg_scan_data
4908 over to the program. */
4909 if (SvUTF8(data.longest_float)) {
4910 r->float_utf8 = data.longest_float;
4911 r->float_substr = NULL;
4913 r->float_substr = data.longest_float;
4914 r->float_utf8 = NULL;
4916 /* float_end_shift is how many chars that must be matched that
4917 follow this item. We calculate it ahead of time as once the
4918 lookbehind offset is added in we lose the ability to correctly
4920 ml = data.minlen_float ? *(data.minlen_float)
4921 : (I32)longest_float_length;
4922 r->float_end_shift = ml - data.offset_float_min
4923 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4924 + data.lookbehind_float;
4925 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
4926 r->float_max_offset = data.offset_float_max;
4927 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
4928 r->float_max_offset -= data.lookbehind_float;
4930 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4931 && (!(data.flags & SF_FL_BEFORE_MEOL)
4932 || (RExC_flags & RXf_PMf_MULTILINE)));
4933 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
4937 r->float_substr = r->float_utf8 = NULL;
4938 SvREFCNT_dec(data.longest_float);
4939 longest_float_length = 0;
4942 /* Note that code very similar to this but for floating string
4943 is immediately above, changes may need to be made to both.
4946 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
4947 if (longest_fixed_length
4948 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4949 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4950 || (RExC_flags & RXf_PMf_MULTILINE))))
4954 /* copy the information about the longest fixed
4955 from the reg_scan_data over to the program. */
4956 if (SvUTF8(data.longest_fixed)) {
4957 r->anchored_utf8 = data.longest_fixed;
4958 r->anchored_substr = NULL;
4960 r->anchored_substr = data.longest_fixed;
4961 r->anchored_utf8 = NULL;
4963 /* fixed_end_shift is how many chars that must be matched that
4964 follow this item. We calculate it ahead of time as once the
4965 lookbehind offset is added in we lose the ability to correctly
4967 ml = data.minlen_fixed ? *(data.minlen_fixed)
4968 : (I32)longest_fixed_length;
4969 r->anchored_end_shift = ml - data.offset_fixed
4970 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4971 + data.lookbehind_fixed;
4972 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4974 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4975 && (!(data.flags & SF_FIX_BEFORE_MEOL)
4976 || (RExC_flags & RXf_PMf_MULTILINE)));
4977 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
4980 r->anchored_substr = r->anchored_utf8 = NULL;
4981 SvREFCNT_dec(data.longest_fixed);
4982 longest_fixed_length = 0;
4985 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4986 ri->regstclass = NULL;
4987 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4989 && !(data.start_class->flags & ANYOF_EOS)
4990 && !cl_is_anything(data.start_class))
4992 const U32 n = add_data(pRExC_state, 1, "f");
4994 Newx(RExC_rxi->data->data[n], 1,
4995 struct regnode_charclass_class);
4996 StructCopy(data.start_class,
4997 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
4998 struct regnode_charclass_class);
4999 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5000 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5001 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
5002 regprop(r, sv, (regnode*)data.start_class);
5003 PerlIO_printf(Perl_debug_log,
5004 "synthetic stclass \"%s\".\n",
5005 SvPVX_const(sv));});
5008 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
5009 if (longest_fixed_length > longest_float_length) {
5010 r->check_end_shift = r->anchored_end_shift;
5011 r->check_substr = r->anchored_substr;
5012 r->check_utf8 = r->anchored_utf8;
5013 r->check_offset_min = r->check_offset_max = r->anchored_offset;
5014 if (r->extflags & RXf_ANCH_SINGLE)
5015 r->extflags |= RXf_NOSCAN;
5018 r->check_end_shift = r->float_end_shift;
5019 r->check_substr = r->float_substr;
5020 r->check_utf8 = r->float_utf8;
5021 r->check_offset_min = r->float_min_offset;
5022 r->check_offset_max = r->float_max_offset;
5024 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5025 This should be changed ASAP! */
5026 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5027 r->extflags |= RXf_USE_INTUIT;
5028 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
5029 r->extflags |= RXf_INTUIT_TAIL;
5031 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5032 if ( (STRLEN)minlen < longest_float_length )
5033 minlen= longest_float_length;
5034 if ( (STRLEN)minlen < longest_fixed_length )
5035 minlen= longest_fixed_length;
5039 /* Several toplevels. Best we can is to set minlen. */
5041 struct regnode_charclass_class ch_class;
5044 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
5046 scan = ri->program + 1;
5047 cl_init(pRExC_state, &ch_class);
5048 data.start_class = &ch_class;
5049 data.last_closep = &last_close;
5052 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
5053 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
5057 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
5058 = r->float_substr = r->float_utf8 = NULL;
5059 if (!(data.start_class->flags & ANYOF_EOS)
5060 && !cl_is_anything(data.start_class))
5062 const U32 n = add_data(pRExC_state, 1, "f");
5064 Newx(RExC_rxi->data->data[n], 1,
5065 struct regnode_charclass_class);
5066 StructCopy(data.start_class,
5067 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
5068 struct regnode_charclass_class);
5069 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
5070 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
5071 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
5072 regprop(r, sv, (regnode*)data.start_class);
5073 PerlIO_printf(Perl_debug_log,
5074 "synthetic stclass \"%s\".\n",
5075 SvPVX_const(sv));});
5079 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5080 the "real" pattern. */
5082 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
5083 (IV)minlen, (IV)r->minlen);
5085 r->minlenret = minlen;
5086 if (r->minlen < minlen)
5089 if (RExC_seen & REG_SEEN_GPOS)
5090 r->extflags |= RXf_GPOS_SEEN;
5091 if (RExC_seen & REG_SEEN_LOOKBEHIND)
5092 r->extflags |= RXf_LOOKBEHIND_SEEN;
5093 if (RExC_seen & REG_SEEN_EVAL)
5094 r->extflags |= RXf_EVAL_SEEN;
5095 if (RExC_seen & REG_SEEN_CANY)
5096 r->extflags |= RXf_CANY_SEEN;
5097 if (RExC_seen & REG_SEEN_VERBARG)
5098 r->intflags |= PREGf_VERBARG_SEEN;
5099 if (RExC_seen & REG_SEEN_CUTGROUP)
5100 r->intflags |= PREGf_CUTGROUP_SEEN;
5101 if (RExC_paren_names)
5102 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
5104 RXp_PAREN_NAMES(r) = NULL;
5106 #ifdef STUPID_PATTERN_CHECKS
5107 if (RX_PRELEN(rx) == 0)
5108 r->extflags |= RXf_NULL;
5109 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5110 /* XXX: this should happen BEFORE we compile */
5111 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5112 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
5113 r->extflags |= RXf_WHITE;
5114 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
5115 r->extflags |= RXf_START_ONLY;
5117 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
5118 /* XXX: this should happen BEFORE we compile */
5119 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5121 regnode *first = ri->program + 1;
5123 U8 nop = OP(NEXTOPER(first));
5125 if (PL_regkind[fop] == NOTHING && nop == END)
5126 r->extflags |= RXf_NULL;
5127 else if (PL_regkind[fop] == BOL && nop == END)
5128 r->extflags |= RXf_START_ONLY;
5129 else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
5130 r->extflags |= RXf_WHITE;
5134 if (RExC_paren_names) {
5135 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
5136 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5139 ri->name_list_idx = 0;
5141 if (RExC_recurse_count) {
5142 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5143 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5144 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5147 Newxz(r->offs, RExC_npar, regexp_paren_pair);
5148 /* assume we don't need to swap parens around before we match */
5151 PerlIO_printf(Perl_debug_log,"Final program:\n");
5154 #ifdef RE_TRACK_PATTERN_OFFSETS
5155 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5156 const U32 len = ri->u.offsets[0];
5158 GET_RE_DEBUG_FLAGS_DECL;
5159 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
5160 for (i = 1; i <= len; i++) {
5161 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
5162 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
5163 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
5165 PerlIO_printf(Perl_debug_log, "\n");
5171 #undef RE_ENGINE_PTR
5175 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5178 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5180 PERL_UNUSED_ARG(value);
5182 if (flags & RXapif_FETCH) {
5183 return reg_named_buff_fetch(rx, key, flags);
5184 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
5185 Perl_croak_no_modify(aTHX);
5187 } else if (flags & RXapif_EXISTS) {
5188 return reg_named_buff_exists(rx, key, flags)
5191 } else if (flags & RXapif_REGNAMES) {
5192 return reg_named_buff_all(rx, flags);
5193 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
5194 return reg_named_buff_scalar(rx, flags);
5196 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5202 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5205 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
5206 PERL_UNUSED_ARG(lastkey);
5208 if (flags & RXapif_FIRSTKEY)
5209 return reg_named_buff_firstkey(rx, flags);
5210 else if (flags & RXapif_NEXTKEY)
5211 return reg_named_buff_nextkey(rx, flags);
5213 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5219 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5222 AV *retarray = NULL;
5224 struct regexp *const rx = (struct regexp *)SvANY(r);
5226 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5228 if (flags & RXapif_ALL)
5231 if (rx && RXp_PAREN_NAMES(rx)) {
5232 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
5235 SV* sv_dat=HeVAL(he_str);
5236 I32 *nums=(I32*)SvPVX(sv_dat);
5237 for ( i=0; i<SvIVX(sv_dat); i++ ) {
5238 if ((I32)(rx->nparens) >= nums[i]
5239 && rx->offs[nums[i]].start != -1
5240 && rx->offs[nums[i]].end != -1)
5243 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
5247 ret = newSVsv(&PL_sv_undef);
5250 av_push(retarray, ret);
5253 return newRV_noinc(MUTABLE_SV(retarray));
5260 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
5263 struct regexp *const rx = (struct regexp *)SvANY(r);
5265 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5267 if (rx && RXp_PAREN_NAMES(rx)) {
5268 if (flags & RXapif_ALL) {
5269 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
5271 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
5285 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
5287 struct regexp *const rx = (struct regexp *)SvANY(r);
5289 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5291 if ( rx && RXp_PAREN_NAMES(rx) ) {
5292 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
5294 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
5301 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
5303 struct regexp *const rx = (struct regexp *)SvANY(r);
5304 GET_RE_DEBUG_FLAGS_DECL;
5306 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5308 if (rx && RXp_PAREN_NAMES(rx)) {
5309 HV *hv = RXp_PAREN_NAMES(rx);
5311 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5314 SV* sv_dat = HeVAL(temphe);
5315 I32 *nums = (I32*)SvPVX(sv_dat);
5316 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5317 if ((I32)(rx->lastparen) >= nums[i] &&
5318 rx->offs[nums[i]].start != -1 &&
5319 rx->offs[nums[i]].end != -1)
5325 if (parno || flags & RXapif_ALL) {
5326 return newSVhek(HeKEY_hek(temphe));
5334 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
5339 struct regexp *const rx = (struct regexp *)SvANY(r);
5341 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5343 if (rx && RXp_PAREN_NAMES(rx)) {
5344 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5345 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
5346 } else if (flags & RXapif_ONE) {
5347 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
5348 av = MUTABLE_AV(SvRV(ret));
5349 length = av_len(av);
5351 return newSViv(length + 1);
5353 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5357 return &PL_sv_undef;
5361 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
5363 struct regexp *const rx = (struct regexp *)SvANY(r);
5366 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5368 if (rx && RXp_PAREN_NAMES(rx)) {
5369 HV *hv= RXp_PAREN_NAMES(rx);
5371 (void)hv_iterinit(hv);
5372 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5375 SV* sv_dat = HeVAL(temphe);
5376 I32 *nums = (I32*)SvPVX(sv_dat);
5377 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
5378 if ((I32)(rx->lastparen) >= nums[i] &&
5379 rx->offs[nums[i]].start != -1 &&
5380 rx->offs[nums[i]].end != -1)
5386 if (parno || flags & RXapif_ALL) {
5387 av_push(av, newSVhek(HeKEY_hek(temphe)));
5392 return newRV_noinc(MUTABLE_SV(av));
5396 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5399 struct regexp *const rx = (struct regexp *)SvANY(r);
5404 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
5407 sv_setsv(sv,&PL_sv_undef);
5411 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
5413 i = rx->offs[0].start;
5417 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
5419 s = rx->subbeg + rx->offs[0].end;
5420 i = rx->sublen - rx->offs[0].end;
5423 if ( 0 <= paren && paren <= (I32)rx->nparens &&
5424 (s1 = rx->offs[paren].start) != -1 &&
5425 (t1 = rx->offs[paren].end) != -1)
5429 s = rx->subbeg + s1;
5431 sv_setsv(sv,&PL_sv_undef);
5434 assert(rx->sublen >= (s - rx->subbeg) + i );
5436 const int oldtainted = PL_tainted;
5438 sv_setpvn(sv, s, i);
5439 PL_tainted = oldtainted;
5440 if ( (rx->extflags & RXf_CANY_SEEN)
5441 ? (RXp_MATCH_UTF8(rx)
5442 && (!i || is_utf8_string((U8*)s, i)))
5443 : (RXp_MATCH_UTF8(rx)) )
5450 if (RXp_MATCH_TAINTED(rx)) {
5451 if (SvTYPE(sv) >= SVt_PVMG) {
5452 MAGIC* const mg = SvMAGIC(sv);
5455 SvMAGIC_set(sv, mg->mg_moremagic);
5457 if ((mgt = SvMAGIC(sv))) {
5458 mg->mg_moremagic = mgt;
5459 SvMAGIC_set(sv, mg);
5469 sv_setsv(sv,&PL_sv_undef);
5475 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5476 SV const * const value)
5478 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5480 PERL_UNUSED_ARG(rx);
5481 PERL_UNUSED_ARG(paren);
5482 PERL_UNUSED_ARG(value);
5485 Perl_croak_no_modify(aTHX);
5489 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
5492 struct regexp *const rx = (struct regexp *)SvANY(r);
5496 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5498 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5500 /* $` / ${^PREMATCH} */
5501 case RX_BUFF_IDX_PREMATCH:
5502 if (rx->offs[0].start != -1) {
5503 i = rx->offs[0].start;
5511 /* $' / ${^POSTMATCH} */
5512 case RX_BUFF_IDX_POSTMATCH:
5513 if (rx->offs[0].end != -1) {
5514 i = rx->sublen - rx->offs[0].end;
5516 s1 = rx->offs[0].end;
5522 /* $& / ${^MATCH}, $1, $2, ... */
5524 if (paren <= (I32)rx->nparens &&
5525 (s1 = rx->offs[paren].start) != -1 &&
5526 (t1 = rx->offs[paren].end) != -1)
5531 if (ckWARN(WARN_UNINITIALIZED))
5532 report_uninit((const SV *)sv);
5537 if (i > 0 && RXp_MATCH_UTF8(rx)) {
5538 const char * const s = rx->subbeg + s1;
5543 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
5550 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
5552 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
5553 PERL_UNUSED_ARG(rx);
5557 return newSVpvs("Regexp");
5560 /* Scans the name of a named buffer from the pattern.
5561 * If flags is REG_RSN_RETURN_NULL returns null.
5562 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
5563 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
5564 * to the parsed name as looked up in the RExC_paren_names hash.
5565 * If there is an error throws a vFAIL().. type exception.
5568 #define REG_RSN_RETURN_NULL 0
5569 #define REG_RSN_RETURN_NAME 1
5570 #define REG_RSN_RETURN_DATA 2
5573 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
5575 char *name_start = RExC_parse;
5577 PERL_ARGS_ASSERT_REG_SCAN_NAME;
5579 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
5580 /* skip IDFIRST by using do...while */
5583 RExC_parse += UTF8SKIP(RExC_parse);
5584 } while (isALNUM_utf8((U8*)RExC_parse));
5588 } while (isALNUM(*RExC_parse));
5593 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
5594 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
5595 if ( flags == REG_RSN_RETURN_NAME)
5597 else if (flags==REG_RSN_RETURN_DATA) {
5600 if ( ! sv_name ) /* should not happen*/
5601 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
5602 if (RExC_paren_names)
5603 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
5605 sv_dat = HeVAL(he_str);
5607 vFAIL("Reference to nonexistent named group");
5611 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
5618 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
5619 int rem=(int)(RExC_end - RExC_parse); \
5628 if (RExC_lastparse!=RExC_parse) \
5629 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
5632 iscut ? "..." : "<" \
5635 PerlIO_printf(Perl_debug_log,"%16s",""); \
5638 num = RExC_size + 1; \
5640 num=REG_NODE_NUM(RExC_emit); \
5641 if (RExC_lastnum!=num) \
5642 PerlIO_printf(Perl_debug_log,"|%4d",num); \
5644 PerlIO_printf(Perl_debug_log,"|%4s",""); \
5645 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
5646 (int)((depth*2)), "", \
5650 RExC_lastparse=RExC_parse; \
5655 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
5656 DEBUG_PARSE_MSG((funcname)); \
5657 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
5659 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
5660 DEBUG_PARSE_MSG((funcname)); \
5661 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
5664 - reg - regular expression, i.e. main body or parenthesized thing
5666 * Caller must absorb opening parenthesis.
5668 * Combining parenthesis handling with the base level of regular expression
5669 * is a trifle forced, but the need to tie the tails of the branches to what
5670 * follows makes it hard to avoid.
5672 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
5674 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
5676 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
5680 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
5681 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
5684 register regnode *ret; /* Will be the head of the group. */
5685 register regnode *br;
5686 register regnode *lastbr;
5687 register regnode *ender = NULL;
5688 register I32 parno = 0;
5690 U32 oregflags = RExC_flags;
5691 bool have_branch = 0;
5693 I32 freeze_paren = 0;
5694 I32 after_freeze = 0;
5696 /* for (?g), (?gc), and (?o) warnings; warning
5697 about (?c) will warn about (?g) -- japhy */
5699 #define WASTED_O 0x01
5700 #define WASTED_G 0x02
5701 #define WASTED_C 0x04
5702 #define WASTED_GC (0x02|0x04)
5703 I32 wastedflags = 0x00;
5705 char * parse_start = RExC_parse; /* MJD */
5706 char * const oregcomp_parse = RExC_parse;
5708 GET_RE_DEBUG_FLAGS_DECL;
5710 PERL_ARGS_ASSERT_REG;
5711 DEBUG_PARSE("reg ");
5713 *flagp = 0; /* Tentatively. */
5716 /* Make an OPEN node, if parenthesized. */
5718 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
5719 char *start_verb = RExC_parse;
5720 STRLEN verb_len = 0;
5721 char *start_arg = NULL;
5722 unsigned char op = 0;
5724 int internal_argval = 0; /* internal_argval is only useful if !argok */
5725 while ( *RExC_parse && *RExC_parse != ')' ) {
5726 if ( *RExC_parse == ':' ) {
5727 start_arg = RExC_parse + 1;
5733 verb_len = RExC_parse - start_verb;
5736 while ( *RExC_parse && *RExC_parse != ')' )
5738 if ( *RExC_parse != ')' )
5739 vFAIL("Unterminated verb pattern argument");
5740 if ( RExC_parse == start_arg )
5743 if ( *RExC_parse != ')' )
5744 vFAIL("Unterminated verb pattern");
5747 switch ( *start_verb ) {
5748 case 'A': /* (*ACCEPT) */
5749 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
5751 internal_argval = RExC_nestroot;
5754 case 'C': /* (*COMMIT) */
5755 if ( memEQs(start_verb,verb_len,"COMMIT") )
5758 case 'F': /* (*FAIL) */
5759 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
5764 case ':': /* (*:NAME) */
5765 case 'M': /* (*MARK:NAME) */
5766 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
5771 case 'P': /* (*PRUNE) */
5772 if ( memEQs(start_verb,verb_len,"PRUNE") )
5775 case 'S': /* (*SKIP) */
5776 if ( memEQs(start_verb,verb_len,"SKIP") )
5779 case 'T': /* (*THEN) */
5780 /* [19:06] <TimToady> :: is then */
5781 if ( memEQs(start_verb,verb_len,"THEN") ) {
5783 RExC_seen |= REG_SEEN_CUTGROUP;
5789 vFAIL3("Unknown verb pattern '%.*s'",
5790 verb_len, start_verb);
5793 if ( start_arg && internal_argval ) {
5794 vFAIL3("Verb pattern '%.*s' may not have an argument",
5795 verb_len, start_verb);
5796 } else if ( argok < 0 && !start_arg ) {
5797 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5798 verb_len, start_verb);
5800 ret = reganode(pRExC_state, op, internal_argval);
5801 if ( ! internal_argval && ! SIZE_ONLY ) {
5803 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5804 ARG(ret) = add_data( pRExC_state, 1, "S" );
5805 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
5812 if (!internal_argval)
5813 RExC_seen |= REG_SEEN_VERBARG;
5814 } else if ( start_arg ) {
5815 vFAIL3("Verb pattern '%.*s' may not have an argument",
5816 verb_len, start_verb);
5818 ret = reg_node(pRExC_state, op);
5820 nextchar(pRExC_state);
5823 if (*RExC_parse == '?') { /* (?...) */
5824 bool is_logical = 0;
5825 const char * const seqstart = RExC_parse;
5826 bool has_use_defaults = FALSE;
5829 paren = *RExC_parse++;
5830 ret = NULL; /* For look-ahead/behind. */
5833 case 'P': /* (?P...) variants for those used to PCRE/Python */
5834 paren = *RExC_parse++;
5835 if ( paren == '<') /* (?P<...>) named capture */
5837 else if (paren == '>') { /* (?P>name) named recursion */
5838 goto named_recursion;
5840 else if (paren == '=') { /* (?P=...) named backref */
5841 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5842 you change this make sure you change that */
5843 char* name_start = RExC_parse;
5845 SV *sv_dat = reg_scan_name(pRExC_state,
5846 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5847 if (RExC_parse == name_start || *RExC_parse != ')')
5848 vFAIL2("Sequence %.3s... not terminated",parse_start);
5851 num = add_data( pRExC_state, 1, "S" );
5852 RExC_rxi->data->data[num]=(void*)sv_dat;
5853 SvREFCNT_inc_simple_void(sv_dat);
5856 ret = reganode(pRExC_state,
5867 Set_Node_Offset(ret, parse_start+1);
5868 Set_Node_Cur_Length(ret); /* MJD */
5870 nextchar(pRExC_state);
5874 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5876 case '<': /* (?<...) */
5877 if (*RExC_parse == '!')
5879 else if (*RExC_parse != '=')
5885 case '\'': /* (?'...') */
5886 name_start= RExC_parse;
5887 svname = reg_scan_name(pRExC_state,
5888 SIZE_ONLY ? /* reverse test from the others */
5889 REG_RSN_RETURN_NAME :
5890 REG_RSN_RETURN_NULL);
5891 if (RExC_parse == name_start) {
5893 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5896 if (*RExC_parse != paren)
5897 vFAIL2("Sequence (?%c... not terminated",
5898 paren=='>' ? '<' : paren);
5902 if (!svname) /* shouldnt happen */
5904 "panic: reg_scan_name returned NULL");
5905 if (!RExC_paren_names) {
5906 RExC_paren_names= newHV();
5907 sv_2mortal(MUTABLE_SV(RExC_paren_names));
5909 RExC_paren_name_list= newAV();
5910 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
5913 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
5915 sv_dat = HeVAL(he_str);
5917 /* croak baby croak */
5919 "panic: paren_name hash element allocation failed");
5920 } else if ( SvPOK(sv_dat) ) {
5921 /* (?|...) can mean we have dupes so scan to check
5922 its already been stored. Maybe a flag indicating
5923 we are inside such a construct would be useful,
5924 but the arrays are likely to be quite small, so
5925 for now we punt -- dmq */
5926 IV count = SvIV(sv_dat);
5927 I32 *pv = (I32*)SvPVX(sv_dat);
5929 for ( i = 0 ; i < count ; i++ ) {
5930 if ( pv[i] == RExC_npar ) {
5936 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5937 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5938 pv[count] = RExC_npar;
5939 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
5942 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5943 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5945 SvIV_set(sv_dat, 1);
5948 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5949 SvREFCNT_dec(svname);
5952 /*sv_dump(sv_dat);*/
5954 nextchar(pRExC_state);
5956 goto capturing_parens;
5958 RExC_seen |= REG_SEEN_LOOKBEHIND;
5960 case '=': /* (?=...) */
5961 RExC_seen_zerolen++;
5963 case '!': /* (?!...) */
5964 RExC_seen_zerolen++;
5965 if (*RExC_parse == ')') {
5966 ret=reg_node(pRExC_state, OPFAIL);
5967 nextchar(pRExC_state);
5971 case '|': /* (?|...) */
5972 /* branch reset, behave like a (?:...) except that
5973 buffers in alternations share the same numbers */
5975 after_freeze = freeze_paren = RExC_npar;
5977 case ':': /* (?:...) */
5978 case '>': /* (?>...) */
5980 case '$': /* (?$...) */
5981 case '@': /* (?@...) */
5982 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
5984 case '#': /* (?#...) */
5985 while (*RExC_parse && *RExC_parse != ')')
5987 if (*RExC_parse != ')')
5988 FAIL("Sequence (?#... not terminated");
5989 nextchar(pRExC_state);
5992 case '0' : /* (?0) */
5993 case 'R' : /* (?R) */
5994 if (*RExC_parse != ')')
5995 FAIL("Sequence (?R) not terminated");
5996 ret = reg_node(pRExC_state, GOSTART);
5997 *flagp |= POSTPONED;
5998 nextchar(pRExC_state);
6001 { /* named and numeric backreferences */
6003 case '&': /* (?&NAME) */
6004 parse_start = RExC_parse - 1;
6007 SV *sv_dat = reg_scan_name(pRExC_state,
6008 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6009 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6011 goto gen_recurse_regop;
6014 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6016 vFAIL("Illegal pattern");
6018 goto parse_recursion;
6020 case '-': /* (?-1) */
6021 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
6022 RExC_parse--; /* rewind to let it be handled later */
6026 case '1': case '2': case '3': case '4': /* (?1) */
6027 case '5': case '6': case '7': case '8': case '9':
6030 num = atoi(RExC_parse);
6031 parse_start = RExC_parse - 1; /* MJD */
6032 if (*RExC_parse == '-')
6034 while (isDIGIT(*RExC_parse))
6036 if (*RExC_parse!=')')
6037 vFAIL("Expecting close bracket");
6040 if ( paren == '-' ) {
6042 Diagram of capture buffer numbering.
6043 Top line is the normal capture buffer numbers
6044 Bottom line is the negative indexing as from
6048 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
6052 num = RExC_npar + num;
6055 vFAIL("Reference to nonexistent group");
6057 } else if ( paren == '+' ) {
6058 num = RExC_npar + num - 1;
6061 ret = reganode(pRExC_state, GOSUB, num);
6063 if (num > (I32)RExC_rx->nparens) {
6065 vFAIL("Reference to nonexistent group");
6067 ARG2L_SET( ret, RExC_recurse_count++);
6069 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6070 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
6074 RExC_seen |= REG_SEEN_RECURSE;
6075 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
6076 Set_Node_Offset(ret, parse_start); /* MJD */
6078 *flagp |= POSTPONED;
6079 nextchar(pRExC_state);
6081 } /* named and numeric backreferences */
6084 case '?': /* (??...) */
6086 if (*RExC_parse != '{') {
6088 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6091 *flagp |= POSTPONED;
6092 paren = *RExC_parse++;
6094 case '{': /* (?{...}) */
6099 char *s = RExC_parse;
6101 RExC_seen_zerolen++;
6102 RExC_seen |= REG_SEEN_EVAL;
6103 while (count && (c = *RExC_parse)) {
6114 if (*RExC_parse != ')') {
6116 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
6120 OP_4tree *sop, *rop;
6121 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
6124 Perl_save_re_context(aTHX);
6125 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
6126 sop->op_private |= OPpREFCOUNTED;
6127 /* re_dup will OpREFCNT_inc */
6128 OpREFCNT_set(sop, 1);
6131 n = add_data(pRExC_state, 3, "nop");
6132 RExC_rxi->data->data[n] = (void*)rop;
6133 RExC_rxi->data->data[n+1] = (void*)sop;
6134 RExC_rxi->data->data[n+2] = (void*)pad;
6137 else { /* First pass */
6138 if (PL_reginterp_cnt < ++RExC_seen_evals
6140 /* No compiled RE interpolated, has runtime
6141 components ===> unsafe. */
6142 FAIL("Eval-group not allowed at runtime, use re 'eval'");
6143 if (PL_tainting && PL_tainted)
6144 FAIL("Eval-group in insecure regular expression");
6145 #if PERL_VERSION > 8
6146 if (IN_PERL_COMPILETIME)
6151 nextchar(pRExC_state);
6153 ret = reg_node(pRExC_state, LOGICAL);
6156 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
6157 /* deal with the length of this later - MJD */
6160 ret = reganode(pRExC_state, EVAL, n);
6161 Set_Node_Length(ret, RExC_parse - parse_start + 1);
6162 Set_Node_Offset(ret, parse_start);
6165 case '(': /* (?(?{...})...) and (?(?=...)...) */
6168 if (RExC_parse[0] == '?') { /* (?(?...)) */
6169 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
6170 || RExC_parse[1] == '<'
6171 || RExC_parse[1] == '{') { /* Lookahead or eval. */
6174 ret = reg_node(pRExC_state, LOGICAL);
6177 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
6181 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
6182 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
6184 char ch = RExC_parse[0] == '<' ? '>' : '\'';
6185 char *name_start= RExC_parse++;
6187 SV *sv_dat=reg_scan_name(pRExC_state,
6188 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6189 if (RExC_parse == name_start || *RExC_parse != ch)
6190 vFAIL2("Sequence (?(%c... not terminated",
6191 (ch == '>' ? '<' : ch));
6194 num = add_data( pRExC_state, 1, "S" );
6195 RExC_rxi->data->data[num]=(void*)sv_dat;
6196 SvREFCNT_inc_simple_void(sv_dat);
6198 ret = reganode(pRExC_state,NGROUPP,num);
6199 goto insert_if_check_paren;
6201 else if (RExC_parse[0] == 'D' &&
6202 RExC_parse[1] == 'E' &&
6203 RExC_parse[2] == 'F' &&
6204 RExC_parse[3] == 'I' &&
6205 RExC_parse[4] == 'N' &&
6206 RExC_parse[5] == 'E')
6208 ret = reganode(pRExC_state,DEFINEP,0);
6211 goto insert_if_check_paren;
6213 else if (RExC_parse[0] == 'R') {
6216 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6217 parno = atoi(RExC_parse++);
6218 while (isDIGIT(*RExC_parse))
6220 } else if (RExC_parse[0] == '&') {
6223 sv_dat = reg_scan_name(pRExC_state,
6224 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
6225 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
6227 ret = reganode(pRExC_state,INSUBP,parno);
6228 goto insert_if_check_paren;
6230 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
6233 parno = atoi(RExC_parse++);
6235 while (isDIGIT(*RExC_parse))
6237 ret = reganode(pRExC_state, GROUPP, parno);
6239 insert_if_check_paren:
6240 if ((c = *nextchar(pRExC_state)) != ')')
6241 vFAIL("Switch condition not recognized");
6243 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
6244 br = regbranch(pRExC_state, &flags, 1,depth+1);
6246 br = reganode(pRExC_state, LONGJMP, 0);
6248 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
6249 c = *nextchar(pRExC_state);
6254 vFAIL("(?(DEFINE)....) does not allow branches");
6255 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
6256 regbranch(pRExC_state, &flags, 1,depth+1);
6257 REGTAIL(pRExC_state, ret, lastbr);
6260 c = *nextchar(pRExC_state);
6265 vFAIL("Switch (?(condition)... contains too many branches");
6266 ender = reg_node(pRExC_state, TAIL);
6267 REGTAIL(pRExC_state, br, ender);
6269 REGTAIL(pRExC_state, lastbr, ender);
6270 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
6273 REGTAIL(pRExC_state, ret, ender);
6274 RExC_size++; /* XXX WHY do we need this?!!
6275 For large programs it seems to be required
6276 but I can't figure out why. -- dmq*/
6280 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
6284 RExC_parse--; /* for vFAIL to print correctly */
6285 vFAIL("Sequence (? incomplete");
6287 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
6289 has_use_defaults = TRUE;
6290 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
6291 if (RExC_utf8) { /* But the default for a utf8 pattern is
6292 unicode semantics */
6293 RExC_flags |= RXf_PMf_UNICODE;
6298 parse_flags: /* (?i) */
6300 U32 posflags = 0, negflags = 0;
6301 U32 *flagsp = &posflags;
6302 bool has_charset_modifier = 0;
6304 while (*RExC_parse) {
6305 /* && strchr("iogcmsx", *RExC_parse) */
6306 /* (?g), (?gc) and (?o) are useless here
6307 and must be globally applied -- japhy */
6308 switch (*RExC_parse) {
6309 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
6310 case LOCALE_PAT_MOD:
6311 if (has_charset_modifier || flagsp == &negflags) {
6312 goto fail_modifiers;
6314 posflags |= RXf_PMf_LOCALE;
6315 negflags |= RXf_PMf_UNICODE;
6316 has_charset_modifier = 1;
6318 case UNICODE_PAT_MOD:
6319 if (has_charset_modifier || flagsp == &negflags) {
6320 goto fail_modifiers;
6322 posflags |= RXf_PMf_UNICODE;
6323 negflags |= RXf_PMf_LOCALE;
6324 has_charset_modifier = 1;
6327 if (has_use_defaults
6328 || has_charset_modifier
6329 || flagsp == &negflags)
6331 goto fail_modifiers;
6334 /* The dual charset means unicode semantics if the
6335 * pattern (or target, not known until runtime) are
6338 posflags |= RXf_PMf_UNICODE;
6339 negflags |= RXf_PMf_LOCALE;
6342 negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
6344 has_charset_modifier = 1;
6346 case ONCE_PAT_MOD: /* 'o' */
6347 case GLOBAL_PAT_MOD: /* 'g' */
6348 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6349 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
6350 if (! (wastedflags & wflagbit) ) {
6351 wastedflags |= wflagbit;
6354 "Useless (%s%c) - %suse /%c modifier",
6355 flagsp == &negflags ? "?-" : "?",
6357 flagsp == &negflags ? "don't " : "",
6364 case CONTINUE_PAT_MOD: /* 'c' */
6365 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6366 if (! (wastedflags & WASTED_C) ) {
6367 wastedflags |= WASTED_GC;
6370 "Useless (%sc) - %suse /gc modifier",
6371 flagsp == &negflags ? "?-" : "?",
6372 flagsp == &negflags ? "don't " : ""
6377 case KEEPCOPY_PAT_MOD: /* 'p' */
6378 if (flagsp == &negflags) {
6380 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
6382 *flagsp |= RXf_PMf_KEEPCOPY;
6386 /* A flag is a default iff it is following a minus, so
6387 * if there is a minus, it means will be trying to
6388 * re-specify a default which is an error */
6389 if (has_use_defaults || flagsp == &negflags) {
6392 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6396 wastedflags = 0; /* reset so (?g-c) warns twice */
6402 RExC_flags |= posflags;
6403 RExC_flags &= ~negflags;
6405 oregflags |= posflags;
6406 oregflags &= ~negflags;
6408 nextchar(pRExC_state);
6419 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
6424 }} /* one for the default block, one for the switch */
6431 ret = reganode(pRExC_state, OPEN, parno);
6434 RExC_nestroot = parno;
6435 if (RExC_seen & REG_SEEN_RECURSE
6436 && !RExC_open_parens[parno-1])
6438 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6439 "Setting open paren #%"IVdf" to %d\n",
6440 (IV)parno, REG_NODE_NUM(ret)));
6441 RExC_open_parens[parno-1]= ret;
6444 Set_Node_Length(ret, 1); /* MJD */
6445 Set_Node_Offset(ret, RExC_parse); /* MJD */
6453 /* Pick up the branches, linking them together. */
6454 parse_start = RExC_parse; /* MJD */
6455 br = regbranch(pRExC_state, &flags, 1,depth+1);
6458 if (RExC_npar > after_freeze)
6459 after_freeze = RExC_npar;
6460 RExC_npar = freeze_paren;
6463 /* branch_len = (paren != 0); */
6467 if (*RExC_parse == '|') {
6468 if (!SIZE_ONLY && RExC_extralen) {
6469 reginsert(pRExC_state, BRANCHJ, br, depth+1);
6472 reginsert(pRExC_state, BRANCH, br, depth+1);
6473 Set_Node_Length(br, paren != 0);
6474 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
6478 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
6480 else if (paren == ':') {
6481 *flagp |= flags&SIMPLE;
6483 if (is_open) { /* Starts with OPEN. */
6484 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
6486 else if (paren != '?') /* Not Conditional */
6488 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6490 while (*RExC_parse == '|') {
6491 if (!SIZE_ONLY && RExC_extralen) {
6492 ender = reganode(pRExC_state, LONGJMP,0);
6493 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
6496 RExC_extralen += 2; /* Account for LONGJMP. */
6497 nextchar(pRExC_state);
6499 if (RExC_npar > after_freeze)
6500 after_freeze = RExC_npar;
6501 RExC_npar = freeze_paren;
6503 br = regbranch(pRExC_state, &flags, 0, depth+1);
6507 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
6509 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
6512 if (have_branch || paren != ':') {
6513 /* Make a closing node, and hook it on the end. */
6516 ender = reg_node(pRExC_state, TAIL);
6519 ender = reganode(pRExC_state, CLOSE, parno);
6520 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
6521 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
6522 "Setting close paren #%"IVdf" to %d\n",
6523 (IV)parno, REG_NODE_NUM(ender)));
6524 RExC_close_parens[parno-1]= ender;
6525 if (RExC_nestroot == parno)
6528 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
6529 Set_Node_Length(ender,1); /* MJD */
6535 *flagp &= ~HASWIDTH;
6538 ender = reg_node(pRExC_state, SUCCEED);
6541 ender = reg_node(pRExC_state, END);
6543 assert(!RExC_opend); /* there can only be one! */
6548 REGTAIL(pRExC_state, lastbr, ender);
6550 if (have_branch && !SIZE_ONLY) {
6552 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6554 /* Hook the tails of the branches to the closing node. */
6555 for (br = ret; br; br = regnext(br)) {
6556 const U8 op = PL_regkind[OP(br)];
6558 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
6560 else if (op == BRANCHJ) {
6561 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
6569 static const char parens[] = "=!<,>";
6571 if (paren && (p = strchr(parens, paren))) {
6572 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
6573 int flag = (p - parens) > 1;
6576 node = SUSPEND, flag = 0;
6577 reginsert(pRExC_state, node,ret, depth+1);
6578 Set_Node_Cur_Length(ret);
6579 Set_Node_Offset(ret, parse_start + 1);
6581 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
6585 /* Check for proper termination. */
6587 RExC_flags = oregflags;
6588 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
6589 RExC_parse = oregcomp_parse;
6590 vFAIL("Unmatched (");
6593 else if (!paren && RExC_parse < RExC_end) {
6594 if (*RExC_parse == ')') {
6596 vFAIL("Unmatched )");
6599 FAIL("Junk on end of regexp"); /* "Can't happen". */
6603 RExC_npar = after_freeze;
6608 - regbranch - one alternative of an | operator
6610 * Implements the concatenation operator.
6613 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
6616 register regnode *ret;
6617 register regnode *chain = NULL;
6618 register regnode *latest;
6619 I32 flags = 0, c = 0;
6620 GET_RE_DEBUG_FLAGS_DECL;
6622 PERL_ARGS_ASSERT_REGBRANCH;
6624 DEBUG_PARSE("brnc");
6629 if (!SIZE_ONLY && RExC_extralen)
6630 ret = reganode(pRExC_state, BRANCHJ,0);
6632 ret = reg_node(pRExC_state, BRANCH);
6633 Set_Node_Length(ret, 1);
6637 if (!first && SIZE_ONLY)
6638 RExC_extralen += 1; /* BRANCHJ */
6640 *flagp = WORST; /* Tentatively. */
6643 nextchar(pRExC_state);
6644 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
6646 latest = regpiece(pRExC_state, &flags,depth+1);
6647 if (latest == NULL) {
6648 if (flags & TRYAGAIN)
6652 else if (ret == NULL)
6654 *flagp |= flags&(HASWIDTH|POSTPONED);
6655 if (chain == NULL) /* First piece. */
6656 *flagp |= flags&SPSTART;
6659 REGTAIL(pRExC_state, chain, latest);
6664 if (chain == NULL) { /* Loop ran zero times. */
6665 chain = reg_node(pRExC_state, NOTHING);
6670 *flagp |= flags&SIMPLE;
6677 - regpiece - something followed by possible [*+?]
6679 * Note that the branching code sequences used for ? and the general cases
6680 * of * and + are somewhat optimized: they use the same NOTHING node as
6681 * both the endmarker for their branch list and the body of the last branch.
6682 * It might seem that this node could be dispensed with entirely, but the
6683 * endmarker role is not redundant.
6686 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
6689 register regnode *ret;
6691 register char *next;
6693 const char * const origparse = RExC_parse;
6695 I32 max = REG_INFTY;
6697 const char *maxpos = NULL;
6698 GET_RE_DEBUG_FLAGS_DECL;
6700 PERL_ARGS_ASSERT_REGPIECE;
6702 DEBUG_PARSE("piec");
6704 ret = regatom(pRExC_state, &flags,depth+1);
6706 if (flags & TRYAGAIN)
6713 if (op == '{' && regcurly(RExC_parse)) {
6715 parse_start = RExC_parse; /* MJD */
6716 next = RExC_parse + 1;
6717 while (isDIGIT(*next) || *next == ',') {
6726 if (*next == '}') { /* got one */
6730 min = atoi(RExC_parse);
6734 maxpos = RExC_parse;
6736 if (!max && *maxpos != '0')
6737 max = REG_INFTY; /* meaning "infinity" */
6738 else if (max >= REG_INFTY)
6739 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
6741 nextchar(pRExC_state);
6744 if ((flags&SIMPLE)) {
6745 RExC_naughty += 2 + RExC_naughty / 2;
6746 reginsert(pRExC_state, CURLY, ret, depth+1);
6747 Set_Node_Offset(ret, parse_start+1); /* MJD */
6748 Set_Node_Cur_Length(ret);
6751 regnode * const w = reg_node(pRExC_state, WHILEM);
6754 REGTAIL(pRExC_state, ret, w);
6755 if (!SIZE_ONLY && RExC_extralen) {
6756 reginsert(pRExC_state, LONGJMP,ret, depth+1);
6757 reginsert(pRExC_state, NOTHING,ret, depth+1);
6758 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
6760 reginsert(pRExC_state, CURLYX,ret, depth+1);
6762 Set_Node_Offset(ret, parse_start+1);
6763 Set_Node_Length(ret,
6764 op == '{' ? (RExC_parse - parse_start) : 1);
6766 if (!SIZE_ONLY && RExC_extralen)
6767 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
6768 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
6770 RExC_whilem_seen++, RExC_extralen += 3;
6771 RExC_naughty += 4 + RExC_naughty; /* compound interest */
6780 vFAIL("Can't do {n,m} with n > m");
6782 ARG1_SET(ret, (U16)min);
6783 ARG2_SET(ret, (U16)max);
6795 #if 0 /* Now runtime fix should be reliable. */
6797 /* if this is reinstated, don't forget to put this back into perldiag:
6799 =item Regexp *+ operand could be empty at {#} in regex m/%s/
6801 (F) The part of the regexp subject to either the * or + quantifier
6802 could match an empty string. The {#} shows in the regular
6803 expression about where the problem was discovered.
6807 if (!(flags&HASWIDTH) && op != '?')
6808 vFAIL("Regexp *+ operand could be empty");
6811 parse_start = RExC_parse;
6812 nextchar(pRExC_state);
6814 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
6816 if (op == '*' && (flags&SIMPLE)) {
6817 reginsert(pRExC_state, STAR, ret, depth+1);
6821 else if (op == '*') {
6825 else if (op == '+' && (flags&SIMPLE)) {
6826 reginsert(pRExC_state, PLUS, ret, depth+1);
6830 else if (op == '+') {
6834 else if (op == '?') {
6839 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
6840 ckWARN3reg(RExC_parse,
6841 "%.*s matches null string many times",
6842 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
6846 if (RExC_parse < RExC_end && *RExC_parse == '?') {
6847 nextchar(pRExC_state);
6848 reginsert(pRExC_state, MINMOD, ret, depth+1);
6849 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
6851 #ifndef REG_ALLOW_MINMOD_SUSPEND
6854 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6856 nextchar(pRExC_state);
6857 ender = reg_node(pRExC_state, SUCCEED);
6858 REGTAIL(pRExC_state, ret, ender);
6859 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6861 ender = reg_node(pRExC_state, TAIL);
6862 REGTAIL(pRExC_state, ret, ender);
6866 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
6868 vFAIL("Nested quantifiers");
6875 /* reg_namedseq(pRExC_state,UVp)
6877 This is expected to be called by a parser routine that has
6878 recognized '\N' and needs to handle the rest. RExC_parse is
6879 expected to point at the first char following the N at the time
6882 The \N may be inside (indicated by valuep not being NULL) or outside a
6885 \N may begin either a named sequence, or if outside a character class, mean
6886 to match a non-newline. For non single-quoted regexes, the tokenizer has
6887 attempted to decide which, and in the case of a named sequence converted it
6888 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
6889 where c1... are the characters in the sequence. For single-quoted regexes,
6890 the tokenizer passes the \N sequence through unchanged; this code will not
6891 attempt to determine this nor expand those. The net effect is that if the
6892 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
6893 signals that this \N occurrence means to match a non-newline.
6895 Only the \N{U+...} form should occur in a character class, for the same
6896 reason that '.' inside a character class means to just match a period: it
6897 just doesn't make sense.
6899 If valuep is non-null then it is assumed that we are parsing inside
6900 of a charclass definition and the first codepoint in the resolved
6901 string is returned via *valuep and the routine will return NULL.
6902 In this mode if a multichar string is returned from the charnames
6903 handler, a warning will be issued, and only the first char in the
6904 sequence will be examined. If the string returned is zero length
6905 then the value of *valuep is undefined and NON-NULL will
6906 be returned to indicate failure. (This will NOT be a valid pointer
6909 If valuep is null then it is assumed that we are parsing normal text and a
6910 new EXACT node is inserted into the program containing the resolved string,
6911 and a pointer to the new node is returned. But if the string is zero length
6912 a NOTHING node is emitted instead.
6914 On success RExC_parse is set to the char following the endbrace.
6915 Parsing failures will generate a fatal error via vFAIL(...)
6918 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
6920 char * endbrace; /* '}' following the name */
6921 regnode *ret = NULL;
6923 char* parse_start = RExC_parse - 2; /* points to the '\N' */
6927 GET_RE_DEBUG_FLAGS_DECL;
6929 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
6933 /* The [^\n] meaning of \N ignores spaces and comments under the /x
6934 * modifier. The other meaning does not */
6935 p = (RExC_flags & RXf_PMf_EXTENDED)
6936 ? regwhite( pRExC_state, RExC_parse )
6939 /* Disambiguate between \N meaning a named character versus \N meaning
6940 * [^\n]. The former is assumed when it can't be the latter. */
6941 if (*p != '{' || regcurly(p)) {
6944 /* no bare \N in a charclass */
6945 vFAIL("\\N in a character class must be a named character: \\N{...}");
6947 nextchar(pRExC_state);
6948 ret = reg_node(pRExC_state, REG_ANY);
6949 *flagp |= HASWIDTH|SIMPLE;
6952 Set_Node_Length(ret, 1); /* MJD */
6956 /* Here, we have decided it should be a named sequence */
6958 /* The test above made sure that the next real character is a '{', but
6959 * under the /x modifier, it could be separated by space (or a comment and
6960 * \n) and this is not allowed (for consistency with \x{...} and the
6961 * tokenizer handling of \N{NAME}). */
6962 if (*RExC_parse != '{') {
6963 vFAIL("Missing braces on \\N{}");
6966 RExC_parse++; /* Skip past the '{' */
6968 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
6969 || ! (endbrace == RExC_parse /* nothing between the {} */
6970 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
6971 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
6973 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
6974 vFAIL("\\N{NAME} must be resolved by the lexer");
6977 if (endbrace == RExC_parse) { /* empty: \N{} */
6979 RExC_parse = endbrace + 1;
6980 return reg_node(pRExC_state,NOTHING);
6984 ckWARNreg(RExC_parse,
6985 "Ignoring zero length \\N{} in character class"
6987 RExC_parse = endbrace + 1;
6990 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
6993 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
6994 RExC_parse += 2; /* Skip past the 'U+' */
6996 if (valuep) { /* In a bracketed char class */
6997 /* We only pay attention to the first char of
6998 multichar strings being returned. I kinda wonder
6999 if this makes sense as it does change the behaviour
7000 from earlier versions, OTOH that behaviour was broken
7001 as well. XXX Solution is to recharacterize as
7002 [rest-of-class]|multi1|multi2... */
7004 STRLEN length_of_hex;
7005 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7006 | PERL_SCAN_DISALLOW_PREFIX
7007 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7009 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
7010 if (endchar < endbrace) {
7011 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
7014 length_of_hex = (STRLEN)(endchar - RExC_parse);
7015 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
7017 /* The tokenizer should have guaranteed validity, but it's possible to
7018 * bypass it by using single quoting, so check */
7019 if (length_of_hex == 0
7020 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7022 RExC_parse += length_of_hex; /* Includes all the valid */
7023 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7024 ? UTF8SKIP(RExC_parse)
7026 /* Guard against malformed utf8 */
7027 if (RExC_parse >= endchar) RExC_parse = endchar;
7028 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7031 RExC_parse = endbrace + 1;
7032 if (endchar == endbrace) return NULL;
7034 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
7036 else { /* Not a char class */
7037 char *s; /* String to put in generated EXACT node */
7038 STRLEN len = 0; /* Its current byte length */
7039 char *endchar; /* Points to '.' or '}' ending cur char in the input
7042 ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
7050 /* Exact nodes can hold only a U8 length's of text = 255. Loop through
7051 * the input which is of the form now 'c1.c2.c3...}' until find the
7052 * ending brace or exceed length 255. The characters that exceed this
7053 * limit are dropped. The limit could be relaxed should it become
7054 * desirable by reparsing this as (?:\N{NAME}), so could generate
7055 * multiple EXACT nodes, as is done for just regular input. But this
7056 * is primarily a named character, and not intended to be a huge long
7057 * string, so 255 bytes should be good enough */
7059 STRLEN length_of_hex;
7060 I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
7061 | PERL_SCAN_DISALLOW_PREFIX
7062 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
7063 UV cp; /* Ord of current character */
7065 /* Code points are separated by dots. If none, there is only one
7066 * code point, and is terminated by the brace */
7067 endchar = RExC_parse + strcspn(RExC_parse, ".}");
7069 /* The values are Unicode even on EBCDIC machines */
7070 length_of_hex = (STRLEN)(endchar - RExC_parse);
7071 cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
7072 if ( length_of_hex == 0
7073 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
7075 RExC_parse += length_of_hex; /* Includes all the valid */
7076 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
7077 ? UTF8SKIP(RExC_parse)
7079 /* Guard against malformed utf8 */
7080 if (RExC_parse >= endchar) RExC_parse = endchar;
7081 vFAIL("Invalid hexadecimal number in \\N{U+...}");
7084 if (! FOLD) { /* Not folding, just append to the string */
7087 /* Quit before adding this character if would exceed limit */
7088 if (len + UNISKIP(cp) > U8_MAX) break;
7090 unilen = reguni(pRExC_state, cp, s);
7095 } else { /* Folding, output the folded equivalent */
7096 STRLEN foldlen,numlen;
7097 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7098 cp = toFOLD_uni(cp, tmpbuf, &foldlen);
7100 /* Quit before exceeding size limit */
7101 if (len + foldlen > U8_MAX) break;
7103 for (foldbuf = tmpbuf;
7107 cp = utf8_to_uvchr(foldbuf, &numlen);
7109 const STRLEN unilen = reguni(pRExC_state, cp, s);
7112 /* In EBCDIC the numlen and unilen can differ. */
7114 if (numlen >= foldlen)
7118 break; /* "Can't happen." */
7122 /* Point to the beginning of the next character in the sequence. */
7123 RExC_parse = endchar + 1;
7125 /* Quit if no more characters */
7126 if (RExC_parse >= endbrace) break;
7131 if (RExC_parse < endbrace) {
7132 ckWARNreg(RExC_parse - 1,
7133 "Using just the first characters returned by \\N{}");
7136 RExC_size += STR_SZ(len);
7139 RExC_emit += STR_SZ(len);
7142 RExC_parse = endbrace + 1;
7144 *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
7145 with malformed in t/re/pat_advanced.t */
7147 Set_Node_Cur_Length(ret); /* MJD */
7148 nextchar(pRExC_state);
7158 * It returns the code point in utf8 for the value in *encp.
7159 * value: a code value in the source encoding
7160 * encp: a pointer to an Encode object
7162 * If the result from Encode is not a single character,
7163 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
7166 S_reg_recode(pTHX_ const char value, SV **encp)
7169 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
7170 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
7171 const STRLEN newlen = SvCUR(sv);
7172 UV uv = UNICODE_REPLACEMENT;
7174 PERL_ARGS_ASSERT_REG_RECODE;
7178 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
7181 if (!newlen || numlen != newlen) {
7182 uv = UNICODE_REPLACEMENT;
7190 - regatom - the lowest level
7192 Try to identify anything special at the start of the pattern. If there
7193 is, then handle it as required. This may involve generating a single regop,
7194 such as for an assertion; or it may involve recursing, such as to
7195 handle a () structure.
7197 If the string doesn't start with something special then we gobble up
7198 as much literal text as we can.
7200 Once we have been able to handle whatever type of thing started the
7201 sequence, we return.
7203 Note: we have to be careful with escapes, as they can be both literal
7204 and special, and in the case of \10 and friends can either, depending
7205 on context. Specifically there are two seperate switches for handling
7206 escape sequences, with the one for handling literal escapes requiring
7207 a dummy entry for all of the special escapes that are actually handled
7212 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
7215 register regnode *ret = NULL;
7217 char *parse_start = RExC_parse;
7218 GET_RE_DEBUG_FLAGS_DECL;
7219 DEBUG_PARSE("atom");
7220 *flagp = WORST; /* Tentatively. */
7222 PERL_ARGS_ASSERT_REGATOM;
7225 switch ((U8)*RExC_parse) {
7227 RExC_seen_zerolen++;
7228 nextchar(pRExC_state);
7229 if (RExC_flags & RXf_PMf_MULTILINE)
7230 ret = reg_node(pRExC_state, MBOL);
7231 else if (RExC_flags & RXf_PMf_SINGLELINE)
7232 ret = reg_node(pRExC_state, SBOL);
7234 ret = reg_node(pRExC_state, BOL);
7235 Set_Node_Length(ret, 1); /* MJD */
7238 nextchar(pRExC_state);
7240 RExC_seen_zerolen++;
7241 if (RExC_flags & RXf_PMf_MULTILINE)
7242 ret = reg_node(pRExC_state, MEOL);
7243 else if (RExC_flags & RXf_PMf_SINGLELINE)
7244 ret = reg_node(pRExC_state, SEOL);
7246 ret = reg_node(pRExC_state, EOL);
7247 Set_Node_Length(ret, 1); /* MJD */
7250 nextchar(pRExC_state);
7251 if (RExC_flags & RXf_PMf_SINGLELINE)
7252 ret = reg_node(pRExC_state, SANY);
7254 ret = reg_node(pRExC_state, REG_ANY);
7255 *flagp |= HASWIDTH|SIMPLE;
7257 Set_Node_Length(ret, 1); /* MJD */
7261 char * const oregcomp_parse = ++RExC_parse;
7262 ret = regclass(pRExC_state,depth+1);
7263 if (*RExC_parse != ']') {
7264 RExC_parse = oregcomp_parse;
7265 vFAIL("Unmatched [");
7267 nextchar(pRExC_state);
7268 *flagp |= HASWIDTH|SIMPLE;
7269 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
7273 nextchar(pRExC_state);
7274 ret = reg(pRExC_state, 1, &flags,depth+1);
7276 if (flags & TRYAGAIN) {
7277 if (RExC_parse == RExC_end) {
7278 /* Make parent create an empty node if needed. */
7286 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
7290 if (flags & TRYAGAIN) {
7294 vFAIL("Internal urp");
7295 /* Supposed to be caught earlier. */
7298 if (!regcurly(RExC_parse)) {
7307 vFAIL("Quantifier follows nothing");
7309 case LATIN_SMALL_LETTER_SHARP_S:
7310 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7311 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7312 #if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
7313 #error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
7314 case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
7319 len=0; /* silence a spurious compiler warning */
7320 if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
7321 *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
7322 RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
7323 ret = reganode(pRExC_state, FOLDCHAR, cp);
7324 Set_Node_Length(ret, 1); /* MJD */
7325 nextchar(pRExC_state); /* kill whitespace under /x */
7333 This switch handles escape sequences that resolve to some kind
7334 of special regop and not to literal text. Escape sequnces that
7335 resolve to literal text are handled below in the switch marked
7338 Every entry in this switch *must* have a corresponding entry
7339 in the literal escape switch. However, the opposite is not
7340 required, as the default for this switch is to jump to the
7341 literal text handling code.
7343 switch ((U8)*++RExC_parse) {
7344 case LATIN_SMALL_LETTER_SHARP_S:
7345 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7346 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7348 /* Special Escapes */
7350 RExC_seen_zerolen++;
7351 ret = reg_node(pRExC_state, SBOL);
7353 goto finish_meta_pat;
7355 ret = reg_node(pRExC_state, GPOS);
7356 RExC_seen |= REG_SEEN_GPOS;
7358 goto finish_meta_pat;
7360 RExC_seen_zerolen++;
7361 ret = reg_node(pRExC_state, KEEPS);
7363 /* XXX:dmq : disabling in-place substitution seems to
7364 * be necessary here to avoid cases of memory corruption, as
7365 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
7367 RExC_seen |= REG_SEEN_LOOKBEHIND;
7368 goto finish_meta_pat;
7370 ret = reg_node(pRExC_state, SEOL);
7372 RExC_seen_zerolen++; /* Do not optimize RE away */
7373 goto finish_meta_pat;
7375 ret = reg_node(pRExC_state, EOS);
7377 RExC_seen_zerolen++; /* Do not optimize RE away */
7378 goto finish_meta_pat;
7380 ret = reg_node(pRExC_state, CANY);
7381 RExC_seen |= REG_SEEN_CANY;
7382 *flagp |= HASWIDTH|SIMPLE;
7383 goto finish_meta_pat;
7385 ret = reg_node(pRExC_state, CLUMP);
7387 goto finish_meta_pat;
7390 ret = reg_node(pRExC_state, (U8)(ALNUML));
7392 ret = reg_node(pRExC_state, (U8)(ALNUM));
7393 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7395 *flagp |= HASWIDTH|SIMPLE;
7396 goto finish_meta_pat;
7399 ret = reg_node(pRExC_state, (U8)(NALNUML));
7401 ret = reg_node(pRExC_state, (U8)(NALNUM));
7402 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7404 *flagp |= HASWIDTH|SIMPLE;
7405 goto finish_meta_pat;
7407 RExC_seen_zerolen++;
7408 RExC_seen |= REG_SEEN_LOOKBEHIND;
7410 ret = reg_node(pRExC_state, (U8)(BOUNDL));
7412 ret = reg_node(pRExC_state, (U8)(BOUND));
7413 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7416 goto finish_meta_pat;
7418 RExC_seen_zerolen++;
7419 RExC_seen |= REG_SEEN_LOOKBEHIND;
7421 ret = reg_node(pRExC_state, (U8)(NBOUNDL));
7423 ret = reg_node(pRExC_state, (U8)(NBOUND));
7424 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7427 goto finish_meta_pat;
7430 ret = reg_node(pRExC_state, (U8)(SPACEL));
7432 ret = reg_node(pRExC_state, (U8)(SPACE));
7433 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7435 *flagp |= HASWIDTH|SIMPLE;
7436 goto finish_meta_pat;
7439 ret = reg_node(pRExC_state, (U8)(NSPACEL));
7441 ret = reg_node(pRExC_state, (U8)(NSPACE));
7442 FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0;
7444 *flagp |= HASWIDTH|SIMPLE;
7445 goto finish_meta_pat;
7448 ret = reg_node(pRExC_state, (U8)(DIGITL));
7450 ret = reg_node(pRExC_state, (U8)(DIGIT));
7452 *flagp |= HASWIDTH|SIMPLE;
7453 goto finish_meta_pat;
7456 ret = reg_node(pRExC_state, (U8)(NDIGITL));
7458 ret = reg_node(pRExC_state, (U8)(NDIGIT));
7460 *flagp |= HASWIDTH|SIMPLE;
7461 goto finish_meta_pat;
7463 ret = reg_node(pRExC_state, LNBREAK);
7464 *flagp |= HASWIDTH|SIMPLE;
7465 goto finish_meta_pat;
7467 ret = reg_node(pRExC_state, HORIZWS);
7468 *flagp |= HASWIDTH|SIMPLE;
7469 goto finish_meta_pat;
7471 ret = reg_node(pRExC_state, NHORIZWS);
7472 *flagp |= HASWIDTH|SIMPLE;
7473 goto finish_meta_pat;
7475 ret = reg_node(pRExC_state, VERTWS);
7476 *flagp |= HASWIDTH|SIMPLE;
7477 goto finish_meta_pat;
7479 ret = reg_node(pRExC_state, NVERTWS);
7480 *flagp |= HASWIDTH|SIMPLE;
7482 nextchar(pRExC_state);
7483 Set_Node_Length(ret, 2); /* MJD */
7488 char* const oldregxend = RExC_end;
7490 char* parse_start = RExC_parse - 2;
7493 if (RExC_parse[1] == '{') {
7494 /* a lovely hack--pretend we saw [\pX] instead */
7495 RExC_end = strchr(RExC_parse, '}');
7497 const U8 c = (U8)*RExC_parse;
7499 RExC_end = oldregxend;
7500 vFAIL2("Missing right brace on \\%c{}", c);
7505 RExC_end = RExC_parse + 2;
7506 if (RExC_end > oldregxend)
7507 RExC_end = oldregxend;
7511 ret = regclass(pRExC_state,depth+1);
7513 RExC_end = oldregxend;
7516 Set_Node_Offset(ret, parse_start + 2);
7517 Set_Node_Cur_Length(ret);
7518 nextchar(pRExC_state);
7519 *flagp |= HASWIDTH|SIMPLE;
7523 /* Handle \N and \N{NAME} here and not below because it can be
7524 multicharacter. join_exact() will join them up later on.
7525 Also this makes sure that things like /\N{BLAH}+/ and
7526 \N{BLAH} being multi char Just Happen. dmq*/
7528 ret= reg_namedseq(pRExC_state, NULL, flagp);
7530 case 'k': /* Handle \k<NAME> and \k'NAME' */
7533 char ch= RExC_parse[1];
7534 if (ch != '<' && ch != '\'' && ch != '{') {
7536 vFAIL2("Sequence %.2s... not terminated",parse_start);
7538 /* this pretty much dupes the code for (?P=...) in reg(), if
7539 you change this make sure you change that */
7540 char* name_start = (RExC_parse += 2);
7542 SV *sv_dat = reg_scan_name(pRExC_state,
7543 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7544 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
7545 if (RExC_parse == name_start || *RExC_parse != ch)
7546 vFAIL2("Sequence %.3s... not terminated",parse_start);
7549 num = add_data( pRExC_state, 1, "S" );
7550 RExC_rxi->data->data[num]=(void*)sv_dat;
7551 SvREFCNT_inc_simple_void(sv_dat);
7555 ret = reganode(pRExC_state,
7566 /* override incorrect value set in reganode MJD */
7567 Set_Node_Offset(ret, parse_start+1);
7568 Set_Node_Cur_Length(ret); /* MJD */
7569 nextchar(pRExC_state);
7575 case '1': case '2': case '3': case '4':
7576 case '5': case '6': case '7': case '8': case '9':
7579 bool isg = *RExC_parse == 'g';
7584 if (*RExC_parse == '{') {
7588 if (*RExC_parse == '-') {
7592 if (hasbrace && !isDIGIT(*RExC_parse)) {
7593 if (isrel) RExC_parse--;
7595 goto parse_named_seq;
7597 num = atoi(RExC_parse);
7598 if (isg && num == 0)
7599 vFAIL("Reference to invalid group 0");
7601 num = RExC_npar - num;
7603 vFAIL("Reference to nonexistent or unclosed group");
7605 if (!isg && num > 9 && num >= RExC_npar)
7608 char * const parse_start = RExC_parse - 1; /* MJD */
7609 while (isDIGIT(*RExC_parse))
7611 if (parse_start == RExC_parse - 1)
7612 vFAIL("Unterminated \\g... pattern");
7614 if (*RExC_parse != '}')
7615 vFAIL("Unterminated \\g{...} pattern");
7619 if (num > (I32)RExC_rx->nparens)
7620 vFAIL("Reference to nonexistent group");
7623 ret = reganode(pRExC_state,
7634 /* override incorrect value set in reganode MJD */
7635 Set_Node_Offset(ret, parse_start+1);
7636 Set_Node_Cur_Length(ret); /* MJD */
7638 nextchar(pRExC_state);
7643 if (RExC_parse >= RExC_end)
7644 FAIL("Trailing \\");
7647 /* Do not generate "unrecognized" warnings here, we fall
7648 back into the quick-grab loop below */
7655 if (RExC_flags & RXf_PMf_EXTENDED) {
7656 if ( reg_skipcomment( pRExC_state ) )
7663 register STRLEN len;
7668 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
7670 parse_start = RExC_parse - 1;
7676 ret = reg_node(pRExC_state,
7677 (U8) ((! FOLD) ? EXACT
7685 for (len = 0, p = RExC_parse - 1;
7686 len < 127 && p < RExC_end;
7689 char * const oldp = p;
7691 if (RExC_flags & RXf_PMf_EXTENDED)
7692 p = regwhite( pRExC_state, p );
7694 case LATIN_SMALL_LETTER_SHARP_S:
7695 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7696 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7697 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7698 goto normal_default;
7708 /* Literal Escapes Switch
7710 This switch is meant to handle escape sequences that
7711 resolve to a literal character.
7713 Every escape sequence that represents something
7714 else, like an assertion or a char class, is handled
7715 in the switch marked 'Special Escapes' above in this
7716 routine, but also has an entry here as anything that
7717 isn't explicitly mentioned here will be treated as
7718 an unescaped equivalent literal.
7722 /* These are all the special escapes. */
7723 case LATIN_SMALL_LETTER_SHARP_S:
7724 case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
7725 case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
7726 if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
7727 goto normal_default;
7728 case 'A': /* Start assertion */
7729 case 'b': case 'B': /* Word-boundary assertion*/
7730 case 'C': /* Single char !DANGEROUS! */
7731 case 'd': case 'D': /* digit class */
7732 case 'g': case 'G': /* generic-backref, pos assertion */
7733 case 'h': case 'H': /* HORIZWS */
7734 case 'k': case 'K': /* named backref, keep marker */
7735 case 'N': /* named char sequence */
7736 case 'p': case 'P': /* Unicode property */
7737 case 'R': /* LNBREAK */
7738 case 's': case 'S': /* space class */
7739 case 'v': case 'V': /* VERTWS */
7740 case 'w': case 'W': /* word class */
7741 case 'X': /* eXtended Unicode "combining character sequence" */
7742 case 'z': case 'Z': /* End of line/string assertion */
7746 /* Anything after here is an escape that resolves to a
7747 literal. (Except digits, which may or may not)
7766 ender = ASCII_TO_NATIVE('\033');
7770 ender = ASCII_TO_NATIVE('\007');
7775 STRLEN brace_len = len;
7777 const char* error_msg;
7779 bool valid = grok_bslash_o(p,
7786 RExC_parse = p; /* going to die anyway; point
7787 to exact spot of failure */
7794 if (PL_encoding && ender < 0x100) {
7795 goto recode_encoding;
7804 char* const e = strchr(p, '}');
7808 vFAIL("Missing right brace on \\x{}");
7811 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7812 | PERL_SCAN_DISALLOW_PREFIX;
7813 STRLEN numlen = e - p - 1;
7814 ender = grok_hex(p + 1, &numlen, &flags, NULL);
7821 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
7823 ender = grok_hex(p, &numlen, &flags, NULL);
7826 if (PL_encoding && ender < 0x100)
7827 goto recode_encoding;
7831 ender = grok_bslash_c(*p++, SIZE_ONLY);
7833 case '0': case '1': case '2': case '3':case '4':
7834 case '5': case '6': case '7': case '8':case '9':
7836 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
7838 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
7840 ender = grok_oct(p, &numlen, &flags, NULL);
7850 if (PL_encoding && ender < 0x100)
7851 goto recode_encoding;
7855 SV* enc = PL_encoding;
7856 ender = reg_recode((const char)(U8)ender, &enc);
7857 if (!enc && SIZE_ONLY)
7858 ckWARNreg(p, "Invalid escape in the specified encoding");
7864 FAIL("Trailing \\");
7867 if (!SIZE_ONLY&& isALPHA(*p))
7868 ckWARN2reg(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
7869 goto normal_default;
7874 if (UTF8_IS_START(*p) && UTF) {
7876 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
7877 &numlen, UTF8_ALLOW_DEFAULT);
7884 if ( RExC_flags & RXf_PMf_EXTENDED)
7885 p = regwhite( pRExC_state, p );
7887 /* Prime the casefolded buffer. */
7888 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
7890 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
7895 /* Emit all the Unicode characters. */
7897 for (foldbuf = tmpbuf;
7899 foldlen -= numlen) {
7900 ender = utf8_to_uvchr(foldbuf, &numlen);
7902 const STRLEN unilen = reguni(pRExC_state, ender, s);
7905 /* In EBCDIC the numlen
7906 * and unilen can differ. */
7908 if (numlen >= foldlen)
7912 break; /* "Can't happen." */
7916 const STRLEN unilen = reguni(pRExC_state, ender, s);
7925 REGC((char)ender, s++);
7931 /* Emit all the Unicode characters. */
7933 for (foldbuf = tmpbuf;
7935 foldlen -= numlen) {
7936 ender = utf8_to_uvchr(foldbuf, &numlen);
7938 const STRLEN unilen = reguni(pRExC_state, ender, s);
7941 /* In EBCDIC the numlen
7942 * and unilen can differ. */
7944 if (numlen >= foldlen)
7952 const STRLEN unilen = reguni(pRExC_state, ender, s);
7961 REGC((char)ender, s++);
7965 Set_Node_Cur_Length(ret); /* MJD */
7966 nextchar(pRExC_state);
7968 /* len is STRLEN which is unsigned, need to copy to signed */
7971 vFAIL("Internal disaster");
7975 if (len == 1 && UNI_IS_INVARIANT(ender))
7979 RExC_size += STR_SZ(len);
7982 RExC_emit += STR_SZ(len);
7992 S_regwhite( RExC_state_t *pRExC_state, char *p )
7994 const char *e = RExC_end;
7996 PERL_ARGS_ASSERT_REGWHITE;
8001 else if (*p == '#') {
8010 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
8018 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
8019 Character classes ([:foo:]) can also be negated ([:^foo:]).
8020 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
8021 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
8022 but trigger failures because they are currently unimplemented. */
8024 #define POSIXCC_DONE(c) ((c) == ':')
8025 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
8026 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
8029 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
8032 I32 namedclass = OOB_NAMEDCLASS;
8034 PERL_ARGS_ASSERT_REGPPOSIXCC;
8036 if (value == '[' && RExC_parse + 1 < RExC_end &&
8037 /* I smell either [: or [= or [. -- POSIX has been here, right? */
8038 POSIXCC(UCHARAT(RExC_parse))) {
8039 const char c = UCHARAT(RExC_parse);
8040 char* const s = RExC_parse++;
8042 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
8044 if (RExC_parse == RExC_end)
8045 /* Grandfather lone [:, [=, [. */
8048 const char* const t = RExC_parse++; /* skip over the c */
8051 if (UCHARAT(RExC_parse) == ']') {
8052 const char *posixcc = s + 1;
8053 RExC_parse++; /* skip over the ending ] */
8056 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
8057 const I32 skip = t - posixcc;
8059 /* Initially switch on the length of the name. */
8062 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
8063 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
8066 /* Names all of length 5. */
8067 /* alnum alpha ascii blank cntrl digit graph lower
8068 print punct space upper */
8069 /* Offset 4 gives the best switch position. */
8070 switch (posixcc[4]) {
8072 if (memEQ(posixcc, "alph", 4)) /* alpha */
8073 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
8076 if (memEQ(posixcc, "spac", 4)) /* space */
8077 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
8080 if (memEQ(posixcc, "grap", 4)) /* graph */
8081 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
8084 if (memEQ(posixcc, "asci", 4)) /* ascii */
8085 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
8088 if (memEQ(posixcc, "blan", 4)) /* blank */
8089 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
8092 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
8093 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
8096 if (memEQ(posixcc, "alnu", 4)) /* alnum */
8097 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
8100 if (memEQ(posixcc, "lowe", 4)) /* lower */
8101 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
8102 else if (memEQ(posixcc, "uppe", 4)) /* upper */
8103 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
8106 if (memEQ(posixcc, "digi", 4)) /* digit */
8107 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
8108 else if (memEQ(posixcc, "prin", 4)) /* print */
8109 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
8110 else if (memEQ(posixcc, "punc", 4)) /* punct */
8111 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
8116 if (memEQ(posixcc, "xdigit", 6))
8117 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
8121 if (namedclass == OOB_NAMEDCLASS)
8122 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
8124 assert (posixcc[skip] == ':');
8125 assert (posixcc[skip+1] == ']');
8126 } else if (!SIZE_ONLY) {
8127 /* [[=foo=]] and [[.foo.]] are still future. */
8129 /* adjust RExC_parse so the warning shows after
8131 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
8133 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8136 /* Maternal grandfather:
8137 * "[:" ending in ":" but not in ":]" */
8147 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
8151 PERL_ARGS_ASSERT_CHECKPOSIXCC;
8153 if (POSIXCC(UCHARAT(RExC_parse))) {
8154 const char *s = RExC_parse;
8155 const char c = *s++;
8159 if (*s && c == *s && s[1] == ']') {
8161 "POSIX syntax [%c %c] belongs inside character classes",
8164 /* [[=foo=]] and [[.foo.]] are still future. */
8165 if (POSIXCC_NOTYET(c)) {
8166 /* adjust RExC_parse so the error shows after
8168 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
8170 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
8176 /* No locale test */
8177 #define _C_C_T_NOLOC_(NAME,TEST,WORD) \
8179 for (value = 0; value < 256; value++) \
8181 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8185 case ANYOF_N##NAME: \
8186 for (value = 0; value < 256; value++) \
8188 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8193 /* Like the above, but there are differences if we are in uni-8-bit or not, so
8194 * there are two tests passed in, to use depending on that. There aren't any
8195 * cases where the label is different from the name, so no need for that
8197 #define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
8199 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
8200 else if (UNI_SEMANTICS) { \
8201 for (value = 0; value < 256; value++) { \
8202 if (TEST_8) stored += \
8203 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8207 for (value = 0; value < 128; value++) { \
8208 if (TEST_7) stored += \
8209 S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
8215 case ANYOF_N##NAME: \
8216 if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
8217 else if (UNI_SEMANTICS) { \
8218 for (value = 0; value < 256; value++) { \
8219 if (! TEST_8) stored += \
8220 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8224 for (value = 0; value < 128; value++) { \
8225 if (! TEST_7) stored += \
8226 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8228 for (value = 128; value < 256; value++) { \
8229 S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
8237 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
8238 so that it is possible to override the option here without having to
8239 rebuild the entire core. as we are required to do if we change regcomp.h
8240 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
8242 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
8243 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
8246 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8247 #define POSIX_CC_UNI_NAME(CCNAME) CCNAME
8249 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
8253 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8256 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
8257 * Locale folding is done at run-time, so this function should not be
8258 * called for nodes that are for locales.
8260 * This function simply sets the bit corresponding to the fold of the input
8261 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
8264 * It also sets any necessary flags, and returns the number of bits that
8265 * actually changed from 0 to 1 */
8270 fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
8273 /* It assumes the bit for 'value' has already been set */
8274 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
8275 ANYOF_BITMAP_SET(node, fold);
8279 /* The fold of the German sharp s is two ASCII characters, so isn't in the
8280 * bitmap and doesn't have to be in utf8, but we only process it if unicode
8281 * semantics are called for */
8282 if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
8283 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
8285 else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
8288 && PL_fold_latin1[value] != value))
8289 { /* A character that has a fold outside of Latin1 matches outside the
8290 bitmap, but only when the target string is utf8. Similarly when we
8291 don't have unicode semantics for the above ASCII Latin-1 characters,
8292 and they have a fold, they should match if the target is utf8, and
8294 ANYOF_FLAGS(node) |= ANYOF_UTF8;
8301 PERL_STATIC_INLINE U8
8302 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
8304 /* This inline function sets a bit in the bitmap if not already set, and if
8305 * appropriate, its fold, returning the number of bits that actually
8306 * changed from 0 to 1 */
8310 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
8314 ANYOF_BITMAP_SET(node, value);
8317 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
8318 stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
8325 parse a class specification and produce either an ANYOF node that
8326 matches the pattern or if the pattern matches a single char only and
8327 that char is < 256 and we are case insensitive then we produce an
8332 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
8335 register UV nextvalue;
8336 register IV prevvalue = OOB_UNICODE;
8337 register IV range = 0;
8338 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
8339 register regnode *ret;
8342 char *rangebegin = NULL;
8343 bool need_class = 0;
8346 AV* unicode_alternate = NULL;
8348 UV literal_endpoint = 0;
8350 UV stored = 0; /* how many chars stored in the bitmap */
8352 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
8353 case we need to change the emitted regop to an EXACT. */
8354 const char * orig_parse = RExC_parse;
8355 GET_RE_DEBUG_FLAGS_DECL;
8357 PERL_ARGS_ASSERT_REGCLASS;
8359 PERL_UNUSED_ARG(depth);
8362 DEBUG_PARSE("clas");
8364 /* Assume we are going to generate an ANYOF node. */
8365 ret = reganode(pRExC_state, ANYOF, 0);
8368 ANYOF_FLAGS(ret) = 0;
8370 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
8374 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
8378 RExC_size += ANYOF_SKIP;
8379 #ifdef ANYOF_ADD_LOC_SKIP
8381 RExC_size += ANYOF_ADD_LOC_SKIP;
8384 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
8387 RExC_emit += ANYOF_SKIP;
8389 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
8390 #ifdef ANYOF_ADD_LOC_SKIP
8391 RExC_emit += ANYOF_ADD_LOC_SKIP;
8394 ANYOF_BITMAP_ZERO(ret);
8395 listsv = newSVpvs("# comment\n");
8398 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8400 if (!SIZE_ONLY && POSIXCC(nextvalue))
8401 checkposixcc(pRExC_state);
8403 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
8404 if (UCHARAT(RExC_parse) == ']')
8408 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
8412 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
8415 rangebegin = RExC_parse;
8417 value = utf8n_to_uvchr((U8*)RExC_parse,
8418 RExC_end - RExC_parse,
8419 &numlen, UTF8_ALLOW_DEFAULT);
8420 RExC_parse += numlen;
8423 value = UCHARAT(RExC_parse++);
8425 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
8426 if (value == '[' && POSIXCC(nextvalue))
8427 namedclass = regpposixcc(pRExC_state, value);
8428 else if (value == '\\') {
8430 value = utf8n_to_uvchr((U8*)RExC_parse,
8431 RExC_end - RExC_parse,
8432 &numlen, UTF8_ALLOW_DEFAULT);
8433 RExC_parse += numlen;
8436 value = UCHARAT(RExC_parse++);
8437 /* Some compilers cannot handle switching on 64-bit integer
8438 * values, therefore value cannot be an UV. Yes, this will
8439 * be a problem later if we want switch on Unicode.
8440 * A similar issue a little bit later when switching on
8441 * namedclass. --jhi */
8442 switch ((I32)value) {
8443 case 'w': namedclass = ANYOF_ALNUM; break;
8444 case 'W': namedclass = ANYOF_NALNUM; break;
8445 case 's': namedclass = ANYOF_SPACE; break;
8446 case 'S': namedclass = ANYOF_NSPACE; break;
8447 case 'd': namedclass = ANYOF_DIGIT; break;
8448 case 'D': namedclass = ANYOF_NDIGIT; break;
8449 case 'v': namedclass = ANYOF_VERTWS; break;
8450 case 'V': namedclass = ANYOF_NVERTWS; break;
8451 case 'h': namedclass = ANYOF_HORIZWS; break;
8452 case 'H': namedclass = ANYOF_NHORIZWS; break;
8453 case 'N': /* Handle \N{NAME} in class */
8455 /* We only pay attention to the first char of
8456 multichar strings being returned. I kinda wonder
8457 if this makes sense as it does change the behaviour
8458 from earlier versions, OTOH that behaviour was broken
8460 UV v; /* value is register so we cant & it /grrr */
8461 if (reg_namedseq(pRExC_state, &v, NULL)) {
8471 if (RExC_parse >= RExC_end)
8472 vFAIL2("Empty \\%c{}", (U8)value);
8473 if (*RExC_parse == '{') {
8474 const U8 c = (U8)value;
8475 e = strchr(RExC_parse++, '}');
8477 vFAIL2("Missing right brace on \\%c{}", c);
8478 while (isSPACE(UCHARAT(RExC_parse)))
8480 if (e == RExC_parse)
8481 vFAIL2("Empty \\%c{}", c);
8483 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
8491 if (UCHARAT(RExC_parse) == '^') {
8494 value = value == 'p' ? 'P' : 'p'; /* toggle */
8495 while (isSPACE(UCHARAT(RExC_parse))) {
8500 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
8501 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
8505 /* The \p could match something in the Latin1 range, hence
8506 * something that isn't utf8 */
8507 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
8508 namedclass = ANYOF_MAX; /* no official name, but it's named */
8511 case 'n': value = '\n'; break;
8512 case 'r': value = '\r'; break;
8513 case 't': value = '\t'; break;
8514 case 'f': value = '\f'; break;
8515 case 'b': value = '\b'; break;
8516 case 'e': value = ASCII_TO_NATIVE('\033');break;
8517 case 'a': value = ASCII_TO_NATIVE('\007');break;
8519 RExC_parse--; /* function expects to be pointed at the 'o' */
8521 const char* error_msg;
8522 bool valid = grok_bslash_o(RExC_parse,
8527 RExC_parse += numlen;
8532 if (PL_encoding && value < 0x100) {
8533 goto recode_encoding;
8537 if (*RExC_parse == '{') {
8538 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8539 | PERL_SCAN_DISALLOW_PREFIX;
8540 char * const e = strchr(RExC_parse++, '}');
8542 vFAIL("Missing right brace on \\x{}");
8544 numlen = e - RExC_parse;
8545 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8549 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
8551 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
8552 RExC_parse += numlen;
8554 if (PL_encoding && value < 0x100)
8555 goto recode_encoding;
8558 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
8560 case '0': case '1': case '2': case '3': case '4':
8561 case '5': case '6': case '7':
8563 /* Take 1-3 octal digits */
8564 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
8566 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
8567 RExC_parse += numlen;
8568 if (PL_encoding && value < 0x100)
8569 goto recode_encoding;
8574 SV* enc = PL_encoding;
8575 value = reg_recode((const char)(U8)value, &enc);
8576 if (!enc && SIZE_ONLY)
8577 ckWARNreg(RExC_parse,
8578 "Invalid escape in the specified encoding");
8582 /* Allow \_ to not give an error */
8583 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
8584 ckWARN2reg(RExC_parse,
8585 "Unrecognized escape \\%c in character class passed through",
8590 } /* end of \blah */
8596 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
8598 /* What matches in a locale is not known until runtime, so need to
8599 * (one time per class) allocate extra space to pass to regexec.
8600 * The space will contain a bit for each named class that is to be
8601 * matched against. This isn't needed for \p{} and pseudo-classes,
8602 * as they are not affected by locale, and hence are dealt with
8604 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
8607 #ifdef ANYOF_CLASS_ADD_SKIP
8608 RExC_size += ANYOF_CLASS_ADD_SKIP;
8612 #ifdef ANYOF_CLASS_ADD_SKIP
8613 RExC_emit += ANYOF_CLASS_ADD_SKIP;
8615 ANYOF_CLASS_ZERO(ret);
8617 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
8620 /* a bad range like a-\d, a-[:digit:] ? */
8624 RExC_parse >= rangebegin ?
8625 RExC_parse - rangebegin : 0;
8626 ckWARN4reg(RExC_parse,
8627 "False [] range \"%*.*s\"",
8630 if (prevvalue < 256) {
8632 S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
8634 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8637 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8638 Perl_sv_catpvf(aTHX_ listsv,
8639 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
8643 range = 0; /* this was not a true range */
8649 const char *what = NULL;
8652 /* Possible truncation here but in some 64-bit environments
8653 * the compiler gets heartburn about switch on 64-bit values.
8654 * A similar issue a little earlier when switching on value.
8656 switch ((I32)namedclass) {
8658 case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
8659 case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
8660 case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
8661 case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
8662 case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
8663 case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
8664 case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
8665 case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
8666 case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
8667 case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
8668 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
8669 /* \s, \w match all unicode if utf8. */
8670 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
8671 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
8673 /* \s, \w match ascii and locale only */
8674 case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
8675 case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
8677 case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
8678 case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
8679 case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
8682 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
8684 for (value = 0; value < 128; value++)
8686 S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
8689 what = NULL; /* Doesn't match outside ascii, so
8690 don't want to add +utf8:: */
8694 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
8696 for (value = 128; value < 256; value++)
8698 S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
8705 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
8707 /* consecutive digits assumed */
8708 for (value = '0'; value <= '9'; value++)
8710 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8713 what = POSIX_CC_UNI_NAME("Digit");
8717 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
8719 /* consecutive digits assumed */
8720 for (value = 0; value < '0'; value++)
8722 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8723 for (value = '9' + 1; value < 256; value++)
8725 S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
8728 what = POSIX_CC_UNI_NAME("Digit");
8731 /* this is to handle \p and \P */
8734 vFAIL("Invalid [::] class");
8738 /* Strings such as "+utf8::isWord\n" */
8739 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
8740 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
8745 } /* end of namedclass \blah */
8748 if (prevvalue > (IV)value) /* b-a */ {
8749 const int w = RExC_parse - rangebegin;
8750 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
8751 range = 0; /* not a valid range */
8755 prevvalue = value; /* save the beginning of the range */
8756 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
8757 RExC_parse[1] != ']') {
8760 /* a bad range like \w-, [:word:]- ? */
8761 if (namedclass > OOB_NAMEDCLASS) {
8762 if (ckWARN(WARN_REGEXP)) {
8764 RExC_parse >= rangebegin ?
8765 RExC_parse - rangebegin : 0;
8767 "False [] range \"%*.*s\"",
8772 S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
8774 range = 1; /* yeah, it's a range! */
8775 continue; /* but do it the next time */
8779 /* now is the next time */
8781 if (prevvalue < 256) {
8782 const IV ceilvalue = value < 256 ? value : 255;
8785 /* In EBCDIC [\x89-\x91] should include
8786 * the \x8e but [i-j] should not. */
8787 if (literal_endpoint == 2 &&
8788 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
8789 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
8791 if (isLOWER(prevvalue)) {
8792 for (i = prevvalue; i <= ceilvalue; i++)
8793 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8795 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8798 for (i = prevvalue; i <= ceilvalue; i++)
8799 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
8801 S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8807 for (i = prevvalue; i <= ceilvalue; i++) {
8808 stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
8811 if (value > 255 || UTF) {
8812 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
8813 const UV natvalue = NATIVE_TO_UNI(value);
8815 /* If the code point requires utf8 to represent, and we are not
8816 * folding, it can't match unless the target is in utf8. Only
8817 * a few code points above 255 fold to below it, so XXX an
8818 * optimization would be to know which ones and set the flag
8820 ANYOF_FLAGS(ret) |= (FOLD || value < 256)
8823 if (prevnatvalue < natvalue) { /* what about > ? */
8824 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
8825 prevnatvalue, natvalue);
8827 else if (prevnatvalue == natvalue) {
8828 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
8830 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
8832 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
8834 #ifdef EBCDIC /* RD t/uni/fold ff and 6b */
8835 if (RExC_precomp[0] == ':' &&
8836 RExC_precomp[1] == '[' &&
8837 (f == 0xDF || f == 0x92)) {
8838 f = NATIVE_TO_UNI(f);
8841 /* If folding and foldable and a single
8842 * character, insert also the folded version
8843 * to the charclass. */
8845 #ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
8846 if ((RExC_precomp[0] == ':' &&
8847 RExC_precomp[1] == '[' &&
8849 (value == 0xFB05 || value == 0xFB06))) ?
8850 foldlen == ((STRLEN)UNISKIP(f) - 1) :
8851 foldlen == (STRLEN)UNISKIP(f) )
8853 if (foldlen == (STRLEN)UNISKIP(f))
8855 Perl_sv_catpvf(aTHX_ listsv,
8858 /* Any multicharacter foldings
8859 * require the following transform:
8860 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
8861 * where E folds into "pq" and F folds
8862 * into "rst", all other characters
8863 * fold to single characters. We save
8864 * away these multicharacter foldings,
8865 * to be later saved as part of the
8866 * additional "s" data. */
8869 if (!unicode_alternate)
8870 unicode_alternate = newAV();
8871 sv = newSVpvn_utf8((char*)foldbuf, foldlen,
8873 av_push(unicode_alternate, sv);
8877 /* If folding and the value is one of the Greek
8878 * sigmas insert a few more sigmas to make the
8879 * folding rules of the sigmas to work right.
8880 * Note that not all the possible combinations
8881 * are handled here: some of them are handled
8882 * by the standard folding rules, and some of
8883 * them (literal or EXACTF cases) are handled
8884 * during runtime in regexec.c:S_find_byclass(). */
8885 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
8886 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8887 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
8888 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8889 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8891 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
8892 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
8893 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
8898 literal_endpoint = 0;
8902 range = 0; /* this range (if it was one) is done now */
8909 /****** !SIZE_ONLY AFTER HERE *********/
8911 /* Folding in the bitmap is taken care of above, but not for locale, for
8912 * which we have to wait to see what folding is in effect at runtime, and
8913 * for things not in the bitmap */
8914 if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
8915 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
8918 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that this doesn't
8919 * optimize locale. Doing so perhaps could be done as long as there is
8920 * nothing like \w in it; some thought also would have to be given to the
8921 * interaction with above 0x100 chars */
8922 if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
8923 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
8924 ANYOF_BITMAP(ret)[value] ^= 0xFF;
8925 stored = 256 - stored;
8927 /* The inversion means that everything above 255 is matched */
8928 ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
8931 /* A single character class can be "optimized" into an EXACTish node.
8932 * Note that since we don't currently count how many characters there are
8933 * outside the bitmap, we are XXX missing optimization possibilities for
8934 * them. This optimization can't happen unless this is a truly single
8935 * character class, which means that it can't be an inversion into a
8936 * many-character class, and there must be no possibility of there being
8937 * things outside the bitmap. 'stored' (only) for locales doesn't include
8938 * \w, etc, so have to make a special test that they aren't present */
8939 if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
8940 && ((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
8941 || (! ANYOF_CLASS_TEST_ANY_SET(ret))))))
8943 /* Note that the information needed to decide to do this optimization
8944 * is not currently available until the 2nd pass, and that the actually
8945 * used EXACT node takes less space than the calculated ANYOF node, and
8946 * hence the amount of space calculated in the first pass is larger
8947 * than actually used, so this optimization doesn't gain us any space.
8948 * But an EXACT node is faster than an ANYOF node, and can be combined
8949 * with any adjacent EXACT nodes later by the optimizer for further
8952 const char * cur_parse= RExC_parse;
8953 RExC_emit = (regnode *)orig_emit;
8954 RExC_parse = (char *)orig_parse;
8956 /* (A locale node can have 1 point and be folded; all the other folds
8957 * will include the fold, hence will have 2 points, so we won't get
8958 * here with ANYOF_FOLD set unless it is also locale) */
8959 ret = reg_node(pRExC_state, (U8) (! (ANYOF_FLAGS(ret) & ANYOF_FOLD))
8963 RExC_parse = (char *)cur_parse;
8964 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
8965 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
8966 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
8968 RExC_emit += STR_SZ(2);
8971 *STRING(ret)= (char)value;
8973 RExC_emit += STR_SZ(1);
8975 SvREFCNT_dec(listsv);
8978 /* (A 2-character class of the very special form like [aA] could be
8979 * optimized into an EXACTFish node, but only for non-locales, and for
8980 * characters which only have the two folds; so things like 'fF' and
8981 * 'Ii' wouldn't work because of the fold of 'LATIN SMALL LIGATURE FI'.
8982 * Since we don't have that information currently conveniently
8983 * available, skip the optimization) */
8987 AV * const av = newAV();
8989 /* The 0th element stores the character class description
8990 * in its textual form: used later (regexec.c:Perl_regclass_swash())
8991 * to initialize the appropriate swash (which gets stored in
8992 * the 1st element), and also useful for dumping the regnode.
8993 * The 2nd element stores the multicharacter foldings,
8994 * used later (regexec.c:S_reginclass()). */
8995 av_store(av, 0, listsv);
8996 av_store(av, 1, NULL);
8997 av_store(av, 2, MUTABLE_SV(unicode_alternate));
8998 rv = newRV_noinc(MUTABLE_SV(av));
8999 n = add_data(pRExC_state, 1, "s");
9000 RExC_rxi->data->data[n] = (void*)rv;
9008 /* reg_skipcomment()
9010 Absorbs an /x style # comments from the input stream.
9011 Returns true if there is more text remaining in the stream.
9012 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
9013 terminates the pattern without including a newline.
9015 Note its the callers responsibility to ensure that we are
9021 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
9025 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
9027 while (RExC_parse < RExC_end)
9028 if (*RExC_parse++ == '\n') {
9033 /* we ran off the end of the pattern without ending
9034 the comment, so we have to add an \n when wrapping */
9035 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
9043 Advances the parse position, and optionally absorbs
9044 "whitespace" from the inputstream.
9046 Without /x "whitespace" means (?#...) style comments only,
9047 with /x this means (?#...) and # comments and whitespace proper.
9049 Returns the RExC_parse point from BEFORE the scan occurs.
9051 This is the /x friendly way of saying RExC_parse++.
9055 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
9057 char* const retval = RExC_parse++;
9059 PERL_ARGS_ASSERT_NEXTCHAR;
9062 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
9063 RExC_parse[2] == '#') {
9064 while (*RExC_parse != ')') {
9065 if (RExC_parse == RExC_end)
9066 FAIL("Sequence (?#... not terminated");
9072 if (RExC_flags & RXf_PMf_EXTENDED) {
9073 if (isSPACE(*RExC_parse)) {
9077 else if (*RExC_parse == '#') {
9078 if ( reg_skipcomment( pRExC_state ) )
9087 - reg_node - emit a node
9089 STATIC regnode * /* Location. */
9090 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
9093 register regnode *ptr;
9094 regnode * const ret = RExC_emit;
9095 GET_RE_DEBUG_FLAGS_DECL;
9097 PERL_ARGS_ASSERT_REG_NODE;
9100 SIZE_ALIGN(RExC_size);
9104 if (RExC_emit >= RExC_emit_bound)
9105 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9107 NODE_ALIGN_FILL(ret);
9109 FILL_ADVANCE_NODE(ptr, op);
9110 #ifdef RE_TRACK_PATTERN_OFFSETS
9111 if (RExC_offsets) { /* MJD */
9112 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
9113 "reg_node", __LINE__,
9115 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
9116 ? "Overwriting end of array!\n" : "OK",
9117 (UV)(RExC_emit - RExC_emit_start),
9118 (UV)(RExC_parse - RExC_start),
9119 (UV)RExC_offsets[0]));
9120 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
9128 - reganode - emit a node with an argument
9130 STATIC regnode * /* Location. */
9131 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
9134 register regnode *ptr;
9135 regnode * const ret = RExC_emit;
9136 GET_RE_DEBUG_FLAGS_DECL;
9138 PERL_ARGS_ASSERT_REGANODE;
9141 SIZE_ALIGN(RExC_size);
9146 assert(2==regarglen[op]+1);
9148 Anything larger than this has to allocate the extra amount.
9149 If we changed this to be:
9151 RExC_size += (1 + regarglen[op]);
9153 then it wouldn't matter. Its not clear what side effect
9154 might come from that so its not done so far.
9159 if (RExC_emit >= RExC_emit_bound)
9160 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
9162 NODE_ALIGN_FILL(ret);
9164 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
9165 #ifdef RE_TRACK_PATTERN_OFFSETS
9166 if (RExC_offsets) { /* MJD */
9167 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9171 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
9172 "Overwriting end of array!\n" : "OK",
9173 (UV)(RExC_emit - RExC_emit_start),
9174 (UV)(RExC_parse - RExC_start),
9175 (UV)RExC_offsets[0]));
9176 Set_Cur_Node_Offset;
9184 - reguni - emit (if appropriate) a Unicode character
9187 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
9191 PERL_ARGS_ASSERT_REGUNI;
9193 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
9197 - reginsert - insert an operator in front of already-emitted operand
9199 * Means relocating the operand.
9202 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
9205 register regnode *src;
9206 register regnode *dst;
9207 register regnode *place;
9208 const int offset = regarglen[(U8)op];
9209 const int size = NODE_STEP_REGNODE + offset;
9210 GET_RE_DEBUG_FLAGS_DECL;
9212 PERL_ARGS_ASSERT_REGINSERT;
9213 PERL_UNUSED_ARG(depth);
9214 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
9215 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
9224 if (RExC_open_parens) {
9226 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
9227 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
9228 if ( RExC_open_parens[paren] >= opnd ) {
9229 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
9230 RExC_open_parens[paren] += size;
9232 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
9234 if ( RExC_close_parens[paren] >= opnd ) {
9235 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
9236 RExC_close_parens[paren] += size;
9238 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
9243 while (src > opnd) {
9244 StructCopy(--src, --dst, regnode);
9245 #ifdef RE_TRACK_PATTERN_OFFSETS
9246 if (RExC_offsets) { /* MJD 20010112 */
9247 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
9251 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
9252 ? "Overwriting end of array!\n" : "OK",
9253 (UV)(src - RExC_emit_start),
9254 (UV)(dst - RExC_emit_start),
9255 (UV)RExC_offsets[0]));
9256 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
9257 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
9263 place = opnd; /* Op node, where operand used to be. */
9264 #ifdef RE_TRACK_PATTERN_OFFSETS
9265 if (RExC_offsets) { /* MJD */
9266 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
9270 (UV)(place - RExC_emit_start) > RExC_offsets[0]
9271 ? "Overwriting end of array!\n" : "OK",
9272 (UV)(place - RExC_emit_start),
9273 (UV)(RExC_parse - RExC_start),
9274 (UV)RExC_offsets[0]));
9275 Set_Node_Offset(place, RExC_parse);
9276 Set_Node_Length(place, 1);
9279 src = NEXTOPER(place);
9280 FILL_ADVANCE_NODE(place, op);
9281 Zero(src, offset, regnode);
9285 - regtail - set the next-pointer at the end of a node chain of p to val.
9286 - SEE ALSO: regtail_study
9288 /* TODO: All three parms should be const */
9290 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9293 register regnode *scan;
9294 GET_RE_DEBUG_FLAGS_DECL;
9296 PERL_ARGS_ASSERT_REGTAIL;
9298 PERL_UNUSED_ARG(depth);
9304 /* Find last node. */
9307 regnode * const temp = regnext(scan);
9309 SV * const mysv=sv_newmortal();
9310 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
9311 regprop(RExC_rx, mysv, scan);
9312 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
9313 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
9314 (temp == NULL ? "->" : ""),
9315 (temp == NULL ? PL_reg_name[OP(val)] : "")
9323 if (reg_off_by_arg[OP(scan)]) {
9324 ARG_SET(scan, val - scan);
9327 NEXT_OFF(scan) = val - scan;
9333 - regtail_study - set the next-pointer at the end of a node chain of p to val.
9334 - Look for optimizable sequences at the same time.
9335 - currently only looks for EXACT chains.
9337 This is expermental code. The idea is to use this routine to perform
9338 in place optimizations on branches and groups as they are constructed,
9339 with the long term intention of removing optimization from study_chunk so
9340 that it is purely analytical.
9342 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
9343 to control which is which.
9346 /* TODO: All four parms should be const */
9349 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
9352 register regnode *scan;
9354 #ifdef EXPERIMENTAL_INPLACESCAN
9357 GET_RE_DEBUG_FLAGS_DECL;
9359 PERL_ARGS_ASSERT_REGTAIL_STUDY;
9365 /* Find last node. */
9369 regnode * const temp = regnext(scan);
9370 #ifdef EXPERIMENTAL_INPLACESCAN
9371 if (PL_regkind[OP(scan)] == EXACT)
9372 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
9381 if( exact == PSEUDO )
9383 else if ( exact != OP(scan) )
9392 SV * const mysv=sv_newmortal();
9393 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
9394 regprop(RExC_rx, mysv, scan);
9395 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
9396 SvPV_nolen_const(mysv),
9398 PL_reg_name[exact]);
9405 SV * const mysv_val=sv_newmortal();
9406 DEBUG_PARSE_MSG("");
9407 regprop(RExC_rx, mysv_val, val);
9408 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
9409 SvPV_nolen_const(mysv_val),
9410 (IV)REG_NODE_NUM(val),
9414 if (reg_off_by_arg[OP(scan)]) {
9415 ARG_SET(scan, val - scan);
9418 NEXT_OFF(scan) = val - scan;
9426 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
9430 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
9435 for (bit=0; bit<32; bit++) {
9436 if (flags & (1<<bit)) {
9438 PerlIO_printf(Perl_debug_log, "%s",lead);
9439 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
9444 PerlIO_printf(Perl_debug_log, "\n");
9446 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
9452 Perl_regdump(pTHX_ const regexp *r)
9456 SV * const sv = sv_newmortal();
9457 SV *dsv= sv_newmortal();
9459 GET_RE_DEBUG_FLAGS_DECL;
9461 PERL_ARGS_ASSERT_REGDUMP;
9463 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
9465 /* Header fields of interest. */
9466 if (r->anchored_substr) {
9467 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
9468 RE_SV_DUMPLEN(r->anchored_substr), 30);
9469 PerlIO_printf(Perl_debug_log,
9470 "anchored %s%s at %"IVdf" ",
9471 s, RE_SV_TAIL(r->anchored_substr),
9472 (IV)r->anchored_offset);
9473 } else if (r->anchored_utf8) {
9474 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
9475 RE_SV_DUMPLEN(r->anchored_utf8), 30);
9476 PerlIO_printf(Perl_debug_log,
9477 "anchored utf8 %s%s at %"IVdf" ",
9478 s, RE_SV_TAIL(r->anchored_utf8),
9479 (IV)r->anchored_offset);
9481 if (r->float_substr) {
9482 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
9483 RE_SV_DUMPLEN(r->float_substr), 30);
9484 PerlIO_printf(Perl_debug_log,
9485 "floating %s%s at %"IVdf"..%"UVuf" ",
9486 s, RE_SV_TAIL(r->float_substr),
9487 (IV)r->float_min_offset, (UV)r->float_max_offset);
9488 } else if (r->float_utf8) {
9489 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
9490 RE_SV_DUMPLEN(r->float_utf8), 30);
9491 PerlIO_printf(Perl_debug_log,
9492 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
9493 s, RE_SV_TAIL(r->float_utf8),
9494 (IV)r->float_min_offset, (UV)r->float_max_offset);
9496 if (r->check_substr || r->check_utf8)
9497 PerlIO_printf(Perl_debug_log,
9499 (r->check_substr == r->float_substr
9500 && r->check_utf8 == r->float_utf8
9501 ? "(checking floating" : "(checking anchored"));
9502 if (r->extflags & RXf_NOSCAN)
9503 PerlIO_printf(Perl_debug_log, " noscan");
9504 if (r->extflags & RXf_CHECK_ALL)
9505 PerlIO_printf(Perl_debug_log, " isall");
9506 if (r->check_substr || r->check_utf8)
9507 PerlIO_printf(Perl_debug_log, ") ");
9509 if (ri->regstclass) {
9510 regprop(r, sv, ri->regstclass);
9511 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
9513 if (r->extflags & RXf_ANCH) {
9514 PerlIO_printf(Perl_debug_log, "anchored");
9515 if (r->extflags & RXf_ANCH_BOL)
9516 PerlIO_printf(Perl_debug_log, "(BOL)");
9517 if (r->extflags & RXf_ANCH_MBOL)
9518 PerlIO_printf(Perl_debug_log, "(MBOL)");
9519 if (r->extflags & RXf_ANCH_SBOL)
9520 PerlIO_printf(Perl_debug_log, "(SBOL)");
9521 if (r->extflags & RXf_ANCH_GPOS)
9522 PerlIO_printf(Perl_debug_log, "(GPOS)");
9523 PerlIO_putc(Perl_debug_log, ' ');
9525 if (r->extflags & RXf_GPOS_SEEN)
9526 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
9527 if (r->intflags & PREGf_SKIP)
9528 PerlIO_printf(Perl_debug_log, "plus ");
9529 if (r->intflags & PREGf_IMPLICIT)
9530 PerlIO_printf(Perl_debug_log, "implicit ");
9531 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
9532 if (r->extflags & RXf_EVAL_SEEN)
9533 PerlIO_printf(Perl_debug_log, "with eval ");
9534 PerlIO_printf(Perl_debug_log, "\n");
9535 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
9537 PERL_ARGS_ASSERT_REGDUMP;
9538 PERL_UNUSED_CONTEXT;
9540 #endif /* DEBUGGING */
9544 - regprop - printable representation of opcode
9546 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
9549 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
9550 if (flags & ANYOF_INVERT) \
9551 /*make sure the invert info is in each */ \
9552 sv_catpvs(sv, "^"); \
9558 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
9563 RXi_GET_DECL(prog,progi);
9564 GET_RE_DEBUG_FLAGS_DECL;
9566 PERL_ARGS_ASSERT_REGPROP;
9570 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
9571 /* It would be nice to FAIL() here, but this may be called from
9572 regexec.c, and it would be hard to supply pRExC_state. */
9573 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
9574 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9576 k = PL_regkind[OP(o)];
9580 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
9581 * is a crude hack but it may be the best for now since
9582 * we have no flag "this EXACTish node was UTF-8"
9584 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
9585 PERL_PV_ESCAPE_UNI_DETECT |
9586 PERL_PV_PRETTY_ELLIPSES |
9587 PERL_PV_PRETTY_LTGT |
9588 PERL_PV_PRETTY_NOCLEAR
9590 } else if (k == TRIE) {
9591 /* print the details of the trie in dumpuntil instead, as
9592 * progi->data isn't available here */
9593 const char op = OP(o);
9594 const U32 n = ARG(o);
9595 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
9596 (reg_ac_data *)progi->data->data[n] :
9598 const reg_trie_data * const trie
9599 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
9601 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
9602 DEBUG_TRIE_COMPILE_r(
9603 Perl_sv_catpvf(aTHX_ sv,
9604 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
9605 (UV)trie->startstate,
9606 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
9607 (UV)trie->wordcount,
9610 (UV)TRIE_CHARCOUNT(trie),
9611 (UV)trie->uniquecharcount
9614 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
9616 int rangestart = -1;
9617 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
9619 for (i = 0; i <= 256; i++) {
9620 if (i < 256 && BITMAP_TEST(bitmap,i)) {
9621 if (rangestart == -1)
9623 } else if (rangestart != -1) {
9624 if (i <= rangestart + 3)
9625 for (; rangestart < i; rangestart++)
9626 put_byte(sv, rangestart);
9628 put_byte(sv, rangestart);
9630 put_byte(sv, i - 1);
9638 } else if (k == CURLY) {
9639 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
9640 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
9641 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
9643 else if (k == WHILEM && o->flags) /* Ordinal/of */
9644 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
9645 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
9646 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
9647 if ( RXp_PAREN_NAMES(prog) ) {
9648 if ( k != REF || (OP(o) < NREF)) {
9649 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
9650 SV **name= av_fetch(list, ARG(o), 0 );
9652 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9655 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
9656 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
9657 I32 *nums=(I32*)SvPVX(sv_dat);
9658 SV **name= av_fetch(list, nums[0], 0 );
9661 for ( n=0; n<SvIVX(sv_dat); n++ ) {
9662 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
9663 (n ? "," : ""), (IV)nums[n]);
9665 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
9669 } else if (k == GOSUB)
9670 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
9671 else if (k == VERB) {
9673 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
9674 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
9675 } else if (k == LOGICAL)
9676 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
9677 else if (k == FOLDCHAR)
9678 Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
9679 else if (k == ANYOF) {
9680 int i, rangestart = -1;
9681 const U8 flags = ANYOF_FLAGS(o);
9684 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
9685 static const char * const anyofs[] = {
9718 if (flags & ANYOF_LOCALE)
9719 sv_catpvs(sv, "{loc}");
9720 if (flags & ANYOF_FOLD)
9721 sv_catpvs(sv, "{i}");
9722 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
9723 if (flags & ANYOF_INVERT)
9726 /* output what the standard cp 0-255 bitmap matches */
9727 for (i = 0; i <= 256; i++) {
9728 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
9729 if (rangestart == -1)
9731 } else if (rangestart != -1) {
9732 if (i <= rangestart + 3)
9733 for (; rangestart < i; rangestart++)
9734 put_byte(sv, rangestart);
9736 put_byte(sv, rangestart);
9738 put_byte(sv, i - 1);
9745 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9746 /* output any special charclass tests (used entirely under use locale) */
9747 if (ANYOF_CLASS_TEST_ANY_SET(o))
9748 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
9749 if (ANYOF_CLASS_TEST(o,i)) {
9750 sv_catpv(sv, anyofs[i]);
9754 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
9756 /* output information about the unicode matching */
9757 if (flags & ANYOF_UNICODE_ALL)
9758 sv_catpvs(sv, "{unicode_all}");
9759 else if (flags & ANYOF_UTF8)
9760 sv_catpvs(sv, "{unicode}");
9761 if (flags & ANYOF_NONBITMAP_NON_UTF8)
9762 sv_catpvs(sv, "{outside bitmap}");
9766 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
9770 U8 s[UTF8_MAXBYTES_CASE+1];
9772 for (i = 0; i <= 256; i++) { /* just the first 256 */
9773 uvchr_to_utf8(s, i);
9775 if (i < 256 && swash_fetch(sw, s, TRUE)) {
9776 if (rangestart == -1)
9778 } else if (rangestart != -1) {
9779 if (i <= rangestart + 3)
9780 for (; rangestart < i; rangestart++) {
9781 const U8 * const e = uvchr_to_utf8(s,rangestart);
9783 for(p = s; p < e; p++)
9787 const U8 *e = uvchr_to_utf8(s,rangestart);
9789 for (p = s; p < e; p++)
9792 e = uvchr_to_utf8(s, i-1);
9793 for (p = s; p < e; p++)
9800 sv_catpvs(sv, "..."); /* et cetera */
9804 char *s = savesvpv(lv);
9805 char * const origs = s;
9807 while (*s && *s != '\n')
9811 const char * const t = ++s;
9829 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
9831 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
9832 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
9834 PERL_UNUSED_CONTEXT;
9835 PERL_UNUSED_ARG(sv);
9837 PERL_UNUSED_ARG(prog);
9838 #endif /* DEBUGGING */
9842 Perl_re_intuit_string(pTHX_ REGEXP * const r)
9843 { /* Assume that RE_INTUIT is set */
9845 struct regexp *const prog = (struct regexp *)SvANY(r);
9846 GET_RE_DEBUG_FLAGS_DECL;
9848 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
9849 PERL_UNUSED_CONTEXT;
9853 const char * const s = SvPV_nolen_const(prog->check_substr
9854 ? prog->check_substr : prog->check_utf8);
9856 if (!PL_colorset) reginitcolors();
9857 PerlIO_printf(Perl_debug_log,
9858 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
9860 prog->check_substr ? "" : "utf8 ",
9861 PL_colors[5],PL_colors[0],
9864 (strlen(s) > 60 ? "..." : ""));
9867 return prog->check_substr ? prog->check_substr : prog->check_utf8;
9873 handles refcounting and freeing the perl core regexp structure. When
9874 it is necessary to actually free the structure the first thing it
9875 does is call the 'free' method of the regexp_engine associated to
9876 the regexp, allowing the handling of the void *pprivate; member
9877 first. (This routine is not overridable by extensions, which is why
9878 the extensions free is called first.)
9880 See regdupe and regdupe_internal if you change anything here.
9882 #ifndef PERL_IN_XSUB_RE
9884 Perl_pregfree(pTHX_ REGEXP *r)
9890 Perl_pregfree2(pTHX_ REGEXP *rx)
9893 struct regexp *const r = (struct regexp *)SvANY(rx);
9894 GET_RE_DEBUG_FLAGS_DECL;
9896 PERL_ARGS_ASSERT_PREGFREE2;
9899 ReREFCNT_dec(r->mother_re);
9901 CALLREGFREE_PVT(rx); /* free the private data */
9902 SvREFCNT_dec(RXp_PAREN_NAMES(r));
9905 SvREFCNT_dec(r->anchored_substr);
9906 SvREFCNT_dec(r->anchored_utf8);
9907 SvREFCNT_dec(r->float_substr);
9908 SvREFCNT_dec(r->float_utf8);
9909 Safefree(r->substrs);
9911 RX_MATCH_COPY_FREE(rx);
9912 #ifdef PERL_OLD_COPY_ON_WRITE
9913 SvREFCNT_dec(r->saved_copy);
9920 This is a hacky workaround to the structural issue of match results
9921 being stored in the regexp structure which is in turn stored in
9922 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
9923 could be PL_curpm in multiple contexts, and could require multiple
9924 result sets being associated with the pattern simultaneously, such
9925 as when doing a recursive match with (??{$qr})
9927 The solution is to make a lightweight copy of the regexp structure
9928 when a qr// is returned from the code executed by (??{$qr}) this
9929 lightweight copy doesnt actually own any of its data except for
9930 the starp/end and the actual regexp structure itself.
9936 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
9939 struct regexp *const r = (struct regexp *)SvANY(rx);
9940 register const I32 npar = r->nparens+1;
9942 PERL_ARGS_ASSERT_REG_TEMP_COPY;
9945 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
9946 ret = (struct regexp *)SvANY(ret_x);
9948 (void)ReREFCNT_inc(rx);
9949 /* We can take advantage of the existing "copied buffer" mechanism in SVs
9950 by pointing directly at the buffer, but flagging that the allocated
9951 space in the copy is zero. As we've just done a struct copy, it's now
9952 a case of zero-ing that, rather than copying the current length. */
9953 SvPV_set(ret_x, RX_WRAPPED(rx));
9954 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
9955 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
9956 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
9957 SvLEN_set(ret_x, 0);
9958 SvSTASH_set(ret_x, NULL);
9959 SvMAGIC_set(ret_x, NULL);
9960 Newx(ret->offs, npar, regexp_paren_pair);
9961 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
9963 Newx(ret->substrs, 1, struct reg_substr_data);
9964 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
9966 SvREFCNT_inc_void(ret->anchored_substr);
9967 SvREFCNT_inc_void(ret->anchored_utf8);
9968 SvREFCNT_inc_void(ret->float_substr);
9969 SvREFCNT_inc_void(ret->float_utf8);
9971 /* check_substr and check_utf8, if non-NULL, point to either their
9972 anchored or float namesakes, and don't hold a second reference. */
9974 RX_MATCH_COPIED_off(ret_x);
9975 #ifdef PERL_OLD_COPY_ON_WRITE
9976 ret->saved_copy = NULL;
9978 ret->mother_re = rx;
9984 /* regfree_internal()
9986 Free the private data in a regexp. This is overloadable by
9987 extensions. Perl takes care of the regexp structure in pregfree(),
9988 this covers the *pprivate pointer which technically perl doesn't
9989 know about, however of course we have to handle the
9990 regexp_internal structure when no extension is in use.
9992 Note this is called before freeing anything in the regexp
9997 Perl_regfree_internal(pTHX_ REGEXP * const rx)
10000 struct regexp *const r = (struct regexp *)SvANY(rx);
10001 RXi_GET_DECL(r,ri);
10002 GET_RE_DEBUG_FLAGS_DECL;
10004 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
10010 SV *dsv= sv_newmortal();
10011 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
10012 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
10013 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
10014 PL_colors[4],PL_colors[5],s);
10017 #ifdef RE_TRACK_PATTERN_OFFSETS
10019 Safefree(ri->u.offsets); /* 20010421 MJD */
10022 int n = ri->data->count;
10023 PAD* new_comppad = NULL;
10028 /* If you add a ->what type here, update the comment in regcomp.h */
10029 switch (ri->data->what[n]) {
10034 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
10037 Safefree(ri->data->data[n]);
10040 new_comppad = MUTABLE_AV(ri->data->data[n]);
10043 if (new_comppad == NULL)
10044 Perl_croak(aTHX_ "panic: pregfree comppad");
10045 PAD_SAVE_LOCAL(old_comppad,
10046 /* Watch out for global destruction's random ordering. */
10047 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
10050 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
10053 op_free((OP_4tree*)ri->data->data[n]);
10055 PAD_RESTORE_LOCAL(old_comppad);
10056 SvREFCNT_dec(MUTABLE_SV(new_comppad));
10057 new_comppad = NULL;
10062 { /* Aho Corasick add-on structure for a trie node.
10063 Used in stclass optimization only */
10065 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
10067 refcount = --aho->refcount;
10070 PerlMemShared_free(aho->states);
10071 PerlMemShared_free(aho->fail);
10072 /* do this last!!!! */
10073 PerlMemShared_free(ri->data->data[n]);
10074 PerlMemShared_free(ri->regstclass);
10080 /* trie structure. */
10082 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
10084 refcount = --trie->refcount;
10087 PerlMemShared_free(trie->charmap);
10088 PerlMemShared_free(trie->states);
10089 PerlMemShared_free(trie->trans);
10091 PerlMemShared_free(trie->bitmap);
10093 PerlMemShared_free(trie->jump);
10094 PerlMemShared_free(trie->wordinfo);
10095 /* do this last!!!! */
10096 PerlMemShared_free(ri->data->data[n]);
10101 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
10104 Safefree(ri->data->what);
10105 Safefree(ri->data);
10111 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
10112 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
10113 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
10116 re_dup - duplicate a regexp.
10118 This routine is expected to clone a given regexp structure. It is only
10119 compiled under USE_ITHREADS.
10121 After all of the core data stored in struct regexp is duplicated
10122 the regexp_engine.dupe method is used to copy any private data
10123 stored in the *pprivate pointer. This allows extensions to handle
10124 any duplication it needs to do.
10126 See pregfree() and regfree_internal() if you change anything here.
10128 #if defined(USE_ITHREADS)
10129 #ifndef PERL_IN_XSUB_RE
10131 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
10135 const struct regexp *r = (const struct regexp *)SvANY(sstr);
10136 struct regexp *ret = (struct regexp *)SvANY(dstr);
10138 PERL_ARGS_ASSERT_RE_DUP_GUTS;
10140 npar = r->nparens+1;
10141 Newx(ret->offs, npar, regexp_paren_pair);
10142 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
10144 /* no need to copy these */
10145 Newx(ret->swap, npar, regexp_paren_pair);
10148 if (ret->substrs) {
10149 /* Do it this way to avoid reading from *r after the StructCopy().
10150 That way, if any of the sv_dup_inc()s dislodge *r from the L1
10151 cache, it doesn't matter. */
10152 const bool anchored = r->check_substr
10153 ? r->check_substr == r->anchored_substr
10154 : r->check_utf8 == r->anchored_utf8;
10155 Newx(ret->substrs, 1, struct reg_substr_data);
10156 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
10158 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
10159 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
10160 ret->float_substr = sv_dup_inc(ret->float_substr, param);
10161 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
10163 /* check_substr and check_utf8, if non-NULL, point to either their
10164 anchored or float namesakes, and don't hold a second reference. */
10166 if (ret->check_substr) {
10168 assert(r->check_utf8 == r->anchored_utf8);
10169 ret->check_substr = ret->anchored_substr;
10170 ret->check_utf8 = ret->anchored_utf8;
10172 assert(r->check_substr == r->float_substr);
10173 assert(r->check_utf8 == r->float_utf8);
10174 ret->check_substr = ret->float_substr;
10175 ret->check_utf8 = ret->float_utf8;
10177 } else if (ret->check_utf8) {
10179 ret->check_utf8 = ret->anchored_utf8;
10181 ret->check_utf8 = ret->float_utf8;
10186 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
10189 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
10191 if (RX_MATCH_COPIED(dstr))
10192 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
10194 ret->subbeg = NULL;
10195 #ifdef PERL_OLD_COPY_ON_WRITE
10196 ret->saved_copy = NULL;
10199 if (ret->mother_re) {
10200 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
10201 /* Our storage points directly to our mother regexp, but that's
10202 1: a buffer in a different thread
10203 2: something we no longer hold a reference on
10204 so we need to copy it locally. */
10205 /* Note we need to sue SvCUR() on our mother_re, because it, in
10206 turn, may well be pointing to its own mother_re. */
10207 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
10208 SvCUR(ret->mother_re)+1));
10209 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
10211 ret->mother_re = NULL;
10215 #endif /* PERL_IN_XSUB_RE */
10220 This is the internal complement to regdupe() which is used to copy
10221 the structure pointed to by the *pprivate pointer in the regexp.
10222 This is the core version of the extension overridable cloning hook.
10223 The regexp structure being duplicated will be copied by perl prior
10224 to this and will be provided as the regexp *r argument, however
10225 with the /old/ structures pprivate pointer value. Thus this routine
10226 may override any copying normally done by perl.
10228 It returns a pointer to the new regexp_internal structure.
10232 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
10235 struct regexp *const r = (struct regexp *)SvANY(rx);
10236 regexp_internal *reti;
10238 RXi_GET_DECL(r,ri);
10240 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
10242 npar = r->nparens+1;
10245 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
10246 Copy(ri->program, reti->program, len+1, regnode);
10249 reti->regstclass = NULL;
10252 struct reg_data *d;
10253 const int count = ri->data->count;
10256 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
10257 char, struct reg_data);
10258 Newx(d->what, count, U8);
10261 for (i = 0; i < count; i++) {
10262 d->what[i] = ri->data->what[i];
10263 switch (d->what[i]) {
10264 /* legal options are one of: sSfpontTua
10265 see also regcomp.h and pregfree() */
10266 case 'a': /* actually an AV, but the dup function is identical. */
10269 case 'p': /* actually an AV, but the dup function is identical. */
10270 case 'u': /* actually an HV, but the dup function is identical. */
10271 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
10274 /* This is cheating. */
10275 Newx(d->data[i], 1, struct regnode_charclass_class);
10276 StructCopy(ri->data->data[i], d->data[i],
10277 struct regnode_charclass_class);
10278 reti->regstclass = (regnode*)d->data[i];
10281 /* Compiled op trees are readonly and in shared memory,
10282 and can thus be shared without duplication. */
10284 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
10288 /* Trie stclasses are readonly and can thus be shared
10289 * without duplication. We free the stclass in pregfree
10290 * when the corresponding reg_ac_data struct is freed.
10292 reti->regstclass= ri->regstclass;
10296 ((reg_trie_data*)ri->data->data[i])->refcount++;
10300 d->data[i] = ri->data->data[i];
10303 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
10312 reti->name_list_idx = ri->name_list_idx;
10314 #ifdef RE_TRACK_PATTERN_OFFSETS
10315 if (ri->u.offsets) {
10316 Newx(reti->u.offsets, 2*len+1, U32);
10317 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
10320 SetProgLen(reti,len);
10323 return (void*)reti;
10326 #endif /* USE_ITHREADS */
10328 #ifndef PERL_IN_XSUB_RE
10331 - regnext - dig the "next" pointer out of a node
10334 Perl_regnext(pTHX_ register regnode *p)
10337 register I32 offset;
10342 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
10343 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
10346 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
10355 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
10358 STRLEN l1 = strlen(pat1);
10359 STRLEN l2 = strlen(pat2);
10362 const char *message;
10364 PERL_ARGS_ASSERT_RE_CROAK2;
10370 Copy(pat1, buf, l1 , char);
10371 Copy(pat2, buf + l1, l2 , char);
10372 buf[l1 + l2] = '\n';
10373 buf[l1 + l2 + 1] = '\0';
10375 /* ANSI variant takes additional second argument */
10376 va_start(args, pat2);
10380 msv = vmess(buf, &args);
10382 message = SvPV_const(msv,l1);
10385 Copy(message, buf, l1 , char);
10386 buf[l1-1] = '\0'; /* Overwrite \n */
10387 Perl_croak(aTHX_ "%s", buf);
10390 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
10392 #ifndef PERL_IN_XSUB_RE
10394 Perl_save_re_context(pTHX)
10398 struct re_save_state *state;
10400 SAVEVPTR(PL_curcop);
10401 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
10403 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
10404 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
10405 SSPUSHUV(SAVEt_RE_STATE);
10407 Copy(&PL_reg_state, state, 1, struct re_save_state);
10409 PL_reg_start_tmp = 0;
10410 PL_reg_start_tmpl = 0;
10411 PL_reg_oldsaved = NULL;
10412 PL_reg_oldsavedlen = 0;
10413 PL_reg_maxiter = 0;
10414 PL_reg_leftiter = 0;
10415 PL_reg_poscache = NULL;
10416 PL_reg_poscache_size = 0;
10417 #ifdef PERL_OLD_COPY_ON_WRITE
10421 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
10423 const REGEXP * const rx = PM_GETRE(PL_curpm);
10426 for (i = 1; i <= RX_NPARENS(rx); i++) {
10427 char digits[TYPE_CHARS(long)];
10428 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
10429 GV *const *const gvp
10430 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
10433 GV * const gv = *gvp;
10434 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
10444 clear_re(pTHX_ void *r)
10447 ReREFCNT_dec((REGEXP *)r);
10453 S_put_byte(pTHX_ SV *sv, int c)
10455 PERL_ARGS_ASSERT_PUT_BYTE;
10457 /* Our definition of isPRINT() ignores locales, so only bytes that are
10458 not part of UTF-8 are considered printable. I assume that the same
10459 holds for UTF-EBCDIC.
10460 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
10461 which Wikipedia says:
10463 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
10464 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
10465 identical, to the ASCII delete (DEL) or rubout control character.
10466 ) So the old condition can be simplified to !isPRINT(c) */
10469 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
10472 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
10476 const char string = c;
10477 if (c == '-' || c == ']' || c == '\\' || c == '^')
10478 sv_catpvs(sv, "\\");
10479 sv_catpvn(sv, &string, 1);
10484 #define CLEAR_OPTSTART \
10485 if (optstart) STMT_START { \
10486 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
10490 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
10492 STATIC const regnode *
10493 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
10494 const regnode *last, const regnode *plast,
10495 SV* sv, I32 indent, U32 depth)
10498 register U8 op = PSEUDO; /* Arbitrary non-END op. */
10499 register const regnode *next;
10500 const regnode *optstart= NULL;
10502 RXi_GET_DECL(r,ri);
10503 GET_RE_DEBUG_FLAGS_DECL;
10505 PERL_ARGS_ASSERT_DUMPUNTIL;
10507 #ifdef DEBUG_DUMPUNTIL
10508 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
10509 last ? last-start : 0,plast ? plast-start : 0);
10512 if (plast && plast < last)
10515 while (PL_regkind[op] != END && (!last || node < last)) {
10516 /* While that wasn't END last time... */
10519 if (op == CLOSE || op == WHILEM)
10521 next = regnext((regnode *)node);
10524 if (OP(node) == OPTIMIZED) {
10525 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
10532 regprop(r, sv, node);
10533 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
10534 (int)(2*indent + 1), "", SvPVX_const(sv));
10536 if (OP(node) != OPTIMIZED) {
10537 if (next == NULL) /* Next ptr. */
10538 PerlIO_printf(Perl_debug_log, " (0)");
10539 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
10540 PerlIO_printf(Perl_debug_log, " (FAIL)");
10542 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
10543 (void)PerlIO_putc(Perl_debug_log, '\n');
10547 if (PL_regkind[(U8)op] == BRANCHJ) {
10550 register const regnode *nnode = (OP(next) == LONGJMP
10551 ? regnext((regnode *)next)
10553 if (last && nnode > last)
10555 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
10558 else if (PL_regkind[(U8)op] == BRANCH) {
10560 DUMPUNTIL(NEXTOPER(node), next);
10562 else if ( PL_regkind[(U8)op] == TRIE ) {
10563 const regnode *this_trie = node;
10564 const char op = OP(node);
10565 const U32 n = ARG(node);
10566 const reg_ac_data * const ac = op>=AHOCORASICK ?
10567 (reg_ac_data *)ri->data->data[n] :
10569 const reg_trie_data * const trie =
10570 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
10572 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
10574 const regnode *nextbranch= NULL;
10577 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
10578 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
10580 PerlIO_printf(Perl_debug_log, "%*s%s ",
10581 (int)(2*(indent+3)), "",
10582 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
10583 PL_colors[0], PL_colors[1],
10584 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
10585 PERL_PV_PRETTY_ELLIPSES |
10586 PERL_PV_PRETTY_LTGT
10591 U16 dist= trie->jump[word_idx+1];
10592 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
10593 (UV)((dist ? this_trie + dist : next) - start));
10596 nextbranch= this_trie + trie->jump[0];
10597 DUMPUNTIL(this_trie + dist, nextbranch);
10599 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
10600 nextbranch= regnext((regnode *)nextbranch);
10602 PerlIO_printf(Perl_debug_log, "\n");
10605 if (last && next > last)
10610 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
10611 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
10612 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
10614 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
10616 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
10618 else if ( op == PLUS || op == STAR) {
10619 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
10621 else if (op == ANYOF) {
10622 /* arglen 1 + class block */
10623 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
10624 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
10625 node = NEXTOPER(node);
10627 else if (PL_regkind[(U8)op] == EXACT) {
10628 /* Literal string, where present. */
10629 node += NODE_SZ_STR(node) - 1;
10630 node = NEXTOPER(node);
10633 node = NEXTOPER(node);
10634 node += regarglen[(U8)op];
10636 if (op == CURLYX || op == OPEN)
10640 #ifdef DEBUG_DUMPUNTIL
10641 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
10646 #endif /* DEBUGGING */
10650 * c-indentation-style: bsd
10651 * c-basic-offset: 4
10652 * indent-tabs-mode: t
10655 * ex: set ts=8 sts=4 sw=4 noet: