5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
89 #ifndef PERL_IN_XSUB_RE
90 # include "charclass_invlists.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
100 # if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102 # pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104 # pragma optimize("w",on )
105 # endif /* BUGGY_MSC6 */
109 #define STATIC static
113 typedef struct RExC_state_t {
114 U32 flags; /* are we folding, multilining? */
115 char *precomp; /* uncompiled string. */
116 REGEXP *rx_sv; /* The SV that is the regexp. */
117 regexp *rx; /* perl core regexp structure */
118 regexp_internal *rxi; /* internal data for regexp object pprivate field */
119 char *start; /* Start of input for compile */
120 char *end; /* End of input for compile */
121 char *parse; /* Input-scan pointer. */
122 I32 whilem_seen; /* number of WHILEM in this expr */
123 regnode *emit_start; /* Start of emitted-code area */
124 regnode *emit_bound; /* First regnode outside of the allocated space */
125 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
126 I32 naughty; /* How bad is this pattern? */
127 I32 sawback; /* Did we see \1, ...? */
129 I32 size; /* Code size. */
130 I32 npar; /* Capture buffer count, (OPEN). */
131 I32 cpar; /* Capture buffer count, (CLOSE). */
132 I32 nestroot; /* root parens we are in - used by accept */
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
152 I32 override_recoding;
153 struct reg_code_block *code_blocks; /* positions of literal (?{})
155 int num_code_blocks; /* size of code_blocks[] */
156 int code_index; /* next code_blocks[] slot */
158 char *starttry; /* -Dr: where regtry was called. */
159 #define RExC_starttry (pRExC_state->starttry)
162 const char *lastparse;
164 AV *paren_name_list; /* idx -> name */
165 #define RExC_lastparse (pRExC_state->lastparse)
166 #define RExC_lastnum (pRExC_state->lastnum)
167 #define RExC_paren_name_list (pRExC_state->paren_name_list)
171 #define RExC_flags (pRExC_state->flags)
172 #define RExC_precomp (pRExC_state->precomp)
173 #define RExC_rx_sv (pRExC_state->rx_sv)
174 #define RExC_rx (pRExC_state->rx)
175 #define RExC_rxi (pRExC_state->rxi)
176 #define RExC_start (pRExC_state->start)
177 #define RExC_end (pRExC_state->end)
178 #define RExC_parse (pRExC_state->parse)
179 #define RExC_whilem_seen (pRExC_state->whilem_seen)
180 #ifdef RE_TRACK_PATTERN_OFFSETS
181 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
183 #define RExC_emit (pRExC_state->emit)
184 #define RExC_emit_start (pRExC_state->emit_start)
185 #define RExC_emit_bound (pRExC_state->emit_bound)
186 #define RExC_naughty (pRExC_state->naughty)
187 #define RExC_sawback (pRExC_state->sawback)
188 #define RExC_seen (pRExC_state->seen)
189 #define RExC_size (pRExC_state->size)
190 #define RExC_npar (pRExC_state->npar)
191 #define RExC_nestroot (pRExC_state->nestroot)
192 #define RExC_extralen (pRExC_state->extralen)
193 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
194 #define RExC_seen_evals (pRExC_state->seen_evals)
195 #define RExC_utf8 (pRExC_state->utf8)
196 #define RExC_uni_semantics (pRExC_state->uni_semantics)
197 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
198 #define RExC_open_parens (pRExC_state->open_parens)
199 #define RExC_close_parens (pRExC_state->close_parens)
200 #define RExC_opend (pRExC_state->opend)
201 #define RExC_paren_names (pRExC_state->paren_names)
202 #define RExC_recurse (pRExC_state->recurse)
203 #define RExC_recurse_count (pRExC_state->recurse_count)
204 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
205 #define RExC_contains_locale (pRExC_state->contains_locale)
206 #define RExC_override_recoding (pRExC_state->override_recoding)
209 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
210 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
211 ((*s) == '{' && regcurly(s)))
214 #undef SPSTART /* dratted cpp namespace... */
217 * Flags to be passed up and down.
219 #define WORST 0 /* Worst case. */
220 #define HASWIDTH 0x01 /* Known to match non-null strings. */
222 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
223 * character, and if utf8, must be invariant. Note that this is not the same
224 * thing as REGNODE_SIMPLE */
226 #define SPSTART 0x04 /* Starts with * or +. */
227 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
228 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
230 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
232 /* whether trie related optimizations are enabled */
233 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
234 #define TRIE_STUDY_OPT
235 #define FULL_TRIE_STUDY
241 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
242 #define PBITVAL(paren) (1 << ((paren) & 7))
243 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
244 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
245 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
247 /* If not already in utf8, do a longjmp back to the beginning */
248 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
249 #define REQUIRE_UTF8 STMT_START { \
250 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
253 /* About scan_data_t.
255 During optimisation we recurse through the regexp program performing
256 various inplace (keyhole style) optimisations. In addition study_chunk
257 and scan_commit populate this data structure with information about
258 what strings MUST appear in the pattern. We look for the longest
259 string that must appear at a fixed location, and we look for the
260 longest string that may appear at a floating location. So for instance
265 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
266 strings (because they follow a .* construct). study_chunk will identify
267 both FOO and BAR as being the longest fixed and floating strings respectively.
269 The strings can be composites, for instance
273 will result in a composite fixed substring 'foo'.
275 For each string some basic information is maintained:
277 - offset or min_offset
278 This is the position the string must appear at, or not before.
279 It also implicitly (when combined with minlenp) tells us how many
280 characters must match before the string we are searching for.
281 Likewise when combined with minlenp and the length of the string it
282 tells us how many characters must appear after the string we have
286 Only used for floating strings. This is the rightmost point that
287 the string can appear at. If set to I32 max it indicates that the
288 string can occur infinitely far to the right.
291 A pointer to the minimum length of the pattern that the string
292 was found inside. This is important as in the case of positive
293 lookahead or positive lookbehind we can have multiple patterns
298 The minimum length of the pattern overall is 3, the minimum length
299 of the lookahead part is 3, but the minimum length of the part that
300 will actually match is 1. So 'FOO's minimum length is 3, but the
301 minimum length for the F is 1. This is important as the minimum length
302 is used to determine offsets in front of and behind the string being
303 looked for. Since strings can be composites this is the length of the
304 pattern at the time it was committed with a scan_commit. Note that
305 the length is calculated by study_chunk, so that the minimum lengths
306 are not known until the full pattern has been compiled, thus the
307 pointer to the value.
311 In the case of lookbehind the string being searched for can be
312 offset past the start point of the final matching string.
313 If this value was just blithely removed from the min_offset it would
314 invalidate some of the calculations for how many chars must match
315 before or after (as they are derived from min_offset and minlen and
316 the length of the string being searched for).
317 When the final pattern is compiled and the data is moved from the
318 scan_data_t structure into the regexp structure the information
319 about lookbehind is factored in, with the information that would
320 have been lost precalculated in the end_shift field for the
323 The fields pos_min and pos_delta are used to store the minimum offset
324 and the delta to the maximum offset at the current point in the pattern.
328 typedef struct scan_data_t {
329 /*I32 len_min; unused */
330 /*I32 len_delta; unused */
334 I32 last_end; /* min value, <0 unless valid. */
337 SV **longest; /* Either &l_fixed, or &l_float. */
338 SV *longest_fixed; /* longest fixed string found in pattern */
339 I32 offset_fixed; /* offset where it starts */
340 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
341 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
342 SV *longest_float; /* longest floating string found in pattern */
343 I32 offset_float_min; /* earliest point in string it can appear */
344 I32 offset_float_max; /* latest point in string it can appear */
345 I32 *minlen_float; /* pointer to the minlen relevant to the string */
346 I32 lookbehind_float; /* is the position of the string modified by LB */
350 struct regnode_charclass_class *start_class;
354 * Forward declarations for pregcomp()'s friends.
357 static const scan_data_t zero_scan_data =
358 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
360 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
361 #define SF_BEFORE_SEOL 0x0001
362 #define SF_BEFORE_MEOL 0x0002
363 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
364 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
367 # define SF_FIX_SHIFT_EOL (0+2)
368 # define SF_FL_SHIFT_EOL (0+4)
370 # define SF_FIX_SHIFT_EOL (+2)
371 # define SF_FL_SHIFT_EOL (+4)
374 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
375 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
377 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
378 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
379 #define SF_IS_INF 0x0040
380 #define SF_HAS_PAR 0x0080
381 #define SF_IN_PAR 0x0100
382 #define SF_HAS_EVAL 0x0200
383 #define SCF_DO_SUBSTR 0x0400
384 #define SCF_DO_STCLASS_AND 0x0800
385 #define SCF_DO_STCLASS_OR 0x1000
386 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
387 #define SCF_WHILEM_VISITED_POS 0x2000
389 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
390 #define SCF_SEEN_ACCEPT 0x8000
392 #define UTF cBOOL(RExC_utf8)
394 /* The enums for all these are ordered so things work out correctly */
395 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
396 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
397 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
398 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
399 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
400 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
401 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
403 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
405 #define OOB_UNICODE 12345678
406 #define OOB_NAMEDCLASS -1
408 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
409 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
412 /* length of regex to show in messages that don't mark a position within */
413 #define RegexLengthToShowInErrorMessages 127
416 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
417 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
418 * op/pragma/warn/regcomp.
420 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
421 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
423 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
426 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
427 * arg. Show regex, up to a maximum length. If it's too long, chop and add
430 #define _FAIL(code) STMT_START { \
431 const char *ellipses = ""; \
432 IV len = RExC_end - RExC_precomp; \
435 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
436 if (len > RegexLengthToShowInErrorMessages) { \
437 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
438 len = RegexLengthToShowInErrorMessages - 10; \
444 #define FAIL(msg) _FAIL( \
445 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
446 msg, (int)len, RExC_precomp, ellipses))
448 #define FAIL2(msg,arg) _FAIL( \
449 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
450 arg, (int)len, RExC_precomp, ellipses))
453 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
455 #define Simple_vFAIL(m) STMT_START { \
456 const IV offset = RExC_parse - RExC_precomp; \
457 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
458 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
462 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
464 #define vFAIL(m) STMT_START { \
466 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
471 * Like Simple_vFAIL(), but accepts two arguments.
473 #define Simple_vFAIL2(m,a1) STMT_START { \
474 const IV offset = RExC_parse - RExC_precomp; \
475 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
476 (int)offset, RExC_precomp, RExC_precomp + offset); \
480 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
482 #define vFAIL2(m,a1) STMT_START { \
484 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
485 Simple_vFAIL2(m, a1); \
490 * Like Simple_vFAIL(), but accepts three arguments.
492 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
493 const IV offset = RExC_parse - RExC_precomp; \
494 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
495 (int)offset, RExC_precomp, RExC_precomp + offset); \
499 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
501 #define vFAIL3(m,a1,a2) STMT_START { \
503 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
504 Simple_vFAIL3(m, a1, a2); \
508 * Like Simple_vFAIL(), but accepts four arguments.
510 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
511 const IV offset = RExC_parse - RExC_precomp; \
512 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
513 (int)offset, RExC_precomp, RExC_precomp + offset); \
516 #define ckWARNreg(loc,m) STMT_START { \
517 const IV offset = loc - RExC_precomp; \
518 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
519 (int)offset, RExC_precomp, RExC_precomp + offset); \
522 #define ckWARNregdep(loc,m) STMT_START { \
523 const IV offset = loc - RExC_precomp; \
524 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
526 (int)offset, RExC_precomp, RExC_precomp + offset); \
529 #define ckWARN2regdep(loc,m, a1) STMT_START { \
530 const IV offset = loc - RExC_precomp; \
531 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
533 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
536 #define ckWARN2reg(loc, m, a1) STMT_START { \
537 const IV offset = loc - RExC_precomp; \
538 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
539 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
542 #define vWARN3(loc, m, a1, a2) STMT_START { \
543 const IV offset = loc - RExC_precomp; \
544 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
545 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
548 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
549 const IV offset = loc - RExC_precomp; \
550 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
551 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
554 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
555 const IV offset = loc - RExC_precomp; \
556 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
557 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
560 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
561 const IV offset = loc - RExC_precomp; \
562 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
563 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
566 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
567 const IV offset = loc - RExC_precomp; \
568 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
569 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
573 /* Allow for side effects in s */
574 #define REGC(c,s) STMT_START { \
575 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
578 /* Macros for recording node offsets. 20001227 mjd@plover.com
579 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
580 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
581 * Element 0 holds the number n.
582 * Position is 1 indexed.
584 #ifndef RE_TRACK_PATTERN_OFFSETS
585 #define Set_Node_Offset_To_R(node,byte)
586 #define Set_Node_Offset(node,byte)
587 #define Set_Cur_Node_Offset
588 #define Set_Node_Length_To_R(node,len)
589 #define Set_Node_Length(node,len)
590 #define Set_Node_Cur_Length(node)
591 #define Node_Offset(n)
592 #define Node_Length(n)
593 #define Set_Node_Offset_Length(node,offset,len)
594 #define ProgLen(ri) ri->u.proglen
595 #define SetProgLen(ri,x) ri->u.proglen = x
597 #define ProgLen(ri) ri->u.offsets[0]
598 #define SetProgLen(ri,x) ri->u.offsets[0] = x
599 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
601 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
602 __LINE__, (int)(node), (int)(byte))); \
604 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
606 RExC_offsets[2*(node)-1] = (byte); \
611 #define Set_Node_Offset(node,byte) \
612 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
613 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
615 #define Set_Node_Length_To_R(node,len) STMT_START { \
617 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
618 __LINE__, (int)(node), (int)(len))); \
620 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
622 RExC_offsets[2*(node)] = (len); \
627 #define Set_Node_Length(node,len) \
628 Set_Node_Length_To_R((node)-RExC_emit_start, len)
629 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
630 #define Set_Node_Cur_Length(node) \
631 Set_Node_Length(node, RExC_parse - parse_start)
633 /* Get offsets and lengths */
634 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
635 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
637 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
638 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
639 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
643 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
644 #define EXPERIMENTAL_INPLACESCAN
645 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
647 #define DEBUG_STUDYDATA(str,data,depth) \
648 DEBUG_OPTIMISE_MORE_r(if(data){ \
649 PerlIO_printf(Perl_debug_log, \
650 "%*s" str "Pos:%"IVdf"/%"IVdf \
651 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
652 (int)(depth)*2, "", \
653 (IV)((data)->pos_min), \
654 (IV)((data)->pos_delta), \
655 (UV)((data)->flags), \
656 (IV)((data)->whilem_c), \
657 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
658 is_inf ? "INF " : "" \
660 if ((data)->last_found) \
661 PerlIO_printf(Perl_debug_log, \
662 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
663 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
664 SvPVX_const((data)->last_found), \
665 (IV)((data)->last_end), \
666 (IV)((data)->last_start_min), \
667 (IV)((data)->last_start_max), \
668 ((data)->longest && \
669 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
670 SvPVX_const((data)->longest_fixed), \
671 (IV)((data)->offset_fixed), \
672 ((data)->longest && \
673 (data)->longest==&((data)->longest_float)) ? "*" : "", \
674 SvPVX_const((data)->longest_float), \
675 (IV)((data)->offset_float_min), \
676 (IV)((data)->offset_float_max) \
678 PerlIO_printf(Perl_debug_log,"\n"); \
681 static void clear_re(pTHX_ void *r);
683 /* Mark that we cannot extend a found fixed substring at this point.
684 Update the longest found anchored substring and the longest found
685 floating substrings if needed. */
688 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
690 const STRLEN l = CHR_SVLEN(data->last_found);
691 const STRLEN old_l = CHR_SVLEN(*data->longest);
692 GET_RE_DEBUG_FLAGS_DECL;
694 PERL_ARGS_ASSERT_SCAN_COMMIT;
696 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
697 SvSetMagicSV(*data->longest, data->last_found);
698 if (*data->longest == data->longest_fixed) {
699 data->offset_fixed = l ? data->last_start_min : data->pos_min;
700 if (data->flags & SF_BEFORE_EOL)
702 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
704 data->flags &= ~SF_FIX_BEFORE_EOL;
705 data->minlen_fixed=minlenp;
706 data->lookbehind_fixed=0;
708 else { /* *data->longest == data->longest_float */
709 data->offset_float_min = l ? data->last_start_min : data->pos_min;
710 data->offset_float_max = (l
711 ? data->last_start_max
712 : data->pos_min + data->pos_delta);
713 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
714 data->offset_float_max = I32_MAX;
715 if (data->flags & SF_BEFORE_EOL)
717 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
719 data->flags &= ~SF_FL_BEFORE_EOL;
720 data->minlen_float=minlenp;
721 data->lookbehind_float=0;
724 SvCUR_set(data->last_found, 0);
726 SV * const sv = data->last_found;
727 if (SvUTF8(sv) && SvMAGICAL(sv)) {
728 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
734 data->flags &= ~SF_BEFORE_EOL;
735 DEBUG_STUDYDATA("commit: ",data,0);
738 /* Can match anything (initialization) */
740 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
742 PERL_ARGS_ASSERT_CL_ANYTHING;
744 ANYOF_BITMAP_SETALL(cl);
745 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
746 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
748 /* If any portion of the regex is to operate under locale rules,
749 * initialization includes it. The reason this isn't done for all regexes
750 * is that the optimizer was written under the assumption that locale was
751 * all-or-nothing. Given the complexity and lack of documentation in the
752 * optimizer, and that there are inadequate test cases for locale, so many
753 * parts of it may not work properly, it is safest to avoid locale unless
755 if (RExC_contains_locale) {
756 ANYOF_CLASS_SETALL(cl); /* /l uses class */
757 cl->flags |= ANYOF_LOCALE;
760 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
764 /* Can match anything (initialization) */
766 S_cl_is_anything(const struct regnode_charclass_class *cl)
770 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
772 for (value = 0; value <= ANYOF_MAX; value += 2)
773 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
775 if (!(cl->flags & ANYOF_UNICODE_ALL))
777 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
782 /* Can match anything (initialization) */
784 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
786 PERL_ARGS_ASSERT_CL_INIT;
788 Zero(cl, 1, struct regnode_charclass_class);
790 cl_anything(pRExC_state, cl);
791 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
794 /* These two functions currently do the exact same thing */
795 #define cl_init_zero S_cl_init
797 /* 'AND' a given class with another one. Can create false positives. 'cl'
798 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
799 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
801 S_cl_and(struct regnode_charclass_class *cl,
802 const struct regnode_charclass_class *and_with)
804 PERL_ARGS_ASSERT_CL_AND;
806 assert(and_with->type == ANYOF);
808 /* I (khw) am not sure all these restrictions are necessary XXX */
809 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
810 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
811 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
812 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
813 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
816 if (and_with->flags & ANYOF_INVERT)
817 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
818 cl->bitmap[i] &= ~and_with->bitmap[i];
820 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
821 cl->bitmap[i] &= and_with->bitmap[i];
822 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
824 if (and_with->flags & ANYOF_INVERT) {
826 /* Here, the and'ed node is inverted. Get the AND of the flags that
827 * aren't affected by the inversion. Those that are affected are
828 * handled individually below */
829 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
830 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
831 cl->flags |= affected_flags;
833 /* We currently don't know how to deal with things that aren't in the
834 * bitmap, but we know that the intersection is no greater than what
835 * is already in cl, so let there be false positives that get sorted
836 * out after the synthetic start class succeeds, and the node is
837 * matched for real. */
839 /* The inversion of these two flags indicate that the resulting
840 * intersection doesn't have them */
841 if (and_with->flags & ANYOF_UNICODE_ALL) {
842 cl->flags &= ~ANYOF_UNICODE_ALL;
844 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
845 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
848 else { /* and'd node is not inverted */
849 U8 outside_bitmap_but_not_utf8; /* Temp variable */
851 if (! ANYOF_NONBITMAP(and_with)) {
853 /* Here 'and_with' doesn't match anything outside the bitmap
854 * (except possibly ANYOF_UNICODE_ALL), which means the
855 * intersection can't either, except for ANYOF_UNICODE_ALL, in
856 * which case we don't know what the intersection is, but it's no
857 * greater than what cl already has, so can just leave it alone,
858 * with possible false positives */
859 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
860 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
861 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
864 else if (! ANYOF_NONBITMAP(cl)) {
866 /* Here, 'and_with' does match something outside the bitmap, and cl
867 * doesn't have a list of things to match outside the bitmap. If
868 * cl can match all code points above 255, the intersection will
869 * be those above-255 code points that 'and_with' matches. If cl
870 * can't match all Unicode code points, it means that it can't
871 * match anything outside the bitmap (since the 'if' that got us
872 * into this block tested for that), so we leave the bitmap empty.
874 if (cl->flags & ANYOF_UNICODE_ALL) {
875 ARG_SET(cl, ARG(and_with));
877 /* and_with's ARG may match things that don't require UTF8.
878 * And now cl's will too, in spite of this being an 'and'. See
879 * the comments below about the kludge */
880 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
884 /* Here, both 'and_with' and cl match something outside the
885 * bitmap. Currently we do not do the intersection, so just match
886 * whatever cl had at the beginning. */
890 /* Take the intersection of the two sets of flags. However, the
891 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
892 * kludge around the fact that this flag is not treated like the others
893 * which are initialized in cl_anything(). The way the optimizer works
894 * is that the synthetic start class (SSC) is initialized to match
895 * anything, and then the first time a real node is encountered, its
896 * values are AND'd with the SSC's with the result being the values of
897 * the real node. However, there are paths through the optimizer where
898 * the AND never gets called, so those initialized bits are set
899 * inappropriately, which is not usually a big deal, as they just cause
900 * false positives in the SSC, which will just mean a probably
901 * imperceptible slow down in execution. However this bit has a
902 * higher false positive consequence in that it can cause utf8.pm,
903 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
904 * bigger slowdown and also causes significant extra memory to be used.
905 * In order to prevent this, the code now takes a different tack. The
906 * bit isn't set unless some part of the regular expression needs it,
907 * but once set it won't get cleared. This means that these extra
908 * modules won't get loaded unless there was some path through the
909 * pattern that would have required them anyway, and so any false
910 * positives that occur by not ANDing them out when they could be
911 * aren't as severe as they would be if we treated this bit like all
913 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
914 & ANYOF_NONBITMAP_NON_UTF8;
915 cl->flags &= and_with->flags;
916 cl->flags |= outside_bitmap_but_not_utf8;
920 /* 'OR' a given class with another one. Can create false positives. 'cl'
921 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
922 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
924 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
926 PERL_ARGS_ASSERT_CL_OR;
928 if (or_with->flags & ANYOF_INVERT) {
930 /* Here, the or'd node is to be inverted. This means we take the
931 * complement of everything not in the bitmap, but currently we don't
932 * know what that is, so give up and match anything */
933 if (ANYOF_NONBITMAP(or_with)) {
934 cl_anything(pRExC_state, cl);
937 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
938 * <= (B1 | !B2) | (CL1 | !CL2)
939 * which is wasteful if CL2 is small, but we ignore CL2:
940 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
941 * XXXX Can we handle case-fold? Unclear:
942 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
943 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
945 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
946 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
947 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
950 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
951 cl->bitmap[i] |= ~or_with->bitmap[i];
952 } /* XXXX: logic is complicated otherwise */
954 cl_anything(pRExC_state, cl);
957 /* And, we can just take the union of the flags that aren't affected
958 * by the inversion */
959 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
961 /* For the remaining flags:
962 ANYOF_UNICODE_ALL and inverted means to not match anything above
963 255, which means that the union with cl should just be
964 what cl has in it, so can ignore this flag
965 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
966 is 127-255 to match them, but then invert that, so the
967 union with cl should just be what cl has in it, so can
970 } else { /* 'or_with' is not inverted */
971 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
972 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
973 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
974 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
977 /* OR char bitmap and class bitmap separately */
978 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
979 cl->bitmap[i] |= or_with->bitmap[i];
980 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
981 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
982 cl->classflags[i] |= or_with->classflags[i];
983 cl->flags |= ANYOF_CLASS;
986 else { /* XXXX: logic is complicated, leave it along for a moment. */
987 cl_anything(pRExC_state, cl);
990 if (ANYOF_NONBITMAP(or_with)) {
992 /* Use the added node's outside-the-bit-map match if there isn't a
993 * conflict. If there is a conflict (both nodes match something
994 * outside the bitmap, but what they match outside is not the same
995 * pointer, and hence not easily compared until XXX we extend
996 * inversion lists this far), give up and allow the start class to
997 * match everything outside the bitmap. If that stuff is all above
998 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
999 if (! ANYOF_NONBITMAP(cl)) {
1000 ARG_SET(cl, ARG(or_with));
1002 else if (ARG(cl) != ARG(or_with)) {
1004 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1005 cl_anything(pRExC_state, cl);
1008 cl->flags |= ANYOF_UNICODE_ALL;
1013 /* Take the union */
1014 cl->flags |= or_with->flags;
1018 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1019 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1020 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1021 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1026 dump_trie(trie,widecharmap,revcharmap)
1027 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1028 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1030 These routines dump out a trie in a somewhat readable format.
1031 The _interim_ variants are used for debugging the interim
1032 tables that are used to generate the final compressed
1033 representation which is what dump_trie expects.
1035 Part of the reason for their existence is to provide a form
1036 of documentation as to how the different representations function.
1041 Dumps the final compressed table form of the trie to Perl_debug_log.
1042 Used for debugging make_trie().
1046 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1047 AV *revcharmap, U32 depth)
1050 SV *sv=sv_newmortal();
1051 int colwidth= widecharmap ? 6 : 4;
1053 GET_RE_DEBUG_FLAGS_DECL;
1055 PERL_ARGS_ASSERT_DUMP_TRIE;
1057 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1058 (int)depth * 2 + 2,"",
1059 "Match","Base","Ofs" );
1061 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1062 SV ** const tmp = av_fetch( revcharmap, state, 0);
1064 PerlIO_printf( Perl_debug_log, "%*s",
1066 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1067 PL_colors[0], PL_colors[1],
1068 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1069 PERL_PV_ESCAPE_FIRSTCHAR
1074 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1075 (int)depth * 2 + 2,"");
1077 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1078 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1079 PerlIO_printf( Perl_debug_log, "\n");
1081 for( state = 1 ; state < trie->statecount ; state++ ) {
1082 const U32 base = trie->states[ state ].trans.base;
1084 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1086 if ( trie->states[ state ].wordnum ) {
1087 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1089 PerlIO_printf( Perl_debug_log, "%6s", "" );
1092 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1097 while( ( base + ofs < trie->uniquecharcount ) ||
1098 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1099 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1102 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1104 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1105 if ( ( base + ofs >= trie->uniquecharcount ) &&
1106 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1107 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1109 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1111 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1113 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1117 PerlIO_printf( Perl_debug_log, "]");
1120 PerlIO_printf( Perl_debug_log, "\n" );
1122 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1123 for (word=1; word <= trie->wordcount; word++) {
1124 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1125 (int)word, (int)(trie->wordinfo[word].prev),
1126 (int)(trie->wordinfo[word].len));
1128 PerlIO_printf(Perl_debug_log, "\n" );
1131 Dumps a fully constructed but uncompressed trie in list form.
1132 List tries normally only are used for construction when the number of
1133 possible chars (trie->uniquecharcount) is very high.
1134 Used for debugging make_trie().
1137 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1138 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1142 SV *sv=sv_newmortal();
1143 int colwidth= widecharmap ? 6 : 4;
1144 GET_RE_DEBUG_FLAGS_DECL;
1146 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1148 /* print out the table precompression. */
1149 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1150 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1151 "------:-----+-----------------\n" );
1153 for( state=1 ; state < next_alloc ; state ++ ) {
1156 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1157 (int)depth * 2 + 2,"", (UV)state );
1158 if ( ! trie->states[ state ].wordnum ) {
1159 PerlIO_printf( Perl_debug_log, "%5s| ","");
1161 PerlIO_printf( Perl_debug_log, "W%4x| ",
1162 trie->states[ state ].wordnum
1165 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1166 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1168 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1170 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1171 PL_colors[0], PL_colors[1],
1172 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1173 PERL_PV_ESCAPE_FIRSTCHAR
1175 TRIE_LIST_ITEM(state,charid).forid,
1176 (UV)TRIE_LIST_ITEM(state,charid).newstate
1179 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1180 (int)((depth * 2) + 14), "");
1183 PerlIO_printf( Perl_debug_log, "\n");
1188 Dumps a fully constructed but uncompressed trie in table form.
1189 This is the normal DFA style state transition table, with a few
1190 twists to facilitate compression later.
1191 Used for debugging make_trie().
1194 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1195 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1200 SV *sv=sv_newmortal();
1201 int colwidth= widecharmap ? 6 : 4;
1202 GET_RE_DEBUG_FLAGS_DECL;
1204 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1207 print out the table precompression so that we can do a visual check
1208 that they are identical.
1211 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1213 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1214 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1216 PerlIO_printf( Perl_debug_log, "%*s",
1218 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1219 PL_colors[0], PL_colors[1],
1220 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1221 PERL_PV_ESCAPE_FIRSTCHAR
1227 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1229 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1230 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1233 PerlIO_printf( Perl_debug_log, "\n" );
1235 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1237 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1238 (int)depth * 2 + 2,"",
1239 (UV)TRIE_NODENUM( state ) );
1241 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1242 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1244 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1246 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1248 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1249 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1251 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1252 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1260 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1261 startbranch: the first branch in the whole branch sequence
1262 first : start branch of sequence of branch-exact nodes.
1263 May be the same as startbranch
1264 last : Thing following the last branch.
1265 May be the same as tail.
1266 tail : item following the branch sequence
1267 count : words in the sequence
1268 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1269 depth : indent depth
1271 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1273 A trie is an N'ary tree where the branches are determined by digital
1274 decomposition of the key. IE, at the root node you look up the 1st character and
1275 follow that branch repeat until you find the end of the branches. Nodes can be
1276 marked as "accepting" meaning they represent a complete word. Eg:
1280 would convert into the following structure. Numbers represent states, letters
1281 following numbers represent valid transitions on the letter from that state, if
1282 the number is in square brackets it represents an accepting state, otherwise it
1283 will be in parenthesis.
1285 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1289 (1) +-i->(6)-+-s->[7]
1291 +-s->(3)-+-h->(4)-+-e->[5]
1293 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1295 This shows that when matching against the string 'hers' we will begin at state 1
1296 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1297 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1298 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1299 single traverse. We store a mapping from accepting to state to which word was
1300 matched, and then when we have multiple possibilities we try to complete the
1301 rest of the regex in the order in which they occured in the alternation.
1303 The only prior NFA like behaviour that would be changed by the TRIE support is
1304 the silent ignoring of duplicate alternations which are of the form:
1306 / (DUPE|DUPE) X? (?{ ... }) Y /x
1308 Thus EVAL blocks following a trie may be called a different number of times with
1309 and without the optimisation. With the optimisations dupes will be silently
1310 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1311 the following demonstrates:
1313 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1315 which prints out 'word' three times, but
1317 'words'=~/(word|word|word)(?{ print $1 })S/
1319 which doesnt print it out at all. This is due to other optimisations kicking in.
1321 Example of what happens on a structural level:
1323 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1325 1: CURLYM[1] {1,32767}(18)
1336 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1337 and should turn into:
1339 1: CURLYM[1] {1,32767}(18)
1341 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1349 Cases where tail != last would be like /(?foo|bar)baz/:
1359 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1360 and would end up looking like:
1363 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1370 d = uvuni_to_utf8_flags(d, uv, 0);
1372 is the recommended Unicode-aware way of saying
1377 #define TRIE_STORE_REVCHAR(val) \
1380 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1381 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1382 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1383 SvCUR_set(zlopp, kapow - flrbbbbb); \
1386 av_push(revcharmap, zlopp); \
1388 char ooooff = (char)val; \
1389 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1393 #define TRIE_READ_CHAR STMT_START { \
1396 /* if it is UTF then it is either already folded, or does not need folding */ \
1397 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1399 else if (folder == PL_fold_latin1) { \
1400 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1401 if ( foldlen > 0 ) { \
1402 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1408 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1409 skiplen = UNISKIP(uvc); \
1410 foldlen -= skiplen; \
1411 scan = foldbuf + skiplen; \
1414 /* raw data, will be folded later if needed */ \
1422 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1423 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1424 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1425 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1427 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1428 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1429 TRIE_LIST_CUR( state )++; \
1432 #define TRIE_LIST_NEW(state) STMT_START { \
1433 Newxz( trie->states[ state ].trans.list, \
1434 4, reg_trie_trans_le ); \
1435 TRIE_LIST_CUR( state ) = 1; \
1436 TRIE_LIST_LEN( state ) = 4; \
1439 #define TRIE_HANDLE_WORD(state) STMT_START { \
1440 U16 dupe= trie->states[ state ].wordnum; \
1441 regnode * const noper_next = regnext( noper ); \
1444 /* store the word for dumping */ \
1446 if (OP(noper) != NOTHING) \
1447 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1449 tmp = newSVpvn_utf8( "", 0, UTF ); \
1450 av_push( trie_words, tmp ); \
1454 trie->wordinfo[curword].prev = 0; \
1455 trie->wordinfo[curword].len = wordlen; \
1456 trie->wordinfo[curword].accept = state; \
1458 if ( noper_next < tail ) { \
1460 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1461 trie->jump[curword] = (U16)(noper_next - convert); \
1463 jumper = noper_next; \
1465 nextbranch= regnext(cur); \
1469 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1470 /* chain, so that when the bits of chain are later */\
1471 /* linked together, the dups appear in the chain */\
1472 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1473 trie->wordinfo[dupe].prev = curword; \
1475 /* we haven't inserted this word yet. */ \
1476 trie->states[ state ].wordnum = curword; \
1481 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1482 ( ( base + charid >= ucharcount \
1483 && base + charid < ubound \
1484 && state == trie->trans[ base - ucharcount + charid ].check \
1485 && trie->trans[ base - ucharcount + charid ].next ) \
1486 ? trie->trans[ base - ucharcount + charid ].next \
1487 : ( state==1 ? special : 0 ) \
1491 #define MADE_JUMP_TRIE 2
1492 #define MADE_EXACT_TRIE 4
1495 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1498 /* first pass, loop through and scan words */
1499 reg_trie_data *trie;
1500 HV *widecharmap = NULL;
1501 AV *revcharmap = newAV();
1503 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1508 regnode *jumper = NULL;
1509 regnode *nextbranch = NULL;
1510 regnode *convert = NULL;
1511 U32 *prev_states; /* temp array mapping each state to previous one */
1512 /* we just use folder as a flag in utf8 */
1513 const U8 * folder = NULL;
1516 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1517 AV *trie_words = NULL;
1518 /* along with revcharmap, this only used during construction but both are
1519 * useful during debugging so we store them in the struct when debugging.
1522 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1523 STRLEN trie_charcount=0;
1525 SV *re_trie_maxbuff;
1526 GET_RE_DEBUG_FLAGS_DECL;
1528 PERL_ARGS_ASSERT_MAKE_TRIE;
1530 PERL_UNUSED_ARG(depth);
1537 case EXACTFU_TRICKYFOLD:
1538 case EXACTFU: folder = PL_fold_latin1; break;
1539 case EXACTF: folder = PL_fold; break;
1540 case EXACTFL: folder = PL_fold_locale; break;
1541 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1544 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1546 trie->startstate = 1;
1547 trie->wordcount = word_count;
1548 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1549 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1551 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1552 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1553 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1556 trie_words = newAV();
1559 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1560 if (!SvIOK(re_trie_maxbuff)) {
1561 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1563 DEBUG_TRIE_COMPILE_r({
1564 PerlIO_printf( Perl_debug_log,
1565 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1566 (int)depth * 2 + 2, "",
1567 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1568 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1572 /* Find the node we are going to overwrite */
1573 if ( first == startbranch && OP( last ) != BRANCH ) {
1574 /* whole branch chain */
1577 /* branch sub-chain */
1578 convert = NEXTOPER( first );
1581 /* -- First loop and Setup --
1583 We first traverse the branches and scan each word to determine if it
1584 contains widechars, and how many unique chars there are, this is
1585 important as we have to build a table with at least as many columns as we
1588 We use an array of integers to represent the character codes 0..255
1589 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1590 native representation of the character value as the key and IV's for the
1593 *TODO* If we keep track of how many times each character is used we can
1594 remap the columns so that the table compression later on is more
1595 efficient in terms of memory by ensuring the most common value is in the
1596 middle and the least common are on the outside. IMO this would be better
1597 than a most to least common mapping as theres a decent chance the most
1598 common letter will share a node with the least common, meaning the node
1599 will not be compressible. With a middle is most common approach the worst
1600 case is when we have the least common nodes twice.
1604 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1605 regnode *noper = NEXTOPER( cur );
1606 const U8 *uc = (U8*)STRING( noper );
1607 const U8 *e = uc + STR_LEN( noper );
1609 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1611 const U8 *scan = (U8*)NULL;
1612 U32 wordlen = 0; /* required init */
1614 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1616 if (OP(noper) == NOTHING) {
1617 regnode *noper_next= regnext(noper);
1618 if (noper_next != tail && OP(noper_next) == flags) {
1620 uc= (U8*)STRING(noper);
1621 e= uc + STR_LEN(noper);
1622 trie->minlen= STR_LEN(noper);
1629 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1630 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1631 regardless of encoding */
1632 if (OP( noper ) == EXACTFU_SS) {
1633 /* false positives are ok, so just set this */
1634 TRIE_BITMAP_SET(trie,0xDF);
1637 for ( ; uc < e ; uc += len ) {
1638 TRIE_CHARCOUNT(trie)++;
1643 U8 folded= folder[ (U8) uvc ];
1644 if ( !trie->charmap[ folded ] ) {
1645 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1646 TRIE_STORE_REVCHAR( folded );
1649 if ( !trie->charmap[ uvc ] ) {
1650 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1651 TRIE_STORE_REVCHAR( uvc );
1654 /* store the codepoint in the bitmap, and its folded
1656 TRIE_BITMAP_SET(trie, uvc);
1658 /* store the folded codepoint */
1659 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1662 /* store first byte of utf8 representation of
1663 variant codepoints */
1664 if (! UNI_IS_INVARIANT(uvc)) {
1665 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1668 set_bit = 0; /* We've done our bit :-) */
1673 widecharmap = newHV();
1675 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1678 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1680 if ( !SvTRUE( *svpp ) ) {
1681 sv_setiv( *svpp, ++trie->uniquecharcount );
1682 TRIE_STORE_REVCHAR(uvc);
1686 if( cur == first ) {
1687 trie->minlen = chars;
1688 trie->maxlen = chars;
1689 } else if (chars < trie->minlen) {
1690 trie->minlen = chars;
1691 } else if (chars > trie->maxlen) {
1692 trie->maxlen = chars;
1694 if (OP( noper ) == EXACTFU_SS) {
1695 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1696 if (trie->minlen > 1)
1699 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1700 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1701 * - We assume that any such sequence might match a 2 byte string */
1702 if (trie->minlen > 2 )
1706 } /* end first pass */
1707 DEBUG_TRIE_COMPILE_r(
1708 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1709 (int)depth * 2 + 2,"",
1710 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1711 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1712 (int)trie->minlen, (int)trie->maxlen )
1716 We now know what we are dealing with in terms of unique chars and
1717 string sizes so we can calculate how much memory a naive
1718 representation using a flat table will take. If it's over a reasonable
1719 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1720 conservative but potentially much slower representation using an array
1723 At the end we convert both representations into the same compressed
1724 form that will be used in regexec.c for matching with. The latter
1725 is a form that cannot be used to construct with but has memory
1726 properties similar to the list form and access properties similar
1727 to the table form making it both suitable for fast searches and
1728 small enough that its feasable to store for the duration of a program.
1730 See the comment in the code where the compressed table is produced
1731 inplace from the flat tabe representation for an explanation of how
1732 the compression works.
1737 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1740 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1742 Second Pass -- Array Of Lists Representation
1744 Each state will be represented by a list of charid:state records
1745 (reg_trie_trans_le) the first such element holds the CUR and LEN
1746 points of the allocated array. (See defines above).
1748 We build the initial structure using the lists, and then convert
1749 it into the compressed table form which allows faster lookups
1750 (but cant be modified once converted).
1753 STRLEN transcount = 1;
1755 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1756 "%*sCompiling trie using list compiler\n",
1757 (int)depth * 2 + 2, ""));
1759 trie->states = (reg_trie_state *)
1760 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1761 sizeof(reg_trie_state) );
1765 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1767 regnode *noper = NEXTOPER( cur );
1768 U8 *uc = (U8*)STRING( noper );
1769 const U8 *e = uc + STR_LEN( noper );
1770 U32 state = 1; /* required init */
1771 U16 charid = 0; /* sanity init */
1772 U8 *scan = (U8*)NULL; /* sanity init */
1773 STRLEN foldlen = 0; /* required init */
1774 U32 wordlen = 0; /* required init */
1775 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1778 if (OP(noper) == NOTHING) {
1779 regnode *noper_next= regnext(noper);
1780 if (noper_next != tail && OP(noper_next) == flags) {
1782 uc= (U8*)STRING(noper);
1783 e= uc + STR_LEN(noper);
1787 if (OP(noper) != NOTHING) {
1788 for ( ; uc < e ; uc += len ) {
1793 charid = trie->charmap[ uvc ];
1795 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1799 charid=(U16)SvIV( *svpp );
1802 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1809 if ( !trie->states[ state ].trans.list ) {
1810 TRIE_LIST_NEW( state );
1812 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1813 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1814 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1819 newstate = next_alloc++;
1820 prev_states[newstate] = state;
1821 TRIE_LIST_PUSH( state, charid, newstate );
1826 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1830 TRIE_HANDLE_WORD(state);
1832 } /* end second pass */
1834 /* next alloc is the NEXT state to be allocated */
1835 trie->statecount = next_alloc;
1836 trie->states = (reg_trie_state *)
1837 PerlMemShared_realloc( trie->states,
1839 * sizeof(reg_trie_state) );
1841 /* and now dump it out before we compress it */
1842 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1843 revcharmap, next_alloc,
1847 trie->trans = (reg_trie_trans *)
1848 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1855 for( state=1 ; state < next_alloc ; state ++ ) {
1859 DEBUG_TRIE_COMPILE_MORE_r(
1860 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1864 if (trie->states[state].trans.list) {
1865 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1869 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1870 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1871 if ( forid < minid ) {
1873 } else if ( forid > maxid ) {
1877 if ( transcount < tp + maxid - minid + 1) {
1879 trie->trans = (reg_trie_trans *)
1880 PerlMemShared_realloc( trie->trans,
1882 * sizeof(reg_trie_trans) );
1883 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1885 base = trie->uniquecharcount + tp - minid;
1886 if ( maxid == minid ) {
1888 for ( ; zp < tp ; zp++ ) {
1889 if ( ! trie->trans[ zp ].next ) {
1890 base = trie->uniquecharcount + zp - minid;
1891 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1892 trie->trans[ zp ].check = state;
1898 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1899 trie->trans[ tp ].check = state;
1904 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1905 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1906 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1907 trie->trans[ tid ].check = state;
1909 tp += ( maxid - minid + 1 );
1911 Safefree(trie->states[ state ].trans.list);
1914 DEBUG_TRIE_COMPILE_MORE_r(
1915 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1918 trie->states[ state ].trans.base=base;
1920 trie->lasttrans = tp + 1;
1924 Second Pass -- Flat Table Representation.
1926 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1927 We know that we will need Charcount+1 trans at most to store the data
1928 (one row per char at worst case) So we preallocate both structures
1929 assuming worst case.
1931 We then construct the trie using only the .next slots of the entry
1934 We use the .check field of the first entry of the node temporarily to
1935 make compression both faster and easier by keeping track of how many non
1936 zero fields are in the node.
1938 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1941 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1942 number representing the first entry of the node, and state as a
1943 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1944 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1945 are 2 entrys per node. eg:
1953 The table is internally in the right hand, idx form. However as we also
1954 have to deal with the states array which is indexed by nodenum we have to
1955 use TRIE_NODENUM() to convert.
1958 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1959 "%*sCompiling trie using table compiler\n",
1960 (int)depth * 2 + 2, ""));
1962 trie->trans = (reg_trie_trans *)
1963 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1964 * trie->uniquecharcount + 1,
1965 sizeof(reg_trie_trans) );
1966 trie->states = (reg_trie_state *)
1967 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1968 sizeof(reg_trie_state) );
1969 next_alloc = trie->uniquecharcount + 1;
1972 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1974 regnode *noper = NEXTOPER( cur );
1975 const U8 *uc = (U8*)STRING( noper );
1976 const U8 *e = uc + STR_LEN( noper );
1978 U32 state = 1; /* required init */
1980 U16 charid = 0; /* sanity init */
1981 U32 accept_state = 0; /* sanity init */
1982 U8 *scan = (U8*)NULL; /* sanity init */
1984 STRLEN foldlen = 0; /* required init */
1985 U32 wordlen = 0; /* required init */
1987 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1989 if (OP(noper) == NOTHING) {
1990 regnode *noper_next= regnext(noper);
1991 if (noper_next != tail && OP(noper_next) == flags) {
1993 uc= (U8*)STRING(noper);
1994 e= uc + STR_LEN(noper);
1998 if ( OP(noper) != NOTHING ) {
1999 for ( ; uc < e ; uc += len ) {
2004 charid = trie->charmap[ uvc ];
2006 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2007 charid = svpp ? (U16)SvIV(*svpp) : 0;
2011 if ( !trie->trans[ state + charid ].next ) {
2012 trie->trans[ state + charid ].next = next_alloc;
2013 trie->trans[ state ].check++;
2014 prev_states[TRIE_NODENUM(next_alloc)]
2015 = TRIE_NODENUM(state);
2016 next_alloc += trie->uniquecharcount;
2018 state = trie->trans[ state + charid ].next;
2020 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2022 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2025 accept_state = TRIE_NODENUM( state );
2026 TRIE_HANDLE_WORD(accept_state);
2028 } /* end second pass */
2030 /* and now dump it out before we compress it */
2031 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2033 next_alloc, depth+1));
2037 * Inplace compress the table.*
2039 For sparse data sets the table constructed by the trie algorithm will
2040 be mostly 0/FAIL transitions or to put it another way mostly empty.
2041 (Note that leaf nodes will not contain any transitions.)
2043 This algorithm compresses the tables by eliminating most such
2044 transitions, at the cost of a modest bit of extra work during lookup:
2046 - Each states[] entry contains a .base field which indicates the
2047 index in the state[] array wheres its transition data is stored.
2049 - If .base is 0 there are no valid transitions from that node.
2051 - If .base is nonzero then charid is added to it to find an entry in
2054 -If trans[states[state].base+charid].check!=state then the
2055 transition is taken to be a 0/Fail transition. Thus if there are fail
2056 transitions at the front of the node then the .base offset will point
2057 somewhere inside the previous nodes data (or maybe even into a node
2058 even earlier), but the .check field determines if the transition is
2062 The following process inplace converts the table to the compressed
2063 table: We first do not compress the root node 1,and mark all its
2064 .check pointers as 1 and set its .base pointer as 1 as well. This
2065 allows us to do a DFA construction from the compressed table later,
2066 and ensures that any .base pointers we calculate later are greater
2069 - We set 'pos' to indicate the first entry of the second node.
2071 - We then iterate over the columns of the node, finding the first and
2072 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2073 and set the .check pointers accordingly, and advance pos
2074 appropriately and repreat for the next node. Note that when we copy
2075 the next pointers we have to convert them from the original
2076 NODEIDX form to NODENUM form as the former is not valid post
2079 - If a node has no transitions used we mark its base as 0 and do not
2080 advance the pos pointer.
2082 - If a node only has one transition we use a second pointer into the
2083 structure to fill in allocated fail transitions from other states.
2084 This pointer is independent of the main pointer and scans forward
2085 looking for null transitions that are allocated to a state. When it
2086 finds one it writes the single transition into the "hole". If the
2087 pointer doesnt find one the single transition is appended as normal.
2089 - Once compressed we can Renew/realloc the structures to release the
2092 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2093 specifically Fig 3.47 and the associated pseudocode.
2097 const U32 laststate = TRIE_NODENUM( next_alloc );
2100 trie->statecount = laststate;
2102 for ( state = 1 ; state < laststate ; state++ ) {
2104 const U32 stateidx = TRIE_NODEIDX( state );
2105 const U32 o_used = trie->trans[ stateidx ].check;
2106 U32 used = trie->trans[ stateidx ].check;
2107 trie->trans[ stateidx ].check = 0;
2109 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2110 if ( flag || trie->trans[ stateidx + charid ].next ) {
2111 if ( trie->trans[ stateidx + charid ].next ) {
2113 for ( ; zp < pos ; zp++ ) {
2114 if ( ! trie->trans[ zp ].next ) {
2118 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2119 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2120 trie->trans[ zp ].check = state;
2121 if ( ++zp > pos ) pos = zp;
2128 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2130 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2131 trie->trans[ pos ].check = state;
2136 trie->lasttrans = pos + 1;
2137 trie->states = (reg_trie_state *)
2138 PerlMemShared_realloc( trie->states, laststate
2139 * sizeof(reg_trie_state) );
2140 DEBUG_TRIE_COMPILE_MORE_r(
2141 PerlIO_printf( Perl_debug_log,
2142 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2143 (int)depth * 2 + 2,"",
2144 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2147 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2150 } /* end table compress */
2152 DEBUG_TRIE_COMPILE_MORE_r(
2153 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2154 (int)depth * 2 + 2, "",
2155 (UV)trie->statecount,
2156 (UV)trie->lasttrans)
2158 /* resize the trans array to remove unused space */
2159 trie->trans = (reg_trie_trans *)
2160 PerlMemShared_realloc( trie->trans, trie->lasttrans
2161 * sizeof(reg_trie_trans) );
2163 { /* Modify the program and insert the new TRIE node */
2164 U8 nodetype =(U8)(flags & 0xFF);
2168 regnode *optimize = NULL;
2169 #ifdef RE_TRACK_PATTERN_OFFSETS
2172 U32 mjd_nodelen = 0;
2173 #endif /* RE_TRACK_PATTERN_OFFSETS */
2174 #endif /* DEBUGGING */
2176 This means we convert either the first branch or the first Exact,
2177 depending on whether the thing following (in 'last') is a branch
2178 or not and whther first is the startbranch (ie is it a sub part of
2179 the alternation or is it the whole thing.)
2180 Assuming its a sub part we convert the EXACT otherwise we convert
2181 the whole branch sequence, including the first.
2183 /* Find the node we are going to overwrite */
2184 if ( first != startbranch || OP( last ) == BRANCH ) {
2185 /* branch sub-chain */
2186 NEXT_OFF( first ) = (U16)(last - first);
2187 #ifdef RE_TRACK_PATTERN_OFFSETS
2189 mjd_offset= Node_Offset((convert));
2190 mjd_nodelen= Node_Length((convert));
2193 /* whole branch chain */
2195 #ifdef RE_TRACK_PATTERN_OFFSETS
2198 const regnode *nop = NEXTOPER( convert );
2199 mjd_offset= Node_Offset((nop));
2200 mjd_nodelen= Node_Length((nop));
2204 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2205 (int)depth * 2 + 2, "",
2206 (UV)mjd_offset, (UV)mjd_nodelen)
2209 /* But first we check to see if there is a common prefix we can
2210 split out as an EXACT and put in front of the TRIE node. */
2211 trie->startstate= 1;
2212 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2214 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2218 const U32 base = trie->states[ state ].trans.base;
2220 if ( trie->states[state].wordnum )
2223 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2224 if ( ( base + ofs >= trie->uniquecharcount ) &&
2225 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2226 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2228 if ( ++count > 1 ) {
2229 SV **tmp = av_fetch( revcharmap, ofs, 0);
2230 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2231 if ( state == 1 ) break;
2233 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2235 PerlIO_printf(Perl_debug_log,
2236 "%*sNew Start State=%"UVuf" Class: [",
2237 (int)depth * 2 + 2, "",
2240 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2241 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2243 TRIE_BITMAP_SET(trie,*ch);
2245 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2247 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2251 TRIE_BITMAP_SET(trie,*ch);
2253 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2254 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2260 SV **tmp = av_fetch( revcharmap, idx, 0);
2262 char *ch = SvPV( *tmp, len );
2264 SV *sv=sv_newmortal();
2265 PerlIO_printf( Perl_debug_log,
2266 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2267 (int)depth * 2 + 2, "",
2269 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2270 PL_colors[0], PL_colors[1],
2271 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2272 PERL_PV_ESCAPE_FIRSTCHAR
2277 OP( convert ) = nodetype;
2278 str=STRING(convert);
2281 STR_LEN(convert) += len;
2287 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2292 trie->prefixlen = (state-1);
2294 regnode *n = convert+NODE_SZ_STR(convert);
2295 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2296 trie->startstate = state;
2297 trie->minlen -= (state - 1);
2298 trie->maxlen -= (state - 1);
2300 /* At least the UNICOS C compiler choked on this
2301 * being argument to DEBUG_r(), so let's just have
2304 #ifdef PERL_EXT_RE_BUILD
2310 regnode *fix = convert;
2311 U32 word = trie->wordcount;
2313 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2314 while( ++fix < n ) {
2315 Set_Node_Offset_Length(fix, 0, 0);
2318 SV ** const tmp = av_fetch( trie_words, word, 0 );
2320 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2321 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2323 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2331 NEXT_OFF(convert) = (U16)(tail - convert);
2332 DEBUG_r(optimize= n);
2338 if ( trie->maxlen ) {
2339 NEXT_OFF( convert ) = (U16)(tail - convert);
2340 ARG_SET( convert, data_slot );
2341 /* Store the offset to the first unabsorbed branch in
2342 jump[0], which is otherwise unused by the jump logic.
2343 We use this when dumping a trie and during optimisation. */
2345 trie->jump[0] = (U16)(nextbranch - convert);
2347 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2348 * and there is a bitmap
2349 * and the first "jump target" node we found leaves enough room
2350 * then convert the TRIE node into a TRIEC node, with the bitmap
2351 * embedded inline in the opcode - this is hypothetically faster.
2353 if ( !trie->states[trie->startstate].wordnum
2355 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2357 OP( convert ) = TRIEC;
2358 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2359 PerlMemShared_free(trie->bitmap);
2362 OP( convert ) = TRIE;
2364 /* store the type in the flags */
2365 convert->flags = nodetype;
2369 + regarglen[ OP( convert ) ];
2371 /* XXX We really should free up the resource in trie now,
2372 as we won't use them - (which resources?) dmq */
2374 /* needed for dumping*/
2375 DEBUG_r(if (optimize) {
2376 regnode *opt = convert;
2378 while ( ++opt < optimize) {
2379 Set_Node_Offset_Length(opt,0,0);
2382 Try to clean up some of the debris left after the
2385 while( optimize < jumper ) {
2386 mjd_nodelen += Node_Length((optimize));
2387 OP( optimize ) = OPTIMIZED;
2388 Set_Node_Offset_Length(optimize,0,0);
2391 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2393 } /* end node insert */
2395 /* Finish populating the prev field of the wordinfo array. Walk back
2396 * from each accept state until we find another accept state, and if
2397 * so, point the first word's .prev field at the second word. If the
2398 * second already has a .prev field set, stop now. This will be the
2399 * case either if we've already processed that word's accept state,
2400 * or that state had multiple words, and the overspill words were
2401 * already linked up earlier.
2408 for (word=1; word <= trie->wordcount; word++) {
2410 if (trie->wordinfo[word].prev)
2412 state = trie->wordinfo[word].accept;
2414 state = prev_states[state];
2417 prev = trie->states[state].wordnum;
2421 trie->wordinfo[word].prev = prev;
2423 Safefree(prev_states);
2427 /* and now dump out the compressed format */
2428 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2430 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2432 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2433 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2435 SvREFCNT_dec(revcharmap);
2439 : trie->startstate>1
2445 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2447 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2449 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2450 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2453 We find the fail state for each state in the trie, this state is the longest proper
2454 suffix of the current state's 'word' that is also a proper prefix of another word in our
2455 trie. State 1 represents the word '' and is thus the default fail state. This allows
2456 the DFA not to have to restart after its tried and failed a word at a given point, it
2457 simply continues as though it had been matching the other word in the first place.
2459 'abcdgu'=~/abcdefg|cdgu/
2460 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2461 fail, which would bring us to the state representing 'd' in the second word where we would
2462 try 'g' and succeed, proceeding to match 'cdgu'.
2464 /* add a fail transition */
2465 const U32 trie_offset = ARG(source);
2466 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2468 const U32 ucharcount = trie->uniquecharcount;
2469 const U32 numstates = trie->statecount;
2470 const U32 ubound = trie->lasttrans + ucharcount;
2474 U32 base = trie->states[ 1 ].trans.base;
2477 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2478 GET_RE_DEBUG_FLAGS_DECL;
2480 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2482 PERL_UNUSED_ARG(depth);
2486 ARG_SET( stclass, data_slot );
2487 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2488 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2489 aho->trie=trie_offset;
2490 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2491 Copy( trie->states, aho->states, numstates, reg_trie_state );
2492 Newxz( q, numstates, U32);
2493 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2496 /* initialize fail[0..1] to be 1 so that we always have
2497 a valid final fail state */
2498 fail[ 0 ] = fail[ 1 ] = 1;
2500 for ( charid = 0; charid < ucharcount ; charid++ ) {
2501 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2503 q[ q_write ] = newstate;
2504 /* set to point at the root */
2505 fail[ q[ q_write++ ] ]=1;
2508 while ( q_read < q_write) {
2509 const U32 cur = q[ q_read++ % numstates ];
2510 base = trie->states[ cur ].trans.base;
2512 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2513 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2515 U32 fail_state = cur;
2518 fail_state = fail[ fail_state ];
2519 fail_base = aho->states[ fail_state ].trans.base;
2520 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2522 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2523 fail[ ch_state ] = fail_state;
2524 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2526 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2528 q[ q_write++ % numstates] = ch_state;
2532 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2533 when we fail in state 1, this allows us to use the
2534 charclass scan to find a valid start char. This is based on the principle
2535 that theres a good chance the string being searched contains lots of stuff
2536 that cant be a start char.
2538 fail[ 0 ] = fail[ 1 ] = 0;
2539 DEBUG_TRIE_COMPILE_r({
2540 PerlIO_printf(Perl_debug_log,
2541 "%*sStclass Failtable (%"UVuf" states): 0",
2542 (int)(depth * 2), "", (UV)numstates
2544 for( q_read=1; q_read<numstates; q_read++ ) {
2545 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2547 PerlIO_printf(Perl_debug_log, "\n");
2550 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2555 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2556 * These need to be revisited when a newer toolchain becomes available.
2558 #if defined(__sparc64__) && defined(__GNUC__)
2559 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2560 # undef SPARC64_GCC_WORKAROUND
2561 # define SPARC64_GCC_WORKAROUND 1
2565 #define DEBUG_PEEP(str,scan,depth) \
2566 DEBUG_OPTIMISE_r({if (scan){ \
2567 SV * const mysv=sv_newmortal(); \
2568 regnode *Next = regnext(scan); \
2569 regprop(RExC_rx, mysv, scan); \
2570 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2571 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2572 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2576 /* The below joins as many adjacent EXACTish nodes as possible into a single
2577 * one, and looks for problematic sequences of characters whose folds vs.
2578 * non-folds have sufficiently different lengths, that the optimizer would be
2579 * fooled into rejecting legitimate matches of them, and the trie construction
2580 * code can't cope with them. The joining is only done if:
2581 * 1) there is room in the current conglomerated node to entirely contain the
2583 * 2) they are the exact same node type
2585 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2586 * these get optimized out
2588 * If there are problematic code sequences, *min_subtract is set to the delta
2589 * that the minimum size of the node can be less than its actual size. And,
2590 * the node type of the result is changed to reflect that it contains these
2593 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2594 * and contains LATIN SMALL LETTER SHARP S
2596 * This is as good a place as any to discuss the design of handling these
2597 * problematic sequences. It's been wrong in Perl for a very long time. There
2598 * are three code points in Unicode whose folded lengths differ so much from
2599 * the un-folded lengths that it causes problems for the optimizer and trie
2600 * construction. Why only these are problematic, and not others where lengths
2601 * also differ is something I (khw) do not understand. New versions of Unicode
2602 * might add more such code points. Hopefully the logic in fold_grind.t that
2603 * figures out what to test (in part by verifying that each size-combination
2604 * gets tested) will catch any that do come along, so they can be added to the
2605 * special handling below. The chances of new ones are actually rather small,
2606 * as most, if not all, of the world's scripts that have casefolding have
2607 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2608 * made to allow compatibility with pre-existing standards, and almost all of
2609 * those have already been dealt with. These would otherwise be the most
2610 * likely candidates for generating further tricky sequences. In other words,
2611 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2612 * with pre-existing standards, and there aren't many of those left.
2614 * The previous designs for dealing with these involved assigning a special
2615 * node for them. This approach doesn't work, as evidenced by this example:
2616 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2617 * Both these fold to "sss", but if the pattern is parsed to create a node of
2618 * that would match just the \xDF, it won't be able to handle the case where a
2619 * successful match would have to cross the node's boundary. The new approach
2620 * that hopefully generally solves the problem generates an EXACTFU_SS node
2623 * There are a number of components to the approach (a lot of work for just
2624 * three code points!):
2625 * 1) This routine examines each EXACTFish node that could contain the
2626 * problematic sequences. It returns in *min_subtract how much to
2627 * subtract from the the actual length of the string to get a real minimum
2628 * for one that could match it. This number is usually 0 except for the
2629 * problematic sequences. This delta is used by the caller to adjust the
2630 * min length of the match, and the delta between min and max, so that the
2631 * optimizer doesn't reject these possibilities based on size constraints.
2632 * 2) These sequences are not currently correctly handled by the trie code
2633 * either, so it changes the joined node type to ops that are not handled
2634 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2635 * 3) This is sufficient for the two Greek sequences (described below), but
2636 * the one involving the Sharp s (\xDF) needs more. The node type
2637 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2638 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2639 * case where there is a possible fold length change. That means that a
2640 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2641 * itself with length changes, and so can be processed faster. regexec.c
2642 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2643 * is pre-folded by regcomp.c. This saves effort in regex matching.
2644 * However, probably mostly for historical reasons, the pre-folding isn't
2645 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2646 * nodes, as what they fold to isn't known until runtime.) The fold
2647 * possibilities for the non-UTF8 patterns are quite simple, except for
2648 * the sharp s. All the ones that don't involve a UTF-8 target string
2649 * are members of a fold-pair, and arrays are set up for all of them
2650 * that quickly find the other member of the pair. It might actually
2651 * be faster to pre-fold these, but it isn't currently done, except for
2652 * the sharp s. Code elsewhere in this file makes sure that it gets
2653 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2654 * issues described in the next item.
2655 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2656 * 'ss' or not is not knowable at compile time. It will match iff the
2657 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2658 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2659 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2660 * described in item 3). An assumption that the optimizer part of
2661 * regexec.c (probably unwittingly) makes is that a character in the
2662 * pattern corresponds to at most a single character in the target string.
2663 * (And I do mean character, and not byte here, unlike other parts of the
2664 * documentation that have never been updated to account for multibyte
2665 * Unicode.) This assumption is wrong only in this case, as all other
2666 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2667 * virtue of having this file pre-fold UTF-8 patterns. I'm
2668 * reluctant to try to change this assumption, so instead the code punts.
2669 * This routine examines EXACTF nodes for the sharp s, and returns a
2670 * boolean indicating whether or not the node is an EXACTF node that
2671 * contains a sharp s. When it is true, the caller sets a flag that later
2672 * causes the optimizer in this file to not set values for the floating
2673 * and fixed string lengths, and thus avoids the optimizer code in
2674 * regexec.c that makes the invalid assumption. Thus, there is no
2675 * optimization based on string lengths for EXACTF nodes that contain the
2676 * sharp s. This only happens for /id rules (which means the pattern
2680 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2681 if (PL_regkind[OP(scan)] == EXACT) \
2682 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2685 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2686 /* Merge several consecutive EXACTish nodes into one. */
2687 regnode *n = regnext(scan);
2689 regnode *next = scan + NODE_SZ_STR(scan);
2693 regnode *stop = scan;
2694 GET_RE_DEBUG_FLAGS_DECL;
2696 PERL_UNUSED_ARG(depth);
2699 PERL_ARGS_ASSERT_JOIN_EXACT;
2700 #ifndef EXPERIMENTAL_INPLACESCAN
2701 PERL_UNUSED_ARG(flags);
2702 PERL_UNUSED_ARG(val);
2704 DEBUG_PEEP("join",scan,depth);
2706 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2707 * EXACT ones that are mergeable to the current one. */
2709 && (PL_regkind[OP(n)] == NOTHING
2710 || (stringok && OP(n) == OP(scan)))
2712 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2715 if (OP(n) == TAIL || n > next)
2717 if (PL_regkind[OP(n)] == NOTHING) {
2718 DEBUG_PEEP("skip:",n,depth);
2719 NEXT_OFF(scan) += NEXT_OFF(n);
2720 next = n + NODE_STEP_REGNODE;
2727 else if (stringok) {
2728 const unsigned int oldl = STR_LEN(scan);
2729 regnode * const nnext = regnext(n);
2731 if (oldl + STR_LEN(n) > U8_MAX)
2734 DEBUG_PEEP("merg",n,depth);
2737 NEXT_OFF(scan) += NEXT_OFF(n);
2738 STR_LEN(scan) += STR_LEN(n);
2739 next = n + NODE_SZ_STR(n);
2740 /* Now we can overwrite *n : */
2741 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2749 #ifdef EXPERIMENTAL_INPLACESCAN
2750 if (flags && !NEXT_OFF(n)) {
2751 DEBUG_PEEP("atch", val, depth);
2752 if (reg_off_by_arg[OP(n)]) {
2753 ARG_SET(n, val - n);
2756 NEXT_OFF(n) = val - n;
2764 *has_exactf_sharp_s = FALSE;
2766 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2767 * can now analyze for sequences of problematic code points. (Prior to
2768 * this final joining, sequences could have been split over boundaries, and
2769 * hence missed). The sequences only happen in folding, hence for any
2770 * non-EXACT EXACTish node */
2771 if (OP(scan) != EXACT) {
2773 U8 * s0 = (U8*) STRING(scan);
2774 U8 * const s_end = s0 + STR_LEN(scan);
2776 /* The below is perhaps overboard, but this allows us to save a test
2777 * each time through the loop at the expense of a mask. This is
2778 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2779 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2780 * This uses an exclusive 'or' to find that bit and then inverts it to
2781 * form a mask, with just a single 0, in the bit position where 'S' and
2783 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2784 const U8 s_masked = 's' & S_or_s_mask;
2786 /* One pass is made over the node's string looking for all the
2787 * possibilities. to avoid some tests in the loop, there are two main
2788 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2792 /* There are two problematic Greek code points in Unicode
2795 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2796 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2802 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2803 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2805 * This means that in case-insensitive matching (or "loose
2806 * matching", as Unicode calls it), an EXACTF of length six (the
2807 * UTF-8 encoded byte length of the above casefolded versions) can
2808 * match a target string of length two (the byte length of UTF-8
2809 * encoded U+0390 or U+03B0). This would rather mess up the
2810 * minimum length computation. (there are other code points that
2811 * also fold to these two sequences, but the delta is smaller)
2813 * If these sequences are found, the minimum length is decreased by
2814 * four (six minus two).
2816 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2817 * LETTER SHARP S. We decrease the min length by 1 for each
2818 * occurrence of 'ss' found */
2820 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2821 # define U390_first_byte 0xb4
2822 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2823 # define U3B0_first_byte 0xb5
2824 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2826 # define U390_first_byte 0xce
2827 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2828 # define U3B0_first_byte 0xcf
2829 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2831 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2832 yields a net of 0 */
2833 /* Examine the string for one of the problematic sequences */
2835 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2836 * sequence we are looking for is 2 */
2840 /* Look for the first byte in each problematic sequence */
2842 /* We don't have to worry about other things that fold to
2843 * 's' (such as the long s, U+017F), as all above-latin1
2844 * code points have been pre-folded */
2848 /* Current character is an 's' or 'S'. If next one is
2849 * as well, we have the dreaded sequence */
2850 if (((*(s+1) & S_or_s_mask) == s_masked)
2851 /* These two node types don't have special handling
2853 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2856 OP(scan) = EXACTFU_SS;
2857 s++; /* No need to look at this character again */
2861 case U390_first_byte:
2862 if (s_end - s >= len
2864 /* The 1's are because are skipping comparing the
2866 && memEQ(s + 1, U390_tail, len - 1))
2868 goto greek_sequence;
2872 case U3B0_first_byte:
2873 if (! (s_end - s >= len
2874 && memEQ(s + 1, U3B0_tail, len - 1)))
2881 /* This can't currently be handled by trie's, so change
2882 * the node type to indicate this. If EXACTFA and
2883 * EXACTFL were ever to be handled by trie's, this
2884 * would have to be changed. If this node has already
2885 * been changed to EXACTFU_SS in this loop, leave it as
2886 * is. (I (khw) think it doesn't matter in regexec.c
2887 * for UTF patterns, but no need to change it */
2888 if (OP(scan) == EXACTFU) {
2889 OP(scan) = EXACTFU_TRICKYFOLD;
2891 s += 6; /* We already know what this sequence is. Skip
2897 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2899 /* Here, the pattern is not UTF-8. We need to look only for the
2900 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2901 * in the final position. Otherwise we can stop looking 1 byte
2902 * earlier because have to find both the first and second 's' */
2903 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2905 for (s = s0; s < upper; s++) {
2910 && ((*(s+1) & S_or_s_mask) == s_masked))
2914 /* EXACTF nodes need to know that the minimum
2915 * length changed so that a sharp s in the string
2916 * can match this ss in the pattern, but they
2917 * remain EXACTF nodes, as they are not trie'able,
2918 * so don't have to invent a new node type to
2919 * exclude them from the trie code */
2920 if (OP(scan) != EXACTF) {
2921 OP(scan) = EXACTFU_SS;
2926 case LATIN_SMALL_LETTER_SHARP_S:
2927 if (OP(scan) == EXACTF) {
2928 *has_exactf_sharp_s = TRUE;
2937 /* Allow dumping but overwriting the collection of skipped
2938 * ops and/or strings with fake optimized ops */
2939 n = scan + NODE_SZ_STR(scan);
2947 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2951 /* REx optimizer. Converts nodes into quicker variants "in place".
2952 Finds fixed substrings. */
2954 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2955 to the position after last scanned or to NULL. */
2957 #define INIT_AND_WITHP \
2958 assert(!and_withp); \
2959 Newx(and_withp,1,struct regnode_charclass_class); \
2960 SAVEFREEPV(and_withp)
2962 /* this is a chain of data about sub patterns we are processing that
2963 need to be handled separately/specially in study_chunk. Its so
2964 we can simulate recursion without losing state. */
2966 typedef struct scan_frame {
2967 regnode *last; /* last node to process in this frame */
2968 regnode *next; /* next node to process when last is reached */
2969 struct scan_frame *prev; /*previous frame*/
2970 I32 stop; /* what stopparen do we use */
2974 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2976 #define CASE_SYNST_FNC(nAmE) \
2978 if (flags & SCF_DO_STCLASS_AND) { \
2979 for (value = 0; value < 256; value++) \
2980 if (!is_ ## nAmE ## _cp(value)) \
2981 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2984 for (value = 0; value < 256; value++) \
2985 if (is_ ## nAmE ## _cp(value)) \
2986 ANYOF_BITMAP_SET(data->start_class, value); \
2990 if (flags & SCF_DO_STCLASS_AND) { \
2991 for (value = 0; value < 256; value++) \
2992 if (is_ ## nAmE ## _cp(value)) \
2993 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2996 for (value = 0; value < 256; value++) \
2997 if (!is_ ## nAmE ## _cp(value)) \
2998 ANYOF_BITMAP_SET(data->start_class, value); \
3005 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3006 I32 *minlenp, I32 *deltap,
3011 struct regnode_charclass_class *and_withp,
3012 U32 flags, U32 depth)
3013 /* scanp: Start here (read-write). */
3014 /* deltap: Write maxlen-minlen here. */
3015 /* last: Stop before this one. */
3016 /* data: string data about the pattern */
3017 /* stopparen: treat close N as END */
3018 /* recursed: which subroutines have we recursed into */
3019 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3022 I32 min = 0, pars = 0, code;
3023 regnode *scan = *scanp, *next;
3025 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3026 int is_inf_internal = 0; /* The studied chunk is infinite */
3027 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3028 scan_data_t data_fake;
3029 SV *re_trie_maxbuff = NULL;
3030 regnode *first_non_open = scan;
3031 I32 stopmin = I32_MAX;
3032 scan_frame *frame = NULL;
3033 GET_RE_DEBUG_FLAGS_DECL;
3035 PERL_ARGS_ASSERT_STUDY_CHUNK;
3038 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3042 while (first_non_open && OP(first_non_open) == OPEN)
3043 first_non_open=regnext(first_non_open);
3048 while ( scan && OP(scan) != END && scan < last ){
3049 UV min_subtract = 0; /* How much to subtract from the minimum node
3050 length to get a real minimum (because the
3051 folded version may be shorter) */
3052 bool has_exactf_sharp_s = FALSE;
3053 /* Peephole optimizer: */
3054 DEBUG_STUDYDATA("Peep:", data,depth);
3055 DEBUG_PEEP("Peep",scan,depth);
3057 /* Its not clear to khw or hv why this is done here, and not in the
3058 * clauses that deal with EXACT nodes. khw's guess is that it's
3059 * because of a previous design */
3060 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3062 /* Follow the next-chain of the current node and optimize
3063 away all the NOTHINGs from it. */
3064 if (OP(scan) != CURLYX) {
3065 const int max = (reg_off_by_arg[OP(scan)]
3067 /* I32 may be smaller than U16 on CRAYs! */
3068 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3069 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3073 /* Skip NOTHING and LONGJMP. */
3074 while ((n = regnext(n))
3075 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3076 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3077 && off + noff < max)
3079 if (reg_off_by_arg[OP(scan)])
3082 NEXT_OFF(scan) = off;
3087 /* The principal pseudo-switch. Cannot be a switch, since we
3088 look into several different things. */
3089 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3090 || OP(scan) == IFTHEN) {
3091 next = regnext(scan);
3093 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3095 if (OP(next) == code || code == IFTHEN) {
3096 /* NOTE - There is similar code to this block below for handling
3097 TRIE nodes on a re-study. If you change stuff here check there
3099 I32 max1 = 0, min1 = I32_MAX, num = 0;
3100 struct regnode_charclass_class accum;
3101 regnode * const startbranch=scan;
3103 if (flags & SCF_DO_SUBSTR)
3104 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3105 if (flags & SCF_DO_STCLASS)
3106 cl_init_zero(pRExC_state, &accum);
3108 while (OP(scan) == code) {
3109 I32 deltanext, minnext, f = 0, fake;
3110 struct regnode_charclass_class this_class;
3113 data_fake.flags = 0;
3115 data_fake.whilem_c = data->whilem_c;
3116 data_fake.last_closep = data->last_closep;
3119 data_fake.last_closep = &fake;
3121 data_fake.pos_delta = delta;
3122 next = regnext(scan);
3123 scan = NEXTOPER(scan);
3125 scan = NEXTOPER(scan);
3126 if (flags & SCF_DO_STCLASS) {
3127 cl_init(pRExC_state, &this_class);
3128 data_fake.start_class = &this_class;
3129 f = SCF_DO_STCLASS_AND;
3131 if (flags & SCF_WHILEM_VISITED_POS)
3132 f |= SCF_WHILEM_VISITED_POS;
3134 /* we suppose the run is continuous, last=next...*/
3135 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3137 stopparen, recursed, NULL, f,depth+1);
3140 if (max1 < minnext + deltanext)
3141 max1 = minnext + deltanext;
3142 if (deltanext == I32_MAX)
3143 is_inf = is_inf_internal = 1;
3145 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3147 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3148 if ( stopmin > minnext)
3149 stopmin = min + min1;
3150 flags &= ~SCF_DO_SUBSTR;
3152 data->flags |= SCF_SEEN_ACCEPT;
3155 if (data_fake.flags & SF_HAS_EVAL)
3156 data->flags |= SF_HAS_EVAL;
3157 data->whilem_c = data_fake.whilem_c;
3159 if (flags & SCF_DO_STCLASS)
3160 cl_or(pRExC_state, &accum, &this_class);
3162 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3164 if (flags & SCF_DO_SUBSTR) {
3165 data->pos_min += min1;
3166 data->pos_delta += max1 - min1;
3167 if (max1 != min1 || is_inf)
3168 data->longest = &(data->longest_float);
3171 delta += max1 - min1;
3172 if (flags & SCF_DO_STCLASS_OR) {
3173 cl_or(pRExC_state, data->start_class, &accum);
3175 cl_and(data->start_class, and_withp);
3176 flags &= ~SCF_DO_STCLASS;
3179 else if (flags & SCF_DO_STCLASS_AND) {
3181 cl_and(data->start_class, &accum);
3182 flags &= ~SCF_DO_STCLASS;
3185 /* Switch to OR mode: cache the old value of
3186 * data->start_class */
3188 StructCopy(data->start_class, and_withp,
3189 struct regnode_charclass_class);
3190 flags &= ~SCF_DO_STCLASS_AND;
3191 StructCopy(&accum, data->start_class,
3192 struct regnode_charclass_class);
3193 flags |= SCF_DO_STCLASS_OR;
3194 data->start_class->flags |= ANYOF_EOS;
3198 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3201 Assuming this was/is a branch we are dealing with: 'scan' now
3202 points at the item that follows the branch sequence, whatever
3203 it is. We now start at the beginning of the sequence and look
3210 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3212 If we can find such a subsequence we need to turn the first
3213 element into a trie and then add the subsequent branch exact
3214 strings to the trie.
3218 1. patterns where the whole set of branches can be converted.
3220 2. patterns where only a subset can be converted.
3222 In case 1 we can replace the whole set with a single regop
3223 for the trie. In case 2 we need to keep the start and end
3226 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3227 becomes BRANCH TRIE; BRANCH X;
3229 There is an additional case, that being where there is a
3230 common prefix, which gets split out into an EXACT like node
3231 preceding the TRIE node.
3233 If x(1..n)==tail then we can do a simple trie, if not we make
3234 a "jump" trie, such that when we match the appropriate word
3235 we "jump" to the appropriate tail node. Essentially we turn
3236 a nested if into a case structure of sorts.
3241 if (!re_trie_maxbuff) {
3242 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3243 if (!SvIOK(re_trie_maxbuff))
3244 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3246 if ( SvIV(re_trie_maxbuff)>=0 ) {
3248 regnode *first = (regnode *)NULL;
3249 regnode *last = (regnode *)NULL;
3250 regnode *tail = scan;
3255 SV * const mysv = sv_newmortal(); /* for dumping */
3257 /* var tail is used because there may be a TAIL
3258 regop in the way. Ie, the exacts will point to the
3259 thing following the TAIL, but the last branch will
3260 point at the TAIL. So we advance tail. If we
3261 have nested (?:) we may have to move through several
3265 while ( OP( tail ) == TAIL ) {
3266 /* this is the TAIL generated by (?:) */
3267 tail = regnext( tail );
3271 DEBUG_TRIE_COMPILE_r({
3272 regprop(RExC_rx, mysv, tail );
3273 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3274 (int)depth * 2 + 2, "",
3275 "Looking for TRIE'able sequences. Tail node is: ",
3276 SvPV_nolen_const( mysv )
3282 Step through the branches
3283 cur represents each branch,
3284 noper is the first thing to be matched as part of that branch
3285 noper_next is the regnext() of that node.
3287 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3288 via a "jump trie" but we also support building with NOJUMPTRIE,
3289 which restricts the trie logic to structures like /FOO|BAR/.
3291 If noper is a trieable nodetype then the branch is a possible optimization
3292 target. If we are building under NOJUMPTRIE then we require that noper_next
3293 is the same as scan (our current position in the regex program).
3295 Once we have two or more consecutive such branches we can create a
3296 trie of the EXACT's contents and stitch it in place into the program.
3298 If the sequence represents all of the branches in the alternation we
3299 replace the entire thing with a single TRIE node.
3301 Otherwise when it is a subsequence we need to stitch it in place and
3302 replace only the relevant branches. This means the first branch has
3303 to remain as it is used by the alternation logic, and its next pointer,
3304 and needs to be repointed at the item on the branch chain following
3305 the last branch we have optimized away.
3307 This could be either a BRANCH, in which case the subsequence is internal,
3308 or it could be the item following the branch sequence in which case the
3309 subsequence is at the end (which does not necessarily mean the first node
3310 is the start of the alternation).
3312 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3315 ----------------+-----------
3319 EXACTFU_SS | EXACTFU
3320 EXACTFU_TRICKYFOLD | EXACTFU
3325 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3326 ( EXACT == (X) ) ? EXACT : \
3327 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3330 /* dont use tail as the end marker for this traverse */
3331 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3332 regnode * const noper = NEXTOPER( cur );
3333 U8 noper_type = OP( noper );
3334 U8 noper_trietype = TRIE_TYPE( noper_type );
3335 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3336 regnode * const noper_next = regnext( noper );
3337 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3338 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3341 DEBUG_TRIE_COMPILE_r({
3342 regprop(RExC_rx, mysv, cur);
3343 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3344 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3346 regprop(RExC_rx, mysv, noper);
3347 PerlIO_printf( Perl_debug_log, " -> %s",
3348 SvPV_nolen_const(mysv));
3351 regprop(RExC_rx, mysv, noper_next );
3352 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3353 SvPV_nolen_const(mysv));
3355 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3356 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3357 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3361 /* Is noper a trieable nodetype that can be merged with the
3362 * current trie (if there is one)? */
3366 ( noper_trietype == NOTHING)
3367 || ( trietype == NOTHING )
3368 || ( trietype == noper_trietype )
3371 && noper_next == tail
3375 /* Handle mergable triable node
3376 * Either we are the first node in a new trieable sequence,
3377 * in which case we do some bookkeeping, otherwise we update
3378 * the end pointer. */
3381 trietype = noper_trietype;
3382 if ( noper_trietype == NOTHING ) {
3383 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3384 regnode * const noper_next = regnext( noper );
3385 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3386 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3389 if ( noper_next_trietype )
3390 trietype = noper_next_trietype;
3393 if ( trietype == NOTHING )
3394 trietype = noper_trietype;
3399 } /* end handle mergable triable node */
3401 /* handle unmergable node -
3402 * noper may either be a triable node which can not be tried
3403 * together with the current trie, or a non triable node */
3405 /* If last is set and trietype is not NOTHING then we have found
3406 * at least two triable branch sequences in a row of a similar
3407 * trietype so we can turn them into a trie. If/when we
3408 * allow NOTHING to start a trie sequence this condition will be
3409 * required, and it isn't expensive so we leave it in for now. */
3410 if ( trietype != NOTHING )
3411 make_trie( pRExC_state,
3412 startbranch, first, cur, tail, count,
3413 trietype, depth+1 );
3414 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3418 && noper_next == tail
3421 /* noper is triable, so we can start a new trie sequence */
3424 trietype = noper_trietype;
3426 /* if we already saw a first but the current node is not triable then we have
3427 * to reset the first information. */
3432 } /* end handle unmergable node */
3433 } /* loop over branches */
3434 DEBUG_TRIE_COMPILE_r({
3435 regprop(RExC_rx, mysv, cur);
3436 PerlIO_printf( Perl_debug_log,
3437 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3438 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3442 if ( trietype != NOTHING ) {
3443 /* the last branch of the sequence was part of a trie,
3444 * so we have to construct it here outside of the loop
3446 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3447 #ifdef TRIE_STUDY_OPT
3448 if ( ((made == MADE_EXACT_TRIE &&
3449 startbranch == first)
3450 || ( first_non_open == first )) &&
3452 flags |= SCF_TRIE_RESTUDY;
3453 if ( startbranch == first
3456 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3461 /* at this point we know whatever we have is a NOTHING sequence/branch
3462 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3464 if ( startbranch == first ) {
3466 /* the entire thing is a NOTHING sequence, something like this:
3467 * (?:|) So we can turn it into a plain NOTHING op. */
3468 DEBUG_TRIE_COMPILE_r({
3469 regprop(RExC_rx, mysv, cur);
3470 PerlIO_printf( Perl_debug_log,
3471 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3472 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3475 OP(startbranch)= NOTHING;
3476 NEXT_OFF(startbranch)= tail - startbranch;
3477 for ( opt= startbranch + 1; opt < tail ; opt++ )
3481 } /* end if ( last) */
3482 } /* TRIE_MAXBUF is non zero */
3487 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3488 scan = NEXTOPER(NEXTOPER(scan));
3489 } else /* single branch is optimized. */
3490 scan = NEXTOPER(scan);
3492 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3493 scan_frame *newframe = NULL;
3498 if (OP(scan) != SUSPEND) {
3499 /* set the pointer */
3500 if (OP(scan) == GOSUB) {
3502 RExC_recurse[ARG2L(scan)] = scan;
3503 start = RExC_open_parens[paren-1];
3504 end = RExC_close_parens[paren-1];
3507 start = RExC_rxi->program + 1;
3511 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3512 SAVEFREEPV(recursed);
3514 if (!PAREN_TEST(recursed,paren+1)) {
3515 PAREN_SET(recursed,paren+1);
3516 Newx(newframe,1,scan_frame);
3518 if (flags & SCF_DO_SUBSTR) {
3519 SCAN_COMMIT(pRExC_state,data,minlenp);
3520 data->longest = &(data->longest_float);
3522 is_inf = is_inf_internal = 1;
3523 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3524 cl_anything(pRExC_state, data->start_class);
3525 flags &= ~SCF_DO_STCLASS;
3528 Newx(newframe,1,scan_frame);
3531 end = regnext(scan);
3536 SAVEFREEPV(newframe);
3537 newframe->next = regnext(scan);
3538 newframe->last = last;
3539 newframe->stop = stopparen;
3540 newframe->prev = frame;
3550 else if (OP(scan) == EXACT) {
3551 I32 l = STR_LEN(scan);
3554 const U8 * const s = (U8*)STRING(scan);
3555 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3556 l = utf8_length(s, s + l);
3558 uc = *((U8*)STRING(scan));
3561 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3562 /* The code below prefers earlier match for fixed
3563 offset, later match for variable offset. */
3564 if (data->last_end == -1) { /* Update the start info. */
3565 data->last_start_min = data->pos_min;
3566 data->last_start_max = is_inf
3567 ? I32_MAX : data->pos_min + data->pos_delta;
3569 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3571 SvUTF8_on(data->last_found);
3573 SV * const sv = data->last_found;
3574 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3575 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3576 if (mg && mg->mg_len >= 0)
3577 mg->mg_len += utf8_length((U8*)STRING(scan),
3578 (U8*)STRING(scan)+STR_LEN(scan));
3580 data->last_end = data->pos_min + l;
3581 data->pos_min += l; /* As in the first entry. */
3582 data->flags &= ~SF_BEFORE_EOL;
3584 if (flags & SCF_DO_STCLASS_AND) {
3585 /* Check whether it is compatible with what we know already! */
3589 /* If compatible, we or it in below. It is compatible if is
3590 * in the bitmp and either 1) its bit or its fold is set, or 2)
3591 * it's for a locale. Even if there isn't unicode semantics
3592 * here, at runtime there may be because of matching against a
3593 * utf8 string, so accept a possible false positive for
3594 * latin1-range folds */
3596 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3597 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3598 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3599 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3604 ANYOF_CLASS_ZERO(data->start_class);
3605 ANYOF_BITMAP_ZERO(data->start_class);
3607 ANYOF_BITMAP_SET(data->start_class, uc);
3608 else if (uc >= 0x100) {
3611 /* Some Unicode code points fold to the Latin1 range; as
3612 * XXX temporary code, instead of figuring out if this is
3613 * one, just assume it is and set all the start class bits
3614 * that could be some such above 255 code point's fold
3615 * which will generate fals positives. As the code
3616 * elsewhere that does compute the fold settles down, it
3617 * can be extracted out and re-used here */
3618 for (i = 0; i < 256; i++){
3619 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3620 ANYOF_BITMAP_SET(data->start_class, i);
3624 data->start_class->flags &= ~ANYOF_EOS;
3626 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3628 else if (flags & SCF_DO_STCLASS_OR) {
3629 /* false positive possible if the class is case-folded */
3631 ANYOF_BITMAP_SET(data->start_class, uc);
3633 data->start_class->flags |= ANYOF_UNICODE_ALL;
3634 data->start_class->flags &= ~ANYOF_EOS;
3635 cl_and(data->start_class, and_withp);
3637 flags &= ~SCF_DO_STCLASS;
3639 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3640 I32 l = STR_LEN(scan);
3641 UV uc = *((U8*)STRING(scan));
3643 /* Search for fixed substrings supports EXACT only. */
3644 if (flags & SCF_DO_SUBSTR) {
3646 SCAN_COMMIT(pRExC_state, data, minlenp);
3649 const U8 * const s = (U8 *)STRING(scan);
3650 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3651 l = utf8_length(s, s + l);
3653 else if (has_exactf_sharp_s) {
3654 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3656 min += l - min_subtract;
3660 delta += min_subtract;
3661 if (flags & SCF_DO_SUBSTR) {
3662 data->pos_min += l - min_subtract;
3663 if (data->pos_min < 0) {
3666 data->pos_delta += min_subtract;
3668 data->longest = &(data->longest_float);
3671 if (flags & SCF_DO_STCLASS_AND) {
3672 /* Check whether it is compatible with what we know already! */
3675 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3676 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3677 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3681 ANYOF_CLASS_ZERO(data->start_class);
3682 ANYOF_BITMAP_ZERO(data->start_class);
3684 ANYOF_BITMAP_SET(data->start_class, uc);
3685 data->start_class->flags &= ~ANYOF_EOS;
3686 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3687 if (OP(scan) == EXACTFL) {
3688 /* XXX This set is probably no longer necessary, and
3689 * probably wrong as LOCALE now is on in the initial
3691 data->start_class->flags |= ANYOF_LOCALE;
3695 /* Also set the other member of the fold pair. In case
3696 * that unicode semantics is called for at runtime, use
3697 * the full latin1 fold. (Can't do this for locale,
3698 * because not known until runtime) */
3699 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3701 /* All other (EXACTFL handled above) folds except under
3702 * /iaa that include s, S, and sharp_s also may include
3704 if (OP(scan) != EXACTFA) {
3705 if (uc == 's' || uc == 'S') {
3706 ANYOF_BITMAP_SET(data->start_class,
3707 LATIN_SMALL_LETTER_SHARP_S);
3709 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3710 ANYOF_BITMAP_SET(data->start_class, 's');
3711 ANYOF_BITMAP_SET(data->start_class, 'S');
3716 else if (uc >= 0x100) {
3718 for (i = 0; i < 256; i++){
3719 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3720 ANYOF_BITMAP_SET(data->start_class, i);
3725 else if (flags & SCF_DO_STCLASS_OR) {
3726 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3727 /* false positive possible if the class is case-folded.
3728 Assume that the locale settings are the same... */
3730 ANYOF_BITMAP_SET(data->start_class, uc);
3731 if (OP(scan) != EXACTFL) {
3733 /* And set the other member of the fold pair, but
3734 * can't do that in locale because not known until
3736 ANYOF_BITMAP_SET(data->start_class,
3737 PL_fold_latin1[uc]);
3739 /* All folds except under /iaa that include s, S,
3740 * and sharp_s also may include the others */
3741 if (OP(scan) != EXACTFA) {
3742 if (uc == 's' || uc == 'S') {
3743 ANYOF_BITMAP_SET(data->start_class,
3744 LATIN_SMALL_LETTER_SHARP_S);
3746 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3747 ANYOF_BITMAP_SET(data->start_class, 's');
3748 ANYOF_BITMAP_SET(data->start_class, 'S');
3753 data->start_class->flags &= ~ANYOF_EOS;
3755 cl_and(data->start_class, and_withp);
3757 flags &= ~SCF_DO_STCLASS;
3759 else if (REGNODE_VARIES(OP(scan))) {
3760 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3761 I32 f = flags, pos_before = 0;
3762 regnode * const oscan = scan;
3763 struct regnode_charclass_class this_class;
3764 struct regnode_charclass_class *oclass = NULL;
3765 I32 next_is_eval = 0;
3767 switch (PL_regkind[OP(scan)]) {
3768 case WHILEM: /* End of (?:...)* . */
3769 scan = NEXTOPER(scan);
3772 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3773 next = NEXTOPER(scan);
3774 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3776 maxcount = REG_INFTY;
3777 next = regnext(scan);
3778 scan = NEXTOPER(scan);
3782 if (flags & SCF_DO_SUBSTR)
3787 if (flags & SCF_DO_STCLASS) {
3789 maxcount = REG_INFTY;
3790 next = regnext(scan);
3791 scan = NEXTOPER(scan);
3794 is_inf = is_inf_internal = 1;
3795 scan = regnext(scan);
3796 if (flags & SCF_DO_SUBSTR) {
3797 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3798 data->longest = &(data->longest_float);
3800 goto optimize_curly_tail;
3802 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3803 && (scan->flags == stopparen))
3808 mincount = ARG1(scan);
3809 maxcount = ARG2(scan);
3811 next = regnext(scan);
3812 if (OP(scan) == CURLYX) {
3813 I32 lp = (data ? *(data->last_closep) : 0);
3814 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3816 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3817 next_is_eval = (OP(scan) == EVAL);
3819 if (flags & SCF_DO_SUBSTR) {
3820 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3821 pos_before = data->pos_min;
3825 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3827 data->flags |= SF_IS_INF;
3829 if (flags & SCF_DO_STCLASS) {
3830 cl_init(pRExC_state, &this_class);
3831 oclass = data->start_class;
3832 data->start_class = &this_class;
3833 f |= SCF_DO_STCLASS_AND;
3834 f &= ~SCF_DO_STCLASS_OR;
3836 /* Exclude from super-linear cache processing any {n,m}
3837 regops for which the combination of input pos and regex
3838 pos is not enough information to determine if a match
3841 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3842 regex pos at the \s*, the prospects for a match depend not
3843 only on the input position but also on how many (bar\s*)
3844 repeats into the {4,8} we are. */
3845 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3846 f &= ~SCF_WHILEM_VISITED_POS;
3848 /* This will finish on WHILEM, setting scan, or on NULL: */
3849 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3850 last, data, stopparen, recursed, NULL,
3852 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3854 if (flags & SCF_DO_STCLASS)
3855 data->start_class = oclass;
3856 if (mincount == 0 || minnext == 0) {
3857 if (flags & SCF_DO_STCLASS_OR) {
3858 cl_or(pRExC_state, data->start_class, &this_class);
3860 else if (flags & SCF_DO_STCLASS_AND) {
3861 /* Switch to OR mode: cache the old value of
3862 * data->start_class */
3864 StructCopy(data->start_class, and_withp,
3865 struct regnode_charclass_class);
3866 flags &= ~SCF_DO_STCLASS_AND;
3867 StructCopy(&this_class, data->start_class,
3868 struct regnode_charclass_class);
3869 flags |= SCF_DO_STCLASS_OR;
3870 data->start_class->flags |= ANYOF_EOS;
3872 } else { /* Non-zero len */
3873 if (flags & SCF_DO_STCLASS_OR) {
3874 cl_or(pRExC_state, data->start_class, &this_class);
3875 cl_and(data->start_class, and_withp);
3877 else if (flags & SCF_DO_STCLASS_AND)
3878 cl_and(data->start_class, &this_class);
3879 flags &= ~SCF_DO_STCLASS;
3881 if (!scan) /* It was not CURLYX, but CURLY. */
3883 if ( /* ? quantifier ok, except for (?{ ... }) */
3884 (next_is_eval || !(mincount == 0 && maxcount == 1))
3885 && (minnext == 0) && (deltanext == 0)
3886 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3887 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3889 ckWARNreg(RExC_parse,
3890 "Quantifier unexpected on zero-length expression");
3893 min += minnext * mincount;
3894 is_inf_internal |= ((maxcount == REG_INFTY
3895 && (minnext + deltanext) > 0)
3896 || deltanext == I32_MAX);
3897 is_inf |= is_inf_internal;
3898 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3900 /* Try powerful optimization CURLYX => CURLYN. */
3901 if ( OP(oscan) == CURLYX && data
3902 && data->flags & SF_IN_PAR
3903 && !(data->flags & SF_HAS_EVAL)
3904 && !deltanext && minnext == 1 ) {
3905 /* Try to optimize to CURLYN. */
3906 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3907 regnode * const nxt1 = nxt;
3914 if (!REGNODE_SIMPLE(OP(nxt))
3915 && !(PL_regkind[OP(nxt)] == EXACT
3916 && STR_LEN(nxt) == 1))
3922 if (OP(nxt) != CLOSE)
3924 if (RExC_open_parens) {
3925 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3926 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3928 /* Now we know that nxt2 is the only contents: */
3929 oscan->flags = (U8)ARG(nxt);
3931 OP(nxt1) = NOTHING; /* was OPEN. */
3934 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3935 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3936 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3937 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3938 OP(nxt + 1) = OPTIMIZED; /* was count. */
3939 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3944 /* Try optimization CURLYX => CURLYM. */
3945 if ( OP(oscan) == CURLYX && data
3946 && !(data->flags & SF_HAS_PAR)
3947 && !(data->flags & SF_HAS_EVAL)
3948 && !deltanext /* atom is fixed width */
3949 && minnext != 0 /* CURLYM can't handle zero width */
3951 /* XXXX How to optimize if data == 0? */
3952 /* Optimize to a simpler form. */
3953 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3957 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3958 && (OP(nxt2) != WHILEM))
3960 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3961 /* Need to optimize away parenths. */
3962 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3963 /* Set the parenth number. */
3964 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3966 oscan->flags = (U8)ARG(nxt);
3967 if (RExC_open_parens) {
3968 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3969 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3971 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3972 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3975 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3976 OP(nxt + 1) = OPTIMIZED; /* was count. */
3977 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3978 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3981 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3982 regnode *nnxt = regnext(nxt1);
3984 if (reg_off_by_arg[OP(nxt1)])
3985 ARG_SET(nxt1, nxt2 - nxt1);
3986 else if (nxt2 - nxt1 < U16_MAX)
3987 NEXT_OFF(nxt1) = nxt2 - nxt1;
3989 OP(nxt) = NOTHING; /* Cannot beautify */
3994 /* Optimize again: */
3995 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3996 NULL, stopparen, recursed, NULL, 0,depth+1);
4001 else if ((OP(oscan) == CURLYX)
4002 && (flags & SCF_WHILEM_VISITED_POS)
4003 /* See the comment on a similar expression above.
4004 However, this time it's not a subexpression
4005 we care about, but the expression itself. */
4006 && (maxcount == REG_INFTY)
4007 && data && ++data->whilem_c < 16) {
4008 /* This stays as CURLYX, we can put the count/of pair. */
4009 /* Find WHILEM (as in regexec.c) */
4010 regnode *nxt = oscan + NEXT_OFF(oscan);
4012 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4014 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4015 | (RExC_whilem_seen << 4)); /* On WHILEM */
4017 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4019 if (flags & SCF_DO_SUBSTR) {
4020 SV *last_str = NULL;
4021 int counted = mincount != 0;
4023 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4024 #if defined(SPARC64_GCC_WORKAROUND)
4027 const char *s = NULL;
4030 if (pos_before >= data->last_start_min)
4033 b = data->last_start_min;
4036 s = SvPV_const(data->last_found, l);
4037 old = b - data->last_start_min;
4040 I32 b = pos_before >= data->last_start_min
4041 ? pos_before : data->last_start_min;
4043 const char * const s = SvPV_const(data->last_found, l);
4044 I32 old = b - data->last_start_min;
4048 old = utf8_hop((U8*)s, old) - (U8*)s;
4050 /* Get the added string: */
4051 last_str = newSVpvn_utf8(s + old, l, UTF);
4052 if (deltanext == 0 && pos_before == b) {
4053 /* What was added is a constant string */
4055 SvGROW(last_str, (mincount * l) + 1);
4056 repeatcpy(SvPVX(last_str) + l,
4057 SvPVX_const(last_str), l, mincount - 1);
4058 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4059 /* Add additional parts. */
4060 SvCUR_set(data->last_found,
4061 SvCUR(data->last_found) - l);
4062 sv_catsv(data->last_found, last_str);
4064 SV * sv = data->last_found;
4066 SvUTF8(sv) && SvMAGICAL(sv) ?
4067 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4068 if (mg && mg->mg_len >= 0)
4069 mg->mg_len += CHR_SVLEN(last_str) - l;
4071 data->last_end += l * (mincount - 1);
4074 /* start offset must point into the last copy */
4075 data->last_start_min += minnext * (mincount - 1);
4076 data->last_start_max += is_inf ? I32_MAX
4077 : (maxcount - 1) * (minnext + data->pos_delta);
4080 /* It is counted once already... */
4081 data->pos_min += minnext * (mincount - counted);
4082 data->pos_delta += - counted * deltanext +
4083 (minnext + deltanext) * maxcount - minnext * mincount;
4084 if (mincount != maxcount) {
4085 /* Cannot extend fixed substrings found inside
4087 SCAN_COMMIT(pRExC_state,data,minlenp);
4088 if (mincount && last_str) {
4089 SV * const sv = data->last_found;
4090 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4091 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4095 sv_setsv(sv, last_str);
4096 data->last_end = data->pos_min;
4097 data->last_start_min =
4098 data->pos_min - CHR_SVLEN(last_str);
4099 data->last_start_max = is_inf
4101 : data->pos_min + data->pos_delta
4102 - CHR_SVLEN(last_str);
4104 data->longest = &(data->longest_float);
4106 SvREFCNT_dec(last_str);
4108 if (data && (fl & SF_HAS_EVAL))
4109 data->flags |= SF_HAS_EVAL;
4110 optimize_curly_tail:
4111 if (OP(oscan) != CURLYX) {
4112 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4114 NEXT_OFF(oscan) += NEXT_OFF(next);
4117 default: /* REF, ANYOFV, and CLUMP only? */
4118 if (flags & SCF_DO_SUBSTR) {
4119 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4120 data->longest = &(data->longest_float);
4122 is_inf = is_inf_internal = 1;
4123 if (flags & SCF_DO_STCLASS_OR)
4124 cl_anything(pRExC_state, data->start_class);
4125 flags &= ~SCF_DO_STCLASS;
4129 else if (OP(scan) == LNBREAK) {
4130 if (flags & SCF_DO_STCLASS) {
4132 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4133 if (flags & SCF_DO_STCLASS_AND) {
4134 for (value = 0; value < 256; value++)
4135 if (!is_VERTWS_cp(value))
4136 ANYOF_BITMAP_CLEAR(data->start_class, value);
4139 for (value = 0; value < 256; value++)
4140 if (is_VERTWS_cp(value))
4141 ANYOF_BITMAP_SET(data->start_class, value);
4143 if (flags & SCF_DO_STCLASS_OR)
4144 cl_and(data->start_class, and_withp);
4145 flags &= ~SCF_DO_STCLASS;
4149 if (flags & SCF_DO_SUBSTR) {
4150 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4152 data->pos_delta += 1;
4153 data->longest = &(data->longest_float);
4156 else if (REGNODE_SIMPLE(OP(scan))) {
4159 if (flags & SCF_DO_SUBSTR) {
4160 SCAN_COMMIT(pRExC_state,data,minlenp);
4164 if (flags & SCF_DO_STCLASS) {
4165 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4167 /* Some of the logic below assumes that switching
4168 locale on will only add false positives. */
4169 switch (PL_regkind[OP(scan)]) {
4173 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4174 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4175 cl_anything(pRExC_state, data->start_class);
4178 if (OP(scan) == SANY)
4180 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4181 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4182 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4183 cl_anything(pRExC_state, data->start_class);
4185 if (flags & SCF_DO_STCLASS_AND || !value)
4186 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4189 if (flags & SCF_DO_STCLASS_AND)
4190 cl_and(data->start_class,
4191 (struct regnode_charclass_class*)scan);
4193 cl_or(pRExC_state, data->start_class,
4194 (struct regnode_charclass_class*)scan);
4197 if (flags & SCF_DO_STCLASS_AND) {
4198 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4199 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4200 if (OP(scan) == ALNUMU) {
4201 for (value = 0; value < 256; value++) {
4202 if (!isWORDCHAR_L1(value)) {
4203 ANYOF_BITMAP_CLEAR(data->start_class, value);
4207 for (value = 0; value < 256; value++) {
4208 if (!isALNUM(value)) {
4209 ANYOF_BITMAP_CLEAR(data->start_class, value);
4216 if (data->start_class->flags & ANYOF_LOCALE)
4217 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4219 /* Even if under locale, set the bits for non-locale
4220 * in case it isn't a true locale-node. This will
4221 * create false positives if it truly is locale */
4222 if (OP(scan) == ALNUMU) {
4223 for (value = 0; value < 256; value++) {
4224 if (isWORDCHAR_L1(value)) {
4225 ANYOF_BITMAP_SET(data->start_class, value);
4229 for (value = 0; value < 256; value++) {
4230 if (isALNUM(value)) {
4231 ANYOF_BITMAP_SET(data->start_class, value);
4238 if (flags & SCF_DO_STCLASS_AND) {
4239 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4240 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4241 if (OP(scan) == NALNUMU) {
4242 for (value = 0; value < 256; value++) {
4243 if (isWORDCHAR_L1(value)) {
4244 ANYOF_BITMAP_CLEAR(data->start_class, value);
4248 for (value = 0; value < 256; value++) {
4249 if (isALNUM(value)) {
4250 ANYOF_BITMAP_CLEAR(data->start_class, value);
4257 if (data->start_class->flags & ANYOF_LOCALE)
4258 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4260 /* Even if under locale, set the bits for non-locale in
4261 * case it isn't a true locale-node. This will create
4262 * false positives if it truly is locale */
4263 if (OP(scan) == NALNUMU) {
4264 for (value = 0; value < 256; value++) {
4265 if (! isWORDCHAR_L1(value)) {
4266 ANYOF_BITMAP_SET(data->start_class, value);
4270 for (value = 0; value < 256; value++) {
4271 if (! isALNUM(value)) {
4272 ANYOF_BITMAP_SET(data->start_class, value);
4279 if (flags & SCF_DO_STCLASS_AND) {
4280 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4281 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4282 if (OP(scan) == SPACEU) {
4283 for (value = 0; value < 256; value++) {
4284 if (!isSPACE_L1(value)) {
4285 ANYOF_BITMAP_CLEAR(data->start_class, value);
4289 for (value = 0; value < 256; value++) {
4290 if (!isSPACE(value)) {
4291 ANYOF_BITMAP_CLEAR(data->start_class, value);
4298 if (data->start_class->flags & ANYOF_LOCALE) {
4299 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4301 if (OP(scan) == SPACEU) {
4302 for (value = 0; value < 256; value++) {
4303 if (isSPACE_L1(value)) {
4304 ANYOF_BITMAP_SET(data->start_class, value);
4308 for (value = 0; value < 256; value++) {
4309 if (isSPACE(value)) {
4310 ANYOF_BITMAP_SET(data->start_class, value);
4317 if (flags & SCF_DO_STCLASS_AND) {
4318 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4319 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4320 if (OP(scan) == NSPACEU) {
4321 for (value = 0; value < 256; value++) {
4322 if (isSPACE_L1(value)) {
4323 ANYOF_BITMAP_CLEAR(data->start_class, value);
4327 for (value = 0; value < 256; value++) {
4328 if (isSPACE(value)) {
4329 ANYOF_BITMAP_CLEAR(data->start_class, value);
4336 if (data->start_class->flags & ANYOF_LOCALE)
4337 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4338 if (OP(scan) == NSPACEU) {
4339 for (value = 0; value < 256; value++) {
4340 if (!isSPACE_L1(value)) {
4341 ANYOF_BITMAP_SET(data->start_class, value);
4346 for (value = 0; value < 256; value++) {
4347 if (!isSPACE(value)) {
4348 ANYOF_BITMAP_SET(data->start_class, value);
4355 if (flags & SCF_DO_STCLASS_AND) {
4356 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4357 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4358 for (value = 0; value < 256; value++)
4359 if (!isDIGIT(value))
4360 ANYOF_BITMAP_CLEAR(data->start_class, value);
4364 if (data->start_class->flags & ANYOF_LOCALE)
4365 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4366 for (value = 0; value < 256; value++)
4368 ANYOF_BITMAP_SET(data->start_class, value);
4372 if (flags & SCF_DO_STCLASS_AND) {
4373 if (!(data->start_class->flags & ANYOF_LOCALE))
4374 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4375 for (value = 0; value < 256; value++)
4377 ANYOF_BITMAP_CLEAR(data->start_class, value);
4380 if (data->start_class->flags & ANYOF_LOCALE)
4381 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4382 for (value = 0; value < 256; value++)
4383 if (!isDIGIT(value))
4384 ANYOF_BITMAP_SET(data->start_class, value);
4387 CASE_SYNST_FNC(VERTWS);
4388 CASE_SYNST_FNC(HORIZWS);
4391 if (flags & SCF_DO_STCLASS_OR)
4392 cl_and(data->start_class, and_withp);
4393 flags &= ~SCF_DO_STCLASS;
4396 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4397 data->flags |= (OP(scan) == MEOL
4401 else if ( PL_regkind[OP(scan)] == BRANCHJ
4402 /* Lookbehind, or need to calculate parens/evals/stclass: */
4403 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4404 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4405 if ( OP(scan) == UNLESSM &&
4407 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4408 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4411 regnode *upto= regnext(scan);
4413 SV * const mysv_val=sv_newmortal();
4414 DEBUG_STUDYDATA("OPFAIL",data,depth);
4416 /*DEBUG_PARSE_MSG("opfail");*/
4417 regprop(RExC_rx, mysv_val, upto);
4418 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4419 SvPV_nolen_const(mysv_val),
4420 (IV)REG_NODE_NUM(upto),
4425 NEXT_OFF(scan) = upto - scan;
4426 for (opt= scan + 1; opt < upto ; opt++)
4427 OP(opt) = OPTIMIZED;
4431 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4432 || OP(scan) == UNLESSM )
4434 /* Negative Lookahead/lookbehind
4435 In this case we can't do fixed string optimisation.
4438 I32 deltanext, minnext, fake = 0;
4440 struct regnode_charclass_class intrnl;
4443 data_fake.flags = 0;
4445 data_fake.whilem_c = data->whilem_c;
4446 data_fake.last_closep = data->last_closep;
4449 data_fake.last_closep = &fake;
4450 data_fake.pos_delta = delta;
4451 if ( flags & SCF_DO_STCLASS && !scan->flags
4452 && OP(scan) == IFMATCH ) { /* Lookahead */
4453 cl_init(pRExC_state, &intrnl);
4454 data_fake.start_class = &intrnl;
4455 f |= SCF_DO_STCLASS_AND;
4457 if (flags & SCF_WHILEM_VISITED_POS)
4458 f |= SCF_WHILEM_VISITED_POS;
4459 next = regnext(scan);
4460 nscan = NEXTOPER(NEXTOPER(scan));
4461 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4462 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4465 FAIL("Variable length lookbehind not implemented");
4467 else if (minnext > (I32)U8_MAX) {
4468 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4470 scan->flags = (U8)minnext;
4473 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4475 if (data_fake.flags & SF_HAS_EVAL)
4476 data->flags |= SF_HAS_EVAL;
4477 data->whilem_c = data_fake.whilem_c;
4479 if (f & SCF_DO_STCLASS_AND) {
4480 if (flags & SCF_DO_STCLASS_OR) {
4481 /* OR before, AND after: ideally we would recurse with
4482 * data_fake to get the AND applied by study of the
4483 * remainder of the pattern, and then derecurse;
4484 * *** HACK *** for now just treat as "no information".
4485 * See [perl #56690].
4487 cl_init(pRExC_state, data->start_class);
4489 /* AND before and after: combine and continue */
4490 const int was = (data->start_class->flags & ANYOF_EOS);
4492 cl_and(data->start_class, &intrnl);
4494 data->start_class->flags |= ANYOF_EOS;
4498 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4500 /* Positive Lookahead/lookbehind
4501 In this case we can do fixed string optimisation,
4502 but we must be careful about it. Note in the case of
4503 lookbehind the positions will be offset by the minimum
4504 length of the pattern, something we won't know about
4505 until after the recurse.
4507 I32 deltanext, fake = 0;
4509 struct regnode_charclass_class intrnl;
4511 /* We use SAVEFREEPV so that when the full compile
4512 is finished perl will clean up the allocated
4513 minlens when it's all done. This way we don't
4514 have to worry about freeing them when we know
4515 they wont be used, which would be a pain.
4518 Newx( minnextp, 1, I32 );
4519 SAVEFREEPV(minnextp);
4522 StructCopy(data, &data_fake, scan_data_t);
4523 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4526 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4527 data_fake.last_found=newSVsv(data->last_found);
4531 data_fake.last_closep = &fake;
4532 data_fake.flags = 0;
4533 data_fake.pos_delta = delta;
4535 data_fake.flags |= SF_IS_INF;
4536 if ( flags & SCF_DO_STCLASS && !scan->flags
4537 && OP(scan) == IFMATCH ) { /* Lookahead */
4538 cl_init(pRExC_state, &intrnl);
4539 data_fake.start_class = &intrnl;
4540 f |= SCF_DO_STCLASS_AND;
4542 if (flags & SCF_WHILEM_VISITED_POS)
4543 f |= SCF_WHILEM_VISITED_POS;
4544 next = regnext(scan);
4545 nscan = NEXTOPER(NEXTOPER(scan));
4547 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4548 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4551 FAIL("Variable length lookbehind not implemented");
4553 else if (*minnextp > (I32)U8_MAX) {
4554 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4556 scan->flags = (U8)*minnextp;
4561 if (f & SCF_DO_STCLASS_AND) {
4562 const int was = (data->start_class->flags & ANYOF_EOS);
4564 cl_and(data->start_class, &intrnl);
4566 data->start_class->flags |= ANYOF_EOS;
4569 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4571 if (data_fake.flags & SF_HAS_EVAL)
4572 data->flags |= SF_HAS_EVAL;
4573 data->whilem_c = data_fake.whilem_c;
4574 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4575 if (RExC_rx->minlen<*minnextp)
4576 RExC_rx->minlen=*minnextp;
4577 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4578 SvREFCNT_dec(data_fake.last_found);
4580 if ( data_fake.minlen_fixed != minlenp )
4582 data->offset_fixed= data_fake.offset_fixed;
4583 data->minlen_fixed= data_fake.minlen_fixed;
4584 data->lookbehind_fixed+= scan->flags;
4586 if ( data_fake.minlen_float != minlenp )
4588 data->minlen_float= data_fake.minlen_float;
4589 data->offset_float_min=data_fake.offset_float_min;
4590 data->offset_float_max=data_fake.offset_float_max;
4591 data->lookbehind_float+= scan->flags;
4598 else if (OP(scan) == OPEN) {
4599 if (stopparen != (I32)ARG(scan))
4602 else if (OP(scan) == CLOSE) {
4603 if (stopparen == (I32)ARG(scan)) {
4606 if ((I32)ARG(scan) == is_par) {
4607 next = regnext(scan);
4609 if ( next && (OP(next) != WHILEM) && next < last)
4610 is_par = 0; /* Disable optimization */
4613 *(data->last_closep) = ARG(scan);
4615 else if (OP(scan) == EVAL) {
4617 data->flags |= SF_HAS_EVAL;
4619 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4620 if (flags & SCF_DO_SUBSTR) {
4621 SCAN_COMMIT(pRExC_state,data,minlenp);
4622 flags &= ~SCF_DO_SUBSTR;
4624 if (data && OP(scan)==ACCEPT) {
4625 data->flags |= SCF_SEEN_ACCEPT;
4630 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4632 if (flags & SCF_DO_SUBSTR) {
4633 SCAN_COMMIT(pRExC_state,data,minlenp);
4634 data->longest = &(data->longest_float);
4636 is_inf = is_inf_internal = 1;
4637 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4638 cl_anything(pRExC_state, data->start_class);
4639 flags &= ~SCF_DO_STCLASS;
4641 else if (OP(scan) == GPOS) {
4642 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4643 !(delta || is_inf || (data && data->pos_delta)))
4645 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4646 RExC_rx->extflags |= RXf_ANCH_GPOS;
4647 if (RExC_rx->gofs < (U32)min)
4648 RExC_rx->gofs = min;
4650 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4654 #ifdef TRIE_STUDY_OPT
4655 #ifdef FULL_TRIE_STUDY
4656 else if (PL_regkind[OP(scan)] == TRIE) {
4657 /* NOTE - There is similar code to this block above for handling
4658 BRANCH nodes on the initial study. If you change stuff here
4660 regnode *trie_node= scan;
4661 regnode *tail= regnext(scan);
4662 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4663 I32 max1 = 0, min1 = I32_MAX;
4664 struct regnode_charclass_class accum;
4666 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4667 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4668 if (flags & SCF_DO_STCLASS)
4669 cl_init_zero(pRExC_state, &accum);
4675 const regnode *nextbranch= NULL;
4678 for ( word=1 ; word <= trie->wordcount ; word++)
4680 I32 deltanext=0, minnext=0, f = 0, fake;
4681 struct regnode_charclass_class this_class;
4683 data_fake.flags = 0;
4685 data_fake.whilem_c = data->whilem_c;
4686 data_fake.last_closep = data->last_closep;
4689 data_fake.last_closep = &fake;
4690 data_fake.pos_delta = delta;
4691 if (flags & SCF_DO_STCLASS) {
4692 cl_init(pRExC_state, &this_class);
4693 data_fake.start_class = &this_class;
4694 f = SCF_DO_STCLASS_AND;
4696 if (flags & SCF_WHILEM_VISITED_POS)
4697 f |= SCF_WHILEM_VISITED_POS;
4699 if (trie->jump[word]) {
4701 nextbranch = trie_node + trie->jump[0];
4702 scan= trie_node + trie->jump[word];
4703 /* We go from the jump point to the branch that follows
4704 it. Note this means we need the vestigal unused branches
4705 even though they arent otherwise used.
4707 minnext = study_chunk(pRExC_state, &scan, minlenp,
4708 &deltanext, (regnode *)nextbranch, &data_fake,
4709 stopparen, recursed, NULL, f,depth+1);
4711 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4712 nextbranch= regnext((regnode*)nextbranch);
4714 if (min1 > (I32)(minnext + trie->minlen))
4715 min1 = minnext + trie->minlen;
4716 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4717 max1 = minnext + deltanext + trie->maxlen;
4718 if (deltanext == I32_MAX)
4719 is_inf = is_inf_internal = 1;
4721 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4723 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4724 if ( stopmin > min + min1)
4725 stopmin = min + min1;
4726 flags &= ~SCF_DO_SUBSTR;
4728 data->flags |= SCF_SEEN_ACCEPT;
4731 if (data_fake.flags & SF_HAS_EVAL)
4732 data->flags |= SF_HAS_EVAL;
4733 data->whilem_c = data_fake.whilem_c;
4735 if (flags & SCF_DO_STCLASS)
4736 cl_or(pRExC_state, &accum, &this_class);
4739 if (flags & SCF_DO_SUBSTR) {
4740 data->pos_min += min1;
4741 data->pos_delta += max1 - min1;
4742 if (max1 != min1 || is_inf)
4743 data->longest = &(data->longest_float);
4746 delta += max1 - min1;
4747 if (flags & SCF_DO_STCLASS_OR) {
4748 cl_or(pRExC_state, data->start_class, &accum);
4750 cl_and(data->start_class, and_withp);
4751 flags &= ~SCF_DO_STCLASS;
4754 else if (flags & SCF_DO_STCLASS_AND) {
4756 cl_and(data->start_class, &accum);
4757 flags &= ~SCF_DO_STCLASS;
4760 /* Switch to OR mode: cache the old value of
4761 * data->start_class */
4763 StructCopy(data->start_class, and_withp,
4764 struct regnode_charclass_class);
4765 flags &= ~SCF_DO_STCLASS_AND;
4766 StructCopy(&accum, data->start_class,
4767 struct regnode_charclass_class);
4768 flags |= SCF_DO_STCLASS_OR;
4769 data->start_class->flags |= ANYOF_EOS;
4776 else if (PL_regkind[OP(scan)] == TRIE) {
4777 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4780 min += trie->minlen;
4781 delta += (trie->maxlen - trie->minlen);
4782 flags &= ~SCF_DO_STCLASS; /* xxx */
4783 if (flags & SCF_DO_SUBSTR) {
4784 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4785 data->pos_min += trie->minlen;
4786 data->pos_delta += (trie->maxlen - trie->minlen);
4787 if (trie->maxlen != trie->minlen)
4788 data->longest = &(data->longest_float);
4790 if (trie->jump) /* no more substrings -- for now /grr*/
4791 flags &= ~SCF_DO_SUBSTR;
4793 #endif /* old or new */
4794 #endif /* TRIE_STUDY_OPT */
4796 /* Else: zero-length, ignore. */
4797 scan = regnext(scan);
4802 stopparen = frame->stop;
4803 frame = frame->prev;
4804 goto fake_study_recurse;
4809 DEBUG_STUDYDATA("pre-fin:",data,depth);
4812 *deltap = is_inf_internal ? I32_MAX : delta;
4813 if (flags & SCF_DO_SUBSTR && is_inf)
4814 data->pos_delta = I32_MAX - data->pos_min;
4815 if (is_par > (I32)U8_MAX)
4817 if (is_par && pars==1 && data) {
4818 data->flags |= SF_IN_PAR;
4819 data->flags &= ~SF_HAS_PAR;
4821 else if (pars && data) {
4822 data->flags |= SF_HAS_PAR;
4823 data->flags &= ~SF_IN_PAR;
4825 if (flags & SCF_DO_STCLASS_OR)
4826 cl_and(data->start_class, and_withp);
4827 if (flags & SCF_TRIE_RESTUDY)
4828 data->flags |= SCF_TRIE_RESTUDY;
4830 DEBUG_STUDYDATA("post-fin:",data,depth);
4832 return min < stopmin ? min : stopmin;
4836 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4838 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4840 PERL_ARGS_ASSERT_ADD_DATA;
4842 Renewc(RExC_rxi->data,
4843 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4844 char, struct reg_data);
4846 Renew(RExC_rxi->data->what, count + n, U8);
4848 Newx(RExC_rxi->data->what, n, U8);
4849 RExC_rxi->data->count = count + n;
4850 Copy(s, RExC_rxi->data->what + count, n, U8);
4854 /*XXX: todo make this not included in a non debugging perl */
4855 #ifndef PERL_IN_XSUB_RE
4857 Perl_reginitcolors(pTHX)
4860 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4862 char *t = savepv(s);
4866 t = strchr(t, '\t');
4872 PL_colors[i] = t = (char *)"";
4877 PL_colors[i++] = (char *)"";
4884 #ifdef TRIE_STUDY_OPT
4885 #define CHECK_RESTUDY_GOTO \
4887 (data.flags & SCF_TRIE_RESTUDY) \
4891 #define CHECK_RESTUDY_GOTO
4895 * pregcomp - compile a regular expression into internal code
4897 * Decides which engine's compiler to call based on the hint currently in
4901 #ifndef PERL_IN_XSUB_RE
4902 #define RE_ENGINE_PTR &PL_core_reg_engine
4904 extern const struct regexp_engine my_reg_engine;
4905 #define RE_ENGINE_PTR &my_reg_engine
4908 #ifndef PERL_IN_XSUB_RE
4910 /* return the currently in-scope regex engine (or NULL if none) */
4913 Perl_current_re_engine(pTHX)
4917 if (IN_PERL_COMPILETIME) {
4918 HV * const table = GvHV(PL_hintgv);
4923 ptr = hv_fetchs(table, "regcomp", FALSE);
4924 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4926 return INT2PTR(regexp_engine*,SvIV(*ptr));
4930 if (!PL_curcop->cop_hints_hash)
4932 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4933 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4935 return INT2PTR(regexp_engine*,SvIV(ptr));
4941 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4944 regexp_engine *eng = current_re_engine();
4946 PERL_ARGS_ASSERT_PREGCOMP;
4948 /* Dispatch a request to compile a regexp to correct regexp engine. */
4950 GET_RE_DEBUG_FLAGS_DECL;
4952 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4955 return CALLREGCOMP_ENG(eng, pattern, flags);
4957 return Perl_re_compile(aTHX_ pattern, flags);
4961 /* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4962 * pattern rather than a list of OPs */
4965 Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
4967 SV *pat = pattern; /* defeat constness! */
4968 PERL_ARGS_ASSERT_RE_COMPILE;
4969 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4970 NULL, NULL, NULL, orig_pm_flags);
4975 * Perl_re_op_compile - the perl internal RE engine's function to compile a
4976 * regular expression into internal code.
4977 * The pattern may be passed either as:
4978 * a list of SVs (patternp plus pat_count)
4979 * a list of OPs (expr)
4980 * If both are passed, the SV list is used, but the OP list indicates
4981 * which SVs are actually pre-compiled code blocks
4983 * The SVs in the list have magic and qr overloading applied to them (and
4984 * the list may be modified in-place with replacement SVs in the latter
4987 * If the pattern hasn't changed from old_re, then old_re will be
4990 * If eng is set (and not equal to PL_core_reg_engine), then just do the
4991 * initial concatenation of arguments, then pass on to the external
4994 * If is_bare_re is not null, set it to a boolean indicating whether the
4995 * arg list reduced (after overloading) to a single bare regex which has
4996 * been returned (i.e. /$qr/).
4998 * We can't allocate space until we know how big the compiled form will be,
4999 * but we can't compile it (and thus know how big it is) until we've got a
5000 * place to put the code. So we cheat: we compile it twice, once with code
5001 * generation turned off and size counting turned on, and once "for real".
5002 * This also means that we don't allocate space until we are sure that the
5003 * thing really will compile successfully, and we never have to move the
5004 * code and thus invalidate pointers into it. (Note that it has to be in
5005 * one piece because free() must be able to free it all.) [NB: not true in perl]
5007 * Beware that the optimization-preparation code in here knows about some
5008 * of the structure of the compiled regexp. [I'll say.]
5012 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5013 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5014 int *is_bare_re, U32 orig_pm_flags)
5019 register regexp_internal *ri;
5029 /* these are all flags - maybe they should be turned
5030 * into a single int with different bit masks */
5031 I32 sawlookahead = 0;
5034 bool used_setjump = FALSE;
5035 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
5036 bool code_is_utf8 = 0;
5041 RExC_state_t RExC_state;
5042 RExC_state_t * const pRExC_state = &RExC_state;
5043 #ifdef TRIE_STUDY_OPT
5045 RExC_state_t copyRExC_state;
5047 GET_RE_DEBUG_FLAGS_DECL;
5049 DEBUG_r(if (!PL_colorset) reginitcolors());
5051 #ifndef PERL_IN_XSUB_RE
5052 /* Initialize these here instead of as-needed, as is quick and avoids
5053 * having to test them each time otherwise */
5054 if (! PL_AboveLatin1) {
5055 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5056 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5057 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5059 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5060 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5062 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5063 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5065 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5066 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5068 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5070 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5071 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5073 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5075 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5076 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5078 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5079 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5081 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5082 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5084 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5085 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5087 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5088 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5090 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5091 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5093 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5094 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5096 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5097 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5099 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5101 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5102 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5104 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5105 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5109 pRExC_state->code_blocks = NULL;
5110 pRExC_state->num_code_blocks = 0;
5115 if (expr && (expr->op_type == OP_LIST ||
5116 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5118 /* is the source UTF8, and how many code blocks are there? */
5122 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5123 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5125 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5126 /* count of DO blocks */
5130 pRExC_state->num_code_blocks = ncode;
5131 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5136 /* handle a list of SVs */
5140 /* apply magic and RE overloading to each arg */
5141 for (svp = patternp; svp < patternp + pat_count; svp++) {
5144 if (SvROK(rx) && SvAMAGIC(rx)) {
5145 SV *sv = AMG_CALLunary(rx, regexp_amg);
5149 if (SvTYPE(sv) != SVt_REGEXP)
5150 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5156 if (pat_count > 1) {
5157 /* concat multiple args and find any code block indexes */
5163 if (pRExC_state->num_code_blocks) {
5164 o = cLISTOPx(expr)->op_first;
5165 assert(o->op_type == OP_PUSHMARK);
5169 pat = newSVpvn("", 0);
5172 /* determine if the pattern is going to be utf8 (needed
5173 * in advance to align code block indices correctly).
5174 * XXX This could fail to be detected for an arg with
5175 * overloading but not concat overloading; but the main effect
5176 * in this obscure case is to need a 'use re eval' for a
5177 * literal code block */
5178 for (svp = patternp; svp < patternp + pat_count; svp++) {
5185 for (svp = patternp; svp < patternp + pat_count; svp++) {
5186 SV *sv, *msv = *svp;
5190 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5191 assert(n < pRExC_state->num_code_blocks);
5192 pRExC_state->code_blocks[n].start = SvCUR(pat);
5193 pRExC_state->code_blocks[n].block = o;
5194 pRExC_state->code_blocks[n].src_regex = NULL;
5197 o = o->op_sibling; /* skip CONST */
5203 /* extract any code blocks within any embedded qr//'s */
5207 if (SvTYPE(rx) == SVt_REGEXP
5208 && RX_ENGINE((REGEXP*)rx) == RE_ENGINE_PTR)
5211 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5212 if (ri->num_code_blocks) {
5214 Renew(pRExC_state->code_blocks,
5215 pRExC_state->num_code_blocks + ri->num_code_blocks,
5216 struct reg_code_block);
5217 pRExC_state->num_code_blocks += ri->num_code_blocks;
5218 for (i=0; i < ri->num_code_blocks; i++) {
5219 struct reg_code_block *src, *dst;
5220 STRLEN offset = SvCUR(pat)
5221 + ((struct regexp *)SvANY(rx))->pre_prefix;
5222 assert(n < pRExC_state->num_code_blocks);
5223 src = &ri->code_blocks[i];
5224 dst = &pRExC_state->code_blocks[n];
5225 dst->start = src->start + offset;
5226 dst->end = src->end + offset;
5227 dst->block = src->block;
5228 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5237 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5238 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5241 /* overloading involved: all bets are off over literal
5242 * code. Pretend we haven't seen it */
5243 pRExC_state->num_code_blocks -= n;
5248 sv_catsv_nomg(pat, msv);
5250 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5258 /* handle bare regex: foo =~ $re */
5263 if (SvTYPE(re) == SVt_REGEXP) {
5267 Safefree(pRExC_state->code_blocks);
5273 /* not a list of SVs, so must be a list of OPs */
5275 if (expr->op_type == OP_LIST) {
5280 pat = newSVpvn("", 0);
5285 /* given a list of CONSTs and DO blocks in expr, append all
5286 * the CONSTs to pat, and record the start and end of each
5287 * code block in code_blocks[] (each DO{} op is followed by an
5288 * OP_CONST containing the corresponding literal '(?{...})
5291 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5292 if (o->op_type == OP_CONST) {
5293 sv_catsv(pat, cSVOPo_sv);
5295 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5299 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5300 assert(i+1 < pRExC_state->num_code_blocks);
5301 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5302 pRExC_state->code_blocks[i].block = o;
5303 pRExC_state->code_blocks[i].src_regex = NULL;
5309 assert(expr->op_type == OP_CONST);
5310 pat = cSVOPx_sv(expr);
5314 exp = SvPV_nomg(pat, plen);
5316 if (eng && eng != RE_ENGINE_PTR) {
5317 if ((SvUTF8(pat) && IN_BYTES)
5318 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5320 /* make a temporary copy; either to convert to bytes,
5321 * or to avoid repeating get-magic / overloaded stringify */
5322 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5323 (IN_BYTES ? 0 : SvUTF8(pat)));
5325 Safefree(pRExC_state->code_blocks);
5326 return CALLREGCOMP_ENG(eng, pat, orig_pm_flags);
5330 && !!RX_UTF8(old_re) == !!SvUTF8(pat)
5331 && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen
5332 && memEQ(RX_PRECOMP(old_re), exp, plen))
5334 ReREFCNT_inc(old_re);
5335 Safefree(pRExC_state->code_blocks);
5339 /* ignore the utf8ness if the pattern is 0 length */
5340 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5341 RExC_uni_semantics = 0;
5342 RExC_contains_locale = 0;
5344 /****************** LONG JUMP TARGET HERE***********************/
5345 /* Longjmp back to here if have to switch in midstream to utf8 */
5346 if (! RExC_orig_utf8) {
5347 JMPENV_PUSH(jump_ret);
5348 used_setjump = TRUE;
5351 if (jump_ret == 0) { /* First time through */
5355 SV *dsv= sv_newmortal();
5356 RE_PV_QUOTED_DECL(s, RExC_utf8,
5357 dsv, exp, plen, 60);
5358 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5359 PL_colors[4],PL_colors[5],s);
5362 else { /* longjumped back */
5365 STRLEN s = 0, d = 0;
5368 /* If the cause for the longjmp was other than changing to utf8, pop
5369 * our own setjmp, and longjmp to the correct handler */
5370 if (jump_ret != UTF8_LONGJMP) {
5372 JMPENV_JUMP(jump_ret);
5377 /* It's possible to write a regexp in ascii that represents Unicode
5378 codepoints outside of the byte range, such as via \x{100}. If we
5379 detect such a sequence we have to convert the entire pattern to utf8
5380 and then recompile, as our sizing calculation will have been based
5381 on 1 byte == 1 character, but we will need to use utf8 to encode
5382 at least some part of the pattern, and therefore must convert the whole
5385 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5386 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5388 /* upgrade pattern to UTF8, and if there are code blocks,
5389 * recalculate the indices.
5390 * This is essentially an unrolled Perl_bytes_to_utf8() */
5392 src = (U8*)SvPV_nomg(pat, plen);
5393 Newx(dst, plen * 2 + 1, U8);
5396 const UV uv = NATIVE_TO_ASCII(src[s]);
5397 if (UNI_IS_INVARIANT(uv))
5398 dst[d] = (U8)UTF_TO_NATIVE(uv);
5400 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5401 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5403 if (n < pRExC_state->num_code_blocks) {
5404 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5405 pRExC_state->code_blocks[n].start = d;
5406 assert(dst[d] == '(');
5409 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5410 pRExC_state->code_blocks[n].end = d;
5411 assert(dst[d] == ')');
5424 RExC_orig_utf8 = RExC_utf8 = 1;
5426 /* we've changed the string; check again whether it matches
5427 * the old pattern, to avoid recompilation */
5430 && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen
5431 && memEQ(RX_PRECOMP(old_re), exp, plen))
5433 ReREFCNT_inc(old_re);
5437 Safefree(pRExC_state->code_blocks);
5443 #ifdef TRIE_STUDY_OPT
5447 pm_flags = orig_pm_flags;
5449 if (initial_charset == REGEX_LOCALE_CHARSET) {
5450 RExC_contains_locale = 1;
5452 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5454 /* Set to use unicode semantics if the pattern is in utf8 and has the
5455 * 'depends' charset specified, as it means unicode when utf8 */
5456 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5460 RExC_flags = pm_flags;
5464 RExC_in_lookbehind = 0;
5465 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5466 RExC_seen_evals = 0;
5468 RExC_override_recoding = 0;
5470 /* First pass: determine size, legality. */
5478 RExC_emit = &PL_regdummy;
5479 RExC_whilem_seen = 0;
5480 RExC_open_parens = NULL;
5481 RExC_close_parens = NULL;
5483 RExC_paren_names = NULL;
5485 RExC_paren_name_list = NULL;
5487 RExC_recurse = NULL;
5488 RExC_recurse_count = 0;
5489 pRExC_state->code_index = 0;
5491 #if 0 /* REGC() is (currently) a NOP at the first pass.
5492 * Clever compilers notice this and complain. --jhi */
5493 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5495 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
5496 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5497 RExC_precomp = NULL;
5498 Safefree(pRExC_state->code_blocks);
5502 /* Here, finished first pass. Get rid of any added setjmp */
5508 PerlIO_printf(Perl_debug_log,
5509 "Required size %"IVdf" nodes\n"
5510 "Starting second pass (creation)\n",
5513 RExC_lastparse=NULL;
5516 /* The first pass could have found things that force Unicode semantics */
5517 if ((RExC_utf8 || RExC_uni_semantics)
5518 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5520 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5523 /* Small enough for pointer-storage convention?
5524 If extralen==0, this means that we will not need long jumps. */
5525 if (RExC_size >= 0x10000L && RExC_extralen)
5526 RExC_size += RExC_extralen;
5529 if (RExC_whilem_seen > 15)
5530 RExC_whilem_seen = 15;
5532 /* Allocate space and zero-initialize. Note, the two step process
5533 of zeroing when in debug mode, thus anything assigned has to
5534 happen after that */
5535 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5536 r = (struct regexp*)SvANY(rx);
5537 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5538 char, regexp_internal);
5539 if ( r == NULL || ri == NULL )
5540 FAIL("Regexp out of space");
5542 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5543 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5545 /* bulk initialize base fields with 0. */
5546 Zero(ri, sizeof(regexp_internal), char);
5549 /* non-zero initialization begins here */
5551 r->engine= RE_ENGINE_PTR;
5552 r->extflags = pm_flags;
5553 if (orig_pm_flags & PMf_HAS_CV) {
5554 ri->code_blocks = pRExC_state->code_blocks;
5555 ri->num_code_blocks = pRExC_state->num_code_blocks;
5558 SAVEFREEPV(pRExC_state->code_blocks);
5561 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5562 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5564 /* The caret is output if there are any defaults: if not all the STD
5565 * flags are set, or if no character set specifier is needed */
5567 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5569 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5570 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5571 >> RXf_PMf_STD_PMMOD_SHIFT);
5572 const char *fptr = STD_PAT_MODS; /*"msix"*/
5574 /* Allocate for the worst case, which is all the std flags are turned
5575 * on. If more precision is desired, we could do a population count of
5576 * the flags set. This could be done with a small lookup table, or by
5577 * shifting, masking and adding, or even, when available, assembly
5578 * language for a machine-language population count.
5579 * We never output a minus, as all those are defaults, so are
5580 * covered by the caret */
5581 const STRLEN wraplen = plen + has_p + has_runon
5582 + has_default /* If needs a caret */
5584 /* If needs a character set specifier */
5585 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5586 + (sizeof(STD_PAT_MODS) - 1)
5587 + (sizeof("(?:)") - 1);
5589 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5592 SvFLAGS(rx) |= SVf_UTF8;
5595 /* If a default, cover it using the caret */
5597 *p++= DEFAULT_PAT_MOD;
5601 const char* const name = get_regex_charset_name(r->extflags, &len);
5602 Copy(name, p, len, char);
5606 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5609 while((ch = *fptr++)) {
5617 Copy(RExC_precomp, p, plen, char);
5618 assert ((RX_WRAPPED(rx) - p) < 16);
5619 r->pre_prefix = p - RX_WRAPPED(rx);
5625 SvCUR_set(rx, p - SvPVX_const(rx));
5629 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5631 if (RExC_seen & REG_SEEN_RECURSE) {
5632 Newxz(RExC_open_parens, RExC_npar,regnode *);
5633 SAVEFREEPV(RExC_open_parens);
5634 Newxz(RExC_close_parens,RExC_npar,regnode *);
5635 SAVEFREEPV(RExC_close_parens);
5638 /* Useful during FAIL. */
5639 #ifdef RE_TRACK_PATTERN_OFFSETS
5640 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5641 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5642 "%s %"UVuf" bytes for offset annotations.\n",
5643 ri->u.offsets ? "Got" : "Couldn't get",
5644 (UV)((2*RExC_size+1) * sizeof(U32))));
5646 SetProgLen(ri,RExC_size);
5651 /* Second pass: emit code. */
5652 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
5657 RExC_emit_start = ri->program;
5658 RExC_emit = ri->program;
5659 RExC_emit_bound = ri->program + RExC_size + 1;
5660 pRExC_state->code_index = 0;
5662 /* Store the count of eval-groups for security checks: */
5663 RExC_rx->seen_evals = RExC_seen_evals;
5664 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5665 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5669 /* XXXX To minimize changes to RE engine we always allocate
5670 3-units-long substrs field. */
5671 Newx(r->substrs, 1, struct reg_substr_data);
5672 if (RExC_recurse_count) {
5673 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5674 SAVEFREEPV(RExC_recurse);
5678 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5679 Zero(r->substrs, 1, struct reg_substr_data);
5681 #ifdef TRIE_STUDY_OPT
5683 StructCopy(&zero_scan_data, &data, scan_data_t);
5684 copyRExC_state = RExC_state;
5687 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5689 RExC_state = copyRExC_state;
5690 if (seen & REG_TOP_LEVEL_BRANCHES)
5691 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5693 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5694 if (data.last_found) {
5695 SvREFCNT_dec(data.longest_fixed);
5696 SvREFCNT_dec(data.longest_float);
5697 SvREFCNT_dec(data.last_found);
5699 StructCopy(&zero_scan_data, &data, scan_data_t);
5702 StructCopy(&zero_scan_data, &data, scan_data_t);
5705 /* Dig out information for optimizations. */
5706 r->extflags = RExC_flags; /* was pm_op */
5707 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5710 SvUTF8_on(rx); /* Unicode in it? */
5711 ri->regstclass = NULL;
5712 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5713 r->intflags |= PREGf_NAUGHTY;
5714 scan = ri->program + 1; /* First BRANCH. */
5716 /* testing for BRANCH here tells us whether there is "must appear"
5717 data in the pattern. If there is then we can use it for optimisations */
5718 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5720 STRLEN longest_float_length, longest_fixed_length;
5721 struct regnode_charclass_class ch_class; /* pointed to by data */
5723 I32 last_close = 0; /* pointed to by data */
5724 regnode *first= scan;
5725 regnode *first_next= regnext(first);
5727 * Skip introductions and multiplicators >= 1
5728 * so that we can extract the 'meat' of the pattern that must
5729 * match in the large if() sequence following.
5730 * NOTE that EXACT is NOT covered here, as it is normally
5731 * picked up by the optimiser separately.
5733 * This is unfortunate as the optimiser isnt handling lookahead
5734 * properly currently.
5737 while ((OP(first) == OPEN && (sawopen = 1)) ||
5738 /* An OR of *one* alternative - should not happen now. */
5739 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5740 /* for now we can't handle lookbehind IFMATCH*/
5741 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5742 (OP(first) == PLUS) ||
5743 (OP(first) == MINMOD) ||
5744 /* An {n,m} with n>0 */
5745 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5746 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5749 * the only op that could be a regnode is PLUS, all the rest
5750 * will be regnode_1 or regnode_2.
5753 if (OP(first) == PLUS)
5756 first += regarglen[OP(first)];
5758 first = NEXTOPER(first);
5759 first_next= regnext(first);
5762 /* Starting-point info. */
5764 DEBUG_PEEP("first:",first,0);
5765 /* Ignore EXACT as we deal with it later. */
5766 if (PL_regkind[OP(first)] == EXACT) {
5767 if (OP(first) == EXACT)
5768 NOOP; /* Empty, get anchored substr later. */
5770 ri->regstclass = first;
5773 else if (PL_regkind[OP(first)] == TRIE &&
5774 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5777 /* this can happen only on restudy */
5778 if ( OP(first) == TRIE ) {
5779 struct regnode_1 *trieop = (struct regnode_1 *)
5780 PerlMemShared_calloc(1, sizeof(struct regnode_1));
5781 StructCopy(first,trieop,struct regnode_1);
5782 trie_op=(regnode *)trieop;
5784 struct regnode_charclass *trieop = (struct regnode_charclass *)
5785 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5786 StructCopy(first,trieop,struct regnode_charclass);
5787 trie_op=(regnode *)trieop;
5790 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5791 ri->regstclass = trie_op;
5794 else if (REGNODE_SIMPLE(OP(first)))
5795 ri->regstclass = first;
5796 else if (PL_regkind[OP(first)] == BOUND ||
5797 PL_regkind[OP(first)] == NBOUND)
5798 ri->regstclass = first;
5799 else if (PL_regkind[OP(first)] == BOL) {
5800 r->extflags |= (OP(first) == MBOL
5802 : (OP(first) == SBOL
5805 first = NEXTOPER(first);
5808 else if (OP(first) == GPOS) {
5809 r->extflags |= RXf_ANCH_GPOS;
5810 first = NEXTOPER(first);
5813 else if ((!sawopen || !RExC_sawback) &&
5814 (OP(first) == STAR &&
5815 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5816 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5818 /* turn .* into ^.* with an implied $*=1 */
5820 (OP(NEXTOPER(first)) == REG_ANY)
5823 r->extflags |= type;
5824 r->intflags |= PREGf_IMPLICIT;
5825 first = NEXTOPER(first);
5828 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5829 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5830 /* x+ must match at the 1st pos of run of x's */
5831 r->intflags |= PREGf_SKIP;
5833 /* Scan is after the zeroth branch, first is atomic matcher. */
5834 #ifdef TRIE_STUDY_OPT
5837 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5838 (IV)(first - scan + 1))
5842 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5843 (IV)(first - scan + 1))
5849 * If there's something expensive in the r.e., find the
5850 * longest literal string that must appear and make it the
5851 * regmust. Resolve ties in favor of later strings, since
5852 * the regstart check works with the beginning of the r.e.
5853 * and avoiding duplication strengthens checking. Not a
5854 * strong reason, but sufficient in the absence of others.
5855 * [Now we resolve ties in favor of the earlier string if
5856 * it happens that c_offset_min has been invalidated, since the
5857 * earlier string may buy us something the later one won't.]
5860 data.longest_fixed = newSVpvs("");
5861 data.longest_float = newSVpvs("");
5862 data.last_found = newSVpvs("");
5863 data.longest = &(data.longest_fixed);
5865 if (!ri->regstclass) {
5866 cl_init(pRExC_state, &ch_class);
5867 data.start_class = &ch_class;
5868 stclass_flag = SCF_DO_STCLASS_AND;
5869 } else /* XXXX Check for BOUND? */
5871 data.last_closep = &last_close;
5873 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5874 &data, -1, NULL, NULL,
5875 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5881 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5882 && data.last_start_min == 0 && data.last_end > 0
5883 && !RExC_seen_zerolen
5884 && !(RExC_seen & REG_SEEN_VERBARG)
5885 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5886 r->extflags |= RXf_CHECK_ALL;
5887 scan_commit(pRExC_state, &data,&minlen,0);
5888 SvREFCNT_dec(data.last_found);
5890 /* Note that code very similar to this but for anchored string
5891 follows immediately below, changes may need to be made to both.
5894 longest_float_length = CHR_SVLEN(data.longest_float);
5895 if (longest_float_length
5896 || (data.flags & SF_FL_BEFORE_EOL
5897 && (!(data.flags & SF_FL_BEFORE_MEOL)
5898 || (RExC_flags & RXf_PMf_MULTILINE))))
5902 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5903 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5904 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5905 && data.offset_fixed == data.offset_float_min
5906 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5907 goto remove_float; /* As in (a)+. */
5909 /* copy the information about the longest float from the reg_scan_data
5910 over to the program. */
5911 if (SvUTF8(data.longest_float)) {
5912 r->float_utf8 = data.longest_float;
5913 r->float_substr = NULL;
5915 r->float_substr = data.longest_float;
5916 r->float_utf8 = NULL;
5918 /* float_end_shift is how many chars that must be matched that
5919 follow this item. We calculate it ahead of time as once the
5920 lookbehind offset is added in we lose the ability to correctly
5922 ml = data.minlen_float ? *(data.minlen_float)
5923 : (I32)longest_float_length;
5924 r->float_end_shift = ml - data.offset_float_min
5925 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5926 + data.lookbehind_float;
5927 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5928 r->float_max_offset = data.offset_float_max;
5929 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5930 r->float_max_offset -= data.lookbehind_float;
5932 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5933 && (!(data.flags & SF_FL_BEFORE_MEOL)
5934 || (RExC_flags & RXf_PMf_MULTILINE)));
5935 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5939 r->float_substr = r->float_utf8 = NULL;
5940 SvREFCNT_dec(data.longest_float);
5941 longest_float_length = 0;
5944 /* Note that code very similar to this but for floating string
5945 is immediately above, changes may need to be made to both.
5948 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5950 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5951 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5952 && (longest_fixed_length
5953 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5954 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5955 || (RExC_flags & RXf_PMf_MULTILINE)))) )
5959 /* copy the information about the longest fixed
5960 from the reg_scan_data over to the program. */
5961 if (SvUTF8(data.longest_fixed)) {
5962 r->anchored_utf8 = data.longest_fixed;
5963 r->anchored_substr = NULL;
5965 r->anchored_substr = data.longest_fixed;
5966 r->anchored_utf8 = NULL;
5968 /* fixed_end_shift is how many chars that must be matched that
5969 follow this item. We calculate it ahead of time as once the
5970 lookbehind offset is added in we lose the ability to correctly
5972 ml = data.minlen_fixed ? *(data.minlen_fixed)
5973 : (I32)longest_fixed_length;
5974 r->anchored_end_shift = ml - data.offset_fixed
5975 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5976 + data.lookbehind_fixed;
5977 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5979 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5980 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5981 || (RExC_flags & RXf_PMf_MULTILINE)));
5982 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
5985 r->anchored_substr = r->anchored_utf8 = NULL;
5986 SvREFCNT_dec(data.longest_fixed);
5987 longest_fixed_length = 0;
5990 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5991 ri->regstclass = NULL;
5993 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5995 && !(data.start_class->flags & ANYOF_EOS)
5996 && !cl_is_anything(data.start_class))
5998 const U32 n = add_data(pRExC_state, 1, "f");
5999 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6001 Newx(RExC_rxi->data->data[n], 1,
6002 struct regnode_charclass_class);
6003 StructCopy(data.start_class,
6004 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6005 struct regnode_charclass_class);
6006 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6007 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6008 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6009 regprop(r, sv, (regnode*)data.start_class);
6010 PerlIO_printf(Perl_debug_log,
6011 "synthetic stclass \"%s\".\n",
6012 SvPVX_const(sv));});
6015 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6016 if (longest_fixed_length > longest_float_length) {
6017 r->check_end_shift = r->anchored_end_shift;
6018 r->check_substr = r->anchored_substr;
6019 r->check_utf8 = r->anchored_utf8;
6020 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6021 if (r->extflags & RXf_ANCH_SINGLE)
6022 r->extflags |= RXf_NOSCAN;
6025 r->check_end_shift = r->float_end_shift;
6026 r->check_substr = r->float_substr;
6027 r->check_utf8 = r->float_utf8;
6028 r->check_offset_min = r->float_min_offset;
6029 r->check_offset_max = r->float_max_offset;
6031 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6032 This should be changed ASAP! */
6033 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6034 r->extflags |= RXf_USE_INTUIT;
6035 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6036 r->extflags |= RXf_INTUIT_TAIL;
6038 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6039 if ( (STRLEN)minlen < longest_float_length )
6040 minlen= longest_float_length;
6041 if ( (STRLEN)minlen < longest_fixed_length )
6042 minlen= longest_fixed_length;
6046 /* Several toplevels. Best we can is to set minlen. */
6048 struct regnode_charclass_class ch_class;
6051 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6053 scan = ri->program + 1;
6054 cl_init(pRExC_state, &ch_class);
6055 data.start_class = &ch_class;
6056 data.last_closep = &last_close;
6059 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6060 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6064 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6065 = r->float_substr = r->float_utf8 = NULL;
6067 if (!(data.start_class->flags & ANYOF_EOS)
6068 && !cl_is_anything(data.start_class))
6070 const U32 n = add_data(pRExC_state, 1, "f");
6071 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6073 Newx(RExC_rxi->data->data[n], 1,
6074 struct regnode_charclass_class);
6075 StructCopy(data.start_class,
6076 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6077 struct regnode_charclass_class);
6078 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6079 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6080 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6081 regprop(r, sv, (regnode*)data.start_class);
6082 PerlIO_printf(Perl_debug_log,
6083 "synthetic stclass \"%s\".\n",
6084 SvPVX_const(sv));});
6088 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6089 the "real" pattern. */
6091 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6092 (IV)minlen, (IV)r->minlen);
6094 r->minlenret = minlen;
6095 if (r->minlen < minlen)
6098 if (RExC_seen & REG_SEEN_GPOS)
6099 r->extflags |= RXf_GPOS_SEEN;
6100 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6101 r->extflags |= RXf_LOOKBEHIND_SEEN;
6102 if (RExC_seen & REG_SEEN_EVAL)
6103 r->extflags |= RXf_EVAL_SEEN;
6104 if (RExC_seen & REG_SEEN_CANY)
6105 r->extflags |= RXf_CANY_SEEN;
6106 if (RExC_seen & REG_SEEN_VERBARG)
6107 r->intflags |= PREGf_VERBARG_SEEN;
6108 if (RExC_seen & REG_SEEN_CUTGROUP)
6109 r->intflags |= PREGf_CUTGROUP_SEEN;
6110 if (RExC_paren_names)
6111 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6113 RXp_PAREN_NAMES(r) = NULL;
6115 #ifdef STUPID_PATTERN_CHECKS
6116 if (RX_PRELEN(rx) == 0)
6117 r->extflags |= RXf_NULL;
6118 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6119 /* XXX: this should happen BEFORE we compile */
6120 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6121 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6122 r->extflags |= RXf_WHITE;
6123 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6124 r->extflags |= RXf_START_ONLY;
6126 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6127 /* XXX: this should happen BEFORE we compile */
6128 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6130 regnode *first = ri->program + 1;
6133 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6134 r->extflags |= RXf_NULL;
6135 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6136 r->extflags |= RXf_START_ONLY;
6137 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6138 && OP(regnext(first)) == END)
6139 r->extflags |= RXf_WHITE;
6143 if (RExC_paren_names) {
6144 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6145 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6148 ri->name_list_idx = 0;
6150 if (RExC_recurse_count) {
6151 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6152 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6153 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6156 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6157 /* assume we don't need to swap parens around before we match */
6160 PerlIO_printf(Perl_debug_log,"Final program:\n");
6163 #ifdef RE_TRACK_PATTERN_OFFSETS
6164 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6165 const U32 len = ri->u.offsets[0];
6167 GET_RE_DEBUG_FLAGS_DECL;
6168 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6169 for (i = 1; i <= len; i++) {
6170 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6171 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6172 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6174 PerlIO_printf(Perl_debug_log, "\n");
6180 #undef RE_ENGINE_PTR
6184 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6187 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6189 PERL_UNUSED_ARG(value);
6191 if (flags & RXapif_FETCH) {
6192 return reg_named_buff_fetch(rx, key, flags);
6193 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6194 Perl_croak_no_modify(aTHX);
6196 } else if (flags & RXapif_EXISTS) {
6197 return reg_named_buff_exists(rx, key, flags)
6200 } else if (flags & RXapif_REGNAMES) {
6201 return reg_named_buff_all(rx, flags);
6202 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6203 return reg_named_buff_scalar(rx, flags);
6205 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6211 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6214 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6215 PERL_UNUSED_ARG(lastkey);
6217 if (flags & RXapif_FIRSTKEY)
6218 return reg_named_buff_firstkey(rx, flags);
6219 else if (flags & RXapif_NEXTKEY)
6220 return reg_named_buff_nextkey(rx, flags);
6222 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6228 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6231 AV *retarray = NULL;
6233 struct regexp *const rx = (struct regexp *)SvANY(r);
6235 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6237 if (flags & RXapif_ALL)
6240 if (rx && RXp_PAREN_NAMES(rx)) {
6241 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6244 SV* sv_dat=HeVAL(he_str);
6245 I32 *nums=(I32*)SvPVX(sv_dat);
6246 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6247 if ((I32)(rx->nparens) >= nums[i]
6248 && rx->offs[nums[i]].start != -1
6249 && rx->offs[nums[i]].end != -1)
6252 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6257 ret = newSVsv(&PL_sv_undef);
6260 av_push(retarray, ret);
6263 return newRV_noinc(MUTABLE_SV(retarray));
6270 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6273 struct regexp *const rx = (struct regexp *)SvANY(r);
6275 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6277 if (rx && RXp_PAREN_NAMES(rx)) {
6278 if (flags & RXapif_ALL) {
6279 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6281 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6295 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6297 struct regexp *const rx = (struct regexp *)SvANY(r);
6299 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6301 if ( rx && RXp_PAREN_NAMES(rx) ) {
6302 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6304 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6311 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6313 struct regexp *const rx = (struct regexp *)SvANY(r);
6314 GET_RE_DEBUG_FLAGS_DECL;
6316 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6318 if (rx && RXp_PAREN_NAMES(rx)) {
6319 HV *hv = RXp_PAREN_NAMES(rx);
6321 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6324 SV* sv_dat = HeVAL(temphe);
6325 I32 *nums = (I32*)SvPVX(sv_dat);
6326 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6327 if ((I32)(rx->lastparen) >= nums[i] &&
6328 rx->offs[nums[i]].start != -1 &&
6329 rx->offs[nums[i]].end != -1)
6335 if (parno || flags & RXapif_ALL) {
6336 return newSVhek(HeKEY_hek(temphe));
6344 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6349 struct regexp *const rx = (struct regexp *)SvANY(r);
6351 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6353 if (rx && RXp_PAREN_NAMES(rx)) {
6354 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6355 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6356 } else if (flags & RXapif_ONE) {
6357 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6358 av = MUTABLE_AV(SvRV(ret));
6359 length = av_len(av);
6361 return newSViv(length + 1);
6363 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6367 return &PL_sv_undef;
6371 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6373 struct regexp *const rx = (struct regexp *)SvANY(r);
6376 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6378 if (rx && RXp_PAREN_NAMES(rx)) {
6379 HV *hv= RXp_PAREN_NAMES(rx);
6381 (void)hv_iterinit(hv);
6382 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6385 SV* sv_dat = HeVAL(temphe);
6386 I32 *nums = (I32*)SvPVX(sv_dat);
6387 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6388 if ((I32)(rx->lastparen) >= nums[i] &&
6389 rx->offs[nums[i]].start != -1 &&
6390 rx->offs[nums[i]].end != -1)
6396 if (parno || flags & RXapif_ALL) {
6397 av_push(av, newSVhek(HeKEY_hek(temphe)));
6402 return newRV_noinc(MUTABLE_SV(av));
6406 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6409 struct regexp *const rx = (struct regexp *)SvANY(r);
6414 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6417 sv_setsv(sv,&PL_sv_undef);
6421 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6423 i = rx->offs[0].start;
6427 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6429 s = rx->subbeg + rx->offs[0].end;
6430 i = rx->sublen - rx->offs[0].end;
6433 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6434 (s1 = rx->offs[paren].start) != -1 &&
6435 (t1 = rx->offs[paren].end) != -1)
6439 s = rx->subbeg + s1;
6441 sv_setsv(sv,&PL_sv_undef);
6444 assert(rx->sublen >= (s - rx->subbeg) + i );
6446 const int oldtainted = PL_tainted;
6448 sv_setpvn(sv, s, i);
6449 PL_tainted = oldtainted;
6450 if ( (rx->extflags & RXf_CANY_SEEN)
6451 ? (RXp_MATCH_UTF8(rx)
6452 && (!i || is_utf8_string((U8*)s, i)))
6453 : (RXp_MATCH_UTF8(rx)) )
6460 if (RXp_MATCH_TAINTED(rx)) {
6461 if (SvTYPE(sv) >= SVt_PVMG) {
6462 MAGIC* const mg = SvMAGIC(sv);
6465 SvMAGIC_set(sv, mg->mg_moremagic);
6467 if ((mgt = SvMAGIC(sv))) {
6468 mg->mg_moremagic = mgt;
6469 SvMAGIC_set(sv, mg);
6479 sv_setsv(sv,&PL_sv_undef);
6485 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6486 SV const * const value)
6488 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6490 PERL_UNUSED_ARG(rx);
6491 PERL_UNUSED_ARG(paren);
6492 PERL_UNUSED_ARG(value);
6495 Perl_croak_no_modify(aTHX);
6499 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6502 struct regexp *const rx = (struct regexp *)SvANY(r);
6506 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6508 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6510 /* $` / ${^PREMATCH} */
6511 case RX_BUFF_IDX_PREMATCH:
6512 if (rx->offs[0].start != -1) {
6513 i = rx->offs[0].start;
6521 /* $' / ${^POSTMATCH} */
6522 case RX_BUFF_IDX_POSTMATCH:
6523 if (rx->offs[0].end != -1) {
6524 i = rx->sublen - rx->offs[0].end;
6526 s1 = rx->offs[0].end;
6532 /* $& / ${^MATCH}, $1, $2, ... */
6534 if (paren <= (I32)rx->nparens &&
6535 (s1 = rx->offs[paren].start) != -1 &&
6536 (t1 = rx->offs[paren].end) != -1)
6541 if (ckWARN(WARN_UNINITIALIZED))
6542 report_uninit((const SV *)sv);
6547 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6548 const char * const s = rx->subbeg + s1;
6553 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6560 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6562 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6563 PERL_UNUSED_ARG(rx);
6567 return newSVpvs("Regexp");
6570 /* Scans the name of a named buffer from the pattern.
6571 * If flags is REG_RSN_RETURN_NULL returns null.
6572 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6573 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6574 * to the parsed name as looked up in the RExC_paren_names hash.
6575 * If there is an error throws a vFAIL().. type exception.
6578 #define REG_RSN_RETURN_NULL 0
6579 #define REG_RSN_RETURN_NAME 1
6580 #define REG_RSN_RETURN_DATA 2
6583 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6585 char *name_start = RExC_parse;
6587 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6589 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6590 /* skip IDFIRST by using do...while */
6593 RExC_parse += UTF8SKIP(RExC_parse);
6594 } while (isALNUM_utf8((U8*)RExC_parse));
6598 } while (isALNUM(*RExC_parse));
6603 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6604 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6605 if ( flags == REG_RSN_RETURN_NAME)
6607 else if (flags==REG_RSN_RETURN_DATA) {
6610 if ( ! sv_name ) /* should not happen*/
6611 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6612 if (RExC_paren_names)
6613 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6615 sv_dat = HeVAL(he_str);
6617 vFAIL("Reference to nonexistent named group");
6621 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6622 (unsigned long) flags);
6629 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6630 int rem=(int)(RExC_end - RExC_parse); \
6639 if (RExC_lastparse!=RExC_parse) \
6640 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6643 iscut ? "..." : "<" \
6646 PerlIO_printf(Perl_debug_log,"%16s",""); \
6649 num = RExC_size + 1; \
6651 num=REG_NODE_NUM(RExC_emit); \
6652 if (RExC_lastnum!=num) \
6653 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6655 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6656 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6657 (int)((depth*2)), "", \
6661 RExC_lastparse=RExC_parse; \
6666 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6667 DEBUG_PARSE_MSG((funcname)); \
6668 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6670 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6671 DEBUG_PARSE_MSG((funcname)); \
6672 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6675 /* This section of code defines the inversion list object and its methods. The
6676 * interfaces are highly subject to change, so as much as possible is static to
6677 * this file. An inversion list is here implemented as a malloc'd C UV array
6678 * with some added info that is placed as UVs at the beginning in a header
6679 * portion. An inversion list for Unicode is an array of code points, sorted
6680 * by ordinal number. The zeroth element is the first code point in the list.
6681 * The 1th element is the first element beyond that not in the list. In other
6682 * words, the first range is
6683 * invlist[0]..(invlist[1]-1)
6684 * The other ranges follow. Thus every element whose index is divisible by two
6685 * marks the beginning of a range that is in the list, and every element not
6686 * divisible by two marks the beginning of a range not in the list. A single
6687 * element inversion list that contains the single code point N generally
6688 * consists of two elements
6691 * (The exception is when N is the highest representable value on the
6692 * machine, in which case the list containing just it would be a single
6693 * element, itself. By extension, if the last range in the list extends to
6694 * infinity, then the first element of that range will be in the inversion list
6695 * at a position that is divisible by two, and is the final element in the
6697 * Taking the complement (inverting) an inversion list is quite simple, if the
6698 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6699 * This implementation reserves an element at the beginning of each inversion list
6700 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6701 * beginning of the list is either that element if 0, or the next one if 1.
6703 * More about inversion lists can be found in "Unicode Demystified"
6704 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6705 * More will be coming when functionality is added later.
6707 * The inversion list data structure is currently implemented as an SV pointing
6708 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6709 * array of UV whose memory management is automatically handled by the existing
6710 * facilities for SV's.
6712 * Some of the methods should always be private to the implementation, and some
6713 * should eventually be made public */
6715 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6716 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6718 /* This is a combination of a version and data structure type, so that one
6719 * being passed in can be validated to be an inversion list of the correct
6720 * vintage. When the structure of the header is changed, a new random number
6721 * in the range 2**31-1 should be generated and the new() method changed to
6722 * insert that at this location. Then, if an auxiliary program doesn't change
6723 * correspondingly, it will be discovered immediately */
6724 #define INVLIST_VERSION_ID_OFFSET 2
6725 #define INVLIST_VERSION_ID 1064334010
6727 /* For safety, when adding new elements, remember to #undef them at the end of
6728 * the inversion list code section */
6730 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
6731 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6732 * contains the code point U+00000, and begins here. If 1, the inversion list
6733 * doesn't contain U+0000, and it begins at the next UV in the array.
6734 * Inverting an inversion list consists of adding or removing the 0 at the
6735 * beginning of it. By reserving a space for that 0, inversion can be made
6738 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6740 /* Internally things are UVs */
6741 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6742 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6744 #define INVLIST_INITIAL_LEN 10
6746 PERL_STATIC_INLINE UV*
6747 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6749 /* Returns a pointer to the first element in the inversion list's array.
6750 * This is called upon initialization of an inversion list. Where the
6751 * array begins depends on whether the list has the code point U+0000
6752 * in it or not. The other parameter tells it whether the code that
6753 * follows this call is about to put a 0 in the inversion list or not.
6754 * The first element is either the element with 0, if 0, or the next one,
6757 UV* zero = get_invlist_zero_addr(invlist);
6759 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6762 assert(! *get_invlist_len_addr(invlist));
6764 /* 1^1 = 0; 1^0 = 1 */
6765 *zero = 1 ^ will_have_0;
6766 return zero + *zero;
6769 PERL_STATIC_INLINE UV*
6770 S_invlist_array(pTHX_ SV* const invlist)
6772 /* Returns the pointer to the inversion list's array. Every time the
6773 * length changes, this needs to be called in case malloc or realloc moved
6776 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6778 /* Must not be empty. If these fail, you probably didn't check for <len>
6779 * being non-zero before trying to get the array */
6780 assert(*get_invlist_len_addr(invlist));
6781 assert(*get_invlist_zero_addr(invlist) == 0
6782 || *get_invlist_zero_addr(invlist) == 1);
6784 /* The array begins either at the element reserved for zero if the
6785 * list contains 0 (that element will be set to 0), or otherwise the next
6786 * element (in which case the reserved element will be set to 1). */
6787 return (UV *) (get_invlist_zero_addr(invlist)
6788 + *get_invlist_zero_addr(invlist));
6791 PERL_STATIC_INLINE UV*
6792 S_get_invlist_len_addr(pTHX_ SV* invlist)
6794 /* Return the address of the UV that contains the current number
6795 * of used elements in the inversion list */
6797 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6799 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6802 PERL_STATIC_INLINE UV
6803 S_invlist_len(pTHX_ SV* const invlist)
6805 /* Returns the current number of elements stored in the inversion list's
6808 PERL_ARGS_ASSERT_INVLIST_LEN;
6810 return *get_invlist_len_addr(invlist);
6813 PERL_STATIC_INLINE void
6814 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6816 /* Sets the current number of elements stored in the inversion list */
6818 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6820 *get_invlist_len_addr(invlist) = len;
6822 assert(len <= SvLEN(invlist));
6824 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6825 /* If the list contains U+0000, that element is part of the header,
6826 * and should not be counted as part of the array. It will contain
6827 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6829 * SvCUR_set(invlist,
6830 * TO_INTERNAL_SIZE(len
6831 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6832 * But, this is only valid if len is not 0. The consequences of not doing
6833 * this is that the memory allocation code may think that 1 more UV is
6834 * being used than actually is, and so might do an unnecessary grow. That
6835 * seems worth not bothering to make this the precise amount.
6837 * Note that when inverting, SvCUR shouldn't change */
6840 PERL_STATIC_INLINE UV
6841 S_invlist_max(pTHX_ SV* const invlist)
6843 /* Returns the maximum number of elements storable in the inversion list's
6844 * array, without having to realloc() */
6846 PERL_ARGS_ASSERT_INVLIST_MAX;
6848 return FROM_INTERNAL_SIZE(SvLEN(invlist));
6851 PERL_STATIC_INLINE UV*
6852 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6854 /* Return the address of the UV that is reserved to hold 0 if the inversion
6855 * list contains 0. This has to be the last element of the heading, as the
6856 * list proper starts with either it if 0, or the next element if not.
6857 * (But we force it to contain either 0 or 1) */
6859 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6861 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6864 #ifndef PERL_IN_XSUB_RE
6866 Perl__new_invlist(pTHX_ IV initial_size)
6869 /* Return a pointer to a newly constructed inversion list, with enough
6870 * space to store 'initial_size' elements. If that number is negative, a
6871 * system default is used instead */
6875 if (initial_size < 0) {
6876 initial_size = INVLIST_INITIAL_LEN;
6879 /* Allocate the initial space */
6880 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6881 invlist_set_len(new_list, 0);
6883 /* Force iterinit() to be used to get iteration to work */
6884 *get_invlist_iter_addr(new_list) = UV_MAX;
6886 /* This should force a segfault if a method doesn't initialize this
6888 *get_invlist_zero_addr(new_list) = UV_MAX;
6890 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6891 #if HEADER_LENGTH != 4
6892 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6900 S__new_invlist_C_array(pTHX_ UV* list)
6902 /* Return a pointer to a newly constructed inversion list, initialized to
6903 * point to <list>, which has to be in the exact correct inversion list
6904 * form, including internal fields. Thus this is a dangerous routine that
6905 * should not be used in the wrong hands */
6907 SV* invlist = newSV_type(SVt_PV);
6909 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6911 SvPV_set(invlist, (char *) list);
6912 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6913 shouldn't touch it */
6914 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6916 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6917 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6924 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6926 /* Grow the maximum size of an inversion list */
6928 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6930 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6933 PERL_STATIC_INLINE void
6934 S_invlist_trim(pTHX_ SV* const invlist)
6936 PERL_ARGS_ASSERT_INVLIST_TRIM;
6938 /* Change the length of the inversion list to how many entries it currently
6941 SvPV_shrink_to_cur((SV *) invlist);
6944 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6946 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6947 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6949 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6952 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6954 /* Subject to change or removal. Append the range from 'start' to 'end' at
6955 * the end of the inversion list. The range must be above any existing
6959 UV max = invlist_max(invlist);
6960 UV len = invlist_len(invlist);
6962 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6964 if (len == 0) { /* Empty lists must be initialized */
6965 array = _invlist_array_init(invlist, start == 0);
6968 /* Here, the existing list is non-empty. The current max entry in the
6969 * list is generally the first value not in the set, except when the
6970 * set extends to the end of permissible values, in which case it is
6971 * the first entry in that final set, and so this call is an attempt to
6972 * append out-of-order */
6974 UV final_element = len - 1;
6975 array = invlist_array(invlist);
6976 if (array[final_element] > start
6977 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6979 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6980 array[final_element], start,
6981 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
6984 /* Here, it is a legal append. If the new range begins with the first
6985 * value not in the set, it is extending the set, so the new first
6986 * value not in the set is one greater than the newly extended range.
6988 if (array[final_element] == start) {
6989 if (end != UV_MAX) {
6990 array[final_element] = end + 1;
6993 /* But if the end is the maximum representable on the machine,
6994 * just let the range that this would extend to have no end */
6995 invlist_set_len(invlist, len - 1);
7001 /* Here the new range doesn't extend any existing set. Add it */
7003 len += 2; /* Includes an element each for the start and end of range */
7005 /* If overflows the existing space, extend, which may cause the array to be
7008 invlist_extend(invlist, len);
7009 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7010 failure in invlist_array() */
7011 array = invlist_array(invlist);
7014 invlist_set_len(invlist, len);
7017 /* The next item on the list starts the range, the one after that is
7018 * one past the new range. */
7019 array[len - 2] = start;
7020 if (end != UV_MAX) {
7021 array[len - 1] = end + 1;
7024 /* But if the end is the maximum representable on the machine, just let
7025 * the range have no end */
7026 invlist_set_len(invlist, len - 1);
7030 #ifndef PERL_IN_XSUB_RE
7033 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7035 /* Searches the inversion list for the entry that contains the input code
7036 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7037 * return value is the index into the list's array of the range that
7041 IV high = invlist_len(invlist);
7042 const UV * const array = invlist_array(invlist);
7044 PERL_ARGS_ASSERT_INVLIST_SEARCH;
7046 /* If list is empty or the code point is before the first element, return
7048 if (high == 0 || cp < array[0]) {
7052 /* Binary search. What we are looking for is <i> such that
7053 * array[i] <= cp < array[i+1]
7054 * The loop below converges on the i+1. */
7055 while (low < high) {
7056 IV mid = (low + high) / 2;
7057 if (array[mid] <= cp) {
7060 /* We could do this extra test to exit the loop early.
7061 if (cp < array[low]) {
7066 else { /* cp < array[mid] */
7075 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7077 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7078 * but is used when the swash has an inversion list. This makes this much
7079 * faster, as it uses a binary search instead of a linear one. This is
7080 * intimately tied to that function, and perhaps should be in utf8.c,
7081 * except it is intimately tied to inversion lists as well. It assumes
7082 * that <swatch> is all 0's on input */
7085 const IV len = invlist_len(invlist);
7089 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7091 if (len == 0) { /* Empty inversion list */
7095 array = invlist_array(invlist);
7097 /* Find which element it is */
7098 i = invlist_search(invlist, start);
7100 /* We populate from <start> to <end> */
7101 while (current < end) {
7104 /* The inversion list gives the results for every possible code point
7105 * after the first one in the list. Only those ranges whose index is
7106 * even are ones that the inversion list matches. For the odd ones,
7107 * and if the initial code point is not in the list, we have to skip
7108 * forward to the next element */
7109 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7111 if (i >= len) { /* Finished if beyond the end of the array */
7115 if (current >= end) { /* Finished if beyond the end of what we
7120 assert(current >= start);
7122 /* The current range ends one below the next one, except don't go past
7125 upper = (i < len && array[i] < end) ? array[i] : end;
7127 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7128 * for each code point in it */
7129 for (; current < upper; current++) {
7130 const STRLEN offset = (STRLEN)(current - start);
7131 swatch[offset >> 3] |= 1 << (offset & 7);
7134 /* Quit if at the end of the list */
7137 /* But first, have to deal with the highest possible code point on
7138 * the platform. The previous code assumes that <end> is one
7139 * beyond where we want to populate, but that is impossible at the
7140 * platform's infinity, so have to handle it specially */
7141 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7143 const STRLEN offset = (STRLEN)(end - start);
7144 swatch[offset >> 3] |= 1 << (offset & 7);
7149 /* Advance to the next range, which will be for code points not in the
7159 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7161 /* Take the union of two inversion lists and point <output> to it. *output
7162 * should be defined upon input, and if it points to one of the two lists,
7163 * the reference count to that list will be decremented. The first list,
7164 * <a>, may be NULL, in which case a copy of the second list is returned.
7165 * If <complement_b> is TRUE, the union is taken of the complement
7166 * (inversion) of <b> instead of b itself.
7168 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7169 * Richard Gillam, published by Addison-Wesley, and explained at some
7170 * length there. The preface says to incorporate its examples into your
7171 * code at your own risk.
7173 * The algorithm is like a merge sort.
7175 * XXX A potential performance improvement is to keep track as we go along
7176 * if only one of the inputs contributes to the result, meaning the other
7177 * is a subset of that one. In that case, we can skip the final copy and
7178 * return the larger of the input lists, but then outside code might need
7179 * to keep track of whether to free the input list or not */
7181 UV* array_a; /* a's array */
7183 UV len_a; /* length of a's array */
7186 SV* u; /* the resulting union */
7190 UV i_a = 0; /* current index into a's array */
7194 /* running count, as explained in the algorithm source book; items are
7195 * stopped accumulating and are output when the count changes to/from 0.
7196 * The count is incremented when we start a range that's in the set, and
7197 * decremented when we start a range that's not in the set. So its range
7198 * is 0 to 2. Only when the count is zero is something not in the set.
7202 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7205 /* If either one is empty, the union is the other one */
7206 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7213 *output = invlist_clone(b);
7215 _invlist_invert(*output);
7217 } /* else *output already = b; */
7220 else if ((len_b = invlist_len(b)) == 0) {
7225 /* The complement of an empty list is a list that has everything in it,
7226 * so the union with <a> includes everything too */
7231 *output = _new_invlist(1);
7232 _append_range_to_invlist(*output, 0, UV_MAX);
7234 else if (*output != a) {
7235 *output = invlist_clone(a);
7237 /* else *output already = a; */
7241 /* Here both lists exist and are non-empty */
7242 array_a = invlist_array(a);
7243 array_b = invlist_array(b);
7245 /* If are to take the union of 'a' with the complement of b, set it
7246 * up so are looking at b's complement. */
7249 /* To complement, we invert: if the first element is 0, remove it. To
7250 * do this, we just pretend the array starts one later, and clear the
7251 * flag as we don't have to do anything else later */
7252 if (array_b[0] == 0) {
7255 complement_b = FALSE;
7259 /* But if the first element is not zero, we unshift a 0 before the
7260 * array. The data structure reserves a space for that 0 (which
7261 * should be a '1' right now), so physical shifting is unneeded,
7262 * but temporarily change that element to 0. Before exiting the
7263 * routine, we must restore the element to '1' */
7270 /* Size the union for the worst case: that the sets are completely
7272 u = _new_invlist(len_a + len_b);
7274 /* Will contain U+0000 if either component does */
7275 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7276 || (len_b > 0 && array_b[0] == 0));
7278 /* Go through each list item by item, stopping when exhausted one of
7280 while (i_a < len_a && i_b < len_b) {
7281 UV cp; /* The element to potentially add to the union's array */
7282 bool cp_in_set; /* is it in the the input list's set or not */
7284 /* We need to take one or the other of the two inputs for the union.
7285 * Since we are merging two sorted lists, we take the smaller of the
7286 * next items. In case of a tie, we take the one that is in its set
7287 * first. If we took one not in the set first, it would decrement the
7288 * count, possibly to 0 which would cause it to be output as ending the
7289 * range, and the next time through we would take the same number, and
7290 * output it again as beginning the next range. By doing it the
7291 * opposite way, there is no possibility that the count will be
7292 * momentarily decremented to 0, and thus the two adjoining ranges will
7293 * be seamlessly merged. (In a tie and both are in the set or both not
7294 * in the set, it doesn't matter which we take first.) */
7295 if (array_a[i_a] < array_b[i_b]
7296 || (array_a[i_a] == array_b[i_b]
7297 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7299 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7303 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7307 /* Here, have chosen which of the two inputs to look at. Only output
7308 * if the running count changes to/from 0, which marks the
7309 * beginning/end of a range in that's in the set */
7312 array_u[i_u++] = cp;
7319 array_u[i_u++] = cp;
7324 /* Here, we are finished going through at least one of the lists, which
7325 * means there is something remaining in at most one. We check if the list
7326 * that hasn't been exhausted is positioned such that we are in the middle
7327 * of a range in its set or not. (i_a and i_b point to the element beyond
7328 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7329 * is potentially more to output.
7330 * There are four cases:
7331 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7332 * in the union is entirely from the non-exhausted set.
7333 * 2) Both were in their sets, count is 2. Nothing further should
7334 * be output, as everything that remains will be in the exhausted
7335 * list's set, hence in the union; decrementing to 1 but not 0 insures
7337 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7338 * Nothing further should be output because the union includes
7339 * everything from the exhausted set. Not decrementing ensures that.
7340 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7341 * decrementing to 0 insures that we look at the remainder of the
7342 * non-exhausted set */
7343 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7344 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7349 /* The final length is what we've output so far, plus what else is about to
7350 * be output. (If 'count' is non-zero, then the input list we exhausted
7351 * has everything remaining up to the machine's limit in its set, and hence
7352 * in the union, so there will be no further output. */
7355 /* At most one of the subexpressions will be non-zero */
7356 len_u += (len_a - i_a) + (len_b - i_b);
7359 /* Set result to final length, which can change the pointer to array_u, so
7361 if (len_u != invlist_len(u)) {
7362 invlist_set_len(u, len_u);
7364 array_u = invlist_array(u);
7367 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7368 * the other) ended with everything above it not in its set. That means
7369 * that the remaining part of the union is precisely the same as the
7370 * non-exhausted list, so can just copy it unchanged. (If both list were
7371 * exhausted at the same time, then the operations below will be both 0.)
7374 IV copy_count; /* At most one will have a non-zero copy count */
7375 if ((copy_count = len_a - i_a) > 0) {
7376 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7378 else if ((copy_count = len_b - i_b) > 0) {
7379 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7383 /* We may be removing a reference to one of the inputs */
7384 if (a == *output || b == *output) {
7385 SvREFCNT_dec(*output);
7388 /* If we've changed b, restore it */
7398 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7400 /* Take the intersection of two inversion lists and point <i> to it. *i
7401 * should be defined upon input, and if it points to one of the two lists,
7402 * the reference count to that list will be decremented.
7403 * If <complement_b> is TRUE, the result will be the intersection of <a>
7404 * and the complement (or inversion) of <b> instead of <b> directly.
7406 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7407 * Richard Gillam, published by Addison-Wesley, and explained at some
7408 * length there. The preface says to incorporate its examples into your
7409 * code at your own risk. In fact, it had bugs
7411 * The algorithm is like a merge sort, and is essentially the same as the
7415 UV* array_a; /* a's array */
7417 UV len_a; /* length of a's array */
7420 SV* r; /* the resulting intersection */
7424 UV i_a = 0; /* current index into a's array */
7428 /* running count, as explained in the algorithm source book; items are
7429 * stopped accumulating and are output when the count changes to/from 2.
7430 * The count is incremented when we start a range that's in the set, and
7431 * decremented when we start a range that's not in the set. So its range
7432 * is 0 to 2. Only when the count is 2 is something in the intersection.
7436 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7439 /* Special case if either one is empty */
7440 len_a = invlist_len(a);
7441 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7443 if (len_a != 0 && complement_b) {
7445 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7446 * be empty. Here, also we are using 'b's complement, which hence
7447 * must be every possible code point. Thus the intersection is
7450 *i = invlist_clone(a);
7456 /* else *i is already 'a' */
7460 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7461 * intersection must be empty */
7468 *i = _new_invlist(0);
7472 /* Here both lists exist and are non-empty */
7473 array_a = invlist_array(a);
7474 array_b = invlist_array(b);
7476 /* If are to take the intersection of 'a' with the complement of b, set it
7477 * up so are looking at b's complement. */
7480 /* To complement, we invert: if the first element is 0, remove it. To
7481 * do this, we just pretend the array starts one later, and clear the
7482 * flag as we don't have to do anything else later */
7483 if (array_b[0] == 0) {
7486 complement_b = FALSE;
7490 /* But if the first element is not zero, we unshift a 0 before the
7491 * array. The data structure reserves a space for that 0 (which
7492 * should be a '1' right now), so physical shifting is unneeded,
7493 * but temporarily change that element to 0. Before exiting the
7494 * routine, we must restore the element to '1' */
7501 /* Size the intersection for the worst case: that the intersection ends up
7502 * fragmenting everything to be completely disjoint */
7503 r= _new_invlist(len_a + len_b);
7505 /* Will contain U+0000 iff both components do */
7506 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7507 && len_b > 0 && array_b[0] == 0);
7509 /* Go through each list item by item, stopping when exhausted one of
7511 while (i_a < len_a && i_b < len_b) {
7512 UV cp; /* The element to potentially add to the intersection's
7514 bool cp_in_set; /* Is it in the input list's set or not */
7516 /* We need to take one or the other of the two inputs for the
7517 * intersection. Since we are merging two sorted lists, we take the
7518 * smaller of the next items. In case of a tie, we take the one that
7519 * is not in its set first (a difference from the union algorithm). If
7520 * we took one in the set first, it would increment the count, possibly
7521 * to 2 which would cause it to be output as starting a range in the
7522 * intersection, and the next time through we would take that same
7523 * number, and output it again as ending the set. By doing it the
7524 * opposite of this, there is no possibility that the count will be
7525 * momentarily incremented to 2. (In a tie and both are in the set or
7526 * both not in the set, it doesn't matter which we take first.) */
7527 if (array_a[i_a] < array_b[i_b]
7528 || (array_a[i_a] == array_b[i_b]
7529 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7531 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7535 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7539 /* Here, have chosen which of the two inputs to look at. Only output
7540 * if the running count changes to/from 2, which marks the
7541 * beginning/end of a range that's in the intersection */
7545 array_r[i_r++] = cp;
7550 array_r[i_r++] = cp;
7556 /* Here, we are finished going through at least one of the lists, which
7557 * means there is something remaining in at most one. We check if the list
7558 * that has been exhausted is positioned such that we are in the middle
7559 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7560 * the ones we care about.) There are four cases:
7561 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7562 * nothing left in the intersection.
7563 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7564 * above 2. What should be output is exactly that which is in the
7565 * non-exhausted set, as everything it has is also in the intersection
7566 * set, and everything it doesn't have can't be in the intersection
7567 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7568 * gets incremented to 2. Like the previous case, the intersection is
7569 * everything that remains in the non-exhausted set.
7570 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7571 * remains 1. And the intersection has nothing more. */
7572 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7573 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7578 /* The final length is what we've output so far plus what else is in the
7579 * intersection. At most one of the subexpressions below will be non-zero */
7582 len_r += (len_a - i_a) + (len_b - i_b);
7585 /* Set result to final length, which can change the pointer to array_r, so
7587 if (len_r != invlist_len(r)) {
7588 invlist_set_len(r, len_r);
7590 array_r = invlist_array(r);
7593 /* Finish outputting any remaining */
7594 if (count >= 2) { /* At most one will have a non-zero copy count */
7596 if ((copy_count = len_a - i_a) > 0) {
7597 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7599 else if ((copy_count = len_b - i_b) > 0) {
7600 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7604 /* We may be removing a reference to one of the inputs */
7605 if (a == *i || b == *i) {
7609 /* If we've changed b, restore it */
7619 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7621 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7622 * set. A pointer to the inversion list is returned. This may actually be
7623 * a new list, in which case the passed in one has been destroyed. The
7624 * passed in inversion list can be NULL, in which case a new one is created
7625 * with just the one range in it */
7630 if (invlist == NULL) {
7631 invlist = _new_invlist(2);
7635 len = invlist_len(invlist);
7638 /* If comes after the final entry, can just append it to the end */
7640 || start >= invlist_array(invlist)
7641 [invlist_len(invlist) - 1])
7643 _append_range_to_invlist(invlist, start, end);
7647 /* Here, can't just append things, create and return a new inversion list
7648 * which is the union of this range and the existing inversion list */
7649 range_invlist = _new_invlist(2);
7650 _append_range_to_invlist(range_invlist, start, end);
7652 _invlist_union(invlist, range_invlist, &invlist);
7654 /* The temporary can be freed */
7655 SvREFCNT_dec(range_invlist);
7662 PERL_STATIC_INLINE SV*
7663 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7664 return _add_range_to_invlist(invlist, cp, cp);
7667 #ifndef PERL_IN_XSUB_RE
7669 Perl__invlist_invert(pTHX_ SV* const invlist)
7671 /* Complement the input inversion list. This adds a 0 if the list didn't
7672 * have a zero; removes it otherwise. As described above, the data
7673 * structure is set up so that this is very efficient */
7675 UV* len_pos = get_invlist_len_addr(invlist);
7677 PERL_ARGS_ASSERT__INVLIST_INVERT;
7679 /* The inverse of matching nothing is matching everything */
7680 if (*len_pos == 0) {
7681 _append_range_to_invlist(invlist, 0, UV_MAX);
7685 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7686 * zero element was a 0, so it is being removed, so the length decrements
7687 * by 1; and vice-versa. SvCUR is unaffected */
7688 if (*get_invlist_zero_addr(invlist) ^= 1) {
7697 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7699 /* Complement the input inversion list (which must be a Unicode property,
7700 * all of which don't match above the Unicode maximum code point.) And
7701 * Perl has chosen to not have the inversion match above that either. This
7702 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7708 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7710 _invlist_invert(invlist);
7712 len = invlist_len(invlist);
7714 if (len != 0) { /* If empty do nothing */
7715 array = invlist_array(invlist);
7716 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7717 /* Add 0x110000. First, grow if necessary */
7719 if (invlist_max(invlist) < len) {
7720 invlist_extend(invlist, len);
7721 array = invlist_array(invlist);
7723 invlist_set_len(invlist, len);
7724 array[len - 1] = PERL_UNICODE_MAX + 1;
7726 else { /* Remove the 0x110000 */
7727 invlist_set_len(invlist, len - 1);
7735 PERL_STATIC_INLINE SV*
7736 S_invlist_clone(pTHX_ SV* const invlist)
7739 /* Return a new inversion list that is a copy of the input one, which is
7742 /* Need to allocate extra space to accommodate Perl's addition of a
7743 * trailing NUL to SvPV's, since it thinks they are always strings */
7744 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7745 STRLEN length = SvCUR(invlist);
7747 PERL_ARGS_ASSERT_INVLIST_CLONE;
7749 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7750 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7755 PERL_STATIC_INLINE UV*
7756 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7758 /* Return the address of the UV that contains the current iteration
7761 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7763 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7766 PERL_STATIC_INLINE UV*
7767 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7769 /* Return the address of the UV that contains the version id. */
7771 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7773 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7776 PERL_STATIC_INLINE void
7777 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7779 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7781 *get_invlist_iter_addr(invlist) = 0;
7785 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7787 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7788 * This call sets in <*start> and <*end>, the next range in <invlist>.
7789 * Returns <TRUE> if successful and the next call will return the next
7790 * range; <FALSE> if was already at the end of the list. If the latter,
7791 * <*start> and <*end> are unchanged, and the next call to this function
7792 * will start over at the beginning of the list */
7794 UV* pos = get_invlist_iter_addr(invlist);
7795 UV len = invlist_len(invlist);
7798 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7801 *pos = UV_MAX; /* Force iternit() to be required next time */
7805 array = invlist_array(invlist);
7807 *start = array[(*pos)++];
7813 *end = array[(*pos)++] - 1;
7819 #ifndef PERL_IN_XSUB_RE
7821 Perl__invlist_contents(pTHX_ SV* const invlist)
7823 /* Get the contents of an inversion list into a string SV so that they can
7824 * be printed out. It uses the format traditionally done for debug tracing
7828 SV* output = newSVpvs("\n");
7830 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7832 invlist_iterinit(invlist);
7833 while (invlist_iternext(invlist, &start, &end)) {
7834 if (end == UV_MAX) {
7835 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7837 else if (end != start) {
7838 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7842 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7852 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7854 /* Dumps out the ranges in an inversion list. The string 'header'
7855 * if present is output on a line before the first range */
7859 if (header && strlen(header)) {
7860 PerlIO_printf(Perl_debug_log, "%s\n", header);
7862 invlist_iterinit(invlist);
7863 while (invlist_iternext(invlist, &start, &end)) {
7864 if (end == UV_MAX) {
7865 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7868 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7874 #undef HEADER_LENGTH
7875 #undef INVLIST_INITIAL_LENGTH
7876 #undef TO_INTERNAL_SIZE
7877 #undef FROM_INTERNAL_SIZE
7878 #undef INVLIST_LEN_OFFSET
7879 #undef INVLIST_ZERO_OFFSET
7880 #undef INVLIST_ITER_OFFSET
7881 #undef INVLIST_VERSION_ID
7883 /* End of inversion list object */
7886 - reg - regular expression, i.e. main body or parenthesized thing
7888 * Caller must absorb opening parenthesis.
7890 * Combining parenthesis handling with the base level of regular expression
7891 * is a trifle forced, but the need to tie the tails of the branches to what
7892 * follows makes it hard to avoid.
7894 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7896 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7898 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7902 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7903 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7906 register regnode *ret; /* Will be the head of the group. */
7907 register regnode *br;
7908 register regnode *lastbr;
7909 register regnode *ender = NULL;
7910 register I32 parno = 0;
7912 U32 oregflags = RExC_flags;
7913 bool have_branch = 0;
7915 I32 freeze_paren = 0;
7916 I32 after_freeze = 0;
7918 /* for (?g), (?gc), and (?o) warnings; warning
7919 about (?c) will warn about (?g) -- japhy */
7921 #define WASTED_O 0x01
7922 #define WASTED_G 0x02
7923 #define WASTED_C 0x04
7924 #define WASTED_GC (0x02|0x04)
7925 I32 wastedflags = 0x00;
7927 char * parse_start = RExC_parse; /* MJD */
7928 char * const oregcomp_parse = RExC_parse;
7930 GET_RE_DEBUG_FLAGS_DECL;
7932 PERL_ARGS_ASSERT_REG;
7933 DEBUG_PARSE("reg ");
7935 *flagp = 0; /* Tentatively. */
7938 /* Make an OPEN node, if parenthesized. */
7940 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7941 char *start_verb = RExC_parse;
7942 STRLEN verb_len = 0;
7943 char *start_arg = NULL;
7944 unsigned char op = 0;
7946 int internal_argval = 0; /* internal_argval is only useful if !argok */
7947 while ( *RExC_parse && *RExC_parse != ')' ) {
7948 if ( *RExC_parse == ':' ) {
7949 start_arg = RExC_parse + 1;
7955 verb_len = RExC_parse - start_verb;
7958 while ( *RExC_parse && *RExC_parse != ')' )
7960 if ( *RExC_parse != ')' )
7961 vFAIL("Unterminated verb pattern argument");
7962 if ( RExC_parse == start_arg )
7965 if ( *RExC_parse != ')' )
7966 vFAIL("Unterminated verb pattern");
7969 switch ( *start_verb ) {
7970 case 'A': /* (*ACCEPT) */
7971 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7973 internal_argval = RExC_nestroot;
7976 case 'C': /* (*COMMIT) */
7977 if ( memEQs(start_verb,verb_len,"COMMIT") )
7980 case 'F': /* (*FAIL) */
7981 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
7986 case ':': /* (*:NAME) */
7987 case 'M': /* (*MARK:NAME) */
7988 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
7993 case 'P': /* (*PRUNE) */
7994 if ( memEQs(start_verb,verb_len,"PRUNE") )
7997 case 'S': /* (*SKIP) */
7998 if ( memEQs(start_verb,verb_len,"SKIP") )
8001 case 'T': /* (*THEN) */
8002 /* [19:06] <TimToady> :: is then */
8003 if ( memEQs(start_verb,verb_len,"THEN") ) {
8005 RExC_seen |= REG_SEEN_CUTGROUP;
8011 vFAIL3("Unknown verb pattern '%.*s'",
8012 verb_len, start_verb);
8015 if ( start_arg && internal_argval ) {
8016 vFAIL3("Verb pattern '%.*s' may not have an argument",
8017 verb_len, start_verb);
8018 } else if ( argok < 0 && !start_arg ) {
8019 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8020 verb_len, start_verb);
8022 ret = reganode(pRExC_state, op, internal_argval);
8023 if ( ! internal_argval && ! SIZE_ONLY ) {
8025 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8026 ARG(ret) = add_data( pRExC_state, 1, "S" );
8027 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8034 if (!internal_argval)
8035 RExC_seen |= REG_SEEN_VERBARG;
8036 } else if ( start_arg ) {
8037 vFAIL3("Verb pattern '%.*s' may not have an argument",
8038 verb_len, start_verb);
8040 ret = reg_node(pRExC_state, op);
8042 nextchar(pRExC_state);
8045 if (*RExC_parse == '?') { /* (?...) */
8046 bool is_logical = 0;
8047 const char * const seqstart = RExC_parse;
8048 bool has_use_defaults = FALSE;
8051 paren = *RExC_parse++;
8052 ret = NULL; /* For look-ahead/behind. */
8055 case 'P': /* (?P...) variants for those used to PCRE/Python */
8056 paren = *RExC_parse++;
8057 if ( paren == '<') /* (?P<...>) named capture */
8059 else if (paren == '>') { /* (?P>name) named recursion */
8060 goto named_recursion;
8062 else if (paren == '=') { /* (?P=...) named backref */
8063 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8064 you change this make sure you change that */
8065 char* name_start = RExC_parse;
8067 SV *sv_dat = reg_scan_name(pRExC_state,
8068 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8069 if (RExC_parse == name_start || *RExC_parse != ')')
8070 vFAIL2("Sequence %.3s... not terminated",parse_start);
8073 num = add_data( pRExC_state, 1, "S" );
8074 RExC_rxi->data->data[num]=(void*)sv_dat;
8075 SvREFCNT_inc_simple_void(sv_dat);
8078 ret = reganode(pRExC_state,
8081 : (MORE_ASCII_RESTRICTED)
8083 : (AT_LEAST_UNI_SEMANTICS)
8091 Set_Node_Offset(ret, parse_start+1);
8092 Set_Node_Cur_Length(ret); /* MJD */
8094 nextchar(pRExC_state);
8098 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8100 case '<': /* (?<...) */
8101 if (*RExC_parse == '!')
8103 else if (*RExC_parse != '=')
8109 case '\'': /* (?'...') */
8110 name_start= RExC_parse;
8111 svname = reg_scan_name(pRExC_state,
8112 SIZE_ONLY ? /* reverse test from the others */
8113 REG_RSN_RETURN_NAME :
8114 REG_RSN_RETURN_NULL);
8115 if (RExC_parse == name_start) {
8117 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8120 if (*RExC_parse != paren)
8121 vFAIL2("Sequence (?%c... not terminated",
8122 paren=='>' ? '<' : paren);
8126 if (!svname) /* shouldn't happen */
8128 "panic: reg_scan_name returned NULL");
8129 if (!RExC_paren_names) {
8130 RExC_paren_names= newHV();
8131 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8133 RExC_paren_name_list= newAV();
8134 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8137 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8139 sv_dat = HeVAL(he_str);
8141 /* croak baby croak */
8143 "panic: paren_name hash element allocation failed");
8144 } else if ( SvPOK(sv_dat) ) {
8145 /* (?|...) can mean we have dupes so scan to check
8146 its already been stored. Maybe a flag indicating
8147 we are inside such a construct would be useful,
8148 but the arrays are likely to be quite small, so
8149 for now we punt -- dmq */
8150 IV count = SvIV(sv_dat);
8151 I32 *pv = (I32*)SvPVX(sv_dat);
8153 for ( i = 0 ; i < count ; i++ ) {
8154 if ( pv[i] == RExC_npar ) {
8160 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8161 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8162 pv[count] = RExC_npar;
8163 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8166 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8167 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8169 SvIV_set(sv_dat, 1);
8172 /* Yes this does cause a memory leak in debugging Perls */
8173 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8174 SvREFCNT_dec(svname);
8177 /*sv_dump(sv_dat);*/
8179 nextchar(pRExC_state);
8181 goto capturing_parens;
8183 RExC_seen |= REG_SEEN_LOOKBEHIND;
8184 RExC_in_lookbehind++;
8186 case '=': /* (?=...) */
8187 RExC_seen_zerolen++;
8189 case '!': /* (?!...) */
8190 RExC_seen_zerolen++;
8191 if (*RExC_parse == ')') {
8192 ret=reg_node(pRExC_state, OPFAIL);
8193 nextchar(pRExC_state);
8197 case '|': /* (?|...) */
8198 /* branch reset, behave like a (?:...) except that
8199 buffers in alternations share the same numbers */
8201 after_freeze = freeze_paren = RExC_npar;
8203 case ':': /* (?:...) */
8204 case '>': /* (?>...) */
8206 case '$': /* (?$...) */
8207 case '@': /* (?@...) */
8208 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8210 case '#': /* (?#...) */
8211 while (*RExC_parse && *RExC_parse != ')')
8213 if (*RExC_parse != ')')
8214 FAIL("Sequence (?#... not terminated");
8215 nextchar(pRExC_state);
8218 case '0' : /* (?0) */
8219 case 'R' : /* (?R) */
8220 if (*RExC_parse != ')')
8221 FAIL("Sequence (?R) not terminated");
8222 ret = reg_node(pRExC_state, GOSTART);
8223 *flagp |= POSTPONED;
8224 nextchar(pRExC_state);
8227 { /* named and numeric backreferences */
8229 case '&': /* (?&NAME) */
8230 parse_start = RExC_parse - 1;
8233 SV *sv_dat = reg_scan_name(pRExC_state,
8234 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8235 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8237 goto gen_recurse_regop;
8240 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8242 vFAIL("Illegal pattern");
8244 goto parse_recursion;
8246 case '-': /* (?-1) */
8247 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8248 RExC_parse--; /* rewind to let it be handled later */
8252 case '1': case '2': case '3': case '4': /* (?1) */
8253 case '5': case '6': case '7': case '8': case '9':
8256 num = atoi(RExC_parse);
8257 parse_start = RExC_parse - 1; /* MJD */
8258 if (*RExC_parse == '-')
8260 while (isDIGIT(*RExC_parse))
8262 if (*RExC_parse!=')')
8263 vFAIL("Expecting close bracket");
8266 if ( paren == '-' ) {
8268 Diagram of capture buffer numbering.
8269 Top line is the normal capture buffer numbers
8270 Bottom line is the negative indexing as from
8274 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8278 num = RExC_npar + num;
8281 vFAIL("Reference to nonexistent group");
8283 } else if ( paren == '+' ) {
8284 num = RExC_npar + num - 1;
8287 ret = reganode(pRExC_state, GOSUB, num);
8289 if (num > (I32)RExC_rx->nparens) {
8291 vFAIL("Reference to nonexistent group");
8293 ARG2L_SET( ret, RExC_recurse_count++);
8295 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8296 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8300 RExC_seen |= REG_SEEN_RECURSE;
8301 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8302 Set_Node_Offset(ret, parse_start); /* MJD */
8304 *flagp |= POSTPONED;
8305 nextchar(pRExC_state);
8307 } /* named and numeric backreferences */
8310 case '?': /* (??...) */
8312 if (*RExC_parse != '{') {
8314 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8317 *flagp |= POSTPONED;
8318 paren = *RExC_parse++;
8320 case '{': /* (?{...}) */
8325 char *s = RExC_parse;
8327 RExC_seen_zerolen++;
8328 RExC_seen |= REG_SEEN_EVAL;
8330 if ( pRExC_state->num_code_blocks
8331 && pRExC_state->code_index < pRExC_state->num_code_blocks
8332 && pRExC_state->code_blocks[pRExC_state->code_index].start
8333 == (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8336 /* this is a pre-compiled literal (?{}) */
8337 struct reg_code_block *cb =
8338 &pRExC_state->code_blocks[pRExC_state->code_index];
8339 RExC_parse = RExC_start + cb->end;
8344 if (cb->src_regex) {
8345 n = add_data(pRExC_state, 2, "rl");
8346 RExC_rxi->data->data[n] =
8347 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8348 RExC_rxi->data->data[n+1] = (void*)o->op_next;
8351 n = add_data(pRExC_state, 1,
8352 (RExC_flags & PMf_HAS_CV) ? "L" : "l");
8353 RExC_rxi->data->data[n] = (void*)o->op_next;
8356 pRExC_state->code_index++;
8359 while (count && (c = *RExC_parse)) {
8370 if (*RExC_parse != ')') {
8372 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
8376 OP_4tree *sop, *rop;
8377 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
8380 Perl_save_re_context(aTHX);
8381 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
8382 sop->op_private |= OPpREFCOUNTED;
8383 /* re_dup will OpREFCNT_inc */
8384 OpREFCNT_set(sop, 1);
8387 n = add_data(pRExC_state, 3, "nop");
8388 RExC_rxi->data->data[n] = (void*)rop;
8389 RExC_rxi->data->data[n+1] = (void*)sop;
8390 RExC_rxi->data->data[n+2] = (void*)pad;
8393 else { /* First pass */
8394 if (PL_reginterp_cnt < ++RExC_seen_evals
8396 /* No compiled RE interpolated, has runtime
8397 components ===> unsafe. */
8398 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8399 if (PL_tainting && PL_tainted)
8400 FAIL("Eval-group in insecure regular expression");
8401 #if PERL_VERSION > 8
8402 if (IN_PERL_COMPILETIME)
8407 nextchar(pRExC_state);
8410 ret = reg_node(pRExC_state, LOGICAL);
8413 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
8414 /* deal with the length of this later - MJD */
8417 ret = reganode(pRExC_state, EVAL, n);
8418 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8419 Set_Node_Offset(ret, parse_start);
8422 case '(': /* (?(?{...})...) and (?(?=...)...) */
8425 if (RExC_parse[0] == '?') { /* (?(?...)) */
8426 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8427 || RExC_parse[1] == '<'
8428 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8431 ret = reg_node(pRExC_state, LOGICAL);
8434 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8438 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8439 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8441 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8442 char *name_start= RExC_parse++;
8444 SV *sv_dat=reg_scan_name(pRExC_state,
8445 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8446 if (RExC_parse == name_start || *RExC_parse != ch)
8447 vFAIL2("Sequence (?(%c... not terminated",
8448 (ch == '>' ? '<' : ch));
8451 num = add_data( pRExC_state, 1, "S" );
8452 RExC_rxi->data->data[num]=(void*)sv_dat;
8453 SvREFCNT_inc_simple_void(sv_dat);
8455 ret = reganode(pRExC_state,NGROUPP,num);
8456 goto insert_if_check_paren;
8458 else if (RExC_parse[0] == 'D' &&
8459 RExC_parse[1] == 'E' &&
8460 RExC_parse[2] == 'F' &&
8461 RExC_parse[3] == 'I' &&
8462 RExC_parse[4] == 'N' &&
8463 RExC_parse[5] == 'E')
8465 ret = reganode(pRExC_state,DEFINEP,0);
8468 goto insert_if_check_paren;
8470 else if (RExC_parse[0] == 'R') {
8473 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8474 parno = atoi(RExC_parse++);
8475 while (isDIGIT(*RExC_parse))
8477 } else if (RExC_parse[0] == '&') {
8480 sv_dat = reg_scan_name(pRExC_state,
8481 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8482 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8484 ret = reganode(pRExC_state,INSUBP,parno);
8485 goto insert_if_check_paren;
8487 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8490 parno = atoi(RExC_parse++);
8492 while (isDIGIT(*RExC_parse))
8494 ret = reganode(pRExC_state, GROUPP, parno);
8496 insert_if_check_paren:
8497 if ((c = *nextchar(pRExC_state)) != ')')
8498 vFAIL("Switch condition not recognized");
8500 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8501 br = regbranch(pRExC_state, &flags, 1,depth+1);
8503 br = reganode(pRExC_state, LONGJMP, 0);
8505 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8506 c = *nextchar(pRExC_state);
8511 vFAIL("(?(DEFINE)....) does not allow branches");
8512 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8513 regbranch(pRExC_state, &flags, 1,depth+1);
8514 REGTAIL(pRExC_state, ret, lastbr);
8517 c = *nextchar(pRExC_state);
8522 vFAIL("Switch (?(condition)... contains too many branches");
8523 ender = reg_node(pRExC_state, TAIL);
8524 REGTAIL(pRExC_state, br, ender);
8526 REGTAIL(pRExC_state, lastbr, ender);
8527 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8530 REGTAIL(pRExC_state, ret, ender);
8531 RExC_size++; /* XXX WHY do we need this?!!
8532 For large programs it seems to be required
8533 but I can't figure out why. -- dmq*/
8537 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8541 RExC_parse--; /* for vFAIL to print correctly */
8542 vFAIL("Sequence (? incomplete");
8544 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8546 has_use_defaults = TRUE;
8547 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8548 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8549 ? REGEX_UNICODE_CHARSET
8550 : REGEX_DEPENDS_CHARSET);
8554 parse_flags: /* (?i) */
8556 U32 posflags = 0, negflags = 0;
8557 U32 *flagsp = &posflags;
8558 char has_charset_modifier = '\0';
8559 regex_charset cs = get_regex_charset(RExC_flags);
8560 if (cs == REGEX_DEPENDS_CHARSET
8561 && (RExC_utf8 || RExC_uni_semantics))
8563 cs = REGEX_UNICODE_CHARSET;
8566 while (*RExC_parse) {
8567 /* && strchr("iogcmsx", *RExC_parse) */
8568 /* (?g), (?gc) and (?o) are useless here
8569 and must be globally applied -- japhy */
8570 switch (*RExC_parse) {
8571 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8572 case LOCALE_PAT_MOD:
8573 if (has_charset_modifier) {
8574 goto excess_modifier;
8576 else if (flagsp == &negflags) {
8579 cs = REGEX_LOCALE_CHARSET;
8580 has_charset_modifier = LOCALE_PAT_MOD;
8581 RExC_contains_locale = 1;
8583 case UNICODE_PAT_MOD:
8584 if (has_charset_modifier) {
8585 goto excess_modifier;
8587 else if (flagsp == &negflags) {
8590 cs = REGEX_UNICODE_CHARSET;
8591 has_charset_modifier = UNICODE_PAT_MOD;
8593 case ASCII_RESTRICT_PAT_MOD:
8594 if (flagsp == &negflags) {
8597 if (has_charset_modifier) {
8598 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8599 goto excess_modifier;
8601 /* Doubled modifier implies more restricted */
8602 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8605 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8607 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8609 case DEPENDS_PAT_MOD:
8610 if (has_use_defaults) {
8611 goto fail_modifiers;
8613 else if (flagsp == &negflags) {
8616 else if (has_charset_modifier) {
8617 goto excess_modifier;
8620 /* The dual charset means unicode semantics if the
8621 * pattern (or target, not known until runtime) are
8622 * utf8, or something in the pattern indicates unicode
8624 cs = (RExC_utf8 || RExC_uni_semantics)
8625 ? REGEX_UNICODE_CHARSET
8626 : REGEX_DEPENDS_CHARSET;
8627 has_charset_modifier = DEPENDS_PAT_MOD;
8631 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8632 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8634 else if (has_charset_modifier == *(RExC_parse - 1)) {
8635 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8638 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8643 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8645 case ONCE_PAT_MOD: /* 'o' */
8646 case GLOBAL_PAT_MOD: /* 'g' */
8647 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8648 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8649 if (! (wastedflags & wflagbit) ) {
8650 wastedflags |= wflagbit;
8653 "Useless (%s%c) - %suse /%c modifier",
8654 flagsp == &negflags ? "?-" : "?",
8656 flagsp == &negflags ? "don't " : "",
8663 case CONTINUE_PAT_MOD: /* 'c' */
8664 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8665 if (! (wastedflags & WASTED_C) ) {
8666 wastedflags |= WASTED_GC;
8669 "Useless (%sc) - %suse /gc modifier",
8670 flagsp == &negflags ? "?-" : "?",
8671 flagsp == &negflags ? "don't " : ""
8676 case KEEPCOPY_PAT_MOD: /* 'p' */
8677 if (flagsp == &negflags) {
8679 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8681 *flagsp |= RXf_PMf_KEEPCOPY;
8685 /* A flag is a default iff it is following a minus, so
8686 * if there is a minus, it means will be trying to
8687 * re-specify a default which is an error */
8688 if (has_use_defaults || flagsp == &negflags) {
8691 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8695 wastedflags = 0; /* reset so (?g-c) warns twice */
8701 RExC_flags |= posflags;
8702 RExC_flags &= ~negflags;
8703 set_regex_charset(&RExC_flags, cs);
8705 oregflags |= posflags;
8706 oregflags &= ~negflags;
8707 set_regex_charset(&oregflags, cs);
8709 nextchar(pRExC_state);
8720 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8725 }} /* one for the default block, one for the switch */
8732 ret = reganode(pRExC_state, OPEN, parno);
8735 RExC_nestroot = parno;
8736 if (RExC_seen & REG_SEEN_RECURSE
8737 && !RExC_open_parens[parno-1])
8739 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8740 "Setting open paren #%"IVdf" to %d\n",
8741 (IV)parno, REG_NODE_NUM(ret)));
8742 RExC_open_parens[parno-1]= ret;
8745 Set_Node_Length(ret, 1); /* MJD */
8746 Set_Node_Offset(ret, RExC_parse); /* MJD */
8754 /* Pick up the branches, linking them together. */
8755 parse_start = RExC_parse; /* MJD */
8756 br = regbranch(pRExC_state, &flags, 1,depth+1);
8758 /* branch_len = (paren != 0); */
8762 if (*RExC_parse == '|') {
8763 if (!SIZE_ONLY && RExC_extralen) {
8764 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8767 reginsert(pRExC_state, BRANCH, br, depth+1);
8768 Set_Node_Length(br, paren != 0);
8769 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8773 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
8775 else if (paren == ':') {
8776 *flagp |= flags&SIMPLE;
8778 if (is_open) { /* Starts with OPEN. */
8779 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
8781 else if (paren != '?') /* Not Conditional */
8783 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8785 while (*RExC_parse == '|') {
8786 if (!SIZE_ONLY && RExC_extralen) {
8787 ender = reganode(pRExC_state, LONGJMP,0);
8788 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8791 RExC_extralen += 2; /* Account for LONGJMP. */
8792 nextchar(pRExC_state);
8794 if (RExC_npar > after_freeze)
8795 after_freeze = RExC_npar;
8796 RExC_npar = freeze_paren;
8798 br = regbranch(pRExC_state, &flags, 0, depth+1);
8802 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
8804 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8807 if (have_branch || paren != ':') {
8808 /* Make a closing node, and hook it on the end. */
8811 ender = reg_node(pRExC_state, TAIL);
8814 ender = reganode(pRExC_state, CLOSE, parno);
8815 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8816 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8817 "Setting close paren #%"IVdf" to %d\n",
8818 (IV)parno, REG_NODE_NUM(ender)));
8819 RExC_close_parens[parno-1]= ender;
8820 if (RExC_nestroot == parno)
8823 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8824 Set_Node_Length(ender,1); /* MJD */
8830 *flagp &= ~HASWIDTH;
8833 ender = reg_node(pRExC_state, SUCCEED);
8836 ender = reg_node(pRExC_state, END);
8838 assert(!RExC_opend); /* there can only be one! */
8843 DEBUG_PARSE_r(if (!SIZE_ONLY) {
8844 SV * const mysv_val1=sv_newmortal();
8845 SV * const mysv_val2=sv_newmortal();
8846 DEBUG_PARSE_MSG("lsbr");
8847 regprop(RExC_rx, mysv_val1, lastbr);
8848 regprop(RExC_rx, mysv_val2, ender);
8849 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8850 SvPV_nolen_const(mysv_val1),
8851 (IV)REG_NODE_NUM(lastbr),
8852 SvPV_nolen_const(mysv_val2),
8853 (IV)REG_NODE_NUM(ender),
8854 (IV)(ender - lastbr)
8857 REGTAIL(pRExC_state, lastbr, ender);
8859 if (have_branch && !SIZE_ONLY) {
8862 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8864 /* Hook the tails of the branches to the closing node. */
8865 for (br = ret; br; br = regnext(br)) {
8866 const U8 op = PL_regkind[OP(br)];
8868 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8869 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
8872 else if (op == BRANCHJ) {
8873 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8874 /* for now we always disable this optimisation * /
8875 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
8881 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
8882 DEBUG_PARSE_r(if (!SIZE_ONLY) {
8883 SV * const mysv_val1=sv_newmortal();
8884 SV * const mysv_val2=sv_newmortal();
8885 DEBUG_PARSE_MSG("NADA");
8886 regprop(RExC_rx, mysv_val1, ret);
8887 regprop(RExC_rx, mysv_val2, ender);
8888 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8889 SvPV_nolen_const(mysv_val1),
8890 (IV)REG_NODE_NUM(ret),
8891 SvPV_nolen_const(mysv_val2),
8892 (IV)REG_NODE_NUM(ender),
8897 if (OP(ender) == TAIL) {
8902 for ( opt= br + 1; opt < ender ; opt++ )
8904 NEXT_OFF(br)= ender - br;
8912 static const char parens[] = "=!<,>";
8914 if (paren && (p = strchr(parens, paren))) {
8915 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8916 int flag = (p - parens) > 1;
8919 node = SUSPEND, flag = 0;
8920 reginsert(pRExC_state, node,ret, depth+1);
8921 Set_Node_Cur_Length(ret);
8922 Set_Node_Offset(ret, parse_start + 1);
8924 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8928 /* Check for proper termination. */
8930 RExC_flags = oregflags;
8931 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8932 RExC_parse = oregcomp_parse;
8933 vFAIL("Unmatched (");
8936 else if (!paren && RExC_parse < RExC_end) {
8937 if (*RExC_parse == ')') {
8939 vFAIL("Unmatched )");
8942 FAIL("Junk on end of regexp"); /* "Can't happen". */
8946 if (RExC_in_lookbehind) {
8947 RExC_in_lookbehind--;
8949 if (after_freeze > RExC_npar)
8950 RExC_npar = after_freeze;
8955 - regbranch - one alternative of an | operator
8957 * Implements the concatenation operator.
8960 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8963 register regnode *ret;
8964 register regnode *chain = NULL;
8965 register regnode *latest;
8966 I32 flags = 0, c = 0;
8967 GET_RE_DEBUG_FLAGS_DECL;
8969 PERL_ARGS_ASSERT_REGBRANCH;
8971 DEBUG_PARSE("brnc");
8976 if (!SIZE_ONLY && RExC_extralen)
8977 ret = reganode(pRExC_state, BRANCHJ,0);
8979 ret = reg_node(pRExC_state, BRANCH);
8980 Set_Node_Length(ret, 1);
8984 if (!first && SIZE_ONLY)
8985 RExC_extralen += 1; /* BRANCHJ */
8987 *flagp = WORST; /* Tentatively. */
8990 nextchar(pRExC_state);
8991 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
8993 latest = regpiece(pRExC_state, &flags,depth+1);
8994 if (latest == NULL) {
8995 if (flags & TRYAGAIN)
8999 else if (ret == NULL)
9001 *flagp |= flags&(HASWIDTH|POSTPONED);
9002 if (chain == NULL) /* First piece. */
9003 *flagp |= flags&SPSTART;
9006 REGTAIL(pRExC_state, chain, latest);
9011 if (chain == NULL) { /* Loop ran zero times. */
9012 chain = reg_node(pRExC_state, NOTHING);
9017 *flagp |= flags&SIMPLE;
9024 - regpiece - something followed by possible [*+?]
9026 * Note that the branching code sequences used for ? and the general cases
9027 * of * and + are somewhat optimized: they use the same NOTHING node as
9028 * both the endmarker for their branch list and the body of the last branch.
9029 * It might seem that this node could be dispensed with entirely, but the
9030 * endmarker role is not redundant.
9033 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9036 register regnode *ret;
9038 register char *next;
9040 const char * const origparse = RExC_parse;
9042 I32 max = REG_INFTY;
9043 #ifdef RE_TRACK_PATTERN_OFFSETS
9046 const char *maxpos = NULL;
9047 GET_RE_DEBUG_FLAGS_DECL;
9049 PERL_ARGS_ASSERT_REGPIECE;
9051 DEBUG_PARSE("piec");
9053 ret = regatom(pRExC_state, &flags,depth+1);
9055 if (flags & TRYAGAIN)
9062 if (op == '{' && regcurly(RExC_parse)) {
9064 #ifdef RE_TRACK_PATTERN_OFFSETS
9065 parse_start = RExC_parse; /* MJD */
9067 next = RExC_parse + 1;
9068 while (isDIGIT(*next) || *next == ',') {
9077 if (*next == '}') { /* got one */
9081 min = atoi(RExC_parse);
9085 maxpos = RExC_parse;
9087 if (!max && *maxpos != '0')
9088 max = REG_INFTY; /* meaning "infinity" */
9089 else if (max >= REG_INFTY)
9090 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9092 nextchar(pRExC_state);
9095 if ((flags&SIMPLE)) {
9096 RExC_naughty += 2 + RExC_naughty / 2;
9097 reginsert(pRExC_state, CURLY, ret, depth+1);
9098 Set_Node_Offset(ret, parse_start+1); /* MJD */
9099 Set_Node_Cur_Length(ret);
9102 regnode * const w = reg_node(pRExC_state, WHILEM);
9105 REGTAIL(pRExC_state, ret, w);
9106 if (!SIZE_ONLY && RExC_extralen) {
9107 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9108 reginsert(pRExC_state, NOTHING,ret, depth+1);
9109 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9111 reginsert(pRExC_state, CURLYX,ret, depth+1);
9113 Set_Node_Offset(ret, parse_start+1);
9114 Set_Node_Length(ret,
9115 op == '{' ? (RExC_parse - parse_start) : 1);
9117 if (!SIZE_ONLY && RExC_extralen)
9118 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9119 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9121 RExC_whilem_seen++, RExC_extralen += 3;
9122 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9131 vFAIL("Can't do {n,m} with n > m");
9133 ARG1_SET(ret, (U16)min);
9134 ARG2_SET(ret, (U16)max);
9146 #if 0 /* Now runtime fix should be reliable. */
9148 /* if this is reinstated, don't forget to put this back into perldiag:
9150 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9152 (F) The part of the regexp subject to either the * or + quantifier
9153 could match an empty string. The {#} shows in the regular
9154 expression about where the problem was discovered.
9158 if (!(flags&HASWIDTH) && op != '?')
9159 vFAIL("Regexp *+ operand could be empty");
9162 #ifdef RE_TRACK_PATTERN_OFFSETS
9163 parse_start = RExC_parse;
9165 nextchar(pRExC_state);
9167 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9169 if (op == '*' && (flags&SIMPLE)) {
9170 reginsert(pRExC_state, STAR, ret, depth+1);
9174 else if (op == '*') {
9178 else if (op == '+' && (flags&SIMPLE)) {
9179 reginsert(pRExC_state, PLUS, ret, depth+1);
9183 else if (op == '+') {
9187 else if (op == '?') {
9192 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9193 ckWARN3reg(RExC_parse,
9194 "%.*s matches null string many times",
9195 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9199 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9200 nextchar(pRExC_state);
9201 reginsert(pRExC_state, MINMOD, ret, depth+1);
9202 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9204 #ifndef REG_ALLOW_MINMOD_SUSPEND
9207 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9209 nextchar(pRExC_state);
9210 ender = reg_node(pRExC_state, SUCCEED);
9211 REGTAIL(pRExC_state, ret, ender);
9212 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9214 ender = reg_node(pRExC_state, TAIL);
9215 REGTAIL(pRExC_state, ret, ender);
9219 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9221 vFAIL("Nested quantifiers");
9228 /* reg_namedseq(pRExC_state,UVp, UV depth)
9230 This is expected to be called by a parser routine that has
9231 recognized '\N' and needs to handle the rest. RExC_parse is
9232 expected to point at the first char following the N at the time
9235 The \N may be inside (indicated by valuep not being NULL) or outside a
9238 \N may begin either a named sequence, or if outside a character class, mean
9239 to match a non-newline. For non single-quoted regexes, the tokenizer has
9240 attempted to decide which, and in the case of a named sequence converted it
9241 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9242 where c1... are the characters in the sequence. For single-quoted regexes,
9243 the tokenizer passes the \N sequence through unchanged; this code will not
9244 attempt to determine this nor expand those. The net effect is that if the
9245 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9246 signals that this \N occurrence means to match a non-newline.
9248 Only the \N{U+...} form should occur in a character class, for the same
9249 reason that '.' inside a character class means to just match a period: it
9250 just doesn't make sense.
9252 If valuep is non-null then it is assumed that we are parsing inside
9253 of a charclass definition and the first codepoint in the resolved
9254 string is returned via *valuep and the routine will return NULL.
9255 In this mode if a multichar string is returned from the charnames
9256 handler, a warning will be issued, and only the first char in the
9257 sequence will be examined. If the string returned is zero length
9258 then the value of *valuep is undefined and NON-NULL will
9259 be returned to indicate failure. (This will NOT be a valid pointer
9262 If valuep is null then it is assumed that we are parsing normal text and a
9263 new EXACT node is inserted into the program containing the resolved string,
9264 and a pointer to the new node is returned. But if the string is zero length
9265 a NOTHING node is emitted instead.
9267 On success RExC_parse is set to the char following the endbrace.
9268 Parsing failures will generate a fatal error via vFAIL(...)
9271 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9273 char * endbrace; /* '}' following the name */
9274 regnode *ret = NULL;
9277 GET_RE_DEBUG_FLAGS_DECL;
9279 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9283 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9284 * modifier. The other meaning does not */
9285 p = (RExC_flags & RXf_PMf_EXTENDED)
9286 ? regwhite( pRExC_state, RExC_parse )
9289 /* Disambiguate between \N meaning a named character versus \N meaning
9290 * [^\n]. The former is assumed when it can't be the latter. */
9291 if (*p != '{' || regcurly(p)) {
9294 /* no bare \N in a charclass */
9295 vFAIL("\\N in a character class must be a named character: \\N{...}");
9297 nextchar(pRExC_state);
9298 ret = reg_node(pRExC_state, REG_ANY);
9299 *flagp |= HASWIDTH|SIMPLE;
9302 Set_Node_Length(ret, 1); /* MJD */
9306 /* Here, we have decided it should be a named sequence */
9308 /* The test above made sure that the next real character is a '{', but
9309 * under the /x modifier, it could be separated by space (or a comment and
9310 * \n) and this is not allowed (for consistency with \x{...} and the
9311 * tokenizer handling of \N{NAME}). */
9312 if (*RExC_parse != '{') {
9313 vFAIL("Missing braces on \\N{}");
9316 RExC_parse++; /* Skip past the '{' */
9318 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9319 || ! (endbrace == RExC_parse /* nothing between the {} */
9320 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9321 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9323 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9324 vFAIL("\\N{NAME} must be resolved by the lexer");
9327 if (endbrace == RExC_parse) { /* empty: \N{} */
9329 RExC_parse = endbrace + 1;
9330 return reg_node(pRExC_state,NOTHING);
9334 ckWARNreg(RExC_parse,
9335 "Ignoring zero length \\N{} in character class"
9337 RExC_parse = endbrace + 1;
9340 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9343 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
9344 RExC_parse += 2; /* Skip past the 'U+' */
9346 if (valuep) { /* In a bracketed char class */
9347 /* We only pay attention to the first char of
9348 multichar strings being returned. I kinda wonder
9349 if this makes sense as it does change the behaviour
9350 from earlier versions, OTOH that behaviour was broken
9351 as well. XXX Solution is to recharacterize as
9352 [rest-of-class]|multi1|multi2... */
9354 STRLEN length_of_hex;
9355 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9356 | PERL_SCAN_DISALLOW_PREFIX
9357 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9359 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9360 if (endchar < endbrace) {
9361 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9364 length_of_hex = (STRLEN)(endchar - RExC_parse);
9365 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9367 /* The tokenizer should have guaranteed validity, but it's possible to
9368 * bypass it by using single quoting, so check */
9369 if (length_of_hex == 0
9370 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9372 RExC_parse += length_of_hex; /* Includes all the valid */
9373 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9374 ? UTF8SKIP(RExC_parse)
9376 /* Guard against malformed utf8 */
9377 if (RExC_parse >= endchar) RExC_parse = endchar;
9378 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9381 RExC_parse = endbrace + 1;
9382 if (endchar == endbrace) return NULL;
9384 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
9386 else { /* Not a char class */
9388 /* What is done here is to convert this to a sub-pattern of the form
9389 * (?:\x{char1}\x{char2}...)
9390 * and then call reg recursively. That way, it retains its atomicness,
9391 * while not having to worry about special handling that some code
9392 * points may have. toke.c has converted the original Unicode values
9393 * to native, so that we can just pass on the hex values unchanged. We
9394 * do have to set a flag to keep recoding from happening in the
9397 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9399 char *endchar; /* Points to '.' or '}' ending cur char in the input
9401 char *orig_end = RExC_end;
9403 while (RExC_parse < endbrace) {
9405 /* Code points are separated by dots. If none, there is only one
9406 * code point, and is terminated by the brace */
9407 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9409 /* Convert to notation the rest of the code understands */
9410 sv_catpv(substitute_parse, "\\x{");
9411 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9412 sv_catpv(substitute_parse, "}");
9414 /* Point to the beginning of the next character in the sequence. */
9415 RExC_parse = endchar + 1;
9417 sv_catpv(substitute_parse, ")");
9419 RExC_parse = SvPV(substitute_parse, len);
9421 /* Don't allow empty number */
9423 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9425 RExC_end = RExC_parse + len;
9427 /* The values are Unicode, and therefore not subject to recoding */
9428 RExC_override_recoding = 1;
9430 ret = reg(pRExC_state, 1, flagp, depth+1);
9432 RExC_parse = endbrace;
9433 RExC_end = orig_end;
9434 RExC_override_recoding = 0;
9436 nextchar(pRExC_state);
9446 * It returns the code point in utf8 for the value in *encp.
9447 * value: a code value in the source encoding
9448 * encp: a pointer to an Encode object
9450 * If the result from Encode is not a single character,
9451 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9454 S_reg_recode(pTHX_ const char value, SV **encp)
9457 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9458 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9459 const STRLEN newlen = SvCUR(sv);
9460 UV uv = UNICODE_REPLACEMENT;
9462 PERL_ARGS_ASSERT_REG_RECODE;
9466 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9469 if (!newlen || numlen != newlen) {
9470 uv = UNICODE_REPLACEMENT;
9478 - regatom - the lowest level
9480 Try to identify anything special at the start of the pattern. If there
9481 is, then handle it as required. This may involve generating a single regop,
9482 such as for an assertion; or it may involve recursing, such as to
9483 handle a () structure.
9485 If the string doesn't start with something special then we gobble up
9486 as much literal text as we can.
9488 Once we have been able to handle whatever type of thing started the
9489 sequence, we return.
9491 Note: we have to be careful with escapes, as they can be both literal
9492 and special, and in the case of \10 and friends can either, depending
9493 on context. Specifically there are two separate switches for handling
9494 escape sequences, with the one for handling literal escapes requiring
9495 a dummy entry for all of the special escapes that are actually handled
9500 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9503 register regnode *ret = NULL;
9505 char *parse_start = RExC_parse;
9507 GET_RE_DEBUG_FLAGS_DECL;
9508 DEBUG_PARSE("atom");
9509 *flagp = WORST; /* Tentatively. */
9511 PERL_ARGS_ASSERT_REGATOM;
9514 switch ((U8)*RExC_parse) {
9516 RExC_seen_zerolen++;
9517 nextchar(pRExC_state);
9518 if (RExC_flags & RXf_PMf_MULTILINE)
9519 ret = reg_node(pRExC_state, MBOL);
9520 else if (RExC_flags & RXf_PMf_SINGLELINE)
9521 ret = reg_node(pRExC_state, SBOL);
9523 ret = reg_node(pRExC_state, BOL);
9524 Set_Node_Length(ret, 1); /* MJD */
9527 nextchar(pRExC_state);
9529 RExC_seen_zerolen++;
9530 if (RExC_flags & RXf_PMf_MULTILINE)
9531 ret = reg_node(pRExC_state, MEOL);
9532 else if (RExC_flags & RXf_PMf_SINGLELINE)
9533 ret = reg_node(pRExC_state, SEOL);
9535 ret = reg_node(pRExC_state, EOL);
9536 Set_Node_Length(ret, 1); /* MJD */
9539 nextchar(pRExC_state);
9540 if (RExC_flags & RXf_PMf_SINGLELINE)
9541 ret = reg_node(pRExC_state, SANY);
9543 ret = reg_node(pRExC_state, REG_ANY);
9544 *flagp |= HASWIDTH|SIMPLE;
9546 Set_Node_Length(ret, 1); /* MJD */
9550 char * const oregcomp_parse = ++RExC_parse;
9551 ret = regclass(pRExC_state,depth+1);
9552 if (*RExC_parse != ']') {
9553 RExC_parse = oregcomp_parse;
9554 vFAIL("Unmatched [");
9556 nextchar(pRExC_state);
9557 *flagp |= HASWIDTH|SIMPLE;
9558 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9562 nextchar(pRExC_state);
9563 ret = reg(pRExC_state, 1, &flags,depth+1);
9565 if (flags & TRYAGAIN) {
9566 if (RExC_parse == RExC_end) {
9567 /* Make parent create an empty node if needed. */
9575 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9579 if (flags & TRYAGAIN) {
9583 vFAIL("Internal urp");
9584 /* Supposed to be caught earlier. */
9590 vFAIL("Quantifier follows nothing");
9595 This switch handles escape sequences that resolve to some kind
9596 of special regop and not to literal text. Escape sequnces that
9597 resolve to literal text are handled below in the switch marked
9600 Every entry in this switch *must* have a corresponding entry
9601 in the literal escape switch. However, the opposite is not
9602 required, as the default for this switch is to jump to the
9603 literal text handling code.
9605 switch ((U8)*++RExC_parse) {
9606 /* Special Escapes */
9608 RExC_seen_zerolen++;
9609 ret = reg_node(pRExC_state, SBOL);
9611 goto finish_meta_pat;
9613 ret = reg_node(pRExC_state, GPOS);
9614 RExC_seen |= REG_SEEN_GPOS;
9616 goto finish_meta_pat;
9618 RExC_seen_zerolen++;
9619 ret = reg_node(pRExC_state, KEEPS);
9621 /* XXX:dmq : disabling in-place substitution seems to
9622 * be necessary here to avoid cases of memory corruption, as
9623 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9625 RExC_seen |= REG_SEEN_LOOKBEHIND;
9626 goto finish_meta_pat;
9628 ret = reg_node(pRExC_state, SEOL);
9630 RExC_seen_zerolen++; /* Do not optimize RE away */
9631 goto finish_meta_pat;
9633 ret = reg_node(pRExC_state, EOS);
9635 RExC_seen_zerolen++; /* Do not optimize RE away */
9636 goto finish_meta_pat;
9638 ret = reg_node(pRExC_state, CANY);
9639 RExC_seen |= REG_SEEN_CANY;
9640 *flagp |= HASWIDTH|SIMPLE;
9641 goto finish_meta_pat;
9643 ret = reg_node(pRExC_state, CLUMP);
9645 goto finish_meta_pat;
9647 switch (get_regex_charset(RExC_flags)) {
9648 case REGEX_LOCALE_CHARSET:
9651 case REGEX_UNICODE_CHARSET:
9654 case REGEX_ASCII_RESTRICTED_CHARSET:
9655 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9658 case REGEX_DEPENDS_CHARSET:
9664 ret = reg_node(pRExC_state, op);
9665 *flagp |= HASWIDTH|SIMPLE;
9666 goto finish_meta_pat;
9668 switch (get_regex_charset(RExC_flags)) {
9669 case REGEX_LOCALE_CHARSET:
9672 case REGEX_UNICODE_CHARSET:
9675 case REGEX_ASCII_RESTRICTED_CHARSET:
9676 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9679 case REGEX_DEPENDS_CHARSET:
9685 ret = reg_node(pRExC_state, op);
9686 *flagp |= HASWIDTH|SIMPLE;
9687 goto finish_meta_pat;
9689 RExC_seen_zerolen++;
9690 RExC_seen |= REG_SEEN_LOOKBEHIND;
9691 switch (get_regex_charset(RExC_flags)) {
9692 case REGEX_LOCALE_CHARSET:
9695 case REGEX_UNICODE_CHARSET:
9698 case REGEX_ASCII_RESTRICTED_CHARSET:
9699 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9702 case REGEX_DEPENDS_CHARSET:
9708 ret = reg_node(pRExC_state, op);
9709 FLAGS(ret) = get_regex_charset(RExC_flags);
9711 goto finish_meta_pat;
9713 RExC_seen_zerolen++;
9714 RExC_seen |= REG_SEEN_LOOKBEHIND;
9715 switch (get_regex_charset(RExC_flags)) {
9716 case REGEX_LOCALE_CHARSET:
9719 case REGEX_UNICODE_CHARSET:
9722 case REGEX_ASCII_RESTRICTED_CHARSET:
9723 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9726 case REGEX_DEPENDS_CHARSET:
9732 ret = reg_node(pRExC_state, op);
9733 FLAGS(ret) = get_regex_charset(RExC_flags);
9735 goto finish_meta_pat;
9737 switch (get_regex_charset(RExC_flags)) {
9738 case REGEX_LOCALE_CHARSET:
9741 case REGEX_UNICODE_CHARSET:
9744 case REGEX_ASCII_RESTRICTED_CHARSET:
9745 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9748 case REGEX_DEPENDS_CHARSET:
9754 ret = reg_node(pRExC_state, op);
9755 *flagp |= HASWIDTH|SIMPLE;
9756 goto finish_meta_pat;
9758 switch (get_regex_charset(RExC_flags)) {
9759 case REGEX_LOCALE_CHARSET:
9762 case REGEX_UNICODE_CHARSET:
9765 case REGEX_ASCII_RESTRICTED_CHARSET:
9766 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9769 case REGEX_DEPENDS_CHARSET:
9775 ret = reg_node(pRExC_state, op);
9776 *flagp |= HASWIDTH|SIMPLE;
9777 goto finish_meta_pat;
9779 switch (get_regex_charset(RExC_flags)) {
9780 case REGEX_LOCALE_CHARSET:
9783 case REGEX_ASCII_RESTRICTED_CHARSET:
9784 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9787 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9788 case REGEX_UNICODE_CHARSET:
9794 ret = reg_node(pRExC_state, op);
9795 *flagp |= HASWIDTH|SIMPLE;
9796 goto finish_meta_pat;
9798 switch (get_regex_charset(RExC_flags)) {
9799 case REGEX_LOCALE_CHARSET:
9802 case REGEX_ASCII_RESTRICTED_CHARSET:
9803 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9806 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9807 case REGEX_UNICODE_CHARSET:
9813 ret = reg_node(pRExC_state, op);
9814 *flagp |= HASWIDTH|SIMPLE;
9815 goto finish_meta_pat;
9817 ret = reg_node(pRExC_state, LNBREAK);
9818 *flagp |= HASWIDTH|SIMPLE;
9819 goto finish_meta_pat;
9821 ret = reg_node(pRExC_state, HORIZWS);
9822 *flagp |= HASWIDTH|SIMPLE;
9823 goto finish_meta_pat;
9825 ret = reg_node(pRExC_state, NHORIZWS);
9826 *flagp |= HASWIDTH|SIMPLE;
9827 goto finish_meta_pat;
9829 ret = reg_node(pRExC_state, VERTWS);
9830 *flagp |= HASWIDTH|SIMPLE;
9831 goto finish_meta_pat;
9833 ret = reg_node(pRExC_state, NVERTWS);
9834 *flagp |= HASWIDTH|SIMPLE;
9836 nextchar(pRExC_state);
9837 Set_Node_Length(ret, 2); /* MJD */
9842 char* const oldregxend = RExC_end;
9844 char* parse_start = RExC_parse - 2;
9847 if (RExC_parse[1] == '{') {
9848 /* a lovely hack--pretend we saw [\pX] instead */
9849 RExC_end = strchr(RExC_parse, '}');
9851 const U8 c = (U8)*RExC_parse;
9853 RExC_end = oldregxend;
9854 vFAIL2("Missing right brace on \\%c{}", c);
9859 RExC_end = RExC_parse + 2;
9860 if (RExC_end > oldregxend)
9861 RExC_end = oldregxend;
9865 ret = regclass(pRExC_state,depth+1);
9867 RExC_end = oldregxend;
9870 Set_Node_Offset(ret, parse_start + 2);
9871 Set_Node_Cur_Length(ret);
9872 nextchar(pRExC_state);
9873 *flagp |= HASWIDTH|SIMPLE;
9877 /* Handle \N and \N{NAME} here and not below because it can be
9878 multicharacter. join_exact() will join them up later on.
9879 Also this makes sure that things like /\N{BLAH}+/ and
9880 \N{BLAH} being multi char Just Happen. dmq*/
9882 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9884 case 'k': /* Handle \k<NAME> and \k'NAME' */
9887 char ch= RExC_parse[1];
9888 if (ch != '<' && ch != '\'' && ch != '{') {
9890 vFAIL2("Sequence %.2s... not terminated",parse_start);
9892 /* this pretty much dupes the code for (?P=...) in reg(), if
9893 you change this make sure you change that */
9894 char* name_start = (RExC_parse += 2);
9896 SV *sv_dat = reg_scan_name(pRExC_state,
9897 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9898 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9899 if (RExC_parse == name_start || *RExC_parse != ch)
9900 vFAIL2("Sequence %.3s... not terminated",parse_start);
9903 num = add_data( pRExC_state, 1, "S" );
9904 RExC_rxi->data->data[num]=(void*)sv_dat;
9905 SvREFCNT_inc_simple_void(sv_dat);
9909 ret = reganode(pRExC_state,
9912 : (MORE_ASCII_RESTRICTED)
9914 : (AT_LEAST_UNI_SEMANTICS)
9922 /* override incorrect value set in reganode MJD */
9923 Set_Node_Offset(ret, parse_start+1);
9924 Set_Node_Cur_Length(ret); /* MJD */
9925 nextchar(pRExC_state);
9931 case '1': case '2': case '3': case '4':
9932 case '5': case '6': case '7': case '8': case '9':
9935 bool isg = *RExC_parse == 'g';
9940 if (*RExC_parse == '{') {
9944 if (*RExC_parse == '-') {
9948 if (hasbrace && !isDIGIT(*RExC_parse)) {
9949 if (isrel) RExC_parse--;
9951 goto parse_named_seq;
9953 num = atoi(RExC_parse);
9954 if (isg && num == 0)
9955 vFAIL("Reference to invalid group 0");
9957 num = RExC_npar - num;
9959 vFAIL("Reference to nonexistent or unclosed group");
9961 if (!isg && num > 9 && num >= RExC_npar)
9964 char * const parse_start = RExC_parse - 1; /* MJD */
9965 while (isDIGIT(*RExC_parse))
9967 if (parse_start == RExC_parse - 1)
9968 vFAIL("Unterminated \\g... pattern");
9970 if (*RExC_parse != '}')
9971 vFAIL("Unterminated \\g{...} pattern");
9975 if (num > (I32)RExC_rx->nparens)
9976 vFAIL("Reference to nonexistent group");
9979 ret = reganode(pRExC_state,
9982 : (MORE_ASCII_RESTRICTED)
9984 : (AT_LEAST_UNI_SEMANTICS)
9992 /* override incorrect value set in reganode MJD */
9993 Set_Node_Offset(ret, parse_start+1);
9994 Set_Node_Cur_Length(ret); /* MJD */
9996 nextchar(pRExC_state);
10001 if (RExC_parse >= RExC_end)
10002 FAIL("Trailing \\");
10005 /* Do not generate "unrecognized" warnings here, we fall
10006 back into the quick-grab loop below */
10013 if (RExC_flags & RXf_PMf_EXTENDED) {
10014 if ( reg_skipcomment( pRExC_state ) )
10021 parse_start = RExC_parse - 1;
10026 register STRLEN len;
10031 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10034 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
10035 * it is folded to 'ss' even if not utf8 */
10036 bool is_exactfu_sharp_s;
10039 node_type = ((! FOLD) ? EXACT
10042 : (MORE_ASCII_RESTRICTED)
10044 : (AT_LEAST_UNI_SEMANTICS)
10047 ret = reg_node(pRExC_state, node_type);
10050 /* XXX The node can hold up to 255 bytes, yet this only goes to
10051 * 127. I (khw) do not know why. Keeping it somewhat less than
10052 * 255 allows us to not have to worry about overflow due to
10053 * converting to utf8 and fold expansion, but that value is
10054 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10055 * split up by this limit into a single one using the real max of
10056 * 255. Even at 127, this breaks under rare circumstances. If
10057 * folding, we do not want to split a node at a character that is a
10058 * non-final in a multi-char fold, as an input string could just
10059 * happen to want to match across the node boundary. The join
10060 * would solve that problem if the join actually happens. But a
10061 * series of more than two nodes in a row each of 127 would cause
10062 * the first join to succeed to get to 254, but then there wouldn't
10063 * be room for the next one, which could at be one of those split
10064 * multi-char folds. I don't know of any fool-proof solution. One
10065 * could back off to end with only a code point that isn't such a
10066 * non-final, but it is possible for there not to be any in the
10068 for (len = 0, p = RExC_parse - 1;
10069 len < 127 && p < RExC_end;
10072 char * const oldp = p;
10074 if (RExC_flags & RXf_PMf_EXTENDED)
10075 p = regwhite( pRExC_state, p );
10086 /* Literal Escapes Switch
10088 This switch is meant to handle escape sequences that
10089 resolve to a literal character.
10091 Every escape sequence that represents something
10092 else, like an assertion or a char class, is handled
10093 in the switch marked 'Special Escapes' above in this
10094 routine, but also has an entry here as anything that
10095 isn't explicitly mentioned here will be treated as
10096 an unescaped equivalent literal.
10099 switch ((U8)*++p) {
10100 /* These are all the special escapes. */
10101 case 'A': /* Start assertion */
10102 case 'b': case 'B': /* Word-boundary assertion*/
10103 case 'C': /* Single char !DANGEROUS! */
10104 case 'd': case 'D': /* digit class */
10105 case 'g': case 'G': /* generic-backref, pos assertion */
10106 case 'h': case 'H': /* HORIZWS */
10107 case 'k': case 'K': /* named backref, keep marker */
10108 case 'N': /* named char sequence */
10109 case 'p': case 'P': /* Unicode property */
10110 case 'R': /* LNBREAK */
10111 case 's': case 'S': /* space class */
10112 case 'v': case 'V': /* VERTWS */
10113 case 'w': case 'W': /* word class */
10114 case 'X': /* eXtended Unicode "combining character sequence" */
10115 case 'z': case 'Z': /* End of line/string assertion */
10119 /* Anything after here is an escape that resolves to a
10120 literal. (Except digits, which may or may not)
10139 ender = ASCII_TO_NATIVE('\033');
10143 ender = ASCII_TO_NATIVE('\007');
10148 STRLEN brace_len = len;
10150 const char* error_msg;
10152 bool valid = grok_bslash_o(p,
10159 RExC_parse = p; /* going to die anyway; point
10160 to exact spot of failure */
10167 if (PL_encoding && ender < 0x100) {
10168 goto recode_encoding;
10170 if (ender > 0xff) {
10177 char* const e = strchr(p, '}');
10180 RExC_parse = p + 1;
10181 vFAIL("Missing right brace on \\x{}");
10184 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10185 | PERL_SCAN_DISALLOW_PREFIX;
10186 STRLEN numlen = e - p - 1;
10187 ender = grok_hex(p + 1, &numlen, &flags, NULL);
10194 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10196 ender = grok_hex(p, &numlen, &flags, NULL);
10199 if (PL_encoding && ender < 0x100)
10200 goto recode_encoding;
10204 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10206 case '0': case '1': case '2': case '3':case '4':
10207 case '5': case '6': case '7': case '8':case '9':
10209 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10211 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10213 ender = grok_oct(p, &numlen, &flags, NULL);
10214 if (ender > 0xff) {
10223 if (PL_encoding && ender < 0x100)
10224 goto recode_encoding;
10227 if (! RExC_override_recoding) {
10228 SV* enc = PL_encoding;
10229 ender = reg_recode((const char)(U8)ender, &enc);
10230 if (!enc && SIZE_ONLY)
10231 ckWARNreg(p, "Invalid escape in the specified encoding");
10237 FAIL("Trailing \\");
10240 if (!SIZE_ONLY&& isALPHA(*p)) {
10241 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10243 goto normal_default;
10247 /* Currently we don't warn when the lbrace is at the start
10248 * of a construct. This catches it in the middle of a
10249 * literal string, or when its the first thing after
10250 * something like "\b" */
10252 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10254 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10259 if (UTF8_IS_START(*p) && UTF) {
10261 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10262 &numlen, UTF8_ALLOW_DEFAULT);
10268 } /* End of switch on the literal */
10270 is_exactfu_sharp_s = (node_type == EXACTFU
10271 && ender == LATIN_SMALL_LETTER_SHARP_S);
10272 if ( RExC_flags & RXf_PMf_EXTENDED)
10273 p = regwhite( pRExC_state, p );
10274 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10275 /* Prime the casefolded buffer. Locale rules, which apply
10276 * only to code points < 256, aren't known until execution,
10277 * so for them, just output the original character using
10278 * utf8. If we start to fold non-UTF patterns, be sure to
10279 * update join_exact() */
10280 if (LOC && ender < 256) {
10281 if (UNI_IS_INVARIANT(ender)) {
10282 *tmpbuf = (U8) ender;
10285 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10286 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10290 else if (isASCII(ender)) { /* Note: Here can't also be LOC
10292 ender = toLOWER(ender);
10293 *tmpbuf = (U8) ender;
10296 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10298 /* Locale and /aa require more selectivity about the
10299 * fold, so are handled below. Otherwise, here, just
10301 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10304 /* Under locale rules or /aa we are not to mix,
10305 * respectively, ords < 256 or ASCII with non-. So
10306 * reject folds that mix them, using only the
10307 * non-folded code point. So do the fold to a
10308 * temporary, and inspect each character in it. */
10309 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10311 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10312 U8* e = s + foldlen;
10313 bool fold_ok = TRUE;
10317 || (LOC && (UTF8_IS_INVARIANT(*s)
10318 || UTF8_IS_DOWNGRADEABLE_START(*s))))
10326 Copy(trialbuf, tmpbuf, foldlen, U8);
10330 uvuni_to_utf8(tmpbuf, ender);
10331 foldlen = UNISKIP(ender);
10335 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10338 else if (UTF || is_exactfu_sharp_s) {
10340 /* Emit all the Unicode characters. */
10342 for (foldbuf = tmpbuf;
10344 foldlen -= numlen) {
10346 /* tmpbuf has been constructed by us, so we
10347 * know it is valid utf8 */
10348 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10350 const STRLEN unilen = reguni(pRExC_state, ender, s);
10353 /* In EBCDIC the numlen
10354 * and unilen can differ. */
10356 if (numlen >= foldlen)
10360 break; /* "Can't happen." */
10364 const STRLEN unilen = reguni(pRExC_state, ender, s);
10373 REGC((char)ender, s++);
10377 if (UTF || is_exactfu_sharp_s) {
10379 /* Emit all the Unicode characters. */
10381 for (foldbuf = tmpbuf;
10383 foldlen -= numlen) {
10384 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10386 const STRLEN unilen = reguni(pRExC_state, ender, s);
10389 /* In EBCDIC the numlen
10390 * and unilen can differ. */
10392 if (numlen >= foldlen)
10400 const STRLEN unilen = reguni(pRExC_state, ender, s);
10409 REGC((char)ender, s++);
10412 loopdone: /* Jumped to when encounters something that shouldn't be in
10414 RExC_parse = p - 1;
10415 Set_Node_Cur_Length(ret); /* MJD */
10416 nextchar(pRExC_state);
10418 /* len is STRLEN which is unsigned, need to copy to signed */
10421 vFAIL("Internal disaster");
10424 *flagp |= HASWIDTH;
10425 if (len == 1 && UNI_IS_INVARIANT(ender))
10429 RExC_size += STR_SZ(len);
10431 STR_LEN(ret) = len;
10432 RExC_emit += STR_SZ(len);
10440 /* Jumped to when an unrecognized character set is encountered */
10442 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
10447 S_regwhite( RExC_state_t *pRExC_state, char *p )
10449 const char *e = RExC_end;
10451 PERL_ARGS_ASSERT_REGWHITE;
10456 else if (*p == '#') {
10459 if (*p++ == '\n') {
10465 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10473 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10474 Character classes ([:foo:]) can also be negated ([:^foo:]).
10475 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10476 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10477 but trigger failures because they are currently unimplemented. */
10479 #define POSIXCC_DONE(c) ((c) == ':')
10480 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10481 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10484 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10487 I32 namedclass = OOB_NAMEDCLASS;
10489 PERL_ARGS_ASSERT_REGPPOSIXCC;
10491 if (value == '[' && RExC_parse + 1 < RExC_end &&
10492 /* I smell either [: or [= or [. -- POSIX has been here, right? */
10493 POSIXCC(UCHARAT(RExC_parse))) {
10494 const char c = UCHARAT(RExC_parse);
10495 char* const s = RExC_parse++;
10497 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10499 if (RExC_parse == RExC_end)
10500 /* Grandfather lone [:, [=, [. */
10503 const char* const t = RExC_parse++; /* skip over the c */
10506 if (UCHARAT(RExC_parse) == ']') {
10507 const char *posixcc = s + 1;
10508 RExC_parse++; /* skip over the ending ] */
10511 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10512 const I32 skip = t - posixcc;
10514 /* Initially switch on the length of the name. */
10517 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10518 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10521 /* Names all of length 5. */
10522 /* alnum alpha ascii blank cntrl digit graph lower
10523 print punct space upper */
10524 /* Offset 4 gives the best switch position. */
10525 switch (posixcc[4]) {
10527 if (memEQ(posixcc, "alph", 4)) /* alpha */
10528 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10531 if (memEQ(posixcc, "spac", 4)) /* space */
10532 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10535 if (memEQ(posixcc, "grap", 4)) /* graph */
10536 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10539 if (memEQ(posixcc, "asci", 4)) /* ascii */
10540 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10543 if (memEQ(posixcc, "blan", 4)) /* blank */
10544 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10547 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10548 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10551 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10552 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10555 if (memEQ(posixcc, "lowe", 4)) /* lower */
10556 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10557 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10558 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10561 if (memEQ(posixcc, "digi", 4)) /* digit */
10562 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10563 else if (memEQ(posixcc, "prin", 4)) /* print */
10564 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10565 else if (memEQ(posixcc, "punc", 4)) /* punct */
10566 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10571 if (memEQ(posixcc, "xdigit", 6))
10572 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10576 if (namedclass == OOB_NAMEDCLASS)
10577 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10579 assert (posixcc[skip] == ':');
10580 assert (posixcc[skip+1] == ']');
10581 } else if (!SIZE_ONLY) {
10582 /* [[=foo=]] and [[.foo.]] are still future. */
10584 /* adjust RExC_parse so the warning shows after
10585 the class closes */
10586 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10588 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10591 /* Maternal grandfather:
10592 * "[:" ending in ":" but not in ":]" */
10602 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10606 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10608 if (POSIXCC(UCHARAT(RExC_parse))) {
10609 const char *s = RExC_parse;
10610 const char c = *s++;
10612 while (isALNUM(*s))
10614 if (*s && c == *s && s[1] == ']') {
10616 "POSIX syntax [%c %c] belongs inside character classes",
10619 /* [[=foo=]] and [[.foo.]] are still future. */
10620 if (POSIXCC_NOTYET(c)) {
10621 /* adjust RExC_parse so the error shows after
10622 the class closes */
10623 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10625 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10631 /* Generate the code to add a full posix character <class> to the bracketed
10632 * character class given by <node>. (<node> is needed only under locale rules)
10633 * destlist is the inversion list for non-locale rules that this class is
10635 * sourcelist is the ASCII-range inversion list to add under /a rules
10636 * Xsourcelist is the full Unicode range list to use otherwise. */
10637 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10639 SV* scratch_list = NULL; \
10641 /* Set this class in the node for runtime matching */ \
10642 ANYOF_CLASS_SET(node, class); \
10644 /* For above Latin1 code points, we use the full Unicode range */ \
10645 _invlist_intersection(PL_AboveLatin1, \
10648 /* And set the output to it, adding instead if there already is an \
10649 * output. Checking if <destlist> is NULL first saves an extra \
10650 * clone. Its reference count will be decremented at the next \
10651 * union, etc, or if this is the only instance, at the end of the \
10653 if (! destlist) { \
10654 destlist = scratch_list; \
10657 _invlist_union(destlist, scratch_list, &destlist); \
10658 SvREFCNT_dec(scratch_list); \
10662 /* For non-locale, just add it to any existing list */ \
10663 _invlist_union(destlist, \
10664 (AT_LEAST_ASCII_RESTRICTED) \
10670 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10672 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10674 SV* scratch_list = NULL; \
10675 ANYOF_CLASS_SET(node, class); \
10676 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10677 if (! destlist) { \
10678 destlist = scratch_list; \
10681 _invlist_union(destlist, scratch_list, &destlist); \
10682 SvREFCNT_dec(scratch_list); \
10686 _invlist_union_complement_2nd(destlist, \
10687 (AT_LEAST_ASCII_RESTRICTED) \
10691 /* Under /d, everything in the upper half of the Latin1 range \
10692 * matches this complement */ \
10693 if (DEPENDS_SEMANTICS) { \
10694 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10698 /* Generate the code to add a posix character <class> to the bracketed
10699 * character class given by <node>. (<node> is needed only under locale rules)
10700 * destlist is the inversion list for non-locale rules that this class is
10702 * sourcelist is the ASCII-range inversion list to add under /a rules
10703 * l1_sourcelist is the Latin1 range list to use otherwise.
10704 * Xpropertyname is the name to add to <run_time_list> of the property to
10705 * specify the code points above Latin1 that will have to be
10706 * determined at run-time
10707 * run_time_list is a SV* that contains text names of properties that are to
10708 * be computed at run time. This concatenates <Xpropertyname>
10709 * to it, apppropriately
10710 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10712 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10713 l1_sourcelist, Xpropertyname, run_time_list) \
10714 /* First, resolve whether to use the ASCII-only list or the L1 \
10716 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10717 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10718 Xpropertyname, run_time_list)
10720 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10721 Xpropertyname, run_time_list) \
10722 /* If not /a matching, there are going to be code points we will have \
10723 * to defer to runtime to look-up */ \
10724 if (! AT_LEAST_ASCII_RESTRICTED) { \
10725 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10728 ANYOF_CLASS_SET(node, class); \
10731 _invlist_union(destlist, sourcelist, &destlist); \
10734 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10735 * this and DO_N_POSIX */
10736 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10737 l1_sourcelist, Xpropertyname, run_time_list) \
10738 if (AT_LEAST_ASCII_RESTRICTED) { \
10739 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10742 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10744 ANYOF_CLASS_SET(node, namedclass); \
10747 SV* scratch_list = NULL; \
10748 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10749 if (! destlist) { \
10750 destlist = scratch_list; \
10753 _invlist_union(destlist, scratch_list, &destlist); \
10754 SvREFCNT_dec(scratch_list); \
10756 if (DEPENDS_SEMANTICS) { \
10757 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10763 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10766 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10767 * Locale folding is done at run-time, so this function should not be
10768 * called for nodes that are for locales.
10770 * This function sets the bit corresponding to the fold of the input
10771 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10774 * It also knows about the characters that are in the bitmap that have
10775 * folds that are matchable only outside it, and sets the appropriate lists
10778 * It returns the number of bits that actually changed from 0 to 1 */
10783 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10785 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10788 /* It assumes the bit for 'value' has already been set */
10789 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10790 ANYOF_BITMAP_SET(node, fold);
10793 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10794 /* Certain Latin1 characters have matches outside the bitmap. To get
10795 * here, 'value' is one of those characters. None of these matches is
10796 * valid for ASCII characters under /aa, which have been excluded by
10797 * the 'if' above. The matches fall into three categories:
10798 * 1) They are singly folded-to or -from an above 255 character, as
10799 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10801 * 2) They are part of a multi-char fold with another character in the
10802 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10803 * 3) They are part of a multi-char fold with a character not in the
10804 * bitmap, such as various ligatures.
10805 * We aren't dealing fully with multi-char folds, except we do deal
10806 * with the pattern containing a character that has a multi-char fold
10807 * (not so much the inverse).
10808 * For types 1) and 3), the matches only happen when the target string
10809 * is utf8; that's not true for 2), and we set a flag for it.
10811 * The code below adds to the passed in inversion list the single fold
10812 * closures for 'value'. The values are hard-coded here so that an
10813 * innocent-looking character class, like /[ks]/i won't have to go out
10814 * to disk to find the possible matches. XXX It would be better to
10815 * generate these via regen, in case a new version of the Unicode
10816 * standard adds new mappings, though that is not really likely. */
10821 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10825 /* LATIN SMALL LETTER LONG S */
10826 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10829 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10830 GREEK_SMALL_LETTER_MU);
10831 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10832 GREEK_CAPITAL_LETTER_MU);
10834 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10835 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10836 /* ANGSTROM SIGN */
10837 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10838 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10839 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10840 PL_fold_latin1[value]);
10843 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10844 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10845 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10847 case LATIN_SMALL_LETTER_SHARP_S:
10848 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10849 LATIN_CAPITAL_LETTER_SHARP_S);
10851 /* Under /a, /d, and /u, this can match the two chars "ss" */
10852 if (! MORE_ASCII_RESTRICTED) {
10853 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10855 /* And under /u or /a, it can match even if the target is
10857 if (AT_LEAST_UNI_SEMANTICS) {
10858 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10862 case 'F': case 'f':
10863 case 'I': case 'i':
10864 case 'L': case 'l':
10865 case 'T': case 't':
10866 case 'A': case 'a':
10867 case 'H': case 'h':
10868 case 'J': case 'j':
10869 case 'N': case 'n':
10870 case 'W': case 'w':
10871 case 'Y': case 'y':
10872 /* These all are targets of multi-character folds from code
10873 * points that require UTF8 to express, so they can't match
10874 * unless the target string is in UTF-8, so no action here is
10875 * necessary, as regexec.c properly handles the general case
10876 * for UTF-8 matching */
10879 /* Use deprecated warning to increase the chances of this
10881 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10885 else if (DEPENDS_SEMANTICS
10886 && ! isASCII(value)
10887 && PL_fold_latin1[value] != value)
10889 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10890 * folds only when the target string is in UTF-8. We add the fold
10891 * here to the list of things to match outside the bitmap, which
10892 * won't be looked at unless it is UTF8 (or else if something else
10893 * says to look even if not utf8, but those things better not happen
10894 * under DEPENDS semantics. */
10895 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10902 PERL_STATIC_INLINE U8
10903 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10905 /* This inline function sets a bit in the bitmap if not already set, and if
10906 * appropriate, its fold, returning the number of bits that actually
10907 * changed from 0 to 1 */
10911 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10913 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10917 ANYOF_BITMAP_SET(node, value);
10920 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
10921 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10928 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10930 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10931 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10932 * the multi-character folds of characters in the node */
10935 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10937 if (! *alternate_ptr) {
10938 *alternate_ptr = newAV();
10940 sv = newSVpvn_utf8((char*)string, len, TRUE);
10941 av_push(*alternate_ptr, sv);
10946 parse a class specification and produce either an ANYOF node that
10947 matches the pattern or perhaps will be optimized into an EXACTish node
10948 instead. The node contains a bit map for the first 256 characters, with the
10949 corresponding bit set if that character is in the list. For characters
10950 above 255, a range list is used */
10953 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10956 register UV nextvalue;
10957 register IV prevvalue = OOB_UNICODE;
10958 register IV range = 0;
10959 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10960 register regnode *ret;
10963 char *rangebegin = NULL;
10964 bool need_class = 0;
10965 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
10967 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10968 than just initialized. */
10969 SV* properties = NULL; /* Code points that match \p{} \P{} */
10970 UV element_count = 0; /* Number of distinct elements in the class.
10971 Optimizations may be possible if this is tiny */
10974 /* Unicode properties are stored in a swash; this holds the current one
10975 * being parsed. If this swash is the only above-latin1 component of the
10976 * character class, an optimization is to pass it directly on to the
10977 * execution engine. Otherwise, it is set to NULL to indicate that there
10978 * are other things in the class that have to be dealt with at execution
10980 SV* swash = NULL; /* Code points that match \p{} \P{} */
10982 /* Set if a component of this character class is user-defined; just passed
10983 * on to the engine */
10984 UV has_user_defined_property = 0;
10986 /* code points this node matches that can't be stored in the bitmap */
10987 SV* nonbitmap = NULL;
10989 /* The items that are to match that aren't stored in the bitmap, but are a
10990 * result of things that are stored there. This is the fold closure of
10991 * such a character, either because it has DEPENDS semantics and shouldn't
10992 * be matched unless the target string is utf8, or is a code point that is
10993 * too large for the bit map, as for example, the fold of the MICRO SIGN is
10994 * above 255. This all is solely for performance reasons. By having this
10995 * code know the outside-the-bitmap folds that the bitmapped characters are
10996 * involved with, we don't have to go out to disk to find the list of
10997 * matches, unless the character class includes code points that aren't
10998 * storable in the bit map. That means that a character class with an 's'
10999 * in it, for example, doesn't need to go out to disk to find everything
11000 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
11001 * empty unless there is something whose fold we don't know about, and will
11002 * have to go out to the disk to find. */
11003 SV* l1_fold_invlist = NULL;
11005 /* List of multi-character folds that are matched by this node */
11006 AV* unicode_alternate = NULL;
11008 UV literal_endpoint = 0;
11010 UV stored = 0; /* how many chars stored in the bitmap */
11012 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11013 case we need to change the emitted regop to an EXACT. */
11014 const char * orig_parse = RExC_parse;
11015 GET_RE_DEBUG_FLAGS_DECL;
11017 PERL_ARGS_ASSERT_REGCLASS;
11019 PERL_UNUSED_ARG(depth);
11022 DEBUG_PARSE("clas");
11024 /* Assume we are going to generate an ANYOF node. */
11025 ret = reganode(pRExC_state, ANYOF, 0);
11029 ANYOF_FLAGS(ret) = 0;
11032 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11036 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
11038 /* We have decided to not allow multi-char folds in inverted character
11039 * classes, due to the confusion that can happen, especially with
11040 * classes that are designed for a non-Unicode world: You have the
11041 * peculiar case that:
11042 "s s" =~ /^[^\xDF]+$/i => Y
11043 "ss" =~ /^[^\xDF]+$/i => N
11045 * See [perl #89750] */
11046 allow_full_fold = FALSE;
11050 RExC_size += ANYOF_SKIP;
11051 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11054 RExC_emit += ANYOF_SKIP;
11056 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11058 ANYOF_BITMAP_ZERO(ret);
11059 listsv = newSVpvs("# comment\n");
11060 initial_listsv_len = SvCUR(listsv);
11063 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11065 if (!SIZE_ONLY && POSIXCC(nextvalue))
11066 checkposixcc(pRExC_state);
11068 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11069 if (UCHARAT(RExC_parse) == ']')
11070 goto charclassloop;
11073 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11077 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11080 rangebegin = RExC_parse;
11084 value = utf8n_to_uvchr((U8*)RExC_parse,
11085 RExC_end - RExC_parse,
11086 &numlen, UTF8_ALLOW_DEFAULT);
11087 RExC_parse += numlen;
11090 value = UCHARAT(RExC_parse++);
11092 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11093 if (value == '[' && POSIXCC(nextvalue))
11094 namedclass = regpposixcc(pRExC_state, value);
11095 else if (value == '\\') {
11097 value = utf8n_to_uvchr((U8*)RExC_parse,
11098 RExC_end - RExC_parse,
11099 &numlen, UTF8_ALLOW_DEFAULT);
11100 RExC_parse += numlen;
11103 value = UCHARAT(RExC_parse++);
11104 /* Some compilers cannot handle switching on 64-bit integer
11105 * values, therefore value cannot be an UV. Yes, this will
11106 * be a problem later if we want switch on Unicode.
11107 * A similar issue a little bit later when switching on
11108 * namedclass. --jhi */
11109 switch ((I32)value) {
11110 case 'w': namedclass = ANYOF_ALNUM; break;
11111 case 'W': namedclass = ANYOF_NALNUM; break;
11112 case 's': namedclass = ANYOF_SPACE; break;
11113 case 'S': namedclass = ANYOF_NSPACE; break;
11114 case 'd': namedclass = ANYOF_DIGIT; break;
11115 case 'D': namedclass = ANYOF_NDIGIT; break;
11116 case 'v': namedclass = ANYOF_VERTWS; break;
11117 case 'V': namedclass = ANYOF_NVERTWS; break;
11118 case 'h': namedclass = ANYOF_HORIZWS; break;
11119 case 'H': namedclass = ANYOF_NHORIZWS; break;
11120 case 'N': /* Handle \N{NAME} in class */
11122 /* We only pay attention to the first char of
11123 multichar strings being returned. I kinda wonder
11124 if this makes sense as it does change the behaviour
11125 from earlier versions, OTOH that behaviour was broken
11127 UV v; /* value is register so we cant & it /grrr */
11128 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11138 if (RExC_parse >= RExC_end)
11139 vFAIL2("Empty \\%c{}", (U8)value);
11140 if (*RExC_parse == '{') {
11141 const U8 c = (U8)value;
11142 e = strchr(RExC_parse++, '}');
11144 vFAIL2("Missing right brace on \\%c{}", c);
11145 while (isSPACE(UCHARAT(RExC_parse)))
11147 if (e == RExC_parse)
11148 vFAIL2("Empty \\%c{}", c);
11149 n = e - RExC_parse;
11150 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11161 if (UCHARAT(RExC_parse) == '^') {
11164 value = value == 'p' ? 'P' : 'p'; /* toggle */
11165 while (isSPACE(UCHARAT(RExC_parse))) {
11170 /* Try to get the definition of the property into
11171 * <invlist>. If /i is in effect, the effective property
11172 * will have its name be <__NAME_i>. The design is
11173 * discussed in commit
11174 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11175 Newx(name, n + sizeof("_i__\n"), char);
11177 sprintf(name, "%s%.*s%s\n",
11178 (FOLD) ? "__" : "",
11184 /* Look up the property name, and get its swash and
11185 * inversion list, if the property is found */
11187 SvREFCNT_dec(swash);
11189 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11192 TRUE, /* this routine will handle
11193 undefined properties */
11194 NULL, FALSE /* No inversion list */
11198 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11200 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11202 || ! (invlist = *invlistsvp))
11205 SvREFCNT_dec(swash);
11209 /* Here didn't find it. It could be a user-defined
11210 * property that will be available at run-time. Add it
11211 * to the list to look up then */
11212 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11213 (value == 'p' ? '+' : '!'),
11215 has_user_defined_property = 1;
11217 /* We don't know yet, so have to assume that the
11218 * property could match something in the Latin1 range,
11219 * hence something that isn't utf8 */
11220 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11224 /* Here, did get the swash and its inversion list. If
11225 * the swash is from a user-defined property, then this
11226 * whole character class should be regarded as such */
11227 SV** user_defined_svp =
11228 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11229 "USER_DEFINED", FALSE);
11230 if (user_defined_svp) {
11231 has_user_defined_property
11232 |= SvUV(*user_defined_svp);
11235 /* Invert if asking for the complement */
11236 if (value == 'P') {
11237 _invlist_union_complement_2nd(properties, invlist, &properties);
11239 /* The swash can't be used as-is, because we've
11240 * inverted things; delay removing it to here after
11241 * have copied its invlist above */
11242 SvREFCNT_dec(swash);
11246 _invlist_union(properties, invlist, &properties);
11251 RExC_parse = e + 1;
11252 namedclass = ANYOF_MAX; /* no official name, but it's named */
11254 /* \p means they want Unicode semantics */
11255 RExC_uni_semantics = 1;
11258 case 'n': value = '\n'; break;
11259 case 'r': value = '\r'; break;
11260 case 't': value = '\t'; break;
11261 case 'f': value = '\f'; break;
11262 case 'b': value = '\b'; break;
11263 case 'e': value = ASCII_TO_NATIVE('\033');break;
11264 case 'a': value = ASCII_TO_NATIVE('\007');break;
11266 RExC_parse--; /* function expects to be pointed at the 'o' */
11268 const char* error_msg;
11269 bool valid = grok_bslash_o(RExC_parse,
11274 RExC_parse += numlen;
11279 if (PL_encoding && value < 0x100) {
11280 goto recode_encoding;
11284 if (*RExC_parse == '{') {
11285 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
11286 | PERL_SCAN_DISALLOW_PREFIX;
11287 char * const e = strchr(RExC_parse++, '}');
11289 vFAIL("Missing right brace on \\x{}");
11291 numlen = e - RExC_parse;
11292 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11293 RExC_parse = e + 1;
11296 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
11298 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11299 RExC_parse += numlen;
11301 if (PL_encoding && value < 0x100)
11302 goto recode_encoding;
11305 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11307 case '0': case '1': case '2': case '3': case '4':
11308 case '5': case '6': case '7':
11310 /* Take 1-3 octal digits */
11311 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11313 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11314 RExC_parse += numlen;
11315 if (PL_encoding && value < 0x100)
11316 goto recode_encoding;
11320 if (! RExC_override_recoding) {
11321 SV* enc = PL_encoding;
11322 value = reg_recode((const char)(U8)value, &enc);
11323 if (!enc && SIZE_ONLY)
11324 ckWARNreg(RExC_parse,
11325 "Invalid escape in the specified encoding");
11329 /* Allow \_ to not give an error */
11330 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11331 ckWARN2reg(RExC_parse,
11332 "Unrecognized escape \\%c in character class passed through",
11337 } /* end of \blah */
11340 literal_endpoint++;
11343 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11345 /* What matches in a locale is not known until runtime, so need to
11346 * (one time per class) allocate extra space to pass to regexec.
11347 * The space will contain a bit for each named class that is to be
11348 * matched against. This isn't needed for \p{} and pseudo-classes,
11349 * as they are not affected by locale, and hence are dealt with
11351 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11354 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11357 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11358 ANYOF_CLASS_ZERO(ret);
11360 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11363 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11364 * literal, as is the character that began the false range, i.e.
11365 * the 'a' in the examples */
11369 RExC_parse >= rangebegin ?
11370 RExC_parse - rangebegin : 0;
11371 ckWARN4reg(RExC_parse,
11372 "False [] range \"%*.*s\"",
11376 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11377 if (prevvalue < 256) {
11379 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
11382 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
11386 range = 0; /* this was not a true range */
11391 /* Possible truncation here but in some 64-bit environments
11392 * the compiler gets heartburn about switch on 64-bit values.
11393 * A similar issue a little earlier when switching on value.
11395 switch ((I32)namedclass) {
11397 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11398 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11399 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11401 case ANYOF_NALNUMC:
11402 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11403 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11406 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11407 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11410 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11411 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11415 ANYOF_CLASS_SET(ret, namedclass);
11418 _invlist_union(properties, PL_ASCII, &properties);
11423 ANYOF_CLASS_SET(ret, namedclass);
11426 _invlist_union_complement_2nd(properties,
11427 PL_ASCII, &properties);
11428 if (DEPENDS_SEMANTICS) {
11429 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11434 DO_POSIX(ret, namedclass, properties,
11435 PL_PosixBlank, PL_XPosixBlank);
11438 DO_N_POSIX(ret, namedclass, properties,
11439 PL_PosixBlank, PL_XPosixBlank);
11442 DO_POSIX(ret, namedclass, properties,
11443 PL_PosixCntrl, PL_XPosixCntrl);
11446 DO_N_POSIX(ret, namedclass, properties,
11447 PL_PosixCntrl, PL_XPosixCntrl);
11450 /* There are no digits in the Latin1 range outside of
11451 * ASCII, so call the macro that doesn't have to resolve
11453 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11454 PL_PosixDigit, "XPosixDigit", listsv);
11457 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11458 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
11461 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11462 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11465 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11466 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11468 case ANYOF_HORIZWS:
11469 /* For these, we use the nonbitmap, as /d doesn't make a
11470 * difference in what these match. There would be problems
11471 * if these characters had folds other than themselves, as
11472 * nonbitmap is subject to folding. It turns out that \h
11473 * is just a synonym for XPosixBlank */
11474 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
11476 case ANYOF_NHORIZWS:
11477 _invlist_union_complement_2nd(nonbitmap,
11478 PL_XPosixBlank, &nonbitmap);
11482 { /* These require special handling, as they differ under
11483 folding, matching Cased there (which in the ASCII range
11484 is the same as Alpha */
11490 if (FOLD && ! LOC) {
11491 ascii_source = PL_PosixAlpha;
11492 l1_source = PL_L1Cased;
11496 ascii_source = PL_PosixLower;
11497 l1_source = PL_L1PosixLower;
11498 Xname = "XPosixLower";
11500 if (namedclass == ANYOF_LOWER) {
11501 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11502 ascii_source, l1_source, Xname, listsv);
11505 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11506 properties, ascii_source, l1_source, Xname, listsv);
11511 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11512 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11515 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11516 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11519 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11520 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11523 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11524 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11527 DO_POSIX(ret, namedclass, properties,
11528 PL_PosixSpace, PL_XPosixSpace);
11530 case ANYOF_NPSXSPC:
11531 DO_N_POSIX(ret, namedclass, properties,
11532 PL_PosixSpace, PL_XPosixSpace);
11535 DO_POSIX(ret, namedclass, properties,
11536 PL_PerlSpace, PL_XPerlSpace);
11539 DO_N_POSIX(ret, namedclass, properties,
11540 PL_PerlSpace, PL_XPerlSpace);
11542 case ANYOF_UPPER: /* Same as LOWER, above */
11549 if (FOLD && ! LOC) {
11550 ascii_source = PL_PosixAlpha;
11551 l1_source = PL_L1Cased;
11555 ascii_source = PL_PosixUpper;
11556 l1_source = PL_L1PosixUpper;
11557 Xname = "XPosixUpper";
11559 if (namedclass == ANYOF_UPPER) {
11560 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11561 ascii_source, l1_source, Xname, listsv);
11564 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11565 properties, ascii_source, l1_source, Xname, listsv);
11569 case ANYOF_ALNUM: /* Really is 'Word' */
11570 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11571 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11574 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11575 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11578 /* For these, we use the nonbitmap, as /d doesn't make a
11579 * difference in what these match. There would be problems
11580 * if these characters had folds other than themselves, as
11581 * nonbitmap is subject to folding */
11582 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11584 case ANYOF_NVERTWS:
11585 _invlist_union_complement_2nd(nonbitmap,
11586 PL_VertSpace, &nonbitmap);
11589 DO_POSIX(ret, namedclass, properties,
11590 PL_PosixXDigit, PL_XPosixXDigit);
11592 case ANYOF_NXDIGIT:
11593 DO_N_POSIX(ret, namedclass, properties,
11594 PL_PosixXDigit, PL_XPosixXDigit);
11597 /* this is to handle \p and \P */
11600 vFAIL("Invalid [::] class");
11606 } /* end of namedclass \blah */
11609 if (prevvalue > (IV)value) /* b-a */ {
11610 const int w = RExC_parse - rangebegin;
11611 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11612 range = 0; /* not a valid range */
11616 prevvalue = value; /* save the beginning of the range */
11617 if (RExC_parse+1 < RExC_end
11618 && *RExC_parse == '-'
11619 && RExC_parse[1] != ']')
11623 /* a bad range like \w-, [:word:]- ? */
11624 if (namedclass > OOB_NAMEDCLASS) {
11625 if (ckWARN(WARN_REGEXP)) {
11627 RExC_parse >= rangebegin ?
11628 RExC_parse - rangebegin : 0;
11630 "False [] range \"%*.*s\"",
11635 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11637 range = 1; /* yeah, it's a range! */
11638 continue; /* but do it the next time */
11642 /* non-Latin1 code point implies unicode semantics. Must be set in
11643 * pass1 so is there for the whole of pass 2 */
11645 RExC_uni_semantics = 1;
11648 /* now is the next time */
11650 if (prevvalue < 256) {
11651 const IV ceilvalue = value < 256 ? value : 255;
11654 /* In EBCDIC [\x89-\x91] should include
11655 * the \x8e but [i-j] should not. */
11656 if (literal_endpoint == 2 &&
11657 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11658 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11660 if (isLOWER(prevvalue)) {
11661 for (i = prevvalue; i <= ceilvalue; i++)
11662 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11664 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11667 for (i = prevvalue; i <= ceilvalue; i++)
11668 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11670 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11676 for (i = prevvalue; i <= ceilvalue; i++) {
11677 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11681 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11682 const UV natvalue = NATIVE_TO_UNI(value);
11683 nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11686 literal_endpoint = 0;
11690 range = 0; /* this range (if it was one) is done now */
11697 /****** !SIZE_ONLY AFTER HERE *********/
11699 /* If folding and there are code points above 255, we calculate all
11700 * characters that could fold to or from the ones already on the list */
11701 if (FOLD && nonbitmap) {
11702 UV start, end; /* End points of code point ranges */
11704 SV* fold_intersection = NULL;
11706 /* This is a list of all the characters that participate in folds
11707 * (except marks, etc in multi-char folds */
11708 if (! PL_utf8_foldable) {
11709 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11710 PL_utf8_foldable = _swash_to_invlist(swash);
11711 SvREFCNT_dec(swash);
11714 /* This is a hash that for a particular fold gives all characters
11715 * that are involved in it */
11716 if (! PL_utf8_foldclosures) {
11718 /* If we were unable to find any folds, then we likely won't be
11719 * able to find the closures. So just create an empty list.
11720 * Folding will effectively be restricted to the non-Unicode rules
11721 * hard-coded into Perl. (This case happens legitimately during
11722 * compilation of Perl itself before the Unicode tables are
11724 if (invlist_len(PL_utf8_foldable) == 0) {
11725 PL_utf8_foldclosures = newHV();
11727 /* If the folds haven't been read in, call a fold function
11729 if (! PL_utf8_tofold) {
11730 U8 dummy[UTF8_MAXBYTES+1];
11733 /* This particular string is above \xff in both UTF-8 and
11735 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11736 assert(PL_utf8_tofold); /* Verify that worked */
11738 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11742 /* Only the characters in this class that participate in folds need be
11743 * checked. Get the intersection of this class and all the possible
11744 * characters that are foldable. This can quickly narrow down a large
11746 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11748 /* Now look at the foldable characters in this class individually */
11749 invlist_iterinit(fold_intersection);
11750 while (invlist_iternext(fold_intersection, &start, &end)) {
11753 /* Look at every character in the range */
11754 for (j = start; j <= end; j++) {
11757 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11760 _to_uni_fold_flags(j, foldbuf, &foldlen,
11761 (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
11763 if (foldlen > (STRLEN)UNISKIP(f)) {
11765 /* Any multicharacter foldings (disallowed in lookbehind
11766 * patterns) require the following transform: [ABCDEF] ->
11767 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11768 * folds into "rst", all other characters fold to single
11769 * characters. We save away these multicharacter foldings,
11770 * to be later saved as part of the additional "s" data. */
11771 if (! RExC_in_lookbehind) {
11773 U8* e = foldbuf + foldlen;
11775 /* If any of the folded characters of this are in the
11776 * Latin1 range, tell the regex engine that this can
11777 * match a non-utf8 target string. The only multi-byte
11778 * fold whose source is in the Latin1 range (U+00DF)
11779 * applies only when the target string is utf8, or
11780 * under unicode rules */
11781 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11784 /* Can't mix ascii with non- under /aa */
11785 if (MORE_ASCII_RESTRICTED
11786 && (isASCII(*loc) != isASCII(j)))
11788 goto end_multi_fold;
11790 if (UTF8_IS_INVARIANT(*loc)
11791 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11793 /* Can't mix above and below 256 under LOC
11796 goto end_multi_fold;
11799 |= ANYOF_NONBITMAP_NON_UTF8;
11802 loc += UTF8SKIP(loc);
11806 add_alternate(&unicode_alternate, foldbuf, foldlen);
11810 /* This is special-cased, as it is the only letter which
11811 * has both a multi-fold and single-fold in Latin1. All
11812 * the other chars that have single and multi-folds are
11813 * always in utf8, and the utf8 folding algorithm catches
11815 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11816 stored += set_regclass_bit(pRExC_state,
11818 LATIN_SMALL_LETTER_SHARP_S,
11819 &l1_fold_invlist, &unicode_alternate);
11823 /* Single character fold. Add everything in its fold
11824 * closure to the list that this node should match */
11827 /* The fold closures data structure is a hash with the keys
11828 * being every character that is folded to, like 'k', and
11829 * the values each an array of everything that folds to its
11830 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
11831 if ((listp = hv_fetch(PL_utf8_foldclosures,
11832 (char *) foldbuf, foldlen, FALSE)))
11834 AV* list = (AV*) *listp;
11836 for (k = 0; k <= av_len(list); k++) {
11837 SV** c_p = av_fetch(list, k, FALSE);
11840 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11844 /* /aa doesn't allow folds between ASCII and non-;
11845 * /l doesn't allow them between above and below
11847 if ((MORE_ASCII_RESTRICTED
11848 && (isASCII(c) != isASCII(j)))
11849 || (LOC && ((c < 256) != (j < 256))))
11854 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11855 stored += set_regclass_bit(pRExC_state,
11858 &l1_fold_invlist, &unicode_alternate);
11860 /* It may be that the code point is already in
11861 * this range or already in the bitmap, in
11862 * which case we need do nothing */
11863 else if ((c < start || c > end)
11865 || ! ANYOF_BITMAP_TEST(ret, c)))
11867 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11874 SvREFCNT_dec(fold_intersection);
11877 /* Combine the two lists into one. */
11878 if (l1_fold_invlist) {
11880 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11881 SvREFCNT_dec(l1_fold_invlist);
11884 nonbitmap = l1_fold_invlist;
11888 /* And combine the result (if any) with any inversion list from properties.
11889 * The lists are kept separate up to now because we don't want to fold the
11893 _invlist_union(nonbitmap, properties, &nonbitmap);
11894 SvREFCNT_dec(properties);
11897 nonbitmap = properties;
11901 /* Here, <nonbitmap> contains all the code points we can determine at
11902 * compile time that we haven't put into the bitmap. Go through it, and
11903 * for things that belong in the bitmap, put them there, and delete from
11907 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11908 * possibly only should match when the target string is UTF-8 */
11909 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11911 /* This gets set if we actually need to modify things */
11912 bool change_invlist = FALSE;
11916 /* Start looking through <nonbitmap> */
11917 invlist_iterinit(nonbitmap);
11918 while (invlist_iternext(nonbitmap, &start, &end)) {
11922 /* Quit if are above what we should change */
11923 if (start > max_cp_to_set) {
11927 change_invlist = TRUE;
11929 /* Set all the bits in the range, up to the max that we are doing */
11930 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11931 for (i = start; i <= (int) high; i++) {
11932 if (! ANYOF_BITMAP_TEST(ret, i)) {
11933 ANYOF_BITMAP_SET(ret, i);
11941 /* Done with loop; remove any code points that are in the bitmap from
11943 if (change_invlist) {
11944 _invlist_subtract(nonbitmap,
11945 (DEPENDS_SEMANTICS)
11951 /* If have completely emptied it, remove it completely */
11952 if (invlist_len(nonbitmap) == 0) {
11953 SvREFCNT_dec(nonbitmap);
11958 /* Here, we have calculated what code points should be in the character
11959 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11960 * case of DEPENDS rules.
11962 * Now we can see about various optimizations. Fold calculation (which we
11963 * did above) needs to take place before inversion. Otherwise /[^k]/i
11964 * would invert to include K, which under /i would match k, which it
11967 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
11968 * set the FOLD flag yet, so this does optimize those. It doesn't
11969 * optimize locale. Doing so perhaps could be done as long as there is
11970 * nothing like \w in it; some thought also would have to be given to the
11971 * interaction with above 0x100 chars */
11972 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11974 && ! unicode_alternate
11975 /* In case of /d, there are some things that should match only when in
11976 * not in the bitmap, i.e., they require UTF8 to match. These are
11977 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11978 * case, they don't require UTF8, so can invert here */
11980 || ! DEPENDS_SEMANTICS
11981 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11982 && SvCUR(listsv) == initial_listsv_len)
11986 for (i = 0; i < 256; ++i) {
11987 if (ANYOF_BITMAP_TEST(ret, i)) {
11988 ANYOF_BITMAP_CLEAR(ret, i);
11991 ANYOF_BITMAP_SET(ret, i);
11996 /* The inversion means that everything above 255 is matched */
11997 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12000 /* Here, also has things outside the bitmap that may overlap with
12001 * the bitmap. We have to sync them up, so that they get inverted
12002 * in both places. Earlier, we removed all overlaps except in the
12003 * case of /d rules, so no syncing is needed except for this case
12005 SV *remove_list = NULL;
12007 if (DEPENDS_SEMANTICS) {
12010 /* Set the bits that correspond to the ones that aren't in the
12011 * bitmap. Otherwise, when we invert, we'll miss these.
12012 * Earlier, we removed from the nonbitmap all code points
12013 * < 128, so there is no extra work here */
12014 invlist_iterinit(nonbitmap);
12015 while (invlist_iternext(nonbitmap, &start, &end)) {
12016 if (start > 255) { /* The bit map goes to 255 */
12022 for (i = start; i <= (int) end; ++i) {
12023 ANYOF_BITMAP_SET(ret, i);
12030 /* Now invert both the bitmap and the nonbitmap. Anything in the
12031 * bitmap has to also be removed from the non-bitmap, but again,
12032 * there should not be overlap unless is /d rules. */
12033 _invlist_invert(nonbitmap);
12035 /* Any swash can't be used as-is, because we've inverted things */
12037 SvREFCNT_dec(swash);
12041 for (i = 0; i < 256; ++i) {
12042 if (ANYOF_BITMAP_TEST(ret, i)) {
12043 ANYOF_BITMAP_CLEAR(ret, i);
12044 if (DEPENDS_SEMANTICS) {
12045 if (! remove_list) {
12046 remove_list = _new_invlist(2);
12048 remove_list = add_cp_to_invlist(remove_list, i);
12052 ANYOF_BITMAP_SET(ret, i);
12058 /* And do the removal */
12059 if (DEPENDS_SEMANTICS) {
12061 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
12062 SvREFCNT_dec(remove_list);
12066 /* There is no overlap for non-/d, so just delete anything
12068 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
12072 stored = 256 - stored;
12074 /* Clear the invert flag since have just done it here */
12075 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12078 /* Folding in the bitmap is taken care of above, but not for locale (for
12079 * which we have to wait to see what folding is in effect at runtime), and
12080 * for some things not in the bitmap (only the upper latin folds in this
12081 * case, as all other single-char folding has been set above). Set
12082 * run-time fold flag for these */
12084 || (DEPENDS_SEMANTICS
12086 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12087 || unicode_alternate))
12089 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12092 /* A single character class can be "optimized" into an EXACTish node.
12093 * Note that since we don't currently count how many characters there are
12094 * outside the bitmap, we are XXX missing optimization possibilities for
12095 * them. This optimization can't happen unless this is a truly single
12096 * character class, which means that it can't be an inversion into a
12097 * many-character class, and there must be no possibility of there being
12098 * things outside the bitmap. 'stored' (only) for locales doesn't include
12099 * \w, etc, so have to make a special test that they aren't present
12101 * Similarly A 2-character class of the very special form like [bB] can be
12102 * optimized into an EXACTFish node, but only for non-locales, and for
12103 * characters which only have the two folds; so things like 'fF' and 'Ii'
12104 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12107 && ! unicode_alternate
12108 && SvCUR(listsv) == initial_listsv_len
12109 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12110 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12111 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12112 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12113 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12114 /* If the latest code point has a fold whose
12115 * bit is set, it must be the only other one */
12116 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12117 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12119 /* Note that the information needed to decide to do this optimization
12120 * is not currently available until the 2nd pass, and that the actually
12121 * used EXACTish node takes less space than the calculated ANYOF node,
12122 * and hence the amount of space calculated in the first pass is larger
12123 * than actually used, so this optimization doesn't gain us any space.
12124 * But an EXACT node is faster than an ANYOF node, and can be combined
12125 * with any adjacent EXACT nodes later by the optimizer for further
12126 * gains. The speed of executing an EXACTF is similar to an ANYOF
12127 * node, so the optimization advantage comes from the ability to join
12128 * it to adjacent EXACT nodes */
12130 const char * cur_parse= RExC_parse;
12132 RExC_emit = (regnode *)orig_emit;
12133 RExC_parse = (char *)orig_parse;
12137 /* A locale node with one point can be folded; all the other cases
12138 * with folding will have two points, since we calculate them above
12140 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12147 else { /* else 2 chars in the bit map: the folds of each other */
12149 /* Use the folded value, which for the cases where we get here,
12150 * is just the lower case of the current one (which may resolve to
12151 * itself, or to the other one */
12152 value = toLOWER_LATIN1(value);
12154 /* To join adjacent nodes, they must be the exact EXACTish type.
12155 * Try to use the most likely type, by using EXACTFA if possible,
12156 * then EXACTFU if the regex calls for it, or is required because
12157 * the character is non-ASCII. (If <value> is ASCII, its fold is
12158 * also ASCII for the cases where we get here.) */
12159 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12162 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12165 else { /* Otherwise, more likely to be EXACTF type */
12170 ret = reg_node(pRExC_state, op);
12171 RExC_parse = (char *)cur_parse;
12172 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12173 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12174 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12176 RExC_emit += STR_SZ(2);
12179 *STRING(ret)= (char)value;
12181 RExC_emit += STR_SZ(1);
12183 SvREFCNT_dec(listsv);
12187 /* If there is a swash and more than one element, we can't use the swash in
12188 * the optimization below. */
12189 if (swash && element_count > 1) {
12190 SvREFCNT_dec(swash);
12194 && SvCUR(listsv) == initial_listsv_len
12195 && ! unicode_alternate)
12197 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12198 SvREFCNT_dec(listsv);
12199 SvREFCNT_dec(unicode_alternate);
12202 /* av[0] stores the character class description in its textual form:
12203 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12204 * appropriate swash, and is also useful for dumping the regnode.
12205 * av[1] if NULL, is a placeholder to later contain the swash computed
12206 * from av[0]. But if no further computation need be done, the
12207 * swash is stored there now.
12208 * av[2] stores the multicharacter foldings, used later in
12209 * regexec.c:S_reginclass().
12210 * av[3] stores the nonbitmap inversion list for use in addition or
12211 * instead of av[0]; not used if av[1] isn't NULL
12212 * av[4] is set if any component of the class is from a user-defined
12213 * property; not used if av[1] isn't NULL */
12214 AV * const av = newAV();
12217 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12221 av_store(av, 1, swash);
12222 SvREFCNT_dec(nonbitmap);
12225 av_store(av, 1, NULL);
12227 av_store(av, 3, nonbitmap);
12228 av_store(av, 4, newSVuv(has_user_defined_property));
12232 /* Store any computed multi-char folds only if we are allowing
12234 if (allow_full_fold) {
12235 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12236 if (unicode_alternate) { /* This node is variable length */
12241 av_store(av, 2, NULL);
12243 rv = newRV_noinc(MUTABLE_SV(av));
12244 n = add_data(pRExC_state, 1, "s");
12245 RExC_rxi->data->data[n] = (void*)rv;
12252 /* reg_skipcomment()
12254 Absorbs an /x style # comments from the input stream.
12255 Returns true if there is more text remaining in the stream.
12256 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12257 terminates the pattern without including a newline.
12259 Note its the callers responsibility to ensure that we are
12260 actually in /x mode
12265 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12269 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12271 while (RExC_parse < RExC_end)
12272 if (*RExC_parse++ == '\n') {
12277 /* we ran off the end of the pattern without ending
12278 the comment, so we have to add an \n when wrapping */
12279 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12287 Advances the parse position, and optionally absorbs
12288 "whitespace" from the inputstream.
12290 Without /x "whitespace" means (?#...) style comments only,
12291 with /x this means (?#...) and # comments and whitespace proper.
12293 Returns the RExC_parse point from BEFORE the scan occurs.
12295 This is the /x friendly way of saying RExC_parse++.
12299 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12301 char* const retval = RExC_parse++;
12303 PERL_ARGS_ASSERT_NEXTCHAR;
12306 if (RExC_end - RExC_parse >= 3
12307 && *RExC_parse == '('
12308 && RExC_parse[1] == '?'
12309 && RExC_parse[2] == '#')
12311 while (*RExC_parse != ')') {
12312 if (RExC_parse == RExC_end)
12313 FAIL("Sequence (?#... not terminated");
12319 if (RExC_flags & RXf_PMf_EXTENDED) {
12320 if (isSPACE(*RExC_parse)) {
12324 else if (*RExC_parse == '#') {
12325 if ( reg_skipcomment( pRExC_state ) )
12334 - reg_node - emit a node
12336 STATIC regnode * /* Location. */
12337 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12340 register regnode *ptr;
12341 regnode * const ret = RExC_emit;
12342 GET_RE_DEBUG_FLAGS_DECL;
12344 PERL_ARGS_ASSERT_REG_NODE;
12347 SIZE_ALIGN(RExC_size);
12351 if (RExC_emit >= RExC_emit_bound)
12352 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12353 op, RExC_emit, RExC_emit_bound);
12355 NODE_ALIGN_FILL(ret);
12357 FILL_ADVANCE_NODE(ptr, op);
12358 #ifdef RE_TRACK_PATTERN_OFFSETS
12359 if (RExC_offsets) { /* MJD */
12360 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12361 "reg_node", __LINE__,
12363 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12364 ? "Overwriting end of array!\n" : "OK",
12365 (UV)(RExC_emit - RExC_emit_start),
12366 (UV)(RExC_parse - RExC_start),
12367 (UV)RExC_offsets[0]));
12368 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12376 - reganode - emit a node with an argument
12378 STATIC regnode * /* Location. */
12379 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12382 register regnode *ptr;
12383 regnode * const ret = RExC_emit;
12384 GET_RE_DEBUG_FLAGS_DECL;
12386 PERL_ARGS_ASSERT_REGANODE;
12389 SIZE_ALIGN(RExC_size);
12394 assert(2==regarglen[op]+1);
12396 Anything larger than this has to allocate the extra amount.
12397 If we changed this to be:
12399 RExC_size += (1 + regarglen[op]);
12401 then it wouldn't matter. Its not clear what side effect
12402 might come from that so its not done so far.
12407 if (RExC_emit >= RExC_emit_bound)
12408 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12409 op, RExC_emit, RExC_emit_bound);
12411 NODE_ALIGN_FILL(ret);
12413 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12414 #ifdef RE_TRACK_PATTERN_OFFSETS
12415 if (RExC_offsets) { /* MJD */
12416 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12420 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12421 "Overwriting end of array!\n" : "OK",
12422 (UV)(RExC_emit - RExC_emit_start),
12423 (UV)(RExC_parse - RExC_start),
12424 (UV)RExC_offsets[0]));
12425 Set_Cur_Node_Offset;
12433 - reguni - emit (if appropriate) a Unicode character
12436 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12440 PERL_ARGS_ASSERT_REGUNI;
12442 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12446 - reginsert - insert an operator in front of already-emitted operand
12448 * Means relocating the operand.
12451 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12454 register regnode *src;
12455 register regnode *dst;
12456 register regnode *place;
12457 const int offset = regarglen[(U8)op];
12458 const int size = NODE_STEP_REGNODE + offset;
12459 GET_RE_DEBUG_FLAGS_DECL;
12461 PERL_ARGS_ASSERT_REGINSERT;
12462 PERL_UNUSED_ARG(depth);
12463 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12464 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12473 if (RExC_open_parens) {
12475 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12476 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12477 if ( RExC_open_parens[paren] >= opnd ) {
12478 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12479 RExC_open_parens[paren] += size;
12481 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12483 if ( RExC_close_parens[paren] >= opnd ) {
12484 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12485 RExC_close_parens[paren] += size;
12487 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12492 while (src > opnd) {
12493 StructCopy(--src, --dst, regnode);
12494 #ifdef RE_TRACK_PATTERN_OFFSETS
12495 if (RExC_offsets) { /* MJD 20010112 */
12496 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12500 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12501 ? "Overwriting end of array!\n" : "OK",
12502 (UV)(src - RExC_emit_start),
12503 (UV)(dst - RExC_emit_start),
12504 (UV)RExC_offsets[0]));
12505 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12506 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12512 place = opnd; /* Op node, where operand used to be. */
12513 #ifdef RE_TRACK_PATTERN_OFFSETS
12514 if (RExC_offsets) { /* MJD */
12515 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12519 (UV)(place - RExC_emit_start) > RExC_offsets[0]
12520 ? "Overwriting end of array!\n" : "OK",
12521 (UV)(place - RExC_emit_start),
12522 (UV)(RExC_parse - RExC_start),
12523 (UV)RExC_offsets[0]));
12524 Set_Node_Offset(place, RExC_parse);
12525 Set_Node_Length(place, 1);
12528 src = NEXTOPER(place);
12529 FILL_ADVANCE_NODE(place, op);
12530 Zero(src, offset, regnode);
12534 - regtail - set the next-pointer at the end of a node chain of p to val.
12535 - SEE ALSO: regtail_study
12537 /* TODO: All three parms should be const */
12539 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12542 register regnode *scan;
12543 GET_RE_DEBUG_FLAGS_DECL;
12545 PERL_ARGS_ASSERT_REGTAIL;
12547 PERL_UNUSED_ARG(depth);
12553 /* Find last node. */
12556 regnode * const temp = regnext(scan);
12558 SV * const mysv=sv_newmortal();
12559 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12560 regprop(RExC_rx, mysv, scan);
12561 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12562 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12563 (temp == NULL ? "->" : ""),
12564 (temp == NULL ? PL_reg_name[OP(val)] : "")
12572 if (reg_off_by_arg[OP(scan)]) {
12573 ARG_SET(scan, val - scan);
12576 NEXT_OFF(scan) = val - scan;
12582 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12583 - Look for optimizable sequences at the same time.
12584 - currently only looks for EXACT chains.
12586 This is experimental code. The idea is to use this routine to perform
12587 in place optimizations on branches and groups as they are constructed,
12588 with the long term intention of removing optimization from study_chunk so
12589 that it is purely analytical.
12591 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12592 to control which is which.
12595 /* TODO: All four parms should be const */
12598 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12601 register regnode *scan;
12603 #ifdef EXPERIMENTAL_INPLACESCAN
12606 GET_RE_DEBUG_FLAGS_DECL;
12608 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12614 /* Find last node. */
12618 regnode * const temp = regnext(scan);
12619 #ifdef EXPERIMENTAL_INPLACESCAN
12620 if (PL_regkind[OP(scan)] == EXACT) {
12621 bool has_exactf_sharp_s; /* Unexamined in this routine */
12622 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12627 switch (OP(scan)) {
12633 case EXACTFU_TRICKYFOLD:
12635 if( exact == PSEUDO )
12637 else if ( exact != OP(scan) )
12646 SV * const mysv=sv_newmortal();
12647 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12648 regprop(RExC_rx, mysv, scan);
12649 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12650 SvPV_nolen_const(mysv),
12651 REG_NODE_NUM(scan),
12652 PL_reg_name[exact]);
12659 SV * const mysv_val=sv_newmortal();
12660 DEBUG_PARSE_MSG("");
12661 regprop(RExC_rx, mysv_val, val);
12662 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12663 SvPV_nolen_const(mysv_val),
12664 (IV)REG_NODE_NUM(val),
12668 if (reg_off_by_arg[OP(scan)]) {
12669 ARG_SET(scan, val - scan);
12672 NEXT_OFF(scan) = val - scan;
12680 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12684 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12690 for (bit=0; bit<32; bit++) {
12691 if (flags & (1<<bit)) {
12692 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12695 if (!set++ && lead)
12696 PerlIO_printf(Perl_debug_log, "%s",lead);
12697 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12700 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12701 if (!set++ && lead) {
12702 PerlIO_printf(Perl_debug_log, "%s",lead);
12705 case REGEX_UNICODE_CHARSET:
12706 PerlIO_printf(Perl_debug_log, "UNICODE");
12708 case REGEX_LOCALE_CHARSET:
12709 PerlIO_printf(Perl_debug_log, "LOCALE");
12711 case REGEX_ASCII_RESTRICTED_CHARSET:
12712 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12714 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12715 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12718 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12724 PerlIO_printf(Perl_debug_log, "\n");
12726 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12732 Perl_regdump(pTHX_ const regexp *r)
12736 SV * const sv = sv_newmortal();
12737 SV *dsv= sv_newmortal();
12738 RXi_GET_DECL(r,ri);
12739 GET_RE_DEBUG_FLAGS_DECL;
12741 PERL_ARGS_ASSERT_REGDUMP;
12743 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12745 /* Header fields of interest. */
12746 if (r->anchored_substr) {
12747 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12748 RE_SV_DUMPLEN(r->anchored_substr), 30);
12749 PerlIO_printf(Perl_debug_log,
12750 "anchored %s%s at %"IVdf" ",
12751 s, RE_SV_TAIL(r->anchored_substr),
12752 (IV)r->anchored_offset);
12753 } else if (r->anchored_utf8) {
12754 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12755 RE_SV_DUMPLEN(r->anchored_utf8), 30);
12756 PerlIO_printf(Perl_debug_log,
12757 "anchored utf8 %s%s at %"IVdf" ",
12758 s, RE_SV_TAIL(r->anchored_utf8),
12759 (IV)r->anchored_offset);
12761 if (r->float_substr) {
12762 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12763 RE_SV_DUMPLEN(r->float_substr), 30);
12764 PerlIO_printf(Perl_debug_log,
12765 "floating %s%s at %"IVdf"..%"UVuf" ",
12766 s, RE_SV_TAIL(r->float_substr),
12767 (IV)r->float_min_offset, (UV)r->float_max_offset);
12768 } else if (r->float_utf8) {
12769 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12770 RE_SV_DUMPLEN(r->float_utf8), 30);
12771 PerlIO_printf(Perl_debug_log,
12772 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12773 s, RE_SV_TAIL(r->float_utf8),
12774 (IV)r->float_min_offset, (UV)r->float_max_offset);
12776 if (r->check_substr || r->check_utf8)
12777 PerlIO_printf(Perl_debug_log,
12779 (r->check_substr == r->float_substr
12780 && r->check_utf8 == r->float_utf8
12781 ? "(checking floating" : "(checking anchored"));
12782 if (r->extflags & RXf_NOSCAN)
12783 PerlIO_printf(Perl_debug_log, " noscan");
12784 if (r->extflags & RXf_CHECK_ALL)
12785 PerlIO_printf(Perl_debug_log, " isall");
12786 if (r->check_substr || r->check_utf8)
12787 PerlIO_printf(Perl_debug_log, ") ");
12789 if (ri->regstclass) {
12790 regprop(r, sv, ri->regstclass);
12791 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12793 if (r->extflags & RXf_ANCH) {
12794 PerlIO_printf(Perl_debug_log, "anchored");
12795 if (r->extflags & RXf_ANCH_BOL)
12796 PerlIO_printf(Perl_debug_log, "(BOL)");
12797 if (r->extflags & RXf_ANCH_MBOL)
12798 PerlIO_printf(Perl_debug_log, "(MBOL)");
12799 if (r->extflags & RXf_ANCH_SBOL)
12800 PerlIO_printf(Perl_debug_log, "(SBOL)");
12801 if (r->extflags & RXf_ANCH_GPOS)
12802 PerlIO_printf(Perl_debug_log, "(GPOS)");
12803 PerlIO_putc(Perl_debug_log, ' ');
12805 if (r->extflags & RXf_GPOS_SEEN)
12806 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12807 if (r->intflags & PREGf_SKIP)
12808 PerlIO_printf(Perl_debug_log, "plus ");
12809 if (r->intflags & PREGf_IMPLICIT)
12810 PerlIO_printf(Perl_debug_log, "implicit ");
12811 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12812 if (r->extflags & RXf_EVAL_SEEN)
12813 PerlIO_printf(Perl_debug_log, "with eval ");
12814 PerlIO_printf(Perl_debug_log, "\n");
12815 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12817 PERL_ARGS_ASSERT_REGDUMP;
12818 PERL_UNUSED_CONTEXT;
12819 PERL_UNUSED_ARG(r);
12820 #endif /* DEBUGGING */
12824 - regprop - printable representation of opcode
12826 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12829 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12830 if (flags & ANYOF_INVERT) \
12831 /*make sure the invert info is in each */ \
12832 sv_catpvs(sv, "^"); \
12838 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12843 RXi_GET_DECL(prog,progi);
12844 GET_RE_DEBUG_FLAGS_DECL;
12846 PERL_ARGS_ASSERT_REGPROP;
12850 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
12851 /* It would be nice to FAIL() here, but this may be called from
12852 regexec.c, and it would be hard to supply pRExC_state. */
12853 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12854 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12856 k = PL_regkind[OP(o)];
12859 sv_catpvs(sv, " ");
12860 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12861 * is a crude hack but it may be the best for now since
12862 * we have no flag "this EXACTish node was UTF-8"
12864 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12865 PERL_PV_ESCAPE_UNI_DETECT |
12866 PERL_PV_ESCAPE_NONASCII |
12867 PERL_PV_PRETTY_ELLIPSES |
12868 PERL_PV_PRETTY_LTGT |
12869 PERL_PV_PRETTY_NOCLEAR
12871 } else if (k == TRIE) {
12872 /* print the details of the trie in dumpuntil instead, as
12873 * progi->data isn't available here */
12874 const char op = OP(o);
12875 const U32 n = ARG(o);
12876 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12877 (reg_ac_data *)progi->data->data[n] :
12879 const reg_trie_data * const trie
12880 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12882 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12883 DEBUG_TRIE_COMPILE_r(
12884 Perl_sv_catpvf(aTHX_ sv,
12885 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12886 (UV)trie->startstate,
12887 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12888 (UV)trie->wordcount,
12891 (UV)TRIE_CHARCOUNT(trie),
12892 (UV)trie->uniquecharcount
12895 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12897 int rangestart = -1;
12898 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12899 sv_catpvs(sv, "[");
12900 for (i = 0; i <= 256; i++) {
12901 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12902 if (rangestart == -1)
12904 } else if (rangestart != -1) {
12905 if (i <= rangestart + 3)
12906 for (; rangestart < i; rangestart++)
12907 put_byte(sv, rangestart);
12909 put_byte(sv, rangestart);
12910 sv_catpvs(sv, "-");
12911 put_byte(sv, i - 1);
12916 sv_catpvs(sv, "]");
12919 } else if (k == CURLY) {
12920 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12921 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12922 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12924 else if (k == WHILEM && o->flags) /* Ordinal/of */
12925 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12926 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12927 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
12928 if ( RXp_PAREN_NAMES(prog) ) {
12929 if ( k != REF || (OP(o) < NREF)) {
12930 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12931 SV **name= av_fetch(list, ARG(o), 0 );
12933 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12936 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12937 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12938 I32 *nums=(I32*)SvPVX(sv_dat);
12939 SV **name= av_fetch(list, nums[0], 0 );
12942 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12943 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12944 (n ? "," : ""), (IV)nums[n]);
12946 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12950 } else if (k == GOSUB)
12951 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12952 else if (k == VERB) {
12954 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12955 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12956 } else if (k == LOGICAL)
12957 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
12958 else if (k == ANYOF) {
12959 int i, rangestart = -1;
12960 const U8 flags = ANYOF_FLAGS(o);
12963 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12964 static const char * const anyofs[] = {
12997 if (flags & ANYOF_LOCALE)
12998 sv_catpvs(sv, "{loc}");
12999 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13000 sv_catpvs(sv, "{i}");
13001 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13002 if (flags & ANYOF_INVERT)
13003 sv_catpvs(sv, "^");
13005 /* output what the standard cp 0-255 bitmap matches */
13006 for (i = 0; i <= 256; i++) {
13007 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13008 if (rangestart == -1)
13010 } else if (rangestart != -1) {
13011 if (i <= rangestart + 3)
13012 for (; rangestart < i; rangestart++)
13013 put_byte(sv, rangestart);
13015 put_byte(sv, rangestart);
13016 sv_catpvs(sv, "-");
13017 put_byte(sv, i - 1);
13024 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13025 /* output any special charclass tests (used entirely under use locale) */
13026 if (ANYOF_CLASS_TEST_ANY_SET(o))
13027 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13028 if (ANYOF_CLASS_TEST(o,i)) {
13029 sv_catpv(sv, anyofs[i]);
13033 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13035 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13036 sv_catpvs(sv, "{non-utf8-latin1-all}");
13039 /* output information about the unicode matching */
13040 if (flags & ANYOF_UNICODE_ALL)
13041 sv_catpvs(sv, "{unicode_all}");
13042 else if (ANYOF_NONBITMAP(o))
13043 sv_catpvs(sv, "{unicode}");
13044 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13045 sv_catpvs(sv, "{outside bitmap}");
13047 if (ANYOF_NONBITMAP(o)) {
13048 SV *lv; /* Set if there is something outside the bit map */
13049 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13050 bool byte_output = FALSE; /* If something in the bitmap has been
13053 if (lv && lv != &PL_sv_undef) {
13055 U8 s[UTF8_MAXBYTES_CASE+1];
13057 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13058 uvchr_to_utf8(s, i);
13061 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13065 && swash_fetch(sw, s, TRUE))
13067 if (rangestart == -1)
13069 } else if (rangestart != -1) {
13070 byte_output = TRUE;
13071 if (i <= rangestart + 3)
13072 for (; rangestart < i; rangestart++) {
13073 put_byte(sv, rangestart);
13076 put_byte(sv, rangestart);
13077 sv_catpvs(sv, "-");
13086 char *s = savesvpv(lv);
13087 char * const origs = s;
13089 while (*s && *s != '\n')
13093 const char * const t = ++s;
13096 sv_catpvs(sv, " ");
13102 /* Truncate very long output */
13103 if (s - origs > 256) {
13104 Perl_sv_catpvf(aTHX_ sv,
13106 (int) (s - origs - 1),
13112 else if (*s == '\t') {
13131 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13133 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13134 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13136 PERL_UNUSED_CONTEXT;
13137 PERL_UNUSED_ARG(sv);
13138 PERL_UNUSED_ARG(o);
13139 PERL_UNUSED_ARG(prog);
13140 #endif /* DEBUGGING */
13144 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13145 { /* Assume that RE_INTUIT is set */
13147 struct regexp *const prog = (struct regexp *)SvANY(r);
13148 GET_RE_DEBUG_FLAGS_DECL;
13150 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13151 PERL_UNUSED_CONTEXT;
13155 const char * const s = SvPV_nolen_const(prog->check_substr
13156 ? prog->check_substr : prog->check_utf8);
13158 if (!PL_colorset) reginitcolors();
13159 PerlIO_printf(Perl_debug_log,
13160 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13162 prog->check_substr ? "" : "utf8 ",
13163 PL_colors[5],PL_colors[0],
13166 (strlen(s) > 60 ? "..." : ""));
13169 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13175 handles refcounting and freeing the perl core regexp structure. When
13176 it is necessary to actually free the structure the first thing it
13177 does is call the 'free' method of the regexp_engine associated to
13178 the regexp, allowing the handling of the void *pprivate; member
13179 first. (This routine is not overridable by extensions, which is why
13180 the extensions free is called first.)
13182 See regdupe and regdupe_internal if you change anything here.
13184 #ifndef PERL_IN_XSUB_RE
13186 Perl_pregfree(pTHX_ REGEXP *r)
13192 Perl_pregfree2(pTHX_ REGEXP *rx)
13195 struct regexp *const r = (struct regexp *)SvANY(rx);
13196 GET_RE_DEBUG_FLAGS_DECL;
13198 PERL_ARGS_ASSERT_PREGFREE2;
13200 if (r->mother_re) {
13201 ReREFCNT_dec(r->mother_re);
13203 CALLREGFREE_PVT(rx); /* free the private data */
13204 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13207 SvREFCNT_dec(r->anchored_substr);
13208 SvREFCNT_dec(r->anchored_utf8);
13209 SvREFCNT_dec(r->float_substr);
13210 SvREFCNT_dec(r->float_utf8);
13211 Safefree(r->substrs);
13213 RX_MATCH_COPY_FREE(rx);
13214 #ifdef PERL_OLD_COPY_ON_WRITE
13215 SvREFCNT_dec(r->saved_copy);
13218 SvREFCNT_dec(r->qr_anoncv);
13223 This is a hacky workaround to the structural issue of match results
13224 being stored in the regexp structure which is in turn stored in
13225 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13226 could be PL_curpm in multiple contexts, and could require multiple
13227 result sets being associated with the pattern simultaneously, such
13228 as when doing a recursive match with (??{$qr})
13230 The solution is to make a lightweight copy of the regexp structure
13231 when a qr// is returned from the code executed by (??{$qr}) this
13232 lightweight copy doesn't actually own any of its data except for
13233 the starp/end and the actual regexp structure itself.
13239 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13241 struct regexp *ret;
13242 struct regexp *const r = (struct regexp *)SvANY(rx);
13243 register const I32 npar = r->nparens+1;
13245 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13248 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13249 ret = (struct regexp *)SvANY(ret_x);
13251 (void)ReREFCNT_inc(rx);
13252 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13253 by pointing directly at the buffer, but flagging that the allocated
13254 space in the copy is zero. As we've just done a struct copy, it's now
13255 a case of zero-ing that, rather than copying the current length. */
13256 SvPV_set(ret_x, RX_WRAPPED(rx));
13257 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13258 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13259 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13260 SvLEN_set(ret_x, 0);
13261 SvSTASH_set(ret_x, NULL);
13262 SvMAGIC_set(ret_x, NULL);
13263 Newx(ret->offs, npar, regexp_paren_pair);
13264 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13266 Newx(ret->substrs, 1, struct reg_substr_data);
13267 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13269 SvREFCNT_inc_void(ret->anchored_substr);
13270 SvREFCNT_inc_void(ret->anchored_utf8);
13271 SvREFCNT_inc_void(ret->float_substr);
13272 SvREFCNT_inc_void(ret->float_utf8);
13274 /* check_substr and check_utf8, if non-NULL, point to either their
13275 anchored or float namesakes, and don't hold a second reference. */
13277 RX_MATCH_COPIED_off(ret_x);
13278 #ifdef PERL_OLD_COPY_ON_WRITE
13279 ret->saved_copy = NULL;
13281 ret->mother_re = rx;
13282 SvREFCNT_inc_void(ret->qr_anoncv);
13288 /* regfree_internal()
13290 Free the private data in a regexp. This is overloadable by
13291 extensions. Perl takes care of the regexp structure in pregfree(),
13292 this covers the *pprivate pointer which technically perl doesn't
13293 know about, however of course we have to handle the
13294 regexp_internal structure when no extension is in use.
13296 Note this is called before freeing anything in the regexp
13301 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13304 struct regexp *const r = (struct regexp *)SvANY(rx);
13305 RXi_GET_DECL(r,ri);
13306 GET_RE_DEBUG_FLAGS_DECL;
13308 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13314 SV *dsv= sv_newmortal();
13315 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13316 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13317 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13318 PL_colors[4],PL_colors[5],s);
13321 #ifdef RE_TRACK_PATTERN_OFFSETS
13323 Safefree(ri->u.offsets); /* 20010421 MJD */
13325 if (ri->code_blocks) {
13327 for (n = 0; n < ri->num_code_blocks; n++)
13328 SvREFCNT_dec(ri->code_blocks[n].src_regex);
13329 Safefree(ri->code_blocks);
13333 int n = ri->data->count;
13334 PAD* new_comppad = NULL;
13339 /* If you add a ->what type here, update the comment in regcomp.h */
13340 switch (ri->data->what[n]) {
13346 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13349 Safefree(ri->data->data[n]);
13352 new_comppad = MUTABLE_AV(ri->data->data[n]);
13355 if (new_comppad == NULL)
13356 Perl_croak(aTHX_ "panic: pregfree comppad");
13357 PAD_SAVE_LOCAL(old_comppad,
13358 /* Watch out for global destruction's random ordering. */
13359 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
13362 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
13365 op_free((OP_4tree*)ri->data->data[n]);
13367 PAD_RESTORE_LOCAL(old_comppad);
13368 SvREFCNT_dec(MUTABLE_SV(new_comppad));
13369 new_comppad = NULL;
13376 { /* Aho Corasick add-on structure for a trie node.
13377 Used in stclass optimization only */
13379 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13381 refcount = --aho->refcount;
13384 PerlMemShared_free(aho->states);
13385 PerlMemShared_free(aho->fail);
13386 /* do this last!!!! */
13387 PerlMemShared_free(ri->data->data[n]);
13388 PerlMemShared_free(ri->regstclass);
13394 /* trie structure. */
13396 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13398 refcount = --trie->refcount;
13401 PerlMemShared_free(trie->charmap);
13402 PerlMemShared_free(trie->states);
13403 PerlMemShared_free(trie->trans);
13405 PerlMemShared_free(trie->bitmap);
13407 PerlMemShared_free(trie->jump);
13408 PerlMemShared_free(trie->wordinfo);
13409 /* do this last!!!! */
13410 PerlMemShared_free(ri->data->data[n]);
13415 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13418 Safefree(ri->data->what);
13419 Safefree(ri->data);
13425 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13426 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13427 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
13430 re_dup - duplicate a regexp.
13432 This routine is expected to clone a given regexp structure. It is only
13433 compiled under USE_ITHREADS.
13435 After all of the core data stored in struct regexp is duplicated
13436 the regexp_engine.dupe method is used to copy any private data
13437 stored in the *pprivate pointer. This allows extensions to handle
13438 any duplication it needs to do.
13440 See pregfree() and regfree_internal() if you change anything here.
13442 #if defined(USE_ITHREADS)
13443 #ifndef PERL_IN_XSUB_RE
13445 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13449 const struct regexp *r = (const struct regexp *)SvANY(sstr);
13450 struct regexp *ret = (struct regexp *)SvANY(dstr);
13452 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13454 npar = r->nparens+1;
13455 Newx(ret->offs, npar, regexp_paren_pair);
13456 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13458 /* no need to copy these */
13459 Newx(ret->swap, npar, regexp_paren_pair);
13462 if (ret->substrs) {
13463 /* Do it this way to avoid reading from *r after the StructCopy().
13464 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13465 cache, it doesn't matter. */
13466 const bool anchored = r->check_substr
13467 ? r->check_substr == r->anchored_substr
13468 : r->check_utf8 == r->anchored_utf8;
13469 Newx(ret->substrs, 1, struct reg_substr_data);
13470 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13472 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13473 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13474 ret->float_substr = sv_dup_inc(ret->float_substr, param);
13475 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13477 /* check_substr and check_utf8, if non-NULL, point to either their
13478 anchored or float namesakes, and don't hold a second reference. */
13480 if (ret->check_substr) {
13482 assert(r->check_utf8 == r->anchored_utf8);
13483 ret->check_substr = ret->anchored_substr;
13484 ret->check_utf8 = ret->anchored_utf8;
13486 assert(r->check_substr == r->float_substr);
13487 assert(r->check_utf8 == r->float_utf8);
13488 ret->check_substr = ret->float_substr;
13489 ret->check_utf8 = ret->float_utf8;
13491 } else if (ret->check_utf8) {
13493 ret->check_utf8 = ret->anchored_utf8;
13495 ret->check_utf8 = ret->float_utf8;
13500 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13501 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13504 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13506 if (RX_MATCH_COPIED(dstr))
13507 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
13509 ret->subbeg = NULL;
13510 #ifdef PERL_OLD_COPY_ON_WRITE
13511 ret->saved_copy = NULL;
13514 if (ret->mother_re) {
13515 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13516 /* Our storage points directly to our mother regexp, but that's
13517 1: a buffer in a different thread
13518 2: something we no longer hold a reference on
13519 so we need to copy it locally. */
13520 /* Note we need to use SvCUR(), rather than
13521 SvLEN(), on our mother_re, because it, in
13522 turn, may well be pointing to its own mother_re. */
13523 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13524 SvCUR(ret->mother_re)+1));
13525 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13527 ret->mother_re = NULL;
13531 #endif /* PERL_IN_XSUB_RE */
13536 This is the internal complement to regdupe() which is used to copy
13537 the structure pointed to by the *pprivate pointer in the regexp.
13538 This is the core version of the extension overridable cloning hook.
13539 The regexp structure being duplicated will be copied by perl prior
13540 to this and will be provided as the regexp *r argument, however
13541 with the /old/ structures pprivate pointer value. Thus this routine
13542 may override any copying normally done by perl.
13544 It returns a pointer to the new regexp_internal structure.
13548 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13551 struct regexp *const r = (struct regexp *)SvANY(rx);
13552 regexp_internal *reti;
13554 RXi_GET_DECL(r,ri);
13556 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13560 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13561 Copy(ri->program, reti->program, len+1, regnode);
13563 reti->num_code_blocks = ri->num_code_blocks;
13564 if (ri->code_blocks) {
13566 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13567 struct reg_code_block);
13568 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13569 struct reg_code_block);
13570 for (n = 0; n < ri->num_code_blocks; n++)
13571 reti->code_blocks[n].src_regex = (REGEXP*)
13572 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13575 reti->code_blocks = NULL;
13577 reti->regstclass = NULL;
13580 struct reg_data *d;
13581 const int count = ri->data->count;
13584 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13585 char, struct reg_data);
13586 Newx(d->what, count, U8);
13589 for (i = 0; i < count; i++) {
13590 d->what[i] = ri->data->what[i];
13591 switch (d->what[i]) {
13592 /* legal options are one of: sSfpontTua
13593 see also regcomp.h and pregfree() */
13594 case 'a': /* actually an AV, but the dup function is identical. */
13598 case 'p': /* actually an AV, but the dup function is identical. */
13599 case 'u': /* actually an HV, but the dup function is identical. */
13600 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13603 /* This is cheating. */
13604 Newx(d->data[i], 1, struct regnode_charclass_class);
13605 StructCopy(ri->data->data[i], d->data[i],
13606 struct regnode_charclass_class);
13607 reti->regstclass = (regnode*)d->data[i];
13610 /* Compiled op trees are readonly and in shared memory,
13611 and can thus be shared without duplication. */
13613 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13617 /* Trie stclasses are readonly and can thus be shared
13618 * without duplication. We free the stclass in pregfree
13619 * when the corresponding reg_ac_data struct is freed.
13621 reti->regstclass= ri->regstclass;
13625 ((reg_trie_data*)ri->data->data[i])->refcount++;
13631 d->data[i] = ri->data->data[i];
13634 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13643 reti->name_list_idx = ri->name_list_idx;
13645 #ifdef RE_TRACK_PATTERN_OFFSETS
13646 if (ri->u.offsets) {
13647 Newx(reti->u.offsets, 2*len+1, U32);
13648 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13651 SetProgLen(reti,len);
13654 return (void*)reti;
13657 #endif /* USE_ITHREADS */
13659 #ifndef PERL_IN_XSUB_RE
13662 - regnext - dig the "next" pointer out of a node
13665 Perl_regnext(pTHX_ register regnode *p)
13668 register I32 offset;
13673 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13674 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13677 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13686 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13689 STRLEN l1 = strlen(pat1);
13690 STRLEN l2 = strlen(pat2);
13693 const char *message;
13695 PERL_ARGS_ASSERT_RE_CROAK2;
13701 Copy(pat1, buf, l1 , char);
13702 Copy(pat2, buf + l1, l2 , char);
13703 buf[l1 + l2] = '\n';
13704 buf[l1 + l2 + 1] = '\0';
13706 /* ANSI variant takes additional second argument */
13707 va_start(args, pat2);
13711 msv = vmess(buf, &args);
13713 message = SvPV_const(msv,l1);
13716 Copy(message, buf, l1 , char);
13717 buf[l1-1] = '\0'; /* Overwrite \n */
13718 Perl_croak(aTHX_ "%s", buf);
13721 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13723 #ifndef PERL_IN_XSUB_RE
13725 Perl_save_re_context(pTHX)
13729 struct re_save_state *state;
13731 SAVEVPTR(PL_curcop);
13732 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13734 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13735 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13736 SSPUSHUV(SAVEt_RE_STATE);
13738 Copy(&PL_reg_state, state, 1, struct re_save_state);
13740 PL_reg_start_tmp = 0;
13741 PL_reg_start_tmpl = 0;
13742 PL_reg_oldsaved = NULL;
13743 PL_reg_oldsavedlen = 0;
13744 PL_reg_maxiter = 0;
13745 PL_reg_leftiter = 0;
13746 PL_reg_poscache = NULL;
13747 PL_reg_poscache_size = 0;
13748 #ifdef PERL_OLD_COPY_ON_WRITE
13752 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13754 const REGEXP * const rx = PM_GETRE(PL_curpm);
13757 for (i = 1; i <= RX_NPARENS(rx); i++) {
13758 char digits[TYPE_CHARS(long)];
13759 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13760 GV *const *const gvp
13761 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13764 GV * const gv = *gvp;
13765 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13775 clear_re(pTHX_ void *r)
13778 ReREFCNT_dec((REGEXP *)r);
13784 S_put_byte(pTHX_ SV *sv, int c)
13786 PERL_ARGS_ASSERT_PUT_BYTE;
13788 /* Our definition of isPRINT() ignores locales, so only bytes that are
13789 not part of UTF-8 are considered printable. I assume that the same
13790 holds for UTF-EBCDIC.
13791 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13792 which Wikipedia says:
13794 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13795 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13796 identical, to the ASCII delete (DEL) or rubout control character.
13797 ) So the old condition can be simplified to !isPRINT(c) */
13800 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13803 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13807 const char string = c;
13808 if (c == '-' || c == ']' || c == '\\' || c == '^')
13809 sv_catpvs(sv, "\\");
13810 sv_catpvn(sv, &string, 1);
13815 #define CLEAR_OPTSTART \
13816 if (optstart) STMT_START { \
13817 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13821 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13823 STATIC const regnode *
13824 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13825 const regnode *last, const regnode *plast,
13826 SV* sv, I32 indent, U32 depth)
13829 register U8 op = PSEUDO; /* Arbitrary non-END op. */
13830 register const regnode *next;
13831 const regnode *optstart= NULL;
13833 RXi_GET_DECL(r,ri);
13834 GET_RE_DEBUG_FLAGS_DECL;
13836 PERL_ARGS_ASSERT_DUMPUNTIL;
13838 #ifdef DEBUG_DUMPUNTIL
13839 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13840 last ? last-start : 0,plast ? plast-start : 0);
13843 if (plast && plast < last)
13846 while (PL_regkind[op] != END && (!last || node < last)) {
13847 /* While that wasn't END last time... */
13850 if (op == CLOSE || op == WHILEM)
13852 next = regnext((regnode *)node);
13855 if (OP(node) == OPTIMIZED) {
13856 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13863 regprop(r, sv, node);
13864 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13865 (int)(2*indent + 1), "", SvPVX_const(sv));
13867 if (OP(node) != OPTIMIZED) {
13868 if (next == NULL) /* Next ptr. */
13869 PerlIO_printf(Perl_debug_log, " (0)");
13870 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13871 PerlIO_printf(Perl_debug_log, " (FAIL)");
13873 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13874 (void)PerlIO_putc(Perl_debug_log, '\n');
13878 if (PL_regkind[(U8)op] == BRANCHJ) {
13881 register const regnode *nnode = (OP(next) == LONGJMP
13882 ? regnext((regnode *)next)
13884 if (last && nnode > last)
13886 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13889 else if (PL_regkind[(U8)op] == BRANCH) {
13891 DUMPUNTIL(NEXTOPER(node), next);
13893 else if ( PL_regkind[(U8)op] == TRIE ) {
13894 const regnode *this_trie = node;
13895 const char op = OP(node);
13896 const U32 n = ARG(node);
13897 const reg_ac_data * const ac = op>=AHOCORASICK ?
13898 (reg_ac_data *)ri->data->data[n] :
13900 const reg_trie_data * const trie =
13901 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13903 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13905 const regnode *nextbranch= NULL;
13908 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13909 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13911 PerlIO_printf(Perl_debug_log, "%*s%s ",
13912 (int)(2*(indent+3)), "",
13913 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13914 PL_colors[0], PL_colors[1],
13915 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13916 PERL_PV_PRETTY_ELLIPSES |
13917 PERL_PV_PRETTY_LTGT
13922 U16 dist= trie->jump[word_idx+1];
13923 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13924 (UV)((dist ? this_trie + dist : next) - start));
13927 nextbranch= this_trie + trie->jump[0];
13928 DUMPUNTIL(this_trie + dist, nextbranch);
13930 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13931 nextbranch= regnext((regnode *)nextbranch);
13933 PerlIO_printf(Perl_debug_log, "\n");
13936 if (last && next > last)
13941 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13942 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13943 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13945 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13947 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13949 else if ( op == PLUS || op == STAR) {
13950 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13952 else if (PL_regkind[(U8)op] == ANYOF) {
13953 /* arglen 1 + class block */
13954 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13955 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13956 node = NEXTOPER(node);
13958 else if (PL_regkind[(U8)op] == EXACT) {
13959 /* Literal string, where present. */
13960 node += NODE_SZ_STR(node) - 1;
13961 node = NEXTOPER(node);
13964 node = NEXTOPER(node);
13965 node += regarglen[(U8)op];
13967 if (op == CURLYX || op == OPEN)
13971 #ifdef DEBUG_DUMPUNTIL
13972 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13977 #endif /* DEBUGGING */
13981 * c-indentation-style: bsd
13982 * c-basic-offset: 4
13983 * indent-tabs-mode: nil
13986 * ex: set ts=8 sts=4 sw=4 et: