5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
88 #include "dquote_static.c"
89 #ifndef PERL_IN_XSUB_RE
90 # include "charclass_invlists.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
100 # if defined(BUGGY_MSC6)
101 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
102 # pragma optimize("a",off)
103 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
104 # pragma optimize("w",on )
105 # endif /* BUGGY_MSC6 */
109 #define STATIC static
113 typedef struct RExC_state_t {
114 U32 flags; /* RXf_* are we folding, multilining? */
115 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
116 char *precomp; /* uncompiled string. */
117 REGEXP *rx_sv; /* The SV that is the regexp. */
118 regexp *rx; /* perl core regexp structure */
119 regexp_internal *rxi; /* internal data for regexp object pprivate field */
120 char *start; /* Start of input for compile */
121 char *end; /* End of input for compile */
122 char *parse; /* Input-scan pointer. */
123 I32 whilem_seen; /* number of WHILEM in this expr */
124 regnode *emit_start; /* Start of emitted-code area */
125 regnode *emit_bound; /* First regnode outside of the allocated space */
126 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 I32 size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN). */
132 I32 cpar; /* Capture buffer count, (CLOSE). */
133 I32 nestroot; /* root parens we are in - used by accept */
137 regnode **open_parens; /* pointers to open parens */
138 regnode **close_parens; /* pointers to close parens */
139 regnode *opend; /* END node in program */
140 I32 utf8; /* whether the pattern is utf8 or not */
141 I32 orig_utf8; /* whether the pattern was originally in utf8 */
142 /* XXX use this for future optimisation of case
143 * where pattern must be upgraded to utf8. */
144 I32 uni_semantics; /* If a d charset modifier should use unicode
145 rules, even if the pattern is not in
147 HV *paren_names; /* Paren names */
149 regnode **recurse; /* Recurse regops */
150 I32 recurse_count; /* Number of recurse regops */
153 I32 override_recoding;
154 struct reg_code_block *code_blocks; /* positions of literal (?{})
156 int num_code_blocks; /* size of code_blocks[] */
157 int code_index; /* next code_blocks[] slot */
159 char *starttry; /* -Dr: where regtry was called. */
160 #define RExC_starttry (pRExC_state->starttry)
163 const char *lastparse;
165 AV *paren_name_list; /* idx -> name */
166 #define RExC_lastparse (pRExC_state->lastparse)
167 #define RExC_lastnum (pRExC_state->lastnum)
168 #define RExC_paren_name_list (pRExC_state->paren_name_list)
172 #define RExC_flags (pRExC_state->flags)
173 #define RExC_pm_flags (pRExC_state->pm_flags)
174 #define RExC_precomp (pRExC_state->precomp)
175 #define RExC_rx_sv (pRExC_state->rx_sv)
176 #define RExC_rx (pRExC_state->rx)
177 #define RExC_rxi (pRExC_state->rxi)
178 #define RExC_start (pRExC_state->start)
179 #define RExC_end (pRExC_state->end)
180 #define RExC_parse (pRExC_state->parse)
181 #define RExC_whilem_seen (pRExC_state->whilem_seen)
182 #ifdef RE_TRACK_PATTERN_OFFSETS
183 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
185 #define RExC_emit (pRExC_state->emit)
186 #define RExC_emit_start (pRExC_state->emit_start)
187 #define RExC_emit_bound (pRExC_state->emit_bound)
188 #define RExC_naughty (pRExC_state->naughty)
189 #define RExC_sawback (pRExC_state->sawback)
190 #define RExC_seen (pRExC_state->seen)
191 #define RExC_size (pRExC_state->size)
192 #define RExC_npar (pRExC_state->npar)
193 #define RExC_nestroot (pRExC_state->nestroot)
194 #define RExC_extralen (pRExC_state->extralen)
195 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
196 #define RExC_seen_evals (pRExC_state->seen_evals)
197 #define RExC_utf8 (pRExC_state->utf8)
198 #define RExC_uni_semantics (pRExC_state->uni_semantics)
199 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
200 #define RExC_open_parens (pRExC_state->open_parens)
201 #define RExC_close_parens (pRExC_state->close_parens)
202 #define RExC_opend (pRExC_state->opend)
203 #define RExC_paren_names (pRExC_state->paren_names)
204 #define RExC_recurse (pRExC_state->recurse)
205 #define RExC_recurse_count (pRExC_state->recurse_count)
206 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
207 #define RExC_contains_locale (pRExC_state->contains_locale)
208 #define RExC_override_recoding (pRExC_state->override_recoding)
211 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
212 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
213 ((*s) == '{' && regcurly(s)))
216 #undef SPSTART /* dratted cpp namespace... */
219 * Flags to be passed up and down.
221 #define WORST 0 /* Worst case. */
222 #define HASWIDTH 0x01 /* Known to match non-null strings. */
224 /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
225 * character, and if utf8, must be invariant. Note that this is not the same
226 * thing as REGNODE_SIMPLE */
228 #define SPSTART 0x04 /* Starts with * or +. */
229 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
230 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
232 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
234 /* whether trie related optimizations are enabled */
235 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
236 #define TRIE_STUDY_OPT
237 #define FULL_TRIE_STUDY
243 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
244 #define PBITVAL(paren) (1 << ((paren) & 7))
245 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
246 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
247 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
249 /* If not already in utf8, do a longjmp back to the beginning */
250 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
251 #define REQUIRE_UTF8 STMT_START { \
252 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
255 /* About scan_data_t.
257 During optimisation we recurse through the regexp program performing
258 various inplace (keyhole style) optimisations. In addition study_chunk
259 and scan_commit populate this data structure with information about
260 what strings MUST appear in the pattern. We look for the longest
261 string that must appear at a fixed location, and we look for the
262 longest string that may appear at a floating location. So for instance
267 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
268 strings (because they follow a .* construct). study_chunk will identify
269 both FOO and BAR as being the longest fixed and floating strings respectively.
271 The strings can be composites, for instance
275 will result in a composite fixed substring 'foo'.
277 For each string some basic information is maintained:
279 - offset or min_offset
280 This is the position the string must appear at, or not before.
281 It also implicitly (when combined with minlenp) tells us how many
282 characters must match before the string we are searching for.
283 Likewise when combined with minlenp and the length of the string it
284 tells us how many characters must appear after the string we have
288 Only used for floating strings. This is the rightmost point that
289 the string can appear at. If set to I32 max it indicates that the
290 string can occur infinitely far to the right.
293 A pointer to the minimum length of the pattern that the string
294 was found inside. This is important as in the case of positive
295 lookahead or positive lookbehind we can have multiple patterns
300 The minimum length of the pattern overall is 3, the minimum length
301 of the lookahead part is 3, but the minimum length of the part that
302 will actually match is 1. So 'FOO's minimum length is 3, but the
303 minimum length for the F is 1. This is important as the minimum length
304 is used to determine offsets in front of and behind the string being
305 looked for. Since strings can be composites this is the length of the
306 pattern at the time it was committed with a scan_commit. Note that
307 the length is calculated by study_chunk, so that the minimum lengths
308 are not known until the full pattern has been compiled, thus the
309 pointer to the value.
313 In the case of lookbehind the string being searched for can be
314 offset past the start point of the final matching string.
315 If this value was just blithely removed from the min_offset it would
316 invalidate some of the calculations for how many chars must match
317 before or after (as they are derived from min_offset and minlen and
318 the length of the string being searched for).
319 When the final pattern is compiled and the data is moved from the
320 scan_data_t structure into the regexp structure the information
321 about lookbehind is factored in, with the information that would
322 have been lost precalculated in the end_shift field for the
325 The fields pos_min and pos_delta are used to store the minimum offset
326 and the delta to the maximum offset at the current point in the pattern.
330 typedef struct scan_data_t {
331 /*I32 len_min; unused */
332 /*I32 len_delta; unused */
336 I32 last_end; /* min value, <0 unless valid. */
339 SV **longest; /* Either &l_fixed, or &l_float. */
340 SV *longest_fixed; /* longest fixed string found in pattern */
341 I32 offset_fixed; /* offset where it starts */
342 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
343 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
344 SV *longest_float; /* longest floating string found in pattern */
345 I32 offset_float_min; /* earliest point in string it can appear */
346 I32 offset_float_max; /* latest point in string it can appear */
347 I32 *minlen_float; /* pointer to the minlen relevant to the string */
348 I32 lookbehind_float; /* is the position of the string modified by LB */
352 struct regnode_charclass_class *start_class;
356 * Forward declarations for pregcomp()'s friends.
359 static const scan_data_t zero_scan_data =
360 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
362 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
363 #define SF_BEFORE_SEOL 0x0001
364 #define SF_BEFORE_MEOL 0x0002
365 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
366 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
369 # define SF_FIX_SHIFT_EOL (0+2)
370 # define SF_FL_SHIFT_EOL (0+4)
372 # define SF_FIX_SHIFT_EOL (+2)
373 # define SF_FL_SHIFT_EOL (+4)
376 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
377 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
379 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
380 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
381 #define SF_IS_INF 0x0040
382 #define SF_HAS_PAR 0x0080
383 #define SF_IN_PAR 0x0100
384 #define SF_HAS_EVAL 0x0200
385 #define SCF_DO_SUBSTR 0x0400
386 #define SCF_DO_STCLASS_AND 0x0800
387 #define SCF_DO_STCLASS_OR 0x1000
388 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
389 #define SCF_WHILEM_VISITED_POS 0x2000
391 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
392 #define SCF_SEEN_ACCEPT 0x8000
394 #define UTF cBOOL(RExC_utf8)
396 /* The enums for all these are ordered so things work out correctly */
397 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
398 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
399 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
400 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
401 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
402 #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
403 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
405 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
407 #define OOB_UNICODE 12345678
408 #define OOB_NAMEDCLASS -1
410 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
411 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
414 /* length of regex to show in messages that don't mark a position within */
415 #define RegexLengthToShowInErrorMessages 127
418 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
419 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
420 * op/pragma/warn/regcomp.
422 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
423 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
425 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
428 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
429 * arg. Show regex, up to a maximum length. If it's too long, chop and add
432 #define _FAIL(code) STMT_START { \
433 const char *ellipses = ""; \
434 IV len = RExC_end - RExC_precomp; \
437 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
438 if (len > RegexLengthToShowInErrorMessages) { \
439 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
440 len = RegexLengthToShowInErrorMessages - 10; \
446 #define FAIL(msg) _FAIL( \
447 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
448 msg, (int)len, RExC_precomp, ellipses))
450 #define FAIL2(msg,arg) _FAIL( \
451 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
452 arg, (int)len, RExC_precomp, ellipses))
455 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
457 #define Simple_vFAIL(m) STMT_START { \
458 const IV offset = RExC_parse - RExC_precomp; \
459 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
460 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
464 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
466 #define vFAIL(m) STMT_START { \
468 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
473 * Like Simple_vFAIL(), but accepts two arguments.
475 #define Simple_vFAIL2(m,a1) STMT_START { \
476 const IV offset = RExC_parse - RExC_precomp; \
477 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
478 (int)offset, RExC_precomp, RExC_precomp + offset); \
482 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
484 #define vFAIL2(m,a1) STMT_START { \
486 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
487 Simple_vFAIL2(m, a1); \
492 * Like Simple_vFAIL(), but accepts three arguments.
494 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
495 const IV offset = RExC_parse - RExC_precomp; \
496 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
497 (int)offset, RExC_precomp, RExC_precomp + offset); \
501 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
503 #define vFAIL3(m,a1,a2) STMT_START { \
505 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
506 Simple_vFAIL3(m, a1, a2); \
510 * Like Simple_vFAIL(), but accepts four arguments.
512 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
513 const IV offset = RExC_parse - RExC_precomp; \
514 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
515 (int)offset, RExC_precomp, RExC_precomp + offset); \
518 #define ckWARNreg(loc,m) STMT_START { \
519 const IV offset = loc - RExC_precomp; \
520 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
521 (int)offset, RExC_precomp, RExC_precomp + offset); \
524 #define ckWARNregdep(loc,m) STMT_START { \
525 const IV offset = loc - RExC_precomp; \
526 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
528 (int)offset, RExC_precomp, RExC_precomp + offset); \
531 #define ckWARN2regdep(loc,m, a1) STMT_START { \
532 const IV offset = loc - RExC_precomp; \
533 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
535 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
538 #define ckWARN2reg(loc, m, a1) STMT_START { \
539 const IV offset = loc - RExC_precomp; \
540 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
541 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
544 #define vWARN3(loc, m, a1, a2) STMT_START { \
545 const IV offset = loc - RExC_precomp; \
546 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
547 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
550 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
551 const IV offset = loc - RExC_precomp; \
552 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
553 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
556 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
557 const IV offset = loc - RExC_precomp; \
558 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
559 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
562 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
563 const IV offset = loc - RExC_precomp; \
564 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
565 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
568 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
569 const IV offset = loc - RExC_precomp; \
570 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
571 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
575 /* Allow for side effects in s */
576 #define REGC(c,s) STMT_START { \
577 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
580 /* Macros for recording node offsets. 20001227 mjd@plover.com
581 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
582 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
583 * Element 0 holds the number n.
584 * Position is 1 indexed.
586 #ifndef RE_TRACK_PATTERN_OFFSETS
587 #define Set_Node_Offset_To_R(node,byte)
588 #define Set_Node_Offset(node,byte)
589 #define Set_Cur_Node_Offset
590 #define Set_Node_Length_To_R(node,len)
591 #define Set_Node_Length(node,len)
592 #define Set_Node_Cur_Length(node)
593 #define Node_Offset(n)
594 #define Node_Length(n)
595 #define Set_Node_Offset_Length(node,offset,len)
596 #define ProgLen(ri) ri->u.proglen
597 #define SetProgLen(ri,x) ri->u.proglen = x
599 #define ProgLen(ri) ri->u.offsets[0]
600 #define SetProgLen(ri,x) ri->u.offsets[0] = x
601 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
603 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
604 __LINE__, (int)(node), (int)(byte))); \
606 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
608 RExC_offsets[2*(node)-1] = (byte); \
613 #define Set_Node_Offset(node,byte) \
614 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
615 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
617 #define Set_Node_Length_To_R(node,len) STMT_START { \
619 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
620 __LINE__, (int)(node), (int)(len))); \
622 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
624 RExC_offsets[2*(node)] = (len); \
629 #define Set_Node_Length(node,len) \
630 Set_Node_Length_To_R((node)-RExC_emit_start, len)
631 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
632 #define Set_Node_Cur_Length(node) \
633 Set_Node_Length(node, RExC_parse - parse_start)
635 /* Get offsets and lengths */
636 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
637 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
639 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
640 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
641 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
645 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
646 #define EXPERIMENTAL_INPLACESCAN
647 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
649 #define DEBUG_STUDYDATA(str,data,depth) \
650 DEBUG_OPTIMISE_MORE_r(if(data){ \
651 PerlIO_printf(Perl_debug_log, \
652 "%*s" str "Pos:%"IVdf"/%"IVdf \
653 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
654 (int)(depth)*2, "", \
655 (IV)((data)->pos_min), \
656 (IV)((data)->pos_delta), \
657 (UV)((data)->flags), \
658 (IV)((data)->whilem_c), \
659 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
660 is_inf ? "INF " : "" \
662 if ((data)->last_found) \
663 PerlIO_printf(Perl_debug_log, \
664 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
665 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
666 SvPVX_const((data)->last_found), \
667 (IV)((data)->last_end), \
668 (IV)((data)->last_start_min), \
669 (IV)((data)->last_start_max), \
670 ((data)->longest && \
671 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
672 SvPVX_const((data)->longest_fixed), \
673 (IV)((data)->offset_fixed), \
674 ((data)->longest && \
675 (data)->longest==&((data)->longest_float)) ? "*" : "", \
676 SvPVX_const((data)->longest_float), \
677 (IV)((data)->offset_float_min), \
678 (IV)((data)->offset_float_max) \
680 PerlIO_printf(Perl_debug_log,"\n"); \
683 static void clear_re(pTHX_ void *r);
685 /* Mark that we cannot extend a found fixed substring at this point.
686 Update the longest found anchored substring and the longest found
687 floating substrings if needed. */
690 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
692 const STRLEN l = CHR_SVLEN(data->last_found);
693 const STRLEN old_l = CHR_SVLEN(*data->longest);
694 GET_RE_DEBUG_FLAGS_DECL;
696 PERL_ARGS_ASSERT_SCAN_COMMIT;
698 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
699 SvSetMagicSV(*data->longest, data->last_found);
700 if (*data->longest == data->longest_fixed) {
701 data->offset_fixed = l ? data->last_start_min : data->pos_min;
702 if (data->flags & SF_BEFORE_EOL)
704 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
706 data->flags &= ~SF_FIX_BEFORE_EOL;
707 data->minlen_fixed=minlenp;
708 data->lookbehind_fixed=0;
710 else { /* *data->longest == data->longest_float */
711 data->offset_float_min = l ? data->last_start_min : data->pos_min;
712 data->offset_float_max = (l
713 ? data->last_start_max
714 : data->pos_min + data->pos_delta);
715 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
716 data->offset_float_max = I32_MAX;
717 if (data->flags & SF_BEFORE_EOL)
719 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
721 data->flags &= ~SF_FL_BEFORE_EOL;
722 data->minlen_float=minlenp;
723 data->lookbehind_float=0;
726 SvCUR_set(data->last_found, 0);
728 SV * const sv = data->last_found;
729 if (SvUTF8(sv) && SvMAGICAL(sv)) {
730 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
736 data->flags &= ~SF_BEFORE_EOL;
737 DEBUG_STUDYDATA("commit: ",data,0);
740 /* Can match anything (initialization) */
742 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
744 PERL_ARGS_ASSERT_CL_ANYTHING;
746 ANYOF_BITMAP_SETALL(cl);
747 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
748 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
750 /* If any portion of the regex is to operate under locale rules,
751 * initialization includes it. The reason this isn't done for all regexes
752 * is that the optimizer was written under the assumption that locale was
753 * all-or-nothing. Given the complexity and lack of documentation in the
754 * optimizer, and that there are inadequate test cases for locale, so many
755 * parts of it may not work properly, it is safest to avoid locale unless
757 if (RExC_contains_locale) {
758 ANYOF_CLASS_SETALL(cl); /* /l uses class */
759 cl->flags |= ANYOF_LOCALE;
762 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
766 /* Can match anything (initialization) */
768 S_cl_is_anything(const struct regnode_charclass_class *cl)
772 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
774 for (value = 0; value <= ANYOF_MAX; value += 2)
775 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
777 if (!(cl->flags & ANYOF_UNICODE_ALL))
779 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
784 /* Can match anything (initialization) */
786 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
788 PERL_ARGS_ASSERT_CL_INIT;
790 Zero(cl, 1, struct regnode_charclass_class);
792 cl_anything(pRExC_state, cl);
793 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
796 /* These two functions currently do the exact same thing */
797 #define cl_init_zero S_cl_init
799 /* 'AND' a given class with another one. Can create false positives. 'cl'
800 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
801 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
803 S_cl_and(struct regnode_charclass_class *cl,
804 const struct regnode_charclass_class *and_with)
806 PERL_ARGS_ASSERT_CL_AND;
808 assert(and_with->type == ANYOF);
810 /* I (khw) am not sure all these restrictions are necessary XXX */
811 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
812 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
813 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
814 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
815 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
818 if (and_with->flags & ANYOF_INVERT)
819 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
820 cl->bitmap[i] &= ~and_with->bitmap[i];
822 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
823 cl->bitmap[i] &= and_with->bitmap[i];
824 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
826 if (and_with->flags & ANYOF_INVERT) {
828 /* Here, the and'ed node is inverted. Get the AND of the flags that
829 * aren't affected by the inversion. Those that are affected are
830 * handled individually below */
831 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
832 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
833 cl->flags |= affected_flags;
835 /* We currently don't know how to deal with things that aren't in the
836 * bitmap, but we know that the intersection is no greater than what
837 * is already in cl, so let there be false positives that get sorted
838 * out after the synthetic start class succeeds, and the node is
839 * matched for real. */
841 /* The inversion of these two flags indicate that the resulting
842 * intersection doesn't have them */
843 if (and_with->flags & ANYOF_UNICODE_ALL) {
844 cl->flags &= ~ANYOF_UNICODE_ALL;
846 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
847 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
850 else { /* and'd node is not inverted */
851 U8 outside_bitmap_but_not_utf8; /* Temp variable */
853 if (! ANYOF_NONBITMAP(and_with)) {
855 /* Here 'and_with' doesn't match anything outside the bitmap
856 * (except possibly ANYOF_UNICODE_ALL), which means the
857 * intersection can't either, except for ANYOF_UNICODE_ALL, in
858 * which case we don't know what the intersection is, but it's no
859 * greater than what cl already has, so can just leave it alone,
860 * with possible false positives */
861 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
862 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
863 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
866 else if (! ANYOF_NONBITMAP(cl)) {
868 /* Here, 'and_with' does match something outside the bitmap, and cl
869 * doesn't have a list of things to match outside the bitmap. If
870 * cl can match all code points above 255, the intersection will
871 * be those above-255 code points that 'and_with' matches. If cl
872 * can't match all Unicode code points, it means that it can't
873 * match anything outside the bitmap (since the 'if' that got us
874 * into this block tested for that), so we leave the bitmap empty.
876 if (cl->flags & ANYOF_UNICODE_ALL) {
877 ARG_SET(cl, ARG(and_with));
879 /* and_with's ARG may match things that don't require UTF8.
880 * And now cl's will too, in spite of this being an 'and'. See
881 * the comments below about the kludge */
882 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
886 /* Here, both 'and_with' and cl match something outside the
887 * bitmap. Currently we do not do the intersection, so just match
888 * whatever cl had at the beginning. */
892 /* Take the intersection of the two sets of flags. However, the
893 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
894 * kludge around the fact that this flag is not treated like the others
895 * which are initialized in cl_anything(). The way the optimizer works
896 * is that the synthetic start class (SSC) is initialized to match
897 * anything, and then the first time a real node is encountered, its
898 * values are AND'd with the SSC's with the result being the values of
899 * the real node. However, there are paths through the optimizer where
900 * the AND never gets called, so those initialized bits are set
901 * inappropriately, which is not usually a big deal, as they just cause
902 * false positives in the SSC, which will just mean a probably
903 * imperceptible slow down in execution. However this bit has a
904 * higher false positive consequence in that it can cause utf8.pm,
905 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
906 * bigger slowdown and also causes significant extra memory to be used.
907 * In order to prevent this, the code now takes a different tack. The
908 * bit isn't set unless some part of the regular expression needs it,
909 * but once set it won't get cleared. This means that these extra
910 * modules won't get loaded unless there was some path through the
911 * pattern that would have required them anyway, and so any false
912 * positives that occur by not ANDing them out when they could be
913 * aren't as severe as they would be if we treated this bit like all
915 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
916 & ANYOF_NONBITMAP_NON_UTF8;
917 cl->flags &= and_with->flags;
918 cl->flags |= outside_bitmap_but_not_utf8;
922 /* 'OR' a given class with another one. Can create false positives. 'cl'
923 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
924 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
926 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
928 PERL_ARGS_ASSERT_CL_OR;
930 if (or_with->flags & ANYOF_INVERT) {
932 /* Here, the or'd node is to be inverted. This means we take the
933 * complement of everything not in the bitmap, but currently we don't
934 * know what that is, so give up and match anything */
935 if (ANYOF_NONBITMAP(or_with)) {
936 cl_anything(pRExC_state, cl);
939 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
940 * <= (B1 | !B2) | (CL1 | !CL2)
941 * which is wasteful if CL2 is small, but we ignore CL2:
942 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
943 * XXXX Can we handle case-fold? Unclear:
944 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
945 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
947 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
948 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
949 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
952 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
953 cl->bitmap[i] |= ~or_with->bitmap[i];
954 } /* XXXX: logic is complicated otherwise */
956 cl_anything(pRExC_state, cl);
959 /* And, we can just take the union of the flags that aren't affected
960 * by the inversion */
961 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
963 /* For the remaining flags:
964 ANYOF_UNICODE_ALL and inverted means to not match anything above
965 255, which means that the union with cl should just be
966 what cl has in it, so can ignore this flag
967 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
968 is 127-255 to match them, but then invert that, so the
969 union with cl should just be what cl has in it, so can
972 } else { /* 'or_with' is not inverted */
973 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
974 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
975 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
976 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
979 /* OR char bitmap and class bitmap separately */
980 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
981 cl->bitmap[i] |= or_with->bitmap[i];
982 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
983 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
984 cl->classflags[i] |= or_with->classflags[i];
985 cl->flags |= ANYOF_CLASS;
988 else { /* XXXX: logic is complicated, leave it along for a moment. */
989 cl_anything(pRExC_state, cl);
992 if (ANYOF_NONBITMAP(or_with)) {
994 /* Use the added node's outside-the-bit-map match if there isn't a
995 * conflict. If there is a conflict (both nodes match something
996 * outside the bitmap, but what they match outside is not the same
997 * pointer, and hence not easily compared until XXX we extend
998 * inversion lists this far), give up and allow the start class to
999 * match everything outside the bitmap. If that stuff is all above
1000 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1001 if (! ANYOF_NONBITMAP(cl)) {
1002 ARG_SET(cl, ARG(or_with));
1004 else if (ARG(cl) != ARG(or_with)) {
1006 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1007 cl_anything(pRExC_state, cl);
1010 cl->flags |= ANYOF_UNICODE_ALL;
1015 /* Take the union */
1016 cl->flags |= or_with->flags;
1020 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1021 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1022 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1023 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1028 dump_trie(trie,widecharmap,revcharmap)
1029 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1030 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1032 These routines dump out a trie in a somewhat readable format.
1033 The _interim_ variants are used for debugging the interim
1034 tables that are used to generate the final compressed
1035 representation which is what dump_trie expects.
1037 Part of the reason for their existence is to provide a form
1038 of documentation as to how the different representations function.
1043 Dumps the final compressed table form of the trie to Perl_debug_log.
1044 Used for debugging make_trie().
1048 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1049 AV *revcharmap, U32 depth)
1052 SV *sv=sv_newmortal();
1053 int colwidth= widecharmap ? 6 : 4;
1055 GET_RE_DEBUG_FLAGS_DECL;
1057 PERL_ARGS_ASSERT_DUMP_TRIE;
1059 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1060 (int)depth * 2 + 2,"",
1061 "Match","Base","Ofs" );
1063 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1064 SV ** const tmp = av_fetch( revcharmap, state, 0);
1066 PerlIO_printf( Perl_debug_log, "%*s",
1068 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1069 PL_colors[0], PL_colors[1],
1070 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1071 PERL_PV_ESCAPE_FIRSTCHAR
1076 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1077 (int)depth * 2 + 2,"");
1079 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1080 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1081 PerlIO_printf( Perl_debug_log, "\n");
1083 for( state = 1 ; state < trie->statecount ; state++ ) {
1084 const U32 base = trie->states[ state ].trans.base;
1086 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1088 if ( trie->states[ state ].wordnum ) {
1089 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1091 PerlIO_printf( Perl_debug_log, "%6s", "" );
1094 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1099 while( ( base + ofs < trie->uniquecharcount ) ||
1100 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1101 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1104 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1106 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1107 if ( ( base + ofs >= trie->uniquecharcount ) &&
1108 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1109 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1111 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1113 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1115 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1119 PerlIO_printf( Perl_debug_log, "]");
1122 PerlIO_printf( Perl_debug_log, "\n" );
1124 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1125 for (word=1; word <= trie->wordcount; word++) {
1126 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1127 (int)word, (int)(trie->wordinfo[word].prev),
1128 (int)(trie->wordinfo[word].len));
1130 PerlIO_printf(Perl_debug_log, "\n" );
1133 Dumps a fully constructed but uncompressed trie in list form.
1134 List tries normally only are used for construction when the number of
1135 possible chars (trie->uniquecharcount) is very high.
1136 Used for debugging make_trie().
1139 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1140 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1144 SV *sv=sv_newmortal();
1145 int colwidth= widecharmap ? 6 : 4;
1146 GET_RE_DEBUG_FLAGS_DECL;
1148 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1150 /* print out the table precompression. */
1151 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1152 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1153 "------:-----+-----------------\n" );
1155 for( state=1 ; state < next_alloc ; state ++ ) {
1158 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1159 (int)depth * 2 + 2,"", (UV)state );
1160 if ( ! trie->states[ state ].wordnum ) {
1161 PerlIO_printf( Perl_debug_log, "%5s| ","");
1163 PerlIO_printf( Perl_debug_log, "W%4x| ",
1164 trie->states[ state ].wordnum
1167 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1168 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1170 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1172 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1173 PL_colors[0], PL_colors[1],
1174 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1175 PERL_PV_ESCAPE_FIRSTCHAR
1177 TRIE_LIST_ITEM(state,charid).forid,
1178 (UV)TRIE_LIST_ITEM(state,charid).newstate
1181 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1182 (int)((depth * 2) + 14), "");
1185 PerlIO_printf( Perl_debug_log, "\n");
1190 Dumps a fully constructed but uncompressed trie in table form.
1191 This is the normal DFA style state transition table, with a few
1192 twists to facilitate compression later.
1193 Used for debugging make_trie().
1196 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1197 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1202 SV *sv=sv_newmortal();
1203 int colwidth= widecharmap ? 6 : 4;
1204 GET_RE_DEBUG_FLAGS_DECL;
1206 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1209 print out the table precompression so that we can do a visual check
1210 that they are identical.
1213 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1215 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1216 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1218 PerlIO_printf( Perl_debug_log, "%*s",
1220 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1221 PL_colors[0], PL_colors[1],
1222 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1223 PERL_PV_ESCAPE_FIRSTCHAR
1229 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1231 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1232 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1235 PerlIO_printf( Perl_debug_log, "\n" );
1237 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1239 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1240 (int)depth * 2 + 2,"",
1241 (UV)TRIE_NODENUM( state ) );
1243 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1244 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1246 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1248 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1250 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1251 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1253 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1254 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1262 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1263 startbranch: the first branch in the whole branch sequence
1264 first : start branch of sequence of branch-exact nodes.
1265 May be the same as startbranch
1266 last : Thing following the last branch.
1267 May be the same as tail.
1268 tail : item following the branch sequence
1269 count : words in the sequence
1270 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1271 depth : indent depth
1273 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1275 A trie is an N'ary tree where the branches are determined by digital
1276 decomposition of the key. IE, at the root node you look up the 1st character and
1277 follow that branch repeat until you find the end of the branches. Nodes can be
1278 marked as "accepting" meaning they represent a complete word. Eg:
1282 would convert into the following structure. Numbers represent states, letters
1283 following numbers represent valid transitions on the letter from that state, if
1284 the number is in square brackets it represents an accepting state, otherwise it
1285 will be in parenthesis.
1287 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1291 (1) +-i->(6)-+-s->[7]
1293 +-s->(3)-+-h->(4)-+-e->[5]
1295 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1297 This shows that when matching against the string 'hers' we will begin at state 1
1298 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1299 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1300 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1301 single traverse. We store a mapping from accepting to state to which word was
1302 matched, and then when we have multiple possibilities we try to complete the
1303 rest of the regex in the order in which they occured in the alternation.
1305 The only prior NFA like behaviour that would be changed by the TRIE support is
1306 the silent ignoring of duplicate alternations which are of the form:
1308 / (DUPE|DUPE) X? (?{ ... }) Y /x
1310 Thus EVAL blocks following a trie may be called a different number of times with
1311 and without the optimisation. With the optimisations dupes will be silently
1312 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1313 the following demonstrates:
1315 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1317 which prints out 'word' three times, but
1319 'words'=~/(word|word|word)(?{ print $1 })S/
1321 which doesnt print it out at all. This is due to other optimisations kicking in.
1323 Example of what happens on a structural level:
1325 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1327 1: CURLYM[1] {1,32767}(18)
1338 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1339 and should turn into:
1341 1: CURLYM[1] {1,32767}(18)
1343 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1351 Cases where tail != last would be like /(?foo|bar)baz/:
1361 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1362 and would end up looking like:
1365 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1372 d = uvuni_to_utf8_flags(d, uv, 0);
1374 is the recommended Unicode-aware way of saying
1379 #define TRIE_STORE_REVCHAR(val) \
1382 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1383 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1384 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1385 SvCUR_set(zlopp, kapow - flrbbbbb); \
1388 av_push(revcharmap, zlopp); \
1390 char ooooff = (char)val; \
1391 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1395 #define TRIE_READ_CHAR STMT_START { \
1398 /* if it is UTF then it is either already folded, or does not need folding */ \
1399 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1401 else if (folder == PL_fold_latin1) { \
1402 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1403 if ( foldlen > 0 ) { \
1404 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1410 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1411 skiplen = UNISKIP(uvc); \
1412 foldlen -= skiplen; \
1413 scan = foldbuf + skiplen; \
1416 /* raw data, will be folded later if needed */ \
1424 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1425 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1426 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1427 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1429 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1430 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1431 TRIE_LIST_CUR( state )++; \
1434 #define TRIE_LIST_NEW(state) STMT_START { \
1435 Newxz( trie->states[ state ].trans.list, \
1436 4, reg_trie_trans_le ); \
1437 TRIE_LIST_CUR( state ) = 1; \
1438 TRIE_LIST_LEN( state ) = 4; \
1441 #define TRIE_HANDLE_WORD(state) STMT_START { \
1442 U16 dupe= trie->states[ state ].wordnum; \
1443 regnode * const noper_next = regnext( noper ); \
1446 /* store the word for dumping */ \
1448 if (OP(noper) != NOTHING) \
1449 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1451 tmp = newSVpvn_utf8( "", 0, UTF ); \
1452 av_push( trie_words, tmp ); \
1456 trie->wordinfo[curword].prev = 0; \
1457 trie->wordinfo[curword].len = wordlen; \
1458 trie->wordinfo[curword].accept = state; \
1460 if ( noper_next < tail ) { \
1462 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1463 trie->jump[curword] = (U16)(noper_next - convert); \
1465 jumper = noper_next; \
1467 nextbranch= regnext(cur); \
1471 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1472 /* chain, so that when the bits of chain are later */\
1473 /* linked together, the dups appear in the chain */\
1474 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1475 trie->wordinfo[dupe].prev = curword; \
1477 /* we haven't inserted this word yet. */ \
1478 trie->states[ state ].wordnum = curword; \
1483 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1484 ( ( base + charid >= ucharcount \
1485 && base + charid < ubound \
1486 && state == trie->trans[ base - ucharcount + charid ].check \
1487 && trie->trans[ base - ucharcount + charid ].next ) \
1488 ? trie->trans[ base - ucharcount + charid ].next \
1489 : ( state==1 ? special : 0 ) \
1493 #define MADE_JUMP_TRIE 2
1494 #define MADE_EXACT_TRIE 4
1497 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1500 /* first pass, loop through and scan words */
1501 reg_trie_data *trie;
1502 HV *widecharmap = NULL;
1503 AV *revcharmap = newAV();
1505 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1510 regnode *jumper = NULL;
1511 regnode *nextbranch = NULL;
1512 regnode *convert = NULL;
1513 U32 *prev_states; /* temp array mapping each state to previous one */
1514 /* we just use folder as a flag in utf8 */
1515 const U8 * folder = NULL;
1518 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1519 AV *trie_words = NULL;
1520 /* along with revcharmap, this only used during construction but both are
1521 * useful during debugging so we store them in the struct when debugging.
1524 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1525 STRLEN trie_charcount=0;
1527 SV *re_trie_maxbuff;
1528 GET_RE_DEBUG_FLAGS_DECL;
1530 PERL_ARGS_ASSERT_MAKE_TRIE;
1532 PERL_UNUSED_ARG(depth);
1539 case EXACTFU_TRICKYFOLD:
1540 case EXACTFU: folder = PL_fold_latin1; break;
1541 case EXACTF: folder = PL_fold; break;
1542 case EXACTFL: folder = PL_fold_locale; break;
1543 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1546 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1548 trie->startstate = 1;
1549 trie->wordcount = word_count;
1550 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1551 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1553 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1554 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1555 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1558 trie_words = newAV();
1561 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1562 if (!SvIOK(re_trie_maxbuff)) {
1563 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1565 DEBUG_TRIE_COMPILE_r({
1566 PerlIO_printf( Perl_debug_log,
1567 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1568 (int)depth * 2 + 2, "",
1569 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1570 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1574 /* Find the node we are going to overwrite */
1575 if ( first == startbranch && OP( last ) != BRANCH ) {
1576 /* whole branch chain */
1579 /* branch sub-chain */
1580 convert = NEXTOPER( first );
1583 /* -- First loop and Setup --
1585 We first traverse the branches and scan each word to determine if it
1586 contains widechars, and how many unique chars there are, this is
1587 important as we have to build a table with at least as many columns as we
1590 We use an array of integers to represent the character codes 0..255
1591 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1592 native representation of the character value as the key and IV's for the
1595 *TODO* If we keep track of how many times each character is used we can
1596 remap the columns so that the table compression later on is more
1597 efficient in terms of memory by ensuring the most common value is in the
1598 middle and the least common are on the outside. IMO this would be better
1599 than a most to least common mapping as theres a decent chance the most
1600 common letter will share a node with the least common, meaning the node
1601 will not be compressible. With a middle is most common approach the worst
1602 case is when we have the least common nodes twice.
1606 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1607 regnode *noper = NEXTOPER( cur );
1608 const U8 *uc = (U8*)STRING( noper );
1609 const U8 *e = uc + STR_LEN( noper );
1611 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1613 const U8 *scan = (U8*)NULL;
1614 U32 wordlen = 0; /* required init */
1616 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1618 if (OP(noper) == NOTHING) {
1619 regnode *noper_next= regnext(noper);
1620 if (noper_next != tail && OP(noper_next) == flags) {
1622 uc= (U8*)STRING(noper);
1623 e= uc + STR_LEN(noper);
1624 trie->minlen= STR_LEN(noper);
1631 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1632 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1633 regardless of encoding */
1634 if (OP( noper ) == EXACTFU_SS) {
1635 /* false positives are ok, so just set this */
1636 TRIE_BITMAP_SET(trie,0xDF);
1639 for ( ; uc < e ; uc += len ) {
1640 TRIE_CHARCOUNT(trie)++;
1645 U8 folded= folder[ (U8) uvc ];
1646 if ( !trie->charmap[ folded ] ) {
1647 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1648 TRIE_STORE_REVCHAR( folded );
1651 if ( !trie->charmap[ uvc ] ) {
1652 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1653 TRIE_STORE_REVCHAR( uvc );
1656 /* store the codepoint in the bitmap, and its folded
1658 TRIE_BITMAP_SET(trie, uvc);
1660 /* store the folded codepoint */
1661 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1664 /* store first byte of utf8 representation of
1665 variant codepoints */
1666 if (! UNI_IS_INVARIANT(uvc)) {
1667 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1670 set_bit = 0; /* We've done our bit :-) */
1675 widecharmap = newHV();
1677 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1680 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1682 if ( !SvTRUE( *svpp ) ) {
1683 sv_setiv( *svpp, ++trie->uniquecharcount );
1684 TRIE_STORE_REVCHAR(uvc);
1688 if( cur == first ) {
1689 trie->minlen = chars;
1690 trie->maxlen = chars;
1691 } else if (chars < trie->minlen) {
1692 trie->minlen = chars;
1693 } else if (chars > trie->maxlen) {
1694 trie->maxlen = chars;
1696 if (OP( noper ) == EXACTFU_SS) {
1697 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1698 if (trie->minlen > 1)
1701 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1702 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1703 * - We assume that any such sequence might match a 2 byte string */
1704 if (trie->minlen > 2 )
1708 } /* end first pass */
1709 DEBUG_TRIE_COMPILE_r(
1710 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1711 (int)depth * 2 + 2,"",
1712 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1713 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1714 (int)trie->minlen, (int)trie->maxlen )
1718 We now know what we are dealing with in terms of unique chars and
1719 string sizes so we can calculate how much memory a naive
1720 representation using a flat table will take. If it's over a reasonable
1721 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1722 conservative but potentially much slower representation using an array
1725 At the end we convert both representations into the same compressed
1726 form that will be used in regexec.c for matching with. The latter
1727 is a form that cannot be used to construct with but has memory
1728 properties similar to the list form and access properties similar
1729 to the table form making it both suitable for fast searches and
1730 small enough that its feasable to store for the duration of a program.
1732 See the comment in the code where the compressed table is produced
1733 inplace from the flat tabe representation for an explanation of how
1734 the compression works.
1739 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1742 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1744 Second Pass -- Array Of Lists Representation
1746 Each state will be represented by a list of charid:state records
1747 (reg_trie_trans_le) the first such element holds the CUR and LEN
1748 points of the allocated array. (See defines above).
1750 We build the initial structure using the lists, and then convert
1751 it into the compressed table form which allows faster lookups
1752 (but cant be modified once converted).
1755 STRLEN transcount = 1;
1757 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1758 "%*sCompiling trie using list compiler\n",
1759 (int)depth * 2 + 2, ""));
1761 trie->states = (reg_trie_state *)
1762 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1763 sizeof(reg_trie_state) );
1767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1769 regnode *noper = NEXTOPER( cur );
1770 U8 *uc = (U8*)STRING( noper );
1771 const U8 *e = uc + STR_LEN( noper );
1772 U32 state = 1; /* required init */
1773 U16 charid = 0; /* sanity init */
1774 U8 *scan = (U8*)NULL; /* sanity init */
1775 STRLEN foldlen = 0; /* required init */
1776 U32 wordlen = 0; /* required init */
1777 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1780 if (OP(noper) == NOTHING) {
1781 regnode *noper_next= regnext(noper);
1782 if (noper_next != tail && OP(noper_next) == flags) {
1784 uc= (U8*)STRING(noper);
1785 e= uc + STR_LEN(noper);
1789 if (OP(noper) != NOTHING) {
1790 for ( ; uc < e ; uc += len ) {
1795 charid = trie->charmap[ uvc ];
1797 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1801 charid=(U16)SvIV( *svpp );
1804 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1811 if ( !trie->states[ state ].trans.list ) {
1812 TRIE_LIST_NEW( state );
1814 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1815 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1816 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1821 newstate = next_alloc++;
1822 prev_states[newstate] = state;
1823 TRIE_LIST_PUSH( state, charid, newstate );
1828 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1832 TRIE_HANDLE_WORD(state);
1834 } /* end second pass */
1836 /* next alloc is the NEXT state to be allocated */
1837 trie->statecount = next_alloc;
1838 trie->states = (reg_trie_state *)
1839 PerlMemShared_realloc( trie->states,
1841 * sizeof(reg_trie_state) );
1843 /* and now dump it out before we compress it */
1844 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1845 revcharmap, next_alloc,
1849 trie->trans = (reg_trie_trans *)
1850 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1857 for( state=1 ; state < next_alloc ; state ++ ) {
1861 DEBUG_TRIE_COMPILE_MORE_r(
1862 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1866 if (trie->states[state].trans.list) {
1867 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1871 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1872 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1873 if ( forid < minid ) {
1875 } else if ( forid > maxid ) {
1879 if ( transcount < tp + maxid - minid + 1) {
1881 trie->trans = (reg_trie_trans *)
1882 PerlMemShared_realloc( trie->trans,
1884 * sizeof(reg_trie_trans) );
1885 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1887 base = trie->uniquecharcount + tp - minid;
1888 if ( maxid == minid ) {
1890 for ( ; zp < tp ; zp++ ) {
1891 if ( ! trie->trans[ zp ].next ) {
1892 base = trie->uniquecharcount + zp - minid;
1893 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1894 trie->trans[ zp ].check = state;
1900 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1901 trie->trans[ tp ].check = state;
1906 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1907 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1908 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1909 trie->trans[ tid ].check = state;
1911 tp += ( maxid - minid + 1 );
1913 Safefree(trie->states[ state ].trans.list);
1916 DEBUG_TRIE_COMPILE_MORE_r(
1917 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1920 trie->states[ state ].trans.base=base;
1922 trie->lasttrans = tp + 1;
1926 Second Pass -- Flat Table Representation.
1928 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1929 We know that we will need Charcount+1 trans at most to store the data
1930 (one row per char at worst case) So we preallocate both structures
1931 assuming worst case.
1933 We then construct the trie using only the .next slots of the entry
1936 We use the .check field of the first entry of the node temporarily to
1937 make compression both faster and easier by keeping track of how many non
1938 zero fields are in the node.
1940 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1943 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1944 number representing the first entry of the node, and state as a
1945 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1946 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1947 are 2 entrys per node. eg:
1955 The table is internally in the right hand, idx form. However as we also
1956 have to deal with the states array which is indexed by nodenum we have to
1957 use TRIE_NODENUM() to convert.
1960 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1961 "%*sCompiling trie using table compiler\n",
1962 (int)depth * 2 + 2, ""));
1964 trie->trans = (reg_trie_trans *)
1965 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1966 * trie->uniquecharcount + 1,
1967 sizeof(reg_trie_trans) );
1968 trie->states = (reg_trie_state *)
1969 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1970 sizeof(reg_trie_state) );
1971 next_alloc = trie->uniquecharcount + 1;
1974 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1976 regnode *noper = NEXTOPER( cur );
1977 const U8 *uc = (U8*)STRING( noper );
1978 const U8 *e = uc + STR_LEN( noper );
1980 U32 state = 1; /* required init */
1982 U16 charid = 0; /* sanity init */
1983 U32 accept_state = 0; /* sanity init */
1984 U8 *scan = (U8*)NULL; /* sanity init */
1986 STRLEN foldlen = 0; /* required init */
1987 U32 wordlen = 0; /* required init */
1989 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1991 if (OP(noper) == NOTHING) {
1992 regnode *noper_next= regnext(noper);
1993 if (noper_next != tail && OP(noper_next) == flags) {
1995 uc= (U8*)STRING(noper);
1996 e= uc + STR_LEN(noper);
2000 if ( OP(noper) != NOTHING ) {
2001 for ( ; uc < e ; uc += len ) {
2006 charid = trie->charmap[ uvc ];
2008 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2009 charid = svpp ? (U16)SvIV(*svpp) : 0;
2013 if ( !trie->trans[ state + charid ].next ) {
2014 trie->trans[ state + charid ].next = next_alloc;
2015 trie->trans[ state ].check++;
2016 prev_states[TRIE_NODENUM(next_alloc)]
2017 = TRIE_NODENUM(state);
2018 next_alloc += trie->uniquecharcount;
2020 state = trie->trans[ state + charid ].next;
2022 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2024 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2027 accept_state = TRIE_NODENUM( state );
2028 TRIE_HANDLE_WORD(accept_state);
2030 } /* end second pass */
2032 /* and now dump it out before we compress it */
2033 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2035 next_alloc, depth+1));
2039 * Inplace compress the table.*
2041 For sparse data sets the table constructed by the trie algorithm will
2042 be mostly 0/FAIL transitions or to put it another way mostly empty.
2043 (Note that leaf nodes will not contain any transitions.)
2045 This algorithm compresses the tables by eliminating most such
2046 transitions, at the cost of a modest bit of extra work during lookup:
2048 - Each states[] entry contains a .base field which indicates the
2049 index in the state[] array wheres its transition data is stored.
2051 - If .base is 0 there are no valid transitions from that node.
2053 - If .base is nonzero then charid is added to it to find an entry in
2056 -If trans[states[state].base+charid].check!=state then the
2057 transition is taken to be a 0/Fail transition. Thus if there are fail
2058 transitions at the front of the node then the .base offset will point
2059 somewhere inside the previous nodes data (or maybe even into a node
2060 even earlier), but the .check field determines if the transition is
2064 The following process inplace converts the table to the compressed
2065 table: We first do not compress the root node 1,and mark all its
2066 .check pointers as 1 and set its .base pointer as 1 as well. This
2067 allows us to do a DFA construction from the compressed table later,
2068 and ensures that any .base pointers we calculate later are greater
2071 - We set 'pos' to indicate the first entry of the second node.
2073 - We then iterate over the columns of the node, finding the first and
2074 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2075 and set the .check pointers accordingly, and advance pos
2076 appropriately and repreat for the next node. Note that when we copy
2077 the next pointers we have to convert them from the original
2078 NODEIDX form to NODENUM form as the former is not valid post
2081 - If a node has no transitions used we mark its base as 0 and do not
2082 advance the pos pointer.
2084 - If a node only has one transition we use a second pointer into the
2085 structure to fill in allocated fail transitions from other states.
2086 This pointer is independent of the main pointer and scans forward
2087 looking for null transitions that are allocated to a state. When it
2088 finds one it writes the single transition into the "hole". If the
2089 pointer doesnt find one the single transition is appended as normal.
2091 - Once compressed we can Renew/realloc the structures to release the
2094 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2095 specifically Fig 3.47 and the associated pseudocode.
2099 const U32 laststate = TRIE_NODENUM( next_alloc );
2102 trie->statecount = laststate;
2104 for ( state = 1 ; state < laststate ; state++ ) {
2106 const U32 stateidx = TRIE_NODEIDX( state );
2107 const U32 o_used = trie->trans[ stateidx ].check;
2108 U32 used = trie->trans[ stateidx ].check;
2109 trie->trans[ stateidx ].check = 0;
2111 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2112 if ( flag || trie->trans[ stateidx + charid ].next ) {
2113 if ( trie->trans[ stateidx + charid ].next ) {
2115 for ( ; zp < pos ; zp++ ) {
2116 if ( ! trie->trans[ zp ].next ) {
2120 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2121 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2122 trie->trans[ zp ].check = state;
2123 if ( ++zp > pos ) pos = zp;
2130 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2132 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2133 trie->trans[ pos ].check = state;
2138 trie->lasttrans = pos + 1;
2139 trie->states = (reg_trie_state *)
2140 PerlMemShared_realloc( trie->states, laststate
2141 * sizeof(reg_trie_state) );
2142 DEBUG_TRIE_COMPILE_MORE_r(
2143 PerlIO_printf( Perl_debug_log,
2144 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2145 (int)depth * 2 + 2,"",
2146 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2149 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2152 } /* end table compress */
2154 DEBUG_TRIE_COMPILE_MORE_r(
2155 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2156 (int)depth * 2 + 2, "",
2157 (UV)trie->statecount,
2158 (UV)trie->lasttrans)
2160 /* resize the trans array to remove unused space */
2161 trie->trans = (reg_trie_trans *)
2162 PerlMemShared_realloc( trie->trans, trie->lasttrans
2163 * sizeof(reg_trie_trans) );
2165 { /* Modify the program and insert the new TRIE node */
2166 U8 nodetype =(U8)(flags & 0xFF);
2170 regnode *optimize = NULL;
2171 #ifdef RE_TRACK_PATTERN_OFFSETS
2174 U32 mjd_nodelen = 0;
2175 #endif /* RE_TRACK_PATTERN_OFFSETS */
2176 #endif /* DEBUGGING */
2178 This means we convert either the first branch or the first Exact,
2179 depending on whether the thing following (in 'last') is a branch
2180 or not and whther first is the startbranch (ie is it a sub part of
2181 the alternation or is it the whole thing.)
2182 Assuming its a sub part we convert the EXACT otherwise we convert
2183 the whole branch sequence, including the first.
2185 /* Find the node we are going to overwrite */
2186 if ( first != startbranch || OP( last ) == BRANCH ) {
2187 /* branch sub-chain */
2188 NEXT_OFF( first ) = (U16)(last - first);
2189 #ifdef RE_TRACK_PATTERN_OFFSETS
2191 mjd_offset= Node_Offset((convert));
2192 mjd_nodelen= Node_Length((convert));
2195 /* whole branch chain */
2197 #ifdef RE_TRACK_PATTERN_OFFSETS
2200 const regnode *nop = NEXTOPER( convert );
2201 mjd_offset= Node_Offset((nop));
2202 mjd_nodelen= Node_Length((nop));
2206 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2207 (int)depth * 2 + 2, "",
2208 (UV)mjd_offset, (UV)mjd_nodelen)
2211 /* But first we check to see if there is a common prefix we can
2212 split out as an EXACT and put in front of the TRIE node. */
2213 trie->startstate= 1;
2214 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2216 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2220 const U32 base = trie->states[ state ].trans.base;
2222 if ( trie->states[state].wordnum )
2225 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2226 if ( ( base + ofs >= trie->uniquecharcount ) &&
2227 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2228 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2230 if ( ++count > 1 ) {
2231 SV **tmp = av_fetch( revcharmap, ofs, 0);
2232 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2233 if ( state == 1 ) break;
2235 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2237 PerlIO_printf(Perl_debug_log,
2238 "%*sNew Start State=%"UVuf" Class: [",
2239 (int)depth * 2 + 2, "",
2242 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2243 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2245 TRIE_BITMAP_SET(trie,*ch);
2247 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2249 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2253 TRIE_BITMAP_SET(trie,*ch);
2255 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2256 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2262 SV **tmp = av_fetch( revcharmap, idx, 0);
2264 char *ch = SvPV( *tmp, len );
2266 SV *sv=sv_newmortal();
2267 PerlIO_printf( Perl_debug_log,
2268 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2269 (int)depth * 2 + 2, "",
2271 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2272 PL_colors[0], PL_colors[1],
2273 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2274 PERL_PV_ESCAPE_FIRSTCHAR
2279 OP( convert ) = nodetype;
2280 str=STRING(convert);
2283 STR_LEN(convert) += len;
2289 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2294 trie->prefixlen = (state-1);
2296 regnode *n = convert+NODE_SZ_STR(convert);
2297 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2298 trie->startstate = state;
2299 trie->minlen -= (state - 1);
2300 trie->maxlen -= (state - 1);
2302 /* At least the UNICOS C compiler choked on this
2303 * being argument to DEBUG_r(), so let's just have
2306 #ifdef PERL_EXT_RE_BUILD
2312 regnode *fix = convert;
2313 U32 word = trie->wordcount;
2315 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2316 while( ++fix < n ) {
2317 Set_Node_Offset_Length(fix, 0, 0);
2320 SV ** const tmp = av_fetch( trie_words, word, 0 );
2322 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2323 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2325 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2333 NEXT_OFF(convert) = (U16)(tail - convert);
2334 DEBUG_r(optimize= n);
2340 if ( trie->maxlen ) {
2341 NEXT_OFF( convert ) = (U16)(tail - convert);
2342 ARG_SET( convert, data_slot );
2343 /* Store the offset to the first unabsorbed branch in
2344 jump[0], which is otherwise unused by the jump logic.
2345 We use this when dumping a trie and during optimisation. */
2347 trie->jump[0] = (U16)(nextbranch - convert);
2349 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2350 * and there is a bitmap
2351 * and the first "jump target" node we found leaves enough room
2352 * then convert the TRIE node into a TRIEC node, with the bitmap
2353 * embedded inline in the opcode - this is hypothetically faster.
2355 if ( !trie->states[trie->startstate].wordnum
2357 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2359 OP( convert ) = TRIEC;
2360 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2361 PerlMemShared_free(trie->bitmap);
2364 OP( convert ) = TRIE;
2366 /* store the type in the flags */
2367 convert->flags = nodetype;
2371 + regarglen[ OP( convert ) ];
2373 /* XXX We really should free up the resource in trie now,
2374 as we won't use them - (which resources?) dmq */
2376 /* needed for dumping*/
2377 DEBUG_r(if (optimize) {
2378 regnode *opt = convert;
2380 while ( ++opt < optimize) {
2381 Set_Node_Offset_Length(opt,0,0);
2384 Try to clean up some of the debris left after the
2387 while( optimize < jumper ) {
2388 mjd_nodelen += Node_Length((optimize));
2389 OP( optimize ) = OPTIMIZED;
2390 Set_Node_Offset_Length(optimize,0,0);
2393 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2395 } /* end node insert */
2397 /* Finish populating the prev field of the wordinfo array. Walk back
2398 * from each accept state until we find another accept state, and if
2399 * so, point the first word's .prev field at the second word. If the
2400 * second already has a .prev field set, stop now. This will be the
2401 * case either if we've already processed that word's accept state,
2402 * or that state had multiple words, and the overspill words were
2403 * already linked up earlier.
2410 for (word=1; word <= trie->wordcount; word++) {
2412 if (trie->wordinfo[word].prev)
2414 state = trie->wordinfo[word].accept;
2416 state = prev_states[state];
2419 prev = trie->states[state].wordnum;
2423 trie->wordinfo[word].prev = prev;
2425 Safefree(prev_states);
2429 /* and now dump out the compressed format */
2430 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2432 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2434 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2435 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2437 SvREFCNT_dec(revcharmap);
2441 : trie->startstate>1
2447 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2449 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2451 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2452 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2455 We find the fail state for each state in the trie, this state is the longest proper
2456 suffix of the current state's 'word' that is also a proper prefix of another word in our
2457 trie. State 1 represents the word '' and is thus the default fail state. This allows
2458 the DFA not to have to restart after its tried and failed a word at a given point, it
2459 simply continues as though it had been matching the other word in the first place.
2461 'abcdgu'=~/abcdefg|cdgu/
2462 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2463 fail, which would bring us to the state representing 'd' in the second word where we would
2464 try 'g' and succeed, proceeding to match 'cdgu'.
2466 /* add a fail transition */
2467 const U32 trie_offset = ARG(source);
2468 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2470 const U32 ucharcount = trie->uniquecharcount;
2471 const U32 numstates = trie->statecount;
2472 const U32 ubound = trie->lasttrans + ucharcount;
2476 U32 base = trie->states[ 1 ].trans.base;
2479 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2480 GET_RE_DEBUG_FLAGS_DECL;
2482 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2484 PERL_UNUSED_ARG(depth);
2488 ARG_SET( stclass, data_slot );
2489 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2490 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2491 aho->trie=trie_offset;
2492 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2493 Copy( trie->states, aho->states, numstates, reg_trie_state );
2494 Newxz( q, numstates, U32);
2495 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2498 /* initialize fail[0..1] to be 1 so that we always have
2499 a valid final fail state */
2500 fail[ 0 ] = fail[ 1 ] = 1;
2502 for ( charid = 0; charid < ucharcount ; charid++ ) {
2503 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2505 q[ q_write ] = newstate;
2506 /* set to point at the root */
2507 fail[ q[ q_write++ ] ]=1;
2510 while ( q_read < q_write) {
2511 const U32 cur = q[ q_read++ % numstates ];
2512 base = trie->states[ cur ].trans.base;
2514 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2515 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2517 U32 fail_state = cur;
2520 fail_state = fail[ fail_state ];
2521 fail_base = aho->states[ fail_state ].trans.base;
2522 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2524 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2525 fail[ ch_state ] = fail_state;
2526 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2528 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2530 q[ q_write++ % numstates] = ch_state;
2534 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2535 when we fail in state 1, this allows us to use the
2536 charclass scan to find a valid start char. This is based on the principle
2537 that theres a good chance the string being searched contains lots of stuff
2538 that cant be a start char.
2540 fail[ 0 ] = fail[ 1 ] = 0;
2541 DEBUG_TRIE_COMPILE_r({
2542 PerlIO_printf(Perl_debug_log,
2543 "%*sStclass Failtable (%"UVuf" states): 0",
2544 (int)(depth * 2), "", (UV)numstates
2546 for( q_read=1; q_read<numstates; q_read++ ) {
2547 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2549 PerlIO_printf(Perl_debug_log, "\n");
2552 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2557 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2558 * These need to be revisited when a newer toolchain becomes available.
2560 #if defined(__sparc64__) && defined(__GNUC__)
2561 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2562 # undef SPARC64_GCC_WORKAROUND
2563 # define SPARC64_GCC_WORKAROUND 1
2567 #define DEBUG_PEEP(str,scan,depth) \
2568 DEBUG_OPTIMISE_r({if (scan){ \
2569 SV * const mysv=sv_newmortal(); \
2570 regnode *Next = regnext(scan); \
2571 regprop(RExC_rx, mysv, scan); \
2572 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2573 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2574 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2578 /* The below joins as many adjacent EXACTish nodes as possible into a single
2579 * one, and looks for problematic sequences of characters whose folds vs.
2580 * non-folds have sufficiently different lengths, that the optimizer would be
2581 * fooled into rejecting legitimate matches of them, and the trie construction
2582 * code can't cope with them. The joining is only done if:
2583 * 1) there is room in the current conglomerated node to entirely contain the
2585 * 2) they are the exact same node type
2587 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2588 * these get optimized out
2590 * If there are problematic code sequences, *min_subtract is set to the delta
2591 * that the minimum size of the node can be less than its actual size. And,
2592 * the node type of the result is changed to reflect that it contains these
2595 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2596 * and contains LATIN SMALL LETTER SHARP S
2598 * This is as good a place as any to discuss the design of handling these
2599 * problematic sequences. It's been wrong in Perl for a very long time. There
2600 * are three code points in Unicode whose folded lengths differ so much from
2601 * the un-folded lengths that it causes problems for the optimizer and trie
2602 * construction. Why only these are problematic, and not others where lengths
2603 * also differ is something I (khw) do not understand. New versions of Unicode
2604 * might add more such code points. Hopefully the logic in fold_grind.t that
2605 * figures out what to test (in part by verifying that each size-combination
2606 * gets tested) will catch any that do come along, so they can be added to the
2607 * special handling below. The chances of new ones are actually rather small,
2608 * as most, if not all, of the world's scripts that have casefolding have
2609 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2610 * made to allow compatibility with pre-existing standards, and almost all of
2611 * those have already been dealt with. These would otherwise be the most
2612 * likely candidates for generating further tricky sequences. In other words,
2613 * Unicode by itself is unlikely to add new ones unless it is for compatibility
2614 * with pre-existing standards, and there aren't many of those left.
2616 * The previous designs for dealing with these involved assigning a special
2617 * node for them. This approach doesn't work, as evidenced by this example:
2618 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2619 * Both these fold to "sss", but if the pattern is parsed to create a node of
2620 * that would match just the \xDF, it won't be able to handle the case where a
2621 * successful match would have to cross the node's boundary. The new approach
2622 * that hopefully generally solves the problem generates an EXACTFU_SS node
2625 * There are a number of components to the approach (a lot of work for just
2626 * three code points!):
2627 * 1) This routine examines each EXACTFish node that could contain the
2628 * problematic sequences. It returns in *min_subtract how much to
2629 * subtract from the the actual length of the string to get a real minimum
2630 * for one that could match it. This number is usually 0 except for the
2631 * problematic sequences. This delta is used by the caller to adjust the
2632 * min length of the match, and the delta between min and max, so that the
2633 * optimizer doesn't reject these possibilities based on size constraints.
2634 * 2) These sequences are not currently correctly handled by the trie code
2635 * either, so it changes the joined node type to ops that are not handled
2636 * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
2637 * 3) This is sufficient for the two Greek sequences (described below), but
2638 * the one involving the Sharp s (\xDF) needs more. The node type
2639 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2640 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2641 * case where there is a possible fold length change. That means that a
2642 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2643 * itself with length changes, and so can be processed faster. regexec.c
2644 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2645 * is pre-folded by regcomp.c. This saves effort in regex matching.
2646 * However, probably mostly for historical reasons, the pre-folding isn't
2647 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2648 * nodes, as what they fold to isn't known until runtime.) The fold
2649 * possibilities for the non-UTF8 patterns are quite simple, except for
2650 * the sharp s. All the ones that don't involve a UTF-8 target string
2651 * are members of a fold-pair, and arrays are set up for all of them
2652 * that quickly find the other member of the pair. It might actually
2653 * be faster to pre-fold these, but it isn't currently done, except for
2654 * the sharp s. Code elsewhere in this file makes sure that it gets
2655 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2656 * issues described in the next item.
2657 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2658 * 'ss' or not is not knowable at compile time. It will match iff the
2659 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2660 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2661 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2662 * described in item 3). An assumption that the optimizer part of
2663 * regexec.c (probably unwittingly) makes is that a character in the
2664 * pattern corresponds to at most a single character in the target string.
2665 * (And I do mean character, and not byte here, unlike other parts of the
2666 * documentation that have never been updated to account for multibyte
2667 * Unicode.) This assumption is wrong only in this case, as all other
2668 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2669 * virtue of having this file pre-fold UTF-8 patterns. I'm
2670 * reluctant to try to change this assumption, so instead the code punts.
2671 * This routine examines EXACTF nodes for the sharp s, and returns a
2672 * boolean indicating whether or not the node is an EXACTF node that
2673 * contains a sharp s. When it is true, the caller sets a flag that later
2674 * causes the optimizer in this file to not set values for the floating
2675 * and fixed string lengths, and thus avoids the optimizer code in
2676 * regexec.c that makes the invalid assumption. Thus, there is no
2677 * optimization based on string lengths for EXACTF nodes that contain the
2678 * sharp s. This only happens for /id rules (which means the pattern
2682 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2683 if (PL_regkind[OP(scan)] == EXACT) \
2684 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2687 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2688 /* Merge several consecutive EXACTish nodes into one. */
2689 regnode *n = regnext(scan);
2691 regnode *next = scan + NODE_SZ_STR(scan);
2695 regnode *stop = scan;
2696 GET_RE_DEBUG_FLAGS_DECL;
2698 PERL_UNUSED_ARG(depth);
2701 PERL_ARGS_ASSERT_JOIN_EXACT;
2702 #ifndef EXPERIMENTAL_INPLACESCAN
2703 PERL_UNUSED_ARG(flags);
2704 PERL_UNUSED_ARG(val);
2706 DEBUG_PEEP("join",scan,depth);
2708 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2709 * EXACT ones that are mergeable to the current one. */
2711 && (PL_regkind[OP(n)] == NOTHING
2712 || (stringok && OP(n) == OP(scan)))
2714 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2717 if (OP(n) == TAIL || n > next)
2719 if (PL_regkind[OP(n)] == NOTHING) {
2720 DEBUG_PEEP("skip:",n,depth);
2721 NEXT_OFF(scan) += NEXT_OFF(n);
2722 next = n + NODE_STEP_REGNODE;
2729 else if (stringok) {
2730 const unsigned int oldl = STR_LEN(scan);
2731 regnode * const nnext = regnext(n);
2733 if (oldl + STR_LEN(n) > U8_MAX)
2736 DEBUG_PEEP("merg",n,depth);
2739 NEXT_OFF(scan) += NEXT_OFF(n);
2740 STR_LEN(scan) += STR_LEN(n);
2741 next = n + NODE_SZ_STR(n);
2742 /* Now we can overwrite *n : */
2743 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2751 #ifdef EXPERIMENTAL_INPLACESCAN
2752 if (flags && !NEXT_OFF(n)) {
2753 DEBUG_PEEP("atch", val, depth);
2754 if (reg_off_by_arg[OP(n)]) {
2755 ARG_SET(n, val - n);
2758 NEXT_OFF(n) = val - n;
2766 *has_exactf_sharp_s = FALSE;
2768 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2769 * can now analyze for sequences of problematic code points. (Prior to
2770 * this final joining, sequences could have been split over boundaries, and
2771 * hence missed). The sequences only happen in folding, hence for any
2772 * non-EXACT EXACTish node */
2773 if (OP(scan) != EXACT) {
2775 U8 * s0 = (U8*) STRING(scan);
2776 U8 * const s_end = s0 + STR_LEN(scan);
2778 /* The below is perhaps overboard, but this allows us to save a test
2779 * each time through the loop at the expense of a mask. This is
2780 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2781 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2782 * This uses an exclusive 'or' to find that bit and then inverts it to
2783 * form a mask, with just a single 0, in the bit position where 'S' and
2785 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2786 const U8 s_masked = 's' & S_or_s_mask;
2788 /* One pass is made over the node's string looking for all the
2789 * possibilities. to avoid some tests in the loop, there are two main
2790 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2794 /* There are two problematic Greek code points in Unicode
2797 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2798 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2804 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2805 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2807 * This means that in case-insensitive matching (or "loose
2808 * matching", as Unicode calls it), an EXACTF of length six (the
2809 * UTF-8 encoded byte length of the above casefolded versions) can
2810 * match a target string of length two (the byte length of UTF-8
2811 * encoded U+0390 or U+03B0). This would rather mess up the
2812 * minimum length computation. (there are other code points that
2813 * also fold to these two sequences, but the delta is smaller)
2815 * If these sequences are found, the minimum length is decreased by
2816 * four (six minus two).
2818 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2819 * LETTER SHARP S. We decrease the min length by 1 for each
2820 * occurrence of 'ss' found */
2822 #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2823 # define U390_first_byte 0xb4
2824 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2825 # define U3B0_first_byte 0xb5
2826 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
2828 # define U390_first_byte 0xce
2829 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2830 # define U3B0_first_byte 0xcf
2831 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
2833 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2834 yields a net of 0 */
2835 /* Examine the string for one of the problematic sequences */
2837 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2838 * sequence we are looking for is 2 */
2842 /* Look for the first byte in each problematic sequence */
2844 /* We don't have to worry about other things that fold to
2845 * 's' (such as the long s, U+017F), as all above-latin1
2846 * code points have been pre-folded */
2850 /* Current character is an 's' or 'S'. If next one is
2851 * as well, we have the dreaded sequence */
2852 if (((*(s+1) & S_or_s_mask) == s_masked)
2853 /* These two node types don't have special handling
2855 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2858 OP(scan) = EXACTFU_SS;
2859 s++; /* No need to look at this character again */
2863 case U390_first_byte:
2864 if (s_end - s >= len
2866 /* The 1's are because are skipping comparing the
2868 && memEQ(s + 1, U390_tail, len - 1))
2870 goto greek_sequence;
2874 case U3B0_first_byte:
2875 if (! (s_end - s >= len
2876 && memEQ(s + 1, U3B0_tail, len - 1)))
2883 /* This can't currently be handled by trie's, so change
2884 * the node type to indicate this. If EXACTFA and
2885 * EXACTFL were ever to be handled by trie's, this
2886 * would have to be changed. If this node has already
2887 * been changed to EXACTFU_SS in this loop, leave it as
2888 * is. (I (khw) think it doesn't matter in regexec.c
2889 * for UTF patterns, but no need to change it */
2890 if (OP(scan) == EXACTFU) {
2891 OP(scan) = EXACTFU_TRICKYFOLD;
2893 s += 6; /* We already know what this sequence is. Skip
2899 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2901 /* Here, the pattern is not UTF-8. We need to look only for the
2902 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2903 * in the final position. Otherwise we can stop looking 1 byte
2904 * earlier because have to find both the first and second 's' */
2905 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2907 for (s = s0; s < upper; s++) {
2912 && ((*(s+1) & S_or_s_mask) == s_masked))
2916 /* EXACTF nodes need to know that the minimum
2917 * length changed so that a sharp s in the string
2918 * can match this ss in the pattern, but they
2919 * remain EXACTF nodes, as they are not trie'able,
2920 * so don't have to invent a new node type to
2921 * exclude them from the trie code */
2922 if (OP(scan) != EXACTF) {
2923 OP(scan) = EXACTFU_SS;
2928 case LATIN_SMALL_LETTER_SHARP_S:
2929 if (OP(scan) == EXACTF) {
2930 *has_exactf_sharp_s = TRUE;
2939 /* Allow dumping but overwriting the collection of skipped
2940 * ops and/or strings with fake optimized ops */
2941 n = scan + NODE_SZ_STR(scan);
2949 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2953 /* REx optimizer. Converts nodes into quicker variants "in place".
2954 Finds fixed substrings. */
2956 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2957 to the position after last scanned or to NULL. */
2959 #define INIT_AND_WITHP \
2960 assert(!and_withp); \
2961 Newx(and_withp,1,struct regnode_charclass_class); \
2962 SAVEFREEPV(and_withp)
2964 /* this is a chain of data about sub patterns we are processing that
2965 need to be handled separately/specially in study_chunk. Its so
2966 we can simulate recursion without losing state. */
2968 typedef struct scan_frame {
2969 regnode *last; /* last node to process in this frame */
2970 regnode *next; /* next node to process when last is reached */
2971 struct scan_frame *prev; /*previous frame*/
2972 I32 stop; /* what stopparen do we use */
2976 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2978 #define CASE_SYNST_FNC(nAmE) \
2980 if (flags & SCF_DO_STCLASS_AND) { \
2981 for (value = 0; value < 256; value++) \
2982 if (!is_ ## nAmE ## _cp(value)) \
2983 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2986 for (value = 0; value < 256; value++) \
2987 if (is_ ## nAmE ## _cp(value)) \
2988 ANYOF_BITMAP_SET(data->start_class, value); \
2992 if (flags & SCF_DO_STCLASS_AND) { \
2993 for (value = 0; value < 256; value++) \
2994 if (is_ ## nAmE ## _cp(value)) \
2995 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2998 for (value = 0; value < 256; value++) \
2999 if (!is_ ## nAmE ## _cp(value)) \
3000 ANYOF_BITMAP_SET(data->start_class, value); \
3007 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3008 I32 *minlenp, I32 *deltap,
3013 struct regnode_charclass_class *and_withp,
3014 U32 flags, U32 depth)
3015 /* scanp: Start here (read-write). */
3016 /* deltap: Write maxlen-minlen here. */
3017 /* last: Stop before this one. */
3018 /* data: string data about the pattern */
3019 /* stopparen: treat close N as END */
3020 /* recursed: which subroutines have we recursed into */
3021 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3024 I32 min = 0, pars = 0, code;
3025 regnode *scan = *scanp, *next;
3027 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3028 int is_inf_internal = 0; /* The studied chunk is infinite */
3029 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3030 scan_data_t data_fake;
3031 SV *re_trie_maxbuff = NULL;
3032 regnode *first_non_open = scan;
3033 I32 stopmin = I32_MAX;
3034 scan_frame *frame = NULL;
3035 GET_RE_DEBUG_FLAGS_DECL;
3037 PERL_ARGS_ASSERT_STUDY_CHUNK;
3040 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3044 while (first_non_open && OP(first_non_open) == OPEN)
3045 first_non_open=regnext(first_non_open);
3050 while ( scan && OP(scan) != END && scan < last ){
3051 UV min_subtract = 0; /* How much to subtract from the minimum node
3052 length to get a real minimum (because the
3053 folded version may be shorter) */
3054 bool has_exactf_sharp_s = FALSE;
3055 /* Peephole optimizer: */
3056 DEBUG_STUDYDATA("Peep:", data,depth);
3057 DEBUG_PEEP("Peep",scan,depth);
3059 /* Its not clear to khw or hv why this is done here, and not in the
3060 * clauses that deal with EXACT nodes. khw's guess is that it's
3061 * because of a previous design */
3062 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3064 /* Follow the next-chain of the current node and optimize
3065 away all the NOTHINGs from it. */
3066 if (OP(scan) != CURLYX) {
3067 const int max = (reg_off_by_arg[OP(scan)]
3069 /* I32 may be smaller than U16 on CRAYs! */
3070 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3071 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3075 /* Skip NOTHING and LONGJMP. */
3076 while ((n = regnext(n))
3077 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3078 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3079 && off + noff < max)
3081 if (reg_off_by_arg[OP(scan)])
3084 NEXT_OFF(scan) = off;
3089 /* The principal pseudo-switch. Cannot be a switch, since we
3090 look into several different things. */
3091 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3092 || OP(scan) == IFTHEN) {
3093 next = regnext(scan);
3095 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3097 if (OP(next) == code || code == IFTHEN) {
3098 /* NOTE - There is similar code to this block below for handling
3099 TRIE nodes on a re-study. If you change stuff here check there
3101 I32 max1 = 0, min1 = I32_MAX, num = 0;
3102 struct regnode_charclass_class accum;
3103 regnode * const startbranch=scan;
3105 if (flags & SCF_DO_SUBSTR)
3106 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3107 if (flags & SCF_DO_STCLASS)
3108 cl_init_zero(pRExC_state, &accum);
3110 while (OP(scan) == code) {
3111 I32 deltanext, minnext, f = 0, fake;
3112 struct regnode_charclass_class this_class;
3115 data_fake.flags = 0;
3117 data_fake.whilem_c = data->whilem_c;
3118 data_fake.last_closep = data->last_closep;
3121 data_fake.last_closep = &fake;
3123 data_fake.pos_delta = delta;
3124 next = regnext(scan);
3125 scan = NEXTOPER(scan);
3127 scan = NEXTOPER(scan);
3128 if (flags & SCF_DO_STCLASS) {
3129 cl_init(pRExC_state, &this_class);
3130 data_fake.start_class = &this_class;
3131 f = SCF_DO_STCLASS_AND;
3133 if (flags & SCF_WHILEM_VISITED_POS)
3134 f |= SCF_WHILEM_VISITED_POS;
3136 /* we suppose the run is continuous, last=next...*/
3137 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3139 stopparen, recursed, NULL, f,depth+1);
3142 if (max1 < minnext + deltanext)
3143 max1 = minnext + deltanext;
3144 if (deltanext == I32_MAX)
3145 is_inf = is_inf_internal = 1;
3147 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3149 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3150 if ( stopmin > minnext)
3151 stopmin = min + min1;
3152 flags &= ~SCF_DO_SUBSTR;
3154 data->flags |= SCF_SEEN_ACCEPT;
3157 if (data_fake.flags & SF_HAS_EVAL)
3158 data->flags |= SF_HAS_EVAL;
3159 data->whilem_c = data_fake.whilem_c;
3161 if (flags & SCF_DO_STCLASS)
3162 cl_or(pRExC_state, &accum, &this_class);
3164 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3166 if (flags & SCF_DO_SUBSTR) {
3167 data->pos_min += min1;
3168 data->pos_delta += max1 - min1;
3169 if (max1 != min1 || is_inf)
3170 data->longest = &(data->longest_float);
3173 delta += max1 - min1;
3174 if (flags & SCF_DO_STCLASS_OR) {
3175 cl_or(pRExC_state, data->start_class, &accum);
3177 cl_and(data->start_class, and_withp);
3178 flags &= ~SCF_DO_STCLASS;
3181 else if (flags & SCF_DO_STCLASS_AND) {
3183 cl_and(data->start_class, &accum);
3184 flags &= ~SCF_DO_STCLASS;
3187 /* Switch to OR mode: cache the old value of
3188 * data->start_class */
3190 StructCopy(data->start_class, and_withp,
3191 struct regnode_charclass_class);
3192 flags &= ~SCF_DO_STCLASS_AND;
3193 StructCopy(&accum, data->start_class,
3194 struct regnode_charclass_class);
3195 flags |= SCF_DO_STCLASS_OR;
3196 data->start_class->flags |= ANYOF_EOS;
3200 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3203 Assuming this was/is a branch we are dealing with: 'scan' now
3204 points at the item that follows the branch sequence, whatever
3205 it is. We now start at the beginning of the sequence and look
3212 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3214 If we can find such a subsequence we need to turn the first
3215 element into a trie and then add the subsequent branch exact
3216 strings to the trie.
3220 1. patterns where the whole set of branches can be converted.
3222 2. patterns where only a subset can be converted.
3224 In case 1 we can replace the whole set with a single regop
3225 for the trie. In case 2 we need to keep the start and end
3228 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3229 becomes BRANCH TRIE; BRANCH X;
3231 There is an additional case, that being where there is a
3232 common prefix, which gets split out into an EXACT like node
3233 preceding the TRIE node.
3235 If x(1..n)==tail then we can do a simple trie, if not we make
3236 a "jump" trie, such that when we match the appropriate word
3237 we "jump" to the appropriate tail node. Essentially we turn
3238 a nested if into a case structure of sorts.
3243 if (!re_trie_maxbuff) {
3244 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3245 if (!SvIOK(re_trie_maxbuff))
3246 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3248 if ( SvIV(re_trie_maxbuff)>=0 ) {
3250 regnode *first = (regnode *)NULL;
3251 regnode *last = (regnode *)NULL;
3252 regnode *tail = scan;
3257 SV * const mysv = sv_newmortal(); /* for dumping */
3259 /* var tail is used because there may be a TAIL
3260 regop in the way. Ie, the exacts will point to the
3261 thing following the TAIL, but the last branch will
3262 point at the TAIL. So we advance tail. If we
3263 have nested (?:) we may have to move through several
3267 while ( OP( tail ) == TAIL ) {
3268 /* this is the TAIL generated by (?:) */
3269 tail = regnext( tail );
3273 DEBUG_TRIE_COMPILE_r({
3274 regprop(RExC_rx, mysv, tail );
3275 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3276 (int)depth * 2 + 2, "",
3277 "Looking for TRIE'able sequences. Tail node is: ",
3278 SvPV_nolen_const( mysv )
3284 Step through the branches
3285 cur represents each branch,
3286 noper is the first thing to be matched as part of that branch
3287 noper_next is the regnext() of that node.
3289 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3290 via a "jump trie" but we also support building with NOJUMPTRIE,
3291 which restricts the trie logic to structures like /FOO|BAR/.
3293 If noper is a trieable nodetype then the branch is a possible optimization
3294 target. If we are building under NOJUMPTRIE then we require that noper_next
3295 is the same as scan (our current position in the regex program).
3297 Once we have two or more consecutive such branches we can create a
3298 trie of the EXACT's contents and stitch it in place into the program.
3300 If the sequence represents all of the branches in the alternation we
3301 replace the entire thing with a single TRIE node.
3303 Otherwise when it is a subsequence we need to stitch it in place and
3304 replace only the relevant branches. This means the first branch has
3305 to remain as it is used by the alternation logic, and its next pointer,
3306 and needs to be repointed at the item on the branch chain following
3307 the last branch we have optimized away.
3309 This could be either a BRANCH, in which case the subsequence is internal,
3310 or it could be the item following the branch sequence in which case the
3311 subsequence is at the end (which does not necessarily mean the first node
3312 is the start of the alternation).
3314 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3317 ----------------+-----------
3321 EXACTFU_SS | EXACTFU
3322 EXACTFU_TRICKYFOLD | EXACTFU
3327 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3328 ( EXACT == (X) ) ? EXACT : \
3329 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3332 /* dont use tail as the end marker for this traverse */
3333 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3334 regnode * const noper = NEXTOPER( cur );
3335 U8 noper_type = OP( noper );
3336 U8 noper_trietype = TRIE_TYPE( noper_type );
3337 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3338 regnode * const noper_next = regnext( noper );
3339 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3340 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3343 DEBUG_TRIE_COMPILE_r({
3344 regprop(RExC_rx, mysv, cur);
3345 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3346 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3348 regprop(RExC_rx, mysv, noper);
3349 PerlIO_printf( Perl_debug_log, " -> %s",
3350 SvPV_nolen_const(mysv));
3353 regprop(RExC_rx, mysv, noper_next );
3354 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3355 SvPV_nolen_const(mysv));
3357 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3358 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3359 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3363 /* Is noper a trieable nodetype that can be merged with the
3364 * current trie (if there is one)? */
3368 ( noper_trietype == NOTHING)
3369 || ( trietype == NOTHING )
3370 || ( trietype == noper_trietype )
3373 && noper_next == tail
3377 /* Handle mergable triable node
3378 * Either we are the first node in a new trieable sequence,
3379 * in which case we do some bookkeeping, otherwise we update
3380 * the end pointer. */
3383 trietype = noper_trietype;
3384 if ( noper_trietype == NOTHING ) {
3385 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3386 regnode * const noper_next = regnext( noper );
3387 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3388 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3391 if ( noper_next_trietype )
3392 trietype = noper_next_trietype;
3395 if ( trietype == NOTHING )
3396 trietype = noper_trietype;
3401 } /* end handle mergable triable node */
3403 /* handle unmergable node -
3404 * noper may either be a triable node which can not be tried
3405 * together with the current trie, or a non triable node */
3407 /* If last is set and trietype is not NOTHING then we have found
3408 * at least two triable branch sequences in a row of a similar
3409 * trietype so we can turn them into a trie. If/when we
3410 * allow NOTHING to start a trie sequence this condition will be
3411 * required, and it isn't expensive so we leave it in for now. */
3412 if ( trietype != NOTHING )
3413 make_trie( pRExC_state,
3414 startbranch, first, cur, tail, count,
3415 trietype, depth+1 );
3416 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3420 && noper_next == tail
3423 /* noper is triable, so we can start a new trie sequence */
3426 trietype = noper_trietype;
3428 /* if we already saw a first but the current node is not triable then we have
3429 * to reset the first information. */
3434 } /* end handle unmergable node */
3435 } /* loop over branches */
3436 DEBUG_TRIE_COMPILE_r({
3437 regprop(RExC_rx, mysv, cur);
3438 PerlIO_printf( Perl_debug_log,
3439 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3440 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3444 if ( trietype != NOTHING ) {
3445 /* the last branch of the sequence was part of a trie,
3446 * so we have to construct it here outside of the loop
3448 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3449 #ifdef TRIE_STUDY_OPT
3450 if ( ((made == MADE_EXACT_TRIE &&
3451 startbranch == first)
3452 || ( first_non_open == first )) &&
3454 flags |= SCF_TRIE_RESTUDY;
3455 if ( startbranch == first
3458 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3463 /* at this point we know whatever we have is a NOTHING sequence/branch
3464 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3466 if ( startbranch == first ) {
3468 /* the entire thing is a NOTHING sequence, something like this:
3469 * (?:|) So we can turn it into a plain NOTHING op. */
3470 DEBUG_TRIE_COMPILE_r({
3471 regprop(RExC_rx, mysv, cur);
3472 PerlIO_printf( Perl_debug_log,
3473 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3474 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3477 OP(startbranch)= NOTHING;
3478 NEXT_OFF(startbranch)= tail - startbranch;
3479 for ( opt= startbranch + 1; opt < tail ; opt++ )
3483 } /* end if ( last) */
3484 } /* TRIE_MAXBUF is non zero */
3489 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3490 scan = NEXTOPER(NEXTOPER(scan));
3491 } else /* single branch is optimized. */
3492 scan = NEXTOPER(scan);
3494 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3495 scan_frame *newframe = NULL;
3500 if (OP(scan) != SUSPEND) {
3501 /* set the pointer */
3502 if (OP(scan) == GOSUB) {
3504 RExC_recurse[ARG2L(scan)] = scan;
3505 start = RExC_open_parens[paren-1];
3506 end = RExC_close_parens[paren-1];
3509 start = RExC_rxi->program + 1;
3513 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3514 SAVEFREEPV(recursed);
3516 if (!PAREN_TEST(recursed,paren+1)) {
3517 PAREN_SET(recursed,paren+1);
3518 Newx(newframe,1,scan_frame);
3520 if (flags & SCF_DO_SUBSTR) {
3521 SCAN_COMMIT(pRExC_state,data,minlenp);
3522 data->longest = &(data->longest_float);
3524 is_inf = is_inf_internal = 1;
3525 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3526 cl_anything(pRExC_state, data->start_class);
3527 flags &= ~SCF_DO_STCLASS;
3530 Newx(newframe,1,scan_frame);
3533 end = regnext(scan);
3538 SAVEFREEPV(newframe);
3539 newframe->next = regnext(scan);
3540 newframe->last = last;
3541 newframe->stop = stopparen;
3542 newframe->prev = frame;
3552 else if (OP(scan) == EXACT) {
3553 I32 l = STR_LEN(scan);
3556 const U8 * const s = (U8*)STRING(scan);
3557 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3558 l = utf8_length(s, s + l);
3560 uc = *((U8*)STRING(scan));
3563 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3564 /* The code below prefers earlier match for fixed
3565 offset, later match for variable offset. */
3566 if (data->last_end == -1) { /* Update the start info. */
3567 data->last_start_min = data->pos_min;
3568 data->last_start_max = is_inf
3569 ? I32_MAX : data->pos_min + data->pos_delta;
3571 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3573 SvUTF8_on(data->last_found);
3575 SV * const sv = data->last_found;
3576 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3577 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3578 if (mg && mg->mg_len >= 0)
3579 mg->mg_len += utf8_length((U8*)STRING(scan),
3580 (U8*)STRING(scan)+STR_LEN(scan));
3582 data->last_end = data->pos_min + l;
3583 data->pos_min += l; /* As in the first entry. */
3584 data->flags &= ~SF_BEFORE_EOL;
3586 if (flags & SCF_DO_STCLASS_AND) {
3587 /* Check whether it is compatible with what we know already! */
3591 /* If compatible, we or it in below. It is compatible if is
3592 * in the bitmp and either 1) its bit or its fold is set, or 2)
3593 * it's for a locale. Even if there isn't unicode semantics
3594 * here, at runtime there may be because of matching against a
3595 * utf8 string, so accept a possible false positive for
3596 * latin1-range folds */
3598 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3599 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3600 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
3601 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3606 ANYOF_CLASS_ZERO(data->start_class);
3607 ANYOF_BITMAP_ZERO(data->start_class);
3609 ANYOF_BITMAP_SET(data->start_class, uc);
3610 else if (uc >= 0x100) {
3613 /* Some Unicode code points fold to the Latin1 range; as
3614 * XXX temporary code, instead of figuring out if this is
3615 * one, just assume it is and set all the start class bits
3616 * that could be some such above 255 code point's fold
3617 * which will generate fals positives. As the code
3618 * elsewhere that does compute the fold settles down, it
3619 * can be extracted out and re-used here */
3620 for (i = 0; i < 256; i++){
3621 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3622 ANYOF_BITMAP_SET(data->start_class, i);
3626 data->start_class->flags &= ~ANYOF_EOS;
3628 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3630 else if (flags & SCF_DO_STCLASS_OR) {
3631 /* false positive possible if the class is case-folded */
3633 ANYOF_BITMAP_SET(data->start_class, uc);
3635 data->start_class->flags |= ANYOF_UNICODE_ALL;
3636 data->start_class->flags &= ~ANYOF_EOS;
3637 cl_and(data->start_class, and_withp);
3639 flags &= ~SCF_DO_STCLASS;
3641 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3642 I32 l = STR_LEN(scan);
3643 UV uc = *((U8*)STRING(scan));
3645 /* Search for fixed substrings supports EXACT only. */
3646 if (flags & SCF_DO_SUBSTR) {
3648 SCAN_COMMIT(pRExC_state, data, minlenp);
3651 const U8 * const s = (U8 *)STRING(scan);
3652 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3653 l = utf8_length(s, s + l);
3655 else if (has_exactf_sharp_s) {
3656 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3658 min += l - min_subtract;
3662 delta += min_subtract;
3663 if (flags & SCF_DO_SUBSTR) {
3664 data->pos_min += l - min_subtract;
3665 if (data->pos_min < 0) {
3668 data->pos_delta += min_subtract;
3670 data->longest = &(data->longest_float);
3673 if (flags & SCF_DO_STCLASS_AND) {
3674 /* Check whether it is compatible with what we know already! */
3677 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3678 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3679 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3683 ANYOF_CLASS_ZERO(data->start_class);
3684 ANYOF_BITMAP_ZERO(data->start_class);
3686 ANYOF_BITMAP_SET(data->start_class, uc);
3687 data->start_class->flags &= ~ANYOF_EOS;
3688 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
3689 if (OP(scan) == EXACTFL) {
3690 /* XXX This set is probably no longer necessary, and
3691 * probably wrong as LOCALE now is on in the initial
3693 data->start_class->flags |= ANYOF_LOCALE;
3697 /* Also set the other member of the fold pair. In case
3698 * that unicode semantics is called for at runtime, use
3699 * the full latin1 fold. (Can't do this for locale,
3700 * because not known until runtime) */
3701 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3703 /* All other (EXACTFL handled above) folds except under
3704 * /iaa that include s, S, and sharp_s also may include
3706 if (OP(scan) != EXACTFA) {
3707 if (uc == 's' || uc == 'S') {
3708 ANYOF_BITMAP_SET(data->start_class,
3709 LATIN_SMALL_LETTER_SHARP_S);
3711 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3712 ANYOF_BITMAP_SET(data->start_class, 's');
3713 ANYOF_BITMAP_SET(data->start_class, 'S');
3718 else if (uc >= 0x100) {
3720 for (i = 0; i < 256; i++){
3721 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3722 ANYOF_BITMAP_SET(data->start_class, i);
3727 else if (flags & SCF_DO_STCLASS_OR) {
3728 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
3729 /* false positive possible if the class is case-folded.
3730 Assume that the locale settings are the same... */
3732 ANYOF_BITMAP_SET(data->start_class, uc);
3733 if (OP(scan) != EXACTFL) {
3735 /* And set the other member of the fold pair, but
3736 * can't do that in locale because not known until
3738 ANYOF_BITMAP_SET(data->start_class,
3739 PL_fold_latin1[uc]);
3741 /* All folds except under /iaa that include s, S,
3742 * and sharp_s also may include the others */
3743 if (OP(scan) != EXACTFA) {
3744 if (uc == 's' || uc == 'S') {
3745 ANYOF_BITMAP_SET(data->start_class,
3746 LATIN_SMALL_LETTER_SHARP_S);
3748 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3749 ANYOF_BITMAP_SET(data->start_class, 's');
3750 ANYOF_BITMAP_SET(data->start_class, 'S');
3755 data->start_class->flags &= ~ANYOF_EOS;
3757 cl_and(data->start_class, and_withp);
3759 flags &= ~SCF_DO_STCLASS;
3761 else if (REGNODE_VARIES(OP(scan))) {
3762 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3763 I32 f = flags, pos_before = 0;
3764 regnode * const oscan = scan;
3765 struct regnode_charclass_class this_class;
3766 struct regnode_charclass_class *oclass = NULL;
3767 I32 next_is_eval = 0;
3769 switch (PL_regkind[OP(scan)]) {
3770 case WHILEM: /* End of (?:...)* . */
3771 scan = NEXTOPER(scan);
3774 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3775 next = NEXTOPER(scan);
3776 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3778 maxcount = REG_INFTY;
3779 next = regnext(scan);
3780 scan = NEXTOPER(scan);
3784 if (flags & SCF_DO_SUBSTR)
3789 if (flags & SCF_DO_STCLASS) {
3791 maxcount = REG_INFTY;
3792 next = regnext(scan);
3793 scan = NEXTOPER(scan);
3796 is_inf = is_inf_internal = 1;
3797 scan = regnext(scan);
3798 if (flags & SCF_DO_SUBSTR) {
3799 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3800 data->longest = &(data->longest_float);
3802 goto optimize_curly_tail;
3804 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3805 && (scan->flags == stopparen))
3810 mincount = ARG1(scan);
3811 maxcount = ARG2(scan);
3813 next = regnext(scan);
3814 if (OP(scan) == CURLYX) {
3815 I32 lp = (data ? *(data->last_closep) : 0);
3816 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3818 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3819 next_is_eval = (OP(scan) == EVAL);
3821 if (flags & SCF_DO_SUBSTR) {
3822 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3823 pos_before = data->pos_min;
3827 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3829 data->flags |= SF_IS_INF;
3831 if (flags & SCF_DO_STCLASS) {
3832 cl_init(pRExC_state, &this_class);
3833 oclass = data->start_class;
3834 data->start_class = &this_class;
3835 f |= SCF_DO_STCLASS_AND;
3836 f &= ~SCF_DO_STCLASS_OR;
3838 /* Exclude from super-linear cache processing any {n,m}
3839 regops for which the combination of input pos and regex
3840 pos is not enough information to determine if a match
3843 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3844 regex pos at the \s*, the prospects for a match depend not
3845 only on the input position but also on how many (bar\s*)
3846 repeats into the {4,8} we are. */
3847 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3848 f &= ~SCF_WHILEM_VISITED_POS;
3850 /* This will finish on WHILEM, setting scan, or on NULL: */
3851 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3852 last, data, stopparen, recursed, NULL,
3854 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3856 if (flags & SCF_DO_STCLASS)
3857 data->start_class = oclass;
3858 if (mincount == 0 || minnext == 0) {
3859 if (flags & SCF_DO_STCLASS_OR) {
3860 cl_or(pRExC_state, data->start_class, &this_class);
3862 else if (flags & SCF_DO_STCLASS_AND) {
3863 /* Switch to OR mode: cache the old value of
3864 * data->start_class */
3866 StructCopy(data->start_class, and_withp,
3867 struct regnode_charclass_class);
3868 flags &= ~SCF_DO_STCLASS_AND;
3869 StructCopy(&this_class, data->start_class,
3870 struct regnode_charclass_class);
3871 flags |= SCF_DO_STCLASS_OR;
3872 data->start_class->flags |= ANYOF_EOS;
3874 } else { /* Non-zero len */
3875 if (flags & SCF_DO_STCLASS_OR) {
3876 cl_or(pRExC_state, data->start_class, &this_class);
3877 cl_and(data->start_class, and_withp);
3879 else if (flags & SCF_DO_STCLASS_AND)
3880 cl_and(data->start_class, &this_class);
3881 flags &= ~SCF_DO_STCLASS;
3883 if (!scan) /* It was not CURLYX, but CURLY. */
3885 if ( /* ? quantifier ok, except for (?{ ... }) */
3886 (next_is_eval || !(mincount == 0 && maxcount == 1))
3887 && (minnext == 0) && (deltanext == 0)
3888 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3889 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3891 ckWARNreg(RExC_parse,
3892 "Quantifier unexpected on zero-length expression");
3895 min += minnext * mincount;
3896 is_inf_internal |= ((maxcount == REG_INFTY
3897 && (minnext + deltanext) > 0)
3898 || deltanext == I32_MAX);
3899 is_inf |= is_inf_internal;
3900 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3902 /* Try powerful optimization CURLYX => CURLYN. */
3903 if ( OP(oscan) == CURLYX && data
3904 && data->flags & SF_IN_PAR
3905 && !(data->flags & SF_HAS_EVAL)
3906 && !deltanext && minnext == 1 ) {
3907 /* Try to optimize to CURLYN. */
3908 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3909 regnode * const nxt1 = nxt;
3916 if (!REGNODE_SIMPLE(OP(nxt))
3917 && !(PL_regkind[OP(nxt)] == EXACT
3918 && STR_LEN(nxt) == 1))
3924 if (OP(nxt) != CLOSE)
3926 if (RExC_open_parens) {
3927 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3928 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3930 /* Now we know that nxt2 is the only contents: */
3931 oscan->flags = (U8)ARG(nxt);
3933 OP(nxt1) = NOTHING; /* was OPEN. */
3936 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3937 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3938 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3939 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3940 OP(nxt + 1) = OPTIMIZED; /* was count. */
3941 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3946 /* Try optimization CURLYX => CURLYM. */
3947 if ( OP(oscan) == CURLYX && data
3948 && !(data->flags & SF_HAS_PAR)
3949 && !(data->flags & SF_HAS_EVAL)
3950 && !deltanext /* atom is fixed width */
3951 && minnext != 0 /* CURLYM can't handle zero width */
3953 /* XXXX How to optimize if data == 0? */
3954 /* Optimize to a simpler form. */
3955 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3959 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3960 && (OP(nxt2) != WHILEM))
3962 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3963 /* Need to optimize away parenths. */
3964 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3965 /* Set the parenth number. */
3966 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3968 oscan->flags = (U8)ARG(nxt);
3969 if (RExC_open_parens) {
3970 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3971 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3973 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3974 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3977 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3978 OP(nxt + 1) = OPTIMIZED; /* was count. */
3979 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3980 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3983 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3984 regnode *nnxt = regnext(nxt1);
3986 if (reg_off_by_arg[OP(nxt1)])
3987 ARG_SET(nxt1, nxt2 - nxt1);
3988 else if (nxt2 - nxt1 < U16_MAX)
3989 NEXT_OFF(nxt1) = nxt2 - nxt1;
3991 OP(nxt) = NOTHING; /* Cannot beautify */
3996 /* Optimize again: */
3997 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3998 NULL, stopparen, recursed, NULL, 0,depth+1);
4003 else if ((OP(oscan) == CURLYX)
4004 && (flags & SCF_WHILEM_VISITED_POS)
4005 /* See the comment on a similar expression above.
4006 However, this time it's not a subexpression
4007 we care about, but the expression itself. */
4008 && (maxcount == REG_INFTY)
4009 && data && ++data->whilem_c < 16) {
4010 /* This stays as CURLYX, we can put the count/of pair. */
4011 /* Find WHILEM (as in regexec.c) */
4012 regnode *nxt = oscan + NEXT_OFF(oscan);
4014 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4016 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4017 | (RExC_whilem_seen << 4)); /* On WHILEM */
4019 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4021 if (flags & SCF_DO_SUBSTR) {
4022 SV *last_str = NULL;
4023 int counted = mincount != 0;
4025 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4026 #if defined(SPARC64_GCC_WORKAROUND)
4029 const char *s = NULL;
4032 if (pos_before >= data->last_start_min)
4035 b = data->last_start_min;
4038 s = SvPV_const(data->last_found, l);
4039 old = b - data->last_start_min;
4042 I32 b = pos_before >= data->last_start_min
4043 ? pos_before : data->last_start_min;
4045 const char * const s = SvPV_const(data->last_found, l);
4046 I32 old = b - data->last_start_min;
4050 old = utf8_hop((U8*)s, old) - (U8*)s;
4052 /* Get the added string: */
4053 last_str = newSVpvn_utf8(s + old, l, UTF);
4054 if (deltanext == 0 && pos_before == b) {
4055 /* What was added is a constant string */
4057 SvGROW(last_str, (mincount * l) + 1);
4058 repeatcpy(SvPVX(last_str) + l,
4059 SvPVX_const(last_str), l, mincount - 1);
4060 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4061 /* Add additional parts. */
4062 SvCUR_set(data->last_found,
4063 SvCUR(data->last_found) - l);
4064 sv_catsv(data->last_found, last_str);
4066 SV * sv = data->last_found;
4068 SvUTF8(sv) && SvMAGICAL(sv) ?
4069 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4070 if (mg && mg->mg_len >= 0)
4071 mg->mg_len += CHR_SVLEN(last_str) - l;
4073 data->last_end += l * (mincount - 1);
4076 /* start offset must point into the last copy */
4077 data->last_start_min += minnext * (mincount - 1);
4078 data->last_start_max += is_inf ? I32_MAX
4079 : (maxcount - 1) * (minnext + data->pos_delta);
4082 /* It is counted once already... */
4083 data->pos_min += minnext * (mincount - counted);
4084 data->pos_delta += - counted * deltanext +
4085 (minnext + deltanext) * maxcount - minnext * mincount;
4086 if (mincount != maxcount) {
4087 /* Cannot extend fixed substrings found inside
4089 SCAN_COMMIT(pRExC_state,data,minlenp);
4090 if (mincount && last_str) {
4091 SV * const sv = data->last_found;
4092 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4093 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4097 sv_setsv(sv, last_str);
4098 data->last_end = data->pos_min;
4099 data->last_start_min =
4100 data->pos_min - CHR_SVLEN(last_str);
4101 data->last_start_max = is_inf
4103 : data->pos_min + data->pos_delta
4104 - CHR_SVLEN(last_str);
4106 data->longest = &(data->longest_float);
4108 SvREFCNT_dec(last_str);
4110 if (data && (fl & SF_HAS_EVAL))
4111 data->flags |= SF_HAS_EVAL;
4112 optimize_curly_tail:
4113 if (OP(oscan) != CURLYX) {
4114 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4116 NEXT_OFF(oscan) += NEXT_OFF(next);
4119 default: /* REF, ANYOFV, and CLUMP only? */
4120 if (flags & SCF_DO_SUBSTR) {
4121 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4122 data->longest = &(data->longest_float);
4124 is_inf = is_inf_internal = 1;
4125 if (flags & SCF_DO_STCLASS_OR)
4126 cl_anything(pRExC_state, data->start_class);
4127 flags &= ~SCF_DO_STCLASS;
4131 else if (OP(scan) == LNBREAK) {
4132 if (flags & SCF_DO_STCLASS) {
4134 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4135 if (flags & SCF_DO_STCLASS_AND) {
4136 for (value = 0; value < 256; value++)
4137 if (!is_VERTWS_cp(value))
4138 ANYOF_BITMAP_CLEAR(data->start_class, value);
4141 for (value = 0; value < 256; value++)
4142 if (is_VERTWS_cp(value))
4143 ANYOF_BITMAP_SET(data->start_class, value);
4145 if (flags & SCF_DO_STCLASS_OR)
4146 cl_and(data->start_class, and_withp);
4147 flags &= ~SCF_DO_STCLASS;
4151 if (flags & SCF_DO_SUBSTR) {
4152 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4154 data->pos_delta += 1;
4155 data->longest = &(data->longest_float);
4158 else if (REGNODE_SIMPLE(OP(scan))) {
4161 if (flags & SCF_DO_SUBSTR) {
4162 SCAN_COMMIT(pRExC_state,data,minlenp);
4166 if (flags & SCF_DO_STCLASS) {
4167 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4169 /* Some of the logic below assumes that switching
4170 locale on will only add false positives. */
4171 switch (PL_regkind[OP(scan)]) {
4175 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4176 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4177 cl_anything(pRExC_state, data->start_class);
4180 if (OP(scan) == SANY)
4182 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4183 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4184 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4185 cl_anything(pRExC_state, data->start_class);
4187 if (flags & SCF_DO_STCLASS_AND || !value)
4188 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4191 if (flags & SCF_DO_STCLASS_AND)
4192 cl_and(data->start_class,
4193 (struct regnode_charclass_class*)scan);
4195 cl_or(pRExC_state, data->start_class,
4196 (struct regnode_charclass_class*)scan);
4199 if (flags & SCF_DO_STCLASS_AND) {
4200 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4201 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
4202 if (OP(scan) == ALNUMU) {
4203 for (value = 0; value < 256; value++) {
4204 if (!isWORDCHAR_L1(value)) {
4205 ANYOF_BITMAP_CLEAR(data->start_class, value);
4209 for (value = 0; value < 256; value++) {
4210 if (!isALNUM(value)) {
4211 ANYOF_BITMAP_CLEAR(data->start_class, value);
4218 if (data->start_class->flags & ANYOF_LOCALE)
4219 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
4221 /* Even if under locale, set the bits for non-locale
4222 * in case it isn't a true locale-node. This will
4223 * create false positives if it truly is locale */
4224 if (OP(scan) == ALNUMU) {
4225 for (value = 0; value < 256; value++) {
4226 if (isWORDCHAR_L1(value)) {
4227 ANYOF_BITMAP_SET(data->start_class, value);
4231 for (value = 0; value < 256; value++) {
4232 if (isALNUM(value)) {
4233 ANYOF_BITMAP_SET(data->start_class, value);
4240 if (flags & SCF_DO_STCLASS_AND) {
4241 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4242 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
4243 if (OP(scan) == NALNUMU) {
4244 for (value = 0; value < 256; value++) {
4245 if (isWORDCHAR_L1(value)) {
4246 ANYOF_BITMAP_CLEAR(data->start_class, value);
4250 for (value = 0; value < 256; value++) {
4251 if (isALNUM(value)) {
4252 ANYOF_BITMAP_CLEAR(data->start_class, value);
4259 if (data->start_class->flags & ANYOF_LOCALE)
4260 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
4262 /* Even if under locale, set the bits for non-locale in
4263 * case it isn't a true locale-node. This will create
4264 * false positives if it truly is locale */
4265 if (OP(scan) == NALNUMU) {
4266 for (value = 0; value < 256; value++) {
4267 if (! isWORDCHAR_L1(value)) {
4268 ANYOF_BITMAP_SET(data->start_class, value);
4272 for (value = 0; value < 256; value++) {
4273 if (! isALNUM(value)) {
4274 ANYOF_BITMAP_SET(data->start_class, value);
4281 if (flags & SCF_DO_STCLASS_AND) {
4282 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4283 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4284 if (OP(scan) == SPACEU) {
4285 for (value = 0; value < 256; value++) {
4286 if (!isSPACE_L1(value)) {
4287 ANYOF_BITMAP_CLEAR(data->start_class, value);
4291 for (value = 0; value < 256; value++) {
4292 if (!isSPACE(value)) {
4293 ANYOF_BITMAP_CLEAR(data->start_class, value);
4300 if (data->start_class->flags & ANYOF_LOCALE) {
4301 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4303 if (OP(scan) == SPACEU) {
4304 for (value = 0; value < 256; value++) {
4305 if (isSPACE_L1(value)) {
4306 ANYOF_BITMAP_SET(data->start_class, value);
4310 for (value = 0; value < 256; value++) {
4311 if (isSPACE(value)) {
4312 ANYOF_BITMAP_SET(data->start_class, value);
4319 if (flags & SCF_DO_STCLASS_AND) {
4320 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4321 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4322 if (OP(scan) == NSPACEU) {
4323 for (value = 0; value < 256; value++) {
4324 if (isSPACE_L1(value)) {
4325 ANYOF_BITMAP_CLEAR(data->start_class, value);
4329 for (value = 0; value < 256; value++) {
4330 if (isSPACE(value)) {
4331 ANYOF_BITMAP_CLEAR(data->start_class, value);
4338 if (data->start_class->flags & ANYOF_LOCALE)
4339 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4340 if (OP(scan) == NSPACEU) {
4341 for (value = 0; value < 256; value++) {
4342 if (!isSPACE_L1(value)) {
4343 ANYOF_BITMAP_SET(data->start_class, value);
4348 for (value = 0; value < 256; value++) {
4349 if (!isSPACE(value)) {
4350 ANYOF_BITMAP_SET(data->start_class, value);
4357 if (flags & SCF_DO_STCLASS_AND) {
4358 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4359 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4360 for (value = 0; value < 256; value++)
4361 if (!isDIGIT(value))
4362 ANYOF_BITMAP_CLEAR(data->start_class, value);
4366 if (data->start_class->flags & ANYOF_LOCALE)
4367 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4368 for (value = 0; value < 256; value++)
4370 ANYOF_BITMAP_SET(data->start_class, value);
4374 if (flags & SCF_DO_STCLASS_AND) {
4375 if (!(data->start_class->flags & ANYOF_LOCALE))
4376 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4377 for (value = 0; value < 256; value++)
4379 ANYOF_BITMAP_CLEAR(data->start_class, value);
4382 if (data->start_class->flags & ANYOF_LOCALE)
4383 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4384 for (value = 0; value < 256; value++)
4385 if (!isDIGIT(value))
4386 ANYOF_BITMAP_SET(data->start_class, value);
4389 CASE_SYNST_FNC(VERTWS);
4390 CASE_SYNST_FNC(HORIZWS);
4393 if (flags & SCF_DO_STCLASS_OR)
4394 cl_and(data->start_class, and_withp);
4395 flags &= ~SCF_DO_STCLASS;
4398 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4399 data->flags |= (OP(scan) == MEOL
4403 else if ( PL_regkind[OP(scan)] == BRANCHJ
4404 /* Lookbehind, or need to calculate parens/evals/stclass: */
4405 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4406 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4407 if ( OP(scan) == UNLESSM &&
4409 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4410 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4413 regnode *upto= regnext(scan);
4415 SV * const mysv_val=sv_newmortal();
4416 DEBUG_STUDYDATA("OPFAIL",data,depth);
4418 /*DEBUG_PARSE_MSG("opfail");*/
4419 regprop(RExC_rx, mysv_val, upto);
4420 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4421 SvPV_nolen_const(mysv_val),
4422 (IV)REG_NODE_NUM(upto),
4427 NEXT_OFF(scan) = upto - scan;
4428 for (opt= scan + 1; opt < upto ; opt++)
4429 OP(opt) = OPTIMIZED;
4433 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4434 || OP(scan) == UNLESSM )
4436 /* Negative Lookahead/lookbehind
4437 In this case we can't do fixed string optimisation.
4440 I32 deltanext, minnext, fake = 0;
4442 struct regnode_charclass_class intrnl;
4445 data_fake.flags = 0;
4447 data_fake.whilem_c = data->whilem_c;
4448 data_fake.last_closep = data->last_closep;
4451 data_fake.last_closep = &fake;
4452 data_fake.pos_delta = delta;
4453 if ( flags & SCF_DO_STCLASS && !scan->flags
4454 && OP(scan) == IFMATCH ) { /* Lookahead */
4455 cl_init(pRExC_state, &intrnl);
4456 data_fake.start_class = &intrnl;
4457 f |= SCF_DO_STCLASS_AND;
4459 if (flags & SCF_WHILEM_VISITED_POS)
4460 f |= SCF_WHILEM_VISITED_POS;
4461 next = regnext(scan);
4462 nscan = NEXTOPER(NEXTOPER(scan));
4463 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4464 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4467 FAIL("Variable length lookbehind not implemented");
4469 else if (minnext > (I32)U8_MAX) {
4470 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4472 scan->flags = (U8)minnext;
4475 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4477 if (data_fake.flags & SF_HAS_EVAL)
4478 data->flags |= SF_HAS_EVAL;
4479 data->whilem_c = data_fake.whilem_c;
4481 if (f & SCF_DO_STCLASS_AND) {
4482 if (flags & SCF_DO_STCLASS_OR) {
4483 /* OR before, AND after: ideally we would recurse with
4484 * data_fake to get the AND applied by study of the
4485 * remainder of the pattern, and then derecurse;
4486 * *** HACK *** for now just treat as "no information".
4487 * See [perl #56690].
4489 cl_init(pRExC_state, data->start_class);
4491 /* AND before and after: combine and continue */
4492 const int was = (data->start_class->flags & ANYOF_EOS);
4494 cl_and(data->start_class, &intrnl);
4496 data->start_class->flags |= ANYOF_EOS;
4500 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4502 /* Positive Lookahead/lookbehind
4503 In this case we can do fixed string optimisation,
4504 but we must be careful about it. Note in the case of
4505 lookbehind the positions will be offset by the minimum
4506 length of the pattern, something we won't know about
4507 until after the recurse.
4509 I32 deltanext, fake = 0;
4511 struct regnode_charclass_class intrnl;
4513 /* We use SAVEFREEPV so that when the full compile
4514 is finished perl will clean up the allocated
4515 minlens when it's all done. This way we don't
4516 have to worry about freeing them when we know
4517 they wont be used, which would be a pain.
4520 Newx( minnextp, 1, I32 );
4521 SAVEFREEPV(minnextp);
4524 StructCopy(data, &data_fake, scan_data_t);
4525 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4528 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4529 data_fake.last_found=newSVsv(data->last_found);
4533 data_fake.last_closep = &fake;
4534 data_fake.flags = 0;
4535 data_fake.pos_delta = delta;
4537 data_fake.flags |= SF_IS_INF;
4538 if ( flags & SCF_DO_STCLASS && !scan->flags
4539 && OP(scan) == IFMATCH ) { /* Lookahead */
4540 cl_init(pRExC_state, &intrnl);
4541 data_fake.start_class = &intrnl;
4542 f |= SCF_DO_STCLASS_AND;
4544 if (flags & SCF_WHILEM_VISITED_POS)
4545 f |= SCF_WHILEM_VISITED_POS;
4546 next = regnext(scan);
4547 nscan = NEXTOPER(NEXTOPER(scan));
4549 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4550 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4553 FAIL("Variable length lookbehind not implemented");
4555 else if (*minnextp > (I32)U8_MAX) {
4556 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4558 scan->flags = (U8)*minnextp;
4563 if (f & SCF_DO_STCLASS_AND) {
4564 const int was = (data->start_class->flags & ANYOF_EOS);
4566 cl_and(data->start_class, &intrnl);
4568 data->start_class->flags |= ANYOF_EOS;
4571 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4573 if (data_fake.flags & SF_HAS_EVAL)
4574 data->flags |= SF_HAS_EVAL;
4575 data->whilem_c = data_fake.whilem_c;
4576 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4577 if (RExC_rx->minlen<*minnextp)
4578 RExC_rx->minlen=*minnextp;
4579 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4580 SvREFCNT_dec(data_fake.last_found);
4582 if ( data_fake.minlen_fixed != minlenp )
4584 data->offset_fixed= data_fake.offset_fixed;
4585 data->minlen_fixed= data_fake.minlen_fixed;
4586 data->lookbehind_fixed+= scan->flags;
4588 if ( data_fake.minlen_float != minlenp )
4590 data->minlen_float= data_fake.minlen_float;
4591 data->offset_float_min=data_fake.offset_float_min;
4592 data->offset_float_max=data_fake.offset_float_max;
4593 data->lookbehind_float+= scan->flags;
4600 else if (OP(scan) == OPEN) {
4601 if (stopparen != (I32)ARG(scan))
4604 else if (OP(scan) == CLOSE) {
4605 if (stopparen == (I32)ARG(scan)) {
4608 if ((I32)ARG(scan) == is_par) {
4609 next = regnext(scan);
4611 if ( next && (OP(next) != WHILEM) && next < last)
4612 is_par = 0; /* Disable optimization */
4615 *(data->last_closep) = ARG(scan);
4617 else if (OP(scan) == EVAL) {
4619 data->flags |= SF_HAS_EVAL;
4621 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4622 if (flags & SCF_DO_SUBSTR) {
4623 SCAN_COMMIT(pRExC_state,data,minlenp);
4624 flags &= ~SCF_DO_SUBSTR;
4626 if (data && OP(scan)==ACCEPT) {
4627 data->flags |= SCF_SEEN_ACCEPT;
4632 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4634 if (flags & SCF_DO_SUBSTR) {
4635 SCAN_COMMIT(pRExC_state,data,minlenp);
4636 data->longest = &(data->longest_float);
4638 is_inf = is_inf_internal = 1;
4639 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4640 cl_anything(pRExC_state, data->start_class);
4641 flags &= ~SCF_DO_STCLASS;
4643 else if (OP(scan) == GPOS) {
4644 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4645 !(delta || is_inf || (data && data->pos_delta)))
4647 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4648 RExC_rx->extflags |= RXf_ANCH_GPOS;
4649 if (RExC_rx->gofs < (U32)min)
4650 RExC_rx->gofs = min;
4652 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4656 #ifdef TRIE_STUDY_OPT
4657 #ifdef FULL_TRIE_STUDY
4658 else if (PL_regkind[OP(scan)] == TRIE) {
4659 /* NOTE - There is similar code to this block above for handling
4660 BRANCH nodes on the initial study. If you change stuff here
4662 regnode *trie_node= scan;
4663 regnode *tail= regnext(scan);
4664 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4665 I32 max1 = 0, min1 = I32_MAX;
4666 struct regnode_charclass_class accum;
4668 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4669 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4670 if (flags & SCF_DO_STCLASS)
4671 cl_init_zero(pRExC_state, &accum);
4677 const regnode *nextbranch= NULL;
4680 for ( word=1 ; word <= trie->wordcount ; word++)
4682 I32 deltanext=0, minnext=0, f = 0, fake;
4683 struct regnode_charclass_class this_class;
4685 data_fake.flags = 0;
4687 data_fake.whilem_c = data->whilem_c;
4688 data_fake.last_closep = data->last_closep;
4691 data_fake.last_closep = &fake;
4692 data_fake.pos_delta = delta;
4693 if (flags & SCF_DO_STCLASS) {
4694 cl_init(pRExC_state, &this_class);
4695 data_fake.start_class = &this_class;
4696 f = SCF_DO_STCLASS_AND;
4698 if (flags & SCF_WHILEM_VISITED_POS)
4699 f |= SCF_WHILEM_VISITED_POS;
4701 if (trie->jump[word]) {
4703 nextbranch = trie_node + trie->jump[0];
4704 scan= trie_node + trie->jump[word];
4705 /* We go from the jump point to the branch that follows
4706 it. Note this means we need the vestigal unused branches
4707 even though they arent otherwise used.
4709 minnext = study_chunk(pRExC_state, &scan, minlenp,
4710 &deltanext, (regnode *)nextbranch, &data_fake,
4711 stopparen, recursed, NULL, f,depth+1);
4713 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4714 nextbranch= regnext((regnode*)nextbranch);
4716 if (min1 > (I32)(minnext + trie->minlen))
4717 min1 = minnext + trie->minlen;
4718 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4719 max1 = minnext + deltanext + trie->maxlen;
4720 if (deltanext == I32_MAX)
4721 is_inf = is_inf_internal = 1;
4723 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4725 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4726 if ( stopmin > min + min1)
4727 stopmin = min + min1;
4728 flags &= ~SCF_DO_SUBSTR;
4730 data->flags |= SCF_SEEN_ACCEPT;
4733 if (data_fake.flags & SF_HAS_EVAL)
4734 data->flags |= SF_HAS_EVAL;
4735 data->whilem_c = data_fake.whilem_c;
4737 if (flags & SCF_DO_STCLASS)
4738 cl_or(pRExC_state, &accum, &this_class);
4741 if (flags & SCF_DO_SUBSTR) {
4742 data->pos_min += min1;
4743 data->pos_delta += max1 - min1;
4744 if (max1 != min1 || is_inf)
4745 data->longest = &(data->longest_float);
4748 delta += max1 - min1;
4749 if (flags & SCF_DO_STCLASS_OR) {
4750 cl_or(pRExC_state, data->start_class, &accum);
4752 cl_and(data->start_class, and_withp);
4753 flags &= ~SCF_DO_STCLASS;
4756 else if (flags & SCF_DO_STCLASS_AND) {
4758 cl_and(data->start_class, &accum);
4759 flags &= ~SCF_DO_STCLASS;
4762 /* Switch to OR mode: cache the old value of
4763 * data->start_class */
4765 StructCopy(data->start_class, and_withp,
4766 struct regnode_charclass_class);
4767 flags &= ~SCF_DO_STCLASS_AND;
4768 StructCopy(&accum, data->start_class,
4769 struct regnode_charclass_class);
4770 flags |= SCF_DO_STCLASS_OR;
4771 data->start_class->flags |= ANYOF_EOS;
4778 else if (PL_regkind[OP(scan)] == TRIE) {
4779 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4782 min += trie->minlen;
4783 delta += (trie->maxlen - trie->minlen);
4784 flags &= ~SCF_DO_STCLASS; /* xxx */
4785 if (flags & SCF_DO_SUBSTR) {
4786 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4787 data->pos_min += trie->minlen;
4788 data->pos_delta += (trie->maxlen - trie->minlen);
4789 if (trie->maxlen != trie->minlen)
4790 data->longest = &(data->longest_float);
4792 if (trie->jump) /* no more substrings -- for now /grr*/
4793 flags &= ~SCF_DO_SUBSTR;
4795 #endif /* old or new */
4796 #endif /* TRIE_STUDY_OPT */
4798 /* Else: zero-length, ignore. */
4799 scan = regnext(scan);
4804 stopparen = frame->stop;
4805 frame = frame->prev;
4806 goto fake_study_recurse;
4811 DEBUG_STUDYDATA("pre-fin:",data,depth);
4814 *deltap = is_inf_internal ? I32_MAX : delta;
4815 if (flags & SCF_DO_SUBSTR && is_inf)
4816 data->pos_delta = I32_MAX - data->pos_min;
4817 if (is_par > (I32)U8_MAX)
4819 if (is_par && pars==1 && data) {
4820 data->flags |= SF_IN_PAR;
4821 data->flags &= ~SF_HAS_PAR;
4823 else if (pars && data) {
4824 data->flags |= SF_HAS_PAR;
4825 data->flags &= ~SF_IN_PAR;
4827 if (flags & SCF_DO_STCLASS_OR)
4828 cl_and(data->start_class, and_withp);
4829 if (flags & SCF_TRIE_RESTUDY)
4830 data->flags |= SCF_TRIE_RESTUDY;
4832 DEBUG_STUDYDATA("post-fin:",data,depth);
4834 return min < stopmin ? min : stopmin;
4838 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4840 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4842 PERL_ARGS_ASSERT_ADD_DATA;
4844 Renewc(RExC_rxi->data,
4845 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4846 char, struct reg_data);
4848 Renew(RExC_rxi->data->what, count + n, U8);
4850 Newx(RExC_rxi->data->what, n, U8);
4851 RExC_rxi->data->count = count + n;
4852 Copy(s, RExC_rxi->data->what + count, n, U8);
4856 /*XXX: todo make this not included in a non debugging perl */
4857 #ifndef PERL_IN_XSUB_RE
4859 Perl_reginitcolors(pTHX)
4862 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4864 char *t = savepv(s);
4868 t = strchr(t, '\t');
4874 PL_colors[i] = t = (char *)"";
4879 PL_colors[i++] = (char *)"";
4886 #ifdef TRIE_STUDY_OPT
4887 #define CHECK_RESTUDY_GOTO \
4889 (data.flags & SCF_TRIE_RESTUDY) \
4893 #define CHECK_RESTUDY_GOTO
4897 * pregcomp - compile a regular expression into internal code
4899 * Decides which engine's compiler to call based on the hint currently in
4903 #ifndef PERL_IN_XSUB_RE
4905 /* return the currently in-scope regex engine (or the default if none) */
4907 regexp_engine const *
4908 Perl_current_re_engine(pTHX)
4912 if (IN_PERL_COMPILETIME) {
4913 HV * const table = GvHV(PL_hintgv);
4917 return &PL_core_reg_engine;
4918 ptr = hv_fetchs(table, "regcomp", FALSE);
4919 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4920 return &PL_core_reg_engine;
4921 return INT2PTR(regexp_engine*,SvIV(*ptr));
4925 if (!PL_curcop->cop_hints_hash)
4926 return &PL_core_reg_engine;
4927 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4928 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4929 return &PL_core_reg_engine;
4930 return INT2PTR(regexp_engine*,SvIV(ptr));
4936 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4939 regexp_engine const *eng = current_re_engine();
4940 GET_RE_DEBUG_FLAGS_DECL;
4942 PERL_ARGS_ASSERT_PREGCOMP;
4944 /* Dispatch a request to compile a regexp to correct regexp engine. */
4946 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4949 return CALLREGCOMP_ENG(eng, pattern, flags);
4953 /* public(ish) wrapper for Perl_re_op_compile that only takes an SV
4954 * pattern rather than a list of OPs */
4957 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4959 SV *pat = pattern; /* defeat constness! */
4960 PERL_ARGS_ASSERT_RE_COMPILE;
4961 return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
4962 NULL, NULL, rx_flags, 0);
4967 * Perl_re_op_compile - the perl internal RE engine's function to compile a
4968 * regular expression into internal code.
4969 * The pattern may be passed either as:
4970 * a list of SVs (patternp plus pat_count)
4971 * a list of OPs (expr)
4972 * If both are passed, the SV list is used, but the OP list indicates
4973 * which SVs are actually pre-compiled code blocks
4975 * The SVs in the list have magic and qr overloading applied to them (and
4976 * the list may be modified in-place with replacement SVs in the latter
4979 * If the pattern hasn't changed from old_re, then old_re will be
4982 * eng is the current engine. If that engine has an op_comp method, then
4983 * handle directly (i.e. we assume that op_comp was us); otherwise, just
4984 * do the initial concatenation of arguments and pass on to the external
4987 * If is_bare_re is not null, set it to a boolean indicating whether the
4988 * arg list reduced (after overloading) to a single bare regex which has
4989 * been returned (i.e. /$qr/).
4991 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
4993 * pm_flags contains the PMf_* flags from the calling PMOP. Currently
4994 * we're only interested in PMf_HAS_CV and PMf_IS_QR.
4996 * We can't allocate space until we know how big the compiled form will be,
4997 * but we can't compile it (and thus know how big it is) until we've got a
4998 * place to put the code. So we cheat: we compile it twice, once with code
4999 * generation turned off and size counting turned on, and once "for real".
5000 * This also means that we don't allocate space until we are sure that the
5001 * thing really will compile successfully, and we never have to move the
5002 * code and thus invalidate pointers into it. (Note that it has to be in
5003 * one piece because free() must be able to free it all.) [NB: not true in perl]
5005 * Beware that the optimization-preparation code in here knows about some
5006 * of the structure of the compiled regexp. [I'll say.]
5010 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5011 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5012 int *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5017 register regexp_internal *ri;
5027 /* these are all flags - maybe they should be turned
5028 * into a single int with different bit masks */
5029 I32 sawlookahead = 0;
5032 bool used_setjump = FALSE;
5033 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5034 bool code_is_utf8 = 0;
5039 RExC_state_t RExC_state;
5040 RExC_state_t * const pRExC_state = &RExC_state;
5041 #ifdef TRIE_STUDY_OPT
5043 RExC_state_t copyRExC_state;
5045 GET_RE_DEBUG_FLAGS_DECL;
5047 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5049 DEBUG_r(if (!PL_colorset) reginitcolors());
5051 #ifndef PERL_IN_XSUB_RE
5052 /* Initialize these here instead of as-needed, as is quick and avoids
5053 * having to test them each time otherwise */
5054 if (! PL_AboveLatin1) {
5055 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5056 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5057 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5059 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5060 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5062 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5063 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5065 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5066 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5068 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5070 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5071 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5073 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5075 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5076 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5078 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5079 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5081 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5082 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5084 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5085 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5087 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5088 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5090 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5091 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5093 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5094 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5096 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5097 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5099 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5101 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5102 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5104 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5105 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5109 pRExC_state->code_blocks = NULL;
5110 pRExC_state->num_code_blocks = 0;
5115 if (expr && (expr->op_type == OP_LIST ||
5116 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5118 /* is the source UTF8, and how many code blocks are there? */
5122 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5123 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5125 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5126 /* count of DO blocks */
5130 pRExC_state->num_code_blocks = ncode;
5131 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5136 /* handle a list of SVs */
5140 /* apply magic and RE overloading to each arg */
5141 for (svp = patternp; svp < patternp + pat_count; svp++) {
5144 if (SvROK(rx) && SvAMAGIC(rx)) {
5145 SV *sv = AMG_CALLunary(rx, regexp_amg);
5149 if (SvTYPE(sv) != SVt_REGEXP)
5150 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5156 if (pat_count > 1) {
5157 /* concat multiple args and find any code block indexes */
5163 if (pRExC_state->num_code_blocks) {
5164 o = cLISTOPx(expr)->op_first;
5165 assert(o->op_type == OP_PUSHMARK);
5169 pat = newSVpvn("", 0);
5172 /* determine if the pattern is going to be utf8 (needed
5173 * in advance to align code block indices correctly).
5174 * XXX This could fail to be detected for an arg with
5175 * overloading but not concat overloading; but the main effect
5176 * in this obscure case is to need a 'use re eval' for a
5177 * literal code block */
5178 for (svp = patternp; svp < patternp + pat_count; svp++) {
5185 for (svp = patternp; svp < patternp + pat_count; svp++) {
5186 SV *sv, *msv = *svp;
5190 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5191 assert(n < pRExC_state->num_code_blocks);
5192 pRExC_state->code_blocks[n].start = SvCUR(pat);
5193 pRExC_state->code_blocks[n].block = o;
5194 pRExC_state->code_blocks[n].src_regex = NULL;
5197 o = o->op_sibling; /* skip CONST */
5203 /* extract any code blocks within any embedded qr//'s */
5207 if (SvTYPE(rx) == SVt_REGEXP
5208 && RX_ENGINE((REGEXP*)rx)->op_comp)
5211 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5212 if (ri->num_code_blocks) {
5214 /* the presence of an embedded qr// with code means
5215 * we should always recompile: the text of the
5216 * qr// may not have changed, but it may be a
5217 * different closure than last time */
5219 Renew(pRExC_state->code_blocks,
5220 pRExC_state->num_code_blocks + ri->num_code_blocks,
5221 struct reg_code_block);
5222 pRExC_state->num_code_blocks += ri->num_code_blocks;
5223 for (i=0; i < ri->num_code_blocks; i++) {
5224 struct reg_code_block *src, *dst;
5225 STRLEN offset = SvCUR(pat)
5226 + ((struct regexp *)SvANY(rx))->pre_prefix;
5227 assert(n < pRExC_state->num_code_blocks);
5228 src = &ri->code_blocks[i];
5229 dst = &pRExC_state->code_blocks[n];
5230 dst->start = src->start + offset;
5231 dst->end = src->end + offset;
5232 dst->block = src->block;
5233 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5242 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5243 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5246 /* overloading involved: all bets are off over literal
5247 * code. Pretend we haven't seen it */
5248 pRExC_state->num_code_blocks -= n;
5253 sv_catsv_nomg(pat, msv);
5255 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5263 /* handle bare regex: foo =~ $re */
5268 if (SvTYPE(re) == SVt_REGEXP) {
5272 Safefree(pRExC_state->code_blocks);
5278 /* not a list of SVs, so must be a list of OPs */
5280 if (expr->op_type == OP_LIST) {
5285 pat = newSVpvn("", 0);
5290 /* given a list of CONSTs and DO blocks in expr, append all
5291 * the CONSTs to pat, and record the start and end of each
5292 * code block in code_blocks[] (each DO{} op is followed by an
5293 * OP_CONST containing the corresponding literal '(?{...})
5296 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5297 if (o->op_type == OP_CONST) {
5298 sv_catsv(pat, cSVOPo_sv);
5300 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5304 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5305 assert(i+1 < pRExC_state->num_code_blocks);
5306 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5307 pRExC_state->code_blocks[i].block = o;
5308 pRExC_state->code_blocks[i].src_regex = NULL;
5314 assert(expr->op_type == OP_CONST);
5315 pat = cSVOPx_sv(expr);
5319 exp = SvPV_nomg(pat, plen);
5321 if (!eng->op_comp) {
5322 if ((SvUTF8(pat) && IN_BYTES)
5323 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5325 /* make a temporary copy; either to convert to bytes,
5326 * or to avoid repeating get-magic / overloaded stringify */
5327 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5328 (IN_BYTES ? 0 : SvUTF8(pat)));
5330 Safefree(pRExC_state->code_blocks);
5331 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5334 /* ignore the utf8ness if the pattern is 0 length */
5335 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5336 RExC_uni_semantics = 0;
5337 RExC_contains_locale = 0;
5339 /****************** LONG JUMP TARGET HERE***********************/
5340 /* Longjmp back to here if have to switch in midstream to utf8 */
5341 if (! RExC_orig_utf8) {
5342 JMPENV_PUSH(jump_ret);
5343 used_setjump = TRUE;
5346 if (jump_ret == 0) { /* First time through */
5350 SV *dsv= sv_newmortal();
5351 RE_PV_QUOTED_DECL(s, RExC_utf8,
5352 dsv, exp, plen, 60);
5353 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5354 PL_colors[4],PL_colors[5],s);
5357 else { /* longjumped back */
5360 STRLEN s = 0, d = 0;
5363 /* If the cause for the longjmp was other than changing to utf8, pop
5364 * our own setjmp, and longjmp to the correct handler */
5365 if (jump_ret != UTF8_LONGJMP) {
5367 JMPENV_JUMP(jump_ret);
5372 /* It's possible to write a regexp in ascii that represents Unicode
5373 codepoints outside of the byte range, such as via \x{100}. If we
5374 detect such a sequence we have to convert the entire pattern to utf8
5375 and then recompile, as our sizing calculation will have been based
5376 on 1 byte == 1 character, but we will need to use utf8 to encode
5377 at least some part of the pattern, and therefore must convert the whole
5380 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5381 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5383 /* upgrade pattern to UTF8, and if there are code blocks,
5384 * recalculate the indices.
5385 * This is essentially an unrolled Perl_bytes_to_utf8() */
5387 src = (U8*)SvPV_nomg(pat, plen);
5388 Newx(dst, plen * 2 + 1, U8);
5391 const UV uv = NATIVE_TO_ASCII(src[s]);
5392 if (UNI_IS_INVARIANT(uv))
5393 dst[d] = (U8)UTF_TO_NATIVE(uv);
5395 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5396 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5398 if (n < pRExC_state->num_code_blocks) {
5399 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5400 pRExC_state->code_blocks[n].start = d;
5401 assert(dst[d] == '(');
5404 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5405 pRExC_state->code_blocks[n].end = d;
5406 assert(dst[d] == ')');
5419 RExC_orig_utf8 = RExC_utf8 = 1;
5422 /* return old regex if pattern hasn't changed */
5426 && !!RX_UTF8(old_re) == !!RExC_utf8
5427 && RX_PRECOMP(old_re)
5428 && RX_PRELEN(old_re) == plen
5429 && memEQ(RX_PRECOMP(old_re), exp, plen))
5431 /* see if there are any run-time code blocks */
5435 for (s = 0; s < plen; s++) {
5436 if (n < pRExC_state->num_code_blocks
5437 && s == pRExC_state->code_blocks[n].start)
5439 s = pRExC_state->code_blocks[n].end;
5443 if (exp[s] == '(' && exp[s+1] == '?' &&
5444 (exp[s+2] == '{' || (exp[s+2] == '?' && exp[s+3] == '{')))
5450 /* with runtime code, always recompile */
5452 ReREFCNT_inc(old_re);
5456 Safefree(pRExC_state->code_blocks);
5461 #ifdef TRIE_STUDY_OPT
5465 rx_flags = orig_rx_flags;
5467 if (initial_charset == REGEX_LOCALE_CHARSET) {
5468 RExC_contains_locale = 1;
5470 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5472 /* Set to use unicode semantics if the pattern is in utf8 and has the
5473 * 'depends' charset specified, as it means unicode when utf8 */
5474 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5478 RExC_flags = rx_flags;
5479 RExC_pm_flags = pm_flags;
5483 RExC_in_lookbehind = 0;
5484 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5485 RExC_seen_evals = 0;
5487 RExC_override_recoding = 0;
5489 /* First pass: determine size, legality. */
5497 RExC_emit = &PL_regdummy;
5498 RExC_whilem_seen = 0;
5499 RExC_open_parens = NULL;
5500 RExC_close_parens = NULL;
5502 RExC_paren_names = NULL;
5504 RExC_paren_name_list = NULL;
5506 RExC_recurse = NULL;
5507 RExC_recurse_count = 0;
5508 pRExC_state->code_index = 0;
5510 #if 0 /* REGC() is (currently) a NOP at the first pass.
5511 * Clever compilers notice this and complain. --jhi */
5512 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5514 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
5515 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5516 RExC_precomp = NULL;
5517 Safefree(pRExC_state->code_blocks);
5521 /* Here, finished first pass. Get rid of any added setjmp */
5527 PerlIO_printf(Perl_debug_log,
5528 "Required size %"IVdf" nodes\n"
5529 "Starting second pass (creation)\n",
5532 RExC_lastparse=NULL;
5535 /* The first pass could have found things that force Unicode semantics */
5536 if ((RExC_utf8 || RExC_uni_semantics)
5537 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5539 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5542 /* Small enough for pointer-storage convention?
5543 If extralen==0, this means that we will not need long jumps. */
5544 if (RExC_size >= 0x10000L && RExC_extralen)
5545 RExC_size += RExC_extralen;
5548 if (RExC_whilem_seen > 15)
5549 RExC_whilem_seen = 15;
5551 /* Allocate space and zero-initialize. Note, the two step process
5552 of zeroing when in debug mode, thus anything assigned has to
5553 happen after that */
5554 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5555 r = (struct regexp*)SvANY(rx);
5556 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5557 char, regexp_internal);
5558 if ( r == NULL || ri == NULL )
5559 FAIL("Regexp out of space");
5561 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5562 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5564 /* bulk initialize base fields with 0. */
5565 Zero(ri, sizeof(regexp_internal), char);
5568 /* non-zero initialization begins here */
5571 r->extflags = rx_flags;
5572 if (pm_flags & PMf_IS_QR) {
5573 ri->code_blocks = pRExC_state->code_blocks;
5574 ri->num_code_blocks = pRExC_state->num_code_blocks;
5577 SAVEFREEPV(pRExC_state->code_blocks);
5580 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5581 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5583 /* The caret is output if there are any defaults: if not all the STD
5584 * flags are set, or if no character set specifier is needed */
5586 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5588 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5589 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5590 >> RXf_PMf_STD_PMMOD_SHIFT);
5591 const char *fptr = STD_PAT_MODS; /*"msix"*/
5593 /* Allocate for the worst case, which is all the std flags are turned
5594 * on. If more precision is desired, we could do a population count of
5595 * the flags set. This could be done with a small lookup table, or by
5596 * shifting, masking and adding, or even, when available, assembly
5597 * language for a machine-language population count.
5598 * We never output a minus, as all those are defaults, so are
5599 * covered by the caret */
5600 const STRLEN wraplen = plen + has_p + has_runon
5601 + has_default /* If needs a caret */
5603 /* If needs a character set specifier */
5604 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5605 + (sizeof(STD_PAT_MODS) - 1)
5606 + (sizeof("(?:)") - 1);
5608 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5611 SvFLAGS(rx) |= SVf_UTF8;
5614 /* If a default, cover it using the caret */
5616 *p++= DEFAULT_PAT_MOD;
5620 const char* const name = get_regex_charset_name(r->extflags, &len);
5621 Copy(name, p, len, char);
5625 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5628 while((ch = *fptr++)) {
5636 Copy(RExC_precomp, p, plen, char);
5637 assert ((RX_WRAPPED(rx) - p) < 16);
5638 r->pre_prefix = p - RX_WRAPPED(rx);
5644 SvCUR_set(rx, p - SvPVX_const(rx));
5648 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5650 if (RExC_seen & REG_SEEN_RECURSE) {
5651 Newxz(RExC_open_parens, RExC_npar,regnode *);
5652 SAVEFREEPV(RExC_open_parens);
5653 Newxz(RExC_close_parens,RExC_npar,regnode *);
5654 SAVEFREEPV(RExC_close_parens);
5657 /* Useful during FAIL. */
5658 #ifdef RE_TRACK_PATTERN_OFFSETS
5659 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5660 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5661 "%s %"UVuf" bytes for offset annotations.\n",
5662 ri->u.offsets ? "Got" : "Couldn't get",
5663 (UV)((2*RExC_size+1) * sizeof(U32))));
5665 SetProgLen(ri,RExC_size);
5670 /* Second pass: emit code. */
5671 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5672 RExC_pm_flags = pm_flags;
5677 RExC_emit_start = ri->program;
5678 RExC_emit = ri->program;
5679 RExC_emit_bound = ri->program + RExC_size + 1;
5680 pRExC_state->code_index = 0;
5682 /* Store the count of eval-groups for security checks: */
5683 RExC_rx->seen_evals = RExC_seen_evals;
5684 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5685 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5689 /* XXXX To minimize changes to RE engine we always allocate
5690 3-units-long substrs field. */
5691 Newx(r->substrs, 1, struct reg_substr_data);
5692 if (RExC_recurse_count) {
5693 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5694 SAVEFREEPV(RExC_recurse);
5698 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5699 Zero(r->substrs, 1, struct reg_substr_data);
5701 #ifdef TRIE_STUDY_OPT
5703 StructCopy(&zero_scan_data, &data, scan_data_t);
5704 copyRExC_state = RExC_state;
5707 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5709 RExC_state = copyRExC_state;
5710 if (seen & REG_TOP_LEVEL_BRANCHES)
5711 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5713 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
5714 if (data.last_found) {
5715 SvREFCNT_dec(data.longest_fixed);
5716 SvREFCNT_dec(data.longest_float);
5717 SvREFCNT_dec(data.last_found);
5719 StructCopy(&zero_scan_data, &data, scan_data_t);
5722 StructCopy(&zero_scan_data, &data, scan_data_t);
5725 /* Dig out information for optimizations. */
5726 r->extflags = RExC_flags; /* was pm_op */
5727 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5730 SvUTF8_on(rx); /* Unicode in it? */
5731 ri->regstclass = NULL;
5732 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
5733 r->intflags |= PREGf_NAUGHTY;
5734 scan = ri->program + 1; /* First BRANCH. */
5736 /* testing for BRANCH here tells us whether there is "must appear"
5737 data in the pattern. If there is then we can use it for optimisations */
5738 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
5740 STRLEN longest_float_length, longest_fixed_length;
5741 struct regnode_charclass_class ch_class; /* pointed to by data */
5743 I32 last_close = 0; /* pointed to by data */
5744 regnode *first= scan;
5745 regnode *first_next= regnext(first);
5747 * Skip introductions and multiplicators >= 1
5748 * so that we can extract the 'meat' of the pattern that must
5749 * match in the large if() sequence following.
5750 * NOTE that EXACT is NOT covered here, as it is normally
5751 * picked up by the optimiser separately.
5753 * This is unfortunate as the optimiser isnt handling lookahead
5754 * properly currently.
5757 while ((OP(first) == OPEN && (sawopen = 1)) ||
5758 /* An OR of *one* alternative - should not happen now. */
5759 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
5760 /* for now we can't handle lookbehind IFMATCH*/
5761 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
5762 (OP(first) == PLUS) ||
5763 (OP(first) == MINMOD) ||
5764 /* An {n,m} with n>0 */
5765 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5766 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
5769 * the only op that could be a regnode is PLUS, all the rest
5770 * will be regnode_1 or regnode_2.
5773 if (OP(first) == PLUS)
5776 first += regarglen[OP(first)];
5778 first = NEXTOPER(first);
5779 first_next= regnext(first);
5782 /* Starting-point info. */
5784 DEBUG_PEEP("first:",first,0);
5785 /* Ignore EXACT as we deal with it later. */
5786 if (PL_regkind[OP(first)] == EXACT) {
5787 if (OP(first) == EXACT)
5788 NOOP; /* Empty, get anchored substr later. */
5790 ri->regstclass = first;
5793 else if (PL_regkind[OP(first)] == TRIE &&
5794 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
5797 /* this can happen only on restudy */
5798 if ( OP(first) == TRIE ) {
5799 struct regnode_1 *trieop = (struct regnode_1 *)
5800 PerlMemShared_calloc(1, sizeof(struct regnode_1));
5801 StructCopy(first,trieop,struct regnode_1);
5802 trie_op=(regnode *)trieop;
5804 struct regnode_charclass *trieop = (struct regnode_charclass *)
5805 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
5806 StructCopy(first,trieop,struct regnode_charclass);
5807 trie_op=(regnode *)trieop;
5810 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
5811 ri->regstclass = trie_op;
5814 else if (REGNODE_SIMPLE(OP(first)))
5815 ri->regstclass = first;
5816 else if (PL_regkind[OP(first)] == BOUND ||
5817 PL_regkind[OP(first)] == NBOUND)
5818 ri->regstclass = first;
5819 else if (PL_regkind[OP(first)] == BOL) {
5820 r->extflags |= (OP(first) == MBOL
5822 : (OP(first) == SBOL
5825 first = NEXTOPER(first);
5828 else if (OP(first) == GPOS) {
5829 r->extflags |= RXf_ANCH_GPOS;
5830 first = NEXTOPER(first);
5833 else if ((!sawopen || !RExC_sawback) &&
5834 (OP(first) == STAR &&
5835 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
5836 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
5838 /* turn .* into ^.* with an implied $*=1 */
5840 (OP(NEXTOPER(first)) == REG_ANY)
5843 r->extflags |= type;
5844 r->intflags |= PREGf_IMPLICIT;
5845 first = NEXTOPER(first);
5848 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
5849 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
5850 /* x+ must match at the 1st pos of run of x's */
5851 r->intflags |= PREGf_SKIP;
5853 /* Scan is after the zeroth branch, first is atomic matcher. */
5854 #ifdef TRIE_STUDY_OPT
5857 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5858 (IV)(first - scan + 1))
5862 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5863 (IV)(first - scan + 1))
5869 * If there's something expensive in the r.e., find the
5870 * longest literal string that must appear and make it the
5871 * regmust. Resolve ties in favor of later strings, since
5872 * the regstart check works with the beginning of the r.e.
5873 * and avoiding duplication strengthens checking. Not a
5874 * strong reason, but sufficient in the absence of others.
5875 * [Now we resolve ties in favor of the earlier string if
5876 * it happens that c_offset_min has been invalidated, since the
5877 * earlier string may buy us something the later one won't.]
5880 data.longest_fixed = newSVpvs("");
5881 data.longest_float = newSVpvs("");
5882 data.last_found = newSVpvs("");
5883 data.longest = &(data.longest_fixed);
5885 if (!ri->regstclass) {
5886 cl_init(pRExC_state, &ch_class);
5887 data.start_class = &ch_class;
5888 stclass_flag = SCF_DO_STCLASS_AND;
5889 } else /* XXXX Check for BOUND? */
5891 data.last_closep = &last_close;
5893 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
5894 &data, -1, NULL, NULL,
5895 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
5901 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
5902 && data.last_start_min == 0 && data.last_end > 0
5903 && !RExC_seen_zerolen
5904 && !(RExC_seen & REG_SEEN_VERBARG)
5905 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5906 r->extflags |= RXf_CHECK_ALL;
5907 scan_commit(pRExC_state, &data,&minlen,0);
5908 SvREFCNT_dec(data.last_found);
5910 /* Note that code very similar to this but for anchored string
5911 follows immediately below, changes may need to be made to both.
5914 longest_float_length = CHR_SVLEN(data.longest_float);
5915 if (longest_float_length
5916 || (data.flags & SF_FL_BEFORE_EOL
5917 && (!(data.flags & SF_FL_BEFORE_MEOL)
5918 || (RExC_flags & RXf_PMf_MULTILINE))))
5922 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5923 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5924 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5925 && data.offset_fixed == data.offset_float_min
5926 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
5927 goto remove_float; /* As in (a)+. */
5929 /* copy the information about the longest float from the reg_scan_data
5930 over to the program. */
5931 if (SvUTF8(data.longest_float)) {
5932 r->float_utf8 = data.longest_float;
5933 r->float_substr = NULL;
5935 r->float_substr = data.longest_float;
5936 r->float_utf8 = NULL;
5938 /* float_end_shift is how many chars that must be matched that
5939 follow this item. We calculate it ahead of time as once the
5940 lookbehind offset is added in we lose the ability to correctly
5942 ml = data.minlen_float ? *(data.minlen_float)
5943 : (I32)longest_float_length;
5944 r->float_end_shift = ml - data.offset_float_min
5945 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5946 + data.lookbehind_float;
5947 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
5948 r->float_max_offset = data.offset_float_max;
5949 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
5950 r->float_max_offset -= data.lookbehind_float;
5952 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5953 && (!(data.flags & SF_FL_BEFORE_MEOL)
5954 || (RExC_flags & RXf_PMf_MULTILINE)));
5955 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
5959 r->float_substr = r->float_utf8 = NULL;
5960 SvREFCNT_dec(data.longest_float);
5961 longest_float_length = 0;
5964 /* Note that code very similar to this but for floating string
5965 is immediately above, changes may need to be made to both.
5968 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
5970 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5971 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5972 && (longest_fixed_length
5973 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5974 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5975 || (RExC_flags & RXf_PMf_MULTILINE)))) )
5979 /* copy the information about the longest fixed
5980 from the reg_scan_data over to the program. */
5981 if (SvUTF8(data.longest_fixed)) {
5982 r->anchored_utf8 = data.longest_fixed;
5983 r->anchored_substr = NULL;
5985 r->anchored_substr = data.longest_fixed;
5986 r->anchored_utf8 = NULL;
5988 /* fixed_end_shift is how many chars that must be matched that
5989 follow this item. We calculate it ahead of time as once the
5990 lookbehind offset is added in we lose the ability to correctly
5992 ml = data.minlen_fixed ? *(data.minlen_fixed)
5993 : (I32)longest_fixed_length;
5994 r->anchored_end_shift = ml - data.offset_fixed
5995 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5996 + data.lookbehind_fixed;
5997 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5999 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
6000 && (!(data.flags & SF_FIX_BEFORE_MEOL)
6001 || (RExC_flags & RXf_PMf_MULTILINE)));
6002 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
6005 r->anchored_substr = r->anchored_utf8 = NULL;
6006 SvREFCNT_dec(data.longest_fixed);
6007 longest_fixed_length = 0;
6010 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6011 ri->regstclass = NULL;
6013 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6015 && !(data.start_class->flags & ANYOF_EOS)
6016 && !cl_is_anything(data.start_class))
6018 const U32 n = add_data(pRExC_state, 1, "f");
6019 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6021 Newx(RExC_rxi->data->data[n], 1,
6022 struct regnode_charclass_class);
6023 StructCopy(data.start_class,
6024 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6025 struct regnode_charclass_class);
6026 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6027 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6028 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6029 regprop(r, sv, (regnode*)data.start_class);
6030 PerlIO_printf(Perl_debug_log,
6031 "synthetic stclass \"%s\".\n",
6032 SvPVX_const(sv));});
6035 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6036 if (longest_fixed_length > longest_float_length) {
6037 r->check_end_shift = r->anchored_end_shift;
6038 r->check_substr = r->anchored_substr;
6039 r->check_utf8 = r->anchored_utf8;
6040 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6041 if (r->extflags & RXf_ANCH_SINGLE)
6042 r->extflags |= RXf_NOSCAN;
6045 r->check_end_shift = r->float_end_shift;
6046 r->check_substr = r->float_substr;
6047 r->check_utf8 = r->float_utf8;
6048 r->check_offset_min = r->float_min_offset;
6049 r->check_offset_max = r->float_max_offset;
6051 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6052 This should be changed ASAP! */
6053 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6054 r->extflags |= RXf_USE_INTUIT;
6055 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6056 r->extflags |= RXf_INTUIT_TAIL;
6058 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6059 if ( (STRLEN)minlen < longest_float_length )
6060 minlen= longest_float_length;
6061 if ( (STRLEN)minlen < longest_fixed_length )
6062 minlen= longest_fixed_length;
6066 /* Several toplevels. Best we can is to set minlen. */
6068 struct regnode_charclass_class ch_class;
6071 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6073 scan = ri->program + 1;
6074 cl_init(pRExC_state, &ch_class);
6075 data.start_class = &ch_class;
6076 data.last_closep = &last_close;
6079 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6080 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6084 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6085 = r->float_substr = r->float_utf8 = NULL;
6087 if (!(data.start_class->flags & ANYOF_EOS)
6088 && !cl_is_anything(data.start_class))
6090 const U32 n = add_data(pRExC_state, 1, "f");
6091 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6093 Newx(RExC_rxi->data->data[n], 1,
6094 struct regnode_charclass_class);
6095 StructCopy(data.start_class,
6096 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6097 struct regnode_charclass_class);
6098 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6099 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6100 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6101 regprop(r, sv, (regnode*)data.start_class);
6102 PerlIO_printf(Perl_debug_log,
6103 "synthetic stclass \"%s\".\n",
6104 SvPVX_const(sv));});
6108 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6109 the "real" pattern. */
6111 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6112 (IV)minlen, (IV)r->minlen);
6114 r->minlenret = minlen;
6115 if (r->minlen < minlen)
6118 if (RExC_seen & REG_SEEN_GPOS)
6119 r->extflags |= RXf_GPOS_SEEN;
6120 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6121 r->extflags |= RXf_LOOKBEHIND_SEEN;
6122 if (RExC_seen & REG_SEEN_EVAL)
6123 r->extflags |= RXf_EVAL_SEEN;
6124 if (RExC_seen & REG_SEEN_CANY)
6125 r->extflags |= RXf_CANY_SEEN;
6126 if (RExC_seen & REG_SEEN_VERBARG)
6127 r->intflags |= PREGf_VERBARG_SEEN;
6128 if (RExC_seen & REG_SEEN_CUTGROUP)
6129 r->intflags |= PREGf_CUTGROUP_SEEN;
6130 if (RExC_paren_names)
6131 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6133 RXp_PAREN_NAMES(r) = NULL;
6135 #ifdef STUPID_PATTERN_CHECKS
6136 if (RX_PRELEN(rx) == 0)
6137 r->extflags |= RXf_NULL;
6138 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6139 /* XXX: this should happen BEFORE we compile */
6140 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6141 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6142 r->extflags |= RXf_WHITE;
6143 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6144 r->extflags |= RXf_START_ONLY;
6146 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
6147 /* XXX: this should happen BEFORE we compile */
6148 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
6150 regnode *first = ri->program + 1;
6153 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6154 r->extflags |= RXf_NULL;
6155 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6156 r->extflags |= RXf_START_ONLY;
6157 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6158 && OP(regnext(first)) == END)
6159 r->extflags |= RXf_WHITE;
6163 if (RExC_paren_names) {
6164 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6165 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6168 ri->name_list_idx = 0;
6170 if (RExC_recurse_count) {
6171 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6172 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6173 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6176 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6177 /* assume we don't need to swap parens around before we match */
6180 PerlIO_printf(Perl_debug_log,"Final program:\n");
6183 #ifdef RE_TRACK_PATTERN_OFFSETS
6184 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6185 const U32 len = ri->u.offsets[0];
6187 GET_RE_DEBUG_FLAGS_DECL;
6188 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6189 for (i = 1; i <= len; i++) {
6190 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6191 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6192 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6194 PerlIO_printf(Perl_debug_log, "\n");
6202 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6205 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6207 PERL_UNUSED_ARG(value);
6209 if (flags & RXapif_FETCH) {
6210 return reg_named_buff_fetch(rx, key, flags);
6211 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6212 Perl_croak_no_modify(aTHX);
6214 } else if (flags & RXapif_EXISTS) {
6215 return reg_named_buff_exists(rx, key, flags)
6218 } else if (flags & RXapif_REGNAMES) {
6219 return reg_named_buff_all(rx, flags);
6220 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6221 return reg_named_buff_scalar(rx, flags);
6223 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6229 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6232 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6233 PERL_UNUSED_ARG(lastkey);
6235 if (flags & RXapif_FIRSTKEY)
6236 return reg_named_buff_firstkey(rx, flags);
6237 else if (flags & RXapif_NEXTKEY)
6238 return reg_named_buff_nextkey(rx, flags);
6240 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6246 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6249 AV *retarray = NULL;
6251 struct regexp *const rx = (struct regexp *)SvANY(r);
6253 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6255 if (flags & RXapif_ALL)
6258 if (rx && RXp_PAREN_NAMES(rx)) {
6259 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6262 SV* sv_dat=HeVAL(he_str);
6263 I32 *nums=(I32*)SvPVX(sv_dat);
6264 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6265 if ((I32)(rx->nparens) >= nums[i]
6266 && rx->offs[nums[i]].start != -1
6267 && rx->offs[nums[i]].end != -1)
6270 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6275 ret = newSVsv(&PL_sv_undef);
6278 av_push(retarray, ret);
6281 return newRV_noinc(MUTABLE_SV(retarray));
6288 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6291 struct regexp *const rx = (struct regexp *)SvANY(r);
6293 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6295 if (rx && RXp_PAREN_NAMES(rx)) {
6296 if (flags & RXapif_ALL) {
6297 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6299 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6313 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6315 struct regexp *const rx = (struct regexp *)SvANY(r);
6317 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6319 if ( rx && RXp_PAREN_NAMES(rx) ) {
6320 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6322 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6329 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6331 struct regexp *const rx = (struct regexp *)SvANY(r);
6332 GET_RE_DEBUG_FLAGS_DECL;
6334 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6336 if (rx && RXp_PAREN_NAMES(rx)) {
6337 HV *hv = RXp_PAREN_NAMES(rx);
6339 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6342 SV* sv_dat = HeVAL(temphe);
6343 I32 *nums = (I32*)SvPVX(sv_dat);
6344 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6345 if ((I32)(rx->lastparen) >= nums[i] &&
6346 rx->offs[nums[i]].start != -1 &&
6347 rx->offs[nums[i]].end != -1)
6353 if (parno || flags & RXapif_ALL) {
6354 return newSVhek(HeKEY_hek(temphe));
6362 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6367 struct regexp *const rx = (struct regexp *)SvANY(r);
6369 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6371 if (rx && RXp_PAREN_NAMES(rx)) {
6372 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6373 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6374 } else if (flags & RXapif_ONE) {
6375 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6376 av = MUTABLE_AV(SvRV(ret));
6377 length = av_len(av);
6379 return newSViv(length + 1);
6381 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6385 return &PL_sv_undef;
6389 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6391 struct regexp *const rx = (struct regexp *)SvANY(r);
6394 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6396 if (rx && RXp_PAREN_NAMES(rx)) {
6397 HV *hv= RXp_PAREN_NAMES(rx);
6399 (void)hv_iterinit(hv);
6400 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6403 SV* sv_dat = HeVAL(temphe);
6404 I32 *nums = (I32*)SvPVX(sv_dat);
6405 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6406 if ((I32)(rx->lastparen) >= nums[i] &&
6407 rx->offs[nums[i]].start != -1 &&
6408 rx->offs[nums[i]].end != -1)
6414 if (parno || flags & RXapif_ALL) {
6415 av_push(av, newSVhek(HeKEY_hek(temphe)));
6420 return newRV_noinc(MUTABLE_SV(av));
6424 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6427 struct regexp *const rx = (struct regexp *)SvANY(r);
6432 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6435 sv_setsv(sv,&PL_sv_undef);
6439 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
6441 i = rx->offs[0].start;
6445 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
6447 s = rx->subbeg + rx->offs[0].end;
6448 i = rx->sublen - rx->offs[0].end;
6451 if ( 0 <= paren && paren <= (I32)rx->nparens &&
6452 (s1 = rx->offs[paren].start) != -1 &&
6453 (t1 = rx->offs[paren].end) != -1)
6457 s = rx->subbeg + s1;
6459 sv_setsv(sv,&PL_sv_undef);
6462 assert(rx->sublen >= (s - rx->subbeg) + i );
6464 const int oldtainted = PL_tainted;
6466 sv_setpvn(sv, s, i);
6467 PL_tainted = oldtainted;
6468 if ( (rx->extflags & RXf_CANY_SEEN)
6469 ? (RXp_MATCH_UTF8(rx)
6470 && (!i || is_utf8_string((U8*)s, i)))
6471 : (RXp_MATCH_UTF8(rx)) )
6478 if (RXp_MATCH_TAINTED(rx)) {
6479 if (SvTYPE(sv) >= SVt_PVMG) {
6480 MAGIC* const mg = SvMAGIC(sv);
6483 SvMAGIC_set(sv, mg->mg_moremagic);
6485 if ((mgt = SvMAGIC(sv))) {
6486 mg->mg_moremagic = mgt;
6487 SvMAGIC_set(sv, mg);
6497 sv_setsv(sv,&PL_sv_undef);
6503 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6504 SV const * const value)
6506 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6508 PERL_UNUSED_ARG(rx);
6509 PERL_UNUSED_ARG(paren);
6510 PERL_UNUSED_ARG(value);
6513 Perl_croak_no_modify(aTHX);
6517 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6520 struct regexp *const rx = (struct regexp *)SvANY(r);
6524 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6526 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6528 /* $` / ${^PREMATCH} */
6529 case RX_BUFF_IDX_PREMATCH:
6530 if (rx->offs[0].start != -1) {
6531 i = rx->offs[0].start;
6539 /* $' / ${^POSTMATCH} */
6540 case RX_BUFF_IDX_POSTMATCH:
6541 if (rx->offs[0].end != -1) {
6542 i = rx->sublen - rx->offs[0].end;
6544 s1 = rx->offs[0].end;
6550 /* $& / ${^MATCH}, $1, $2, ... */
6552 if (paren <= (I32)rx->nparens &&
6553 (s1 = rx->offs[paren].start) != -1 &&
6554 (t1 = rx->offs[paren].end) != -1)
6559 if (ckWARN(WARN_UNINITIALIZED))
6560 report_uninit((const SV *)sv);
6565 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6566 const char * const s = rx->subbeg + s1;
6571 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6578 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6580 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6581 PERL_UNUSED_ARG(rx);
6585 return newSVpvs("Regexp");
6588 /* Scans the name of a named buffer from the pattern.
6589 * If flags is REG_RSN_RETURN_NULL returns null.
6590 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6591 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6592 * to the parsed name as looked up in the RExC_paren_names hash.
6593 * If there is an error throws a vFAIL().. type exception.
6596 #define REG_RSN_RETURN_NULL 0
6597 #define REG_RSN_RETURN_NAME 1
6598 #define REG_RSN_RETURN_DATA 2
6601 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6603 char *name_start = RExC_parse;
6605 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6607 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6608 /* skip IDFIRST by using do...while */
6611 RExC_parse += UTF8SKIP(RExC_parse);
6612 } while (isALNUM_utf8((U8*)RExC_parse));
6616 } while (isALNUM(*RExC_parse));
6621 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6622 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6623 if ( flags == REG_RSN_RETURN_NAME)
6625 else if (flags==REG_RSN_RETURN_DATA) {
6628 if ( ! sv_name ) /* should not happen*/
6629 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6630 if (RExC_paren_names)
6631 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6633 sv_dat = HeVAL(he_str);
6635 vFAIL("Reference to nonexistent named group");
6639 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6640 (unsigned long) flags);
6647 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6648 int rem=(int)(RExC_end - RExC_parse); \
6657 if (RExC_lastparse!=RExC_parse) \
6658 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6661 iscut ? "..." : "<" \
6664 PerlIO_printf(Perl_debug_log,"%16s",""); \
6667 num = RExC_size + 1; \
6669 num=REG_NODE_NUM(RExC_emit); \
6670 if (RExC_lastnum!=num) \
6671 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6673 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6674 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6675 (int)((depth*2)), "", \
6679 RExC_lastparse=RExC_parse; \
6684 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6685 DEBUG_PARSE_MSG((funcname)); \
6686 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6688 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6689 DEBUG_PARSE_MSG((funcname)); \
6690 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6693 /* This section of code defines the inversion list object and its methods. The
6694 * interfaces are highly subject to change, so as much as possible is static to
6695 * this file. An inversion list is here implemented as a malloc'd C UV array
6696 * with some added info that is placed as UVs at the beginning in a header
6697 * portion. An inversion list for Unicode is an array of code points, sorted
6698 * by ordinal number. The zeroth element is the first code point in the list.
6699 * The 1th element is the first element beyond that not in the list. In other
6700 * words, the first range is
6701 * invlist[0]..(invlist[1]-1)
6702 * The other ranges follow. Thus every element whose index is divisible by two
6703 * marks the beginning of a range that is in the list, and every element not
6704 * divisible by two marks the beginning of a range not in the list. A single
6705 * element inversion list that contains the single code point N generally
6706 * consists of two elements
6709 * (The exception is when N is the highest representable value on the
6710 * machine, in which case the list containing just it would be a single
6711 * element, itself. By extension, if the last range in the list extends to
6712 * infinity, then the first element of that range will be in the inversion list
6713 * at a position that is divisible by two, and is the final element in the
6715 * Taking the complement (inverting) an inversion list is quite simple, if the
6716 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6717 * This implementation reserves an element at the beginning of each inversion list
6718 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6719 * beginning of the list is either that element if 0, or the next one if 1.
6721 * More about inversion lists can be found in "Unicode Demystified"
6722 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6723 * More will be coming when functionality is added later.
6725 * The inversion list data structure is currently implemented as an SV pointing
6726 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6727 * array of UV whose memory management is automatically handled by the existing
6728 * facilities for SV's.
6730 * Some of the methods should always be private to the implementation, and some
6731 * should eventually be made public */
6733 #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6734 #define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6736 /* This is a combination of a version and data structure type, so that one
6737 * being passed in can be validated to be an inversion list of the correct
6738 * vintage. When the structure of the header is changed, a new random number
6739 * in the range 2**31-1 should be generated and the new() method changed to
6740 * insert that at this location. Then, if an auxiliary program doesn't change
6741 * correspondingly, it will be discovered immediately */
6742 #define INVLIST_VERSION_ID_OFFSET 2
6743 #define INVLIST_VERSION_ID 1064334010
6745 /* For safety, when adding new elements, remember to #undef them at the end of
6746 * the inversion list code section */
6748 #define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
6749 /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6750 * contains the code point U+00000, and begins here. If 1, the inversion list
6751 * doesn't contain U+0000, and it begins at the next UV in the array.
6752 * Inverting an inversion list consists of adding or removing the 0 at the
6753 * beginning of it. By reserving a space for that 0, inversion can be made
6756 #define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
6758 /* Internally things are UVs */
6759 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6760 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6762 #define INVLIST_INITIAL_LEN 10
6764 PERL_STATIC_INLINE UV*
6765 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6767 /* Returns a pointer to the first element in the inversion list's array.
6768 * This is called upon initialization of an inversion list. Where the
6769 * array begins depends on whether the list has the code point U+0000
6770 * in it or not. The other parameter tells it whether the code that
6771 * follows this call is about to put a 0 in the inversion list or not.
6772 * The first element is either the element with 0, if 0, or the next one,
6775 UV* zero = get_invlist_zero_addr(invlist);
6777 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6780 assert(! *get_invlist_len_addr(invlist));
6782 /* 1^1 = 0; 1^0 = 1 */
6783 *zero = 1 ^ will_have_0;
6784 return zero + *zero;
6787 PERL_STATIC_INLINE UV*
6788 S_invlist_array(pTHX_ SV* const invlist)
6790 /* Returns the pointer to the inversion list's array. Every time the
6791 * length changes, this needs to be called in case malloc or realloc moved
6794 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6796 /* Must not be empty. If these fail, you probably didn't check for <len>
6797 * being non-zero before trying to get the array */
6798 assert(*get_invlist_len_addr(invlist));
6799 assert(*get_invlist_zero_addr(invlist) == 0
6800 || *get_invlist_zero_addr(invlist) == 1);
6802 /* The array begins either at the element reserved for zero if the
6803 * list contains 0 (that element will be set to 0), or otherwise the next
6804 * element (in which case the reserved element will be set to 1). */
6805 return (UV *) (get_invlist_zero_addr(invlist)
6806 + *get_invlist_zero_addr(invlist));
6809 PERL_STATIC_INLINE UV*
6810 S_get_invlist_len_addr(pTHX_ SV* invlist)
6812 /* Return the address of the UV that contains the current number
6813 * of used elements in the inversion list */
6815 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6817 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6820 PERL_STATIC_INLINE UV
6821 S_invlist_len(pTHX_ SV* const invlist)
6823 /* Returns the current number of elements stored in the inversion list's
6826 PERL_ARGS_ASSERT_INVLIST_LEN;
6828 return *get_invlist_len_addr(invlist);
6831 PERL_STATIC_INLINE void
6832 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6834 /* Sets the current number of elements stored in the inversion list */
6836 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6838 *get_invlist_len_addr(invlist) = len;
6840 assert(len <= SvLEN(invlist));
6842 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6843 /* If the list contains U+0000, that element is part of the header,
6844 * and should not be counted as part of the array. It will contain
6845 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6847 * SvCUR_set(invlist,
6848 * TO_INTERNAL_SIZE(len
6849 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6850 * But, this is only valid if len is not 0. The consequences of not doing
6851 * this is that the memory allocation code may think that 1 more UV is
6852 * being used than actually is, and so might do an unnecessary grow. That
6853 * seems worth not bothering to make this the precise amount.
6855 * Note that when inverting, SvCUR shouldn't change */
6858 PERL_STATIC_INLINE UV
6859 S_invlist_max(pTHX_ SV* const invlist)
6861 /* Returns the maximum number of elements storable in the inversion list's
6862 * array, without having to realloc() */
6864 PERL_ARGS_ASSERT_INVLIST_MAX;
6866 return FROM_INTERNAL_SIZE(SvLEN(invlist));
6869 PERL_STATIC_INLINE UV*
6870 S_get_invlist_zero_addr(pTHX_ SV* invlist)
6872 /* Return the address of the UV that is reserved to hold 0 if the inversion
6873 * list contains 0. This has to be the last element of the heading, as the
6874 * list proper starts with either it if 0, or the next element if not.
6875 * (But we force it to contain either 0 or 1) */
6877 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6879 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6882 #ifndef PERL_IN_XSUB_RE
6884 Perl__new_invlist(pTHX_ IV initial_size)
6887 /* Return a pointer to a newly constructed inversion list, with enough
6888 * space to store 'initial_size' elements. If that number is negative, a
6889 * system default is used instead */
6893 if (initial_size < 0) {
6894 initial_size = INVLIST_INITIAL_LEN;
6897 /* Allocate the initial space */
6898 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6899 invlist_set_len(new_list, 0);
6901 /* Force iterinit() to be used to get iteration to work */
6902 *get_invlist_iter_addr(new_list) = UV_MAX;
6904 /* This should force a segfault if a method doesn't initialize this
6906 *get_invlist_zero_addr(new_list) = UV_MAX;
6908 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6909 #if HEADER_LENGTH != 4
6910 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6918 S__new_invlist_C_array(pTHX_ UV* list)
6920 /* Return a pointer to a newly constructed inversion list, initialized to
6921 * point to <list>, which has to be in the exact correct inversion list
6922 * form, including internal fields. Thus this is a dangerous routine that
6923 * should not be used in the wrong hands */
6925 SV* invlist = newSV_type(SVt_PV);
6927 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6929 SvPV_set(invlist, (char *) list);
6930 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6931 shouldn't touch it */
6932 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6934 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6935 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6942 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
6944 /* Grow the maximum size of an inversion list */
6946 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6948 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
6951 PERL_STATIC_INLINE void
6952 S_invlist_trim(pTHX_ SV* const invlist)
6954 PERL_ARGS_ASSERT_INVLIST_TRIM;
6956 /* Change the length of the inversion list to how many entries it currently
6959 SvPV_shrink_to_cur((SV *) invlist);
6962 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6964 #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6965 #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
6967 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6970 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
6972 /* Subject to change or removal. Append the range from 'start' to 'end' at
6973 * the end of the inversion list. The range must be above any existing
6977 UV max = invlist_max(invlist);
6978 UV len = invlist_len(invlist);
6980 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6982 if (len == 0) { /* Empty lists must be initialized */
6983 array = _invlist_array_init(invlist, start == 0);
6986 /* Here, the existing list is non-empty. The current max entry in the
6987 * list is generally the first value not in the set, except when the
6988 * set extends to the end of permissible values, in which case it is
6989 * the first entry in that final set, and so this call is an attempt to
6990 * append out-of-order */
6992 UV final_element = len - 1;
6993 array = invlist_array(invlist);
6994 if (array[final_element] > start
6995 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
6997 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6998 array[final_element], start,
6999 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7002 /* Here, it is a legal append. If the new range begins with the first
7003 * value not in the set, it is extending the set, so the new first
7004 * value not in the set is one greater than the newly extended range.
7006 if (array[final_element] == start) {
7007 if (end != UV_MAX) {
7008 array[final_element] = end + 1;
7011 /* But if the end is the maximum representable on the machine,
7012 * just let the range that this would extend to have no end */
7013 invlist_set_len(invlist, len - 1);
7019 /* Here the new range doesn't extend any existing set. Add it */
7021 len += 2; /* Includes an element each for the start and end of range */
7023 /* If overflows the existing space, extend, which may cause the array to be
7026 invlist_extend(invlist, len);
7027 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7028 failure in invlist_array() */
7029 array = invlist_array(invlist);
7032 invlist_set_len(invlist, len);
7035 /* The next item on the list starts the range, the one after that is
7036 * one past the new range. */
7037 array[len - 2] = start;
7038 if (end != UV_MAX) {
7039 array[len - 1] = end + 1;
7042 /* But if the end is the maximum representable on the machine, just let
7043 * the range have no end */
7044 invlist_set_len(invlist, len - 1);
7048 #ifndef PERL_IN_XSUB_RE
7051 S_invlist_search(pTHX_ SV* const invlist, const UV cp)
7053 /* Searches the inversion list for the entry that contains the input code
7054 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7055 * return value is the index into the list's array of the range that
7059 IV high = invlist_len(invlist);
7060 const UV * const array = invlist_array(invlist);
7062 PERL_ARGS_ASSERT_INVLIST_SEARCH;
7064 /* If list is empty or the code point is before the first element, return
7066 if (high == 0 || cp < array[0]) {
7070 /* Binary search. What we are looking for is <i> such that
7071 * array[i] <= cp < array[i+1]
7072 * The loop below converges on the i+1. */
7073 while (low < high) {
7074 IV mid = (low + high) / 2;
7075 if (array[mid] <= cp) {
7078 /* We could do this extra test to exit the loop early.
7079 if (cp < array[low]) {
7084 else { /* cp < array[mid] */
7093 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7095 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7096 * but is used when the swash has an inversion list. This makes this much
7097 * faster, as it uses a binary search instead of a linear one. This is
7098 * intimately tied to that function, and perhaps should be in utf8.c,
7099 * except it is intimately tied to inversion lists as well. It assumes
7100 * that <swatch> is all 0's on input */
7103 const IV len = invlist_len(invlist);
7107 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7109 if (len == 0) { /* Empty inversion list */
7113 array = invlist_array(invlist);
7115 /* Find which element it is */
7116 i = invlist_search(invlist, start);
7118 /* We populate from <start> to <end> */
7119 while (current < end) {
7122 /* The inversion list gives the results for every possible code point
7123 * after the first one in the list. Only those ranges whose index is
7124 * even are ones that the inversion list matches. For the odd ones,
7125 * and if the initial code point is not in the list, we have to skip
7126 * forward to the next element */
7127 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7129 if (i >= len) { /* Finished if beyond the end of the array */
7133 if (current >= end) { /* Finished if beyond the end of what we
7138 assert(current >= start);
7140 /* The current range ends one below the next one, except don't go past
7143 upper = (i < len && array[i] < end) ? array[i] : end;
7145 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7146 * for each code point in it */
7147 for (; current < upper; current++) {
7148 const STRLEN offset = (STRLEN)(current - start);
7149 swatch[offset >> 3] |= 1 << (offset & 7);
7152 /* Quit if at the end of the list */
7155 /* But first, have to deal with the highest possible code point on
7156 * the platform. The previous code assumes that <end> is one
7157 * beyond where we want to populate, but that is impossible at the
7158 * platform's infinity, so have to handle it specially */
7159 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7161 const STRLEN offset = (STRLEN)(end - start);
7162 swatch[offset >> 3] |= 1 << (offset & 7);
7167 /* Advance to the next range, which will be for code points not in the
7177 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7179 /* Take the union of two inversion lists and point <output> to it. *output
7180 * should be defined upon input, and if it points to one of the two lists,
7181 * the reference count to that list will be decremented. The first list,
7182 * <a>, may be NULL, in which case a copy of the second list is returned.
7183 * If <complement_b> is TRUE, the union is taken of the complement
7184 * (inversion) of <b> instead of b itself.
7186 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7187 * Richard Gillam, published by Addison-Wesley, and explained at some
7188 * length there. The preface says to incorporate its examples into your
7189 * code at your own risk.
7191 * The algorithm is like a merge sort.
7193 * XXX A potential performance improvement is to keep track as we go along
7194 * if only one of the inputs contributes to the result, meaning the other
7195 * is a subset of that one. In that case, we can skip the final copy and
7196 * return the larger of the input lists, but then outside code might need
7197 * to keep track of whether to free the input list or not */
7199 UV* array_a; /* a's array */
7201 UV len_a; /* length of a's array */
7204 SV* u; /* the resulting union */
7208 UV i_a = 0; /* current index into a's array */
7212 /* running count, as explained in the algorithm source book; items are
7213 * stopped accumulating and are output when the count changes to/from 0.
7214 * The count is incremented when we start a range that's in the set, and
7215 * decremented when we start a range that's not in the set. So its range
7216 * is 0 to 2. Only when the count is zero is something not in the set.
7220 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7223 /* If either one is empty, the union is the other one */
7224 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
7231 *output = invlist_clone(b);
7233 _invlist_invert(*output);
7235 } /* else *output already = b; */
7238 else if ((len_b = invlist_len(b)) == 0) {
7243 /* The complement of an empty list is a list that has everything in it,
7244 * so the union with <a> includes everything too */
7249 *output = _new_invlist(1);
7250 _append_range_to_invlist(*output, 0, UV_MAX);
7252 else if (*output != a) {
7253 *output = invlist_clone(a);
7255 /* else *output already = a; */
7259 /* Here both lists exist and are non-empty */
7260 array_a = invlist_array(a);
7261 array_b = invlist_array(b);
7263 /* If are to take the union of 'a' with the complement of b, set it
7264 * up so are looking at b's complement. */
7267 /* To complement, we invert: if the first element is 0, remove it. To
7268 * do this, we just pretend the array starts one later, and clear the
7269 * flag as we don't have to do anything else later */
7270 if (array_b[0] == 0) {
7273 complement_b = FALSE;
7277 /* But if the first element is not zero, we unshift a 0 before the
7278 * array. The data structure reserves a space for that 0 (which
7279 * should be a '1' right now), so physical shifting is unneeded,
7280 * but temporarily change that element to 0. Before exiting the
7281 * routine, we must restore the element to '1' */
7288 /* Size the union for the worst case: that the sets are completely
7290 u = _new_invlist(len_a + len_b);
7292 /* Will contain U+0000 if either component does */
7293 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7294 || (len_b > 0 && array_b[0] == 0));
7296 /* Go through each list item by item, stopping when exhausted one of
7298 while (i_a < len_a && i_b < len_b) {
7299 UV cp; /* The element to potentially add to the union's array */
7300 bool cp_in_set; /* is it in the the input list's set or not */
7302 /* We need to take one or the other of the two inputs for the union.
7303 * Since we are merging two sorted lists, we take the smaller of the
7304 * next items. In case of a tie, we take the one that is in its set
7305 * first. If we took one not in the set first, it would decrement the
7306 * count, possibly to 0 which would cause it to be output as ending the
7307 * range, and the next time through we would take the same number, and
7308 * output it again as beginning the next range. By doing it the
7309 * opposite way, there is no possibility that the count will be
7310 * momentarily decremented to 0, and thus the two adjoining ranges will
7311 * be seamlessly merged. (In a tie and both are in the set or both not
7312 * in the set, it doesn't matter which we take first.) */
7313 if (array_a[i_a] < array_b[i_b]
7314 || (array_a[i_a] == array_b[i_b]
7315 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7317 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7321 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7325 /* Here, have chosen which of the two inputs to look at. Only output
7326 * if the running count changes to/from 0, which marks the
7327 * beginning/end of a range in that's in the set */
7330 array_u[i_u++] = cp;
7337 array_u[i_u++] = cp;
7342 /* Here, we are finished going through at least one of the lists, which
7343 * means there is something remaining in at most one. We check if the list
7344 * that hasn't been exhausted is positioned such that we are in the middle
7345 * of a range in its set or not. (i_a and i_b point to the element beyond
7346 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7347 * is potentially more to output.
7348 * There are four cases:
7349 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7350 * in the union is entirely from the non-exhausted set.
7351 * 2) Both were in their sets, count is 2. Nothing further should
7352 * be output, as everything that remains will be in the exhausted
7353 * list's set, hence in the union; decrementing to 1 but not 0 insures
7355 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7356 * Nothing further should be output because the union includes
7357 * everything from the exhausted set. Not decrementing ensures that.
7358 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7359 * decrementing to 0 insures that we look at the remainder of the
7360 * non-exhausted set */
7361 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7362 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7367 /* The final length is what we've output so far, plus what else is about to
7368 * be output. (If 'count' is non-zero, then the input list we exhausted
7369 * has everything remaining up to the machine's limit in its set, and hence
7370 * in the union, so there will be no further output. */
7373 /* At most one of the subexpressions will be non-zero */
7374 len_u += (len_a - i_a) + (len_b - i_b);
7377 /* Set result to final length, which can change the pointer to array_u, so
7379 if (len_u != invlist_len(u)) {
7380 invlist_set_len(u, len_u);
7382 array_u = invlist_array(u);
7385 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7386 * the other) ended with everything above it not in its set. That means
7387 * that the remaining part of the union is precisely the same as the
7388 * non-exhausted list, so can just copy it unchanged. (If both list were
7389 * exhausted at the same time, then the operations below will be both 0.)
7392 IV copy_count; /* At most one will have a non-zero copy count */
7393 if ((copy_count = len_a - i_a) > 0) {
7394 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7396 else if ((copy_count = len_b - i_b) > 0) {
7397 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7401 /* We may be removing a reference to one of the inputs */
7402 if (a == *output || b == *output) {
7403 SvREFCNT_dec(*output);
7406 /* If we've changed b, restore it */
7416 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7418 /* Take the intersection of two inversion lists and point <i> to it. *i
7419 * should be defined upon input, and if it points to one of the two lists,
7420 * the reference count to that list will be decremented.
7421 * If <complement_b> is TRUE, the result will be the intersection of <a>
7422 * and the complement (or inversion) of <b> instead of <b> directly.
7424 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7425 * Richard Gillam, published by Addison-Wesley, and explained at some
7426 * length there. The preface says to incorporate its examples into your
7427 * code at your own risk. In fact, it had bugs
7429 * The algorithm is like a merge sort, and is essentially the same as the
7433 UV* array_a; /* a's array */
7435 UV len_a; /* length of a's array */
7438 SV* r; /* the resulting intersection */
7442 UV i_a = 0; /* current index into a's array */
7446 /* running count, as explained in the algorithm source book; items are
7447 * stopped accumulating and are output when the count changes to/from 2.
7448 * The count is incremented when we start a range that's in the set, and
7449 * decremented when we start a range that's not in the set. So its range
7450 * is 0 to 2. Only when the count is 2 is something in the intersection.
7454 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7457 /* Special case if either one is empty */
7458 len_a = invlist_len(a);
7459 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
7461 if (len_a != 0 && complement_b) {
7463 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7464 * be empty. Here, also we are using 'b's complement, which hence
7465 * must be every possible code point. Thus the intersection is
7468 *i = invlist_clone(a);
7474 /* else *i is already 'a' */
7478 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7479 * intersection must be empty */
7486 *i = _new_invlist(0);
7490 /* Here both lists exist and are non-empty */
7491 array_a = invlist_array(a);
7492 array_b = invlist_array(b);
7494 /* If are to take the intersection of 'a' with the complement of b, set it
7495 * up so are looking at b's complement. */
7498 /* To complement, we invert: if the first element is 0, remove it. To
7499 * do this, we just pretend the array starts one later, and clear the
7500 * flag as we don't have to do anything else later */
7501 if (array_b[0] == 0) {
7504 complement_b = FALSE;
7508 /* But if the first element is not zero, we unshift a 0 before the
7509 * array. The data structure reserves a space for that 0 (which
7510 * should be a '1' right now), so physical shifting is unneeded,
7511 * but temporarily change that element to 0. Before exiting the
7512 * routine, we must restore the element to '1' */
7519 /* Size the intersection for the worst case: that the intersection ends up
7520 * fragmenting everything to be completely disjoint */
7521 r= _new_invlist(len_a + len_b);
7523 /* Will contain U+0000 iff both components do */
7524 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7525 && len_b > 0 && array_b[0] == 0);
7527 /* Go through each list item by item, stopping when exhausted one of
7529 while (i_a < len_a && i_b < len_b) {
7530 UV cp; /* The element to potentially add to the intersection's
7532 bool cp_in_set; /* Is it in the input list's set or not */
7534 /* We need to take one or the other of the two inputs for the
7535 * intersection. Since we are merging two sorted lists, we take the
7536 * smaller of the next items. In case of a tie, we take the one that
7537 * is not in its set first (a difference from the union algorithm). If
7538 * we took one in the set first, it would increment the count, possibly
7539 * to 2 which would cause it to be output as starting a range in the
7540 * intersection, and the next time through we would take that same
7541 * number, and output it again as ending the set. By doing it the
7542 * opposite of this, there is no possibility that the count will be
7543 * momentarily incremented to 2. (In a tie and both are in the set or
7544 * both not in the set, it doesn't matter which we take first.) */
7545 if (array_a[i_a] < array_b[i_b]
7546 || (array_a[i_a] == array_b[i_b]
7547 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7549 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7553 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7557 /* Here, have chosen which of the two inputs to look at. Only output
7558 * if the running count changes to/from 2, which marks the
7559 * beginning/end of a range that's in the intersection */
7563 array_r[i_r++] = cp;
7568 array_r[i_r++] = cp;
7574 /* Here, we are finished going through at least one of the lists, which
7575 * means there is something remaining in at most one. We check if the list
7576 * that has been exhausted is positioned such that we are in the middle
7577 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7578 * the ones we care about.) There are four cases:
7579 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7580 * nothing left in the intersection.
7581 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7582 * above 2. What should be output is exactly that which is in the
7583 * non-exhausted set, as everything it has is also in the intersection
7584 * set, and everything it doesn't have can't be in the intersection
7585 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7586 * gets incremented to 2. Like the previous case, the intersection is
7587 * everything that remains in the non-exhausted set.
7588 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7589 * remains 1. And the intersection has nothing more. */
7590 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7591 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7596 /* The final length is what we've output so far plus what else is in the
7597 * intersection. At most one of the subexpressions below will be non-zero */
7600 len_r += (len_a - i_a) + (len_b - i_b);
7603 /* Set result to final length, which can change the pointer to array_r, so
7605 if (len_r != invlist_len(r)) {
7606 invlist_set_len(r, len_r);
7608 array_r = invlist_array(r);
7611 /* Finish outputting any remaining */
7612 if (count >= 2) { /* At most one will have a non-zero copy count */
7614 if ((copy_count = len_a - i_a) > 0) {
7615 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7617 else if ((copy_count = len_b - i_b) > 0) {
7618 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7622 /* We may be removing a reference to one of the inputs */
7623 if (a == *i || b == *i) {
7627 /* If we've changed b, restore it */
7637 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7639 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7640 * set. A pointer to the inversion list is returned. This may actually be
7641 * a new list, in which case the passed in one has been destroyed. The
7642 * passed in inversion list can be NULL, in which case a new one is created
7643 * with just the one range in it */
7648 if (invlist == NULL) {
7649 invlist = _new_invlist(2);
7653 len = invlist_len(invlist);
7656 /* If comes after the final entry, can just append it to the end */
7658 || start >= invlist_array(invlist)
7659 [invlist_len(invlist) - 1])
7661 _append_range_to_invlist(invlist, start, end);
7665 /* Here, can't just append things, create and return a new inversion list
7666 * which is the union of this range and the existing inversion list */
7667 range_invlist = _new_invlist(2);
7668 _append_range_to_invlist(range_invlist, start, end);
7670 _invlist_union(invlist, range_invlist, &invlist);
7672 /* The temporary can be freed */
7673 SvREFCNT_dec(range_invlist);
7680 PERL_STATIC_INLINE SV*
7681 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7682 return _add_range_to_invlist(invlist, cp, cp);
7685 #ifndef PERL_IN_XSUB_RE
7687 Perl__invlist_invert(pTHX_ SV* const invlist)
7689 /* Complement the input inversion list. This adds a 0 if the list didn't
7690 * have a zero; removes it otherwise. As described above, the data
7691 * structure is set up so that this is very efficient */
7693 UV* len_pos = get_invlist_len_addr(invlist);
7695 PERL_ARGS_ASSERT__INVLIST_INVERT;
7697 /* The inverse of matching nothing is matching everything */
7698 if (*len_pos == 0) {
7699 _append_range_to_invlist(invlist, 0, UV_MAX);
7703 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7704 * zero element was a 0, so it is being removed, so the length decrements
7705 * by 1; and vice-versa. SvCUR is unaffected */
7706 if (*get_invlist_zero_addr(invlist) ^= 1) {
7715 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7717 /* Complement the input inversion list (which must be a Unicode property,
7718 * all of which don't match above the Unicode maximum code point.) And
7719 * Perl has chosen to not have the inversion match above that either. This
7720 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7726 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7728 _invlist_invert(invlist);
7730 len = invlist_len(invlist);
7732 if (len != 0) { /* If empty do nothing */
7733 array = invlist_array(invlist);
7734 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7735 /* Add 0x110000. First, grow if necessary */
7737 if (invlist_max(invlist) < len) {
7738 invlist_extend(invlist, len);
7739 array = invlist_array(invlist);
7741 invlist_set_len(invlist, len);
7742 array[len - 1] = PERL_UNICODE_MAX + 1;
7744 else { /* Remove the 0x110000 */
7745 invlist_set_len(invlist, len - 1);
7753 PERL_STATIC_INLINE SV*
7754 S_invlist_clone(pTHX_ SV* const invlist)
7757 /* Return a new inversion list that is a copy of the input one, which is
7760 /* Need to allocate extra space to accommodate Perl's addition of a
7761 * trailing NUL to SvPV's, since it thinks they are always strings */
7762 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
7763 STRLEN length = SvCUR(invlist);
7765 PERL_ARGS_ASSERT_INVLIST_CLONE;
7767 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7768 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7773 PERL_STATIC_INLINE UV*
7774 S_get_invlist_iter_addr(pTHX_ SV* invlist)
7776 /* Return the address of the UV that contains the current iteration
7779 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7781 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7784 PERL_STATIC_INLINE UV*
7785 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7787 /* Return the address of the UV that contains the version id. */
7789 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7791 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7794 PERL_STATIC_INLINE void
7795 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7797 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7799 *get_invlist_iter_addr(invlist) = 0;
7803 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7805 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7806 * This call sets in <*start> and <*end>, the next range in <invlist>.
7807 * Returns <TRUE> if successful and the next call will return the next
7808 * range; <FALSE> if was already at the end of the list. If the latter,
7809 * <*start> and <*end> are unchanged, and the next call to this function
7810 * will start over at the beginning of the list */
7812 UV* pos = get_invlist_iter_addr(invlist);
7813 UV len = invlist_len(invlist);
7816 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7819 *pos = UV_MAX; /* Force iternit() to be required next time */
7823 array = invlist_array(invlist);
7825 *start = array[(*pos)++];
7831 *end = array[(*pos)++] - 1;
7837 #ifndef PERL_IN_XSUB_RE
7839 Perl__invlist_contents(pTHX_ SV* const invlist)
7841 /* Get the contents of an inversion list into a string SV so that they can
7842 * be printed out. It uses the format traditionally done for debug tracing
7846 SV* output = newSVpvs("\n");
7848 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7850 invlist_iterinit(invlist);
7851 while (invlist_iternext(invlist, &start, &end)) {
7852 if (end == UV_MAX) {
7853 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7855 else if (end != start) {
7856 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7860 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7870 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7872 /* Dumps out the ranges in an inversion list. The string 'header'
7873 * if present is output on a line before the first range */
7877 if (header && strlen(header)) {
7878 PerlIO_printf(Perl_debug_log, "%s\n", header);
7880 invlist_iterinit(invlist);
7881 while (invlist_iternext(invlist, &start, &end)) {
7882 if (end == UV_MAX) {
7883 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7886 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7892 #undef HEADER_LENGTH
7893 #undef INVLIST_INITIAL_LENGTH
7894 #undef TO_INTERNAL_SIZE
7895 #undef FROM_INTERNAL_SIZE
7896 #undef INVLIST_LEN_OFFSET
7897 #undef INVLIST_ZERO_OFFSET
7898 #undef INVLIST_ITER_OFFSET
7899 #undef INVLIST_VERSION_ID
7901 /* End of inversion list object */
7904 - reg - regular expression, i.e. main body or parenthesized thing
7906 * Caller must absorb opening parenthesis.
7908 * Combining parenthesis handling with the base level of regular expression
7909 * is a trifle forced, but the need to tie the tails of the branches to what
7910 * follows makes it hard to avoid.
7912 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7914 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7916 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7920 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
7921 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
7924 register regnode *ret; /* Will be the head of the group. */
7925 register regnode *br;
7926 register regnode *lastbr;
7927 register regnode *ender = NULL;
7928 register I32 parno = 0;
7930 U32 oregflags = RExC_flags;
7931 bool have_branch = 0;
7933 I32 freeze_paren = 0;
7934 I32 after_freeze = 0;
7936 /* for (?g), (?gc), and (?o) warnings; warning
7937 about (?c) will warn about (?g) -- japhy */
7939 #define WASTED_O 0x01
7940 #define WASTED_G 0x02
7941 #define WASTED_C 0x04
7942 #define WASTED_GC (0x02|0x04)
7943 I32 wastedflags = 0x00;
7945 char * parse_start = RExC_parse; /* MJD */
7946 char * const oregcomp_parse = RExC_parse;
7948 GET_RE_DEBUG_FLAGS_DECL;
7950 PERL_ARGS_ASSERT_REG;
7951 DEBUG_PARSE("reg ");
7953 *flagp = 0; /* Tentatively. */
7956 /* Make an OPEN node, if parenthesized. */
7958 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7959 char *start_verb = RExC_parse;
7960 STRLEN verb_len = 0;
7961 char *start_arg = NULL;
7962 unsigned char op = 0;
7964 int internal_argval = 0; /* internal_argval is only useful if !argok */
7965 while ( *RExC_parse && *RExC_parse != ')' ) {
7966 if ( *RExC_parse == ':' ) {
7967 start_arg = RExC_parse + 1;
7973 verb_len = RExC_parse - start_verb;
7976 while ( *RExC_parse && *RExC_parse != ')' )
7978 if ( *RExC_parse != ')' )
7979 vFAIL("Unterminated verb pattern argument");
7980 if ( RExC_parse == start_arg )
7983 if ( *RExC_parse != ')' )
7984 vFAIL("Unterminated verb pattern");
7987 switch ( *start_verb ) {
7988 case 'A': /* (*ACCEPT) */
7989 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
7991 internal_argval = RExC_nestroot;
7994 case 'C': /* (*COMMIT) */
7995 if ( memEQs(start_verb,verb_len,"COMMIT") )
7998 case 'F': /* (*FAIL) */
7999 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8004 case ':': /* (*:NAME) */
8005 case 'M': /* (*MARK:NAME) */
8006 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8011 case 'P': /* (*PRUNE) */
8012 if ( memEQs(start_verb,verb_len,"PRUNE") )
8015 case 'S': /* (*SKIP) */
8016 if ( memEQs(start_verb,verb_len,"SKIP") )
8019 case 'T': /* (*THEN) */
8020 /* [19:06] <TimToady> :: is then */
8021 if ( memEQs(start_verb,verb_len,"THEN") ) {
8023 RExC_seen |= REG_SEEN_CUTGROUP;
8029 vFAIL3("Unknown verb pattern '%.*s'",
8030 verb_len, start_verb);
8033 if ( start_arg && internal_argval ) {
8034 vFAIL3("Verb pattern '%.*s' may not have an argument",
8035 verb_len, start_verb);
8036 } else if ( argok < 0 && !start_arg ) {
8037 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8038 verb_len, start_verb);
8040 ret = reganode(pRExC_state, op, internal_argval);
8041 if ( ! internal_argval && ! SIZE_ONLY ) {
8043 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8044 ARG(ret) = add_data( pRExC_state, 1, "S" );
8045 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8052 if (!internal_argval)
8053 RExC_seen |= REG_SEEN_VERBARG;
8054 } else if ( start_arg ) {
8055 vFAIL3("Verb pattern '%.*s' may not have an argument",
8056 verb_len, start_verb);
8058 ret = reg_node(pRExC_state, op);
8060 nextchar(pRExC_state);
8063 if (*RExC_parse == '?') { /* (?...) */
8064 bool is_logical = 0;
8065 const char * const seqstart = RExC_parse;
8066 bool has_use_defaults = FALSE;
8069 paren = *RExC_parse++;
8070 ret = NULL; /* For look-ahead/behind. */
8073 case 'P': /* (?P...) variants for those used to PCRE/Python */
8074 paren = *RExC_parse++;
8075 if ( paren == '<') /* (?P<...>) named capture */
8077 else if (paren == '>') { /* (?P>name) named recursion */
8078 goto named_recursion;
8080 else if (paren == '=') { /* (?P=...) named backref */
8081 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8082 you change this make sure you change that */
8083 char* name_start = RExC_parse;
8085 SV *sv_dat = reg_scan_name(pRExC_state,
8086 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8087 if (RExC_parse == name_start || *RExC_parse != ')')
8088 vFAIL2("Sequence %.3s... not terminated",parse_start);
8091 num = add_data( pRExC_state, 1, "S" );
8092 RExC_rxi->data->data[num]=(void*)sv_dat;
8093 SvREFCNT_inc_simple_void(sv_dat);
8096 ret = reganode(pRExC_state,
8099 : (MORE_ASCII_RESTRICTED)
8101 : (AT_LEAST_UNI_SEMANTICS)
8109 Set_Node_Offset(ret, parse_start+1);
8110 Set_Node_Cur_Length(ret); /* MJD */
8112 nextchar(pRExC_state);
8116 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8118 case '<': /* (?<...) */
8119 if (*RExC_parse == '!')
8121 else if (*RExC_parse != '=')
8127 case '\'': /* (?'...') */
8128 name_start= RExC_parse;
8129 svname = reg_scan_name(pRExC_state,
8130 SIZE_ONLY ? /* reverse test from the others */
8131 REG_RSN_RETURN_NAME :
8132 REG_RSN_RETURN_NULL);
8133 if (RExC_parse == name_start) {
8135 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8138 if (*RExC_parse != paren)
8139 vFAIL2("Sequence (?%c... not terminated",
8140 paren=='>' ? '<' : paren);
8144 if (!svname) /* shouldn't happen */
8146 "panic: reg_scan_name returned NULL");
8147 if (!RExC_paren_names) {
8148 RExC_paren_names= newHV();
8149 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8151 RExC_paren_name_list= newAV();
8152 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8155 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8157 sv_dat = HeVAL(he_str);
8159 /* croak baby croak */
8161 "panic: paren_name hash element allocation failed");
8162 } else if ( SvPOK(sv_dat) ) {
8163 /* (?|...) can mean we have dupes so scan to check
8164 its already been stored. Maybe a flag indicating
8165 we are inside such a construct would be useful,
8166 but the arrays are likely to be quite small, so
8167 for now we punt -- dmq */
8168 IV count = SvIV(sv_dat);
8169 I32 *pv = (I32*)SvPVX(sv_dat);
8171 for ( i = 0 ; i < count ; i++ ) {
8172 if ( pv[i] == RExC_npar ) {
8178 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8179 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8180 pv[count] = RExC_npar;
8181 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8184 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8185 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8187 SvIV_set(sv_dat, 1);
8190 /* Yes this does cause a memory leak in debugging Perls */
8191 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8192 SvREFCNT_dec(svname);
8195 /*sv_dump(sv_dat);*/
8197 nextchar(pRExC_state);
8199 goto capturing_parens;
8201 RExC_seen |= REG_SEEN_LOOKBEHIND;
8202 RExC_in_lookbehind++;
8204 case '=': /* (?=...) */
8205 RExC_seen_zerolen++;
8207 case '!': /* (?!...) */
8208 RExC_seen_zerolen++;
8209 if (*RExC_parse == ')') {
8210 ret=reg_node(pRExC_state, OPFAIL);
8211 nextchar(pRExC_state);
8215 case '|': /* (?|...) */
8216 /* branch reset, behave like a (?:...) except that
8217 buffers in alternations share the same numbers */
8219 after_freeze = freeze_paren = RExC_npar;
8221 case ':': /* (?:...) */
8222 case '>': /* (?>...) */
8224 case '$': /* (?$...) */
8225 case '@': /* (?@...) */
8226 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8228 case '#': /* (?#...) */
8229 while (*RExC_parse && *RExC_parse != ')')
8231 if (*RExC_parse != ')')
8232 FAIL("Sequence (?#... not terminated");
8233 nextchar(pRExC_state);
8236 case '0' : /* (?0) */
8237 case 'R' : /* (?R) */
8238 if (*RExC_parse != ')')
8239 FAIL("Sequence (?R) not terminated");
8240 ret = reg_node(pRExC_state, GOSTART);
8241 *flagp |= POSTPONED;
8242 nextchar(pRExC_state);
8245 { /* named and numeric backreferences */
8247 case '&': /* (?&NAME) */
8248 parse_start = RExC_parse - 1;
8251 SV *sv_dat = reg_scan_name(pRExC_state,
8252 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8253 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8255 goto gen_recurse_regop;
8258 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8260 vFAIL("Illegal pattern");
8262 goto parse_recursion;
8264 case '-': /* (?-1) */
8265 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8266 RExC_parse--; /* rewind to let it be handled later */
8270 case '1': case '2': case '3': case '4': /* (?1) */
8271 case '5': case '6': case '7': case '8': case '9':
8274 num = atoi(RExC_parse);
8275 parse_start = RExC_parse - 1; /* MJD */
8276 if (*RExC_parse == '-')
8278 while (isDIGIT(*RExC_parse))
8280 if (*RExC_parse!=')')
8281 vFAIL("Expecting close bracket");
8284 if ( paren == '-' ) {
8286 Diagram of capture buffer numbering.
8287 Top line is the normal capture buffer numbers
8288 Bottom line is the negative indexing as from
8292 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8296 num = RExC_npar + num;
8299 vFAIL("Reference to nonexistent group");
8301 } else if ( paren == '+' ) {
8302 num = RExC_npar + num - 1;
8305 ret = reganode(pRExC_state, GOSUB, num);
8307 if (num > (I32)RExC_rx->nparens) {
8309 vFAIL("Reference to nonexistent group");
8311 ARG2L_SET( ret, RExC_recurse_count++);
8313 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8314 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8318 RExC_seen |= REG_SEEN_RECURSE;
8319 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8320 Set_Node_Offset(ret, parse_start); /* MJD */
8322 *flagp |= POSTPONED;
8323 nextchar(pRExC_state);
8325 } /* named and numeric backreferences */
8328 case '?': /* (??...) */
8330 if (*RExC_parse != '{') {
8332 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8335 *flagp |= POSTPONED;
8336 paren = *RExC_parse++;
8338 case '{': /* (?{...}) */
8343 char *s = RExC_parse;
8345 RExC_seen_zerolen++;
8346 RExC_seen |= REG_SEEN_EVAL;
8348 if ( pRExC_state->num_code_blocks
8349 && pRExC_state->code_index < pRExC_state->num_code_blocks
8350 && pRExC_state->code_blocks[pRExC_state->code_index].start
8351 == (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8354 /* this is a pre-compiled literal (?{}) */
8355 struct reg_code_block *cb =
8356 &pRExC_state->code_blocks[pRExC_state->code_index];
8357 RExC_parse = RExC_start + cb->end;
8362 if (cb->src_regex) {
8363 n = add_data(pRExC_state, 2, "rl");
8364 RExC_rxi->data->data[n] =
8365 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8366 RExC_rxi->data->data[n+1] = (void*)o->op_next;
8369 n = add_data(pRExC_state, 1,
8370 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8371 RExC_rxi->data->data[n] = (void*)o->op_next;
8374 pRExC_state->code_index++;
8377 while (count && (c = *RExC_parse)) {
8388 if (*RExC_parse != ')') {
8390 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
8394 OP_4tree *sop, *rop;
8395 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
8398 Perl_save_re_context(aTHX);
8399 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
8400 sop->op_private |= OPpREFCOUNTED;
8401 /* re_dup will OpREFCNT_inc */
8402 OpREFCNT_set(sop, 1);
8405 n = add_data(pRExC_state, 3, "nop");
8406 RExC_rxi->data->data[n] = (void*)rop;
8407 RExC_rxi->data->data[n+1] = (void*)sop;
8408 RExC_rxi->data->data[n+2] = (void*)pad;
8411 else { /* First pass */
8412 if (PL_reginterp_cnt < ++RExC_seen_evals
8414 /* No compiled RE interpolated, has runtime
8415 components ===> unsafe. */
8416 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8417 if (PL_tainting && PL_tainted)
8418 FAIL("Eval-group in insecure regular expression");
8419 #if PERL_VERSION > 8
8420 if (IN_PERL_COMPILETIME)
8425 nextchar(pRExC_state);
8428 ret = reg_node(pRExC_state, LOGICAL);
8431 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
8432 /* deal with the length of this later - MJD */
8435 ret = reganode(pRExC_state, EVAL, n);
8436 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8437 Set_Node_Offset(ret, parse_start);
8440 case '(': /* (?(?{...})...) and (?(?=...)...) */
8443 if (RExC_parse[0] == '?') { /* (?(?...)) */
8444 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8445 || RExC_parse[1] == '<'
8446 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8449 ret = reg_node(pRExC_state, LOGICAL);
8452 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8456 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8457 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8459 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8460 char *name_start= RExC_parse++;
8462 SV *sv_dat=reg_scan_name(pRExC_state,
8463 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8464 if (RExC_parse == name_start || *RExC_parse != ch)
8465 vFAIL2("Sequence (?(%c... not terminated",
8466 (ch == '>' ? '<' : ch));
8469 num = add_data( pRExC_state, 1, "S" );
8470 RExC_rxi->data->data[num]=(void*)sv_dat;
8471 SvREFCNT_inc_simple_void(sv_dat);
8473 ret = reganode(pRExC_state,NGROUPP,num);
8474 goto insert_if_check_paren;
8476 else if (RExC_parse[0] == 'D' &&
8477 RExC_parse[1] == 'E' &&
8478 RExC_parse[2] == 'F' &&
8479 RExC_parse[3] == 'I' &&
8480 RExC_parse[4] == 'N' &&
8481 RExC_parse[5] == 'E')
8483 ret = reganode(pRExC_state,DEFINEP,0);
8486 goto insert_if_check_paren;
8488 else if (RExC_parse[0] == 'R') {
8491 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8492 parno = atoi(RExC_parse++);
8493 while (isDIGIT(*RExC_parse))
8495 } else if (RExC_parse[0] == '&') {
8498 sv_dat = reg_scan_name(pRExC_state,
8499 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8500 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8502 ret = reganode(pRExC_state,INSUBP,parno);
8503 goto insert_if_check_paren;
8505 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8508 parno = atoi(RExC_parse++);
8510 while (isDIGIT(*RExC_parse))
8512 ret = reganode(pRExC_state, GROUPP, parno);
8514 insert_if_check_paren:
8515 if ((c = *nextchar(pRExC_state)) != ')')
8516 vFAIL("Switch condition not recognized");
8518 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8519 br = regbranch(pRExC_state, &flags, 1,depth+1);
8521 br = reganode(pRExC_state, LONGJMP, 0);
8523 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8524 c = *nextchar(pRExC_state);
8529 vFAIL("(?(DEFINE)....) does not allow branches");
8530 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8531 regbranch(pRExC_state, &flags, 1,depth+1);
8532 REGTAIL(pRExC_state, ret, lastbr);
8535 c = *nextchar(pRExC_state);
8540 vFAIL("Switch (?(condition)... contains too many branches");
8541 ender = reg_node(pRExC_state, TAIL);
8542 REGTAIL(pRExC_state, br, ender);
8544 REGTAIL(pRExC_state, lastbr, ender);
8545 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8548 REGTAIL(pRExC_state, ret, ender);
8549 RExC_size++; /* XXX WHY do we need this?!!
8550 For large programs it seems to be required
8551 but I can't figure out why. -- dmq*/
8555 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8559 RExC_parse--; /* for vFAIL to print correctly */
8560 vFAIL("Sequence (? incomplete");
8562 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8564 has_use_defaults = TRUE;
8565 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8566 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8567 ? REGEX_UNICODE_CHARSET
8568 : REGEX_DEPENDS_CHARSET);
8572 parse_flags: /* (?i) */
8574 U32 posflags = 0, negflags = 0;
8575 U32 *flagsp = &posflags;
8576 char has_charset_modifier = '\0';
8577 regex_charset cs = get_regex_charset(RExC_flags);
8578 if (cs == REGEX_DEPENDS_CHARSET
8579 && (RExC_utf8 || RExC_uni_semantics))
8581 cs = REGEX_UNICODE_CHARSET;
8584 while (*RExC_parse) {
8585 /* && strchr("iogcmsx", *RExC_parse) */
8586 /* (?g), (?gc) and (?o) are useless here
8587 and must be globally applied -- japhy */
8588 switch (*RExC_parse) {
8589 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8590 case LOCALE_PAT_MOD:
8591 if (has_charset_modifier) {
8592 goto excess_modifier;
8594 else if (flagsp == &negflags) {
8597 cs = REGEX_LOCALE_CHARSET;
8598 has_charset_modifier = LOCALE_PAT_MOD;
8599 RExC_contains_locale = 1;
8601 case UNICODE_PAT_MOD:
8602 if (has_charset_modifier) {
8603 goto excess_modifier;
8605 else if (flagsp == &negflags) {
8608 cs = REGEX_UNICODE_CHARSET;
8609 has_charset_modifier = UNICODE_PAT_MOD;
8611 case ASCII_RESTRICT_PAT_MOD:
8612 if (flagsp == &negflags) {
8615 if (has_charset_modifier) {
8616 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8617 goto excess_modifier;
8619 /* Doubled modifier implies more restricted */
8620 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8623 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8625 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8627 case DEPENDS_PAT_MOD:
8628 if (has_use_defaults) {
8629 goto fail_modifiers;
8631 else if (flagsp == &negflags) {
8634 else if (has_charset_modifier) {
8635 goto excess_modifier;
8638 /* The dual charset means unicode semantics if the
8639 * pattern (or target, not known until runtime) are
8640 * utf8, or something in the pattern indicates unicode
8642 cs = (RExC_utf8 || RExC_uni_semantics)
8643 ? REGEX_UNICODE_CHARSET
8644 : REGEX_DEPENDS_CHARSET;
8645 has_charset_modifier = DEPENDS_PAT_MOD;
8649 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
8650 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
8652 else if (has_charset_modifier == *(RExC_parse - 1)) {
8653 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
8656 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
8661 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8663 case ONCE_PAT_MOD: /* 'o' */
8664 case GLOBAL_PAT_MOD: /* 'g' */
8665 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8666 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
8667 if (! (wastedflags & wflagbit) ) {
8668 wastedflags |= wflagbit;
8671 "Useless (%s%c) - %suse /%c modifier",
8672 flagsp == &negflags ? "?-" : "?",
8674 flagsp == &negflags ? "don't " : "",
8681 case CONTINUE_PAT_MOD: /* 'c' */
8682 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
8683 if (! (wastedflags & WASTED_C) ) {
8684 wastedflags |= WASTED_GC;
8687 "Useless (%sc) - %suse /gc modifier",
8688 flagsp == &negflags ? "?-" : "?",
8689 flagsp == &negflags ? "don't " : ""
8694 case KEEPCOPY_PAT_MOD: /* 'p' */
8695 if (flagsp == &negflags) {
8697 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
8699 *flagsp |= RXf_PMf_KEEPCOPY;
8703 /* A flag is a default iff it is following a minus, so
8704 * if there is a minus, it means will be trying to
8705 * re-specify a default which is an error */
8706 if (has_use_defaults || flagsp == &negflags) {
8709 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8713 wastedflags = 0; /* reset so (?g-c) warns twice */
8719 RExC_flags |= posflags;
8720 RExC_flags &= ~negflags;
8721 set_regex_charset(&RExC_flags, cs);
8723 oregflags |= posflags;
8724 oregflags &= ~negflags;
8725 set_regex_charset(&oregflags, cs);
8727 nextchar(pRExC_state);
8738 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8743 }} /* one for the default block, one for the switch */
8750 ret = reganode(pRExC_state, OPEN, parno);
8753 RExC_nestroot = parno;
8754 if (RExC_seen & REG_SEEN_RECURSE
8755 && !RExC_open_parens[parno-1])
8757 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8758 "Setting open paren #%"IVdf" to %d\n",
8759 (IV)parno, REG_NODE_NUM(ret)));
8760 RExC_open_parens[parno-1]= ret;
8763 Set_Node_Length(ret, 1); /* MJD */
8764 Set_Node_Offset(ret, RExC_parse); /* MJD */
8772 /* Pick up the branches, linking them together. */
8773 parse_start = RExC_parse; /* MJD */
8774 br = regbranch(pRExC_state, &flags, 1,depth+1);
8776 /* branch_len = (paren != 0); */
8780 if (*RExC_parse == '|') {
8781 if (!SIZE_ONLY && RExC_extralen) {
8782 reginsert(pRExC_state, BRANCHJ, br, depth+1);
8785 reginsert(pRExC_state, BRANCH, br, depth+1);
8786 Set_Node_Length(br, paren != 0);
8787 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8791 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
8793 else if (paren == ':') {
8794 *flagp |= flags&SIMPLE;
8796 if (is_open) { /* Starts with OPEN. */
8797 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
8799 else if (paren != '?') /* Not Conditional */
8801 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8803 while (*RExC_parse == '|') {
8804 if (!SIZE_ONLY && RExC_extralen) {
8805 ender = reganode(pRExC_state, LONGJMP,0);
8806 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
8809 RExC_extralen += 2; /* Account for LONGJMP. */
8810 nextchar(pRExC_state);
8812 if (RExC_npar > after_freeze)
8813 after_freeze = RExC_npar;
8814 RExC_npar = freeze_paren;
8816 br = regbranch(pRExC_state, &flags, 0, depth+1);
8820 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
8822 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
8825 if (have_branch || paren != ':') {
8826 /* Make a closing node, and hook it on the end. */
8829 ender = reg_node(pRExC_state, TAIL);
8832 ender = reganode(pRExC_state, CLOSE, parno);
8833 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8834 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8835 "Setting close paren #%"IVdf" to %d\n",
8836 (IV)parno, REG_NODE_NUM(ender)));
8837 RExC_close_parens[parno-1]= ender;
8838 if (RExC_nestroot == parno)
8841 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8842 Set_Node_Length(ender,1); /* MJD */
8848 *flagp &= ~HASWIDTH;
8851 ender = reg_node(pRExC_state, SUCCEED);
8854 ender = reg_node(pRExC_state, END);
8856 assert(!RExC_opend); /* there can only be one! */
8861 DEBUG_PARSE_r(if (!SIZE_ONLY) {
8862 SV * const mysv_val1=sv_newmortal();
8863 SV * const mysv_val2=sv_newmortal();
8864 DEBUG_PARSE_MSG("lsbr");
8865 regprop(RExC_rx, mysv_val1, lastbr);
8866 regprop(RExC_rx, mysv_val2, ender);
8867 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8868 SvPV_nolen_const(mysv_val1),
8869 (IV)REG_NODE_NUM(lastbr),
8870 SvPV_nolen_const(mysv_val2),
8871 (IV)REG_NODE_NUM(ender),
8872 (IV)(ender - lastbr)
8875 REGTAIL(pRExC_state, lastbr, ender);
8877 if (have_branch && !SIZE_ONLY) {
8880 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8882 /* Hook the tails of the branches to the closing node. */
8883 for (br = ret; br; br = regnext(br)) {
8884 const U8 op = PL_regkind[OP(br)];
8886 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
8887 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
8890 else if (op == BRANCHJ) {
8891 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
8892 /* for now we always disable this optimisation * /
8893 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
8899 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
8900 DEBUG_PARSE_r(if (!SIZE_ONLY) {
8901 SV * const mysv_val1=sv_newmortal();
8902 SV * const mysv_val2=sv_newmortal();
8903 DEBUG_PARSE_MSG("NADA");
8904 regprop(RExC_rx, mysv_val1, ret);
8905 regprop(RExC_rx, mysv_val2, ender);
8906 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
8907 SvPV_nolen_const(mysv_val1),
8908 (IV)REG_NODE_NUM(ret),
8909 SvPV_nolen_const(mysv_val2),
8910 (IV)REG_NODE_NUM(ender),
8915 if (OP(ender) == TAIL) {
8920 for ( opt= br + 1; opt < ender ; opt++ )
8922 NEXT_OFF(br)= ender - br;
8930 static const char parens[] = "=!<,>";
8932 if (paren && (p = strchr(parens, paren))) {
8933 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
8934 int flag = (p - parens) > 1;
8937 node = SUSPEND, flag = 0;
8938 reginsert(pRExC_state, node,ret, depth+1);
8939 Set_Node_Cur_Length(ret);
8940 Set_Node_Offset(ret, parse_start + 1);
8942 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
8946 /* Check for proper termination. */
8948 RExC_flags = oregflags;
8949 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8950 RExC_parse = oregcomp_parse;
8951 vFAIL("Unmatched (");
8954 else if (!paren && RExC_parse < RExC_end) {
8955 if (*RExC_parse == ')') {
8957 vFAIL("Unmatched )");
8960 FAIL("Junk on end of regexp"); /* "Can't happen". */
8964 if (RExC_in_lookbehind) {
8965 RExC_in_lookbehind--;
8967 if (after_freeze > RExC_npar)
8968 RExC_npar = after_freeze;
8973 - regbranch - one alternative of an | operator
8975 * Implements the concatenation operator.
8978 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
8981 register regnode *ret;
8982 register regnode *chain = NULL;
8983 register regnode *latest;
8984 I32 flags = 0, c = 0;
8985 GET_RE_DEBUG_FLAGS_DECL;
8987 PERL_ARGS_ASSERT_REGBRANCH;
8989 DEBUG_PARSE("brnc");
8994 if (!SIZE_ONLY && RExC_extralen)
8995 ret = reganode(pRExC_state, BRANCHJ,0);
8997 ret = reg_node(pRExC_state, BRANCH);
8998 Set_Node_Length(ret, 1);
9002 if (!first && SIZE_ONLY)
9003 RExC_extralen += 1; /* BRANCHJ */
9005 *flagp = WORST; /* Tentatively. */
9008 nextchar(pRExC_state);
9009 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9011 latest = regpiece(pRExC_state, &flags,depth+1);
9012 if (latest == NULL) {
9013 if (flags & TRYAGAIN)
9017 else if (ret == NULL)
9019 *flagp |= flags&(HASWIDTH|POSTPONED);
9020 if (chain == NULL) /* First piece. */
9021 *flagp |= flags&SPSTART;
9024 REGTAIL(pRExC_state, chain, latest);
9029 if (chain == NULL) { /* Loop ran zero times. */
9030 chain = reg_node(pRExC_state, NOTHING);
9035 *flagp |= flags&SIMPLE;
9042 - regpiece - something followed by possible [*+?]
9044 * Note that the branching code sequences used for ? and the general cases
9045 * of * and + are somewhat optimized: they use the same NOTHING node as
9046 * both the endmarker for their branch list and the body of the last branch.
9047 * It might seem that this node could be dispensed with entirely, but the
9048 * endmarker role is not redundant.
9051 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9054 register regnode *ret;
9056 register char *next;
9058 const char * const origparse = RExC_parse;
9060 I32 max = REG_INFTY;
9061 #ifdef RE_TRACK_PATTERN_OFFSETS
9064 const char *maxpos = NULL;
9065 GET_RE_DEBUG_FLAGS_DECL;
9067 PERL_ARGS_ASSERT_REGPIECE;
9069 DEBUG_PARSE("piec");
9071 ret = regatom(pRExC_state, &flags,depth+1);
9073 if (flags & TRYAGAIN)
9080 if (op == '{' && regcurly(RExC_parse)) {
9082 #ifdef RE_TRACK_PATTERN_OFFSETS
9083 parse_start = RExC_parse; /* MJD */
9085 next = RExC_parse + 1;
9086 while (isDIGIT(*next) || *next == ',') {
9095 if (*next == '}') { /* got one */
9099 min = atoi(RExC_parse);
9103 maxpos = RExC_parse;
9105 if (!max && *maxpos != '0')
9106 max = REG_INFTY; /* meaning "infinity" */
9107 else if (max >= REG_INFTY)
9108 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9110 nextchar(pRExC_state);
9113 if ((flags&SIMPLE)) {
9114 RExC_naughty += 2 + RExC_naughty / 2;
9115 reginsert(pRExC_state, CURLY, ret, depth+1);
9116 Set_Node_Offset(ret, parse_start+1); /* MJD */
9117 Set_Node_Cur_Length(ret);
9120 regnode * const w = reg_node(pRExC_state, WHILEM);
9123 REGTAIL(pRExC_state, ret, w);
9124 if (!SIZE_ONLY && RExC_extralen) {
9125 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9126 reginsert(pRExC_state, NOTHING,ret, depth+1);
9127 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9129 reginsert(pRExC_state, CURLYX,ret, depth+1);
9131 Set_Node_Offset(ret, parse_start+1);
9132 Set_Node_Length(ret,
9133 op == '{' ? (RExC_parse - parse_start) : 1);
9135 if (!SIZE_ONLY && RExC_extralen)
9136 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9137 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9139 RExC_whilem_seen++, RExC_extralen += 3;
9140 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9149 vFAIL("Can't do {n,m} with n > m");
9151 ARG1_SET(ret, (U16)min);
9152 ARG2_SET(ret, (U16)max);
9164 #if 0 /* Now runtime fix should be reliable. */
9166 /* if this is reinstated, don't forget to put this back into perldiag:
9168 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9170 (F) The part of the regexp subject to either the * or + quantifier
9171 could match an empty string. The {#} shows in the regular
9172 expression about where the problem was discovered.
9176 if (!(flags&HASWIDTH) && op != '?')
9177 vFAIL("Regexp *+ operand could be empty");
9180 #ifdef RE_TRACK_PATTERN_OFFSETS
9181 parse_start = RExC_parse;
9183 nextchar(pRExC_state);
9185 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9187 if (op == '*' && (flags&SIMPLE)) {
9188 reginsert(pRExC_state, STAR, ret, depth+1);
9192 else if (op == '*') {
9196 else if (op == '+' && (flags&SIMPLE)) {
9197 reginsert(pRExC_state, PLUS, ret, depth+1);
9201 else if (op == '+') {
9205 else if (op == '?') {
9210 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9211 ckWARN3reg(RExC_parse,
9212 "%.*s matches null string many times",
9213 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9217 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9218 nextchar(pRExC_state);
9219 reginsert(pRExC_state, MINMOD, ret, depth+1);
9220 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9222 #ifndef REG_ALLOW_MINMOD_SUSPEND
9225 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9227 nextchar(pRExC_state);
9228 ender = reg_node(pRExC_state, SUCCEED);
9229 REGTAIL(pRExC_state, ret, ender);
9230 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9232 ender = reg_node(pRExC_state, TAIL);
9233 REGTAIL(pRExC_state, ret, ender);
9237 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9239 vFAIL("Nested quantifiers");
9246 /* reg_namedseq(pRExC_state,UVp, UV depth)
9248 This is expected to be called by a parser routine that has
9249 recognized '\N' and needs to handle the rest. RExC_parse is
9250 expected to point at the first char following the N at the time
9253 The \N may be inside (indicated by valuep not being NULL) or outside a
9256 \N may begin either a named sequence, or if outside a character class, mean
9257 to match a non-newline. For non single-quoted regexes, the tokenizer has
9258 attempted to decide which, and in the case of a named sequence converted it
9259 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9260 where c1... are the characters in the sequence. For single-quoted regexes,
9261 the tokenizer passes the \N sequence through unchanged; this code will not
9262 attempt to determine this nor expand those. The net effect is that if the
9263 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
9264 signals that this \N occurrence means to match a non-newline.
9266 Only the \N{U+...} form should occur in a character class, for the same
9267 reason that '.' inside a character class means to just match a period: it
9268 just doesn't make sense.
9270 If valuep is non-null then it is assumed that we are parsing inside
9271 of a charclass definition and the first codepoint in the resolved
9272 string is returned via *valuep and the routine will return NULL.
9273 In this mode if a multichar string is returned from the charnames
9274 handler, a warning will be issued, and only the first char in the
9275 sequence will be examined. If the string returned is zero length
9276 then the value of *valuep is undefined and NON-NULL will
9277 be returned to indicate failure. (This will NOT be a valid pointer
9280 If valuep is null then it is assumed that we are parsing normal text and a
9281 new EXACT node is inserted into the program containing the resolved string,
9282 and a pointer to the new node is returned. But if the string is zero length
9283 a NOTHING node is emitted instead.
9285 On success RExC_parse is set to the char following the endbrace.
9286 Parsing failures will generate a fatal error via vFAIL(...)
9289 S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
9291 char * endbrace; /* '}' following the name */
9292 regnode *ret = NULL;
9295 GET_RE_DEBUG_FLAGS_DECL;
9297 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
9301 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9302 * modifier. The other meaning does not */
9303 p = (RExC_flags & RXf_PMf_EXTENDED)
9304 ? regwhite( pRExC_state, RExC_parse )
9307 /* Disambiguate between \N meaning a named character versus \N meaning
9308 * [^\n]. The former is assumed when it can't be the latter. */
9309 if (*p != '{' || regcurly(p)) {
9312 /* no bare \N in a charclass */
9313 vFAIL("\\N in a character class must be a named character: \\N{...}");
9315 nextchar(pRExC_state);
9316 ret = reg_node(pRExC_state, REG_ANY);
9317 *flagp |= HASWIDTH|SIMPLE;
9320 Set_Node_Length(ret, 1); /* MJD */
9324 /* Here, we have decided it should be a named sequence */
9326 /* The test above made sure that the next real character is a '{', but
9327 * under the /x modifier, it could be separated by space (or a comment and
9328 * \n) and this is not allowed (for consistency with \x{...} and the
9329 * tokenizer handling of \N{NAME}). */
9330 if (*RExC_parse != '{') {
9331 vFAIL("Missing braces on \\N{}");
9334 RExC_parse++; /* Skip past the '{' */
9336 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9337 || ! (endbrace == RExC_parse /* nothing between the {} */
9338 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9339 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9341 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9342 vFAIL("\\N{NAME} must be resolved by the lexer");
9345 if (endbrace == RExC_parse) { /* empty: \N{} */
9347 RExC_parse = endbrace + 1;
9348 return reg_node(pRExC_state,NOTHING);
9352 ckWARNreg(RExC_parse,
9353 "Ignoring zero length \\N{} in character class"
9355 RExC_parse = endbrace + 1;
9358 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
9361 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
9362 RExC_parse += 2; /* Skip past the 'U+' */
9364 if (valuep) { /* In a bracketed char class */
9365 /* We only pay attention to the first char of
9366 multichar strings being returned. I kinda wonder
9367 if this makes sense as it does change the behaviour
9368 from earlier versions, OTOH that behaviour was broken
9369 as well. XXX Solution is to recharacterize as
9370 [rest-of-class]|multi1|multi2... */
9372 STRLEN length_of_hex;
9373 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9374 | PERL_SCAN_DISALLOW_PREFIX
9375 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9377 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
9378 if (endchar < endbrace) {
9379 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9382 length_of_hex = (STRLEN)(endchar - RExC_parse);
9383 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
9385 /* The tokenizer should have guaranteed validity, but it's possible to
9386 * bypass it by using single quoting, so check */
9387 if (length_of_hex == 0
9388 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9390 RExC_parse += length_of_hex; /* Includes all the valid */
9391 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9392 ? UTF8SKIP(RExC_parse)
9394 /* Guard against malformed utf8 */
9395 if (RExC_parse >= endchar) RExC_parse = endchar;
9396 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9399 RExC_parse = endbrace + 1;
9400 if (endchar == endbrace) return NULL;
9402 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
9404 else { /* Not a char class */
9406 /* What is done here is to convert this to a sub-pattern of the form
9407 * (?:\x{char1}\x{char2}...)
9408 * and then call reg recursively. That way, it retains its atomicness,
9409 * while not having to worry about special handling that some code
9410 * points may have. toke.c has converted the original Unicode values
9411 * to native, so that we can just pass on the hex values unchanged. We
9412 * do have to set a flag to keep recoding from happening in the
9415 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9417 char *endchar; /* Points to '.' or '}' ending cur char in the input
9419 char *orig_end = RExC_end;
9421 while (RExC_parse < endbrace) {
9423 /* Code points are separated by dots. If none, there is only one
9424 * code point, and is terminated by the brace */
9425 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9427 /* Convert to notation the rest of the code understands */
9428 sv_catpv(substitute_parse, "\\x{");
9429 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9430 sv_catpv(substitute_parse, "}");
9432 /* Point to the beginning of the next character in the sequence. */
9433 RExC_parse = endchar + 1;
9435 sv_catpv(substitute_parse, ")");
9437 RExC_parse = SvPV(substitute_parse, len);
9439 /* Don't allow empty number */
9441 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9443 RExC_end = RExC_parse + len;
9445 /* The values are Unicode, and therefore not subject to recoding */
9446 RExC_override_recoding = 1;
9448 ret = reg(pRExC_state, 1, flagp, depth+1);
9450 RExC_parse = endbrace;
9451 RExC_end = orig_end;
9452 RExC_override_recoding = 0;
9454 nextchar(pRExC_state);
9464 * It returns the code point in utf8 for the value in *encp.
9465 * value: a code value in the source encoding
9466 * encp: a pointer to an Encode object
9468 * If the result from Encode is not a single character,
9469 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9472 S_reg_recode(pTHX_ const char value, SV **encp)
9475 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9476 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9477 const STRLEN newlen = SvCUR(sv);
9478 UV uv = UNICODE_REPLACEMENT;
9480 PERL_ARGS_ASSERT_REG_RECODE;
9484 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9487 if (!newlen || numlen != newlen) {
9488 uv = UNICODE_REPLACEMENT;
9496 - regatom - the lowest level
9498 Try to identify anything special at the start of the pattern. If there
9499 is, then handle it as required. This may involve generating a single regop,
9500 such as for an assertion; or it may involve recursing, such as to
9501 handle a () structure.
9503 If the string doesn't start with something special then we gobble up
9504 as much literal text as we can.
9506 Once we have been able to handle whatever type of thing started the
9507 sequence, we return.
9509 Note: we have to be careful with escapes, as they can be both literal
9510 and special, and in the case of \10 and friends can either, depending
9511 on context. Specifically there are two separate switches for handling
9512 escape sequences, with the one for handling literal escapes requiring
9513 a dummy entry for all of the special escapes that are actually handled
9518 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9521 register regnode *ret = NULL;
9523 char *parse_start = RExC_parse;
9525 GET_RE_DEBUG_FLAGS_DECL;
9526 DEBUG_PARSE("atom");
9527 *flagp = WORST; /* Tentatively. */
9529 PERL_ARGS_ASSERT_REGATOM;
9532 switch ((U8)*RExC_parse) {
9534 RExC_seen_zerolen++;
9535 nextchar(pRExC_state);
9536 if (RExC_flags & RXf_PMf_MULTILINE)
9537 ret = reg_node(pRExC_state, MBOL);
9538 else if (RExC_flags & RXf_PMf_SINGLELINE)
9539 ret = reg_node(pRExC_state, SBOL);
9541 ret = reg_node(pRExC_state, BOL);
9542 Set_Node_Length(ret, 1); /* MJD */
9545 nextchar(pRExC_state);
9547 RExC_seen_zerolen++;
9548 if (RExC_flags & RXf_PMf_MULTILINE)
9549 ret = reg_node(pRExC_state, MEOL);
9550 else if (RExC_flags & RXf_PMf_SINGLELINE)
9551 ret = reg_node(pRExC_state, SEOL);
9553 ret = reg_node(pRExC_state, EOL);
9554 Set_Node_Length(ret, 1); /* MJD */
9557 nextchar(pRExC_state);
9558 if (RExC_flags & RXf_PMf_SINGLELINE)
9559 ret = reg_node(pRExC_state, SANY);
9561 ret = reg_node(pRExC_state, REG_ANY);
9562 *flagp |= HASWIDTH|SIMPLE;
9564 Set_Node_Length(ret, 1); /* MJD */
9568 char * const oregcomp_parse = ++RExC_parse;
9569 ret = regclass(pRExC_state,depth+1);
9570 if (*RExC_parse != ']') {
9571 RExC_parse = oregcomp_parse;
9572 vFAIL("Unmatched [");
9574 nextchar(pRExC_state);
9575 *flagp |= HASWIDTH|SIMPLE;
9576 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
9580 nextchar(pRExC_state);
9581 ret = reg(pRExC_state, 1, &flags,depth+1);
9583 if (flags & TRYAGAIN) {
9584 if (RExC_parse == RExC_end) {
9585 /* Make parent create an empty node if needed. */
9593 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9597 if (flags & TRYAGAIN) {
9601 vFAIL("Internal urp");
9602 /* Supposed to be caught earlier. */
9608 vFAIL("Quantifier follows nothing");
9613 This switch handles escape sequences that resolve to some kind
9614 of special regop and not to literal text. Escape sequnces that
9615 resolve to literal text are handled below in the switch marked
9618 Every entry in this switch *must* have a corresponding entry
9619 in the literal escape switch. However, the opposite is not
9620 required, as the default for this switch is to jump to the
9621 literal text handling code.
9623 switch ((U8)*++RExC_parse) {
9624 /* Special Escapes */
9626 RExC_seen_zerolen++;
9627 ret = reg_node(pRExC_state, SBOL);
9629 goto finish_meta_pat;
9631 ret = reg_node(pRExC_state, GPOS);
9632 RExC_seen |= REG_SEEN_GPOS;
9634 goto finish_meta_pat;
9636 RExC_seen_zerolen++;
9637 ret = reg_node(pRExC_state, KEEPS);
9639 /* XXX:dmq : disabling in-place substitution seems to
9640 * be necessary here to avoid cases of memory corruption, as
9641 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9643 RExC_seen |= REG_SEEN_LOOKBEHIND;
9644 goto finish_meta_pat;
9646 ret = reg_node(pRExC_state, SEOL);
9648 RExC_seen_zerolen++; /* Do not optimize RE away */
9649 goto finish_meta_pat;
9651 ret = reg_node(pRExC_state, EOS);
9653 RExC_seen_zerolen++; /* Do not optimize RE away */
9654 goto finish_meta_pat;
9656 ret = reg_node(pRExC_state, CANY);
9657 RExC_seen |= REG_SEEN_CANY;
9658 *flagp |= HASWIDTH|SIMPLE;
9659 goto finish_meta_pat;
9661 ret = reg_node(pRExC_state, CLUMP);
9663 goto finish_meta_pat;
9665 switch (get_regex_charset(RExC_flags)) {
9666 case REGEX_LOCALE_CHARSET:
9669 case REGEX_UNICODE_CHARSET:
9672 case REGEX_ASCII_RESTRICTED_CHARSET:
9673 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9676 case REGEX_DEPENDS_CHARSET:
9682 ret = reg_node(pRExC_state, op);
9683 *flagp |= HASWIDTH|SIMPLE;
9684 goto finish_meta_pat;
9686 switch (get_regex_charset(RExC_flags)) {
9687 case REGEX_LOCALE_CHARSET:
9690 case REGEX_UNICODE_CHARSET:
9693 case REGEX_ASCII_RESTRICTED_CHARSET:
9694 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9697 case REGEX_DEPENDS_CHARSET:
9703 ret = reg_node(pRExC_state, op);
9704 *flagp |= HASWIDTH|SIMPLE;
9705 goto finish_meta_pat;
9707 RExC_seen_zerolen++;
9708 RExC_seen |= REG_SEEN_LOOKBEHIND;
9709 switch (get_regex_charset(RExC_flags)) {
9710 case REGEX_LOCALE_CHARSET:
9713 case REGEX_UNICODE_CHARSET:
9716 case REGEX_ASCII_RESTRICTED_CHARSET:
9717 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9720 case REGEX_DEPENDS_CHARSET:
9726 ret = reg_node(pRExC_state, op);
9727 FLAGS(ret) = get_regex_charset(RExC_flags);
9729 goto finish_meta_pat;
9731 RExC_seen_zerolen++;
9732 RExC_seen |= REG_SEEN_LOOKBEHIND;
9733 switch (get_regex_charset(RExC_flags)) {
9734 case REGEX_LOCALE_CHARSET:
9737 case REGEX_UNICODE_CHARSET:
9740 case REGEX_ASCII_RESTRICTED_CHARSET:
9741 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9744 case REGEX_DEPENDS_CHARSET:
9750 ret = reg_node(pRExC_state, op);
9751 FLAGS(ret) = get_regex_charset(RExC_flags);
9753 goto finish_meta_pat;
9755 switch (get_regex_charset(RExC_flags)) {
9756 case REGEX_LOCALE_CHARSET:
9759 case REGEX_UNICODE_CHARSET:
9762 case REGEX_ASCII_RESTRICTED_CHARSET:
9763 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9766 case REGEX_DEPENDS_CHARSET:
9772 ret = reg_node(pRExC_state, op);
9773 *flagp |= HASWIDTH|SIMPLE;
9774 goto finish_meta_pat;
9776 switch (get_regex_charset(RExC_flags)) {
9777 case REGEX_LOCALE_CHARSET:
9780 case REGEX_UNICODE_CHARSET:
9783 case REGEX_ASCII_RESTRICTED_CHARSET:
9784 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9787 case REGEX_DEPENDS_CHARSET:
9793 ret = reg_node(pRExC_state, op);
9794 *flagp |= HASWIDTH|SIMPLE;
9795 goto finish_meta_pat;
9797 switch (get_regex_charset(RExC_flags)) {
9798 case REGEX_LOCALE_CHARSET:
9801 case REGEX_ASCII_RESTRICTED_CHARSET:
9802 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9805 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9806 case REGEX_UNICODE_CHARSET:
9812 ret = reg_node(pRExC_state, op);
9813 *flagp |= HASWIDTH|SIMPLE;
9814 goto finish_meta_pat;
9816 switch (get_regex_charset(RExC_flags)) {
9817 case REGEX_LOCALE_CHARSET:
9820 case REGEX_ASCII_RESTRICTED_CHARSET:
9821 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
9824 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9825 case REGEX_UNICODE_CHARSET:
9831 ret = reg_node(pRExC_state, op);
9832 *flagp |= HASWIDTH|SIMPLE;
9833 goto finish_meta_pat;
9835 ret = reg_node(pRExC_state, LNBREAK);
9836 *flagp |= HASWIDTH|SIMPLE;
9837 goto finish_meta_pat;
9839 ret = reg_node(pRExC_state, HORIZWS);
9840 *flagp |= HASWIDTH|SIMPLE;
9841 goto finish_meta_pat;
9843 ret = reg_node(pRExC_state, NHORIZWS);
9844 *flagp |= HASWIDTH|SIMPLE;
9845 goto finish_meta_pat;
9847 ret = reg_node(pRExC_state, VERTWS);
9848 *flagp |= HASWIDTH|SIMPLE;
9849 goto finish_meta_pat;
9851 ret = reg_node(pRExC_state, NVERTWS);
9852 *flagp |= HASWIDTH|SIMPLE;
9854 nextchar(pRExC_state);
9855 Set_Node_Length(ret, 2); /* MJD */
9860 char* const oldregxend = RExC_end;
9862 char* parse_start = RExC_parse - 2;
9865 if (RExC_parse[1] == '{') {
9866 /* a lovely hack--pretend we saw [\pX] instead */
9867 RExC_end = strchr(RExC_parse, '}');
9869 const U8 c = (U8)*RExC_parse;
9871 RExC_end = oldregxend;
9872 vFAIL2("Missing right brace on \\%c{}", c);
9877 RExC_end = RExC_parse + 2;
9878 if (RExC_end > oldregxend)
9879 RExC_end = oldregxend;
9883 ret = regclass(pRExC_state,depth+1);
9885 RExC_end = oldregxend;
9888 Set_Node_Offset(ret, parse_start + 2);
9889 Set_Node_Cur_Length(ret);
9890 nextchar(pRExC_state);
9891 *flagp |= HASWIDTH|SIMPLE;
9895 /* Handle \N and \N{NAME} here and not below because it can be
9896 multicharacter. join_exact() will join them up later on.
9897 Also this makes sure that things like /\N{BLAH}+/ and
9898 \N{BLAH} being multi char Just Happen. dmq*/
9900 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
9902 case 'k': /* Handle \k<NAME> and \k'NAME' */
9905 char ch= RExC_parse[1];
9906 if (ch != '<' && ch != '\'' && ch != '{') {
9908 vFAIL2("Sequence %.2s... not terminated",parse_start);
9910 /* this pretty much dupes the code for (?P=...) in reg(), if
9911 you change this make sure you change that */
9912 char* name_start = (RExC_parse += 2);
9914 SV *sv_dat = reg_scan_name(pRExC_state,
9915 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9916 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
9917 if (RExC_parse == name_start || *RExC_parse != ch)
9918 vFAIL2("Sequence %.3s... not terminated",parse_start);
9921 num = add_data( pRExC_state, 1, "S" );
9922 RExC_rxi->data->data[num]=(void*)sv_dat;
9923 SvREFCNT_inc_simple_void(sv_dat);
9927 ret = reganode(pRExC_state,
9930 : (MORE_ASCII_RESTRICTED)
9932 : (AT_LEAST_UNI_SEMANTICS)
9940 /* override incorrect value set in reganode MJD */
9941 Set_Node_Offset(ret, parse_start+1);
9942 Set_Node_Cur_Length(ret); /* MJD */
9943 nextchar(pRExC_state);
9949 case '1': case '2': case '3': case '4':
9950 case '5': case '6': case '7': case '8': case '9':
9953 bool isg = *RExC_parse == 'g';
9958 if (*RExC_parse == '{') {
9962 if (*RExC_parse == '-') {
9966 if (hasbrace && !isDIGIT(*RExC_parse)) {
9967 if (isrel) RExC_parse--;
9969 goto parse_named_seq;
9971 num = atoi(RExC_parse);
9972 if (isg && num == 0)
9973 vFAIL("Reference to invalid group 0");
9975 num = RExC_npar - num;
9977 vFAIL("Reference to nonexistent or unclosed group");
9979 if (!isg && num > 9 && num >= RExC_npar)
9982 char * const parse_start = RExC_parse - 1; /* MJD */
9983 while (isDIGIT(*RExC_parse))
9985 if (parse_start == RExC_parse - 1)
9986 vFAIL("Unterminated \\g... pattern");
9988 if (*RExC_parse != '}')
9989 vFAIL("Unterminated \\g{...} pattern");
9993 if (num > (I32)RExC_rx->nparens)
9994 vFAIL("Reference to nonexistent group");
9997 ret = reganode(pRExC_state,
10000 : (MORE_ASCII_RESTRICTED)
10002 : (AT_LEAST_UNI_SEMANTICS)
10008 *flagp |= HASWIDTH;
10010 /* override incorrect value set in reganode MJD */
10011 Set_Node_Offset(ret, parse_start+1);
10012 Set_Node_Cur_Length(ret); /* MJD */
10014 nextchar(pRExC_state);
10019 if (RExC_parse >= RExC_end)
10020 FAIL("Trailing \\");
10023 /* Do not generate "unrecognized" warnings here, we fall
10024 back into the quick-grab loop below */
10031 if (RExC_flags & RXf_PMf_EXTENDED) {
10032 if ( reg_skipcomment( pRExC_state ) )
10039 parse_start = RExC_parse - 1;
10044 register STRLEN len;
10049 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
10052 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
10053 * it is folded to 'ss' even if not utf8 */
10054 bool is_exactfu_sharp_s;
10057 node_type = ((! FOLD) ? EXACT
10060 : (MORE_ASCII_RESTRICTED)
10062 : (AT_LEAST_UNI_SEMANTICS)
10065 ret = reg_node(pRExC_state, node_type);
10068 /* XXX The node can hold up to 255 bytes, yet this only goes to
10069 * 127. I (khw) do not know why. Keeping it somewhat less than
10070 * 255 allows us to not have to worry about overflow due to
10071 * converting to utf8 and fold expansion, but that value is
10072 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10073 * split up by this limit into a single one using the real max of
10074 * 255. Even at 127, this breaks under rare circumstances. If
10075 * folding, we do not want to split a node at a character that is a
10076 * non-final in a multi-char fold, as an input string could just
10077 * happen to want to match across the node boundary. The join
10078 * would solve that problem if the join actually happens. But a
10079 * series of more than two nodes in a row each of 127 would cause
10080 * the first join to succeed to get to 254, but then there wouldn't
10081 * be room for the next one, which could at be one of those split
10082 * multi-char folds. I don't know of any fool-proof solution. One
10083 * could back off to end with only a code point that isn't such a
10084 * non-final, but it is possible for there not to be any in the
10086 for (len = 0, p = RExC_parse - 1;
10087 len < 127 && p < RExC_end;
10090 char * const oldp = p;
10092 if (RExC_flags & RXf_PMf_EXTENDED)
10093 p = regwhite( pRExC_state, p );
10104 /* Literal Escapes Switch
10106 This switch is meant to handle escape sequences that
10107 resolve to a literal character.
10109 Every escape sequence that represents something
10110 else, like an assertion or a char class, is handled
10111 in the switch marked 'Special Escapes' above in this
10112 routine, but also has an entry here as anything that
10113 isn't explicitly mentioned here will be treated as
10114 an unescaped equivalent literal.
10117 switch ((U8)*++p) {
10118 /* These are all the special escapes. */
10119 case 'A': /* Start assertion */
10120 case 'b': case 'B': /* Word-boundary assertion*/
10121 case 'C': /* Single char !DANGEROUS! */
10122 case 'd': case 'D': /* digit class */
10123 case 'g': case 'G': /* generic-backref, pos assertion */
10124 case 'h': case 'H': /* HORIZWS */
10125 case 'k': case 'K': /* named backref, keep marker */
10126 case 'N': /* named char sequence */
10127 case 'p': case 'P': /* Unicode property */
10128 case 'R': /* LNBREAK */
10129 case 's': case 'S': /* space class */
10130 case 'v': case 'V': /* VERTWS */
10131 case 'w': case 'W': /* word class */
10132 case 'X': /* eXtended Unicode "combining character sequence" */
10133 case 'z': case 'Z': /* End of line/string assertion */
10137 /* Anything after here is an escape that resolves to a
10138 literal. (Except digits, which may or may not)
10157 ender = ASCII_TO_NATIVE('\033');
10161 ender = ASCII_TO_NATIVE('\007');
10166 STRLEN brace_len = len;
10168 const char* error_msg;
10170 bool valid = grok_bslash_o(p,
10177 RExC_parse = p; /* going to die anyway; point
10178 to exact spot of failure */
10185 if (PL_encoding && ender < 0x100) {
10186 goto recode_encoding;
10188 if (ender > 0xff) {
10195 char* const e = strchr(p, '}');
10198 RExC_parse = p + 1;
10199 vFAIL("Missing right brace on \\x{}");
10202 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10203 | PERL_SCAN_DISALLOW_PREFIX;
10204 STRLEN numlen = e - p - 1;
10205 ender = grok_hex(p + 1, &numlen, &flags, NULL);
10212 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
10214 ender = grok_hex(p, &numlen, &flags, NULL);
10217 if (PL_encoding && ender < 0x100)
10218 goto recode_encoding;
10222 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10224 case '0': case '1': case '2': case '3':case '4':
10225 case '5': case '6': case '7': case '8':case '9':
10227 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10229 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10231 ender = grok_oct(p, &numlen, &flags, NULL);
10232 if (ender > 0xff) {
10241 if (PL_encoding && ender < 0x100)
10242 goto recode_encoding;
10245 if (! RExC_override_recoding) {
10246 SV* enc = PL_encoding;
10247 ender = reg_recode((const char)(U8)ender, &enc);
10248 if (!enc && SIZE_ONLY)
10249 ckWARNreg(p, "Invalid escape in the specified encoding");
10255 FAIL("Trailing \\");
10258 if (!SIZE_ONLY&& isALPHA(*p)) {
10259 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10261 goto normal_default;
10265 /* Currently we don't warn when the lbrace is at the start
10266 * of a construct. This catches it in the middle of a
10267 * literal string, or when its the first thing after
10268 * something like "\b" */
10270 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10272 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10277 if (UTF8_IS_START(*p) && UTF) {
10279 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10280 &numlen, UTF8_ALLOW_DEFAULT);
10286 } /* End of switch on the literal */
10288 is_exactfu_sharp_s = (node_type == EXACTFU
10289 && ender == LATIN_SMALL_LETTER_SHARP_S);
10290 if ( RExC_flags & RXf_PMf_EXTENDED)
10291 p = regwhite( pRExC_state, p );
10292 if ((UTF && FOLD) || is_exactfu_sharp_s) {
10293 /* Prime the casefolded buffer. Locale rules, which apply
10294 * only to code points < 256, aren't known until execution,
10295 * so for them, just output the original character using
10296 * utf8. If we start to fold non-UTF patterns, be sure to
10297 * update join_exact() */
10298 if (LOC && ender < 256) {
10299 if (UNI_IS_INVARIANT(ender)) {
10300 *tmpbuf = (U8) ender;
10303 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
10304 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
10308 else if (isASCII(ender)) { /* Note: Here can't also be LOC
10310 ender = toLOWER(ender);
10311 *tmpbuf = (U8) ender;
10314 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
10316 /* Locale and /aa require more selectivity about the
10317 * fold, so are handled below. Otherwise, here, just
10319 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
10322 /* Under locale rules or /aa we are not to mix,
10323 * respectively, ords < 256 or ASCII with non-. So
10324 * reject folds that mix them, using only the
10325 * non-folded code point. So do the fold to a
10326 * temporary, and inspect each character in it. */
10327 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
10329 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
10330 U8* e = s + foldlen;
10331 bool fold_ok = TRUE;
10335 || (LOC && (UTF8_IS_INVARIANT(*s)
10336 || UTF8_IS_DOWNGRADEABLE_START(*s))))
10344 Copy(trialbuf, tmpbuf, foldlen, U8);
10348 uvuni_to_utf8(tmpbuf, ender);
10349 foldlen = UNISKIP(ender);
10353 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
10356 else if (UTF || is_exactfu_sharp_s) {
10358 /* Emit all the Unicode characters. */
10360 for (foldbuf = tmpbuf;
10362 foldlen -= numlen) {
10364 /* tmpbuf has been constructed by us, so we
10365 * know it is valid utf8 */
10366 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10368 const STRLEN unilen = reguni(pRExC_state, ender, s);
10371 /* In EBCDIC the numlen
10372 * and unilen can differ. */
10374 if (numlen >= foldlen)
10378 break; /* "Can't happen." */
10382 const STRLEN unilen = reguni(pRExC_state, ender, s);
10391 REGC((char)ender, s++);
10395 if (UTF || is_exactfu_sharp_s) {
10397 /* Emit all the Unicode characters. */
10399 for (foldbuf = tmpbuf;
10401 foldlen -= numlen) {
10402 ender = valid_utf8_to_uvchr(foldbuf, &numlen);
10404 const STRLEN unilen = reguni(pRExC_state, ender, s);
10407 /* In EBCDIC the numlen
10408 * and unilen can differ. */
10410 if (numlen >= foldlen)
10418 const STRLEN unilen = reguni(pRExC_state, ender, s);
10427 REGC((char)ender, s++);
10430 loopdone: /* Jumped to when encounters something that shouldn't be in
10432 RExC_parse = p - 1;
10433 Set_Node_Cur_Length(ret); /* MJD */
10434 nextchar(pRExC_state);
10436 /* len is STRLEN which is unsigned, need to copy to signed */
10439 vFAIL("Internal disaster");
10442 *flagp |= HASWIDTH;
10443 if (len == 1 && UNI_IS_INVARIANT(ender))
10447 RExC_size += STR_SZ(len);
10449 STR_LEN(ret) = len;
10450 RExC_emit += STR_SZ(len);
10458 /* Jumped to when an unrecognized character set is encountered */
10460 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
10465 S_regwhite( RExC_state_t *pRExC_state, char *p )
10467 const char *e = RExC_end;
10469 PERL_ARGS_ASSERT_REGWHITE;
10474 else if (*p == '#') {
10477 if (*p++ == '\n') {
10483 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
10491 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
10492 Character classes ([:foo:]) can also be negated ([:^foo:]).
10493 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
10494 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
10495 but trigger failures because they are currently unimplemented. */
10497 #define POSIXCC_DONE(c) ((c) == ':')
10498 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
10499 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
10502 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
10505 I32 namedclass = OOB_NAMEDCLASS;
10507 PERL_ARGS_ASSERT_REGPPOSIXCC;
10509 if (value == '[' && RExC_parse + 1 < RExC_end &&
10510 /* I smell either [: or [= or [. -- POSIX has been here, right? */
10511 POSIXCC(UCHARAT(RExC_parse))) {
10512 const char c = UCHARAT(RExC_parse);
10513 char* const s = RExC_parse++;
10515 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
10517 if (RExC_parse == RExC_end)
10518 /* Grandfather lone [:, [=, [. */
10521 const char* const t = RExC_parse++; /* skip over the c */
10524 if (UCHARAT(RExC_parse) == ']') {
10525 const char *posixcc = s + 1;
10526 RExC_parse++; /* skip over the ending ] */
10529 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
10530 const I32 skip = t - posixcc;
10532 /* Initially switch on the length of the name. */
10535 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
10536 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
10539 /* Names all of length 5. */
10540 /* alnum alpha ascii blank cntrl digit graph lower
10541 print punct space upper */
10542 /* Offset 4 gives the best switch position. */
10543 switch (posixcc[4]) {
10545 if (memEQ(posixcc, "alph", 4)) /* alpha */
10546 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
10549 if (memEQ(posixcc, "spac", 4)) /* space */
10550 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
10553 if (memEQ(posixcc, "grap", 4)) /* graph */
10554 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
10557 if (memEQ(posixcc, "asci", 4)) /* ascii */
10558 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
10561 if (memEQ(posixcc, "blan", 4)) /* blank */
10562 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
10565 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
10566 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
10569 if (memEQ(posixcc, "alnu", 4)) /* alnum */
10570 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
10573 if (memEQ(posixcc, "lowe", 4)) /* lower */
10574 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
10575 else if (memEQ(posixcc, "uppe", 4)) /* upper */
10576 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
10579 if (memEQ(posixcc, "digi", 4)) /* digit */
10580 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
10581 else if (memEQ(posixcc, "prin", 4)) /* print */
10582 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
10583 else if (memEQ(posixcc, "punc", 4)) /* punct */
10584 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
10589 if (memEQ(posixcc, "xdigit", 6))
10590 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
10594 if (namedclass == OOB_NAMEDCLASS)
10595 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
10597 assert (posixcc[skip] == ':');
10598 assert (posixcc[skip+1] == ']');
10599 } else if (!SIZE_ONLY) {
10600 /* [[=foo=]] and [[.foo.]] are still future. */
10602 /* adjust RExC_parse so the warning shows after
10603 the class closes */
10604 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
10606 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10609 /* Maternal grandfather:
10610 * "[:" ending in ":" but not in ":]" */
10620 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
10624 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10626 if (POSIXCC(UCHARAT(RExC_parse))) {
10627 const char *s = RExC_parse;
10628 const char c = *s++;
10630 while (isALNUM(*s))
10632 if (*s && c == *s && s[1] == ']') {
10634 "POSIX syntax [%c %c] belongs inside character classes",
10637 /* [[=foo=]] and [[.foo.]] are still future. */
10638 if (POSIXCC_NOTYET(c)) {
10639 /* adjust RExC_parse so the error shows after
10640 the class closes */
10641 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
10643 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10649 /* Generate the code to add a full posix character <class> to the bracketed
10650 * character class given by <node>. (<node> is needed only under locale rules)
10651 * destlist is the inversion list for non-locale rules that this class is
10653 * sourcelist is the ASCII-range inversion list to add under /a rules
10654 * Xsourcelist is the full Unicode range list to use otherwise. */
10655 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10657 SV* scratch_list = NULL; \
10659 /* Set this class in the node for runtime matching */ \
10660 ANYOF_CLASS_SET(node, class); \
10662 /* For above Latin1 code points, we use the full Unicode range */ \
10663 _invlist_intersection(PL_AboveLatin1, \
10666 /* And set the output to it, adding instead if there already is an \
10667 * output. Checking if <destlist> is NULL first saves an extra \
10668 * clone. Its reference count will be decremented at the next \
10669 * union, etc, or if this is the only instance, at the end of the \
10671 if (! destlist) { \
10672 destlist = scratch_list; \
10675 _invlist_union(destlist, scratch_list, &destlist); \
10676 SvREFCNT_dec(scratch_list); \
10680 /* For non-locale, just add it to any existing list */ \
10681 _invlist_union(destlist, \
10682 (AT_LEAST_ASCII_RESTRICTED) \
10688 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10690 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10692 SV* scratch_list = NULL; \
10693 ANYOF_CLASS_SET(node, class); \
10694 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10695 if (! destlist) { \
10696 destlist = scratch_list; \
10699 _invlist_union(destlist, scratch_list, &destlist); \
10700 SvREFCNT_dec(scratch_list); \
10704 _invlist_union_complement_2nd(destlist, \
10705 (AT_LEAST_ASCII_RESTRICTED) \
10709 /* Under /d, everything in the upper half of the Latin1 range \
10710 * matches this complement */ \
10711 if (DEPENDS_SEMANTICS) { \
10712 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10716 /* Generate the code to add a posix character <class> to the bracketed
10717 * character class given by <node>. (<node> is needed only under locale rules)
10718 * destlist is the inversion list for non-locale rules that this class is
10720 * sourcelist is the ASCII-range inversion list to add under /a rules
10721 * l1_sourcelist is the Latin1 range list to use otherwise.
10722 * Xpropertyname is the name to add to <run_time_list> of the property to
10723 * specify the code points above Latin1 that will have to be
10724 * determined at run-time
10725 * run_time_list is a SV* that contains text names of properties that are to
10726 * be computed at run time. This concatenates <Xpropertyname>
10727 * to it, apppropriately
10728 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10730 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10731 l1_sourcelist, Xpropertyname, run_time_list) \
10732 /* First, resolve whether to use the ASCII-only list or the L1 \
10734 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
10735 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
10736 Xpropertyname, run_time_list)
10738 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
10739 Xpropertyname, run_time_list) \
10740 /* If not /a matching, there are going to be code points we will have \
10741 * to defer to runtime to look-up */ \
10742 if (! AT_LEAST_ASCII_RESTRICTED) { \
10743 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10746 ANYOF_CLASS_SET(node, class); \
10749 _invlist_union(destlist, sourcelist, &destlist); \
10752 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10753 * this and DO_N_POSIX */
10754 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10755 l1_sourcelist, Xpropertyname, run_time_list) \
10756 if (AT_LEAST_ASCII_RESTRICTED) { \
10757 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10760 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10762 ANYOF_CLASS_SET(node, namedclass); \
10765 SV* scratch_list = NULL; \
10766 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10767 if (! destlist) { \
10768 destlist = scratch_list; \
10771 _invlist_union(destlist, scratch_list, &destlist); \
10772 SvREFCNT_dec(scratch_list); \
10774 if (DEPENDS_SEMANTICS) { \
10775 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10781 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10784 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10785 * Locale folding is done at run-time, so this function should not be
10786 * called for nodes that are for locales.
10788 * This function sets the bit corresponding to the fold of the input
10789 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10792 * It also knows about the characters that are in the bitmap that have
10793 * folds that are matchable only outside it, and sets the appropriate lists
10796 * It returns the number of bits that actually changed from 0 to 1 */
10801 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10803 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
10806 /* It assumes the bit for 'value' has already been set */
10807 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10808 ANYOF_BITMAP_SET(node, fold);
10811 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10812 /* Certain Latin1 characters have matches outside the bitmap. To get
10813 * here, 'value' is one of those characters. None of these matches is
10814 * valid for ASCII characters under /aa, which have been excluded by
10815 * the 'if' above. The matches fall into three categories:
10816 * 1) They are singly folded-to or -from an above 255 character, as
10817 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10819 * 2) They are part of a multi-char fold with another character in the
10820 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10821 * 3) They are part of a multi-char fold with a character not in the
10822 * bitmap, such as various ligatures.
10823 * We aren't dealing fully with multi-char folds, except we do deal
10824 * with the pattern containing a character that has a multi-char fold
10825 * (not so much the inverse).
10826 * For types 1) and 3), the matches only happen when the target string
10827 * is utf8; that's not true for 2), and we set a flag for it.
10829 * The code below adds to the passed in inversion list the single fold
10830 * closures for 'value'. The values are hard-coded here so that an
10831 * innocent-looking character class, like /[ks]/i won't have to go out
10832 * to disk to find the possible matches. XXX It would be better to
10833 * generate these via regen, in case a new version of the Unicode
10834 * standard adds new mappings, though that is not really likely. */
10839 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10843 /* LATIN SMALL LETTER LONG S */
10844 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10847 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10848 GREEK_SMALL_LETTER_MU);
10849 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10850 GREEK_CAPITAL_LETTER_MU);
10852 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10853 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10854 /* ANGSTROM SIGN */
10855 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10856 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10857 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10858 PL_fold_latin1[value]);
10861 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10862 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10863 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10865 case LATIN_SMALL_LETTER_SHARP_S:
10866 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10867 LATIN_CAPITAL_LETTER_SHARP_S);
10869 /* Under /a, /d, and /u, this can match the two chars "ss" */
10870 if (! MORE_ASCII_RESTRICTED) {
10871 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10873 /* And under /u or /a, it can match even if the target is
10875 if (AT_LEAST_UNI_SEMANTICS) {
10876 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10880 case 'F': case 'f':
10881 case 'I': case 'i':
10882 case 'L': case 'l':
10883 case 'T': case 't':
10884 case 'A': case 'a':
10885 case 'H': case 'h':
10886 case 'J': case 'j':
10887 case 'N': case 'n':
10888 case 'W': case 'w':
10889 case 'Y': case 'y':
10890 /* These all are targets of multi-character folds from code
10891 * points that require UTF8 to express, so they can't match
10892 * unless the target string is in UTF-8, so no action here is
10893 * necessary, as regexec.c properly handles the general case
10894 * for UTF-8 matching */
10897 /* Use deprecated warning to increase the chances of this
10899 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10903 else if (DEPENDS_SEMANTICS
10904 && ! isASCII(value)
10905 && PL_fold_latin1[value] != value)
10907 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10908 * folds only when the target string is in UTF-8. We add the fold
10909 * here to the list of things to match outside the bitmap, which
10910 * won't be looked at unless it is UTF8 (or else if something else
10911 * says to look even if not utf8, but those things better not happen
10912 * under DEPENDS semantics. */
10913 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
10920 PERL_STATIC_INLINE U8
10921 S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
10923 /* This inline function sets a bit in the bitmap if not already set, and if
10924 * appropriate, its fold, returning the number of bits that actually
10925 * changed from 0 to 1 */
10929 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10931 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10935 ANYOF_BITMAP_SET(node, value);
10938 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
10939 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
10946 S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10948 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10949 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10950 * the multi-character folds of characters in the node */
10953 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10955 if (! *alternate_ptr) {
10956 *alternate_ptr = newAV();
10958 sv = newSVpvn_utf8((char*)string, len, TRUE);
10959 av_push(*alternate_ptr, sv);
10964 parse a class specification and produce either an ANYOF node that
10965 matches the pattern or perhaps will be optimized into an EXACTish node
10966 instead. The node contains a bit map for the first 256 characters, with the
10967 corresponding bit set if that character is in the list. For characters
10968 above 255, a range list is used */
10971 S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
10974 register UV nextvalue;
10975 register IV prevvalue = OOB_UNICODE;
10976 register IV range = 0;
10977 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
10978 register regnode *ret;
10981 char *rangebegin = NULL;
10982 bool need_class = 0;
10983 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
10985 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10986 than just initialized. */
10987 SV* properties = NULL; /* Code points that match \p{} \P{} */
10988 UV element_count = 0; /* Number of distinct elements in the class.
10989 Optimizations may be possible if this is tiny */
10992 /* Unicode properties are stored in a swash; this holds the current one
10993 * being parsed. If this swash is the only above-latin1 component of the
10994 * character class, an optimization is to pass it directly on to the
10995 * execution engine. Otherwise, it is set to NULL to indicate that there
10996 * are other things in the class that have to be dealt with at execution
10998 SV* swash = NULL; /* Code points that match \p{} \P{} */
11000 /* Set if a component of this character class is user-defined; just passed
11001 * on to the engine */
11002 UV has_user_defined_property = 0;
11004 /* code points this node matches that can't be stored in the bitmap */
11005 SV* nonbitmap = NULL;
11007 /* The items that are to match that aren't stored in the bitmap, but are a
11008 * result of things that are stored there. This is the fold closure of
11009 * such a character, either because it has DEPENDS semantics and shouldn't
11010 * be matched unless the target string is utf8, or is a code point that is
11011 * too large for the bit map, as for example, the fold of the MICRO SIGN is
11012 * above 255. This all is solely for performance reasons. By having this
11013 * code know the outside-the-bitmap folds that the bitmapped characters are
11014 * involved with, we don't have to go out to disk to find the list of
11015 * matches, unless the character class includes code points that aren't
11016 * storable in the bit map. That means that a character class with an 's'
11017 * in it, for example, doesn't need to go out to disk to find everything
11018 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
11019 * empty unless there is something whose fold we don't know about, and will
11020 * have to go out to the disk to find. */
11021 SV* l1_fold_invlist = NULL;
11023 /* List of multi-character folds that are matched by this node */
11024 AV* unicode_alternate = NULL;
11026 UV literal_endpoint = 0;
11028 UV stored = 0; /* how many chars stored in the bitmap */
11030 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11031 case we need to change the emitted regop to an EXACT. */
11032 const char * orig_parse = RExC_parse;
11033 GET_RE_DEBUG_FLAGS_DECL;
11035 PERL_ARGS_ASSERT_REGCLASS;
11037 PERL_UNUSED_ARG(depth);
11040 DEBUG_PARSE("clas");
11042 /* Assume we are going to generate an ANYOF node. */
11043 ret = reganode(pRExC_state, ANYOF, 0);
11047 ANYOF_FLAGS(ret) = 0;
11050 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11054 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
11056 /* We have decided to not allow multi-char folds in inverted character
11057 * classes, due to the confusion that can happen, especially with
11058 * classes that are designed for a non-Unicode world: You have the
11059 * peculiar case that:
11060 "s s" =~ /^[^\xDF]+$/i => Y
11061 "ss" =~ /^[^\xDF]+$/i => N
11063 * See [perl #89750] */
11064 allow_full_fold = FALSE;
11068 RExC_size += ANYOF_SKIP;
11069 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11072 RExC_emit += ANYOF_SKIP;
11074 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11076 ANYOF_BITMAP_ZERO(ret);
11077 listsv = newSVpvs("# comment\n");
11078 initial_listsv_len = SvCUR(listsv);
11081 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11083 if (!SIZE_ONLY && POSIXCC(nextvalue))
11084 checkposixcc(pRExC_state);
11086 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11087 if (UCHARAT(RExC_parse) == ']')
11088 goto charclassloop;
11091 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11095 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11098 rangebegin = RExC_parse;
11102 value = utf8n_to_uvchr((U8*)RExC_parse,
11103 RExC_end - RExC_parse,
11104 &numlen, UTF8_ALLOW_DEFAULT);
11105 RExC_parse += numlen;
11108 value = UCHARAT(RExC_parse++);
11110 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11111 if (value == '[' && POSIXCC(nextvalue))
11112 namedclass = regpposixcc(pRExC_state, value);
11113 else if (value == '\\') {
11115 value = utf8n_to_uvchr((U8*)RExC_parse,
11116 RExC_end - RExC_parse,
11117 &numlen, UTF8_ALLOW_DEFAULT);
11118 RExC_parse += numlen;
11121 value = UCHARAT(RExC_parse++);
11122 /* Some compilers cannot handle switching on 64-bit integer
11123 * values, therefore value cannot be an UV. Yes, this will
11124 * be a problem later if we want switch on Unicode.
11125 * A similar issue a little bit later when switching on
11126 * namedclass. --jhi */
11127 switch ((I32)value) {
11128 case 'w': namedclass = ANYOF_ALNUM; break;
11129 case 'W': namedclass = ANYOF_NALNUM; break;
11130 case 's': namedclass = ANYOF_SPACE; break;
11131 case 'S': namedclass = ANYOF_NSPACE; break;
11132 case 'd': namedclass = ANYOF_DIGIT; break;
11133 case 'D': namedclass = ANYOF_NDIGIT; break;
11134 case 'v': namedclass = ANYOF_VERTWS; break;
11135 case 'V': namedclass = ANYOF_NVERTWS; break;
11136 case 'h': namedclass = ANYOF_HORIZWS; break;
11137 case 'H': namedclass = ANYOF_NHORIZWS; break;
11138 case 'N': /* Handle \N{NAME} in class */
11140 /* We only pay attention to the first char of
11141 multichar strings being returned. I kinda wonder
11142 if this makes sense as it does change the behaviour
11143 from earlier versions, OTOH that behaviour was broken
11145 UV v; /* value is register so we cant & it /grrr */
11146 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
11156 if (RExC_parse >= RExC_end)
11157 vFAIL2("Empty \\%c{}", (U8)value);
11158 if (*RExC_parse == '{') {
11159 const U8 c = (U8)value;
11160 e = strchr(RExC_parse++, '}');
11162 vFAIL2("Missing right brace on \\%c{}", c);
11163 while (isSPACE(UCHARAT(RExC_parse)))
11165 if (e == RExC_parse)
11166 vFAIL2("Empty \\%c{}", c);
11167 n = e - RExC_parse;
11168 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11179 if (UCHARAT(RExC_parse) == '^') {
11182 value = value == 'p' ? 'P' : 'p'; /* toggle */
11183 while (isSPACE(UCHARAT(RExC_parse))) {
11188 /* Try to get the definition of the property into
11189 * <invlist>. If /i is in effect, the effective property
11190 * will have its name be <__NAME_i>. The design is
11191 * discussed in commit
11192 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11193 Newx(name, n + sizeof("_i__\n"), char);
11195 sprintf(name, "%s%.*s%s\n",
11196 (FOLD) ? "__" : "",
11202 /* Look up the property name, and get its swash and
11203 * inversion list, if the property is found */
11205 SvREFCNT_dec(swash);
11207 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11210 TRUE, /* this routine will handle
11211 undefined properties */
11212 NULL, FALSE /* No inversion list */
11216 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
11218 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11220 || ! (invlist = *invlistsvp))
11223 SvREFCNT_dec(swash);
11227 /* Here didn't find it. It could be a user-defined
11228 * property that will be available at run-time. Add it
11229 * to the list to look up then */
11230 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11231 (value == 'p' ? '+' : '!'),
11233 has_user_defined_property = 1;
11235 /* We don't know yet, so have to assume that the
11236 * property could match something in the Latin1 range,
11237 * hence something that isn't utf8 */
11238 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11242 /* Here, did get the swash and its inversion list. If
11243 * the swash is from a user-defined property, then this
11244 * whole character class should be regarded as such */
11245 SV** user_defined_svp =
11246 hv_fetchs(MUTABLE_HV(SvRV(swash)),
11247 "USER_DEFINED", FALSE);
11248 if (user_defined_svp) {
11249 has_user_defined_property
11250 |= SvUV(*user_defined_svp);
11253 /* Invert if asking for the complement */
11254 if (value == 'P') {
11255 _invlist_union_complement_2nd(properties, invlist, &properties);
11257 /* The swash can't be used as-is, because we've
11258 * inverted things; delay removing it to here after
11259 * have copied its invlist above */
11260 SvREFCNT_dec(swash);
11264 _invlist_union(properties, invlist, &properties);
11269 RExC_parse = e + 1;
11270 namedclass = ANYOF_MAX; /* no official name, but it's named */
11272 /* \p means they want Unicode semantics */
11273 RExC_uni_semantics = 1;
11276 case 'n': value = '\n'; break;
11277 case 'r': value = '\r'; break;
11278 case 't': value = '\t'; break;
11279 case 'f': value = '\f'; break;
11280 case 'b': value = '\b'; break;
11281 case 'e': value = ASCII_TO_NATIVE('\033');break;
11282 case 'a': value = ASCII_TO_NATIVE('\007');break;
11284 RExC_parse--; /* function expects to be pointed at the 'o' */
11286 const char* error_msg;
11287 bool valid = grok_bslash_o(RExC_parse,
11292 RExC_parse += numlen;
11297 if (PL_encoding && value < 0x100) {
11298 goto recode_encoding;
11302 if (*RExC_parse == '{') {
11303 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
11304 | PERL_SCAN_DISALLOW_PREFIX;
11305 char * const e = strchr(RExC_parse++, '}');
11307 vFAIL("Missing right brace on \\x{}");
11309 numlen = e - RExC_parse;
11310 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11311 RExC_parse = e + 1;
11314 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
11316 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
11317 RExC_parse += numlen;
11319 if (PL_encoding && value < 0x100)
11320 goto recode_encoding;
11323 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11325 case '0': case '1': case '2': case '3': case '4':
11326 case '5': case '6': case '7':
11328 /* Take 1-3 octal digits */
11329 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11331 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11332 RExC_parse += numlen;
11333 if (PL_encoding && value < 0x100)
11334 goto recode_encoding;
11338 if (! RExC_override_recoding) {
11339 SV* enc = PL_encoding;
11340 value = reg_recode((const char)(U8)value, &enc);
11341 if (!enc && SIZE_ONLY)
11342 ckWARNreg(RExC_parse,
11343 "Invalid escape in the specified encoding");
11347 /* Allow \_ to not give an error */
11348 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11349 ckWARN2reg(RExC_parse,
11350 "Unrecognized escape \\%c in character class passed through",
11355 } /* end of \blah */
11358 literal_endpoint++;
11361 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11363 /* What matches in a locale is not known until runtime, so need to
11364 * (one time per class) allocate extra space to pass to regexec.
11365 * The space will contain a bit for each named class that is to be
11366 * matched against. This isn't needed for \p{} and pseudo-classes,
11367 * as they are not affected by locale, and hence are dealt with
11369 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
11372 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11375 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11376 ANYOF_CLASS_ZERO(ret);
11378 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11381 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11382 * literal, as is the character that began the false range, i.e.
11383 * the 'a' in the examples */
11387 RExC_parse >= rangebegin ?
11388 RExC_parse - rangebegin : 0;
11389 ckWARN4reg(RExC_parse,
11390 "False [] range \"%*.*s\"",
11394 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11395 if (prevvalue < 256) {
11397 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
11400 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
11404 range = 0; /* this was not a true range */
11409 /* Possible truncation here but in some 64-bit environments
11410 * the compiler gets heartburn about switch on 64-bit values.
11411 * A similar issue a little earlier when switching on value.
11413 switch ((I32)namedclass) {
11415 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11416 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11417 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11419 case ANYOF_NALNUMC:
11420 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11421 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11424 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11425 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11428 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11429 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11433 ANYOF_CLASS_SET(ret, namedclass);
11436 _invlist_union(properties, PL_ASCII, &properties);
11441 ANYOF_CLASS_SET(ret, namedclass);
11444 _invlist_union_complement_2nd(properties,
11445 PL_ASCII, &properties);
11446 if (DEPENDS_SEMANTICS) {
11447 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11452 DO_POSIX(ret, namedclass, properties,
11453 PL_PosixBlank, PL_XPosixBlank);
11456 DO_N_POSIX(ret, namedclass, properties,
11457 PL_PosixBlank, PL_XPosixBlank);
11460 DO_POSIX(ret, namedclass, properties,
11461 PL_PosixCntrl, PL_XPosixCntrl);
11464 DO_N_POSIX(ret, namedclass, properties,
11465 PL_PosixCntrl, PL_XPosixCntrl);
11468 /* There are no digits in the Latin1 range outside of
11469 * ASCII, so call the macro that doesn't have to resolve
11471 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
11472 PL_PosixDigit, "XPosixDigit", listsv);
11475 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11476 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
11479 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11480 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11483 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11484 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11486 case ANYOF_HORIZWS:
11487 /* For these, we use the nonbitmap, as /d doesn't make a
11488 * difference in what these match. There would be problems
11489 * if these characters had folds other than themselves, as
11490 * nonbitmap is subject to folding. It turns out that \h
11491 * is just a synonym for XPosixBlank */
11492 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
11494 case ANYOF_NHORIZWS:
11495 _invlist_union_complement_2nd(nonbitmap,
11496 PL_XPosixBlank, &nonbitmap);
11500 { /* These require special handling, as they differ under
11501 folding, matching Cased there (which in the ASCII range
11502 is the same as Alpha */
11508 if (FOLD && ! LOC) {
11509 ascii_source = PL_PosixAlpha;
11510 l1_source = PL_L1Cased;
11514 ascii_source = PL_PosixLower;
11515 l1_source = PL_L1PosixLower;
11516 Xname = "XPosixLower";
11518 if (namedclass == ANYOF_LOWER) {
11519 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11520 ascii_source, l1_source, Xname, listsv);
11523 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11524 properties, ascii_source, l1_source, Xname, listsv);
11529 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11530 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11533 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11534 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
11537 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11538 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11541 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11542 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
11545 DO_POSIX(ret, namedclass, properties,
11546 PL_PosixSpace, PL_XPosixSpace);
11548 case ANYOF_NPSXSPC:
11549 DO_N_POSIX(ret, namedclass, properties,
11550 PL_PosixSpace, PL_XPosixSpace);
11553 DO_POSIX(ret, namedclass, properties,
11554 PL_PerlSpace, PL_XPerlSpace);
11557 DO_N_POSIX(ret, namedclass, properties,
11558 PL_PerlSpace, PL_XPerlSpace);
11560 case ANYOF_UPPER: /* Same as LOWER, above */
11567 if (FOLD && ! LOC) {
11568 ascii_source = PL_PosixAlpha;
11569 l1_source = PL_L1Cased;
11573 ascii_source = PL_PosixUpper;
11574 l1_source = PL_L1PosixUpper;
11575 Xname = "XPosixUpper";
11577 if (namedclass == ANYOF_UPPER) {
11578 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11579 ascii_source, l1_source, Xname, listsv);
11582 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
11583 properties, ascii_source, l1_source, Xname, listsv);
11587 case ANYOF_ALNUM: /* Really is 'Word' */
11588 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11589 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11592 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
11593 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
11596 /* For these, we use the nonbitmap, as /d doesn't make a
11597 * difference in what these match. There would be problems
11598 * if these characters had folds other than themselves, as
11599 * nonbitmap is subject to folding */
11600 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
11602 case ANYOF_NVERTWS:
11603 _invlist_union_complement_2nd(nonbitmap,
11604 PL_VertSpace, &nonbitmap);
11607 DO_POSIX(ret, namedclass, properties,
11608 PL_PosixXDigit, PL_XPosixXDigit);
11610 case ANYOF_NXDIGIT:
11611 DO_N_POSIX(ret, namedclass, properties,
11612 PL_PosixXDigit, PL_XPosixXDigit);
11615 /* this is to handle \p and \P */
11618 vFAIL("Invalid [::] class");
11624 } /* end of namedclass \blah */
11627 if (prevvalue > (IV)value) /* b-a */ {
11628 const int w = RExC_parse - rangebegin;
11629 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
11630 range = 0; /* not a valid range */
11634 prevvalue = value; /* save the beginning of the range */
11635 if (RExC_parse+1 < RExC_end
11636 && *RExC_parse == '-'
11637 && RExC_parse[1] != ']')
11641 /* a bad range like \w-, [:word:]- ? */
11642 if (namedclass > OOB_NAMEDCLASS) {
11643 if (ckWARN(WARN_REGEXP)) {
11645 RExC_parse >= rangebegin ?
11646 RExC_parse - rangebegin : 0;
11648 "False [] range \"%*.*s\"",
11653 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
11655 range = 1; /* yeah, it's a range! */
11656 continue; /* but do it the next time */
11660 /* non-Latin1 code point implies unicode semantics. Must be set in
11661 * pass1 so is there for the whole of pass 2 */
11663 RExC_uni_semantics = 1;
11666 /* now is the next time */
11668 if (prevvalue < 256) {
11669 const IV ceilvalue = value < 256 ? value : 255;
11672 /* In EBCDIC [\x89-\x91] should include
11673 * the \x8e but [i-j] should not. */
11674 if (literal_endpoint == 2 &&
11675 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11676 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
11678 if (isLOWER(prevvalue)) {
11679 for (i = prevvalue; i <= ceilvalue; i++)
11680 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11682 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11685 for (i = prevvalue; i <= ceilvalue; i++)
11686 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
11688 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11694 for (i = prevvalue; i <= ceilvalue; i++) {
11695 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
11699 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11700 const UV natvalue = NATIVE_TO_UNI(value);
11701 nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
11704 literal_endpoint = 0;
11708 range = 0; /* this range (if it was one) is done now */
11715 /****** !SIZE_ONLY AFTER HERE *********/
11717 /* If folding and there are code points above 255, we calculate all
11718 * characters that could fold to or from the ones already on the list */
11719 if (FOLD && nonbitmap) {
11720 UV start, end; /* End points of code point ranges */
11722 SV* fold_intersection = NULL;
11724 /* This is a list of all the characters that participate in folds
11725 * (except marks, etc in multi-char folds */
11726 if (! PL_utf8_foldable) {
11727 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11728 PL_utf8_foldable = _swash_to_invlist(swash);
11729 SvREFCNT_dec(swash);
11732 /* This is a hash that for a particular fold gives all characters
11733 * that are involved in it */
11734 if (! PL_utf8_foldclosures) {
11736 /* If we were unable to find any folds, then we likely won't be
11737 * able to find the closures. So just create an empty list.
11738 * Folding will effectively be restricted to the non-Unicode rules
11739 * hard-coded into Perl. (This case happens legitimately during
11740 * compilation of Perl itself before the Unicode tables are
11742 if (invlist_len(PL_utf8_foldable) == 0) {
11743 PL_utf8_foldclosures = newHV();
11745 /* If the folds haven't been read in, call a fold function
11747 if (! PL_utf8_tofold) {
11748 U8 dummy[UTF8_MAXBYTES+1];
11751 /* This particular string is above \xff in both UTF-8 and
11753 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
11754 assert(PL_utf8_tofold); /* Verify that worked */
11756 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
11760 /* Only the characters in this class that participate in folds need be
11761 * checked. Get the intersection of this class and all the possible
11762 * characters that are foldable. This can quickly narrow down a large
11764 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
11766 /* Now look at the foldable characters in this class individually */
11767 invlist_iterinit(fold_intersection);
11768 while (invlist_iternext(fold_intersection, &start, &end)) {
11771 /* Look at every character in the range */
11772 for (j = start; j <= end; j++) {
11775 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11778 _to_uni_fold_flags(j, foldbuf, &foldlen,
11779 (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
11781 if (foldlen > (STRLEN)UNISKIP(f)) {
11783 /* Any multicharacter foldings (disallowed in lookbehind
11784 * patterns) require the following transform: [ABCDEF] ->
11785 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11786 * folds into "rst", all other characters fold to single
11787 * characters. We save away these multicharacter foldings,
11788 * to be later saved as part of the additional "s" data. */
11789 if (! RExC_in_lookbehind) {
11791 U8* e = foldbuf + foldlen;
11793 /* If any of the folded characters of this are in the
11794 * Latin1 range, tell the regex engine that this can
11795 * match a non-utf8 target string. The only multi-byte
11796 * fold whose source is in the Latin1 range (U+00DF)
11797 * applies only when the target string is utf8, or
11798 * under unicode rules */
11799 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11802 /* Can't mix ascii with non- under /aa */
11803 if (MORE_ASCII_RESTRICTED
11804 && (isASCII(*loc) != isASCII(j)))
11806 goto end_multi_fold;
11808 if (UTF8_IS_INVARIANT(*loc)
11809 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11811 /* Can't mix above and below 256 under LOC
11814 goto end_multi_fold;
11817 |= ANYOF_NONBITMAP_NON_UTF8;
11820 loc += UTF8SKIP(loc);
11824 add_alternate(&unicode_alternate, foldbuf, foldlen);
11828 /* This is special-cased, as it is the only letter which
11829 * has both a multi-fold and single-fold in Latin1. All
11830 * the other chars that have single and multi-folds are
11831 * always in utf8, and the utf8 folding algorithm catches
11833 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11834 stored += set_regclass_bit(pRExC_state,
11836 LATIN_SMALL_LETTER_SHARP_S,
11837 &l1_fold_invlist, &unicode_alternate);
11841 /* Single character fold. Add everything in its fold
11842 * closure to the list that this node should match */
11845 /* The fold closures data structure is a hash with the keys
11846 * being every character that is folded to, like 'k', and
11847 * the values each an array of everything that folds to its
11848 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
11849 if ((listp = hv_fetch(PL_utf8_foldclosures,
11850 (char *) foldbuf, foldlen, FALSE)))
11852 AV* list = (AV*) *listp;
11854 for (k = 0; k <= av_len(list); k++) {
11855 SV** c_p = av_fetch(list, k, FALSE);
11858 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11862 /* /aa doesn't allow folds between ASCII and non-;
11863 * /l doesn't allow them between above and below
11865 if ((MORE_ASCII_RESTRICTED
11866 && (isASCII(c) != isASCII(j)))
11867 || (LOC && ((c < 256) != (j < 256))))
11872 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11873 stored += set_regclass_bit(pRExC_state,
11876 &l1_fold_invlist, &unicode_alternate);
11878 /* It may be that the code point is already in
11879 * this range or already in the bitmap, in
11880 * which case we need do nothing */
11881 else if ((c < start || c > end)
11883 || ! ANYOF_BITMAP_TEST(ret, c)))
11885 nonbitmap = add_cp_to_invlist(nonbitmap, c);
11892 SvREFCNT_dec(fold_intersection);
11895 /* Combine the two lists into one. */
11896 if (l1_fold_invlist) {
11898 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
11899 SvREFCNT_dec(l1_fold_invlist);
11902 nonbitmap = l1_fold_invlist;
11906 /* And combine the result (if any) with any inversion list from properties.
11907 * The lists are kept separate up to now because we don't want to fold the
11911 _invlist_union(nonbitmap, properties, &nonbitmap);
11912 SvREFCNT_dec(properties);
11915 nonbitmap = properties;
11919 /* Here, <nonbitmap> contains all the code points we can determine at
11920 * compile time that we haven't put into the bitmap. Go through it, and
11921 * for things that belong in the bitmap, put them there, and delete from
11925 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11926 * possibly only should match when the target string is UTF-8 */
11927 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11929 /* This gets set if we actually need to modify things */
11930 bool change_invlist = FALSE;
11934 /* Start looking through <nonbitmap> */
11935 invlist_iterinit(nonbitmap);
11936 while (invlist_iternext(nonbitmap, &start, &end)) {
11940 /* Quit if are above what we should change */
11941 if (start > max_cp_to_set) {
11945 change_invlist = TRUE;
11947 /* Set all the bits in the range, up to the max that we are doing */
11948 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11949 for (i = start; i <= (int) high; i++) {
11950 if (! ANYOF_BITMAP_TEST(ret, i)) {
11951 ANYOF_BITMAP_SET(ret, i);
11959 /* Done with loop; remove any code points that are in the bitmap from
11961 if (change_invlist) {
11962 _invlist_subtract(nonbitmap,
11963 (DEPENDS_SEMANTICS)
11969 /* If have completely emptied it, remove it completely */
11970 if (invlist_len(nonbitmap) == 0) {
11971 SvREFCNT_dec(nonbitmap);
11976 /* Here, we have calculated what code points should be in the character
11977 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11978 * case of DEPENDS rules.
11980 * Now we can see about various optimizations. Fold calculation (which we
11981 * did above) needs to take place before inversion. Otherwise /[^k]/i
11982 * would invert to include K, which under /i would match k, which it
11985 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
11986 * set the FOLD flag yet, so this does optimize those. It doesn't
11987 * optimize locale. Doing so perhaps could be done as long as there is
11988 * nothing like \w in it; some thought also would have to be given to the
11989 * interaction with above 0x100 chars */
11990 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11992 && ! unicode_alternate
11993 /* In case of /d, there are some things that should match only when in
11994 * not in the bitmap, i.e., they require UTF8 to match. These are
11995 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11996 * case, they don't require UTF8, so can invert here */
11998 || ! DEPENDS_SEMANTICS
11999 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12000 && SvCUR(listsv) == initial_listsv_len)
12004 for (i = 0; i < 256; ++i) {
12005 if (ANYOF_BITMAP_TEST(ret, i)) {
12006 ANYOF_BITMAP_CLEAR(ret, i);
12009 ANYOF_BITMAP_SET(ret, i);
12014 /* The inversion means that everything above 255 is matched */
12015 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
12018 /* Here, also has things outside the bitmap that may overlap with
12019 * the bitmap. We have to sync them up, so that they get inverted
12020 * in both places. Earlier, we removed all overlaps except in the
12021 * case of /d rules, so no syncing is needed except for this case
12023 SV *remove_list = NULL;
12025 if (DEPENDS_SEMANTICS) {
12028 /* Set the bits that correspond to the ones that aren't in the
12029 * bitmap. Otherwise, when we invert, we'll miss these.
12030 * Earlier, we removed from the nonbitmap all code points
12031 * < 128, so there is no extra work here */
12032 invlist_iterinit(nonbitmap);
12033 while (invlist_iternext(nonbitmap, &start, &end)) {
12034 if (start > 255) { /* The bit map goes to 255 */
12040 for (i = start; i <= (int) end; ++i) {
12041 ANYOF_BITMAP_SET(ret, i);
12048 /* Now invert both the bitmap and the nonbitmap. Anything in the
12049 * bitmap has to also be removed from the non-bitmap, but again,
12050 * there should not be overlap unless is /d rules. */
12051 _invlist_invert(nonbitmap);
12053 /* Any swash can't be used as-is, because we've inverted things */
12055 SvREFCNT_dec(swash);
12059 for (i = 0; i < 256; ++i) {
12060 if (ANYOF_BITMAP_TEST(ret, i)) {
12061 ANYOF_BITMAP_CLEAR(ret, i);
12062 if (DEPENDS_SEMANTICS) {
12063 if (! remove_list) {
12064 remove_list = _new_invlist(2);
12066 remove_list = add_cp_to_invlist(remove_list, i);
12070 ANYOF_BITMAP_SET(ret, i);
12076 /* And do the removal */
12077 if (DEPENDS_SEMANTICS) {
12079 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
12080 SvREFCNT_dec(remove_list);
12084 /* There is no overlap for non-/d, so just delete anything
12086 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
12090 stored = 256 - stored;
12092 /* Clear the invert flag since have just done it here */
12093 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
12096 /* Folding in the bitmap is taken care of above, but not for locale (for
12097 * which we have to wait to see what folding is in effect at runtime), and
12098 * for some things not in the bitmap (only the upper latin folds in this
12099 * case, as all other single-char folding has been set above). Set
12100 * run-time fold flag for these */
12102 || (DEPENDS_SEMANTICS
12104 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
12105 || unicode_alternate))
12107 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
12110 /* A single character class can be "optimized" into an EXACTish node.
12111 * Note that since we don't currently count how many characters there are
12112 * outside the bitmap, we are XXX missing optimization possibilities for
12113 * them. This optimization can't happen unless this is a truly single
12114 * character class, which means that it can't be an inversion into a
12115 * many-character class, and there must be no possibility of there being
12116 * things outside the bitmap. 'stored' (only) for locales doesn't include
12117 * \w, etc, so have to make a special test that they aren't present
12119 * Similarly A 2-character class of the very special form like [bB] can be
12120 * optimized into an EXACTFish node, but only for non-locales, and for
12121 * characters which only have the two folds; so things like 'fF' and 'Ii'
12122 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
12125 && ! unicode_alternate
12126 && SvCUR(listsv) == initial_listsv_len
12127 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
12128 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12129 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
12130 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
12131 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
12132 /* If the latest code point has a fold whose
12133 * bit is set, it must be the only other one */
12134 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
12135 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
12137 /* Note that the information needed to decide to do this optimization
12138 * is not currently available until the 2nd pass, and that the actually
12139 * used EXACTish node takes less space than the calculated ANYOF node,
12140 * and hence the amount of space calculated in the first pass is larger
12141 * than actually used, so this optimization doesn't gain us any space.
12142 * But an EXACT node is faster than an ANYOF node, and can be combined
12143 * with any adjacent EXACT nodes later by the optimizer for further
12144 * gains. The speed of executing an EXACTF is similar to an ANYOF
12145 * node, so the optimization advantage comes from the ability to join
12146 * it to adjacent EXACT nodes */
12148 const char * cur_parse= RExC_parse;
12150 RExC_emit = (regnode *)orig_emit;
12151 RExC_parse = (char *)orig_parse;
12155 /* A locale node with one point can be folded; all the other cases
12156 * with folding will have two points, since we calculate them above
12158 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
12165 else { /* else 2 chars in the bit map: the folds of each other */
12167 /* Use the folded value, which for the cases where we get here,
12168 * is just the lower case of the current one (which may resolve to
12169 * itself, or to the other one */
12170 value = toLOWER_LATIN1(value);
12172 /* To join adjacent nodes, they must be the exact EXACTish type.
12173 * Try to use the most likely type, by using EXACTFA if possible,
12174 * then EXACTFU if the regex calls for it, or is required because
12175 * the character is non-ASCII. (If <value> is ASCII, its fold is
12176 * also ASCII for the cases where we get here.) */
12177 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
12180 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
12183 else { /* Otherwise, more likely to be EXACTF type */
12188 ret = reg_node(pRExC_state, op);
12189 RExC_parse = (char *)cur_parse;
12190 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
12191 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
12192 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
12194 RExC_emit += STR_SZ(2);
12197 *STRING(ret)= (char)value;
12199 RExC_emit += STR_SZ(1);
12201 SvREFCNT_dec(listsv);
12205 /* If there is a swash and more than one element, we can't use the swash in
12206 * the optimization below. */
12207 if (swash && element_count > 1) {
12208 SvREFCNT_dec(swash);
12212 && SvCUR(listsv) == initial_listsv_len
12213 && ! unicode_alternate)
12215 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
12216 SvREFCNT_dec(listsv);
12217 SvREFCNT_dec(unicode_alternate);
12220 /* av[0] stores the character class description in its textual form:
12221 * used later (regexec.c:Perl_regclass_swash()) to initialize the
12222 * appropriate swash, and is also useful for dumping the regnode.
12223 * av[1] if NULL, is a placeholder to later contain the swash computed
12224 * from av[0]. But if no further computation need be done, the
12225 * swash is stored there now.
12226 * av[2] stores the multicharacter foldings, used later in
12227 * regexec.c:S_reginclass().
12228 * av[3] stores the nonbitmap inversion list for use in addition or
12229 * instead of av[0]; not used if av[1] isn't NULL
12230 * av[4] is set if any component of the class is from a user-defined
12231 * property; not used if av[1] isn't NULL */
12232 AV * const av = newAV();
12235 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
12239 av_store(av, 1, swash);
12240 SvREFCNT_dec(nonbitmap);
12243 av_store(av, 1, NULL);
12245 av_store(av, 3, nonbitmap);
12246 av_store(av, 4, newSVuv(has_user_defined_property));
12250 /* Store any computed multi-char folds only if we are allowing
12252 if (allow_full_fold) {
12253 av_store(av, 2, MUTABLE_SV(unicode_alternate));
12254 if (unicode_alternate) { /* This node is variable length */
12259 av_store(av, 2, NULL);
12261 rv = newRV_noinc(MUTABLE_SV(av));
12262 n = add_data(pRExC_state, 1, "s");
12263 RExC_rxi->data->data[n] = (void*)rv;
12270 /* reg_skipcomment()
12272 Absorbs an /x style # comments from the input stream.
12273 Returns true if there is more text remaining in the stream.
12274 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
12275 terminates the pattern without including a newline.
12277 Note its the callers responsibility to ensure that we are
12278 actually in /x mode
12283 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
12287 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
12289 while (RExC_parse < RExC_end)
12290 if (*RExC_parse++ == '\n') {
12295 /* we ran off the end of the pattern without ending
12296 the comment, so we have to add an \n when wrapping */
12297 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
12305 Advances the parse position, and optionally absorbs
12306 "whitespace" from the inputstream.
12308 Without /x "whitespace" means (?#...) style comments only,
12309 with /x this means (?#...) and # comments and whitespace proper.
12311 Returns the RExC_parse point from BEFORE the scan occurs.
12313 This is the /x friendly way of saying RExC_parse++.
12317 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
12319 char* const retval = RExC_parse++;
12321 PERL_ARGS_ASSERT_NEXTCHAR;
12324 if (RExC_end - RExC_parse >= 3
12325 && *RExC_parse == '('
12326 && RExC_parse[1] == '?'
12327 && RExC_parse[2] == '#')
12329 while (*RExC_parse != ')') {
12330 if (RExC_parse == RExC_end)
12331 FAIL("Sequence (?#... not terminated");
12337 if (RExC_flags & RXf_PMf_EXTENDED) {
12338 if (isSPACE(*RExC_parse)) {
12342 else if (*RExC_parse == '#') {
12343 if ( reg_skipcomment( pRExC_state ) )
12352 - reg_node - emit a node
12354 STATIC regnode * /* Location. */
12355 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
12358 register regnode *ptr;
12359 regnode * const ret = RExC_emit;
12360 GET_RE_DEBUG_FLAGS_DECL;
12362 PERL_ARGS_ASSERT_REG_NODE;
12365 SIZE_ALIGN(RExC_size);
12369 if (RExC_emit >= RExC_emit_bound)
12370 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12371 op, RExC_emit, RExC_emit_bound);
12373 NODE_ALIGN_FILL(ret);
12375 FILL_ADVANCE_NODE(ptr, op);
12376 #ifdef RE_TRACK_PATTERN_OFFSETS
12377 if (RExC_offsets) { /* MJD */
12378 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
12379 "reg_node", __LINE__,
12381 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
12382 ? "Overwriting end of array!\n" : "OK",
12383 (UV)(RExC_emit - RExC_emit_start),
12384 (UV)(RExC_parse - RExC_start),
12385 (UV)RExC_offsets[0]));
12386 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
12394 - reganode - emit a node with an argument
12396 STATIC regnode * /* Location. */
12397 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
12400 register regnode *ptr;
12401 regnode * const ret = RExC_emit;
12402 GET_RE_DEBUG_FLAGS_DECL;
12404 PERL_ARGS_ASSERT_REGANODE;
12407 SIZE_ALIGN(RExC_size);
12412 assert(2==regarglen[op]+1);
12414 Anything larger than this has to allocate the extra amount.
12415 If we changed this to be:
12417 RExC_size += (1 + regarglen[op]);
12419 then it wouldn't matter. Its not clear what side effect
12420 might come from that so its not done so far.
12425 if (RExC_emit >= RExC_emit_bound)
12426 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
12427 op, RExC_emit, RExC_emit_bound);
12429 NODE_ALIGN_FILL(ret);
12431 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
12432 #ifdef RE_TRACK_PATTERN_OFFSETS
12433 if (RExC_offsets) { /* MJD */
12434 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12438 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
12439 "Overwriting end of array!\n" : "OK",
12440 (UV)(RExC_emit - RExC_emit_start),
12441 (UV)(RExC_parse - RExC_start),
12442 (UV)RExC_offsets[0]));
12443 Set_Cur_Node_Offset;
12451 - reguni - emit (if appropriate) a Unicode character
12454 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
12458 PERL_ARGS_ASSERT_REGUNI;
12460 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
12464 - reginsert - insert an operator in front of already-emitted operand
12466 * Means relocating the operand.
12469 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
12472 register regnode *src;
12473 register regnode *dst;
12474 register regnode *place;
12475 const int offset = regarglen[(U8)op];
12476 const int size = NODE_STEP_REGNODE + offset;
12477 GET_RE_DEBUG_FLAGS_DECL;
12479 PERL_ARGS_ASSERT_REGINSERT;
12480 PERL_UNUSED_ARG(depth);
12481 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
12482 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
12491 if (RExC_open_parens) {
12493 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
12494 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
12495 if ( RExC_open_parens[paren] >= opnd ) {
12496 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
12497 RExC_open_parens[paren] += size;
12499 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
12501 if ( RExC_close_parens[paren] >= opnd ) {
12502 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
12503 RExC_close_parens[paren] += size;
12505 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
12510 while (src > opnd) {
12511 StructCopy(--src, --dst, regnode);
12512 #ifdef RE_TRACK_PATTERN_OFFSETS
12513 if (RExC_offsets) { /* MJD 20010112 */
12514 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
12518 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
12519 ? "Overwriting end of array!\n" : "OK",
12520 (UV)(src - RExC_emit_start),
12521 (UV)(dst - RExC_emit_start),
12522 (UV)RExC_offsets[0]));
12523 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
12524 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
12530 place = opnd; /* Op node, where operand used to be. */
12531 #ifdef RE_TRACK_PATTERN_OFFSETS
12532 if (RExC_offsets) { /* MJD */
12533 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
12537 (UV)(place - RExC_emit_start) > RExC_offsets[0]
12538 ? "Overwriting end of array!\n" : "OK",
12539 (UV)(place - RExC_emit_start),
12540 (UV)(RExC_parse - RExC_start),
12541 (UV)RExC_offsets[0]));
12542 Set_Node_Offset(place, RExC_parse);
12543 Set_Node_Length(place, 1);
12546 src = NEXTOPER(place);
12547 FILL_ADVANCE_NODE(place, op);
12548 Zero(src, offset, regnode);
12552 - regtail - set the next-pointer at the end of a node chain of p to val.
12553 - SEE ALSO: regtail_study
12555 /* TODO: All three parms should be const */
12557 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12560 register regnode *scan;
12561 GET_RE_DEBUG_FLAGS_DECL;
12563 PERL_ARGS_ASSERT_REGTAIL;
12565 PERL_UNUSED_ARG(depth);
12571 /* Find last node. */
12574 regnode * const temp = regnext(scan);
12576 SV * const mysv=sv_newmortal();
12577 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
12578 regprop(RExC_rx, mysv, scan);
12579 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
12580 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
12581 (temp == NULL ? "->" : ""),
12582 (temp == NULL ? PL_reg_name[OP(val)] : "")
12590 if (reg_off_by_arg[OP(scan)]) {
12591 ARG_SET(scan, val - scan);
12594 NEXT_OFF(scan) = val - scan;
12600 - regtail_study - set the next-pointer at the end of a node chain of p to val.
12601 - Look for optimizable sequences at the same time.
12602 - currently only looks for EXACT chains.
12604 This is experimental code. The idea is to use this routine to perform
12605 in place optimizations on branches and groups as they are constructed,
12606 with the long term intention of removing optimization from study_chunk so
12607 that it is purely analytical.
12609 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
12610 to control which is which.
12613 /* TODO: All four parms should be const */
12616 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
12619 register regnode *scan;
12621 #ifdef EXPERIMENTAL_INPLACESCAN
12624 GET_RE_DEBUG_FLAGS_DECL;
12626 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12632 /* Find last node. */
12636 regnode * const temp = regnext(scan);
12637 #ifdef EXPERIMENTAL_INPLACESCAN
12638 if (PL_regkind[OP(scan)] == EXACT) {
12639 bool has_exactf_sharp_s; /* Unexamined in this routine */
12640 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
12645 switch (OP(scan)) {
12651 case EXACTFU_TRICKYFOLD:
12653 if( exact == PSEUDO )
12655 else if ( exact != OP(scan) )
12664 SV * const mysv=sv_newmortal();
12665 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12666 regprop(RExC_rx, mysv, scan);
12667 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
12668 SvPV_nolen_const(mysv),
12669 REG_NODE_NUM(scan),
12670 PL_reg_name[exact]);
12677 SV * const mysv_val=sv_newmortal();
12678 DEBUG_PARSE_MSG("");
12679 regprop(RExC_rx, mysv_val, val);
12680 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12681 SvPV_nolen_const(mysv_val),
12682 (IV)REG_NODE_NUM(val),
12686 if (reg_off_by_arg[OP(scan)]) {
12687 ARG_SET(scan, val - scan);
12690 NEXT_OFF(scan) = val - scan;
12698 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
12702 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12708 for (bit=0; bit<32; bit++) {
12709 if (flags & (1<<bit)) {
12710 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12713 if (!set++ && lead)
12714 PerlIO_printf(Perl_debug_log, "%s",lead);
12715 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12718 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12719 if (!set++ && lead) {
12720 PerlIO_printf(Perl_debug_log, "%s",lead);
12723 case REGEX_UNICODE_CHARSET:
12724 PerlIO_printf(Perl_debug_log, "UNICODE");
12726 case REGEX_LOCALE_CHARSET:
12727 PerlIO_printf(Perl_debug_log, "LOCALE");
12729 case REGEX_ASCII_RESTRICTED_CHARSET:
12730 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12732 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12733 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12736 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12742 PerlIO_printf(Perl_debug_log, "\n");
12744 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12750 Perl_regdump(pTHX_ const regexp *r)
12754 SV * const sv = sv_newmortal();
12755 SV *dsv= sv_newmortal();
12756 RXi_GET_DECL(r,ri);
12757 GET_RE_DEBUG_FLAGS_DECL;
12759 PERL_ARGS_ASSERT_REGDUMP;
12761 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
12763 /* Header fields of interest. */
12764 if (r->anchored_substr) {
12765 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12766 RE_SV_DUMPLEN(r->anchored_substr), 30);
12767 PerlIO_printf(Perl_debug_log,
12768 "anchored %s%s at %"IVdf" ",
12769 s, RE_SV_TAIL(r->anchored_substr),
12770 (IV)r->anchored_offset);
12771 } else if (r->anchored_utf8) {
12772 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12773 RE_SV_DUMPLEN(r->anchored_utf8), 30);
12774 PerlIO_printf(Perl_debug_log,
12775 "anchored utf8 %s%s at %"IVdf" ",
12776 s, RE_SV_TAIL(r->anchored_utf8),
12777 (IV)r->anchored_offset);
12779 if (r->float_substr) {
12780 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12781 RE_SV_DUMPLEN(r->float_substr), 30);
12782 PerlIO_printf(Perl_debug_log,
12783 "floating %s%s at %"IVdf"..%"UVuf" ",
12784 s, RE_SV_TAIL(r->float_substr),
12785 (IV)r->float_min_offset, (UV)r->float_max_offset);
12786 } else if (r->float_utf8) {
12787 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12788 RE_SV_DUMPLEN(r->float_utf8), 30);
12789 PerlIO_printf(Perl_debug_log,
12790 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12791 s, RE_SV_TAIL(r->float_utf8),
12792 (IV)r->float_min_offset, (UV)r->float_max_offset);
12794 if (r->check_substr || r->check_utf8)
12795 PerlIO_printf(Perl_debug_log,
12797 (r->check_substr == r->float_substr
12798 && r->check_utf8 == r->float_utf8
12799 ? "(checking floating" : "(checking anchored"));
12800 if (r->extflags & RXf_NOSCAN)
12801 PerlIO_printf(Perl_debug_log, " noscan");
12802 if (r->extflags & RXf_CHECK_ALL)
12803 PerlIO_printf(Perl_debug_log, " isall");
12804 if (r->check_substr || r->check_utf8)
12805 PerlIO_printf(Perl_debug_log, ") ");
12807 if (ri->regstclass) {
12808 regprop(r, sv, ri->regstclass);
12809 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
12811 if (r->extflags & RXf_ANCH) {
12812 PerlIO_printf(Perl_debug_log, "anchored");
12813 if (r->extflags & RXf_ANCH_BOL)
12814 PerlIO_printf(Perl_debug_log, "(BOL)");
12815 if (r->extflags & RXf_ANCH_MBOL)
12816 PerlIO_printf(Perl_debug_log, "(MBOL)");
12817 if (r->extflags & RXf_ANCH_SBOL)
12818 PerlIO_printf(Perl_debug_log, "(SBOL)");
12819 if (r->extflags & RXf_ANCH_GPOS)
12820 PerlIO_printf(Perl_debug_log, "(GPOS)");
12821 PerlIO_putc(Perl_debug_log, ' ');
12823 if (r->extflags & RXf_GPOS_SEEN)
12824 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
12825 if (r->intflags & PREGf_SKIP)
12826 PerlIO_printf(Perl_debug_log, "plus ");
12827 if (r->intflags & PREGf_IMPLICIT)
12828 PerlIO_printf(Perl_debug_log, "implicit ");
12829 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
12830 if (r->extflags & RXf_EVAL_SEEN)
12831 PerlIO_printf(Perl_debug_log, "with eval ");
12832 PerlIO_printf(Perl_debug_log, "\n");
12833 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
12835 PERL_ARGS_ASSERT_REGDUMP;
12836 PERL_UNUSED_CONTEXT;
12837 PERL_UNUSED_ARG(r);
12838 #endif /* DEBUGGING */
12842 - regprop - printable representation of opcode
12844 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12847 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12848 if (flags & ANYOF_INVERT) \
12849 /*make sure the invert info is in each */ \
12850 sv_catpvs(sv, "^"); \
12856 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
12861 RXi_GET_DECL(prog,progi);
12862 GET_RE_DEBUG_FLAGS_DECL;
12864 PERL_ARGS_ASSERT_REGPROP;
12868 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
12869 /* It would be nice to FAIL() here, but this may be called from
12870 regexec.c, and it would be hard to supply pRExC_state. */
12871 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
12872 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
12874 k = PL_regkind[OP(o)];
12877 sv_catpvs(sv, " ");
12878 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12879 * is a crude hack but it may be the best for now since
12880 * we have no flag "this EXACTish node was UTF-8"
12882 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12883 PERL_PV_ESCAPE_UNI_DETECT |
12884 PERL_PV_ESCAPE_NONASCII |
12885 PERL_PV_PRETTY_ELLIPSES |
12886 PERL_PV_PRETTY_LTGT |
12887 PERL_PV_PRETTY_NOCLEAR
12889 } else if (k == TRIE) {
12890 /* print the details of the trie in dumpuntil instead, as
12891 * progi->data isn't available here */
12892 const char op = OP(o);
12893 const U32 n = ARG(o);
12894 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
12895 (reg_ac_data *)progi->data->data[n] :
12897 const reg_trie_data * const trie
12898 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
12900 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
12901 DEBUG_TRIE_COMPILE_r(
12902 Perl_sv_catpvf(aTHX_ sv,
12903 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12904 (UV)trie->startstate,
12905 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
12906 (UV)trie->wordcount,
12909 (UV)TRIE_CHARCOUNT(trie),
12910 (UV)trie->uniquecharcount
12913 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12915 int rangestart = -1;
12916 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
12917 sv_catpvs(sv, "[");
12918 for (i = 0; i <= 256; i++) {
12919 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12920 if (rangestart == -1)
12922 } else if (rangestart != -1) {
12923 if (i <= rangestart + 3)
12924 for (; rangestart < i; rangestart++)
12925 put_byte(sv, rangestart);
12927 put_byte(sv, rangestart);
12928 sv_catpvs(sv, "-");
12929 put_byte(sv, i - 1);
12934 sv_catpvs(sv, "]");
12937 } else if (k == CURLY) {
12938 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
12939 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12940 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
12942 else if (k == WHILEM && o->flags) /* Ordinal/of */
12943 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
12944 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
12945 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
12946 if ( RXp_PAREN_NAMES(prog) ) {
12947 if ( k != REF || (OP(o) < NREF)) {
12948 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
12949 SV **name= av_fetch(list, ARG(o), 0 );
12951 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12954 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
12955 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
12956 I32 *nums=(I32*)SvPVX(sv_dat);
12957 SV **name= av_fetch(list, nums[0], 0 );
12960 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12961 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12962 (n ? "," : ""), (IV)nums[n]);
12964 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12968 } else if (k == GOSUB)
12969 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
12970 else if (k == VERB) {
12972 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
12973 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
12974 } else if (k == LOGICAL)
12975 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
12976 else if (k == ANYOF) {
12977 int i, rangestart = -1;
12978 const U8 flags = ANYOF_FLAGS(o);
12981 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12982 static const char * const anyofs[] = {
13015 if (flags & ANYOF_LOCALE)
13016 sv_catpvs(sv, "{loc}");
13017 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
13018 sv_catpvs(sv, "{i}");
13019 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13020 if (flags & ANYOF_INVERT)
13021 sv_catpvs(sv, "^");
13023 /* output what the standard cp 0-255 bitmap matches */
13024 for (i = 0; i <= 256; i++) {
13025 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13026 if (rangestart == -1)
13028 } else if (rangestart != -1) {
13029 if (i <= rangestart + 3)
13030 for (; rangestart < i; rangestart++)
13031 put_byte(sv, rangestart);
13033 put_byte(sv, rangestart);
13034 sv_catpvs(sv, "-");
13035 put_byte(sv, i - 1);
13042 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13043 /* output any special charclass tests (used entirely under use locale) */
13044 if (ANYOF_CLASS_TEST_ANY_SET(o))
13045 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13046 if (ANYOF_CLASS_TEST(o,i)) {
13047 sv_catpv(sv, anyofs[i]);
13051 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13053 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13054 sv_catpvs(sv, "{non-utf8-latin1-all}");
13057 /* output information about the unicode matching */
13058 if (flags & ANYOF_UNICODE_ALL)
13059 sv_catpvs(sv, "{unicode_all}");
13060 else if (ANYOF_NONBITMAP(o))
13061 sv_catpvs(sv, "{unicode}");
13062 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13063 sv_catpvs(sv, "{outside bitmap}");
13065 if (ANYOF_NONBITMAP(o)) {
13066 SV *lv; /* Set if there is something outside the bit map */
13067 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
13068 bool byte_output = FALSE; /* If something in the bitmap has been
13071 if (lv && lv != &PL_sv_undef) {
13073 U8 s[UTF8_MAXBYTES_CASE+1];
13075 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13076 uvchr_to_utf8(s, i);
13079 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13083 && swash_fetch(sw, s, TRUE))
13085 if (rangestart == -1)
13087 } else if (rangestart != -1) {
13088 byte_output = TRUE;
13089 if (i <= rangestart + 3)
13090 for (; rangestart < i; rangestart++) {
13091 put_byte(sv, rangestart);
13094 put_byte(sv, rangestart);
13095 sv_catpvs(sv, "-");
13104 char *s = savesvpv(lv);
13105 char * const origs = s;
13107 while (*s && *s != '\n')
13111 const char * const t = ++s;
13114 sv_catpvs(sv, " ");
13120 /* Truncate very long output */
13121 if (s - origs > 256) {
13122 Perl_sv_catpvf(aTHX_ sv,
13124 (int) (s - origs - 1),
13130 else if (*s == '\t') {
13149 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
13151 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
13152 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
13154 PERL_UNUSED_CONTEXT;
13155 PERL_UNUSED_ARG(sv);
13156 PERL_UNUSED_ARG(o);
13157 PERL_UNUSED_ARG(prog);
13158 #endif /* DEBUGGING */
13162 Perl_re_intuit_string(pTHX_ REGEXP * const r)
13163 { /* Assume that RE_INTUIT is set */
13165 struct regexp *const prog = (struct regexp *)SvANY(r);
13166 GET_RE_DEBUG_FLAGS_DECL;
13168 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
13169 PERL_UNUSED_CONTEXT;
13173 const char * const s = SvPV_nolen_const(prog->check_substr
13174 ? prog->check_substr : prog->check_utf8);
13176 if (!PL_colorset) reginitcolors();
13177 PerlIO_printf(Perl_debug_log,
13178 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
13180 prog->check_substr ? "" : "utf8 ",
13181 PL_colors[5],PL_colors[0],
13184 (strlen(s) > 60 ? "..." : ""));
13187 return prog->check_substr ? prog->check_substr : prog->check_utf8;
13193 handles refcounting and freeing the perl core regexp structure. When
13194 it is necessary to actually free the structure the first thing it
13195 does is call the 'free' method of the regexp_engine associated to
13196 the regexp, allowing the handling of the void *pprivate; member
13197 first. (This routine is not overridable by extensions, which is why
13198 the extensions free is called first.)
13200 See regdupe and regdupe_internal if you change anything here.
13202 #ifndef PERL_IN_XSUB_RE
13204 Perl_pregfree(pTHX_ REGEXP *r)
13210 Perl_pregfree2(pTHX_ REGEXP *rx)
13213 struct regexp *const r = (struct regexp *)SvANY(rx);
13214 GET_RE_DEBUG_FLAGS_DECL;
13216 PERL_ARGS_ASSERT_PREGFREE2;
13218 if (r->mother_re) {
13219 ReREFCNT_dec(r->mother_re);
13221 CALLREGFREE_PVT(rx); /* free the private data */
13222 SvREFCNT_dec(RXp_PAREN_NAMES(r));
13225 SvREFCNT_dec(r->anchored_substr);
13226 SvREFCNT_dec(r->anchored_utf8);
13227 SvREFCNT_dec(r->float_substr);
13228 SvREFCNT_dec(r->float_utf8);
13229 Safefree(r->substrs);
13231 RX_MATCH_COPY_FREE(rx);
13232 #ifdef PERL_OLD_COPY_ON_WRITE
13233 SvREFCNT_dec(r->saved_copy);
13236 SvREFCNT_dec(r->qr_anoncv);
13241 This is a hacky workaround to the structural issue of match results
13242 being stored in the regexp structure which is in turn stored in
13243 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
13244 could be PL_curpm in multiple contexts, and could require multiple
13245 result sets being associated with the pattern simultaneously, such
13246 as when doing a recursive match with (??{$qr})
13248 The solution is to make a lightweight copy of the regexp structure
13249 when a qr// is returned from the code executed by (??{$qr}) this
13250 lightweight copy doesn't actually own any of its data except for
13251 the starp/end and the actual regexp structure itself.
13257 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
13259 struct regexp *ret;
13260 struct regexp *const r = (struct regexp *)SvANY(rx);
13261 register const I32 npar = r->nparens+1;
13263 PERL_ARGS_ASSERT_REG_TEMP_COPY;
13266 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
13267 ret = (struct regexp *)SvANY(ret_x);
13269 (void)ReREFCNT_inc(rx);
13270 /* We can take advantage of the existing "copied buffer" mechanism in SVs
13271 by pointing directly at the buffer, but flagging that the allocated
13272 space in the copy is zero. As we've just done a struct copy, it's now
13273 a case of zero-ing that, rather than copying the current length. */
13274 SvPV_set(ret_x, RX_WRAPPED(rx));
13275 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
13276 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
13277 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
13278 SvLEN_set(ret_x, 0);
13279 SvSTASH_set(ret_x, NULL);
13280 SvMAGIC_set(ret_x, NULL);
13281 Newx(ret->offs, npar, regexp_paren_pair);
13282 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13284 Newx(ret->substrs, 1, struct reg_substr_data);
13285 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13287 SvREFCNT_inc_void(ret->anchored_substr);
13288 SvREFCNT_inc_void(ret->anchored_utf8);
13289 SvREFCNT_inc_void(ret->float_substr);
13290 SvREFCNT_inc_void(ret->float_utf8);
13292 /* check_substr and check_utf8, if non-NULL, point to either their
13293 anchored or float namesakes, and don't hold a second reference. */
13295 RX_MATCH_COPIED_off(ret_x);
13296 #ifdef PERL_OLD_COPY_ON_WRITE
13297 ret->saved_copy = NULL;
13299 ret->mother_re = rx;
13300 SvREFCNT_inc_void(ret->qr_anoncv);
13306 /* regfree_internal()
13308 Free the private data in a regexp. This is overloadable by
13309 extensions. Perl takes care of the regexp structure in pregfree(),
13310 this covers the *pprivate pointer which technically perl doesn't
13311 know about, however of course we have to handle the
13312 regexp_internal structure when no extension is in use.
13314 Note this is called before freeing anything in the regexp
13319 Perl_regfree_internal(pTHX_ REGEXP * const rx)
13322 struct regexp *const r = (struct regexp *)SvANY(rx);
13323 RXi_GET_DECL(r,ri);
13324 GET_RE_DEBUG_FLAGS_DECL;
13326 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
13332 SV *dsv= sv_newmortal();
13333 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
13334 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
13335 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
13336 PL_colors[4],PL_colors[5],s);
13339 #ifdef RE_TRACK_PATTERN_OFFSETS
13341 Safefree(ri->u.offsets); /* 20010421 MJD */
13343 if (ri->code_blocks) {
13345 for (n = 0; n < ri->num_code_blocks; n++)
13346 SvREFCNT_dec(ri->code_blocks[n].src_regex);
13347 Safefree(ri->code_blocks);
13351 int n = ri->data->count;
13352 PAD* new_comppad = NULL;
13357 /* If you add a ->what type here, update the comment in regcomp.h */
13358 switch (ri->data->what[n]) {
13364 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
13367 Safefree(ri->data->data[n]);
13370 new_comppad = MUTABLE_AV(ri->data->data[n]);
13373 if (new_comppad == NULL)
13374 Perl_croak(aTHX_ "panic: pregfree comppad");
13375 PAD_SAVE_LOCAL(old_comppad,
13376 /* Watch out for global destruction's random ordering. */
13377 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
13380 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
13383 op_free((OP_4tree*)ri->data->data[n]);
13385 PAD_RESTORE_LOCAL(old_comppad);
13386 SvREFCNT_dec(MUTABLE_SV(new_comppad));
13387 new_comppad = NULL;
13394 { /* Aho Corasick add-on structure for a trie node.
13395 Used in stclass optimization only */
13397 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
13399 refcount = --aho->refcount;
13402 PerlMemShared_free(aho->states);
13403 PerlMemShared_free(aho->fail);
13404 /* do this last!!!! */
13405 PerlMemShared_free(ri->data->data[n]);
13406 PerlMemShared_free(ri->regstclass);
13412 /* trie structure. */
13414 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
13416 refcount = --trie->refcount;
13419 PerlMemShared_free(trie->charmap);
13420 PerlMemShared_free(trie->states);
13421 PerlMemShared_free(trie->trans);
13423 PerlMemShared_free(trie->bitmap);
13425 PerlMemShared_free(trie->jump);
13426 PerlMemShared_free(trie->wordinfo);
13427 /* do this last!!!! */
13428 PerlMemShared_free(ri->data->data[n]);
13433 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
13436 Safefree(ri->data->what);
13437 Safefree(ri->data);
13443 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
13444 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
13445 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
13448 re_dup - duplicate a regexp.
13450 This routine is expected to clone a given regexp structure. It is only
13451 compiled under USE_ITHREADS.
13453 After all of the core data stored in struct regexp is duplicated
13454 the regexp_engine.dupe method is used to copy any private data
13455 stored in the *pprivate pointer. This allows extensions to handle
13456 any duplication it needs to do.
13458 See pregfree() and regfree_internal() if you change anything here.
13460 #if defined(USE_ITHREADS)
13461 #ifndef PERL_IN_XSUB_RE
13463 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
13467 const struct regexp *r = (const struct regexp *)SvANY(sstr);
13468 struct regexp *ret = (struct regexp *)SvANY(dstr);
13470 PERL_ARGS_ASSERT_RE_DUP_GUTS;
13472 npar = r->nparens+1;
13473 Newx(ret->offs, npar, regexp_paren_pair);
13474 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
13476 /* no need to copy these */
13477 Newx(ret->swap, npar, regexp_paren_pair);
13480 if (ret->substrs) {
13481 /* Do it this way to avoid reading from *r after the StructCopy().
13482 That way, if any of the sv_dup_inc()s dislodge *r from the L1
13483 cache, it doesn't matter. */
13484 const bool anchored = r->check_substr
13485 ? r->check_substr == r->anchored_substr
13486 : r->check_utf8 == r->anchored_utf8;
13487 Newx(ret->substrs, 1, struct reg_substr_data);
13488 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
13490 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
13491 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
13492 ret->float_substr = sv_dup_inc(ret->float_substr, param);
13493 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
13495 /* check_substr and check_utf8, if non-NULL, point to either their
13496 anchored or float namesakes, and don't hold a second reference. */
13498 if (ret->check_substr) {
13500 assert(r->check_utf8 == r->anchored_utf8);
13501 ret->check_substr = ret->anchored_substr;
13502 ret->check_utf8 = ret->anchored_utf8;
13504 assert(r->check_substr == r->float_substr);
13505 assert(r->check_utf8 == r->float_utf8);
13506 ret->check_substr = ret->float_substr;
13507 ret->check_utf8 = ret->float_utf8;
13509 } else if (ret->check_utf8) {
13511 ret->check_utf8 = ret->anchored_utf8;
13513 ret->check_utf8 = ret->float_utf8;
13518 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
13519 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
13522 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
13524 if (RX_MATCH_COPIED(dstr))
13525 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
13527 ret->subbeg = NULL;
13528 #ifdef PERL_OLD_COPY_ON_WRITE
13529 ret->saved_copy = NULL;
13532 if (ret->mother_re) {
13533 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
13534 /* Our storage points directly to our mother regexp, but that's
13535 1: a buffer in a different thread
13536 2: something we no longer hold a reference on
13537 so we need to copy it locally. */
13538 /* Note we need to use SvCUR(), rather than
13539 SvLEN(), on our mother_re, because it, in
13540 turn, may well be pointing to its own mother_re. */
13541 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
13542 SvCUR(ret->mother_re)+1));
13543 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
13545 ret->mother_re = NULL;
13549 #endif /* PERL_IN_XSUB_RE */
13554 This is the internal complement to regdupe() which is used to copy
13555 the structure pointed to by the *pprivate pointer in the regexp.
13556 This is the core version of the extension overridable cloning hook.
13557 The regexp structure being duplicated will be copied by perl prior
13558 to this and will be provided as the regexp *r argument, however
13559 with the /old/ structures pprivate pointer value. Thus this routine
13560 may override any copying normally done by perl.
13562 It returns a pointer to the new regexp_internal structure.
13566 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
13569 struct regexp *const r = (struct regexp *)SvANY(rx);
13570 regexp_internal *reti;
13572 RXi_GET_DECL(r,ri);
13574 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
13578 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
13579 Copy(ri->program, reti->program, len+1, regnode);
13581 reti->num_code_blocks = ri->num_code_blocks;
13582 if (ri->code_blocks) {
13584 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
13585 struct reg_code_block);
13586 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
13587 struct reg_code_block);
13588 for (n = 0; n < ri->num_code_blocks; n++)
13589 reti->code_blocks[n].src_regex = (REGEXP*)
13590 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
13593 reti->code_blocks = NULL;
13595 reti->regstclass = NULL;
13598 struct reg_data *d;
13599 const int count = ri->data->count;
13602 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
13603 char, struct reg_data);
13604 Newx(d->what, count, U8);
13607 for (i = 0; i < count; i++) {
13608 d->what[i] = ri->data->what[i];
13609 switch (d->what[i]) {
13610 /* legal options are one of: sSfpontTua
13611 see also regcomp.h and pregfree() */
13612 case 'a': /* actually an AV, but the dup function is identical. */
13616 case 'p': /* actually an AV, but the dup function is identical. */
13617 case 'u': /* actually an HV, but the dup function is identical. */
13618 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
13621 /* This is cheating. */
13622 Newx(d->data[i], 1, struct regnode_charclass_class);
13623 StructCopy(ri->data->data[i], d->data[i],
13624 struct regnode_charclass_class);
13625 reti->regstclass = (regnode*)d->data[i];
13628 /* Compiled op trees are readonly and in shared memory,
13629 and can thus be shared without duplication. */
13631 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
13635 /* Trie stclasses are readonly and can thus be shared
13636 * without duplication. We free the stclass in pregfree
13637 * when the corresponding reg_ac_data struct is freed.
13639 reti->regstclass= ri->regstclass;
13643 ((reg_trie_data*)ri->data->data[i])->refcount++;
13649 d->data[i] = ri->data->data[i];
13652 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
13661 reti->name_list_idx = ri->name_list_idx;
13663 #ifdef RE_TRACK_PATTERN_OFFSETS
13664 if (ri->u.offsets) {
13665 Newx(reti->u.offsets, 2*len+1, U32);
13666 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13669 SetProgLen(reti,len);
13672 return (void*)reti;
13675 #endif /* USE_ITHREADS */
13677 #ifndef PERL_IN_XSUB_RE
13680 - regnext - dig the "next" pointer out of a node
13683 Perl_regnext(pTHX_ register regnode *p)
13686 register I32 offset;
13691 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13692 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13695 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13704 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
13707 STRLEN l1 = strlen(pat1);
13708 STRLEN l2 = strlen(pat2);
13711 const char *message;
13713 PERL_ARGS_ASSERT_RE_CROAK2;
13719 Copy(pat1, buf, l1 , char);
13720 Copy(pat2, buf + l1, l2 , char);
13721 buf[l1 + l2] = '\n';
13722 buf[l1 + l2 + 1] = '\0';
13724 /* ANSI variant takes additional second argument */
13725 va_start(args, pat2);
13729 msv = vmess(buf, &args);
13731 message = SvPV_const(msv,l1);
13734 Copy(message, buf, l1 , char);
13735 buf[l1-1] = '\0'; /* Overwrite \n */
13736 Perl_croak(aTHX_ "%s", buf);
13739 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13741 #ifndef PERL_IN_XSUB_RE
13743 Perl_save_re_context(pTHX)
13747 struct re_save_state *state;
13749 SAVEVPTR(PL_curcop);
13750 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13752 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13753 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
13754 SSPUSHUV(SAVEt_RE_STATE);
13756 Copy(&PL_reg_state, state, 1, struct re_save_state);
13758 PL_reg_start_tmp = 0;
13759 PL_reg_start_tmpl = 0;
13760 PL_reg_oldsaved = NULL;
13761 PL_reg_oldsavedlen = 0;
13762 PL_reg_maxiter = 0;
13763 PL_reg_leftiter = 0;
13764 PL_reg_poscache = NULL;
13765 PL_reg_poscache_size = 0;
13766 #ifdef PERL_OLD_COPY_ON_WRITE
13770 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13772 const REGEXP * const rx = PM_GETRE(PL_curpm);
13775 for (i = 1; i <= RX_NPARENS(rx); i++) {
13776 char digits[TYPE_CHARS(long)];
13777 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
13778 GV *const *const gvp
13779 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13782 GV * const gv = *gvp;
13783 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13793 clear_re(pTHX_ void *r)
13796 ReREFCNT_dec((REGEXP *)r);
13802 S_put_byte(pTHX_ SV *sv, int c)
13804 PERL_ARGS_ASSERT_PUT_BYTE;
13806 /* Our definition of isPRINT() ignores locales, so only bytes that are
13807 not part of UTF-8 are considered printable. I assume that the same
13808 holds for UTF-EBCDIC.
13809 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13810 which Wikipedia says:
13812 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13813 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13814 identical, to the ASCII delete (DEL) or rubout control character.
13815 ) So the old condition can be simplified to !isPRINT(c) */
13818 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13821 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13825 const char string = c;
13826 if (c == '-' || c == ']' || c == '\\' || c == '^')
13827 sv_catpvs(sv, "\\");
13828 sv_catpvn(sv, &string, 1);
13833 #define CLEAR_OPTSTART \
13834 if (optstart) STMT_START { \
13835 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
13839 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
13841 STATIC const regnode *
13842 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
13843 const regnode *last, const regnode *plast,
13844 SV* sv, I32 indent, U32 depth)
13847 register U8 op = PSEUDO; /* Arbitrary non-END op. */
13848 register const regnode *next;
13849 const regnode *optstart= NULL;
13851 RXi_GET_DECL(r,ri);
13852 GET_RE_DEBUG_FLAGS_DECL;
13854 PERL_ARGS_ASSERT_DUMPUNTIL;
13856 #ifdef DEBUG_DUMPUNTIL
13857 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13858 last ? last-start : 0,plast ? plast-start : 0);
13861 if (plast && plast < last)
13864 while (PL_regkind[op] != END && (!last || node < last)) {
13865 /* While that wasn't END last time... */
13868 if (op == CLOSE || op == WHILEM)
13870 next = regnext((regnode *)node);
13873 if (OP(node) == OPTIMIZED) {
13874 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
13881 regprop(r, sv, node);
13882 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
13883 (int)(2*indent + 1), "", SvPVX_const(sv));
13885 if (OP(node) != OPTIMIZED) {
13886 if (next == NULL) /* Next ptr. */
13887 PerlIO_printf(Perl_debug_log, " (0)");
13888 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13889 PerlIO_printf(Perl_debug_log, " (FAIL)");
13891 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13892 (void)PerlIO_putc(Perl_debug_log, '\n');
13896 if (PL_regkind[(U8)op] == BRANCHJ) {
13899 register const regnode *nnode = (OP(next) == LONGJMP
13900 ? regnext((regnode *)next)
13902 if (last && nnode > last)
13904 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
13907 else if (PL_regkind[(U8)op] == BRANCH) {
13909 DUMPUNTIL(NEXTOPER(node), next);
13911 else if ( PL_regkind[(U8)op] == TRIE ) {
13912 const regnode *this_trie = node;
13913 const char op = OP(node);
13914 const U32 n = ARG(node);
13915 const reg_ac_data * const ac = op>=AHOCORASICK ?
13916 (reg_ac_data *)ri->data->data[n] :
13918 const reg_trie_data * const trie =
13919 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
13921 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
13923 const regnode *nextbranch= NULL;
13926 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
13927 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
13929 PerlIO_printf(Perl_debug_log, "%*s%s ",
13930 (int)(2*(indent+3)), "",
13931 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
13932 PL_colors[0], PL_colors[1],
13933 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
13934 PERL_PV_PRETTY_ELLIPSES |
13935 PERL_PV_PRETTY_LTGT
13940 U16 dist= trie->jump[word_idx+1];
13941 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13942 (UV)((dist ? this_trie + dist : next) - start));
13945 nextbranch= this_trie + trie->jump[0];
13946 DUMPUNTIL(this_trie + dist, nextbranch);
13948 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13949 nextbranch= regnext((regnode *)nextbranch);
13951 PerlIO_printf(Perl_debug_log, "\n");
13954 if (last && next > last)
13959 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13960 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13961 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
13963 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
13965 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
13967 else if ( op == PLUS || op == STAR) {
13968 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
13970 else if (PL_regkind[(U8)op] == ANYOF) {
13971 /* arglen 1 + class block */
13972 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
13973 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13974 node = NEXTOPER(node);
13976 else if (PL_regkind[(U8)op] == EXACT) {
13977 /* Literal string, where present. */
13978 node += NODE_SZ_STR(node) - 1;
13979 node = NEXTOPER(node);
13982 node = NEXTOPER(node);
13983 node += regarglen[(U8)op];
13985 if (op == CURLYX || op == OPEN)
13989 #ifdef DEBUG_DUMPUNTIL
13990 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
13995 #endif /* DEBUGGING */
13999 * c-indentation-style: bsd
14000 * c-basic-offset: 4
14001 * indent-tabs-mode: nil
14004 * ex: set ts=8 sts=4 sw=4 et: