5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define STATIC static
105 #define MIN(a,b) ((a) < (b) ? (a) : (b))
108 /* this is a chain of data about sub patterns we are processing that
109 need to be handled separately/specially in study_chunk. Its so
110 we can simulate recursion without losing state. */
112 typedef struct scan_frame {
113 regnode *last_regnode; /* last node to process in this frame */
114 regnode *next_regnode; /* next node to process when last is reached */
115 U32 prev_recursed_depth;
116 I32 stopparen; /* what stopparen do we use */
117 U32 is_top_frame; /* what flags do we use? */
119 struct scan_frame *this_prev_frame; /* this previous frame */
120 struct scan_frame *prev_frame; /* previous frame */
121 struct scan_frame *next_frame; /* next frame */
124 /* Certain characters are output as a sequence with the first being a
126 #define isBACKSLASHED_PUNCT(c) \
127 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
130 struct RExC_state_t {
131 U32 flags; /* RXf_* are we folding, multilining? */
132 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
133 char *precomp; /* uncompiled string. */
134 char *precomp_end; /* pointer to end of uncompiled string. */
135 REGEXP *rx_sv; /* The SV that is the regexp. */
136 regexp *rx; /* perl core regexp structure */
137 regexp_internal *rxi; /* internal data for regexp object
139 char *start; /* Start of input for compile */
140 char *end; /* End of input for compile */
141 char *parse; /* Input-scan pointer. */
142 char *adjusted_start; /* 'start', adjusted. See code use */
143 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
144 SSize_t whilem_seen; /* number of WHILEM in this expr */
145 regnode *emit_start; /* Start of emitted-code area */
146 regnode *emit_bound; /* First regnode outside of the
148 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
149 implies compiling, so don't emit */
150 regnode_ssc emit_dummy; /* placeholder for emit to point to;
151 large enough for the largest
152 non-EXACTish node, so can use it as
154 I32 naughty; /* How bad is this pattern? */
155 I32 sawback; /* Did we see \1, ...? */
157 SSize_t size; /* Code size. */
158 I32 npar; /* Capture buffer count, (OPEN) plus
159 one. ("par" 0 is the whole
161 I32 nestroot; /* root parens we are in - used by
165 regnode **open_parens; /* pointers to open parens */
166 regnode **close_parens; /* pointers to close parens */
167 regnode *opend; /* END node in program */
168 I32 utf8; /* whether the pattern is utf8 or not */
169 I32 orig_utf8; /* whether the pattern was originally in utf8 */
170 /* XXX use this for future optimisation of case
171 * where pattern must be upgraded to utf8. */
172 I32 uni_semantics; /* If a d charset modifier should use unicode
173 rules, even if the pattern is not in
175 HV *paren_names; /* Paren names */
177 regnode **recurse; /* Recurse regops */
178 I32 recurse_count; /* Number of recurse regops */
179 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
181 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
185 I32 override_recoding;
187 I32 recode_x_to_native;
189 I32 in_multi_char_class;
190 struct reg_code_block *code_blocks; /* positions of literal (?{})
192 int num_code_blocks; /* size of code_blocks[] */
193 int code_index; /* next code_blocks[] slot */
194 SSize_t maxlen; /* mininum possible number of chars in string to match */
195 scan_frame *frame_head;
196 scan_frame *frame_last;
199 #ifdef ADD_TO_REGEXEC
200 char *starttry; /* -Dr: where regtry was called. */
201 #define RExC_starttry (pRExC_state->starttry)
203 SV *runtime_code_qr; /* qr with the runtime code blocks */
205 const char *lastparse;
207 AV *paren_name_list; /* idx -> name */
208 U32 study_chunk_recursed_count;
211 #define RExC_lastparse (pRExC_state->lastparse)
212 #define RExC_lastnum (pRExC_state->lastnum)
213 #define RExC_paren_name_list (pRExC_state->paren_name_list)
214 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
215 #define RExC_mysv (pRExC_state->mysv1)
216 #define RExC_mysv1 (pRExC_state->mysv1)
217 #define RExC_mysv2 (pRExC_state->mysv2)
220 bool seen_unfolded_sharp_s;
223 #define RExC_flags (pRExC_state->flags)
224 #define RExC_pm_flags (pRExC_state->pm_flags)
225 #define RExC_precomp (pRExC_state->precomp)
226 #define RExC_precomp_adj (pRExC_state->precomp_adj)
227 #define RExC_adjusted_start (pRExC_state->adjusted_start)
228 #define RExC_precomp_end (pRExC_state->precomp_end)
229 #define RExC_rx_sv (pRExC_state->rx_sv)
230 #define RExC_rx (pRExC_state->rx)
231 #define RExC_rxi (pRExC_state->rxi)
232 #define RExC_start (pRExC_state->start)
233 #define RExC_end (pRExC_state->end)
234 #define RExC_parse (pRExC_state->parse)
235 #define RExC_whilem_seen (pRExC_state->whilem_seen)
237 /* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
238 * EXACTF node, hence was parsed under /di rules. If later in the parse,
239 * something forces the pattern into using /ui rules, the sharp s should be
240 * folded into the sequence 'ss', which takes up more space than previously
241 * calculated. This means that the sizing pass needs to be restarted. (The
242 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
243 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
244 * so there is no need to resize [perl #125990]. */
245 #define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
247 #ifdef RE_TRACK_PATTERN_OFFSETS
248 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
251 #define RExC_emit (pRExC_state->emit)
252 #define RExC_emit_dummy (pRExC_state->emit_dummy)
253 #define RExC_emit_start (pRExC_state->emit_start)
254 #define RExC_emit_bound (pRExC_state->emit_bound)
255 #define RExC_sawback (pRExC_state->sawback)
256 #define RExC_seen (pRExC_state->seen)
257 #define RExC_size (pRExC_state->size)
258 #define RExC_maxlen (pRExC_state->maxlen)
259 #define RExC_npar (pRExC_state->npar)
260 #define RExC_nestroot (pRExC_state->nestroot)
261 #define RExC_extralen (pRExC_state->extralen)
262 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
263 #define RExC_utf8 (pRExC_state->utf8)
264 #define RExC_uni_semantics (pRExC_state->uni_semantics)
265 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
266 #define RExC_open_parens (pRExC_state->open_parens)
267 #define RExC_close_parens (pRExC_state->close_parens)
268 #define RExC_opend (pRExC_state->opend)
269 #define RExC_paren_names (pRExC_state->paren_names)
270 #define RExC_recurse (pRExC_state->recurse)
271 #define RExC_recurse_count (pRExC_state->recurse_count)
272 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
273 #define RExC_study_chunk_recursed_bytes \
274 (pRExC_state->study_chunk_recursed_bytes)
275 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
276 #define RExC_contains_locale (pRExC_state->contains_locale)
277 #define RExC_contains_i (pRExC_state->contains_i)
278 #define RExC_override_recoding (pRExC_state->override_recoding)
280 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
282 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
283 #define RExC_frame_head (pRExC_state->frame_head)
284 #define RExC_frame_last (pRExC_state->frame_last)
285 #define RExC_frame_count (pRExC_state->frame_count)
286 #define RExC_strict (pRExC_state->strict)
288 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
289 * a flag to disable back-off on the fixed/floating substrings - if it's
290 * a high complexity pattern we assume the benefit of avoiding a full match
291 * is worth the cost of checking for the substrings even if they rarely help.
293 #define RExC_naughty (pRExC_state->naughty)
294 #define TOO_NAUGHTY (10)
295 #define MARK_NAUGHTY(add) \
296 if (RExC_naughty < TOO_NAUGHTY) \
297 RExC_naughty += (add)
298 #define MARK_NAUGHTY_EXP(exp, add) \
299 if (RExC_naughty < TOO_NAUGHTY) \
300 RExC_naughty += RExC_naughty / (exp) + (add)
302 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
303 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
304 ((*s) == '{' && regcurly(s)))
307 * Flags to be passed up and down.
309 #define WORST 0 /* Worst case. */
310 #define HASWIDTH 0x01 /* Known to match non-null strings. */
312 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
313 * character. (There needs to be a case: in the switch statement in regexec.c
314 * for any node marked SIMPLE.) Note that this is not the same thing as
317 #define SPSTART 0x04 /* Starts with * or + */
318 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
319 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
320 #define RESTART_PASS1 0x20 /* Need to restart sizing pass */
321 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
322 calcuate sizes as UTF-8 */
324 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
326 /* whether trie related optimizations are enabled */
327 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
328 #define TRIE_STUDY_OPT
329 #define FULL_TRIE_STUDY
335 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
336 #define PBITVAL(paren) (1 << ((paren) & 7))
337 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
338 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
339 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
341 #define REQUIRE_UTF8(flagp) STMT_START { \
344 *flagp = RESTART_PASS1|NEED_UTF8; \
349 /* Change from /d into /u rules, and restart the parse if we've already seen
350 * something whose size would increase as a result, by setting *flagp and
351 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
352 * we've change to /u during the parse. */
353 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
355 if (DEPENDS_SEMANTICS) { \
357 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
358 RExC_uni_semantics = 1; \
359 if (RExC_seen_unfolded_sharp_s) { \
360 *flagp |= RESTART_PASS1; \
361 return restart_retval; \
366 /* This converts the named class defined in regcomp.h to its equivalent class
367 * number defined in handy.h. */
368 #define namedclass_to_classnum(class) ((int) ((class) / 2))
369 #define classnum_to_namedclass(classnum) ((classnum) * 2)
371 #define _invlist_union_complement_2nd(a, b, output) \
372 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
373 #define _invlist_intersection_complement_2nd(a, b, output) \
374 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
376 /* About scan_data_t.
378 During optimisation we recurse through the regexp program performing
379 various inplace (keyhole style) optimisations. In addition study_chunk
380 and scan_commit populate this data structure with information about
381 what strings MUST appear in the pattern. We look for the longest
382 string that must appear at a fixed location, and we look for the
383 longest string that may appear at a floating location. So for instance
388 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
389 strings (because they follow a .* construct). study_chunk will identify
390 both FOO and BAR as being the longest fixed and floating strings respectively.
392 The strings can be composites, for instance
396 will result in a composite fixed substring 'foo'.
398 For each string some basic information is maintained:
400 - offset or min_offset
401 This is the position the string must appear at, or not before.
402 It also implicitly (when combined with minlenp) tells us how many
403 characters must match before the string we are searching for.
404 Likewise when combined with minlenp and the length of the string it
405 tells us how many characters must appear after the string we have
409 Only used for floating strings. This is the rightmost point that
410 the string can appear at. If set to SSize_t_MAX it indicates that the
411 string can occur infinitely far to the right.
414 A pointer to the minimum number of characters of the pattern that the
415 string was found inside. This is important as in the case of positive
416 lookahead or positive lookbehind we can have multiple patterns
421 The minimum length of the pattern overall is 3, the minimum length
422 of the lookahead part is 3, but the minimum length of the part that
423 will actually match is 1. So 'FOO's minimum length is 3, but the
424 minimum length for the F is 1. This is important as the minimum length
425 is used to determine offsets in front of and behind the string being
426 looked for. Since strings can be composites this is the length of the
427 pattern at the time it was committed with a scan_commit. Note that
428 the length is calculated by study_chunk, so that the minimum lengths
429 are not known until the full pattern has been compiled, thus the
430 pointer to the value.
434 In the case of lookbehind the string being searched for can be
435 offset past the start point of the final matching string.
436 If this value was just blithely removed from the min_offset it would
437 invalidate some of the calculations for how many chars must match
438 before or after (as they are derived from min_offset and minlen and
439 the length of the string being searched for).
440 When the final pattern is compiled and the data is moved from the
441 scan_data_t structure into the regexp structure the information
442 about lookbehind is factored in, with the information that would
443 have been lost precalculated in the end_shift field for the
446 The fields pos_min and pos_delta are used to store the minimum offset
447 and the delta to the maximum offset at the current point in the pattern.
451 typedef struct scan_data_t {
452 /*I32 len_min; unused */
453 /*I32 len_delta; unused */
457 SSize_t last_end; /* min value, <0 unless valid. */
458 SSize_t last_start_min;
459 SSize_t last_start_max;
460 SV **longest; /* Either &l_fixed, or &l_float. */
461 SV *longest_fixed; /* longest fixed string found in pattern */
462 SSize_t offset_fixed; /* offset where it starts */
463 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
464 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
465 SV *longest_float; /* longest floating string found in pattern */
466 SSize_t offset_float_min; /* earliest point in string it can appear */
467 SSize_t offset_float_max; /* latest point in string it can appear */
468 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
469 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
472 SSize_t *last_closep;
473 regnode_ssc *start_class;
477 * Forward declarations for pregcomp()'s friends.
480 static const scan_data_t zero_scan_data =
481 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
483 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
484 #define SF_BEFORE_SEOL 0x0001
485 #define SF_BEFORE_MEOL 0x0002
486 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
487 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
489 #define SF_FIX_SHIFT_EOL (+2)
490 #define SF_FL_SHIFT_EOL (+4)
492 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
493 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
495 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
496 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
497 #define SF_IS_INF 0x0040
498 #define SF_HAS_PAR 0x0080
499 #define SF_IN_PAR 0x0100
500 #define SF_HAS_EVAL 0x0200
501 #define SCF_DO_SUBSTR 0x0400
502 #define SCF_DO_STCLASS_AND 0x0800
503 #define SCF_DO_STCLASS_OR 0x1000
504 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
505 #define SCF_WHILEM_VISITED_POS 0x2000
507 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
508 #define SCF_SEEN_ACCEPT 0x8000
509 #define SCF_TRIE_DOING_RESTUDY 0x10000
510 #define SCF_IN_DEFINE 0x20000
515 #define UTF cBOOL(RExC_utf8)
517 /* The enums for all these are ordered so things work out correctly */
518 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
519 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
520 == REGEX_DEPENDS_CHARSET)
521 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
522 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
523 >= REGEX_UNICODE_CHARSET)
524 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
525 == REGEX_ASCII_RESTRICTED_CHARSET)
526 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
527 >= REGEX_ASCII_RESTRICTED_CHARSET)
528 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
529 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
531 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
533 /* For programs that want to be strictly Unicode compatible by dying if any
534 * attempt is made to match a non-Unicode code point against a Unicode
536 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
538 #define OOB_NAMEDCLASS -1
540 /* There is no code point that is out-of-bounds, so this is problematic. But
541 * its only current use is to initialize a variable that is always set before
543 #define OOB_UNICODE 0xDEADBEEF
545 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
546 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
549 /* length of regex to show in messages that don't mark a position within */
550 #define RegexLengthToShowInErrorMessages 127
553 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
554 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
555 * op/pragma/warn/regcomp.
557 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
558 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
560 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
561 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
563 /* The code in this file in places uses one level of recursion with parsing
564 * rebased to an alternate string constructed by us in memory. This can take
565 * the form of something that is completely different from the input, or
566 * something that uses the input as part of the alternate. In the first case,
567 * there should be no possibility of an error, as we are in complete control of
568 * the alternate string. But in the second case we don't control the input
569 * portion, so there may be errors in that. Here's an example:
571 * is handled specially because \x{df} folds to a sequence of more than one
572 * character, 'ss'. What is done is to create and parse an alternate string,
573 * which looks like this:
574 * /(?:\x{DF}|[abc\x{DF}def])/ui
575 * where it uses the input unchanged in the middle of something it constructs,
576 * which is a branch for the DF outside the character class, and clustering
577 * parens around the whole thing. (It knows enough to skip the DF inside the
578 * class while in this substitute parse.) 'abc' and 'def' may have errors that
579 * need to be reported. The general situation looks like this:
582 * Input: ----------------------------------------------------
583 * Constructed: ---------------------------------------------------
586 * The input string sI..eI is the input pattern. The string sC..EC is the
587 * constructed substitute parse string. The portions sC..tC and eC..EC are
588 * constructed by us. The portion tC..eC is an exact duplicate of the input
589 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
590 * while parsing, we find an error at xC. We want to display a message showing
591 * the real input string. Thus we need to find the point xI in it which
592 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
593 * been constructed by us, and so shouldn't have errors. We get:
595 * xI = sI + (tI - sI) + (xC - tC)
597 * and, the offset into sI is:
599 * (xI - sI) = (tI - sI) + (xC - tC)
601 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
602 * and we save tC as RExC_adjusted_start.
604 * During normal processing of the input pattern, everything points to that,
605 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
608 #define tI_sI RExC_precomp_adj
609 #define tC RExC_adjusted_start
610 #define sC RExC_precomp
611 #define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
612 #define xI(xC) (sC + xI_offset(xC))
613 #define eC RExC_precomp_end
615 #define REPORT_LOCATION_ARGS(xC) \
617 (xI(xC) > eC) /* Don't run off end */ \
618 ? eC - sC /* Length before the <--HERE */ \
620 sC), /* The input pattern printed up to the <--HERE */ \
622 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
623 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
625 /* Used to point after bad bytes for an error message, but avoid skipping
626 * past a nul byte. */
627 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
630 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
631 * arg. Show regex, up to a maximum length. If it's too long, chop and add
634 #define _FAIL(code) STMT_START { \
635 const char *ellipses = ""; \
636 IV len = RExC_precomp_end - RExC_precomp; \
639 SAVEFREESV(RExC_rx_sv); \
640 if (len > RegexLengthToShowInErrorMessages) { \
641 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
642 len = RegexLengthToShowInErrorMessages - 10; \
648 #define FAIL(msg) _FAIL( \
649 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
650 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
652 #define FAIL2(msg,arg) _FAIL( \
653 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
654 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
657 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
659 #define Simple_vFAIL(m) STMT_START { \
660 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
661 m, REPORT_LOCATION_ARGS(RExC_parse)); \
665 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
667 #define vFAIL(m) STMT_START { \
669 SAVEFREESV(RExC_rx_sv); \
674 * Like Simple_vFAIL(), but accepts two arguments.
676 #define Simple_vFAIL2(m,a1) STMT_START { \
677 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
678 REPORT_LOCATION_ARGS(RExC_parse)); \
682 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
684 #define vFAIL2(m,a1) STMT_START { \
686 SAVEFREESV(RExC_rx_sv); \
687 Simple_vFAIL2(m, a1); \
692 * Like Simple_vFAIL(), but accepts three arguments.
694 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
695 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
696 REPORT_LOCATION_ARGS(RExC_parse)); \
700 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
702 #define vFAIL3(m,a1,a2) STMT_START { \
704 SAVEFREESV(RExC_rx_sv); \
705 Simple_vFAIL3(m, a1, a2); \
709 * Like Simple_vFAIL(), but accepts four arguments.
711 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
712 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
713 REPORT_LOCATION_ARGS(RExC_parse)); \
716 #define vFAIL4(m,a1,a2,a3) STMT_START { \
718 SAVEFREESV(RExC_rx_sv); \
719 Simple_vFAIL4(m, a1, a2, a3); \
722 /* A specialized version of vFAIL2 that works with UTF8f */
723 #define vFAIL2utf8f(m, a1) STMT_START { \
725 SAVEFREESV(RExC_rx_sv); \
726 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
727 REPORT_LOCATION_ARGS(RExC_parse)); \
730 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
732 SAVEFREESV(RExC_rx_sv); \
733 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
734 REPORT_LOCATION_ARGS(RExC_parse)); \
737 /* These have asserts in them because of [perl #122671] Many warnings in
738 * regcomp.c can occur twice. If they get output in pass1 and later in that
739 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
740 * would get output again. So they should be output in pass2, and these
741 * asserts make sure new warnings follow that paradigm. */
743 /* m is not necessarily a "literal string", in this macro */
744 #define reg_warn_non_literal_string(loc, m) STMT_START { \
745 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
746 "%s" REPORT_LOCATION, \
747 m, REPORT_LOCATION_ARGS(loc)); \
750 #define ckWARNreg(loc,m) STMT_START { \
751 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
753 REPORT_LOCATION_ARGS(loc)); \
756 #define vWARN(loc, m) STMT_START { \
757 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
759 REPORT_LOCATION_ARGS(loc)); \
762 #define vWARN_dep(loc, m) STMT_START { \
763 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
765 REPORT_LOCATION_ARGS(loc)); \
768 #define ckWARNdep(loc,m) STMT_START { \
769 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
771 REPORT_LOCATION_ARGS(loc)); \
774 #define ckWARNregdep(loc,m) STMT_START { \
775 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
778 REPORT_LOCATION_ARGS(loc)); \
781 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
782 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
784 a1, REPORT_LOCATION_ARGS(loc)); \
787 #define ckWARN2reg(loc, m, a1) STMT_START { \
788 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
790 a1, REPORT_LOCATION_ARGS(loc)); \
793 #define vWARN3(loc, m, a1, a2) STMT_START { \
794 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
796 a1, a2, REPORT_LOCATION_ARGS(loc)); \
799 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
800 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
803 REPORT_LOCATION_ARGS(loc)); \
806 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
807 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
810 REPORT_LOCATION_ARGS(loc)); \
813 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
814 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
817 REPORT_LOCATION_ARGS(loc)); \
820 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
821 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
824 REPORT_LOCATION_ARGS(loc)); \
827 /* Macros for recording node offsets. 20001227 mjd@plover.com
828 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
829 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
830 * Element 0 holds the number n.
831 * Position is 1 indexed.
833 #ifndef RE_TRACK_PATTERN_OFFSETS
834 #define Set_Node_Offset_To_R(node,byte)
835 #define Set_Node_Offset(node,byte)
836 #define Set_Cur_Node_Offset
837 #define Set_Node_Length_To_R(node,len)
838 #define Set_Node_Length(node,len)
839 #define Set_Node_Cur_Length(node,start)
840 #define Node_Offset(n)
841 #define Node_Length(n)
842 #define Set_Node_Offset_Length(node,offset,len)
843 #define ProgLen(ri) ri->u.proglen
844 #define SetProgLen(ri,x) ri->u.proglen = x
846 #define ProgLen(ri) ri->u.offsets[0]
847 #define SetProgLen(ri,x) ri->u.offsets[0] = x
848 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
850 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
851 __LINE__, (int)(node), (int)(byte))); \
853 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
856 RExC_offsets[2*(node)-1] = (byte); \
861 #define Set_Node_Offset(node,byte) \
862 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
863 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
865 #define Set_Node_Length_To_R(node,len) STMT_START { \
867 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
868 __LINE__, (int)(node), (int)(len))); \
870 Perl_croak(aTHX_ "value of node is %d in Length macro", \
873 RExC_offsets[2*(node)] = (len); \
878 #define Set_Node_Length(node,len) \
879 Set_Node_Length_To_R((node)-RExC_emit_start, len)
880 #define Set_Node_Cur_Length(node, start) \
881 Set_Node_Length(node, RExC_parse - start)
883 /* Get offsets and lengths */
884 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
885 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
887 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
888 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
889 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
893 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
894 #define EXPERIMENTAL_INPLACESCAN
895 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
897 #define DEBUG_RExC_seen() \
898 DEBUG_OPTIMISE_MORE_r({ \
899 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
901 if (RExC_seen & REG_ZERO_LEN_SEEN) \
902 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
904 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
905 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
907 if (RExC_seen & REG_GPOS_SEEN) \
908 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
910 if (RExC_seen & REG_RECURSE_SEEN) \
911 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
913 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
914 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
916 if (RExC_seen & REG_VERBARG_SEEN) \
917 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
919 if (RExC_seen & REG_CUTGROUP_SEEN) \
920 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
922 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
923 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
925 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
926 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
928 if (RExC_seen & REG_GOSTART_SEEN) \
929 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
931 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
932 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
934 PerlIO_printf(Perl_debug_log,"\n"); \
937 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
938 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
940 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
942 PerlIO_printf(Perl_debug_log, "%s", open_str); \
943 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
944 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
945 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
946 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
947 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
948 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
949 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
950 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
951 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
952 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
953 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
954 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
955 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
956 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
957 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
958 PerlIO_printf(Perl_debug_log, "%s", close_str); \
962 #define DEBUG_STUDYDATA(str,data,depth) \
963 DEBUG_OPTIMISE_MORE_r(if(data){ \
964 PerlIO_printf(Perl_debug_log, \
965 "%*s" str "Pos:%"IVdf"/%"IVdf \
967 (int)(depth)*2, "", \
968 (IV)((data)->pos_min), \
969 (IV)((data)->pos_delta), \
970 (UV)((data)->flags) \
972 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
973 PerlIO_printf(Perl_debug_log, \
974 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
975 (IV)((data)->whilem_c), \
976 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
977 is_inf ? "INF " : "" \
979 if ((data)->last_found) \
980 PerlIO_printf(Perl_debug_log, \
981 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
982 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
983 SvPVX_const((data)->last_found), \
984 (IV)((data)->last_end), \
985 (IV)((data)->last_start_min), \
986 (IV)((data)->last_start_max), \
987 ((data)->longest && \
988 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
989 SvPVX_const((data)->longest_fixed), \
990 (IV)((data)->offset_fixed), \
991 ((data)->longest && \
992 (data)->longest==&((data)->longest_float)) ? "*" : "", \
993 SvPVX_const((data)->longest_float), \
994 (IV)((data)->offset_float_min), \
995 (IV)((data)->offset_float_max) \
997 PerlIO_printf(Perl_debug_log,"\n"); \
1000 /* is c a control character for which we have a mnemonic? */
1001 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1004 S_cntrl_to_mnemonic(const U8 c)
1006 /* Returns the mnemonic string that represents character 'c', if one
1007 * exists; NULL otherwise. The only ones that exist for the purposes of
1008 * this routine are a few control characters */
1011 case '\a': return "\\a";
1012 case '\b': return "\\b";
1013 case ESC_NATIVE: return "\\e";
1014 case '\f': return "\\f";
1015 case '\n': return "\\n";
1016 case '\r': return "\\r";
1017 case '\t': return "\\t";
1023 /* Mark that we cannot extend a found fixed substring at this point.
1024 Update the longest found anchored substring and the longest found
1025 floating substrings if needed. */
1028 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1029 SSize_t *minlenp, int is_inf)
1031 const STRLEN l = CHR_SVLEN(data->last_found);
1032 const STRLEN old_l = CHR_SVLEN(*data->longest);
1033 GET_RE_DEBUG_FLAGS_DECL;
1035 PERL_ARGS_ASSERT_SCAN_COMMIT;
1037 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1038 SvSetMagicSV(*data->longest, data->last_found);
1039 if (*data->longest == data->longest_fixed) {
1040 data->offset_fixed = l ? data->last_start_min : data->pos_min;
1041 if (data->flags & SF_BEFORE_EOL)
1043 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1045 data->flags &= ~SF_FIX_BEFORE_EOL;
1046 data->minlen_fixed=minlenp;
1047 data->lookbehind_fixed=0;
1049 else { /* *data->longest == data->longest_float */
1050 data->offset_float_min = l ? data->last_start_min : data->pos_min;
1051 data->offset_float_max = (l
1052 ? data->last_start_max
1053 : (data->pos_delta > SSize_t_MAX - data->pos_min
1055 : data->pos_min + data->pos_delta));
1057 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1058 data->offset_float_max = SSize_t_MAX;
1059 if (data->flags & SF_BEFORE_EOL)
1061 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1063 data->flags &= ~SF_FL_BEFORE_EOL;
1064 data->minlen_float=minlenp;
1065 data->lookbehind_float=0;
1068 SvCUR_set(data->last_found, 0);
1070 SV * const sv = data->last_found;
1071 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1072 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1077 data->last_end = -1;
1078 data->flags &= ~SF_BEFORE_EOL;
1079 DEBUG_STUDYDATA("commit: ",data,0);
1082 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1083 * list that describes which code points it matches */
1086 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1088 /* Set the SSC 'ssc' to match an empty string or any code point */
1090 PERL_ARGS_ASSERT_SSC_ANYTHING;
1092 assert(is_ANYOF_SYNTHETIC(ssc));
1094 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1095 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1096 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1100 S_ssc_is_anything(const regnode_ssc *ssc)
1102 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1103 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1104 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1105 * in any way, so there's no point in using it */
1110 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1112 assert(is_ANYOF_SYNTHETIC(ssc));
1114 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1118 /* See if the list consists solely of the range 0 - Infinity */
1119 invlist_iterinit(ssc->invlist);
1120 ret = invlist_iternext(ssc->invlist, &start, &end)
1124 invlist_iterfinish(ssc->invlist);
1130 /* If e.g., both \w and \W are set, matches everything */
1131 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1133 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1134 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1144 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1146 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1147 * string, any code point, or any posix class under locale */
1149 PERL_ARGS_ASSERT_SSC_INIT;
1151 Zero(ssc, 1, regnode_ssc);
1152 set_ANYOF_SYNTHETIC(ssc);
1153 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1156 /* If any portion of the regex is to operate under locale rules that aren't
1157 * fully known at compile time, initialization includes it. The reason
1158 * this isn't done for all regexes is that the optimizer was written under
1159 * the assumption that locale was all-or-nothing. Given the complexity and
1160 * lack of documentation in the optimizer, and that there are inadequate
1161 * test cases for locale, many parts of it may not work properly, it is
1162 * safest to avoid locale unless necessary. */
1163 if (RExC_contains_locale) {
1164 ANYOF_POSIXL_SETALL(ssc);
1167 ANYOF_POSIXL_ZERO(ssc);
1172 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1173 const regnode_ssc *ssc)
1175 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1176 * to the list of code points matched, and locale posix classes; hence does
1177 * not check its flags) */
1182 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1184 assert(is_ANYOF_SYNTHETIC(ssc));
1186 invlist_iterinit(ssc->invlist);
1187 ret = invlist_iternext(ssc->invlist, &start, &end)
1191 invlist_iterfinish(ssc->invlist);
1197 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1205 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1206 const regnode_charclass* const node)
1208 /* Returns a mortal inversion list defining which code points are matched
1209 * by 'node', which is of type ANYOF. Handles complementing the result if
1210 * appropriate. If some code points aren't knowable at this time, the
1211 * returned list must, and will, contain every code point that is a
1214 SV* invlist = sv_2mortal(_new_invlist(0));
1215 SV* only_utf8_locale_invlist = NULL;
1217 const U32 n = ARG(node);
1218 bool new_node_has_latin1 = FALSE;
1220 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1222 /* Look at the data structure created by S_set_ANYOF_arg() */
1223 if (n != ANYOF_ONLY_HAS_BITMAP) {
1224 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1225 AV * const av = MUTABLE_AV(SvRV(rv));
1226 SV **const ary = AvARRAY(av);
1227 assert(RExC_rxi->data->what[n] == 's');
1229 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1230 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1232 else if (ary[0] && ary[0] != &PL_sv_undef) {
1234 /* Here, no compile-time swash, and there are things that won't be
1235 * known until runtime -- we have to assume it could be anything */
1236 return _add_range_to_invlist(invlist, 0, UV_MAX);
1238 else if (ary[3] && ary[3] != &PL_sv_undef) {
1240 /* Here no compile-time swash, and no run-time only data. Use the
1241 * node's inversion list */
1242 invlist = sv_2mortal(invlist_clone(ary[3]));
1245 /* Get the code points valid only under UTF-8 locales */
1246 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1247 && ary[2] && ary[2] != &PL_sv_undef)
1249 only_utf8_locale_invlist = ary[2];
1253 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1254 * code points, and an inversion list for the others, but if there are code
1255 * points that should match only conditionally on the target string being
1256 * UTF-8, those are placed in the inversion list, and not the bitmap.
1257 * Since there are circumstances under which they could match, they are
1258 * included in the SSC. But if the ANYOF node is to be inverted, we have
1259 * to exclude them here, so that when we invert below, the end result
1260 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1261 * have to do this here before we add the unconditionally matched code
1263 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1264 _invlist_intersection_complement_2nd(invlist,
1269 /* Add in the points from the bit map */
1270 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1271 if (ANYOF_BITMAP_TEST(node, i)) {
1272 invlist = add_cp_to_invlist(invlist, i);
1273 new_node_has_latin1 = TRUE;
1277 /* If this can match all upper Latin1 code points, have to add them
1279 if (OP(node) == ANYOFD
1280 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1282 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1285 /* Similarly for these */
1286 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1287 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1290 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1291 _invlist_invert(invlist);
1293 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1295 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1296 * locale. We can skip this if there are no 0-255 at all. */
1297 _invlist_union(invlist, PL_Latin1, &invlist);
1300 /* Similarly add the UTF-8 locale possible matches. These have to be
1301 * deferred until after the non-UTF-8 locale ones are taken care of just
1302 * above, or it leads to wrong results under ANYOF_INVERT */
1303 if (only_utf8_locale_invlist) {
1304 _invlist_union_maybe_complement_2nd(invlist,
1305 only_utf8_locale_invlist,
1306 ANYOF_FLAGS(node) & ANYOF_INVERT,
1313 /* These two functions currently do the exact same thing */
1314 #define ssc_init_zero ssc_init
1316 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1317 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1319 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1320 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1321 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1324 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1325 const regnode_charclass *and_with)
1327 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1328 * another SSC or a regular ANYOF class. Can create false positives. */
1333 PERL_ARGS_ASSERT_SSC_AND;
1335 assert(is_ANYOF_SYNTHETIC(ssc));
1337 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1338 * the code point inversion list and just the relevant flags */
1339 if (is_ANYOF_SYNTHETIC(and_with)) {
1340 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1341 anded_flags = ANYOF_FLAGS(and_with);
1343 /* XXX This is a kludge around what appears to be deficiencies in the
1344 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1345 * there are paths through the optimizer where it doesn't get weeded
1346 * out when it should. And if we don't make some extra provision for
1347 * it like the code just below, it doesn't get added when it should.
1348 * This solution is to add it only when AND'ing, which is here, and
1349 * only when what is being AND'ed is the pristine, original node
1350 * matching anything. Thus it is like adding it to ssc_anything() but
1351 * only when the result is to be AND'ed. Probably the same solution
1352 * could be adopted for the same problem we have with /l matching,
1353 * which is solved differently in S_ssc_init(), and that would lead to
1354 * fewer false positives than that solution has. But if this solution
1355 * creates bugs, the consequences are only that a warning isn't raised
1356 * that should be; while the consequences for having /l bugs is
1357 * incorrect matches */
1358 if (ssc_is_anything((regnode_ssc *)and_with)) {
1359 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1363 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1364 if (OP(and_with) == ANYOFD) {
1365 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1368 anded_flags = ANYOF_FLAGS(and_with)
1369 &( ANYOF_COMMON_FLAGS
1370 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1371 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1372 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1374 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1379 ANYOF_FLAGS(ssc) &= anded_flags;
1381 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1382 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1383 * 'and_with' may be inverted. When not inverted, we have the situation of
1385 * (C1 | P1) & (C2 | P2)
1386 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1387 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1388 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1389 * <= ((C1 & C2) | P1 | P2)
1390 * Alternatively, the last few steps could be:
1391 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1392 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1393 * <= (C1 | C2 | (P1 & P2))
1394 * We favor the second approach if either P1 or P2 is non-empty. This is
1395 * because these components are a barrier to doing optimizations, as what
1396 * they match cannot be known until the moment of matching as they are
1397 * dependent on the current locale, 'AND"ing them likely will reduce or
1399 * But we can do better if we know that C1,P1 are in their initial state (a
1400 * frequent occurrence), each matching everything:
1401 * (<everything>) & (C2 | P2) = C2 | P2
1402 * Similarly, if C2,P2 are in their initial state (again a frequent
1403 * occurrence), the result is a no-op
1404 * (C1 | P1) & (<everything>) = C1 | P1
1407 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1408 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1409 * <= (C1 & ~C2) | (P1 & ~P2)
1412 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1413 && ! is_ANYOF_SYNTHETIC(and_with))
1417 ssc_intersection(ssc,
1419 FALSE /* Has already been inverted */
1422 /* If either P1 or P2 is empty, the intersection will be also; can skip
1424 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1425 ANYOF_POSIXL_ZERO(ssc);
1427 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1429 /* Note that the Posix class component P from 'and_with' actually
1431 * P = Pa | Pb | ... | Pn
1432 * where each component is one posix class, such as in [\w\s].
1434 * ~P = ~(Pa | Pb | ... | Pn)
1435 * = ~Pa & ~Pb & ... & ~Pn
1436 * <= ~Pa | ~Pb | ... | ~Pn
1437 * The last is something we can easily calculate, but unfortunately
1438 * is likely to have many false positives. We could do better
1439 * in some (but certainly not all) instances if two classes in
1440 * P have known relationships. For example
1441 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1443 * :lower: & :print: = :lower:
1444 * And similarly for classes that must be disjoint. For example,
1445 * since \s and \w can have no elements in common based on rules in
1446 * the POSIX standard,
1447 * \w & ^\S = nothing
1448 * Unfortunately, some vendor locales do not meet the Posix
1449 * standard, in particular almost everything by Microsoft.
1450 * The loop below just changes e.g., \w into \W and vice versa */
1452 regnode_charclass_posixl temp;
1453 int add = 1; /* To calculate the index of the complement */
1455 ANYOF_POSIXL_ZERO(&temp);
1456 for (i = 0; i < ANYOF_MAX; i++) {
1458 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1459 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1461 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1462 ANYOF_POSIXL_SET(&temp, i + add);
1464 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1466 ANYOF_POSIXL_AND(&temp, ssc);
1468 } /* else ssc already has no posixes */
1469 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1470 in its initial state */
1471 else if (! is_ANYOF_SYNTHETIC(and_with)
1472 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1474 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1475 * copy it over 'ssc' */
1476 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1477 if (is_ANYOF_SYNTHETIC(and_with)) {
1478 StructCopy(and_with, ssc, regnode_ssc);
1481 ssc->invlist = anded_cp_list;
1482 ANYOF_POSIXL_ZERO(ssc);
1483 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1484 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1488 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1489 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1491 /* One or the other of P1, P2 is non-empty. */
1492 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1493 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1495 ssc_union(ssc, anded_cp_list, FALSE);
1497 else { /* P1 = P2 = empty */
1498 ssc_intersection(ssc, anded_cp_list, FALSE);
1504 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1505 const regnode_charclass *or_with)
1507 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1508 * another SSC or a regular ANYOF class. Can create false positives if
1509 * 'or_with' is to be inverted. */
1514 PERL_ARGS_ASSERT_SSC_OR;
1516 assert(is_ANYOF_SYNTHETIC(ssc));
1518 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1519 * the code point inversion list and just the relevant flags */
1520 if (is_ANYOF_SYNTHETIC(or_with)) {
1521 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1522 ored_flags = ANYOF_FLAGS(or_with);
1525 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1526 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1527 if (OP(or_with) != ANYOFD) {
1529 |= ANYOF_FLAGS(or_with)
1530 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1531 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1532 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1534 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1539 ANYOF_FLAGS(ssc) |= ored_flags;
1541 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1542 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1543 * 'or_with' may be inverted. When not inverted, we have the simple
1544 * situation of computing:
1545 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1546 * If P1|P2 yields a situation with both a class and its complement are
1547 * set, like having both \w and \W, this matches all code points, and we
1548 * can delete these from the P component of the ssc going forward. XXX We
1549 * might be able to delete all the P components, but I (khw) am not certain
1550 * about this, and it is better to be safe.
1553 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1554 * <= (C1 | P1) | ~C2
1555 * <= (C1 | ~C2) | P1
1556 * (which results in actually simpler code than the non-inverted case)
1559 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1560 && ! is_ANYOF_SYNTHETIC(or_with))
1562 /* We ignore P2, leaving P1 going forward */
1563 } /* else Not inverted */
1564 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1565 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1566 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1568 for (i = 0; i < ANYOF_MAX; i += 2) {
1569 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1571 ssc_match_all_cp(ssc);
1572 ANYOF_POSIXL_CLEAR(ssc, i);
1573 ANYOF_POSIXL_CLEAR(ssc, i+1);
1581 FALSE /* Already has been inverted */
1585 PERL_STATIC_INLINE void
1586 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1588 PERL_ARGS_ASSERT_SSC_UNION;
1590 assert(is_ANYOF_SYNTHETIC(ssc));
1592 _invlist_union_maybe_complement_2nd(ssc->invlist,
1598 PERL_STATIC_INLINE void
1599 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1601 const bool invert2nd)
1603 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1605 assert(is_ANYOF_SYNTHETIC(ssc));
1607 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1613 PERL_STATIC_INLINE void
1614 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1616 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1618 assert(is_ANYOF_SYNTHETIC(ssc));
1620 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1623 PERL_STATIC_INLINE void
1624 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1626 /* AND just the single code point 'cp' into the SSC 'ssc' */
1628 SV* cp_list = _new_invlist(2);
1630 PERL_ARGS_ASSERT_SSC_CP_AND;
1632 assert(is_ANYOF_SYNTHETIC(ssc));
1634 cp_list = add_cp_to_invlist(cp_list, cp);
1635 ssc_intersection(ssc, cp_list,
1636 FALSE /* Not inverted */
1638 SvREFCNT_dec_NN(cp_list);
1641 PERL_STATIC_INLINE void
1642 S_ssc_clear_locale(regnode_ssc *ssc)
1644 /* Set the SSC 'ssc' to not match any locale things */
1645 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1647 assert(is_ANYOF_SYNTHETIC(ssc));
1649 ANYOF_POSIXL_ZERO(ssc);
1650 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1653 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1656 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1658 /* The synthetic start class is used to hopefully quickly winnow down
1659 * places where a pattern could start a match in the target string. If it
1660 * doesn't really narrow things down that much, there isn't much point to
1661 * having the overhead of using it. This function uses some very crude
1662 * heuristics to decide if to use the ssc or not.
1664 * It returns TRUE if 'ssc' rules out more than half what it considers to
1665 * be the "likely" possible matches, but of course it doesn't know what the
1666 * actual things being matched are going to be; these are only guesses
1668 * For /l matches, it assumes that the only likely matches are going to be
1669 * in the 0-255 range, uniformly distributed, so half of that is 127
1670 * For /a and /d matches, it assumes that the likely matches will be just
1671 * the ASCII range, so half of that is 63
1672 * For /u and there isn't anything matching above the Latin1 range, it
1673 * assumes that that is the only range likely to be matched, and uses
1674 * half that as the cut-off: 127. If anything matches above Latin1,
1675 * it assumes that all of Unicode could match (uniformly), except for
1676 * non-Unicode code points and things in the General Category "Other"
1677 * (unassigned, private use, surrogates, controls and formats). This
1678 * is a much large number. */
1680 const U32 max_match = (LOC)
1684 : (invlist_highest(ssc->invlist) < 256)
1686 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1687 U32 count = 0; /* Running total of number of code points matched by
1689 UV start, end; /* Start and end points of current range in inversion
1692 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1694 invlist_iterinit(ssc->invlist);
1695 while (invlist_iternext(ssc->invlist, &start, &end)) {
1697 /* /u is the only thing that we expect to match above 255; so if not /u
1698 * and even if there are matches above 255, ignore them. This catches
1699 * things like \d under /d which does match the digits above 255, but
1700 * since the pattern is /d, it is not likely to be expecting them */
1701 if (! UNI_SEMANTICS) {
1705 end = MIN(end, 255);
1707 count += end - start + 1;
1708 if (count > max_match) {
1709 invlist_iterfinish(ssc->invlist);
1719 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1721 /* The inversion list in the SSC is marked mortal; now we need a more
1722 * permanent copy, which is stored the same way that is done in a regular
1723 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1726 SV* invlist = invlist_clone(ssc->invlist);
1728 PERL_ARGS_ASSERT_SSC_FINALIZE;
1730 assert(is_ANYOF_SYNTHETIC(ssc));
1732 /* The code in this file assumes that all but these flags aren't relevant
1733 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1734 * by the time we reach here */
1735 assert(! (ANYOF_FLAGS(ssc)
1736 & ~( ANYOF_COMMON_FLAGS
1737 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1738 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
1740 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1742 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1743 NULL, NULL, NULL, FALSE);
1745 /* Make sure is clone-safe */
1746 ssc->invlist = NULL;
1748 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1749 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1752 if (RExC_contains_locale) {
1756 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1759 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1760 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1761 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1762 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1763 ? (TRIE_LIST_CUR( idx ) - 1) \
1769 dump_trie(trie,widecharmap,revcharmap)
1770 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1771 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1773 These routines dump out a trie in a somewhat readable format.
1774 The _interim_ variants are used for debugging the interim
1775 tables that are used to generate the final compressed
1776 representation which is what dump_trie expects.
1778 Part of the reason for their existence is to provide a form
1779 of documentation as to how the different representations function.
1784 Dumps the final compressed table form of the trie to Perl_debug_log.
1785 Used for debugging make_trie().
1789 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1790 AV *revcharmap, U32 depth)
1793 SV *sv=sv_newmortal();
1794 int colwidth= widecharmap ? 6 : 4;
1796 GET_RE_DEBUG_FLAGS_DECL;
1798 PERL_ARGS_ASSERT_DUMP_TRIE;
1800 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1801 (int)depth * 2 + 2,"",
1802 "Match","Base","Ofs" );
1804 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1805 SV ** const tmp = av_fetch( revcharmap, state, 0);
1807 PerlIO_printf( Perl_debug_log, "%*s",
1809 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1810 PL_colors[0], PL_colors[1],
1811 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1812 PERL_PV_ESCAPE_FIRSTCHAR
1817 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1818 (int)depth * 2 + 2,"");
1820 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1821 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1822 PerlIO_printf( Perl_debug_log, "\n");
1824 for( state = 1 ; state < trie->statecount ; state++ ) {
1825 const U32 base = trie->states[ state ].trans.base;
1827 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1828 (int)depth * 2 + 2,"", (UV)state);
1830 if ( trie->states[ state ].wordnum ) {
1831 PerlIO_printf( Perl_debug_log, " W%4X",
1832 trie->states[ state ].wordnum );
1834 PerlIO_printf( Perl_debug_log, "%6s", "" );
1837 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1842 while( ( base + ofs < trie->uniquecharcount ) ||
1843 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1844 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1848 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1850 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1851 if ( ( base + ofs >= trie->uniquecharcount )
1852 && ( base + ofs - trie->uniquecharcount
1854 && trie->trans[ base + ofs
1855 - trie->uniquecharcount ].check == state )
1857 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1859 (UV)trie->trans[ base + ofs
1860 - trie->uniquecharcount ].next );
1862 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1866 PerlIO_printf( Perl_debug_log, "]");
1869 PerlIO_printf( Perl_debug_log, "\n" );
1871 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1873 for (word=1; word <= trie->wordcount; word++) {
1874 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1875 (int)word, (int)(trie->wordinfo[word].prev),
1876 (int)(trie->wordinfo[word].len));
1878 PerlIO_printf(Perl_debug_log, "\n" );
1881 Dumps a fully constructed but uncompressed trie in list form.
1882 List tries normally only are used for construction when the number of
1883 possible chars (trie->uniquecharcount) is very high.
1884 Used for debugging make_trie().
1887 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1888 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1892 SV *sv=sv_newmortal();
1893 int colwidth= widecharmap ? 6 : 4;
1894 GET_RE_DEBUG_FLAGS_DECL;
1896 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1898 /* print out the table precompression. */
1899 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1900 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1901 "------:-----+-----------------\n" );
1903 for( state=1 ; state < next_alloc ; state ++ ) {
1906 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1907 (int)depth * 2 + 2,"", (UV)state );
1908 if ( ! trie->states[ state ].wordnum ) {
1909 PerlIO_printf( Perl_debug_log, "%5s| ","");
1911 PerlIO_printf( Perl_debug_log, "W%4x| ",
1912 trie->states[ state ].wordnum
1915 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1916 SV ** const tmp = av_fetch( revcharmap,
1917 TRIE_LIST_ITEM(state,charid).forid, 0);
1919 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1921 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1923 PL_colors[0], PL_colors[1],
1924 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1925 | PERL_PV_ESCAPE_FIRSTCHAR
1927 TRIE_LIST_ITEM(state,charid).forid,
1928 (UV)TRIE_LIST_ITEM(state,charid).newstate
1931 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1932 (int)((depth * 2) + 14), "");
1935 PerlIO_printf( Perl_debug_log, "\n");
1940 Dumps a fully constructed but uncompressed trie in table form.
1941 This is the normal DFA style state transition table, with a few
1942 twists to facilitate compression later.
1943 Used for debugging make_trie().
1946 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1947 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1952 SV *sv=sv_newmortal();
1953 int colwidth= widecharmap ? 6 : 4;
1954 GET_RE_DEBUG_FLAGS_DECL;
1956 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1959 print out the table precompression so that we can do a visual check
1960 that they are identical.
1963 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1965 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1966 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1968 PerlIO_printf( Perl_debug_log, "%*s",
1970 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1971 PL_colors[0], PL_colors[1],
1972 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1973 PERL_PV_ESCAPE_FIRSTCHAR
1979 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1981 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1982 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1985 PerlIO_printf( Perl_debug_log, "\n" );
1987 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1989 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1990 (int)depth * 2 + 2,"",
1991 (UV)TRIE_NODENUM( state ) );
1993 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1994 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1996 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1998 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
2000 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2001 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
2002 (UV)trie->trans[ state ].check );
2004 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
2005 (UV)trie->trans[ state ].check,
2006 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2014 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2015 startbranch: the first branch in the whole branch sequence
2016 first : start branch of sequence of branch-exact nodes.
2017 May be the same as startbranch
2018 last : Thing following the last branch.
2019 May be the same as tail.
2020 tail : item following the branch sequence
2021 count : words in the sequence
2022 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2023 depth : indent depth
2025 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2027 A trie is an N'ary tree where the branches are determined by digital
2028 decomposition of the key. IE, at the root node you look up the 1st character and
2029 follow that branch repeat until you find the end of the branches. Nodes can be
2030 marked as "accepting" meaning they represent a complete word. Eg:
2034 would convert into the following structure. Numbers represent states, letters
2035 following numbers represent valid transitions on the letter from that state, if
2036 the number is in square brackets it represents an accepting state, otherwise it
2037 will be in parenthesis.
2039 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2043 (1) +-i->(6)-+-s->[7]
2045 +-s->(3)-+-h->(4)-+-e->[5]
2047 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2049 This shows that when matching against the string 'hers' we will begin at state 1
2050 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2051 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2052 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2053 single traverse. We store a mapping from accepting to state to which word was
2054 matched, and then when we have multiple possibilities we try to complete the
2055 rest of the regex in the order in which they occurred in the alternation.
2057 The only prior NFA like behaviour that would be changed by the TRIE support is
2058 the silent ignoring of duplicate alternations which are of the form:
2060 / (DUPE|DUPE) X? (?{ ... }) Y /x
2062 Thus EVAL blocks following a trie may be called a different number of times with
2063 and without the optimisation. With the optimisations dupes will be silently
2064 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2065 the following demonstrates:
2067 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2069 which prints out 'word' three times, but
2071 'words'=~/(word|word|word)(?{ print $1 })S/
2073 which doesnt print it out at all. This is due to other optimisations kicking in.
2075 Example of what happens on a structural level:
2077 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2079 1: CURLYM[1] {1,32767}(18)
2090 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2091 and should turn into:
2093 1: CURLYM[1] {1,32767}(18)
2095 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2103 Cases where tail != last would be like /(?foo|bar)baz/:
2113 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2114 and would end up looking like:
2117 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2124 d = uvchr_to_utf8_flags(d, uv, 0);
2126 is the recommended Unicode-aware way of saying
2131 #define TRIE_STORE_REVCHAR(val) \
2134 SV *zlopp = newSV(UTF8_MAXBYTES); \
2135 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2136 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2137 SvCUR_set(zlopp, kapow - flrbbbbb); \
2140 av_push(revcharmap, zlopp); \
2142 char ooooff = (char)val; \
2143 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2147 /* This gets the next character from the input, folding it if not already
2149 #define TRIE_READ_CHAR STMT_START { \
2152 /* if it is UTF then it is either already folded, or does not need \
2154 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2156 else if (folder == PL_fold_latin1) { \
2157 /* This folder implies Unicode rules, which in the range expressible \
2158 * by not UTF is the lower case, with the two exceptions, one of \
2159 * which should have been taken care of before calling this */ \
2160 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2161 uvc = toLOWER_L1(*uc); \
2162 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2165 /* raw data, will be folded later if needed */ \
2173 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2174 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2175 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2176 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2178 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2179 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2180 TRIE_LIST_CUR( state )++; \
2183 #define TRIE_LIST_NEW(state) STMT_START { \
2184 Newxz( trie->states[ state ].trans.list, \
2185 4, reg_trie_trans_le ); \
2186 TRIE_LIST_CUR( state ) = 1; \
2187 TRIE_LIST_LEN( state ) = 4; \
2190 #define TRIE_HANDLE_WORD(state) STMT_START { \
2191 U16 dupe= trie->states[ state ].wordnum; \
2192 regnode * const noper_next = regnext( noper ); \
2195 /* store the word for dumping */ \
2197 if (OP(noper) != NOTHING) \
2198 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2200 tmp = newSVpvn_utf8( "", 0, UTF ); \
2201 av_push( trie_words, tmp ); \
2205 trie->wordinfo[curword].prev = 0; \
2206 trie->wordinfo[curword].len = wordlen; \
2207 trie->wordinfo[curword].accept = state; \
2209 if ( noper_next < tail ) { \
2211 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2213 trie->jump[curword] = (U16)(noper_next - convert); \
2215 jumper = noper_next; \
2217 nextbranch= regnext(cur); \
2221 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2222 /* chain, so that when the bits of chain are later */\
2223 /* linked together, the dups appear in the chain */\
2224 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2225 trie->wordinfo[dupe].prev = curword; \
2227 /* we haven't inserted this word yet. */ \
2228 trie->states[ state ].wordnum = curword; \
2233 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2234 ( ( base + charid >= ucharcount \
2235 && base + charid < ubound \
2236 && state == trie->trans[ base - ucharcount + charid ].check \
2237 && trie->trans[ base - ucharcount + charid ].next ) \
2238 ? trie->trans[ base - ucharcount + charid ].next \
2239 : ( state==1 ? special : 0 ) \
2243 #define MADE_JUMP_TRIE 2
2244 #define MADE_EXACT_TRIE 4
2247 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2248 regnode *first, regnode *last, regnode *tail,
2249 U32 word_count, U32 flags, U32 depth)
2251 /* first pass, loop through and scan words */
2252 reg_trie_data *trie;
2253 HV *widecharmap = NULL;
2254 AV *revcharmap = newAV();
2260 regnode *jumper = NULL;
2261 regnode *nextbranch = NULL;
2262 regnode *convert = NULL;
2263 U32 *prev_states; /* temp array mapping each state to previous one */
2264 /* we just use folder as a flag in utf8 */
2265 const U8 * folder = NULL;
2268 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2269 AV *trie_words = NULL;
2270 /* along with revcharmap, this only used during construction but both are
2271 * useful during debugging so we store them in the struct when debugging.
2274 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2275 STRLEN trie_charcount=0;
2277 SV *re_trie_maxbuff;
2278 GET_RE_DEBUG_FLAGS_DECL;
2280 PERL_ARGS_ASSERT_MAKE_TRIE;
2282 PERL_UNUSED_ARG(depth);
2286 case EXACT: case EXACTL: break;
2290 case EXACTFLU8: folder = PL_fold_latin1; break;
2291 case EXACTF: folder = PL_fold; break;
2292 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2295 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2297 trie->startstate = 1;
2298 trie->wordcount = word_count;
2299 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2300 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2301 if (flags == EXACT || flags == EXACTL)
2302 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2303 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2304 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2307 trie_words = newAV();
2310 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2311 assert(re_trie_maxbuff);
2312 if (!SvIOK(re_trie_maxbuff)) {
2313 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2315 DEBUG_TRIE_COMPILE_r({
2316 PerlIO_printf( Perl_debug_log,
2317 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2318 (int)depth * 2 + 2, "",
2319 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2320 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2323 /* Find the node we are going to overwrite */
2324 if ( first == startbranch && OP( last ) != BRANCH ) {
2325 /* whole branch chain */
2328 /* branch sub-chain */
2329 convert = NEXTOPER( first );
2332 /* -- First loop and Setup --
2334 We first traverse the branches and scan each word to determine if it
2335 contains widechars, and how many unique chars there are, this is
2336 important as we have to build a table with at least as many columns as we
2339 We use an array of integers to represent the character codes 0..255
2340 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2341 the native representation of the character value as the key and IV's for
2344 *TODO* If we keep track of how many times each character is used we can
2345 remap the columns so that the table compression later on is more
2346 efficient in terms of memory by ensuring the most common value is in the
2347 middle and the least common are on the outside. IMO this would be better
2348 than a most to least common mapping as theres a decent chance the most
2349 common letter will share a node with the least common, meaning the node
2350 will not be compressible. With a middle is most common approach the worst
2351 case is when we have the least common nodes twice.
2355 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2356 regnode *noper = NEXTOPER( cur );
2357 const U8 *uc = (U8*)STRING( noper );
2358 const U8 *e = uc + STR_LEN( noper );
2360 U32 wordlen = 0; /* required init */
2361 STRLEN minchars = 0;
2362 STRLEN maxchars = 0;
2363 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2366 if (OP(noper) == NOTHING) {
2367 regnode *noper_next= regnext(noper);
2368 if (noper_next != tail && OP(noper_next) == flags) {
2370 uc= (U8*)STRING(noper);
2371 e= uc + STR_LEN(noper);
2372 trie->minlen= STR_LEN(noper);
2379 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2380 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2381 regardless of encoding */
2382 if (OP( noper ) == EXACTFU_SS) {
2383 /* false positives are ok, so just set this */
2384 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2387 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2389 TRIE_CHARCOUNT(trie)++;
2392 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2393 * is in effect. Under /i, this character can match itself, or
2394 * anything that folds to it. If not under /i, it can match just
2395 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2396 * all fold to k, and all are single characters. But some folds
2397 * expand to more than one character, so for example LATIN SMALL
2398 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2399 * the string beginning at 'uc' is 'ffi', it could be matched by
2400 * three characters, or just by the one ligature character. (It
2401 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2402 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2403 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2404 * match.) The trie needs to know the minimum and maximum number
2405 * of characters that could match so that it can use size alone to
2406 * quickly reject many match attempts. The max is simple: it is
2407 * the number of folded characters in this branch (since a fold is
2408 * never shorter than what folds to it. */
2412 /* And the min is equal to the max if not under /i (indicated by
2413 * 'folder' being NULL), or there are no multi-character folds. If
2414 * there is a multi-character fold, the min is incremented just
2415 * once, for the character that folds to the sequence. Each
2416 * character in the sequence needs to be added to the list below of
2417 * characters in the trie, but we count only the first towards the
2418 * min number of characters needed. This is done through the
2419 * variable 'foldlen', which is returned by the macros that look
2420 * for these sequences as the number of bytes the sequence
2421 * occupies. Each time through the loop, we decrement 'foldlen' by
2422 * how many bytes the current char occupies. Only when it reaches
2423 * 0 do we increment 'minchars' or look for another multi-character
2425 if (folder == NULL) {
2428 else if (foldlen > 0) {
2429 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2434 /* See if *uc is the beginning of a multi-character fold. If
2435 * so, we decrement the length remaining to look at, to account
2436 * for the current character this iteration. (We can use 'uc'
2437 * instead of the fold returned by TRIE_READ_CHAR because for
2438 * non-UTF, the latin1_safe macro is smart enough to account
2439 * for all the unfolded characters, and because for UTF, the
2440 * string will already have been folded earlier in the
2441 * compilation process */
2443 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2444 foldlen -= UTF8SKIP(uc);
2447 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2452 /* The current character (and any potential folds) should be added
2453 * to the possible matching characters for this position in this
2457 U8 folded= folder[ (U8) uvc ];
2458 if ( !trie->charmap[ folded ] ) {
2459 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2460 TRIE_STORE_REVCHAR( folded );
2463 if ( !trie->charmap[ uvc ] ) {
2464 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2465 TRIE_STORE_REVCHAR( uvc );
2468 /* store the codepoint in the bitmap, and its folded
2470 TRIE_BITMAP_SET(trie, uvc);
2472 /* store the folded codepoint */
2473 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2476 /* store first byte of utf8 representation of
2477 variant codepoints */
2478 if (! UVCHR_IS_INVARIANT(uvc)) {
2479 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2482 set_bit = 0; /* We've done our bit :-) */
2486 /* XXX We could come up with the list of code points that fold
2487 * to this using PL_utf8_foldclosures, except not for
2488 * multi-char folds, as there may be multiple combinations
2489 * there that could work, which needs to wait until runtime to
2490 * resolve (The comment about LIGATURE FFI above is such an
2495 widecharmap = newHV();
2497 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2500 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2502 if ( !SvTRUE( *svpp ) ) {
2503 sv_setiv( *svpp, ++trie->uniquecharcount );
2504 TRIE_STORE_REVCHAR(uvc);
2507 } /* end loop through characters in this branch of the trie */
2509 /* We take the min and max for this branch and combine to find the min
2510 * and max for all branches processed so far */
2511 if( cur == first ) {
2512 trie->minlen = minchars;
2513 trie->maxlen = maxchars;
2514 } else if (minchars < trie->minlen) {
2515 trie->minlen = minchars;
2516 } else if (maxchars > trie->maxlen) {
2517 trie->maxlen = maxchars;
2519 } /* end first pass */
2520 DEBUG_TRIE_COMPILE_r(
2521 PerlIO_printf( Perl_debug_log,
2522 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2523 (int)depth * 2 + 2,"",
2524 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2525 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2526 (int)trie->minlen, (int)trie->maxlen )
2530 We now know what we are dealing with in terms of unique chars and
2531 string sizes so we can calculate how much memory a naive
2532 representation using a flat table will take. If it's over a reasonable
2533 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2534 conservative but potentially much slower representation using an array
2537 At the end we convert both representations into the same compressed
2538 form that will be used in regexec.c for matching with. The latter
2539 is a form that cannot be used to construct with but has memory
2540 properties similar to the list form and access properties similar
2541 to the table form making it both suitable for fast searches and
2542 small enough that its feasable to store for the duration of a program.
2544 See the comment in the code where the compressed table is produced
2545 inplace from the flat tabe representation for an explanation of how
2546 the compression works.
2551 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2554 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2555 > SvIV(re_trie_maxbuff) )
2558 Second Pass -- Array Of Lists Representation
2560 Each state will be represented by a list of charid:state records
2561 (reg_trie_trans_le) the first such element holds the CUR and LEN
2562 points of the allocated array. (See defines above).
2564 We build the initial structure using the lists, and then convert
2565 it into the compressed table form which allows faster lookups
2566 (but cant be modified once converted).
2569 STRLEN transcount = 1;
2571 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2572 "%*sCompiling trie using list compiler\n",
2573 (int)depth * 2 + 2, ""));
2575 trie->states = (reg_trie_state *)
2576 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2577 sizeof(reg_trie_state) );
2581 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2583 regnode *noper = NEXTOPER( cur );
2584 U8 *uc = (U8*)STRING( noper );
2585 const U8 *e = uc + STR_LEN( noper );
2586 U32 state = 1; /* required init */
2587 U16 charid = 0; /* sanity init */
2588 U32 wordlen = 0; /* required init */
2590 if (OP(noper) == NOTHING) {
2591 regnode *noper_next= regnext(noper);
2592 if (noper_next != tail && OP(noper_next) == flags) {
2594 uc= (U8*)STRING(noper);
2595 e= uc + STR_LEN(noper);
2599 if (OP(noper) != NOTHING) {
2600 for ( ; uc < e ; uc += len ) {
2605 charid = trie->charmap[ uvc ];
2607 SV** const svpp = hv_fetch( widecharmap,
2614 charid=(U16)SvIV( *svpp );
2617 /* charid is now 0 if we dont know the char read, or
2618 * nonzero if we do */
2625 if ( !trie->states[ state ].trans.list ) {
2626 TRIE_LIST_NEW( state );
2629 check <= TRIE_LIST_USED( state );
2632 if ( TRIE_LIST_ITEM( state, check ).forid
2635 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2640 newstate = next_alloc++;
2641 prev_states[newstate] = state;
2642 TRIE_LIST_PUSH( state, charid, newstate );
2647 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2651 TRIE_HANDLE_WORD(state);
2653 } /* end second pass */
2655 /* next alloc is the NEXT state to be allocated */
2656 trie->statecount = next_alloc;
2657 trie->states = (reg_trie_state *)
2658 PerlMemShared_realloc( trie->states,
2660 * sizeof(reg_trie_state) );
2662 /* and now dump it out before we compress it */
2663 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2664 revcharmap, next_alloc,
2668 trie->trans = (reg_trie_trans *)
2669 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2676 for( state=1 ; state < next_alloc ; state ++ ) {
2680 DEBUG_TRIE_COMPILE_MORE_r(
2681 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2685 if (trie->states[state].trans.list) {
2686 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2690 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2691 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2692 if ( forid < minid ) {
2694 } else if ( forid > maxid ) {
2698 if ( transcount < tp + maxid - minid + 1) {
2700 trie->trans = (reg_trie_trans *)
2701 PerlMemShared_realloc( trie->trans,
2703 * sizeof(reg_trie_trans) );
2704 Zero( trie->trans + (transcount / 2),
2708 base = trie->uniquecharcount + tp - minid;
2709 if ( maxid == minid ) {
2711 for ( ; zp < tp ; zp++ ) {
2712 if ( ! trie->trans[ zp ].next ) {
2713 base = trie->uniquecharcount + zp - minid;
2714 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2716 trie->trans[ zp ].check = state;
2722 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2724 trie->trans[ tp ].check = state;
2729 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2730 const U32 tid = base
2731 - trie->uniquecharcount
2732 + TRIE_LIST_ITEM( state, idx ).forid;
2733 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2735 trie->trans[ tid ].check = state;
2737 tp += ( maxid - minid + 1 );
2739 Safefree(trie->states[ state ].trans.list);
2742 DEBUG_TRIE_COMPILE_MORE_r(
2743 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2746 trie->states[ state ].trans.base=base;
2748 trie->lasttrans = tp + 1;
2752 Second Pass -- Flat Table Representation.
2754 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2755 each. We know that we will need Charcount+1 trans at most to store
2756 the data (one row per char at worst case) So we preallocate both
2757 structures assuming worst case.
2759 We then construct the trie using only the .next slots of the entry
2762 We use the .check field of the first entry of the node temporarily
2763 to make compression both faster and easier by keeping track of how
2764 many non zero fields are in the node.
2766 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2769 There are two terms at use here: state as a TRIE_NODEIDX() which is
2770 a number representing the first entry of the node, and state as a
2771 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2772 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2773 if there are 2 entrys per node. eg:
2781 The table is internally in the right hand, idx form. However as we
2782 also have to deal with the states array which is indexed by nodenum
2783 we have to use TRIE_NODENUM() to convert.
2786 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2787 "%*sCompiling trie using table compiler\n",
2788 (int)depth * 2 + 2, ""));
2790 trie->trans = (reg_trie_trans *)
2791 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2792 * trie->uniquecharcount + 1,
2793 sizeof(reg_trie_trans) );
2794 trie->states = (reg_trie_state *)
2795 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2796 sizeof(reg_trie_state) );
2797 next_alloc = trie->uniquecharcount + 1;
2800 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2802 regnode *noper = NEXTOPER( cur );
2803 const U8 *uc = (U8*)STRING( noper );
2804 const U8 *e = uc + STR_LEN( noper );
2806 U32 state = 1; /* required init */
2808 U16 charid = 0; /* sanity init */
2809 U32 accept_state = 0; /* sanity init */
2811 U32 wordlen = 0; /* required init */
2813 if (OP(noper) == NOTHING) {
2814 regnode *noper_next= regnext(noper);
2815 if (noper_next != tail && OP(noper_next) == flags) {
2817 uc= (U8*)STRING(noper);
2818 e= uc + STR_LEN(noper);
2822 if ( OP(noper) != NOTHING ) {
2823 for ( ; uc < e ; uc += len ) {
2828 charid = trie->charmap[ uvc ];
2830 SV* const * const svpp = hv_fetch( widecharmap,
2834 charid = svpp ? (U16)SvIV(*svpp) : 0;
2838 if ( !trie->trans[ state + charid ].next ) {
2839 trie->trans[ state + charid ].next = next_alloc;
2840 trie->trans[ state ].check++;
2841 prev_states[TRIE_NODENUM(next_alloc)]
2842 = TRIE_NODENUM(state);
2843 next_alloc += trie->uniquecharcount;
2845 state = trie->trans[ state + charid ].next;
2847 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2849 /* charid is now 0 if we dont know the char read, or
2850 * nonzero if we do */
2853 accept_state = TRIE_NODENUM( state );
2854 TRIE_HANDLE_WORD(accept_state);
2856 } /* end second pass */
2858 /* and now dump it out before we compress it */
2859 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2861 next_alloc, depth+1));
2865 * Inplace compress the table.*
2867 For sparse data sets the table constructed by the trie algorithm will
2868 be mostly 0/FAIL transitions or to put it another way mostly empty.
2869 (Note that leaf nodes will not contain any transitions.)
2871 This algorithm compresses the tables by eliminating most such
2872 transitions, at the cost of a modest bit of extra work during lookup:
2874 - Each states[] entry contains a .base field which indicates the
2875 index in the state[] array wheres its transition data is stored.
2877 - If .base is 0 there are no valid transitions from that node.
2879 - If .base is nonzero then charid is added to it to find an entry in
2882 -If trans[states[state].base+charid].check!=state then the
2883 transition is taken to be a 0/Fail transition. Thus if there are fail
2884 transitions at the front of the node then the .base offset will point
2885 somewhere inside the previous nodes data (or maybe even into a node
2886 even earlier), but the .check field determines if the transition is
2890 The following process inplace converts the table to the compressed
2891 table: We first do not compress the root node 1,and mark all its
2892 .check pointers as 1 and set its .base pointer as 1 as well. This
2893 allows us to do a DFA construction from the compressed table later,
2894 and ensures that any .base pointers we calculate later are greater
2897 - We set 'pos' to indicate the first entry of the second node.
2899 - We then iterate over the columns of the node, finding the first and
2900 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2901 and set the .check pointers accordingly, and advance pos
2902 appropriately and repreat for the next node. Note that when we copy
2903 the next pointers we have to convert them from the original
2904 NODEIDX form to NODENUM form as the former is not valid post
2907 - If a node has no transitions used we mark its base as 0 and do not
2908 advance the pos pointer.
2910 - If a node only has one transition we use a second pointer into the
2911 structure to fill in allocated fail transitions from other states.
2912 This pointer is independent of the main pointer and scans forward
2913 looking for null transitions that are allocated to a state. When it
2914 finds one it writes the single transition into the "hole". If the
2915 pointer doesnt find one the single transition is appended as normal.
2917 - Once compressed we can Renew/realloc the structures to release the
2920 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2921 specifically Fig 3.47 and the associated pseudocode.
2925 const U32 laststate = TRIE_NODENUM( next_alloc );
2928 trie->statecount = laststate;
2930 for ( state = 1 ; state < laststate ; state++ ) {
2932 const U32 stateidx = TRIE_NODEIDX( state );
2933 const U32 o_used = trie->trans[ stateidx ].check;
2934 U32 used = trie->trans[ stateidx ].check;
2935 trie->trans[ stateidx ].check = 0;
2938 used && charid < trie->uniquecharcount;
2941 if ( flag || trie->trans[ stateidx + charid ].next ) {
2942 if ( trie->trans[ stateidx + charid ].next ) {
2944 for ( ; zp < pos ; zp++ ) {
2945 if ( ! trie->trans[ zp ].next ) {
2949 trie->states[ state ].trans.base
2951 + trie->uniquecharcount
2953 trie->trans[ zp ].next
2954 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2956 trie->trans[ zp ].check = state;
2957 if ( ++zp > pos ) pos = zp;
2964 trie->states[ state ].trans.base
2965 = pos + trie->uniquecharcount - charid ;
2967 trie->trans[ pos ].next
2968 = SAFE_TRIE_NODENUM(
2969 trie->trans[ stateidx + charid ].next );
2970 trie->trans[ pos ].check = state;
2975 trie->lasttrans = pos + 1;
2976 trie->states = (reg_trie_state *)
2977 PerlMemShared_realloc( trie->states, laststate
2978 * sizeof(reg_trie_state) );
2979 DEBUG_TRIE_COMPILE_MORE_r(
2980 PerlIO_printf( Perl_debug_log,
2981 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2982 (int)depth * 2 + 2,"",
2983 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2987 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2990 } /* end table compress */
2992 DEBUG_TRIE_COMPILE_MORE_r(
2993 PerlIO_printf(Perl_debug_log,
2994 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2995 (int)depth * 2 + 2, "",
2996 (UV)trie->statecount,
2997 (UV)trie->lasttrans)
2999 /* resize the trans array to remove unused space */
3000 trie->trans = (reg_trie_trans *)
3001 PerlMemShared_realloc( trie->trans, trie->lasttrans
3002 * sizeof(reg_trie_trans) );
3004 { /* Modify the program and insert the new TRIE node */
3005 U8 nodetype =(U8)(flags & 0xFF);
3009 regnode *optimize = NULL;
3010 #ifdef RE_TRACK_PATTERN_OFFSETS
3013 U32 mjd_nodelen = 0;
3014 #endif /* RE_TRACK_PATTERN_OFFSETS */
3015 #endif /* DEBUGGING */
3017 This means we convert either the first branch or the first Exact,
3018 depending on whether the thing following (in 'last') is a branch
3019 or not and whther first is the startbranch (ie is it a sub part of
3020 the alternation or is it the whole thing.)
3021 Assuming its a sub part we convert the EXACT otherwise we convert
3022 the whole branch sequence, including the first.
3024 /* Find the node we are going to overwrite */
3025 if ( first != startbranch || OP( last ) == BRANCH ) {
3026 /* branch sub-chain */
3027 NEXT_OFF( first ) = (U16)(last - first);
3028 #ifdef RE_TRACK_PATTERN_OFFSETS
3030 mjd_offset= Node_Offset((convert));
3031 mjd_nodelen= Node_Length((convert));
3034 /* whole branch chain */
3036 #ifdef RE_TRACK_PATTERN_OFFSETS
3039 const regnode *nop = NEXTOPER( convert );
3040 mjd_offset= Node_Offset((nop));
3041 mjd_nodelen= Node_Length((nop));
3045 PerlIO_printf(Perl_debug_log,
3046 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
3047 (int)depth * 2 + 2, "",
3048 (UV)mjd_offset, (UV)mjd_nodelen)
3051 /* But first we check to see if there is a common prefix we can
3052 split out as an EXACT and put in front of the TRIE node. */
3053 trie->startstate= 1;
3054 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3056 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3060 const U32 base = trie->states[ state ].trans.base;
3062 if ( trie->states[state].wordnum )
3065 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3066 if ( ( base + ofs >= trie->uniquecharcount ) &&
3067 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3068 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3070 if ( ++count > 1 ) {
3071 SV **tmp = av_fetch( revcharmap, ofs, 0);
3072 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
3073 if ( state == 1 ) break;
3075 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3077 PerlIO_printf(Perl_debug_log,
3078 "%*sNew Start State=%"UVuf" Class: [",
3079 (int)depth * 2 + 2, "",
3082 SV ** const tmp = av_fetch( revcharmap, idx, 0);
3083 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3085 TRIE_BITMAP_SET(trie,*ch);
3087 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3089 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3093 TRIE_BITMAP_SET(trie,*ch);
3095 TRIE_BITMAP_SET(trie,folder[ *ch ]);
3096 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
3102 SV **tmp = av_fetch( revcharmap, idx, 0);
3104 char *ch = SvPV( *tmp, len );
3106 SV *sv=sv_newmortal();
3107 PerlIO_printf( Perl_debug_log,
3108 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3109 (int)depth * 2 + 2, "",
3111 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3112 PL_colors[0], PL_colors[1],
3113 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3114 PERL_PV_ESCAPE_FIRSTCHAR
3119 OP( convert ) = nodetype;
3120 str=STRING(convert);
3123 STR_LEN(convert) += len;
3129 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3134 trie->prefixlen = (state-1);
3136 regnode *n = convert+NODE_SZ_STR(convert);
3137 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3138 trie->startstate = state;
3139 trie->minlen -= (state - 1);
3140 trie->maxlen -= (state - 1);
3142 /* At least the UNICOS C compiler choked on this
3143 * being argument to DEBUG_r(), so let's just have
3146 #ifdef PERL_EXT_RE_BUILD
3152 regnode *fix = convert;
3153 U32 word = trie->wordcount;
3155 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3156 while( ++fix < n ) {
3157 Set_Node_Offset_Length(fix, 0, 0);
3160 SV ** const tmp = av_fetch( trie_words, word, 0 );
3162 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3163 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3165 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3173 NEXT_OFF(convert) = (U16)(tail - convert);
3174 DEBUG_r(optimize= n);
3180 if ( trie->maxlen ) {
3181 NEXT_OFF( convert ) = (U16)(tail - convert);
3182 ARG_SET( convert, data_slot );
3183 /* Store the offset to the first unabsorbed branch in
3184 jump[0], which is otherwise unused by the jump logic.
3185 We use this when dumping a trie and during optimisation. */
3187 trie->jump[0] = (U16)(nextbranch - convert);
3189 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3190 * and there is a bitmap
3191 * and the first "jump target" node we found leaves enough room
3192 * then convert the TRIE node into a TRIEC node, with the bitmap
3193 * embedded inline in the opcode - this is hypothetically faster.
3195 if ( !trie->states[trie->startstate].wordnum
3197 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3199 OP( convert ) = TRIEC;
3200 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3201 PerlMemShared_free(trie->bitmap);
3204 OP( convert ) = TRIE;
3206 /* store the type in the flags */
3207 convert->flags = nodetype;
3211 + regarglen[ OP( convert ) ];
3213 /* XXX We really should free up the resource in trie now,
3214 as we won't use them - (which resources?) dmq */
3216 /* needed for dumping*/
3217 DEBUG_r(if (optimize) {
3218 regnode *opt = convert;
3220 while ( ++opt < optimize) {
3221 Set_Node_Offset_Length(opt,0,0);
3224 Try to clean up some of the debris left after the
3227 while( optimize < jumper ) {
3228 mjd_nodelen += Node_Length((optimize));
3229 OP( optimize ) = OPTIMIZED;
3230 Set_Node_Offset_Length(optimize,0,0);
3233 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3235 } /* end node insert */
3237 /* Finish populating the prev field of the wordinfo array. Walk back
3238 * from each accept state until we find another accept state, and if
3239 * so, point the first word's .prev field at the second word. If the
3240 * second already has a .prev field set, stop now. This will be the
3241 * case either if we've already processed that word's accept state,
3242 * or that state had multiple words, and the overspill words were
3243 * already linked up earlier.
3250 for (word=1; word <= trie->wordcount; word++) {
3252 if (trie->wordinfo[word].prev)
3254 state = trie->wordinfo[word].accept;
3256 state = prev_states[state];
3259 prev = trie->states[state].wordnum;
3263 trie->wordinfo[word].prev = prev;
3265 Safefree(prev_states);
3269 /* and now dump out the compressed format */
3270 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3272 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3274 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3275 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3277 SvREFCNT_dec_NN(revcharmap);
3281 : trie->startstate>1
3287 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3289 /* The Trie is constructed and compressed now so we can build a fail array if
3292 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3294 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3298 We find the fail state for each state in the trie, this state is the longest
3299 proper suffix of the current state's 'word' that is also a proper prefix of
3300 another word in our trie. State 1 represents the word '' and is thus the
3301 default fail state. This allows the DFA not to have to restart after its
3302 tried and failed a word at a given point, it simply continues as though it
3303 had been matching the other word in the first place.
3305 'abcdgu'=~/abcdefg|cdgu/
3306 When we get to 'd' we are still matching the first word, we would encounter
3307 'g' which would fail, which would bring us to the state representing 'd' in
3308 the second word where we would try 'g' and succeed, proceeding to match
3311 /* add a fail transition */
3312 const U32 trie_offset = ARG(source);
3313 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3315 const U32 ucharcount = trie->uniquecharcount;
3316 const U32 numstates = trie->statecount;
3317 const U32 ubound = trie->lasttrans + ucharcount;
3321 U32 base = trie->states[ 1 ].trans.base;
3324 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3326 GET_RE_DEBUG_FLAGS_DECL;
3328 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3329 PERL_UNUSED_CONTEXT;
3331 PERL_UNUSED_ARG(depth);
3334 if ( OP(source) == TRIE ) {
3335 struct regnode_1 *op = (struct regnode_1 *)
3336 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3337 StructCopy(source,op,struct regnode_1);
3338 stclass = (regnode *)op;
3340 struct regnode_charclass *op = (struct regnode_charclass *)
3341 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3342 StructCopy(source,op,struct regnode_charclass);
3343 stclass = (regnode *)op;
3345 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3347 ARG_SET( stclass, data_slot );
3348 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3349 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3350 aho->trie=trie_offset;
3351 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3352 Copy( trie->states, aho->states, numstates, reg_trie_state );
3353 Newxz( q, numstates, U32);
3354 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3357 /* initialize fail[0..1] to be 1 so that we always have
3358 a valid final fail state */
3359 fail[ 0 ] = fail[ 1 ] = 1;
3361 for ( charid = 0; charid < ucharcount ; charid++ ) {
3362 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3364 q[ q_write ] = newstate;
3365 /* set to point at the root */
3366 fail[ q[ q_write++ ] ]=1;
3369 while ( q_read < q_write) {
3370 const U32 cur = q[ q_read++ % numstates ];
3371 base = trie->states[ cur ].trans.base;
3373 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3374 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3376 U32 fail_state = cur;
3379 fail_state = fail[ fail_state ];
3380 fail_base = aho->states[ fail_state ].trans.base;
3381 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3383 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3384 fail[ ch_state ] = fail_state;
3385 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3387 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3389 q[ q_write++ % numstates] = ch_state;
3393 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3394 when we fail in state 1, this allows us to use the
3395 charclass scan to find a valid start char. This is based on the principle
3396 that theres a good chance the string being searched contains lots of stuff
3397 that cant be a start char.
3399 fail[ 0 ] = fail[ 1 ] = 0;
3400 DEBUG_TRIE_COMPILE_r({
3401 PerlIO_printf(Perl_debug_log,
3402 "%*sStclass Failtable (%"UVuf" states): 0",
3403 (int)(depth * 2), "", (UV)numstates
3405 for( q_read=1; q_read<numstates; q_read++ ) {
3406 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3408 PerlIO_printf(Perl_debug_log, "\n");
3411 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3416 #define DEBUG_PEEP(str,scan,depth) \
3417 DEBUG_OPTIMISE_r({if (scan){ \
3418 regnode *Next = regnext(scan); \
3419 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3420 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3421 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3422 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3423 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3424 PerlIO_printf(Perl_debug_log, "\n"); \
3427 /* The below joins as many adjacent EXACTish nodes as possible into a single
3428 * one. The regop may be changed if the node(s) contain certain sequences that
3429 * require special handling. The joining is only done if:
3430 * 1) there is room in the current conglomerated node to entirely contain the
3432 * 2) they are the exact same node type
3434 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3435 * these get optimized out
3437 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3438 * as possible, even if that means splitting an existing node so that its first
3439 * part is moved to the preceeding node. This would maximise the efficiency of
3440 * memEQ during matching. Elsewhere in this file, khw proposes splitting
3441 * EXACTFish nodes into portions that don't change under folding vs those that
3442 * do. Those portions that don't change may be the only things in the pattern that
3443 * could be used to find fixed and floating strings.
3445 * If a node is to match under /i (folded), the number of characters it matches
3446 * can be different than its character length if it contains a multi-character
3447 * fold. *min_subtract is set to the total delta number of characters of the
3450 * And *unfolded_multi_char is set to indicate whether or not the node contains
3451 * an unfolded multi-char fold. This happens when whether the fold is valid or
3452 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3453 * SMALL LETTER SHARP S, as only if the target string being matched against
3454 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3455 * folding rules depend on the locale in force at runtime. (Multi-char folds
3456 * whose components are all above the Latin1 range are not run-time locale
3457 * dependent, and have already been folded by the time this function is
3460 * This is as good a place as any to discuss the design of handling these
3461 * multi-character fold sequences. It's been wrong in Perl for a very long
3462 * time. There are three code points in Unicode whose multi-character folds
3463 * were long ago discovered to mess things up. The previous designs for
3464 * dealing with these involved assigning a special node for them. This
3465 * approach doesn't always work, as evidenced by this example:
3466 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3467 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3468 * would match just the \xDF, it won't be able to handle the case where a
3469 * successful match would have to cross the node's boundary. The new approach
3470 * that hopefully generally solves the problem generates an EXACTFU_SS node
3471 * that is "sss" in this case.
3473 * It turns out that there are problems with all multi-character folds, and not
3474 * just these three. Now the code is general, for all such cases. The
3475 * approach taken is:
3476 * 1) This routine examines each EXACTFish node that could contain multi-
3477 * character folded sequences. Since a single character can fold into
3478 * such a sequence, the minimum match length for this node is less than
3479 * the number of characters in the node. This routine returns in
3480 * *min_subtract how many characters to subtract from the the actual
3481 * length of the string to get a real minimum match length; it is 0 if
3482 * there are no multi-char foldeds. This delta is used by the caller to
3483 * adjust the min length of the match, and the delta between min and max,
3484 * so that the optimizer doesn't reject these possibilities based on size
3486 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3487 * is used for an EXACTFU node that contains at least one "ss" sequence in
3488 * it. For non-UTF-8 patterns and strings, this is the only case where
3489 * there is a possible fold length change. That means that a regular
3490 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3491 * with length changes, and so can be processed faster. regexec.c takes
3492 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3493 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3494 * known until runtime). This saves effort in regex matching. However,
3495 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3496 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3497 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3498 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3499 * possibilities for the non-UTF8 patterns are quite simple, except for
3500 * the sharp s. All the ones that don't involve a UTF-8 target string are
3501 * members of a fold-pair, and arrays are set up for all of them so that
3502 * the other member of the pair can be found quickly. Code elsewhere in
3503 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3504 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3505 * described in the next item.
3506 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3507 * validity of the fold won't be known until runtime, and so must remain
3508 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3509 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3510 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3511 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3512 * The reason this is a problem is that the optimizer part of regexec.c
3513 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3514 * that a character in the pattern corresponds to at most a single
3515 * character in the target string. (And I do mean character, and not byte
3516 * here, unlike other parts of the documentation that have never been
3517 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3518 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3519 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3520 * nodes, violate the assumption, and they are the only instances where it
3521 * is violated. I'm reluctant to try to change the assumption, as the
3522 * code involved is impenetrable to me (khw), so instead the code here
3523 * punts. This routine examines EXACTFL nodes, and (when the pattern
3524 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3525 * boolean indicating whether or not the node contains such a fold. When
3526 * it is true, the caller sets a flag that later causes the optimizer in
3527 * this file to not set values for the floating and fixed string lengths,
3528 * and thus avoids the optimizer code in regexec.c that makes the invalid
3529 * assumption. Thus, there is no optimization based on string lengths for
3530 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3531 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3532 * assumption is wrong only in these cases is that all other non-UTF-8
3533 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3534 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3535 * EXACTF nodes because we don't know at compile time if it actually
3536 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3537 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3538 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3539 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3540 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3541 * string would require the pattern to be forced into UTF-8, the overhead
3542 * of which we want to avoid. Similarly the unfolded multi-char folds in
3543 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3546 * Similarly, the code that generates tries doesn't currently handle
3547 * not-already-folded multi-char folds, and it looks like a pain to change
3548 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3549 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3550 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3551 * using /iaa matching will be doing so almost entirely with ASCII
3552 * strings, so this should rarely be encountered in practice */
3554 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3555 if (PL_regkind[OP(scan)] == EXACT) \
3556 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3559 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3560 UV *min_subtract, bool *unfolded_multi_char,
3561 U32 flags,regnode *val, U32 depth)
3563 /* Merge several consecutive EXACTish nodes into one. */
3564 regnode *n = regnext(scan);
3566 regnode *next = scan + NODE_SZ_STR(scan);
3570 regnode *stop = scan;
3571 GET_RE_DEBUG_FLAGS_DECL;
3573 PERL_UNUSED_ARG(depth);
3576 PERL_ARGS_ASSERT_JOIN_EXACT;
3577 #ifndef EXPERIMENTAL_INPLACESCAN
3578 PERL_UNUSED_ARG(flags);
3579 PERL_UNUSED_ARG(val);
3581 DEBUG_PEEP("join",scan,depth);
3583 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3584 * EXACT ones that are mergeable to the current one. */
3586 && (PL_regkind[OP(n)] == NOTHING
3587 || (stringok && OP(n) == OP(scan)))
3589 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3592 if (OP(n) == TAIL || n > next)
3594 if (PL_regkind[OP(n)] == NOTHING) {
3595 DEBUG_PEEP("skip:",n,depth);
3596 NEXT_OFF(scan) += NEXT_OFF(n);
3597 next = n + NODE_STEP_REGNODE;
3604 else if (stringok) {
3605 const unsigned int oldl = STR_LEN(scan);
3606 regnode * const nnext = regnext(n);
3608 /* XXX I (khw) kind of doubt that this works on platforms (should
3609 * Perl ever run on one) where U8_MAX is above 255 because of lots
3610 * of other assumptions */
3611 /* Don't join if the sum can't fit into a single node */
3612 if (oldl + STR_LEN(n) > U8_MAX)
3615 DEBUG_PEEP("merg",n,depth);
3618 NEXT_OFF(scan) += NEXT_OFF(n);
3619 STR_LEN(scan) += STR_LEN(n);
3620 next = n + NODE_SZ_STR(n);
3621 /* Now we can overwrite *n : */
3622 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3630 #ifdef EXPERIMENTAL_INPLACESCAN
3631 if (flags && !NEXT_OFF(n)) {
3632 DEBUG_PEEP("atch", val, depth);
3633 if (reg_off_by_arg[OP(n)]) {
3634 ARG_SET(n, val - n);
3637 NEXT_OFF(n) = val - n;
3645 *unfolded_multi_char = FALSE;
3647 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3648 * can now analyze for sequences of problematic code points. (Prior to
3649 * this final joining, sequences could have been split over boundaries, and
3650 * hence missed). The sequences only happen in folding, hence for any
3651 * non-EXACT EXACTish node */
3652 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3653 U8* s0 = (U8*) STRING(scan);
3655 U8* s_end = s0 + STR_LEN(scan);
3657 int total_count_delta = 0; /* Total delta number of characters that
3658 multi-char folds expand to */
3660 /* One pass is made over the node's string looking for all the
3661 * possibilities. To avoid some tests in the loop, there are two main
3662 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3667 if (OP(scan) == EXACTFL) {
3670 /* An EXACTFL node would already have been changed to another
3671 * node type unless there is at least one character in it that
3672 * is problematic; likely a character whose fold definition
3673 * won't be known until runtime, and so has yet to be folded.
3674 * For all but the UTF-8 locale, folds are 1-1 in length, but
3675 * to handle the UTF-8 case, we need to create a temporary
3676 * folded copy using UTF-8 locale rules in order to analyze it.
3677 * This is because our macros that look to see if a sequence is
3678 * a multi-char fold assume everything is folded (otherwise the
3679 * tests in those macros would be too complicated and slow).
3680 * Note that here, the non-problematic folds will have already
3681 * been done, so we can just copy such characters. We actually
3682 * don't completely fold the EXACTFL string. We skip the
3683 * unfolded multi-char folds, as that would just create work
3684 * below to figure out the size they already are */
3686 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3689 STRLEN s_len = UTF8SKIP(s);
3690 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3691 Copy(s, d, s_len, U8);
3694 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3695 *unfolded_multi_char = TRUE;
3696 Copy(s, d, s_len, U8);
3699 else if (isASCII(*s)) {
3700 *(d++) = toFOLD(*s);
3704 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3710 /* Point the remainder of the routine to look at our temporary
3714 } /* End of creating folded copy of EXACTFL string */
3716 /* Examine the string for a multi-character fold sequence. UTF-8
3717 * patterns have all characters pre-folded by the time this code is
3719 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3720 length sequence we are looking for is 2 */
3722 int count = 0; /* How many characters in a multi-char fold */
3723 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3724 if (! len) { /* Not a multi-char fold: get next char */
3729 /* Nodes with 'ss' require special handling, except for
3730 * EXACTFA-ish for which there is no multi-char fold to this */
3731 if (len == 2 && *s == 's' && *(s+1) == 's'
3732 && OP(scan) != EXACTFA
3733 && OP(scan) != EXACTFA_NO_TRIE)
3736 if (OP(scan) != EXACTFL) {
3737 OP(scan) = EXACTFU_SS;
3741 else { /* Here is a generic multi-char fold. */
3742 U8* multi_end = s + len;
3744 /* Count how many characters are in it. In the case of
3745 * /aa, no folds which contain ASCII code points are
3746 * allowed, so check for those, and skip if found. */
3747 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3748 count = utf8_length(s, multi_end);
3752 while (s < multi_end) {
3755 goto next_iteration;
3765 /* The delta is how long the sequence is minus 1 (1 is how long
3766 * the character that folds to the sequence is) */
3767 total_count_delta += count - 1;
3771 /* We created a temporary folded copy of the string in EXACTFL
3772 * nodes. Therefore we need to be sure it doesn't go below zero,
3773 * as the real string could be shorter */
3774 if (OP(scan) == EXACTFL) {
3775 int total_chars = utf8_length((U8*) STRING(scan),
3776 (U8*) STRING(scan) + STR_LEN(scan));
3777 if (total_count_delta > total_chars) {
3778 total_count_delta = total_chars;
3782 *min_subtract += total_count_delta;
3785 else if (OP(scan) == EXACTFA) {
3787 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3788 * fold to the ASCII range (and there are no existing ones in the
3789 * upper latin1 range). But, as outlined in the comments preceding
3790 * this function, we need to flag any occurrences of the sharp s.
3791 * This character forbids trie formation (because of added
3793 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
3794 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
3795 || UNICODE_DOT_DOT_VERSION > 0)
3797 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3798 OP(scan) = EXACTFA_NO_TRIE;
3799 *unfolded_multi_char = TRUE;
3807 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3808 * folds that are all Latin1. As explained in the comments
3809 * preceding this function, we look also for the sharp s in EXACTF
3810 * and EXACTFL nodes; it can be in the final position. Otherwise
3811 * we can stop looking 1 byte earlier because have to find at least
3812 * two characters for a multi-fold */
3813 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3818 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3819 if (! len) { /* Not a multi-char fold. */
3820 if (*s == LATIN_SMALL_LETTER_SHARP_S
3821 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3823 *unfolded_multi_char = TRUE;
3830 && isALPHA_FOLD_EQ(*s, 's')
3831 && isALPHA_FOLD_EQ(*(s+1), 's'))
3834 /* EXACTF nodes need to know that the minimum length
3835 * changed so that a sharp s in the string can match this
3836 * ss in the pattern, but they remain EXACTF nodes, as they
3837 * won't match this unless the target string is is UTF-8,
3838 * which we don't know until runtime. EXACTFL nodes can't
3839 * transform into EXACTFU nodes */
3840 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3841 OP(scan) = EXACTFU_SS;
3845 *min_subtract += len - 1;
3853 /* Allow dumping but overwriting the collection of skipped
3854 * ops and/or strings with fake optimized ops */
3855 n = scan + NODE_SZ_STR(scan);
3863 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3867 /* REx optimizer. Converts nodes into quicker variants "in place".
3868 Finds fixed substrings. */
3870 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3871 to the position after last scanned or to NULL. */
3873 #define INIT_AND_WITHP \
3874 assert(!and_withp); \
3875 Newx(and_withp,1, regnode_ssc); \
3876 SAVEFREEPV(and_withp)
3880 S_unwind_scan_frames(pTHX_ const void *p)
3882 scan_frame *f= (scan_frame *)p;
3884 scan_frame *n= f->next_frame;
3892 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3893 SSize_t *minlenp, SSize_t *deltap,
3898 regnode_ssc *and_withp,
3899 U32 flags, U32 depth)
3900 /* scanp: Start here (read-write). */
3901 /* deltap: Write maxlen-minlen here. */
3902 /* last: Stop before this one. */
3903 /* data: string data about the pattern */
3904 /* stopparen: treat close N as END */
3905 /* recursed: which subroutines have we recursed into */
3906 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3908 /* There must be at least this number of characters to match */
3911 regnode *scan = *scanp, *next;
3913 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3914 int is_inf_internal = 0; /* The studied chunk is infinite */
3915 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3916 scan_data_t data_fake;
3917 SV *re_trie_maxbuff = NULL;
3918 regnode *first_non_open = scan;
3919 SSize_t stopmin = SSize_t_MAX;
3920 scan_frame *frame = NULL;
3921 GET_RE_DEBUG_FLAGS_DECL;
3923 PERL_ARGS_ASSERT_STUDY_CHUNK;
3927 while (first_non_open && OP(first_non_open) == OPEN)
3928 first_non_open=regnext(first_non_open);
3934 RExC_study_chunk_recursed_count++;
3936 DEBUG_OPTIMISE_MORE_r(
3938 PerlIO_printf(Perl_debug_log,
3939 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3940 (int)(depth*2), "", (long)stopparen,
3941 (unsigned long)RExC_study_chunk_recursed_count,
3942 (unsigned long)depth, (unsigned long)recursed_depth,
3945 if (recursed_depth) {
3948 for ( j = 0 ; j < recursed_depth ; j++ ) {
3949 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3951 PAREN_TEST(RExC_study_chunk_recursed +
3952 ( j * RExC_study_chunk_recursed_bytes), i )
3955 !PAREN_TEST(RExC_study_chunk_recursed +
3956 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3959 PerlIO_printf(Perl_debug_log," %d",(int)i);
3963 if ( j + 1 < recursed_depth ) {
3964 PerlIO_printf(Perl_debug_log, ",");
3968 PerlIO_printf(Perl_debug_log,"\n");
3971 while ( scan && OP(scan) != END && scan < last ){
3972 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3973 node length to get a real minimum (because
3974 the folded version may be shorter) */
3975 bool unfolded_multi_char = FALSE;
3976 /* Peephole optimizer: */
3977 DEBUG_STUDYDATA("Peep:", data, depth);
3978 DEBUG_PEEP("Peep", scan, depth);
3981 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3982 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3983 * by a different invocation of reg() -- Yves
3985 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3987 /* Follow the next-chain of the current node and optimize
3988 away all the NOTHINGs from it. */
3989 if (OP(scan) != CURLYX) {
3990 const int max = (reg_off_by_arg[OP(scan)]
3992 /* I32 may be smaller than U16 on CRAYs! */
3993 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3994 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3998 /* Skip NOTHING and LONGJMP. */
3999 while ((n = regnext(n))
4000 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4001 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4002 && off + noff < max)
4004 if (reg_off_by_arg[OP(scan)])
4007 NEXT_OFF(scan) = off;
4010 /* The principal pseudo-switch. Cannot be a switch, since we
4011 look into several different things. */
4012 if ( OP(scan) == DEFINEP ) {
4014 SSize_t deltanext = 0;
4015 SSize_t fake_last_close = 0;
4016 I32 f = SCF_IN_DEFINE;
4018 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4019 scan = regnext(scan);
4020 assert( OP(scan) == IFTHEN );
4021 DEBUG_PEEP("expect IFTHEN", scan, depth);
4023 data_fake.last_closep= &fake_last_close;
4025 next = regnext(scan);
4026 scan = NEXTOPER(NEXTOPER(scan));
4027 DEBUG_PEEP("scan", scan, depth);
4028 DEBUG_PEEP("next", next, depth);
4030 /* we suppose the run is continuous, last=next...
4031 * NOTE we dont use the return here! */
4032 (void)study_chunk(pRExC_state, &scan, &minlen,
4033 &deltanext, next, &data_fake, stopparen,
4034 recursed_depth, NULL, f, depth+1);
4039 OP(scan) == BRANCH ||
4040 OP(scan) == BRANCHJ ||
4043 next = regnext(scan);
4046 /* The op(next)==code check below is to see if we
4047 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4048 * IFTHEN is special as it might not appear in pairs.
4049 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4050 * we dont handle it cleanly. */
4051 if (OP(next) == code || code == IFTHEN) {
4052 /* NOTE - There is similar code to this block below for
4053 * handling TRIE nodes on a re-study. If you change stuff here
4054 * check there too. */
4055 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4057 regnode * const startbranch=scan;
4059 if (flags & SCF_DO_SUBSTR) {
4060 /* Cannot merge strings after this. */
4061 scan_commit(pRExC_state, data, minlenp, is_inf);
4064 if (flags & SCF_DO_STCLASS)
4065 ssc_init_zero(pRExC_state, &accum);
4067 while (OP(scan) == code) {
4068 SSize_t deltanext, minnext, fake;
4070 regnode_ssc this_class;
4072 DEBUG_PEEP("Branch", scan, depth);
4075 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4077 data_fake.whilem_c = data->whilem_c;
4078 data_fake.last_closep = data->last_closep;
4081 data_fake.last_closep = &fake;
4083 data_fake.pos_delta = delta;
4084 next = regnext(scan);
4086 scan = NEXTOPER(scan); /* everything */
4087 if (code != BRANCH) /* everything but BRANCH */
4088 scan = NEXTOPER(scan);
4090 if (flags & SCF_DO_STCLASS) {
4091 ssc_init(pRExC_state, &this_class);
4092 data_fake.start_class = &this_class;
4093 f = SCF_DO_STCLASS_AND;
4095 if (flags & SCF_WHILEM_VISITED_POS)
4096 f |= SCF_WHILEM_VISITED_POS;
4098 /* we suppose the run is continuous, last=next...*/
4099 minnext = study_chunk(pRExC_state, &scan, minlenp,
4100 &deltanext, next, &data_fake, stopparen,
4101 recursed_depth, NULL, f,depth+1);
4105 if (deltanext == SSize_t_MAX) {
4106 is_inf = is_inf_internal = 1;
4108 } else if (max1 < minnext + deltanext)
4109 max1 = minnext + deltanext;
4111 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4113 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4114 if ( stopmin > minnext)
4115 stopmin = min + min1;
4116 flags &= ~SCF_DO_SUBSTR;
4118 data->flags |= SCF_SEEN_ACCEPT;
4121 if (data_fake.flags & SF_HAS_EVAL)
4122 data->flags |= SF_HAS_EVAL;
4123 data->whilem_c = data_fake.whilem_c;
4125 if (flags & SCF_DO_STCLASS)
4126 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4128 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4130 if (flags & SCF_DO_SUBSTR) {
4131 data->pos_min += min1;
4132 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4133 data->pos_delta = SSize_t_MAX;
4135 data->pos_delta += max1 - min1;
4136 if (max1 != min1 || is_inf)
4137 data->longest = &(data->longest_float);
4140 if (delta == SSize_t_MAX
4141 || SSize_t_MAX - delta - (max1 - min1) < 0)
4142 delta = SSize_t_MAX;
4144 delta += max1 - min1;
4145 if (flags & SCF_DO_STCLASS_OR) {
4146 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4148 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4149 flags &= ~SCF_DO_STCLASS;
4152 else if (flags & SCF_DO_STCLASS_AND) {
4154 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4155 flags &= ~SCF_DO_STCLASS;
4158 /* Switch to OR mode: cache the old value of
4159 * data->start_class */
4161 StructCopy(data->start_class, and_withp, regnode_ssc);
4162 flags &= ~SCF_DO_STCLASS_AND;
4163 StructCopy(&accum, data->start_class, regnode_ssc);
4164 flags |= SCF_DO_STCLASS_OR;
4168 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4169 OP( startbranch ) == BRANCH )
4173 Assuming this was/is a branch we are dealing with: 'scan'
4174 now points at the item that follows the branch sequence,
4175 whatever it is. We now start at the beginning of the
4176 sequence and look for subsequences of
4182 which would be constructed from a pattern like
4185 If we can find such a subsequence we need to turn the first
4186 element into a trie and then add the subsequent branch exact
4187 strings to the trie.
4191 1. patterns where the whole set of branches can be
4194 2. patterns where only a subset can be converted.
4196 In case 1 we can replace the whole set with a single regop
4197 for the trie. In case 2 we need to keep the start and end
4200 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4201 becomes BRANCH TRIE; BRANCH X;
4203 There is an additional case, that being where there is a
4204 common prefix, which gets split out into an EXACT like node
4205 preceding the TRIE node.
4207 If x(1..n)==tail then we can do a simple trie, if not we make
4208 a "jump" trie, such that when we match the appropriate word
4209 we "jump" to the appropriate tail node. Essentially we turn
4210 a nested if into a case structure of sorts.
4215 if (!re_trie_maxbuff) {
4216 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4217 if (!SvIOK(re_trie_maxbuff))
4218 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4220 if ( SvIV(re_trie_maxbuff)>=0 ) {
4222 regnode *first = (regnode *)NULL;
4223 regnode *last = (regnode *)NULL;
4224 regnode *tail = scan;
4228 /* var tail is used because there may be a TAIL
4229 regop in the way. Ie, the exacts will point to the
4230 thing following the TAIL, but the last branch will
4231 point at the TAIL. So we advance tail. If we
4232 have nested (?:) we may have to move through several
4236 while ( OP( tail ) == TAIL ) {
4237 /* this is the TAIL generated by (?:) */
4238 tail = regnext( tail );
4242 DEBUG_TRIE_COMPILE_r({
4243 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4244 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4245 (int)depth * 2 + 2, "",
4246 "Looking for TRIE'able sequences. Tail node is: ",
4247 SvPV_nolen_const( RExC_mysv )
4253 Step through the branches
4254 cur represents each branch,
4255 noper is the first thing to be matched as part
4257 noper_next is the regnext() of that node.
4259 We normally handle a case like this
4260 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4261 support building with NOJUMPTRIE, which restricts
4262 the trie logic to structures like /FOO|BAR/.
4264 If noper is a trieable nodetype then the branch is
4265 a possible optimization target. If we are building
4266 under NOJUMPTRIE then we require that noper_next is
4267 the same as scan (our current position in the regex
4270 Once we have two or more consecutive such branches
4271 we can create a trie of the EXACT's contents and
4272 stitch it in place into the program.
4274 If the sequence represents all of the branches in
4275 the alternation we replace the entire thing with a
4278 Otherwise when it is a subsequence we need to
4279 stitch it in place and replace only the relevant
4280 branches. This means the first branch has to remain
4281 as it is used by the alternation logic, and its
4282 next pointer, and needs to be repointed at the item
4283 on the branch chain following the last branch we
4284 have optimized away.
4286 This could be either a BRANCH, in which case the
4287 subsequence is internal, or it could be the item
4288 following the branch sequence in which case the
4289 subsequence is at the end (which does not
4290 necessarily mean the first node is the start of the
4293 TRIE_TYPE(X) is a define which maps the optype to a
4297 ----------------+-----------
4301 EXACTFU_SS | EXACTFU
4304 EXACTFLU8 | EXACTFLU8
4308 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4310 : ( EXACT == (X) ) \
4312 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4314 : ( EXACTFA == (X) ) \
4316 : ( EXACTL == (X) ) \
4318 : ( EXACTFLU8 == (X) ) \
4322 /* dont use tail as the end marker for this traverse */
4323 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4324 regnode * const noper = NEXTOPER( cur );
4325 U8 noper_type = OP( noper );
4326 U8 noper_trietype = TRIE_TYPE( noper_type );
4327 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4328 regnode * const noper_next = regnext( noper );
4329 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4330 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4333 DEBUG_TRIE_COMPILE_r({
4334 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4335 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4336 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4338 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4339 PerlIO_printf( Perl_debug_log, " -> %s",
4340 SvPV_nolen_const(RExC_mysv));
4343 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4344 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4345 SvPV_nolen_const(RExC_mysv));
4347 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4348 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4349 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4353 /* Is noper a trieable nodetype that can be merged
4354 * with the current trie (if there is one)? */
4358 ( noper_trietype == NOTHING)
4359 || ( trietype == NOTHING )
4360 || ( trietype == noper_trietype )
4363 && noper_next == tail
4367 /* Handle mergable triable node Either we are
4368 * the first node in a new trieable sequence,
4369 * in which case we do some bookkeeping,
4370 * otherwise we update the end pointer. */
4373 if ( noper_trietype == NOTHING ) {
4374 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4375 regnode * const noper_next = regnext( noper );
4376 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4377 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4380 if ( noper_next_trietype ) {
4381 trietype = noper_next_trietype;
4382 } else if (noper_next_type) {
4383 /* a NOTHING regop is 1 regop wide.
4384 * We need at least two for a trie
4385 * so we can't merge this in */
4389 trietype = noper_trietype;
4392 if ( trietype == NOTHING )
4393 trietype = noper_trietype;
4398 } /* end handle mergable triable node */
4400 /* handle unmergable node -
4401 * noper may either be a triable node which can
4402 * not be tried together with the current trie,
4403 * or a non triable node */
4405 /* If last is set and trietype is not
4406 * NOTHING then we have found at least two
4407 * triable branch sequences in a row of a
4408 * similar trietype so we can turn them
4409 * into a trie. If/when we allow NOTHING to
4410 * start a trie sequence this condition
4411 * will be required, and it isn't expensive
4412 * so we leave it in for now. */
4413 if ( trietype && trietype != NOTHING )
4414 make_trie( pRExC_state,
4415 startbranch, first, cur, tail,
4416 count, trietype, depth+1 );
4417 last = NULL; /* note: we clear/update
4418 first, trietype etc below,
4419 so we dont do it here */
4423 && noper_next == tail
4426 /* noper is triable, so we can start a new
4430 trietype = noper_trietype;
4432 /* if we already saw a first but the
4433 * current node is not triable then we have
4434 * to reset the first information. */
4439 } /* end handle unmergable node */
4440 } /* loop over branches */
4441 DEBUG_TRIE_COMPILE_r({
4442 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4443 PerlIO_printf( Perl_debug_log,
4444 "%*s- %s (%d) <SCAN FINISHED>\n",
4446 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4449 if ( last && trietype ) {
4450 if ( trietype != NOTHING ) {
4451 /* the last branch of the sequence was part of
4452 * a trie, so we have to construct it here
4453 * outside of the loop */
4454 made= make_trie( pRExC_state, startbranch,
4455 first, scan, tail, count,
4456 trietype, depth+1 );
4457 #ifdef TRIE_STUDY_OPT
4458 if ( ((made == MADE_EXACT_TRIE &&
4459 startbranch == first)
4460 || ( first_non_open == first )) &&
4462 flags |= SCF_TRIE_RESTUDY;
4463 if ( startbranch == first
4466 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4471 /* at this point we know whatever we have is a
4472 * NOTHING sequence/branch AND if 'startbranch'
4473 * is 'first' then we can turn the whole thing
4476 if ( startbranch == first ) {
4478 /* the entire thing is a NOTHING sequence,
4479 * something like this: (?:|) So we can
4480 * turn it into a plain NOTHING op. */
4481 DEBUG_TRIE_COMPILE_r({
4482 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4483 PerlIO_printf( Perl_debug_log,
4484 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4485 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4488 OP(startbranch)= NOTHING;
4489 NEXT_OFF(startbranch)= tail - startbranch;
4490 for ( opt= startbranch + 1; opt < tail ; opt++ )
4494 } /* end if ( last) */
4495 } /* TRIE_MAXBUF is non zero */
4500 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4501 scan = NEXTOPER(NEXTOPER(scan));
4502 } else /* single branch is optimized. */
4503 scan = NEXTOPER(scan);
4505 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4507 regnode *start = NULL;
4508 regnode *end = NULL;
4509 U32 my_recursed_depth= recursed_depth;
4512 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4513 /* Do setup, note this code has side effects beyond
4514 * the rest of this block. Specifically setting
4515 * RExC_recurse[] must happen at least once during
4517 if (OP(scan) == GOSUB) {
4519 RExC_recurse[ARG2L(scan)] = scan;
4520 start = RExC_open_parens[paren-1];
4521 end = RExC_close_parens[paren-1];
4523 start = RExC_rxi->program + 1;
4526 /* NOTE we MUST always execute the above code, even
4527 * if we do nothing with a GOSUB/GOSTART */
4529 ( flags & SCF_IN_DEFINE )
4532 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4534 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4537 /* no need to do anything here if we are in a define. */
4538 /* or we are after some kind of infinite construct
4539 * so we can skip recursing into this item.
4540 * Since it is infinite we will not change the maxlen
4541 * or delta, and if we miss something that might raise
4542 * the minlen it will merely pessimise a little.
4544 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4545 * might result in a minlen of 1 and not of 4,
4546 * but this doesn't make us mismatch, just try a bit
4547 * harder than we should.
4549 scan= regnext(scan);
4556 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4558 /* it is quite possible that there are more efficient ways
4559 * to do this. We maintain a bitmap per level of recursion
4560 * of which patterns we have entered so we can detect if a
4561 * pattern creates a possible infinite loop. When we
4562 * recurse down a level we copy the previous levels bitmap
4563 * down. When we are at recursion level 0 we zero the top
4564 * level bitmap. It would be nice to implement a different
4565 * more efficient way of doing this. In particular the top
4566 * level bitmap may be unnecessary.
4568 if (!recursed_depth) {
4569 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4571 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4572 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4573 RExC_study_chunk_recursed_bytes, U8);
4575 /* we havent recursed into this paren yet, so recurse into it */
4576 DEBUG_STUDYDATA("set:", data,depth);
4577 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4578 my_recursed_depth= recursed_depth + 1;
4580 DEBUG_STUDYDATA("inf:", data,depth);
4581 /* some form of infinite recursion, assume infinite length
4583 if (flags & SCF_DO_SUBSTR) {
4584 scan_commit(pRExC_state, data, minlenp, is_inf);
4585 data->longest = &(data->longest_float);
4587 is_inf = is_inf_internal = 1;
4588 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4589 ssc_anything(data->start_class);
4590 flags &= ~SCF_DO_STCLASS;
4592 start= NULL; /* reset start so we dont recurse later on. */
4597 end = regnext(scan);
4600 scan_frame *newframe;
4602 if (!RExC_frame_last) {
4603 Newxz(newframe, 1, scan_frame);
4604 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4605 RExC_frame_head= newframe;
4607 } else if (!RExC_frame_last->next_frame) {
4608 Newxz(newframe,1,scan_frame);
4609 RExC_frame_last->next_frame= newframe;
4610 newframe->prev_frame= RExC_frame_last;
4613 newframe= RExC_frame_last->next_frame;
4615 RExC_frame_last= newframe;
4617 newframe->next_regnode = regnext(scan);
4618 newframe->last_regnode = last;
4619 newframe->stopparen = stopparen;
4620 newframe->prev_recursed_depth = recursed_depth;
4621 newframe->this_prev_frame= frame;
4623 DEBUG_STUDYDATA("frame-new:",data,depth);
4624 DEBUG_PEEP("fnew", scan, depth);
4631 recursed_depth= my_recursed_depth;
4636 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4637 SSize_t l = STR_LEN(scan);
4640 const U8 * const s = (U8*)STRING(scan);
4641 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4642 l = utf8_length(s, s + l);
4644 uc = *((U8*)STRING(scan));
4647 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4648 /* The code below prefers earlier match for fixed
4649 offset, later match for variable offset. */
4650 if (data->last_end == -1) { /* Update the start info. */
4651 data->last_start_min = data->pos_min;
4652 data->last_start_max = is_inf
4653 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4655 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4657 SvUTF8_on(data->last_found);
4659 SV * const sv = data->last_found;
4660 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4661 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4662 if (mg && mg->mg_len >= 0)
4663 mg->mg_len += utf8_length((U8*)STRING(scan),
4664 (U8*)STRING(scan)+STR_LEN(scan));
4666 data->last_end = data->pos_min + l;
4667 data->pos_min += l; /* As in the first entry. */
4668 data->flags &= ~SF_BEFORE_EOL;
4671 /* ANDing the code point leaves at most it, and not in locale, and
4672 * can't match null string */
4673 if (flags & SCF_DO_STCLASS_AND) {
4674 ssc_cp_and(data->start_class, uc);
4675 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4676 ssc_clear_locale(data->start_class);
4678 else if (flags & SCF_DO_STCLASS_OR) {
4679 ssc_add_cp(data->start_class, uc);
4680 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4682 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4683 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4685 flags &= ~SCF_DO_STCLASS;
4687 else if (PL_regkind[OP(scan)] == EXACT) {
4688 /* But OP != EXACT!, so is EXACTFish */
4689 SSize_t l = STR_LEN(scan);
4690 const U8 * s = (U8*)STRING(scan);
4692 /* Search for fixed substrings supports EXACT only. */
4693 if (flags & SCF_DO_SUBSTR) {
4695 scan_commit(pRExC_state, data, minlenp, is_inf);
4698 l = utf8_length(s, s + l);
4700 if (unfolded_multi_char) {
4701 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4703 min += l - min_subtract;
4705 delta += min_subtract;
4706 if (flags & SCF_DO_SUBSTR) {
4707 data->pos_min += l - min_subtract;
4708 if (data->pos_min < 0) {
4711 data->pos_delta += min_subtract;
4713 data->longest = &(data->longest_float);
4717 if (flags & SCF_DO_STCLASS) {
4718 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4720 assert(EXACTF_invlist);
4721 if (flags & SCF_DO_STCLASS_AND) {
4722 if (OP(scan) != EXACTFL)
4723 ssc_clear_locale(data->start_class);
4724 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4725 ANYOF_POSIXL_ZERO(data->start_class);
4726 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4728 else { /* SCF_DO_STCLASS_OR */
4729 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4730 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4732 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4733 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4735 flags &= ~SCF_DO_STCLASS;
4736 SvREFCNT_dec(EXACTF_invlist);
4739 else if (REGNODE_VARIES(OP(scan))) {
4740 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4741 I32 fl = 0, f = flags;
4742 regnode * const oscan = scan;
4743 regnode_ssc this_class;
4744 regnode_ssc *oclass = NULL;
4745 I32 next_is_eval = 0;
4747 switch (PL_regkind[OP(scan)]) {
4748 case WHILEM: /* End of (?:...)* . */
4749 scan = NEXTOPER(scan);
4752 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4753 next = NEXTOPER(scan);
4754 if (OP(next) == EXACT
4755 || OP(next) == EXACTL
4756 || (flags & SCF_DO_STCLASS))
4759 maxcount = REG_INFTY;
4760 next = regnext(scan);
4761 scan = NEXTOPER(scan);
4765 if (flags & SCF_DO_SUBSTR)
4770 if (flags & SCF_DO_STCLASS) {
4772 maxcount = REG_INFTY;
4773 next = regnext(scan);
4774 scan = NEXTOPER(scan);
4777 if (flags & SCF_DO_SUBSTR) {
4778 scan_commit(pRExC_state, data, minlenp, is_inf);
4779 /* Cannot extend fixed substrings */
4780 data->longest = &(data->longest_float);
4782 is_inf = is_inf_internal = 1;
4783 scan = regnext(scan);
4784 goto optimize_curly_tail;
4786 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4787 && (scan->flags == stopparen))
4792 mincount = ARG1(scan);
4793 maxcount = ARG2(scan);
4795 next = regnext(scan);
4796 if (OP(scan) == CURLYX) {
4797 I32 lp = (data ? *(data->last_closep) : 0);
4798 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4800 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4801 next_is_eval = (OP(scan) == EVAL);
4803 if (flags & SCF_DO_SUBSTR) {
4805 scan_commit(pRExC_state, data, minlenp, is_inf);
4806 /* Cannot extend fixed substrings */
4807 pos_before = data->pos_min;
4811 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4813 data->flags |= SF_IS_INF;
4815 if (flags & SCF_DO_STCLASS) {
4816 ssc_init(pRExC_state, &this_class);
4817 oclass = data->start_class;
4818 data->start_class = &this_class;
4819 f |= SCF_DO_STCLASS_AND;
4820 f &= ~SCF_DO_STCLASS_OR;
4822 /* Exclude from super-linear cache processing any {n,m}
4823 regops for which the combination of input pos and regex
4824 pos is not enough information to determine if a match
4827 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4828 regex pos at the \s*, the prospects for a match depend not
4829 only on the input position but also on how many (bar\s*)
4830 repeats into the {4,8} we are. */
4831 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4832 f &= ~SCF_WHILEM_VISITED_POS;
4834 /* This will finish on WHILEM, setting scan, or on NULL: */
4835 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4836 last, data, stopparen, recursed_depth, NULL,
4838 ? (f & ~SCF_DO_SUBSTR)
4842 if (flags & SCF_DO_STCLASS)
4843 data->start_class = oclass;
4844 if (mincount == 0 || minnext == 0) {
4845 if (flags & SCF_DO_STCLASS_OR) {
4846 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4848 else if (flags & SCF_DO_STCLASS_AND) {
4849 /* Switch to OR mode: cache the old value of
4850 * data->start_class */
4852 StructCopy(data->start_class, and_withp, regnode_ssc);
4853 flags &= ~SCF_DO_STCLASS_AND;
4854 StructCopy(&this_class, data->start_class, regnode_ssc);
4855 flags |= SCF_DO_STCLASS_OR;
4856 ANYOF_FLAGS(data->start_class)
4857 |= SSC_MATCHES_EMPTY_STRING;
4859 } else { /* Non-zero len */
4860 if (flags & SCF_DO_STCLASS_OR) {
4861 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4862 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4864 else if (flags & SCF_DO_STCLASS_AND)
4865 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4866 flags &= ~SCF_DO_STCLASS;
4868 if (!scan) /* It was not CURLYX, but CURLY. */
4870 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4871 /* ? quantifier ok, except for (?{ ... }) */
4872 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4873 && (minnext == 0) && (deltanext == 0)
4874 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4875 && maxcount <= REG_INFTY/3) /* Complement check for big
4878 /* Fatal warnings may leak the regexp without this: */
4879 SAVEFREESV(RExC_rx_sv);
4880 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4881 "Quantifier unexpected on zero-length expression "
4882 "in regex m/%"UTF8f"/",
4883 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
4885 (void)ReREFCNT_inc(RExC_rx_sv);
4888 min += minnext * mincount;
4889 is_inf_internal |= deltanext == SSize_t_MAX
4890 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4891 is_inf |= is_inf_internal;
4893 delta = SSize_t_MAX;
4895 delta += (minnext + deltanext) * maxcount
4896 - minnext * mincount;
4898 /* Try powerful optimization CURLYX => CURLYN. */
4899 if ( OP(oscan) == CURLYX && data
4900 && data->flags & SF_IN_PAR
4901 && !(data->flags & SF_HAS_EVAL)
4902 && !deltanext && minnext == 1 ) {
4903 /* Try to optimize to CURLYN. */
4904 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4905 regnode * const nxt1 = nxt;
4912 if (!REGNODE_SIMPLE(OP(nxt))
4913 && !(PL_regkind[OP(nxt)] == EXACT
4914 && STR_LEN(nxt) == 1))
4920 if (OP(nxt) != CLOSE)
4922 if (RExC_open_parens) {
4923 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4924 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4926 /* Now we know that nxt2 is the only contents: */
4927 oscan->flags = (U8)ARG(nxt);
4929 OP(nxt1) = NOTHING; /* was OPEN. */
4932 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4933 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4934 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4935 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4936 OP(nxt + 1) = OPTIMIZED; /* was count. */
4937 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4942 /* Try optimization CURLYX => CURLYM. */
4943 if ( OP(oscan) == CURLYX && data
4944 && !(data->flags & SF_HAS_PAR)
4945 && !(data->flags & SF_HAS_EVAL)
4946 && !deltanext /* atom is fixed width */
4947 && minnext != 0 /* CURLYM can't handle zero width */
4949 /* Nor characters whose fold at run-time may be
4950 * multi-character */
4951 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4953 /* XXXX How to optimize if data == 0? */
4954 /* Optimize to a simpler form. */
4955 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4959 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4960 && (OP(nxt2) != WHILEM))
4962 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4963 /* Need to optimize away parenths. */
4964 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4965 /* Set the parenth number. */
4966 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4968 oscan->flags = (U8)ARG(nxt);
4969 if (RExC_open_parens) {
4970 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4971 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4973 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4974 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4977 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4978 OP(nxt + 1) = OPTIMIZED; /* was count. */
4979 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4980 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4983 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4984 regnode *nnxt = regnext(nxt1);
4986 if (reg_off_by_arg[OP(nxt1)])
4987 ARG_SET(nxt1, nxt2 - nxt1);
4988 else if (nxt2 - nxt1 < U16_MAX)
4989 NEXT_OFF(nxt1) = nxt2 - nxt1;
4991 OP(nxt) = NOTHING; /* Cannot beautify */
4996 /* Optimize again: */
4997 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4998 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
5003 else if ((OP(oscan) == CURLYX)
5004 && (flags & SCF_WHILEM_VISITED_POS)
5005 /* See the comment on a similar expression above.
5006 However, this time it's not a subexpression
5007 we care about, but the expression itself. */
5008 && (maxcount == REG_INFTY)
5009 && data && ++data->whilem_c < 16) {
5010 /* This stays as CURLYX, we can put the count/of pair. */
5011 /* Find WHILEM (as in regexec.c) */
5012 regnode *nxt = oscan + NEXT_OFF(oscan);
5014 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5016 PREVOPER(nxt)->flags = (U8)(data->whilem_c
5017 | (RExC_whilem_seen << 4)); /* On WHILEM */
5019 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5021 if (flags & SCF_DO_SUBSTR) {
5022 SV *last_str = NULL;
5023 STRLEN last_chrs = 0;
5024 int counted = mincount != 0;
5026 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5028 SSize_t b = pos_before >= data->last_start_min
5029 ? pos_before : data->last_start_min;
5031 const char * const s = SvPV_const(data->last_found, l);
5032 SSize_t old = b - data->last_start_min;
5035 old = utf8_hop((U8*)s, old) - (U8*)s;
5037 /* Get the added string: */
5038 last_str = newSVpvn_utf8(s + old, l, UTF);
5039 last_chrs = UTF ? utf8_length((U8*)(s + old),
5040 (U8*)(s + old + l)) : l;
5041 if (deltanext == 0 && pos_before == b) {
5042 /* What was added is a constant string */
5045 SvGROW(last_str, (mincount * l) + 1);
5046 repeatcpy(SvPVX(last_str) + l,
5047 SvPVX_const(last_str), l,
5049 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5050 /* Add additional parts. */
5051 SvCUR_set(data->last_found,
5052 SvCUR(data->last_found) - l);
5053 sv_catsv(data->last_found, last_str);
5055 SV * sv = data->last_found;
5057 SvUTF8(sv) && SvMAGICAL(sv) ?
5058 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5059 if (mg && mg->mg_len >= 0)
5060 mg->mg_len += last_chrs * (mincount-1);
5062 last_chrs *= mincount;
5063 data->last_end += l * (mincount - 1);
5066 /* start offset must point into the last copy */
5067 data->last_start_min += minnext * (mincount - 1);
5068 data->last_start_max =
5071 : data->last_start_max +
5072 (maxcount - 1) * (minnext + data->pos_delta);
5075 /* It is counted once already... */
5076 data->pos_min += minnext * (mincount - counted);
5078 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
5079 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
5080 " maxcount=%"UVuf" mincount=%"UVuf"\n",
5081 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5083 if (deltanext != SSize_t_MAX)
5084 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
5085 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5086 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5088 if (deltanext == SSize_t_MAX
5089 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5090 data->pos_delta = SSize_t_MAX;
5092 data->pos_delta += - counted * deltanext +
5093 (minnext + deltanext) * maxcount - minnext * mincount;
5094 if (mincount != maxcount) {
5095 /* Cannot extend fixed substrings found inside
5097 scan_commit(pRExC_state, data, minlenp, is_inf);
5098 if (mincount && last_str) {
5099 SV * const sv = data->last_found;
5100 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5101 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5105 sv_setsv(sv, last_str);
5106 data->last_end = data->pos_min;
5107 data->last_start_min = data->pos_min - last_chrs;
5108 data->last_start_max = is_inf
5110 : data->pos_min + data->pos_delta - last_chrs;
5112 data->longest = &(data->longest_float);
5114 SvREFCNT_dec(last_str);
5116 if (data && (fl & SF_HAS_EVAL))
5117 data->flags |= SF_HAS_EVAL;
5118 optimize_curly_tail:
5119 if (OP(oscan) != CURLYX) {
5120 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5122 NEXT_OFF(oscan) += NEXT_OFF(next);
5128 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5133 if (flags & SCF_DO_SUBSTR) {
5134 /* Cannot expect anything... */
5135 scan_commit(pRExC_state, data, minlenp, is_inf);
5136 data->longest = &(data->longest_float);
5138 is_inf = is_inf_internal = 1;
5139 if (flags & SCF_DO_STCLASS_OR) {
5140 if (OP(scan) == CLUMP) {
5141 /* Actually is any start char, but very few code points
5142 * aren't start characters */
5143 ssc_match_all_cp(data->start_class);
5146 ssc_anything(data->start_class);
5149 flags &= ~SCF_DO_STCLASS;
5153 else if (OP(scan) == LNBREAK) {
5154 if (flags & SCF_DO_STCLASS) {
5155 if (flags & SCF_DO_STCLASS_AND) {
5156 ssc_intersection(data->start_class,
5157 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5158 ssc_clear_locale(data->start_class);
5159 ANYOF_FLAGS(data->start_class)
5160 &= ~SSC_MATCHES_EMPTY_STRING;
5162 else if (flags & SCF_DO_STCLASS_OR) {
5163 ssc_union(data->start_class,
5164 PL_XPosix_ptrs[_CC_VERTSPACE],
5166 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5168 /* See commit msg for
5169 * 749e076fceedeb708a624933726e7989f2302f6a */
5170 ANYOF_FLAGS(data->start_class)
5171 &= ~SSC_MATCHES_EMPTY_STRING;
5173 flags &= ~SCF_DO_STCLASS;
5176 if (delta != SSize_t_MAX)
5177 delta++; /* Because of the 2 char string cr-lf */
5178 if (flags & SCF_DO_SUBSTR) {
5179 /* Cannot expect anything... */
5180 scan_commit(pRExC_state, data, minlenp, is_inf);
5182 data->pos_delta += 1;
5183 data->longest = &(data->longest_float);
5186 else if (REGNODE_SIMPLE(OP(scan))) {
5188 if (flags & SCF_DO_SUBSTR) {
5189 scan_commit(pRExC_state, data, minlenp, is_inf);
5193 if (flags & SCF_DO_STCLASS) {
5195 SV* my_invlist = NULL;
5198 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5199 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5201 /* Some of the logic below assumes that switching
5202 locale on will only add false positives. */
5207 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5211 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5212 ssc_match_all_cp(data->start_class);
5217 SV* REG_ANY_invlist = _new_invlist(2);
5218 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5220 if (flags & SCF_DO_STCLASS_OR) {
5221 ssc_union(data->start_class,
5223 TRUE /* TRUE => invert, hence all but \n
5227 else if (flags & SCF_DO_STCLASS_AND) {
5228 ssc_intersection(data->start_class,
5230 TRUE /* TRUE => invert */
5232 ssc_clear_locale(data->start_class);
5234 SvREFCNT_dec_NN(REG_ANY_invlist);
5241 if (flags & SCF_DO_STCLASS_AND)
5242 ssc_and(pRExC_state, data->start_class,
5243 (regnode_charclass *) scan);
5245 ssc_or(pRExC_state, data->start_class,
5246 (regnode_charclass *) scan);
5254 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5255 if (flags & SCF_DO_STCLASS_AND) {
5256 bool was_there = cBOOL(
5257 ANYOF_POSIXL_TEST(data->start_class,
5259 ANYOF_POSIXL_ZERO(data->start_class);
5260 if (was_there) { /* Do an AND */
5261 ANYOF_POSIXL_SET(data->start_class, namedclass);
5263 /* No individual code points can now match */
5264 data->start_class->invlist
5265 = sv_2mortal(_new_invlist(0));
5268 int complement = namedclass + ((invert) ? -1 : 1);
5270 assert(flags & SCF_DO_STCLASS_OR);
5272 /* If the complement of this class was already there,
5273 * the result is that they match all code points,
5274 * (\d + \D == everything). Remove the classes from
5275 * future consideration. Locale is not relevant in
5277 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5278 ssc_match_all_cp(data->start_class);
5279 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5280 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5282 else { /* The usual case; just add this class to the
5284 ANYOF_POSIXL_SET(data->start_class, namedclass);
5289 case NPOSIXA: /* For these, we always know the exact set of
5294 if (FLAGS(scan) == _CC_ASCII) {
5295 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5298 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5299 PL_XPosix_ptrs[_CC_ASCII],
5310 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5312 /* NPOSIXD matches all upper Latin1 code points unless the
5313 * target string being matched is UTF-8, which is
5314 * unknowable until match time. Since we are going to
5315 * invert, we want to get rid of all of them so that the
5316 * inversion will match all */
5317 if (OP(scan) == NPOSIXD) {
5318 _invlist_subtract(my_invlist, PL_UpperLatin1,
5324 if (flags & SCF_DO_STCLASS_AND) {
5325 ssc_intersection(data->start_class, my_invlist, invert);
5326 ssc_clear_locale(data->start_class);
5329 assert(flags & SCF_DO_STCLASS_OR);
5330 ssc_union(data->start_class, my_invlist, invert);
5332 SvREFCNT_dec(my_invlist);
5334 if (flags & SCF_DO_STCLASS_OR)
5335 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5336 flags &= ~SCF_DO_STCLASS;
5339 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5340 data->flags |= (OP(scan) == MEOL
5343 scan_commit(pRExC_state, data, minlenp, is_inf);
5346 else if ( PL_regkind[OP(scan)] == BRANCHJ
5347 /* Lookbehind, or need to calculate parens/evals/stclass: */
5348 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5349 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5351 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5352 || OP(scan) == UNLESSM )
5354 /* Negative Lookahead/lookbehind
5355 In this case we can't do fixed string optimisation.
5358 SSize_t deltanext, minnext, fake = 0;
5363 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5365 data_fake.whilem_c = data->whilem_c;
5366 data_fake.last_closep = data->last_closep;
5369 data_fake.last_closep = &fake;
5370 data_fake.pos_delta = delta;
5371 if ( flags & SCF_DO_STCLASS && !scan->flags
5372 && OP(scan) == IFMATCH ) { /* Lookahead */
5373 ssc_init(pRExC_state, &intrnl);
5374 data_fake.start_class = &intrnl;
5375 f |= SCF_DO_STCLASS_AND;
5377 if (flags & SCF_WHILEM_VISITED_POS)
5378 f |= SCF_WHILEM_VISITED_POS;
5379 next = regnext(scan);
5380 nscan = NEXTOPER(NEXTOPER(scan));
5381 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5382 last, &data_fake, stopparen,
5383 recursed_depth, NULL, f, depth+1);
5386 FAIL("Variable length lookbehind not implemented");
5388 else if (minnext > (I32)U8_MAX) {
5389 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5392 scan->flags = (U8)minnext;
5395 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5397 if (data_fake.flags & SF_HAS_EVAL)
5398 data->flags |= SF_HAS_EVAL;
5399 data->whilem_c = data_fake.whilem_c;
5401 if (f & SCF_DO_STCLASS_AND) {
5402 if (flags & SCF_DO_STCLASS_OR) {
5403 /* OR before, AND after: ideally we would recurse with
5404 * data_fake to get the AND applied by study of the
5405 * remainder of the pattern, and then derecurse;
5406 * *** HACK *** for now just treat as "no information".
5407 * See [perl #56690].
5409 ssc_init(pRExC_state, data->start_class);
5411 /* AND before and after: combine and continue. These
5412 * assertions are zero-length, so can match an EMPTY
5414 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5415 ANYOF_FLAGS(data->start_class)
5416 |= SSC_MATCHES_EMPTY_STRING;
5420 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5422 /* Positive Lookahead/lookbehind
5423 In this case we can do fixed string optimisation,
5424 but we must be careful about it. Note in the case of
5425 lookbehind the positions will be offset by the minimum
5426 length of the pattern, something we won't know about
5427 until after the recurse.
5429 SSize_t deltanext, fake = 0;
5433 /* We use SAVEFREEPV so that when the full compile
5434 is finished perl will clean up the allocated
5435 minlens when it's all done. This way we don't
5436 have to worry about freeing them when we know
5437 they wont be used, which would be a pain.
5440 Newx( minnextp, 1, SSize_t );
5441 SAVEFREEPV(minnextp);
5444 StructCopy(data, &data_fake, scan_data_t);
5445 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5448 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5449 data_fake.last_found=newSVsv(data->last_found);
5453 data_fake.last_closep = &fake;
5454 data_fake.flags = 0;
5455 data_fake.pos_delta = delta;
5457 data_fake.flags |= SF_IS_INF;
5458 if ( flags & SCF_DO_STCLASS && !scan->flags
5459 && OP(scan) == IFMATCH ) { /* Lookahead */
5460 ssc_init(pRExC_state, &intrnl);
5461 data_fake.start_class = &intrnl;
5462 f |= SCF_DO_STCLASS_AND;
5464 if (flags & SCF_WHILEM_VISITED_POS)
5465 f |= SCF_WHILEM_VISITED_POS;
5466 next = regnext(scan);
5467 nscan = NEXTOPER(NEXTOPER(scan));
5469 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5470 &deltanext, last, &data_fake,
5471 stopparen, recursed_depth, NULL,
5475 FAIL("Variable length lookbehind not implemented");
5477 else if (*minnextp > (I32)U8_MAX) {
5478 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5481 scan->flags = (U8)*minnextp;
5486 if (f & SCF_DO_STCLASS_AND) {
5487 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5488 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5491 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5493 if (data_fake.flags & SF_HAS_EVAL)
5494 data->flags |= SF_HAS_EVAL;
5495 data->whilem_c = data_fake.whilem_c;
5496 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5497 if (RExC_rx->minlen<*minnextp)
5498 RExC_rx->minlen=*minnextp;
5499 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5500 SvREFCNT_dec_NN(data_fake.last_found);
5502 if ( data_fake.minlen_fixed != minlenp )
5504 data->offset_fixed= data_fake.offset_fixed;
5505 data->minlen_fixed= data_fake.minlen_fixed;
5506 data->lookbehind_fixed+= scan->flags;
5508 if ( data_fake.minlen_float != minlenp )
5510 data->minlen_float= data_fake.minlen_float;
5511 data->offset_float_min=data_fake.offset_float_min;
5512 data->offset_float_max=data_fake.offset_float_max;
5513 data->lookbehind_float+= scan->flags;
5520 else if (OP(scan) == OPEN) {
5521 if (stopparen != (I32)ARG(scan))
5524 else if (OP(scan) == CLOSE) {
5525 if (stopparen == (I32)ARG(scan)) {
5528 if ((I32)ARG(scan) == is_par) {
5529 next = regnext(scan);
5531 if ( next && (OP(next) != WHILEM) && next < last)
5532 is_par = 0; /* Disable optimization */
5535 *(data->last_closep) = ARG(scan);
5537 else if (OP(scan) == EVAL) {
5539 data->flags |= SF_HAS_EVAL;
5541 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5542 if (flags & SCF_DO_SUBSTR) {
5543 scan_commit(pRExC_state, data, minlenp, is_inf);
5544 flags &= ~SCF_DO_SUBSTR;
5546 if (data && OP(scan)==ACCEPT) {
5547 data->flags |= SCF_SEEN_ACCEPT;
5552 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5554 if (flags & SCF_DO_SUBSTR) {
5555 scan_commit(pRExC_state, data, minlenp, is_inf);
5556 data->longest = &(data->longest_float);
5558 is_inf = is_inf_internal = 1;
5559 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5560 ssc_anything(data->start_class);
5561 flags &= ~SCF_DO_STCLASS;
5563 else if (OP(scan) == GPOS) {
5564 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5565 !(delta || is_inf || (data && data->pos_delta)))
5567 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5568 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5569 if (RExC_rx->gofs < (STRLEN)min)
5570 RExC_rx->gofs = min;
5572 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5576 #ifdef TRIE_STUDY_OPT
5577 #ifdef FULL_TRIE_STUDY
5578 else if (PL_regkind[OP(scan)] == TRIE) {
5579 /* NOTE - There is similar code to this block above for handling
5580 BRANCH nodes on the initial study. If you change stuff here
5582 regnode *trie_node= scan;
5583 regnode *tail= regnext(scan);
5584 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5585 SSize_t max1 = 0, min1 = SSize_t_MAX;
5588 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5589 /* Cannot merge strings after this. */
5590 scan_commit(pRExC_state, data, minlenp, is_inf);
5592 if (flags & SCF_DO_STCLASS)
5593 ssc_init_zero(pRExC_state, &accum);
5599 const regnode *nextbranch= NULL;
5602 for ( word=1 ; word <= trie->wordcount ; word++)
5604 SSize_t deltanext=0, minnext=0, f = 0, fake;
5605 regnode_ssc this_class;
5607 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5609 data_fake.whilem_c = data->whilem_c;
5610 data_fake.last_closep = data->last_closep;
5613 data_fake.last_closep = &fake;
5614 data_fake.pos_delta = delta;
5615 if (flags & SCF_DO_STCLASS) {
5616 ssc_init(pRExC_state, &this_class);
5617 data_fake.start_class = &this_class;
5618 f = SCF_DO_STCLASS_AND;
5620 if (flags & SCF_WHILEM_VISITED_POS)
5621 f |= SCF_WHILEM_VISITED_POS;
5623 if (trie->jump[word]) {
5625 nextbranch = trie_node + trie->jump[0];
5626 scan= trie_node + trie->jump[word];
5627 /* We go from the jump point to the branch that follows
5628 it. Note this means we need the vestigal unused
5629 branches even though they arent otherwise used. */
5630 minnext = study_chunk(pRExC_state, &scan, minlenp,
5631 &deltanext, (regnode *)nextbranch, &data_fake,
5632 stopparen, recursed_depth, NULL, f,depth+1);
5634 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5635 nextbranch= regnext((regnode*)nextbranch);
5637 if (min1 > (SSize_t)(minnext + trie->minlen))
5638 min1 = minnext + trie->minlen;
5639 if (deltanext == SSize_t_MAX) {
5640 is_inf = is_inf_internal = 1;
5642 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5643 max1 = minnext + deltanext + trie->maxlen;
5645 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5647 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5648 if ( stopmin > min + min1)
5649 stopmin = min + min1;
5650 flags &= ~SCF_DO_SUBSTR;
5652 data->flags |= SCF_SEEN_ACCEPT;
5655 if (data_fake.flags & SF_HAS_EVAL)
5656 data->flags |= SF_HAS_EVAL;
5657 data->whilem_c = data_fake.whilem_c;
5659 if (flags & SCF_DO_STCLASS)
5660 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5663 if (flags & SCF_DO_SUBSTR) {
5664 data->pos_min += min1;
5665 data->pos_delta += max1 - min1;
5666 if (max1 != min1 || is_inf)
5667 data->longest = &(data->longest_float);
5670 if (delta != SSize_t_MAX)
5671 delta += max1 - min1;
5672 if (flags & SCF_DO_STCLASS_OR) {
5673 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5675 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5676 flags &= ~SCF_DO_STCLASS;
5679 else if (flags & SCF_DO_STCLASS_AND) {
5681 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5682 flags &= ~SCF_DO_STCLASS;
5685 /* Switch to OR mode: cache the old value of
5686 * data->start_class */
5688 StructCopy(data->start_class, and_withp, regnode_ssc);
5689 flags &= ~SCF_DO_STCLASS_AND;
5690 StructCopy(&accum, data->start_class, regnode_ssc);
5691 flags |= SCF_DO_STCLASS_OR;
5698 else if (PL_regkind[OP(scan)] == TRIE) {
5699 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5702 min += trie->minlen;
5703 delta += (trie->maxlen - trie->minlen);
5704 flags &= ~SCF_DO_STCLASS; /* xxx */
5705 if (flags & SCF_DO_SUBSTR) {
5706 /* Cannot expect anything... */
5707 scan_commit(pRExC_state, data, minlenp, is_inf);
5708 data->pos_min += trie->minlen;
5709 data->pos_delta += (trie->maxlen - trie->minlen);
5710 if (trie->maxlen != trie->minlen)
5711 data->longest = &(data->longest_float);
5713 if (trie->jump) /* no more substrings -- for now /grr*/
5714 flags &= ~SCF_DO_SUBSTR;
5716 #endif /* old or new */
5717 #endif /* TRIE_STUDY_OPT */
5719 /* Else: zero-length, ignore. */
5720 scan = regnext(scan);
5722 /* If we are exiting a recursion we can unset its recursed bit
5723 * and allow ourselves to enter it again - no danger of an
5724 * infinite loop there.
5725 if (stopparen > -1 && recursed) {
5726 DEBUG_STUDYDATA("unset:", data,depth);
5727 PAREN_UNSET( recursed, stopparen);
5733 DEBUG_STUDYDATA("frame-end:",data,depth);
5734 DEBUG_PEEP("fend", scan, depth);
5736 /* restore previous context */
5737 last = frame->last_regnode;
5738 scan = frame->next_regnode;
5739 stopparen = frame->stopparen;
5740 recursed_depth = frame->prev_recursed_depth;
5742 RExC_frame_last = frame->prev_frame;
5743 frame = frame->this_prev_frame;
5744 goto fake_study_recurse;
5749 DEBUG_STUDYDATA("pre-fin:",data,depth);
5752 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5754 if (flags & SCF_DO_SUBSTR && is_inf)
5755 data->pos_delta = SSize_t_MAX - data->pos_min;
5756 if (is_par > (I32)U8_MAX)
5758 if (is_par && pars==1 && data) {
5759 data->flags |= SF_IN_PAR;
5760 data->flags &= ~SF_HAS_PAR;
5762 else if (pars && data) {
5763 data->flags |= SF_HAS_PAR;
5764 data->flags &= ~SF_IN_PAR;
5766 if (flags & SCF_DO_STCLASS_OR)
5767 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5768 if (flags & SCF_TRIE_RESTUDY)
5769 data->flags |= SCF_TRIE_RESTUDY;
5771 DEBUG_STUDYDATA("post-fin:",data,depth);
5774 SSize_t final_minlen= min < stopmin ? min : stopmin;
5776 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5777 if (final_minlen > SSize_t_MAX - delta)
5778 RExC_maxlen = SSize_t_MAX;
5779 else if (RExC_maxlen < final_minlen + delta)
5780 RExC_maxlen = final_minlen + delta;
5782 return final_minlen;
5784 NOT_REACHED; /* NOTREACHED */
5788 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5790 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5792 PERL_ARGS_ASSERT_ADD_DATA;
5794 Renewc(RExC_rxi->data,
5795 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5796 char, struct reg_data);
5798 Renew(RExC_rxi->data->what, count + n, U8);
5800 Newx(RExC_rxi->data->what, n, U8);
5801 RExC_rxi->data->count = count + n;
5802 Copy(s, RExC_rxi->data->what + count, n, U8);
5806 /*XXX: todo make this not included in a non debugging perl, but appears to be
5807 * used anyway there, in 'use re' */
5808 #ifndef PERL_IN_XSUB_RE
5810 Perl_reginitcolors(pTHX)
5812 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5814 char *t = savepv(s);
5818 t = strchr(t, '\t');
5824 PL_colors[i] = t = (char *)"";
5829 PL_colors[i++] = (char *)"";
5836 #ifdef TRIE_STUDY_OPT
5837 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5840 (data.flags & SCF_TRIE_RESTUDY) \
5848 #define CHECK_RESTUDY_GOTO_butfirst
5852 * pregcomp - compile a regular expression into internal code
5854 * Decides which engine's compiler to call based on the hint currently in
5858 #ifndef PERL_IN_XSUB_RE
5860 /* return the currently in-scope regex engine (or the default if none) */
5862 regexp_engine const *
5863 Perl_current_re_engine(pTHX)
5865 if (IN_PERL_COMPILETIME) {
5866 HV * const table = GvHV(PL_hintgv);
5869 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5870 return &PL_core_reg_engine;
5871 ptr = hv_fetchs(table, "regcomp", FALSE);
5872 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5873 return &PL_core_reg_engine;
5874 return INT2PTR(regexp_engine*,SvIV(*ptr));
5878 if (!PL_curcop->cop_hints_hash)
5879 return &PL_core_reg_engine;
5880 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5881 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5882 return &PL_core_reg_engine;
5883 return INT2PTR(regexp_engine*,SvIV(ptr));
5889 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5891 regexp_engine const *eng = current_re_engine();
5892 GET_RE_DEBUG_FLAGS_DECL;
5894 PERL_ARGS_ASSERT_PREGCOMP;
5896 /* Dispatch a request to compile a regexp to correct regexp engine. */
5898 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5901 return CALLREGCOMP_ENG(eng, pattern, flags);
5905 /* public(ish) entry point for the perl core's own regex compiling code.
5906 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5907 * pattern rather than a list of OPs, and uses the internal engine rather
5908 * than the current one */
5911 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5913 SV *pat = pattern; /* defeat constness! */
5914 PERL_ARGS_ASSERT_RE_COMPILE;
5915 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5916 #ifdef PERL_IN_XSUB_RE
5919 &PL_core_reg_engine,
5921 NULL, NULL, rx_flags, 0);
5925 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5926 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5927 * point to the realloced string and length.
5929 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5933 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5934 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5936 U8 *const src = (U8*)*pat_p;
5941 GET_RE_DEBUG_FLAGS_DECL;
5943 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5944 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5946 Newx(dst, *plen_p * 2 + 1, U8);
5949 while (s < *plen_p) {
5950 append_utf8_from_native_byte(src[s], &d);
5951 if (n < num_code_blocks) {
5952 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5953 pRExC_state->code_blocks[n].start = d - dst - 1;
5954 assert(*(d - 1) == '(');
5957 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5958 pRExC_state->code_blocks[n].end = d - dst - 1;
5959 assert(*(d - 1) == ')');
5968 *pat_p = (char*) dst;
5970 RExC_orig_utf8 = RExC_utf8 = 1;
5975 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5976 * while recording any code block indices, and handling overloading,
5977 * nested qr// objects etc. If pat is null, it will allocate a new
5978 * string, or just return the first arg, if there's only one.
5980 * Returns the malloced/updated pat.
5981 * patternp and pat_count is the array of SVs to be concatted;
5982 * oplist is the optional list of ops that generated the SVs;
5983 * recompile_p is a pointer to a boolean that will be set if
5984 * the regex will need to be recompiled.
5985 * delim, if non-null is an SV that will be inserted between each element
5989 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5990 SV *pat, SV ** const patternp, int pat_count,
5991 OP *oplist, bool *recompile_p, SV *delim)
5995 bool use_delim = FALSE;
5996 bool alloced = FALSE;
5998 /* if we know we have at least two args, create an empty string,
5999 * then concatenate args to that. For no args, return an empty string */
6000 if (!pat && pat_count != 1) {
6006 for (svp = patternp; svp < patternp + pat_count; svp++) {
6009 STRLEN orig_patlen = 0;
6011 SV *msv = use_delim ? delim : *svp;
6012 if (!msv) msv = &PL_sv_undef;
6014 /* if we've got a delimiter, we go round the loop twice for each
6015 * svp slot (except the last), using the delimiter the second
6024 if (SvTYPE(msv) == SVt_PVAV) {
6025 /* we've encountered an interpolated array within
6026 * the pattern, e.g. /...@a..../. Expand the list of elements,
6027 * then recursively append elements.
6028 * The code in this block is based on S_pushav() */
6030 AV *const av = (AV*)msv;
6031 const SSize_t maxarg = AvFILL(av) + 1;
6035 assert(oplist->op_type == OP_PADAV
6036 || oplist->op_type == OP_RV2AV);
6037 oplist = OpSIBLING(oplist);
6040 if (SvRMAGICAL(av)) {
6043 Newx(array, maxarg, SV*);
6045 for (i=0; i < maxarg; i++) {
6046 SV ** const svp = av_fetch(av, i, FALSE);
6047 array[i] = svp ? *svp : &PL_sv_undef;
6051 array = AvARRAY(av);
6053 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6054 array, maxarg, NULL, recompile_p,
6056 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6062 /* we make the assumption here that each op in the list of
6063 * op_siblings maps to one SV pushed onto the stack,
6064 * except for code blocks, with have both an OP_NULL and
6066 * This allows us to match up the list of SVs against the
6067 * list of OPs to find the next code block.
6069 * Note that PUSHMARK PADSV PADSV ..
6071 * PADRANGE PADSV PADSV ..
6072 * so the alignment still works. */
6075 if (oplist->op_type == OP_NULL
6076 && (oplist->op_flags & OPf_SPECIAL))
6078 assert(n < pRExC_state->num_code_blocks);
6079 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
6080 pRExC_state->code_blocks[n].block = oplist;
6081 pRExC_state->code_blocks[n].src_regex = NULL;
6084 oplist = OpSIBLING(oplist); /* skip CONST */
6087 oplist = OpSIBLING(oplist);;
6090 /* apply magic and QR overloading to arg */
6093 if (SvROK(msv) && SvAMAGIC(msv)) {
6094 SV *sv = AMG_CALLunary(msv, regexp_amg);
6098 if (SvTYPE(sv) != SVt_REGEXP)
6099 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6104 /* try concatenation overload ... */
6105 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6106 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6109 /* overloading involved: all bets are off over literal
6110 * code. Pretend we haven't seen it */
6111 pRExC_state->num_code_blocks -= n;
6115 /* ... or failing that, try "" overload */
6116 while (SvAMAGIC(msv)
6117 && (sv = AMG_CALLunary(msv, string_amg))
6121 && SvRV(msv) == SvRV(sv))
6126 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6130 /* this is a partially unrolled
6131 * sv_catsv_nomg(pat, msv);
6132 * that allows us to adjust code block indices if
6135 char *dst = SvPV_force_nomg(pat, dlen);
6137 if (SvUTF8(msv) && !SvUTF8(pat)) {
6138 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6139 sv_setpvn(pat, dst, dlen);
6142 sv_catsv_nomg(pat, msv);
6149 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6152 /* extract any code blocks within any embedded qr//'s */
6153 if (rx && SvTYPE(rx) == SVt_REGEXP
6154 && RX_ENGINE((REGEXP*)rx)->op_comp)
6157 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6158 if (ri->num_code_blocks) {
6160 /* the presence of an embedded qr// with code means
6161 * we should always recompile: the text of the
6162 * qr// may not have changed, but it may be a
6163 * different closure than last time */
6165 Renew(pRExC_state->code_blocks,
6166 pRExC_state->num_code_blocks + ri->num_code_blocks,
6167 struct reg_code_block);
6168 pRExC_state->num_code_blocks += ri->num_code_blocks;
6170 for (i=0; i < ri->num_code_blocks; i++) {
6171 struct reg_code_block *src, *dst;
6172 STRLEN offset = orig_patlen
6173 + ReANY((REGEXP *)rx)->pre_prefix;
6174 assert(n < pRExC_state->num_code_blocks);
6175 src = &ri->code_blocks[i];
6176 dst = &pRExC_state->code_blocks[n];
6177 dst->start = src->start + offset;
6178 dst->end = src->end + offset;
6179 dst->block = src->block;
6180 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6189 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6198 /* see if there are any run-time code blocks in the pattern.
6199 * False positives are allowed */
6202 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6203 char *pat, STRLEN plen)
6208 PERL_UNUSED_CONTEXT;
6210 for (s = 0; s < plen; s++) {
6211 if (n < pRExC_state->num_code_blocks
6212 && s == pRExC_state->code_blocks[n].start)
6214 s = pRExC_state->code_blocks[n].end;
6218 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6220 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6222 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6229 /* Handle run-time code blocks. We will already have compiled any direct
6230 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6231 * copy of it, but with any literal code blocks blanked out and
6232 * appropriate chars escaped; then feed it into
6234 * eval "qr'modified_pattern'"
6238 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6242 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6244 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6245 * and merge them with any code blocks of the original regexp.
6247 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6248 * instead, just save the qr and return FALSE; this tells our caller that
6249 * the original pattern needs upgrading to utf8.
6253 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6254 char *pat, STRLEN plen)
6258 GET_RE_DEBUG_FLAGS_DECL;
6260 if (pRExC_state->runtime_code_qr) {
6261 /* this is the second time we've been called; this should
6262 * only happen if the main pattern got upgraded to utf8
6263 * during compilation; re-use the qr we compiled first time
6264 * round (which should be utf8 too)
6266 qr = pRExC_state->runtime_code_qr;
6267 pRExC_state->runtime_code_qr = NULL;
6268 assert(RExC_utf8 && SvUTF8(qr));
6274 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6278 /* determine how many extra chars we need for ' and \ escaping */
6279 for (s = 0; s < plen; s++) {
6280 if (pat[s] == '\'' || pat[s] == '\\')
6284 Newx(newpat, newlen, char);
6286 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6288 for (s = 0; s < plen; s++) {
6289 if (n < pRExC_state->num_code_blocks
6290 && s == pRExC_state->code_blocks[n].start)
6292 /* blank out literal code block */
6293 assert(pat[s] == '(');
6294 while (s <= pRExC_state->code_blocks[n].end) {
6302 if (pat[s] == '\'' || pat[s] == '\\')
6307 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6311 PerlIO_printf(Perl_debug_log,
6312 "%sre-parsing pattern for runtime code:%s %s\n",
6313 PL_colors[4],PL_colors[5],newpat);
6316 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6322 PUSHSTACKi(PERLSI_REQUIRE);
6323 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6324 * parsing qr''; normally only q'' does this. It also alters
6326 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6327 SvREFCNT_dec_NN(sv);
6332 SV * const errsv = ERRSV;
6333 if (SvTRUE_NN(errsv))
6335 Safefree(pRExC_state->code_blocks);
6336 /* use croak_sv ? */
6337 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6340 assert(SvROK(qr_ref));
6342 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6343 /* the leaving below frees the tmp qr_ref.
6344 * Give qr a life of its own */
6352 if (!RExC_utf8 && SvUTF8(qr)) {
6353 /* first time through; the pattern got upgraded; save the
6354 * qr for the next time through */
6355 assert(!pRExC_state->runtime_code_qr);
6356 pRExC_state->runtime_code_qr = qr;
6361 /* extract any code blocks within the returned qr// */
6364 /* merge the main (r1) and run-time (r2) code blocks into one */
6366 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6367 struct reg_code_block *new_block, *dst;
6368 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6371 if (!r2->num_code_blocks) /* we guessed wrong */
6373 SvREFCNT_dec_NN(qr);
6378 r1->num_code_blocks + r2->num_code_blocks,
6379 struct reg_code_block);
6382 while ( i1 < r1->num_code_blocks
6383 || i2 < r2->num_code_blocks)
6385 struct reg_code_block *src;
6388 if (i1 == r1->num_code_blocks) {
6389 src = &r2->code_blocks[i2++];
6392 else if (i2 == r2->num_code_blocks)
6393 src = &r1->code_blocks[i1++];
6394 else if ( r1->code_blocks[i1].start
6395 < r2->code_blocks[i2].start)
6397 src = &r1->code_blocks[i1++];
6398 assert(src->end < r2->code_blocks[i2].start);
6401 assert( r1->code_blocks[i1].start
6402 > r2->code_blocks[i2].start);
6403 src = &r2->code_blocks[i2++];
6405 assert(src->end < r1->code_blocks[i1].start);
6408 assert(pat[src->start] == '(');
6409 assert(pat[src->end] == ')');
6410 dst->start = src->start;
6411 dst->end = src->end;
6412 dst->block = src->block;
6413 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6417 r1->num_code_blocks += r2->num_code_blocks;
6418 Safefree(r1->code_blocks);
6419 r1->code_blocks = new_block;
6422 SvREFCNT_dec_NN(qr);
6428 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6429 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6430 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6431 STRLEN longest_length, bool eol, bool meol)
6433 /* This is the common code for setting up the floating and fixed length
6434 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6435 * as to whether succeeded or not */
6440 if (! (longest_length
6441 || (eol /* Can't have SEOL and MULTI */
6442 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6444 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6445 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6450 /* copy the information about the longest from the reg_scan_data
6451 over to the program. */
6452 if (SvUTF8(sv_longest)) {
6453 *rx_utf8 = sv_longest;
6456 *rx_substr = sv_longest;
6459 /* end_shift is how many chars that must be matched that
6460 follow this item. We calculate it ahead of time as once the
6461 lookbehind offset is added in we lose the ability to correctly
6463 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6464 *rx_end_shift = ml - offset
6465 - longest_length + (SvTAIL(sv_longest) != 0)
6468 t = (eol/* Can't have SEOL and MULTI */
6469 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6470 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6476 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6477 * regular expression into internal code.
6478 * The pattern may be passed either as:
6479 * a list of SVs (patternp plus pat_count)
6480 * a list of OPs (expr)
6481 * If both are passed, the SV list is used, but the OP list indicates
6482 * which SVs are actually pre-compiled code blocks
6484 * The SVs in the list have magic and qr overloading applied to them (and
6485 * the list may be modified in-place with replacement SVs in the latter
6488 * If the pattern hasn't changed from old_re, then old_re will be
6491 * eng is the current engine. If that engine has an op_comp method, then
6492 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6493 * do the initial concatenation of arguments and pass on to the external
6496 * If is_bare_re is not null, set it to a boolean indicating whether the
6497 * arg list reduced (after overloading) to a single bare regex which has
6498 * been returned (i.e. /$qr/).
6500 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6502 * pm_flags contains the PMf_* flags, typically based on those from the
6503 * pm_flags field of the related PMOP. Currently we're only interested in
6504 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6506 * We can't allocate space until we know how big the compiled form will be,
6507 * but we can't compile it (and thus know how big it is) until we've got a
6508 * place to put the code. So we cheat: we compile it twice, once with code
6509 * generation turned off and size counting turned on, and once "for real".
6510 * This also means that we don't allocate space until we are sure that the
6511 * thing really will compile successfully, and we never have to move the
6512 * code and thus invalidate pointers into it. (Note that it has to be in
6513 * one piece because free() must be able to free it all.) [NB: not true in perl]
6515 * Beware that the optimization-preparation code in here knows about some
6516 * of the structure of the compiled regexp. [I'll say.]
6520 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6521 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6522 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6526 regexp_internal *ri;
6534 SV *code_blocksv = NULL;
6535 SV** new_patternp = patternp;
6537 /* these are all flags - maybe they should be turned
6538 * into a single int with different bit masks */
6539 I32 sawlookahead = 0;
6544 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6546 bool runtime_code = 0;
6548 RExC_state_t RExC_state;
6549 RExC_state_t * const pRExC_state = &RExC_state;
6550 #ifdef TRIE_STUDY_OPT
6552 RExC_state_t copyRExC_state;
6554 GET_RE_DEBUG_FLAGS_DECL;
6556 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6558 DEBUG_r(if (!PL_colorset) reginitcolors());
6560 /* Initialize these here instead of as-needed, as is quick and avoids
6561 * having to test them each time otherwise */
6562 if (! PL_AboveLatin1) {
6563 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6564 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6565 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6566 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6567 PL_HasMultiCharFold =
6568 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6570 /* This is calculated here, because the Perl program that generates the
6571 * static global ones doesn't currently have access to
6572 * NUM_ANYOF_CODE_POINTS */
6573 PL_InBitmap = _new_invlist(2);
6574 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6575 NUM_ANYOF_CODE_POINTS - 1);
6578 pRExC_state->code_blocks = NULL;
6579 pRExC_state->num_code_blocks = 0;
6582 *is_bare_re = FALSE;
6584 if (expr && (expr->op_type == OP_LIST ||
6585 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6586 /* allocate code_blocks if needed */
6590 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6591 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6592 ncode++; /* count of DO blocks */
6594 pRExC_state->num_code_blocks = ncode;
6595 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6600 /* compile-time pattern with just OP_CONSTs and DO blocks */
6605 /* find how many CONSTs there are */
6608 if (expr->op_type == OP_CONST)
6611 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6612 if (o->op_type == OP_CONST)
6616 /* fake up an SV array */
6618 assert(!new_patternp);
6619 Newx(new_patternp, n, SV*);
6620 SAVEFREEPV(new_patternp);
6624 if (expr->op_type == OP_CONST)
6625 new_patternp[n] = cSVOPx_sv(expr);
6627 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6628 if (o->op_type == OP_CONST)
6629 new_patternp[n++] = cSVOPo_sv;
6634 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6635 "Assembling pattern from %d elements%s\n", pat_count,
6636 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6638 /* set expr to the first arg op */
6640 if (pRExC_state->num_code_blocks
6641 && expr->op_type != OP_CONST)
6643 expr = cLISTOPx(expr)->op_first;
6644 assert( expr->op_type == OP_PUSHMARK
6645 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6646 || expr->op_type == OP_PADRANGE);
6647 expr = OpSIBLING(expr);
6650 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6651 expr, &recompile, NULL);
6653 /* handle bare (possibly after overloading) regex: foo =~ $re */
6658 if (SvTYPE(re) == SVt_REGEXP) {
6662 Safefree(pRExC_state->code_blocks);
6663 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6664 "Precompiled pattern%s\n",
6665 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6671 exp = SvPV_nomg(pat, plen);
6673 if (!eng->op_comp) {
6674 if ((SvUTF8(pat) && IN_BYTES)
6675 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6677 /* make a temporary copy; either to convert to bytes,
6678 * or to avoid repeating get-magic / overloaded stringify */
6679 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6680 (IN_BYTES ? 0 : SvUTF8(pat)));
6682 Safefree(pRExC_state->code_blocks);
6683 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6686 /* ignore the utf8ness if the pattern is 0 length */
6687 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6689 RExC_uni_semantics = 0;
6690 RExC_seen_unfolded_sharp_s = 0;
6691 RExC_contains_locale = 0;
6692 RExC_contains_i = 0;
6693 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6694 pRExC_state->runtime_code_qr = NULL;
6695 RExC_frame_head= NULL;
6696 RExC_frame_last= NULL;
6697 RExC_frame_count= 0;
6700 RExC_mysv1= sv_newmortal();
6701 RExC_mysv2= sv_newmortal();
6704 SV *dsv= sv_newmortal();
6705 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6706 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6707 PL_colors[4],PL_colors[5],s);
6711 /* we jump here if we have to recompile, e.g., from upgrading the pattern
6714 if ((pm_flags & PMf_USE_RE_EVAL)
6715 /* this second condition covers the non-regex literal case,
6716 * i.e. $foo =~ '(?{})'. */
6717 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6719 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6721 /* return old regex if pattern hasn't changed */
6722 /* XXX: note in the below we have to check the flags as well as the
6725 * Things get a touch tricky as we have to compare the utf8 flag
6726 * independently from the compile flags. */
6730 && !!RX_UTF8(old_re) == !!RExC_utf8
6731 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6732 && RX_PRECOMP(old_re)
6733 && RX_PRELEN(old_re) == plen
6734 && memEQ(RX_PRECOMP(old_re), exp, plen)
6735 && !runtime_code /* with runtime code, always recompile */ )
6737 Safefree(pRExC_state->code_blocks);
6741 rx_flags = orig_rx_flags;
6743 if (rx_flags & PMf_FOLD) {
6744 RExC_contains_i = 1;
6746 if ( initial_charset == REGEX_DEPENDS_CHARSET
6747 && (RExC_utf8 ||RExC_uni_semantics))
6750 /* Set to use unicode semantics if the pattern is in utf8 and has the
6751 * 'depends' charset specified, as it means unicode when utf8 */
6752 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6756 RExC_precomp_adj = 0;
6757 RExC_flags = rx_flags;
6758 RExC_pm_flags = pm_flags;
6761 assert(TAINTING_get || !TAINT_get);
6763 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6765 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6766 /* whoops, we have a non-utf8 pattern, whilst run-time code
6767 * got compiled as utf8. Try again with a utf8 pattern */
6768 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6769 pRExC_state->num_code_blocks);
6770 goto redo_first_pass;
6773 assert(!pRExC_state->runtime_code_qr);
6779 RExC_in_lookbehind = 0;
6780 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6782 RExC_override_recoding = 0;
6784 RExC_recode_x_to_native = 0;
6786 RExC_in_multi_char_class = 0;
6788 /* First pass: determine size, legality. */
6790 RExC_start = RExC_adjusted_start = exp;
6791 RExC_end = exp + plen;
6792 RExC_precomp_end = RExC_end;
6797 RExC_emit = (regnode *) &RExC_emit_dummy;
6798 RExC_whilem_seen = 0;
6799 RExC_open_parens = NULL;
6800 RExC_close_parens = NULL;
6802 RExC_paren_names = NULL;
6804 RExC_paren_name_list = NULL;
6806 RExC_recurse = NULL;
6807 RExC_study_chunk_recursed = NULL;
6808 RExC_study_chunk_recursed_bytes= 0;
6809 RExC_recurse_count = 0;
6810 pRExC_state->code_index = 0;
6812 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
6813 * code makes sure the final byte is an uncounted NUL. But should this
6814 * ever not be the case, lots of things could read beyond the end of the
6815 * buffer: loops like
6816 * while(isFOO(*RExC_parse)) RExC_parse++;
6817 * strchr(RExC_parse, "foo");
6818 * etc. So it is worth noting. */
6819 assert(*RExC_end == '\0');
6822 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6824 RExC_lastparse=NULL;
6826 /* reg may croak on us, not giving us a chance to free
6827 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6828 need it to survive as long as the regexp (qr/(?{})/).
6829 We must check that code_blocksv is not already set, because we may
6830 have jumped back to restart the sizing pass. */
6831 if (pRExC_state->code_blocks && !code_blocksv) {
6832 code_blocksv = newSV_type(SVt_PV);
6833 SAVEFREESV(code_blocksv);
6834 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6835 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6837 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6838 /* It's possible to write a regexp in ascii that represents Unicode
6839 codepoints outside of the byte range, such as via \x{100}. If we
6840 detect such a sequence we have to convert the entire pattern to utf8
6841 and then recompile, as our sizing calculation will have been based
6842 on 1 byte == 1 character, but we will need to use utf8 to encode
6843 at least some part of the pattern, and therefore must convert the whole
6846 if (flags & RESTART_PASS1) {
6847 if (flags & NEED_UTF8) {
6848 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6849 pRExC_state->num_code_blocks);
6852 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6853 "Need to redo pass 1\n"));
6856 goto redo_first_pass;
6858 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6861 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6864 PerlIO_printf(Perl_debug_log,
6865 "Required size %"IVdf" nodes\n"
6866 "Starting second pass (creation)\n",
6869 RExC_lastparse=NULL;
6872 /* The first pass could have found things that force Unicode semantics */
6873 if ((RExC_utf8 || RExC_uni_semantics)
6874 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6876 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6879 /* Small enough for pointer-storage convention?
6880 If extralen==0, this means that we will not need long jumps. */
6881 if (RExC_size >= 0x10000L && RExC_extralen)
6882 RExC_size += RExC_extralen;
6885 if (RExC_whilem_seen > 15)
6886 RExC_whilem_seen = 15;
6888 /* Allocate space and zero-initialize. Note, the two step process
6889 of zeroing when in debug mode, thus anything assigned has to
6890 happen after that */
6891 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6893 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6894 char, regexp_internal);
6895 if ( r == NULL || ri == NULL )
6896 FAIL("Regexp out of space");
6898 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6899 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6902 /* bulk initialize base fields with 0. */
6903 Zero(ri, sizeof(regexp_internal), char);
6906 /* non-zero initialization begins here */
6909 r->extflags = rx_flags;
6910 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6912 if (pm_flags & PMf_IS_QR) {
6913 ri->code_blocks = pRExC_state->code_blocks;
6914 ri->num_code_blocks = pRExC_state->num_code_blocks;
6919 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6920 if (pRExC_state->code_blocks[n].src_regex)
6921 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6922 if(pRExC_state->code_blocks)
6923 SAVEFREEPV(pRExC_state->code_blocks); /* often null */
6927 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6928 bool has_charset = (get_regex_charset(r->extflags)
6929 != REGEX_DEPENDS_CHARSET);
6931 /* The caret is output if there are any defaults: if not all the STD
6932 * flags are set, or if no character set specifier is needed */
6934 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6936 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6937 == REG_RUN_ON_COMMENT_SEEN);
6938 U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
6939 >> RXf_PMf_STD_PMMOD_SHIFT);
6940 const char *fptr = STD_PAT_MODS; /*"msixn"*/
6943 /* We output all the necessary flags; we never output a minus, as all
6944 * those are defaults, so are
6945 * covered by the caret */
6946 const STRLEN wraplen = plen + has_p + has_runon
6947 + has_default /* If needs a caret */
6948 + PL_bitcount[reganch] /* 1 char for each set standard flag */
6950 /* If needs a character set specifier */
6951 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6952 + (sizeof("(?:)") - 1);
6954 /* make sure PL_bitcount bounds not exceeded */
6955 assert(sizeof(STD_PAT_MODS) <= 8);
6957 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6958 r->xpv_len_u.xpvlenu_pv = p;
6960 SvFLAGS(rx) |= SVf_UTF8;
6963 /* If a default, cover it using the caret */
6965 *p++= DEFAULT_PAT_MOD;
6969 const char* const name = get_regex_charset_name(r->extflags, &len);
6970 Copy(name, p, len, char);
6974 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6977 while((ch = *fptr++)) {
6985 Copy(RExC_precomp, p, plen, char);
6986 assert ((RX_WRAPPED(rx) - p) < 16);
6987 r->pre_prefix = p - RX_WRAPPED(rx);
6993 SvCUR_set(rx, p - RX_WRAPPED(rx));
6997 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6999 /* setup various meta data about recursion, this all requires
7000 * RExC_npar to be correctly set, and a bit later on we clear it */
7001 if (RExC_seen & REG_RECURSE_SEEN) {
7002 Newxz(RExC_open_parens, RExC_npar,regnode *);
7003 SAVEFREEPV(RExC_open_parens);
7004 Newxz(RExC_close_parens,RExC_npar,regnode *);
7005 SAVEFREEPV(RExC_close_parens);
7007 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
7008 /* Note, RExC_npar is 1 + the number of parens in a pattern.
7009 * So its 1 if there are no parens. */
7010 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
7011 ((RExC_npar & 0x07) != 0);
7012 Newx(RExC_study_chunk_recursed,
7013 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7014 SAVEFREEPV(RExC_study_chunk_recursed);
7017 /* Useful during FAIL. */
7018 #ifdef RE_TRACK_PATTERN_OFFSETS
7019 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
7020 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
7021 "%s %"UVuf" bytes for offset annotations.\n",
7022 ri->u.offsets ? "Got" : "Couldn't get",
7023 (UV)((2*RExC_size+1) * sizeof(U32))));
7025 SetProgLen(ri,RExC_size);
7030 /* Second pass: emit code. */
7031 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7032 RExC_pm_flags = pm_flags;
7034 RExC_end = exp + plen;
7037 RExC_emit_start = ri->program;
7038 RExC_emit = ri->program;
7039 RExC_emit_bound = ri->program + RExC_size + 1;
7040 pRExC_state->code_index = 0;
7042 *((char*) RExC_emit++) = (char) REG_MAGIC;
7043 if (reg(pRExC_state, 0, &flags,1) == NULL) {
7045 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
7047 /* XXXX To minimize changes to RE engine we always allocate
7048 3-units-long substrs field. */
7049 Newx(r->substrs, 1, struct reg_substr_data);
7050 if (RExC_recurse_count) {
7051 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
7052 SAVEFREEPV(RExC_recurse);
7056 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7058 RExC_study_chunk_recursed_count= 0;
7060 Zero(r->substrs, 1, struct reg_substr_data);
7061 if (RExC_study_chunk_recursed) {
7062 Zero(RExC_study_chunk_recursed,
7063 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
7067 #ifdef TRIE_STUDY_OPT
7069 StructCopy(&zero_scan_data, &data, scan_data_t);
7070 copyRExC_state = RExC_state;
7073 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
7075 RExC_state = copyRExC_state;
7076 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7077 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7079 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7080 StructCopy(&zero_scan_data, &data, scan_data_t);
7083 StructCopy(&zero_scan_data, &data, scan_data_t);
7086 /* Dig out information for optimizations. */
7087 r->extflags = RExC_flags; /* was pm_op */
7088 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7091 SvUTF8_on(rx); /* Unicode in it? */
7092 ri->regstclass = NULL;
7093 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7094 r->intflags |= PREGf_NAUGHTY;
7095 scan = ri->program + 1; /* First BRANCH. */
7097 /* testing for BRANCH here tells us whether there is "must appear"
7098 data in the pattern. If there is then we can use it for optimisations */
7099 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7102 STRLEN longest_float_length, longest_fixed_length;
7103 regnode_ssc ch_class; /* pointed to by data */
7105 SSize_t last_close = 0; /* pointed to by data */
7106 regnode *first= scan;
7107 regnode *first_next= regnext(first);
7109 * Skip introductions and multiplicators >= 1
7110 * so that we can extract the 'meat' of the pattern that must
7111 * match in the large if() sequence following.
7112 * NOTE that EXACT is NOT covered here, as it is normally
7113 * picked up by the optimiser separately.
7115 * This is unfortunate as the optimiser isnt handling lookahead
7116 * properly currently.
7119 while ((OP(first) == OPEN && (sawopen = 1)) ||
7120 /* An OR of *one* alternative - should not happen now. */
7121 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7122 /* for now we can't handle lookbehind IFMATCH*/
7123 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7124 (OP(first) == PLUS) ||
7125 (OP(first) == MINMOD) ||
7126 /* An {n,m} with n>0 */
7127 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7128 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7131 * the only op that could be a regnode is PLUS, all the rest
7132 * will be regnode_1 or regnode_2.
7134 * (yves doesn't think this is true)
7136 if (OP(first) == PLUS)
7139 if (OP(first) == MINMOD)
7141 first += regarglen[OP(first)];
7143 first = NEXTOPER(first);
7144 first_next= regnext(first);
7147 /* Starting-point info. */
7149 DEBUG_PEEP("first:",first,0);
7150 /* Ignore EXACT as we deal with it later. */
7151 if (PL_regkind[OP(first)] == EXACT) {
7152 if (OP(first) == EXACT || OP(first) == EXACTL)
7153 NOOP; /* Empty, get anchored substr later. */
7155 ri->regstclass = first;
7158 else if (PL_regkind[OP(first)] == TRIE &&
7159 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7161 /* this can happen only on restudy */
7162 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7165 else if (REGNODE_SIMPLE(OP(first)))
7166 ri->regstclass = first;
7167 else if (PL_regkind[OP(first)] == BOUND ||
7168 PL_regkind[OP(first)] == NBOUND)
7169 ri->regstclass = first;
7170 else if (PL_regkind[OP(first)] == BOL) {
7171 r->intflags |= (OP(first) == MBOL
7174 first = NEXTOPER(first);
7177 else if (OP(first) == GPOS) {
7178 r->intflags |= PREGf_ANCH_GPOS;
7179 first = NEXTOPER(first);
7182 else if ((!sawopen || !RExC_sawback) &&
7184 (OP(first) == STAR &&
7185 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7186 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7188 /* turn .* into ^.* with an implied $*=1 */
7190 (OP(NEXTOPER(first)) == REG_ANY)
7193 r->intflags |= (type | PREGf_IMPLICIT);
7194 first = NEXTOPER(first);
7197 if (sawplus && !sawminmod && !sawlookahead
7198 && (!sawopen || !RExC_sawback)
7199 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7200 /* x+ must match at the 1st pos of run of x's */
7201 r->intflags |= PREGf_SKIP;
7203 /* Scan is after the zeroth branch, first is atomic matcher. */
7204 #ifdef TRIE_STUDY_OPT
7207 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7208 (IV)(first - scan + 1))
7212 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7213 (IV)(first - scan + 1))
7219 * If there's something expensive in the r.e., find the
7220 * longest literal string that must appear and make it the
7221 * regmust. Resolve ties in favor of later strings, since
7222 * the regstart check works with the beginning of the r.e.
7223 * and avoiding duplication strengthens checking. Not a
7224 * strong reason, but sufficient in the absence of others.
7225 * [Now we resolve ties in favor of the earlier string if
7226 * it happens that c_offset_min has been invalidated, since the
7227 * earlier string may buy us something the later one won't.]
7230 data.longest_fixed = newSVpvs("");
7231 data.longest_float = newSVpvs("");
7232 data.last_found = newSVpvs("");
7233 data.longest = &(data.longest_fixed);
7234 ENTER_with_name("study_chunk");
7235 SAVEFREESV(data.longest_fixed);
7236 SAVEFREESV(data.longest_float);
7237 SAVEFREESV(data.last_found);
7239 if (!ri->regstclass) {
7240 ssc_init(pRExC_state, &ch_class);
7241 data.start_class = &ch_class;
7242 stclass_flag = SCF_DO_STCLASS_AND;
7243 } else /* XXXX Check for BOUND? */
7245 data.last_closep = &last_close;
7248 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7249 scan + RExC_size, /* Up to end */
7251 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7252 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7256 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7259 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7260 && data.last_start_min == 0 && data.last_end > 0
7261 && !RExC_seen_zerolen
7262 && !(RExC_seen & REG_VERBARG_SEEN)
7263 && !(RExC_seen & REG_GPOS_SEEN)
7265 r->extflags |= RXf_CHECK_ALL;
7267 scan_commit(pRExC_state, &data,&minlen,0);
7269 longest_float_length = CHR_SVLEN(data.longest_float);
7271 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7272 && data.offset_fixed == data.offset_float_min
7273 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7274 && S_setup_longest (aTHX_ pRExC_state,
7278 &(r->float_end_shift),
7279 data.lookbehind_float,
7280 data.offset_float_min,
7282 longest_float_length,
7283 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7284 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7286 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7287 r->float_max_offset = data.offset_float_max;
7288 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7289 r->float_max_offset -= data.lookbehind_float;
7290 SvREFCNT_inc_simple_void_NN(data.longest_float);
7293 r->float_substr = r->float_utf8 = NULL;
7294 longest_float_length = 0;
7297 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7299 if (S_setup_longest (aTHX_ pRExC_state,
7301 &(r->anchored_utf8),
7302 &(r->anchored_substr),
7303 &(r->anchored_end_shift),
7304 data.lookbehind_fixed,
7307 longest_fixed_length,
7308 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7309 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7311 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7312 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7315 r->anchored_substr = r->anchored_utf8 = NULL;
7316 longest_fixed_length = 0;
7318 LEAVE_with_name("study_chunk");
7321 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7322 ri->regstclass = NULL;
7324 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7326 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7327 && is_ssc_worth_it(pRExC_state, data.start_class))
7329 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7331 ssc_finalize(pRExC_state, data.start_class);
7333 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7334 StructCopy(data.start_class,
7335 (regnode_ssc*)RExC_rxi->data->data[n],
7337 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7338 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7339 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7340 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7341 PerlIO_printf(Perl_debug_log,
7342 "synthetic stclass \"%s\".\n",
7343 SvPVX_const(sv));});
7344 data.start_class = NULL;
7347 /* A temporary algorithm prefers floated substr to fixed one to dig
7349 if (longest_fixed_length > longest_float_length) {
7350 r->substrs->check_ix = 0;
7351 r->check_end_shift = r->anchored_end_shift;
7352 r->check_substr = r->anchored_substr;
7353 r->check_utf8 = r->anchored_utf8;
7354 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7355 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7356 r->intflags |= PREGf_NOSCAN;
7359 r->substrs->check_ix = 1;
7360 r->check_end_shift = r->float_end_shift;
7361 r->check_substr = r->float_substr;
7362 r->check_utf8 = r->float_utf8;
7363 r->check_offset_min = r->float_min_offset;
7364 r->check_offset_max = r->float_max_offset;
7366 if ((r->check_substr || r->check_utf8) ) {
7367 r->extflags |= RXf_USE_INTUIT;
7368 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7369 r->extflags |= RXf_INTUIT_TAIL;
7371 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7373 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7374 if ( (STRLEN)minlen < longest_float_length )
7375 minlen= longest_float_length;
7376 if ( (STRLEN)minlen < longest_fixed_length )
7377 minlen= longest_fixed_length;
7381 /* Several toplevels. Best we can is to set minlen. */
7383 regnode_ssc ch_class;
7384 SSize_t last_close = 0;
7386 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7388 scan = ri->program + 1;
7389 ssc_init(pRExC_state, &ch_class);
7390 data.start_class = &ch_class;
7391 data.last_closep = &last_close;
7394 minlen = study_chunk(pRExC_state,
7395 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7396 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7397 ? SCF_TRIE_DOING_RESTUDY
7401 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7403 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7404 = r->float_substr = r->float_utf8 = NULL;
7406 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7407 && is_ssc_worth_it(pRExC_state, data.start_class))
7409 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7411 ssc_finalize(pRExC_state, data.start_class);
7413 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7414 StructCopy(data.start_class,
7415 (regnode_ssc*)RExC_rxi->data->data[n],
7417 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7418 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7419 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7420 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7421 PerlIO_printf(Perl_debug_log,
7422 "synthetic stclass \"%s\".\n",
7423 SvPVX_const(sv));});
7424 data.start_class = NULL;
7428 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7429 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7430 r->maxlen = REG_INFTY;
7433 r->maxlen = RExC_maxlen;
7436 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7437 the "real" pattern. */
7439 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7440 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7442 r->minlenret = minlen;
7443 if (r->minlen < minlen)
7446 if (RExC_seen & REG_GPOS_SEEN)
7447 r->intflags |= PREGf_GPOS_SEEN;
7448 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7449 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7451 if (pRExC_state->num_code_blocks)
7452 r->extflags |= RXf_EVAL_SEEN;
7453 if (RExC_seen & REG_VERBARG_SEEN)
7455 r->intflags |= PREGf_VERBARG_SEEN;
7456 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7458 if (RExC_seen & REG_CUTGROUP_SEEN)
7459 r->intflags |= PREGf_CUTGROUP_SEEN;
7460 if (pm_flags & PMf_USE_RE_EVAL)
7461 r->intflags |= PREGf_USE_RE_EVAL;
7462 if (RExC_paren_names)
7463 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7465 RXp_PAREN_NAMES(r) = NULL;
7467 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7468 * so it can be used in pp.c */
7469 if (r->intflags & PREGf_ANCH)
7470 r->extflags |= RXf_IS_ANCHORED;
7474 /* this is used to identify "special" patterns that might result
7475 * in Perl NOT calling the regex engine and instead doing the match "itself",
7476 * particularly special cases in split//. By having the regex compiler
7477 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7478 * we avoid weird issues with equivalent patterns resulting in different behavior,
7479 * AND we allow non Perl engines to get the same optimizations by the setting the
7480 * flags appropriately - Yves */
7481 regnode *first = ri->program + 1;
7483 regnode *next = regnext(first);
7486 if (PL_regkind[fop] == NOTHING && nop == END)
7487 r->extflags |= RXf_NULL;
7488 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7489 /* when fop is SBOL first->flags will be true only when it was
7490 * produced by parsing /\A/, and not when parsing /^/. This is
7491 * very important for the split code as there we want to
7492 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7493 * See rt #122761 for more details. -- Yves */
7494 r->extflags |= RXf_START_ONLY;
7495 else if (fop == PLUS
7496 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7498 r->extflags |= RXf_WHITE;
7499 else if ( r->extflags & RXf_SPLIT
7500 && (fop == EXACT || fop == EXACTL)
7501 && STR_LEN(first) == 1
7502 && *(STRING(first)) == ' '
7504 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7508 if (RExC_contains_locale) {
7509 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7513 if (RExC_paren_names) {
7514 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7515 ri->data->data[ri->name_list_idx]
7516 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7519 ri->name_list_idx = 0;
7521 if (RExC_recurse_count) {
7522 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7523 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7524 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7527 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7528 /* assume we don't need to swap parens around before we match */
7530 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7531 (unsigned long)RExC_study_chunk_recursed_count);
7535 PerlIO_printf(Perl_debug_log,"Final program:\n");
7538 #ifdef RE_TRACK_PATTERN_OFFSETS
7539 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7540 const STRLEN len = ri->u.offsets[0];
7542 GET_RE_DEBUG_FLAGS_DECL;
7543 PerlIO_printf(Perl_debug_log,
7544 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7545 for (i = 1; i <= len; i++) {
7546 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7547 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7548 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7550 PerlIO_printf(Perl_debug_log, "\n");
7555 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7556 * by setting the regexp SV to readonly-only instead. If the
7557 * pattern's been recompiled, the USEDness should remain. */
7558 if (old_re && SvREADONLY(old_re))
7566 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7569 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7571 PERL_UNUSED_ARG(value);
7573 if (flags & RXapif_FETCH) {
7574 return reg_named_buff_fetch(rx, key, flags);
7575 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7576 Perl_croak_no_modify();
7578 } else if (flags & RXapif_EXISTS) {
7579 return reg_named_buff_exists(rx, key, flags)
7582 } else if (flags & RXapif_REGNAMES) {
7583 return reg_named_buff_all(rx, flags);
7584 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7585 return reg_named_buff_scalar(rx, flags);
7587 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7593 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7596 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7597 PERL_UNUSED_ARG(lastkey);
7599 if (flags & RXapif_FIRSTKEY)
7600 return reg_named_buff_firstkey(rx, flags);
7601 else if (flags & RXapif_NEXTKEY)
7602 return reg_named_buff_nextkey(rx, flags);
7604 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7611 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7614 AV *retarray = NULL;
7616 struct regexp *const rx = ReANY(r);
7618 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7620 if (flags & RXapif_ALL)
7623 if (rx && RXp_PAREN_NAMES(rx)) {
7624 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7627 SV* sv_dat=HeVAL(he_str);
7628 I32 *nums=(I32*)SvPVX(sv_dat);
7629 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7630 if ((I32)(rx->nparens) >= nums[i]
7631 && rx->offs[nums[i]].start != -1
7632 && rx->offs[nums[i]].end != -1)
7635 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7640 ret = newSVsv(&PL_sv_undef);
7643 av_push(retarray, ret);
7646 return newRV_noinc(MUTABLE_SV(retarray));
7653 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7656 struct regexp *const rx = ReANY(r);
7658 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7660 if (rx && RXp_PAREN_NAMES(rx)) {
7661 if (flags & RXapif_ALL) {
7662 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7664 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7666 SvREFCNT_dec_NN(sv);
7678 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7680 struct regexp *const rx = ReANY(r);
7682 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7684 if ( rx && RXp_PAREN_NAMES(rx) ) {
7685 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7687 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7694 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7696 struct regexp *const rx = ReANY(r);
7697 GET_RE_DEBUG_FLAGS_DECL;
7699 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7701 if (rx && RXp_PAREN_NAMES(rx)) {
7702 HV *hv = RXp_PAREN_NAMES(rx);
7704 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7707 SV* sv_dat = HeVAL(temphe);
7708 I32 *nums = (I32*)SvPVX(sv_dat);
7709 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7710 if ((I32)(rx->lastparen) >= nums[i] &&
7711 rx->offs[nums[i]].start != -1 &&
7712 rx->offs[nums[i]].end != -1)
7718 if (parno || flags & RXapif_ALL) {
7719 return newSVhek(HeKEY_hek(temphe));
7727 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7732 struct regexp *const rx = ReANY(r);
7734 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7736 if (rx && RXp_PAREN_NAMES(rx)) {
7737 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7738 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7739 } else if (flags & RXapif_ONE) {
7740 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7741 av = MUTABLE_AV(SvRV(ret));
7742 length = av_tindex(av);
7743 SvREFCNT_dec_NN(ret);
7744 return newSViv(length + 1);
7746 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7751 return &PL_sv_undef;
7755 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7757 struct regexp *const rx = ReANY(r);
7760 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7762 if (rx && RXp_PAREN_NAMES(rx)) {
7763 HV *hv= RXp_PAREN_NAMES(rx);
7765 (void)hv_iterinit(hv);
7766 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7769 SV* sv_dat = HeVAL(temphe);
7770 I32 *nums = (I32*)SvPVX(sv_dat);
7771 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7772 if ((I32)(rx->lastparen) >= nums[i] &&
7773 rx->offs[nums[i]].start != -1 &&
7774 rx->offs[nums[i]].end != -1)
7780 if (parno || flags & RXapif_ALL) {
7781 av_push(av, newSVhek(HeKEY_hek(temphe)));
7786 return newRV_noinc(MUTABLE_SV(av));
7790 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7793 struct regexp *const rx = ReANY(r);
7799 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7801 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7802 || n == RX_BUFF_IDX_CARET_FULLMATCH
7803 || n == RX_BUFF_IDX_CARET_POSTMATCH
7806 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7808 /* on something like
7811 * the KEEPCOPY is set on the PMOP rather than the regex */
7812 if (PL_curpm && r == PM_GETRE(PL_curpm))
7813 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7822 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7823 /* no need to distinguish between them any more */
7824 n = RX_BUFF_IDX_FULLMATCH;
7826 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7827 && rx->offs[0].start != -1)
7829 /* $`, ${^PREMATCH} */
7830 i = rx->offs[0].start;
7834 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7835 && rx->offs[0].end != -1)
7837 /* $', ${^POSTMATCH} */
7838 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7839 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7842 if ( 0 <= n && n <= (I32)rx->nparens &&
7843 (s1 = rx->offs[n].start) != -1 &&
7844 (t1 = rx->offs[n].end) != -1)
7846 /* $&, ${^MATCH}, $1 ... */
7848 s = rx->subbeg + s1 - rx->suboffset;
7853 assert(s >= rx->subbeg);
7854 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7856 #ifdef NO_TAINT_SUPPORT
7857 sv_setpvn(sv, s, i);
7859 const int oldtainted = TAINT_get;
7861 sv_setpvn(sv, s, i);
7862 TAINT_set(oldtainted);
7864 if (RXp_MATCH_UTF8(rx))
7869 if (RXp_MATCH_TAINTED(rx)) {
7870 if (SvTYPE(sv) >= SVt_PVMG) {
7871 MAGIC* const mg = SvMAGIC(sv);
7874 SvMAGIC_set(sv, mg->mg_moremagic);
7876 if ((mgt = SvMAGIC(sv))) {
7877 mg->mg_moremagic = mgt;
7878 SvMAGIC_set(sv, mg);
7889 sv_setsv(sv,&PL_sv_undef);
7895 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7896 SV const * const value)
7898 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7900 PERL_UNUSED_ARG(rx);
7901 PERL_UNUSED_ARG(paren);
7902 PERL_UNUSED_ARG(value);
7905 Perl_croak_no_modify();
7909 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7912 struct regexp *const rx = ReANY(r);
7916 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7918 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7919 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7920 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7923 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7925 /* on something like
7928 * the KEEPCOPY is set on the PMOP rather than the regex */
7929 if (PL_curpm && r == PM_GETRE(PL_curpm))
7930 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7936 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7938 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7939 case RX_BUFF_IDX_PREMATCH: /* $` */
7940 if (rx->offs[0].start != -1) {
7941 i = rx->offs[0].start;
7950 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7951 case RX_BUFF_IDX_POSTMATCH: /* $' */
7952 if (rx->offs[0].end != -1) {
7953 i = rx->sublen - rx->offs[0].end;
7955 s1 = rx->offs[0].end;
7962 default: /* $& / ${^MATCH}, $1, $2, ... */
7963 if (paren <= (I32)rx->nparens &&
7964 (s1 = rx->offs[paren].start) != -1 &&
7965 (t1 = rx->offs[paren].end) != -1)
7971 if (ckWARN(WARN_UNINITIALIZED))
7972 report_uninit((const SV *)sv);
7977 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7978 const char * const s = rx->subbeg - rx->suboffset + s1;
7983 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7990 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7992 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7993 PERL_UNUSED_ARG(rx);
7997 return newSVpvs("Regexp");
8000 /* Scans the name of a named buffer from the pattern.
8001 * If flags is REG_RSN_RETURN_NULL returns null.
8002 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8003 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8004 * to the parsed name as looked up in the RExC_paren_names hash.
8005 * If there is an error throws a vFAIL().. type exception.
8008 #define REG_RSN_RETURN_NULL 0
8009 #define REG_RSN_RETURN_NAME 1
8010 #define REG_RSN_RETURN_DATA 2
8013 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8015 char *name_start = RExC_parse;
8017 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8019 assert (RExC_parse <= RExC_end);
8020 if (RExC_parse == RExC_end) NOOP;
8021 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
8022 /* skip IDFIRST by using do...while */
8025 RExC_parse += UTF8SKIP(RExC_parse);
8026 } while (isWORDCHAR_utf8((U8*)RExC_parse));
8030 } while (isWORDCHAR(*RExC_parse));
8032 RExC_parse++; /* so the <- from the vFAIL is after the offending
8034 vFAIL("Group name must start with a non-digit word character");
8038 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8039 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8040 if ( flags == REG_RSN_RETURN_NAME)
8042 else if (flags==REG_RSN_RETURN_DATA) {
8045 if ( ! sv_name ) /* should not happen*/
8046 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8047 if (RExC_paren_names)
8048 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8050 sv_dat = HeVAL(he_str);
8052 vFAIL("Reference to nonexistent named group");
8056 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8057 (unsigned long) flags);
8059 NOT_REACHED; /* NOTREACHED */
8064 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8066 if (RExC_lastparse!=RExC_parse) { \
8067 PerlIO_printf(Perl_debug_log, "%s", \
8068 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8069 RExC_end - RExC_parse, 16, \
8071 PERL_PV_ESCAPE_UNI_DETECT | \
8072 PERL_PV_PRETTY_ELLIPSES | \
8073 PERL_PV_PRETTY_LTGT | \
8074 PERL_PV_ESCAPE_RE | \
8075 PERL_PV_PRETTY_EXACTSIZE \
8079 PerlIO_printf(Perl_debug_log,"%16s",""); \
8082 num = RExC_size + 1; \
8084 num=REG_NODE_NUM(RExC_emit); \
8085 if (RExC_lastnum!=num) \
8086 PerlIO_printf(Perl_debug_log,"|%4d",num); \
8088 PerlIO_printf(Perl_debug_log,"|%4s",""); \
8089 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
8090 (int)((depth*2)), "", \
8094 RExC_lastparse=RExC_parse; \
8099 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8100 DEBUG_PARSE_MSG((funcname)); \
8101 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
8103 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
8104 DEBUG_PARSE_MSG((funcname)); \
8105 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
8108 /* This section of code defines the inversion list object and its methods. The
8109 * interfaces are highly subject to change, so as much as possible is static to
8110 * this file. An inversion list is here implemented as a malloc'd C UV array
8111 * as an SVt_INVLIST scalar.
8113 * An inversion list for Unicode is an array of code points, sorted by ordinal
8114 * number. The zeroth element is the first code point in the list. The 1th
8115 * element is the first element beyond that not in the list. In other words,
8116 * the first range is
8117 * invlist[0]..(invlist[1]-1)
8118 * The other ranges follow. Thus every element whose index is divisible by two
8119 * marks the beginning of a range that is in the list, and every element not
8120 * divisible by two marks the beginning of a range not in the list. A single
8121 * element inversion list that contains the single code point N generally
8122 * consists of two elements
8125 * (The exception is when N is the highest representable value on the
8126 * machine, in which case the list containing just it would be a single
8127 * element, itself. By extension, if the last range in the list extends to
8128 * infinity, then the first element of that range will be in the inversion list
8129 * at a position that is divisible by two, and is the final element in the
8131 * Taking the complement (inverting) an inversion list is quite simple, if the
8132 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8133 * This implementation reserves an element at the beginning of each inversion
8134 * list to always contain 0; there is an additional flag in the header which
8135 * indicates if the list begins at the 0, or is offset to begin at the next
8138 * More about inversion lists can be found in "Unicode Demystified"
8139 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8140 * More will be coming when functionality is added later.
8142 * The inversion list data structure is currently implemented as an SV pointing
8143 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8144 * array of UV whose memory management is automatically handled by the existing
8145 * facilities for SV's.
8147 * Some of the methods should always be private to the implementation, and some
8148 * should eventually be made public */
8150 /* The header definitions are in F<invlist_inline.h> */
8152 PERL_STATIC_INLINE UV*
8153 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8155 /* Returns a pointer to the first element in the inversion list's array.
8156 * This is called upon initialization of an inversion list. Where the
8157 * array begins depends on whether the list has the code point U+0000 in it
8158 * or not. The other parameter tells it whether the code that follows this
8159 * call is about to put a 0 in the inversion list or not. The first
8160 * element is either the element reserved for 0, if TRUE, or the element
8161 * after it, if FALSE */
8163 bool* offset = get_invlist_offset_addr(invlist);
8164 UV* zero_addr = (UV *) SvPVX(invlist);
8166 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8169 assert(! _invlist_len(invlist));
8173 /* 1^1 = 0; 1^0 = 1 */
8174 *offset = 1 ^ will_have_0;
8175 return zero_addr + *offset;
8178 PERL_STATIC_INLINE void
8179 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8181 /* Sets the current number of elements stored in the inversion list.
8182 * Updates SvCUR correspondingly */
8183 PERL_UNUSED_CONTEXT;
8184 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8186 assert(SvTYPE(invlist) == SVt_INVLIST);
8191 : TO_INTERNAL_SIZE(len + offset));
8192 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8195 #ifndef PERL_IN_XSUB_RE
8197 PERL_STATIC_INLINE IV*
8198 S_get_invlist_previous_index_addr(SV* invlist)
8200 /* Return the address of the IV that is reserved to hold the cached index
8202 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8204 assert(SvTYPE(invlist) == SVt_INVLIST);
8206 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8209 PERL_STATIC_INLINE IV
8210 S_invlist_previous_index(SV* const invlist)
8212 /* Returns cached index of previous search */
8214 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8216 return *get_invlist_previous_index_addr(invlist);
8219 PERL_STATIC_INLINE void
8220 S_invlist_set_previous_index(SV* const invlist, const IV index)
8222 /* Caches <index> for later retrieval */
8224 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8226 assert(index == 0 || index < (int) _invlist_len(invlist));
8228 *get_invlist_previous_index_addr(invlist) = index;
8231 PERL_STATIC_INLINE void
8232 S_invlist_trim(SV* const invlist)
8234 PERL_ARGS_ASSERT_INVLIST_TRIM;
8236 assert(SvTYPE(invlist) == SVt_INVLIST);
8238 /* Change the length of the inversion list to how many entries it currently
8240 SvPV_shrink_to_cur((SV *) invlist);
8243 PERL_STATIC_INLINE bool
8244 S_invlist_is_iterating(SV* const invlist)
8246 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8248 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8251 #endif /* ifndef PERL_IN_XSUB_RE */
8253 PERL_STATIC_INLINE UV
8254 S_invlist_max(SV* const invlist)
8256 /* Returns the maximum number of elements storable in the inversion list's
8257 * array, without having to realloc() */
8259 PERL_ARGS_ASSERT_INVLIST_MAX;
8261 assert(SvTYPE(invlist) == SVt_INVLIST);
8263 /* Assumes worst case, in which the 0 element is not counted in the
8264 * inversion list, so subtracts 1 for that */
8265 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8266 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8267 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8270 #ifndef PERL_IN_XSUB_RE
8272 Perl__new_invlist(pTHX_ IV initial_size)
8275 /* Return a pointer to a newly constructed inversion list, with enough
8276 * space to store 'initial_size' elements. If that number is negative, a
8277 * system default is used instead */
8281 if (initial_size < 0) {
8285 /* Allocate the initial space */
8286 new_list = newSV_type(SVt_INVLIST);
8288 /* First 1 is in case the zero element isn't in the list; second 1 is for
8290 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8291 invlist_set_len(new_list, 0, 0);
8293 /* Force iterinit() to be used to get iteration to work */
8294 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8296 *get_invlist_previous_index_addr(new_list) = 0;
8302 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8304 /* Return a pointer to a newly constructed inversion list, initialized to
8305 * point to <list>, which has to be in the exact correct inversion list
8306 * form, including internal fields. Thus this is a dangerous routine that
8307 * should not be used in the wrong hands. The passed in 'list' contains
8308 * several header fields at the beginning that are not part of the
8309 * inversion list body proper */
8311 const STRLEN length = (STRLEN) list[0];
8312 const UV version_id = list[1];
8313 const bool offset = cBOOL(list[2]);
8314 #define HEADER_LENGTH 3
8315 /* If any of the above changes in any way, you must change HEADER_LENGTH
8316 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8317 * perl -E 'say int(rand 2**31-1)'
8319 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8320 data structure type, so that one being
8321 passed in can be validated to be an
8322 inversion list of the correct vintage.
8325 SV* invlist = newSV_type(SVt_INVLIST);
8327 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8329 if (version_id != INVLIST_VERSION_ID) {
8330 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8333 /* The generated array passed in includes header elements that aren't part
8334 * of the list proper, so start it just after them */
8335 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8337 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8338 shouldn't touch it */
8340 *(get_invlist_offset_addr(invlist)) = offset;
8342 /* The 'length' passed to us is the physical number of elements in the
8343 * inversion list. But if there is an offset the logical number is one
8345 invlist_set_len(invlist, length - offset, offset);
8347 invlist_set_previous_index(invlist, 0);
8349 /* Initialize the iteration pointer. */
8350 invlist_iterfinish(invlist);
8352 SvREADONLY_on(invlist);
8356 #endif /* ifndef PERL_IN_XSUB_RE */
8359 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8361 /* Grow the maximum size of an inversion list */
8363 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8365 assert(SvTYPE(invlist) == SVt_INVLIST);
8367 /* Add one to account for the zero element at the beginning which may not
8368 * be counted by the calling parameters */
8369 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8373 S__append_range_to_invlist(pTHX_ SV* const invlist,
8374 const UV start, const UV end)
8376 /* Subject to change or removal. Append the range from 'start' to 'end' at
8377 * the end of the inversion list. The range must be above any existing
8381 UV max = invlist_max(invlist);
8382 UV len = _invlist_len(invlist);
8385 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8387 if (len == 0) { /* Empty lists must be initialized */
8388 offset = start != 0;
8389 array = _invlist_array_init(invlist, ! offset);
8392 /* Here, the existing list is non-empty. The current max entry in the
8393 * list is generally the first value not in the set, except when the
8394 * set extends to the end of permissible values, in which case it is
8395 * the first entry in that final set, and so this call is an attempt to
8396 * append out-of-order */
8398 UV final_element = len - 1;
8399 array = invlist_array(invlist);
8400 if (array[final_element] > start
8401 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8403 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8404 array[final_element], start,
8405 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8408 /* Here, it is a legal append. If the new range begins with the first
8409 * value not in the set, it is extending the set, so the new first
8410 * value not in the set is one greater than the newly extended range.
8412 offset = *get_invlist_offset_addr(invlist);
8413 if (array[final_element] == start) {
8414 if (end != UV_MAX) {
8415 array[final_element] = end + 1;
8418 /* But if the end is the maximum representable on the machine,
8419 * just let the range that this would extend to have no end */
8420 invlist_set_len(invlist, len - 1, offset);
8426 /* Here the new range doesn't extend any existing set. Add it */
8428 len += 2; /* Includes an element each for the start and end of range */
8430 /* If wll overflow the existing space, extend, which may cause the array to
8433 invlist_extend(invlist, len);
8435 /* Have to set len here to avoid assert failure in invlist_array() */
8436 invlist_set_len(invlist, len, offset);
8438 array = invlist_array(invlist);
8441 invlist_set_len(invlist, len, offset);
8444 /* The next item on the list starts the range, the one after that is
8445 * one past the new range. */
8446 array[len - 2] = start;
8447 if (end != UV_MAX) {
8448 array[len - 1] = end + 1;
8451 /* But if the end is the maximum representable on the machine, just let
8452 * the range have no end */
8453 invlist_set_len(invlist, len - 1, offset);
8457 #ifndef PERL_IN_XSUB_RE
8460 Perl__invlist_search(SV* const invlist, const UV cp)
8462 /* Searches the inversion list for the entry that contains the input code
8463 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8464 * return value is the index into the list's array of the range that
8469 IV high = _invlist_len(invlist);
8470 const IV highest_element = high - 1;
8473 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8475 /* If list is empty, return failure. */
8480 /* (We can't get the array unless we know the list is non-empty) */
8481 array = invlist_array(invlist);
8483 mid = invlist_previous_index(invlist);
8484 assert(mid >=0 && mid <= highest_element);
8486 /* <mid> contains the cache of the result of the previous call to this
8487 * function (0 the first time). See if this call is for the same result,
8488 * or if it is for mid-1. This is under the theory that calls to this
8489 * function will often be for related code points that are near each other.
8490 * And benchmarks show that caching gives better results. We also test
8491 * here if the code point is within the bounds of the list. These tests
8492 * replace others that would have had to be made anyway to make sure that
8493 * the array bounds were not exceeded, and these give us extra information
8494 * at the same time */
8495 if (cp >= array[mid]) {
8496 if (cp >= array[highest_element]) {
8497 return highest_element;
8500 /* Here, array[mid] <= cp < array[highest_element]. This means that
8501 * the final element is not the answer, so can exclude it; it also
8502 * means that <mid> is not the final element, so can refer to 'mid + 1'
8504 if (cp < array[mid + 1]) {
8510 else { /* cp < aray[mid] */
8511 if (cp < array[0]) { /* Fail if outside the array */
8515 if (cp >= array[mid - 1]) {
8520 /* Binary search. What we are looking for is <i> such that
8521 * array[i] <= cp < array[i+1]
8522 * The loop below converges on the i+1. Note that there may not be an
8523 * (i+1)th element in the array, and things work nonetheless */
8524 while (low < high) {
8525 mid = (low + high) / 2;
8526 assert(mid <= highest_element);
8527 if (array[mid] <= cp) { /* cp >= array[mid] */
8530 /* We could do this extra test to exit the loop early.
8531 if (cp < array[low]) {
8536 else { /* cp < array[mid] */
8543 invlist_set_previous_index(invlist, high);
8548 Perl__invlist_populate_swatch(SV* const invlist,
8549 const UV start, const UV end, U8* swatch)
8551 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8552 * but is used when the swash has an inversion list. This makes this much
8553 * faster, as it uses a binary search instead of a linear one. This is
8554 * intimately tied to that function, and perhaps should be in utf8.c,
8555 * except it is intimately tied to inversion lists as well. It assumes
8556 * that <swatch> is all 0's on input */
8559 const IV len = _invlist_len(invlist);
8563 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8565 if (len == 0) { /* Empty inversion list */
8569 array = invlist_array(invlist);
8571 /* Find which element it is */
8572 i = _invlist_search(invlist, start);
8574 /* We populate from <start> to <end> */
8575 while (current < end) {
8578 /* The inversion list gives the results for every possible code point
8579 * after the first one in the list. Only those ranges whose index is
8580 * even are ones that the inversion list matches. For the odd ones,
8581 * and if the initial code point is not in the list, we have to skip
8582 * forward to the next element */
8583 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8585 if (i >= len) { /* Finished if beyond the end of the array */
8589 if (current >= end) { /* Finished if beyond the end of what we
8591 if (LIKELY(end < UV_MAX)) {
8595 /* We get here when the upper bound is the maximum
8596 * representable on the machine, and we are looking for just
8597 * that code point. Have to special case it */
8599 goto join_end_of_list;
8602 assert(current >= start);
8604 /* The current range ends one below the next one, except don't go past
8607 upper = (i < len && array[i] < end) ? array[i] : end;
8609 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8610 * for each code point in it */
8611 for (; current < upper; current++) {
8612 const STRLEN offset = (STRLEN)(current - start);
8613 swatch[offset >> 3] |= 1 << (offset & 7);
8618 /* Quit if at the end of the list */
8621 /* But first, have to deal with the highest possible code point on
8622 * the platform. The previous code assumes that <end> is one
8623 * beyond where we want to populate, but that is impossible at the
8624 * platform's infinity, so have to handle it specially */
8625 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8627 const STRLEN offset = (STRLEN)(end - start);
8628 swatch[offset >> 3] |= 1 << (offset & 7);
8633 /* Advance to the next range, which will be for code points not in the
8642 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8643 const bool complement_b, SV** output)
8645 /* Take the union of two inversion lists and point <output> to it. *output
8646 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8647 * the reference count to that list will be decremented if not already a
8648 * temporary (mortal); otherwise *output will be made correspondingly
8649 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8650 * second list is returned. If <complement_b> is TRUE, the union is taken
8651 * of the complement (inversion) of <b> instead of b itself.
8653 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8654 * Richard Gillam, published by Addison-Wesley, and explained at some
8655 * length there. The preface says to incorporate its examples into your
8656 * code at your own risk.
8658 * The algorithm is like a merge sort.
8660 * XXX A potential performance improvement is to keep track as we go along
8661 * if only one of the inputs contributes to the result, meaning the other
8662 * is a subset of that one. In that case, we can skip the final copy and
8663 * return the larger of the input lists, but then outside code might need
8664 * to keep track of whether to free the input list or not */
8666 const UV* array_a; /* a's array */
8668 UV len_a; /* length of a's array */
8671 SV* u; /* the resulting union */
8675 UV i_a = 0; /* current index into a's array */
8679 /* running count, as explained in the algorithm source book; items are
8680 * stopped accumulating and are output when the count changes to/from 0.
8681 * The count is incremented when we start a range that's in the set, and
8682 * decremented when we start a range that's not in the set. So its range
8683 * is 0 to 2. Only when the count is zero is something not in the set.
8687 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8690 /* If either one is empty, the union is the other one */
8691 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8692 bool make_temp = FALSE; /* Should we mortalize the result? */
8696 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8702 *output = invlist_clone(b);
8704 _invlist_invert(*output);
8706 } /* else *output already = b; */
8709 sv_2mortal(*output);
8713 else if ((len_b = _invlist_len(b)) == 0) {
8714 bool make_temp = FALSE;
8716 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8721 /* The complement of an empty list is a list that has everything in it,
8722 * so the union with <a> includes everything too */
8725 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8729 *output = _new_invlist(1);
8730 _append_range_to_invlist(*output, 0, UV_MAX);
8732 else if (*output != a) {
8733 *output = invlist_clone(a);
8735 /* else *output already = a; */
8738 sv_2mortal(*output);
8743 /* Here both lists exist and are non-empty */
8744 array_a = invlist_array(a);
8745 array_b = invlist_array(b);
8747 /* If are to take the union of 'a' with the complement of b, set it
8748 * up so are looking at b's complement. */
8751 /* To complement, we invert: if the first element is 0, remove it. To
8752 * do this, we just pretend the array starts one later */
8753 if (array_b[0] == 0) {
8759 /* But if the first element is not zero, we pretend the list starts
8760 * at the 0 that is always stored immediately before the array. */
8766 /* Size the union for the worst case: that the sets are completely
8768 u = _new_invlist(len_a + len_b);
8770 /* Will contain U+0000 if either component does */
8771 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8772 || (len_b > 0 && array_b[0] == 0));
8774 /* Go through each list item by item, stopping when exhausted one of
8776 while (i_a < len_a && i_b < len_b) {
8777 UV cp; /* The element to potentially add to the union's array */
8778 bool cp_in_set; /* is it in the the input list's set or not */
8780 /* We need to take one or the other of the two inputs for the union.
8781 * Since we are merging two sorted lists, we take the smaller of the
8782 * next items. In case of a tie, we take the one that is in its set
8783 * first. If we took one not in the set first, it would decrement the
8784 * count, possibly to 0 which would cause it to be output as ending the
8785 * range, and the next time through we would take the same number, and
8786 * output it again as beginning the next range. By doing it the
8787 * opposite way, there is no possibility that the count will be
8788 * momentarily decremented to 0, and thus the two adjoining ranges will
8789 * be seamlessly merged. (In a tie and both are in the set or both not
8790 * in the set, it doesn't matter which we take first.) */
8791 if (array_a[i_a] < array_b[i_b]
8792 || (array_a[i_a] == array_b[i_b]
8793 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8795 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8799 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8800 cp = array_b[i_b++];
8803 /* Here, have chosen which of the two inputs to look at. Only output
8804 * if the running count changes to/from 0, which marks the
8805 * beginning/end of a range in that's in the set */
8808 array_u[i_u++] = cp;
8815 array_u[i_u++] = cp;
8820 /* Here, we are finished going through at least one of the lists, which
8821 * means there is something remaining in at most one. We check if the list
8822 * that hasn't been exhausted is positioned such that we are in the middle
8823 * of a range in its set or not. (i_a and i_b point to the element beyond
8824 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8825 * is potentially more to output.
8826 * There are four cases:
8827 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8828 * in the union is entirely from the non-exhausted set.
8829 * 2) Both were in their sets, count is 2. Nothing further should
8830 * be output, as everything that remains will be in the exhausted
8831 * list's set, hence in the union; decrementing to 1 but not 0 insures
8833 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8834 * Nothing further should be output because the union includes
8835 * everything from the exhausted set. Not decrementing ensures that.
8836 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8837 * decrementing to 0 insures that we look at the remainder of the
8838 * non-exhausted set */
8839 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8840 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8845 /* The final length is what we've output so far, plus what else is about to
8846 * be output. (If 'count' is non-zero, then the input list we exhausted
8847 * has everything remaining up to the machine's limit in its set, and hence
8848 * in the union, so there will be no further output. */
8851 /* At most one of the subexpressions will be non-zero */
8852 len_u += (len_a - i_a) + (len_b - i_b);
8855 /* Set result to final length, which can change the pointer to array_u, so
8857 if (len_u != _invlist_len(u)) {
8858 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8860 array_u = invlist_array(u);
8863 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8864 * the other) ended with everything above it not in its set. That means
8865 * that the remaining part of the union is precisely the same as the
8866 * non-exhausted list, so can just copy it unchanged. (If both list were
8867 * exhausted at the same time, then the operations below will be both 0.)
8870 IV copy_count; /* At most one will have a non-zero copy count */
8871 if ((copy_count = len_a - i_a) > 0) {
8872 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8874 else if ((copy_count = len_b - i_b) > 0) {
8875 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8879 /* We may be removing a reference to one of the inputs. If so, the output
8880 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8881 * count decremented) */
8882 if (a == *output || b == *output) {
8883 assert(! invlist_is_iterating(*output));
8884 if ((SvTEMP(*output))) {
8888 SvREFCNT_dec_NN(*output);
8898 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8899 const bool complement_b, SV** i)
8901 /* Take the intersection of two inversion lists and point <i> to it. *i
8902 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8903 * the reference count to that list will be decremented if not already a
8904 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8905 * The first list, <a>, may be NULL, in which case an empty list is
8906 * returned. If <complement_b> is TRUE, the result will be the
8907 * intersection of <a> and the complement (or inversion) of <b> instead of
8910 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8911 * Richard Gillam, published by Addison-Wesley, and explained at some
8912 * length there. The preface says to incorporate its examples into your
8913 * code at your own risk. In fact, it had bugs
8915 * The algorithm is like a merge sort, and is essentially the same as the
8919 const UV* array_a; /* a's array */
8921 UV len_a; /* length of a's array */
8924 SV* r; /* the resulting intersection */
8928 UV i_a = 0; /* current index into a's array */
8932 /* running count, as explained in the algorithm source book; items are
8933 * stopped accumulating and are output when the count changes to/from 2.
8934 * The count is incremented when we start a range that's in the set, and
8935 * decremented when we start a range that's not in the set. So its range
8936 * is 0 to 2. Only when the count is 2 is something in the intersection.
8940 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8943 /* Special case if either one is empty */
8944 len_a = (a == NULL) ? 0 : _invlist_len(a);
8945 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8946 bool make_temp = FALSE;
8948 if (len_a != 0 && complement_b) {
8950 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8951 * be empty. Here, also we are using 'b's complement, which hence
8952 * must be every possible code point. Thus the intersection is
8956 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8961 *i = invlist_clone(a);
8963 /* else *i is already 'a' */
8971 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8972 * intersection must be empty */
8974 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8979 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8983 *i = _new_invlist(0);
8991 /* Here both lists exist and are non-empty */
8992 array_a = invlist_array(a);
8993 array_b = invlist_array(b);
8995 /* If are to take the intersection of 'a' with the complement of b, set it
8996 * up so are looking at b's complement. */
8999 /* To complement, we invert: if the first element is 0, remove it. To
9000 * do this, we just pretend the array starts one later */
9001 if (array_b[0] == 0) {
9007 /* But if the first element is not zero, we pretend the list starts
9008 * at the 0 that is always stored immediately before the array. */
9014 /* Size the intersection for the worst case: that the intersection ends up
9015 * fragmenting everything to be completely disjoint */
9016 r= _new_invlist(len_a + len_b);
9018 /* Will contain U+0000 iff both components do */
9019 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9020 && len_b > 0 && array_b[0] == 0);
9022 /* Go through each list item by item, stopping when exhausted one of
9024 while (i_a < len_a && i_b < len_b) {
9025 UV cp; /* The element to potentially add to the intersection's
9027 bool cp_in_set; /* Is it in the input list's set or not */
9029 /* We need to take one or the other of the two inputs for the
9030 * intersection. Since we are merging two sorted lists, we take the
9031 * smaller of the next items. In case of a tie, we take the one that
9032 * is not in its set first (a difference from the union algorithm). If
9033 * we took one in the set first, it would increment the count, possibly
9034 * to 2 which would cause it to be output as starting a range in the
9035 * intersection, and the next time through we would take that same
9036 * number, and output it again as ending the set. By doing it the
9037 * opposite of this, there is no possibility that the count will be
9038 * momentarily incremented to 2. (In a tie and both are in the set or
9039 * both not in the set, it doesn't matter which we take first.) */
9040 if (array_a[i_a] < array_b[i_b]
9041 || (array_a[i_a] == array_b[i_b]
9042 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9044 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9048 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9052 /* Here, have chosen which of the two inputs to look at. Only output
9053 * if the running count changes to/from 2, which marks the
9054 * beginning/end of a range that's in the intersection */
9058 array_r[i_r++] = cp;
9063 array_r[i_r++] = cp;
9069 /* Here, we are finished going through at least one of the lists, which
9070 * means there is something remaining in at most one. We check if the list
9071 * that has been exhausted is positioned such that we are in the middle
9072 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
9073 * the ones we care about.) There are four cases:
9074 * 1) Both weren't in their sets, count is 0, and remains 0. There's
9075 * nothing left in the intersection.
9076 * 2) Both were in their sets, count is 2 and perhaps is incremented to
9077 * above 2. What should be output is exactly that which is in the
9078 * non-exhausted set, as everything it has is also in the intersection
9079 * set, and everything it doesn't have can't be in the intersection
9080 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
9081 * gets incremented to 2. Like the previous case, the intersection is
9082 * everything that remains in the non-exhausted set.
9083 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
9084 * remains 1. And the intersection has nothing more. */
9085 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9086 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9091 /* The final length is what we've output so far plus what else is in the
9092 * intersection. At most one of the subexpressions below will be non-zero
9096 len_r += (len_a - i_a) + (len_b - i_b);
9099 /* Set result to final length, which can change the pointer to array_r, so
9101 if (len_r != _invlist_len(r)) {
9102 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9104 array_r = invlist_array(r);
9107 /* Finish outputting any remaining */
9108 if (count >= 2) { /* At most one will have a non-zero copy count */
9110 if ((copy_count = len_a - i_a) > 0) {
9111 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9113 else if ((copy_count = len_b - i_b) > 0) {
9114 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9118 /* We may be removing a reference to one of the inputs. If so, the output
9119 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
9120 * count decremented) */
9121 if (a == *i || b == *i) {
9122 assert(! invlist_is_iterating(*i));
9127 SvREFCNT_dec_NN(*i);
9137 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
9139 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9140 * set. A pointer to the inversion list is returned. This may actually be
9141 * a new list, in which case the passed in one has been destroyed. The
9142 * passed-in inversion list can be NULL, in which case a new one is created
9143 * with just the one range in it */
9148 if (invlist == NULL) {
9149 invlist = _new_invlist(2);
9153 len = _invlist_len(invlist);
9156 /* If comes after the final entry actually in the list, can just append it
9159 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
9160 && start >= invlist_array(invlist)[len - 1]))
9162 _append_range_to_invlist(invlist, start, end);
9166 /* Here, can't just append things, create and return a new inversion list
9167 * which is the union of this range and the existing inversion list. (If
9168 * the new range is well-behaved wrt to the old one, we could just insert
9169 * it, doing a Move() down on the tail of the old one (potentially growing
9170 * it first). But to determine that means we would have the extra
9171 * (possibly throw-away) work of first finding where the new one goes and
9172 * whether it disrupts (splits) an existing range, so it doesn't appear to
9173 * me (khw) that it's worth it) */
9174 range_invlist = _new_invlist(2);
9175 _append_range_to_invlist(range_invlist, start, end);
9177 _invlist_union(invlist, range_invlist, &invlist);
9179 /* The temporary can be freed */
9180 SvREFCNT_dec_NN(range_invlist);
9186 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9187 UV** other_elements_ptr)
9189 /* Create and return an inversion list whose contents are to be populated
9190 * by the caller. The caller gives the number of elements (in 'size') and
9191 * the very first element ('element0'). This function will set
9192 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9195 * Obviously there is some trust involved that the caller will properly
9196 * fill in the other elements of the array.
9198 * (The first element needs to be passed in, as the underlying code does
9199 * things differently depending on whether it is zero or non-zero) */
9201 SV* invlist = _new_invlist(size);
9204 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9206 _append_range_to_invlist(invlist, element0, element0);
9207 offset = *get_invlist_offset_addr(invlist);
9209 invlist_set_len(invlist, size, offset);
9210 *other_elements_ptr = invlist_array(invlist) + 1;
9216 PERL_STATIC_INLINE SV*
9217 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9218 return _add_range_to_invlist(invlist, cp, cp);
9221 #ifndef PERL_IN_XSUB_RE
9223 Perl__invlist_invert(pTHX_ SV* const invlist)
9225 /* Complement the input inversion list. This adds a 0 if the list didn't
9226 * have a zero; removes it otherwise. As described above, the data
9227 * structure is set up so that this is very efficient */
9229 PERL_ARGS_ASSERT__INVLIST_INVERT;
9231 assert(! invlist_is_iterating(invlist));
9233 /* The inverse of matching nothing is matching everything */
9234 if (_invlist_len(invlist) == 0) {
9235 _append_range_to_invlist(invlist, 0, UV_MAX);
9239 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9244 PERL_STATIC_INLINE SV*
9245 S_invlist_clone(pTHX_ SV* const invlist)
9248 /* Return a new inversion list that is a copy of the input one, which is
9249 * unchanged. The new list will not be mortal even if the old one was. */
9251 /* Need to allocate extra space to accommodate Perl's addition of a
9252 * trailing NUL to SvPV's, since it thinks they are always strings */
9253 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9254 STRLEN physical_length = SvCUR(invlist);
9255 bool offset = *(get_invlist_offset_addr(invlist));
9257 PERL_ARGS_ASSERT_INVLIST_CLONE;
9259 *(get_invlist_offset_addr(new_invlist)) = offset;
9260 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9261 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9266 PERL_STATIC_INLINE STRLEN*
9267 S_get_invlist_iter_addr(SV* invlist)
9269 /* Return the address of the UV that contains the current iteration
9272 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9274 assert(SvTYPE(invlist) == SVt_INVLIST);
9276 return &(((XINVLIST*) SvANY(invlist))->iterator);
9279 PERL_STATIC_INLINE void
9280 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9282 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9284 *get_invlist_iter_addr(invlist) = 0;
9287 PERL_STATIC_INLINE void
9288 S_invlist_iterfinish(SV* invlist)
9290 /* Terminate iterator for invlist. This is to catch development errors.
9291 * Any iteration that is interrupted before completed should call this
9292 * function. Functions that add code points anywhere else but to the end
9293 * of an inversion list assert that they are not in the middle of an
9294 * iteration. If they were, the addition would make the iteration
9295 * problematical: if the iteration hadn't reached the place where things
9296 * were being added, it would be ok */
9298 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9300 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9304 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9306 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9307 * This call sets in <*start> and <*end>, the next range in <invlist>.
9308 * Returns <TRUE> if successful and the next call will return the next
9309 * range; <FALSE> if was already at the end of the list. If the latter,
9310 * <*start> and <*end> are unchanged, and the next call to this function
9311 * will start over at the beginning of the list */
9313 STRLEN* pos = get_invlist_iter_addr(invlist);
9314 UV len = _invlist_len(invlist);
9317 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9320 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9324 array = invlist_array(invlist);
9326 *start = array[(*pos)++];
9332 *end = array[(*pos)++] - 1;
9338 PERL_STATIC_INLINE UV
9339 S_invlist_highest(SV* const invlist)
9341 /* Returns the highest code point that matches an inversion list. This API
9342 * has an ambiguity, as it returns 0 under either the highest is actually
9343 * 0, or if the list is empty. If this distinction matters to you, check
9344 * for emptiness before calling this function */
9346 UV len = _invlist_len(invlist);
9349 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9355 array = invlist_array(invlist);
9357 /* The last element in the array in the inversion list always starts a
9358 * range that goes to infinity. That range may be for code points that are
9359 * matched in the inversion list, or it may be for ones that aren't
9360 * matched. In the latter case, the highest code point in the set is one
9361 * less than the beginning of this range; otherwise it is the final element
9362 * of this range: infinity */
9363 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9365 : array[len - 1] - 1;
9368 #ifndef PERL_IN_XSUB_RE
9370 Perl__invlist_contents(pTHX_ SV* const invlist)
9372 /* Get the contents of an inversion list into a string SV so that they can
9373 * be printed out. It uses the format traditionally done for debug tracing
9377 SV* output = newSVpvs("\n");
9379 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9381 assert(! invlist_is_iterating(invlist));
9383 invlist_iterinit(invlist);
9384 while (invlist_iternext(invlist, &start, &end)) {
9385 if (end == UV_MAX) {
9386 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9388 else if (end != start) {
9389 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9393 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9401 #ifndef PERL_IN_XSUB_RE
9403 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9404 const char * const indent, SV* const invlist)
9406 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9407 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9408 * the string 'indent'. The output looks like this:
9409 [0] 0x000A .. 0x000D
9411 [4] 0x2028 .. 0x2029
9412 [6] 0x3104 .. INFINITY
9413 * This means that the first range of code points matched by the list are
9414 * 0xA through 0xD; the second range contains only the single code point
9415 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9416 * are used to define each range (except if the final range extends to
9417 * infinity, only a single element is needed). The array index of the
9418 * first element for the corresponding range is given in brackets. */
9423 PERL_ARGS_ASSERT__INVLIST_DUMP;
9425 if (invlist_is_iterating(invlist)) {
9426 Perl_dump_indent(aTHX_ level, file,
9427 "%sCan't dump inversion list because is in middle of iterating\n",
9432 invlist_iterinit(invlist);
9433 while (invlist_iternext(invlist, &start, &end)) {
9434 if (end == UV_MAX) {
9435 Perl_dump_indent(aTHX_ level, file,
9436 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9437 indent, (UV)count, start);
9439 else if (end != start) {
9440 Perl_dump_indent(aTHX_ level, file,
9441 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9442 indent, (UV)count, start, end);
9445 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9446 indent, (UV)count, start);
9453 Perl__load_PL_utf8_foldclosures (pTHX)
9455 assert(! PL_utf8_foldclosures);
9457 /* If the folds haven't been read in, call a fold function
9459 if (! PL_utf8_tofold) {
9460 U8 dummy[UTF8_MAXBYTES_CASE+1];
9462 /* This string is just a short named one above \xff */
9463 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9464 assert(PL_utf8_tofold); /* Verify that worked */
9466 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9470 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
9472 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9474 /* Return a boolean as to if the two passed in inversion lists are
9475 * identical. The final argument, if TRUE, says to take the complement of
9476 * the second inversion list before doing the comparison */
9478 const UV* array_a = invlist_array(a);
9479 const UV* array_b = invlist_array(b);
9480 UV len_a = _invlist_len(a);
9481 UV len_b = _invlist_len(b);
9483 UV i = 0; /* current index into the arrays */
9484 bool retval = TRUE; /* Assume are identical until proven otherwise */
9486 PERL_ARGS_ASSERT__INVLISTEQ;
9488 /* If are to compare 'a' with the complement of b, set it
9489 * up so are looking at b's complement. */
9492 /* The complement of nothing is everything, so <a> would have to have
9493 * just one element, starting at zero (ending at infinity) */
9495 return (len_a == 1 && array_a[0] == 0);
9497 else if (array_b[0] == 0) {
9499 /* Otherwise, to complement, we invert. Here, the first element is
9500 * 0, just remove it. To do this, we just pretend the array starts
9508 /* But if the first element is not zero, we pretend the list starts
9509 * at the 0 that is always stored immediately before the array. */
9515 /* Make sure that the lengths are the same, as well as the final element
9516 * before looping through the remainder. (Thus we test the length, final,
9517 * and first elements right off the bat) */
9518 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9521 else for (i = 0; i < len_a - 1; i++) {
9522 if (array_a[i] != array_b[i]) {
9533 * As best we can, determine the characters that can match the start of
9534 * the given EXACTF-ish node.
9536 * Returns the invlist as a new SV*; it is the caller's responsibility to
9537 * call SvREFCNT_dec() when done with it.
9540 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9542 const U8 * s = (U8*)STRING(node);
9543 SSize_t bytelen = STR_LEN(node);
9545 /* Start out big enough for 2 separate code points */
9546 SV* invlist = _new_invlist(4);
9548 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9553 /* We punt and assume can match anything if the node begins
9554 * with a multi-character fold. Things are complicated. For
9555 * example, /ffi/i could match any of:
9556 * "\N{LATIN SMALL LIGATURE FFI}"
9557 * "\N{LATIN SMALL LIGATURE FF}I"
9558 * "F\N{LATIN SMALL LIGATURE FI}"
9559 * plus several other things; and making sure we have all the
9560 * possibilities is hard. */
9561 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9562 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9565 /* Any Latin1 range character can potentially match any
9566 * other depending on the locale */
9567 if (OP(node) == EXACTFL) {
9568 _invlist_union(invlist, PL_Latin1, &invlist);
9571 /* But otherwise, it matches at least itself. We can
9572 * quickly tell if it has a distinct fold, and if so,
9573 * it matches that as well */
9574 invlist = add_cp_to_invlist(invlist, uc);
9575 if (IS_IN_SOME_FOLD_L1(uc))
9576 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9579 /* Some characters match above-Latin1 ones under /i. This
9580 * is true of EXACTFL ones when the locale is UTF-8 */
9581 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9582 && (! isASCII(uc) || (OP(node) != EXACTFA
9583 && OP(node) != EXACTFA_NO_TRIE)))
9585 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9589 else { /* Pattern is UTF-8 */
9590 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9591 STRLEN foldlen = UTF8SKIP(s);
9592 const U8* e = s + bytelen;
9595 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9597 /* The only code points that aren't folded in a UTF EXACTFish
9598 * node are are the problematic ones in EXACTFL nodes */
9599 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9600 /* We need to check for the possibility that this EXACTFL
9601 * node begins with a multi-char fold. Therefore we fold
9602 * the first few characters of it so that we can make that
9607 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9609 *(d++) = (U8) toFOLD(*s);
9614 to_utf8_fold(s, d, &len);
9620 /* And set up so the code below that looks in this folded
9621 * buffer instead of the node's string */
9623 foldlen = UTF8SKIP(folded);
9627 /* When we reach here 's' points to the fold of the first
9628 * character(s) of the node; and 'e' points to far enough along
9629 * the folded string to be just past any possible multi-char
9630 * fold. 'foldlen' is the length in bytes of the first
9633 * Unlike the non-UTF-8 case, the macro for determining if a
9634 * string is a multi-char fold requires all the characters to
9635 * already be folded. This is because of all the complications
9636 * if not. Note that they are folded anyway, except in EXACTFL
9637 * nodes. Like the non-UTF case above, we punt if the node
9638 * begins with a multi-char fold */
9640 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9641 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9643 else { /* Single char fold */
9645 /* It matches all the things that fold to it, which are
9646 * found in PL_utf8_foldclosures (including itself) */
9647 invlist = add_cp_to_invlist(invlist, uc);
9648 if (! PL_utf8_foldclosures)
9649 _load_PL_utf8_foldclosures();
9650 if ((listp = hv_fetch(PL_utf8_foldclosures,
9651 (char *) s, foldlen, FALSE)))
9653 AV* list = (AV*) *listp;
9655 for (k = 0; k <= av_tindex(list); k++) {
9656 SV** c_p = av_fetch(list, k, FALSE);
9662 /* /aa doesn't allow folds between ASCII and non- */
9663 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9664 && isASCII(c) != isASCII(uc))
9669 invlist = add_cp_to_invlist(invlist, c);
9678 #undef HEADER_LENGTH
9679 #undef TO_INTERNAL_SIZE
9680 #undef FROM_INTERNAL_SIZE
9681 #undef INVLIST_VERSION_ID
9683 /* End of inversion list object */
9686 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9688 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9689 * constructs, and updates RExC_flags with them. On input, RExC_parse
9690 * should point to the first flag; it is updated on output to point to the
9691 * final ')' or ':'. There needs to be at least one flag, or this will
9694 /* for (?g), (?gc), and (?o) warnings; warning
9695 about (?c) will warn about (?g) -- japhy */
9697 #define WASTED_O 0x01
9698 #define WASTED_G 0x02
9699 #define WASTED_C 0x04
9700 #define WASTED_GC (WASTED_G|WASTED_C)
9701 I32 wastedflags = 0x00;
9702 U32 posflags = 0, negflags = 0;
9703 U32 *flagsp = &posflags;
9704 char has_charset_modifier = '\0';
9706 bool has_use_defaults = FALSE;
9707 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9708 int x_mod_count = 0;
9710 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9712 /* '^' as an initial flag sets certain defaults */
9713 if (UCHARAT(RExC_parse) == '^') {
9715 has_use_defaults = TRUE;
9716 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9717 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9718 ? REGEX_UNICODE_CHARSET
9719 : REGEX_DEPENDS_CHARSET);
9722 cs = get_regex_charset(RExC_flags);
9723 if (cs == REGEX_DEPENDS_CHARSET
9724 && (RExC_utf8 || RExC_uni_semantics))
9726 cs = REGEX_UNICODE_CHARSET;
9729 while (*RExC_parse) {
9730 /* && strchr("iogcmsx", *RExC_parse) */
9731 /* (?g), (?gc) and (?o) are useless here
9732 and must be globally applied -- japhy */
9733 switch (*RExC_parse) {
9735 /* Code for the imsxn flags */
9736 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9738 case LOCALE_PAT_MOD:
9739 if (has_charset_modifier) {
9740 goto excess_modifier;
9742 else if (flagsp == &negflags) {
9745 cs = REGEX_LOCALE_CHARSET;
9746 has_charset_modifier = LOCALE_PAT_MOD;
9748 case UNICODE_PAT_MOD:
9749 if (has_charset_modifier) {
9750 goto excess_modifier;
9752 else if (flagsp == &negflags) {
9755 cs = REGEX_UNICODE_CHARSET;
9756 has_charset_modifier = UNICODE_PAT_MOD;
9758 case ASCII_RESTRICT_PAT_MOD:
9759 if (flagsp == &negflags) {
9762 if (has_charset_modifier) {
9763 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9764 goto excess_modifier;
9766 /* Doubled modifier implies more restricted */
9767 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9770 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9772 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9774 case DEPENDS_PAT_MOD:
9775 if (has_use_defaults) {
9776 goto fail_modifiers;
9778 else if (flagsp == &negflags) {
9781 else if (has_charset_modifier) {
9782 goto excess_modifier;
9785 /* The dual charset means unicode semantics if the
9786 * pattern (or target, not known until runtime) are
9787 * utf8, or something in the pattern indicates unicode
9789 cs = (RExC_utf8 || RExC_uni_semantics)
9790 ? REGEX_UNICODE_CHARSET
9791 : REGEX_DEPENDS_CHARSET;
9792 has_charset_modifier = DEPENDS_PAT_MOD;
9796 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9797 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9799 else if (has_charset_modifier == *(RExC_parse - 1)) {
9800 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9804 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9806 NOT_REACHED; /*NOTREACHED*/
9809 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9811 NOT_REACHED; /*NOTREACHED*/
9812 case ONCE_PAT_MOD: /* 'o' */
9813 case GLOBAL_PAT_MOD: /* 'g' */
9814 if (PASS2 && ckWARN(WARN_REGEXP)) {
9815 const I32 wflagbit = *RExC_parse == 'o'
9818 if (! (wastedflags & wflagbit) ) {
9819 wastedflags |= wflagbit;
9820 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9823 "Useless (%s%c) - %suse /%c modifier",
9824 flagsp == &negflags ? "?-" : "?",
9826 flagsp == &negflags ? "don't " : "",
9833 case CONTINUE_PAT_MOD: /* 'c' */
9834 if (PASS2 && ckWARN(WARN_REGEXP)) {
9835 if (! (wastedflags & WASTED_C) ) {
9836 wastedflags |= WASTED_GC;
9837 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9840 "Useless (%sc) - %suse /gc modifier",
9841 flagsp == &negflags ? "?-" : "?",
9842 flagsp == &negflags ? "don't " : ""
9847 case KEEPCOPY_PAT_MOD: /* 'p' */
9848 if (flagsp == &negflags) {
9850 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9852 *flagsp |= RXf_PMf_KEEPCOPY;
9856 /* A flag is a default iff it is following a minus, so
9857 * if there is a minus, it means will be trying to
9858 * re-specify a default which is an error */
9859 if (has_use_defaults || flagsp == &negflags) {
9860 goto fail_modifiers;
9863 wastedflags = 0; /* reset so (?g-c) warns twice */
9867 RExC_flags |= posflags;
9868 RExC_flags &= ~negflags;
9869 set_regex_charset(&RExC_flags, cs);
9870 if (RExC_flags & RXf_PMf_FOLD) {
9871 RExC_contains_i = 1;
9874 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9880 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9881 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9882 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9883 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9884 NOT_REACHED; /*NOTREACHED*/
9890 vFAIL("Sequence (?... not terminated");
9894 - reg - regular expression, i.e. main body or parenthesized thing
9896 * Caller must absorb opening parenthesis.
9898 * Combining parenthesis handling with the base level of regular expression
9899 * is a trifle forced, but the need to tie the tails of the branches to what
9900 * follows makes it hard to avoid.
9902 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9904 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9906 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9909 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9910 flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
9911 needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
9912 upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns
9913 NULL, which cannot happen. */
9915 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9916 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9917 * 2 is like 1, but indicates that nextchar() has been called to advance
9918 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9919 * this flag alerts us to the need to check for that */
9921 regnode *ret; /* Will be the head of the group. */
9924 regnode *ender = NULL;
9927 U32 oregflags = RExC_flags;
9928 bool have_branch = 0;
9930 I32 freeze_paren = 0;
9931 I32 after_freeze = 0;
9932 I32 num; /* numeric backreferences */
9934 char * parse_start = RExC_parse; /* MJD */
9935 char * const oregcomp_parse = RExC_parse;
9937 GET_RE_DEBUG_FLAGS_DECL;
9939 PERL_ARGS_ASSERT_REG;
9940 DEBUG_PARSE("reg ");
9942 *flagp = 0; /* Tentatively. */
9944 /* Having this true makes it feasible to have a lot fewer tests for the
9945 * parse pointer being in scope. For example, we can write
9946 * while(isFOO(*RExC_parse)) RExC_parse++;
9948 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
9950 assert(*RExC_end == '\0');
9952 /* Make an OPEN node, if parenthesized. */
9955 /* Under /x, space and comments can be gobbled up between the '(' and
9956 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9957 * intervening space, as the sequence is a token, and a token should be
9959 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9961 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9962 char *start_verb = RExC_parse;
9963 STRLEN verb_len = 0;
9964 char *start_arg = NULL;
9965 unsigned char op = 0;
9966 int arg_required = 0;
9967 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
9969 if (has_intervening_patws) {
9971 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9973 while ( *RExC_parse && *RExC_parse != ')' ) {
9974 if ( *RExC_parse == ':' ) {
9975 start_arg = RExC_parse + 1;
9981 verb_len = RExC_parse - start_verb;
9984 while ( *RExC_parse && *RExC_parse != ')' )
9986 if ( *RExC_parse != ')' )
9987 vFAIL("Unterminated verb pattern argument");
9988 if ( RExC_parse == start_arg )
9991 if ( *RExC_parse != ')' )
9992 vFAIL("Unterminated verb pattern");
9995 switch ( *start_verb ) {
9996 case 'A': /* (*ACCEPT) */
9997 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9999 internal_argval = RExC_nestroot;
10002 case 'C': /* (*COMMIT) */
10003 if ( memEQs(start_verb,verb_len,"COMMIT") )
10006 case 'F': /* (*FAIL) */
10007 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
10011 case ':': /* (*:NAME) */
10012 case 'M': /* (*MARK:NAME) */
10013 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
10018 case 'P': /* (*PRUNE) */
10019 if ( memEQs(start_verb,verb_len,"PRUNE") )
10022 case 'S': /* (*SKIP) */
10023 if ( memEQs(start_verb,verb_len,"SKIP") )
10026 case 'T': /* (*THEN) */
10027 /* [19:06] <TimToady> :: is then */
10028 if ( memEQs(start_verb,verb_len,"THEN") ) {
10030 RExC_seen |= REG_CUTGROUP_SEEN;
10035 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10037 "Unknown verb pattern '%"UTF8f"'",
10038 UTF8fARG(UTF, verb_len, start_verb));
10040 if ( arg_required && !start_arg ) {
10041 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
10042 verb_len, start_verb);
10044 if (internal_argval == -1) {
10045 ret = reganode(pRExC_state, op, 0);
10047 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
10049 RExC_seen |= REG_VERBARG_SEEN;
10050 if ( ! SIZE_ONLY ) {
10052 SV *sv = newSVpvn( start_arg,
10053 RExC_parse - start_arg);
10054 ARG(ret) = add_data( pRExC_state,
10055 STR_WITH_LEN("S"));
10056 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
10061 if ( internal_argval != -1 )
10062 ARG2L_SET(ret, internal_argval);
10064 nextchar(pRExC_state);
10067 else if (*RExC_parse == '?') { /* (?...) */
10068 bool is_logical = 0;
10069 const char * const seqstart = RExC_parse;
10070 const char * endptr;
10071 if (has_intervening_patws) {
10073 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
10077 paren = *RExC_parse++;
10078 ret = NULL; /* For lookahead/behind. */
10081 case 'P': /* (?P...) variants for those used to PCRE/Python */
10082 paren = *RExC_parse++;
10083 if ( paren == '<') /* (?P<...>) named capture */
10084 goto named_capture;
10085 else if (paren == '>') { /* (?P>name) named recursion */
10086 goto named_recursion;
10088 else if (paren == '=') { /* (?P=...) named backref */
10089 /* this pretty much dupes the code for \k<NAME> in
10090 * regatom(), if you change this make sure you change that
10092 char* name_start = RExC_parse;
10094 SV *sv_dat = reg_scan_name(pRExC_state,
10095 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10096 if (RExC_parse == name_start || *RExC_parse != ')')
10097 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
10098 vFAIL2("Sequence %.3s... not terminated",parse_start);
10101 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10102 RExC_rxi->data->data[num]=(void*)sv_dat;
10103 SvREFCNT_inc_simple_void(sv_dat);
10106 ret = reganode(pRExC_state,
10109 : (ASCII_FOLD_RESTRICTED)
10111 : (AT_LEAST_UNI_SEMANTICS)
10117 *flagp |= HASWIDTH;
10119 Set_Node_Offset(ret, parse_start+1);
10120 Set_Node_Cur_Length(ret, parse_start);
10122 nextchar(pRExC_state);
10126 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10127 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10128 vFAIL3("Sequence (%.*s...) not recognized",
10129 RExC_parse-seqstart, seqstart);
10130 NOT_REACHED; /*NOTREACHED*/
10131 case '<': /* (?<...) */
10132 if (*RExC_parse == '!')
10134 else if (*RExC_parse != '=')
10140 case '\'': /* (?'...') */
10141 name_start= RExC_parse;
10142 svname = reg_scan_name(pRExC_state,
10143 SIZE_ONLY /* reverse test from the others */
10144 ? REG_RSN_RETURN_NAME
10145 : REG_RSN_RETURN_NULL);
10146 if (RExC_parse == name_start || *RExC_parse != paren)
10147 vFAIL2("Sequence (?%c... not terminated",
10148 paren=='>' ? '<' : paren);
10152 if (!svname) /* shouldn't happen */
10154 "panic: reg_scan_name returned NULL");
10155 if (!RExC_paren_names) {
10156 RExC_paren_names= newHV();
10157 sv_2mortal(MUTABLE_SV(RExC_paren_names));
10159 RExC_paren_name_list= newAV();
10160 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
10163 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10165 sv_dat = HeVAL(he_str);
10167 /* croak baby croak */
10169 "panic: paren_name hash element allocation failed");
10170 } else if ( SvPOK(sv_dat) ) {
10171 /* (?|...) can mean we have dupes so scan to check
10172 its already been stored. Maybe a flag indicating
10173 we are inside such a construct would be useful,
10174 but the arrays are likely to be quite small, so
10175 for now we punt -- dmq */
10176 IV count = SvIV(sv_dat);
10177 I32 *pv = (I32*)SvPVX(sv_dat);
10179 for ( i = 0 ; i < count ; i++ ) {
10180 if ( pv[i] == RExC_npar ) {
10186 pv = (I32*)SvGROW(sv_dat,
10187 SvCUR(sv_dat) + sizeof(I32)+1);
10188 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10189 pv[count] = RExC_npar;
10190 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10193 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10194 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10197 SvIV_set(sv_dat, 1);
10200 /* Yes this does cause a memory leak in debugging Perls
10202 if (!av_store(RExC_paren_name_list,
10203 RExC_npar, SvREFCNT_inc(svname)))
10204 SvREFCNT_dec_NN(svname);
10207 /*sv_dump(sv_dat);*/
10209 nextchar(pRExC_state);
10211 goto capturing_parens;
10213 RExC_seen |= REG_LOOKBEHIND_SEEN;
10214 RExC_in_lookbehind++;
10217 case '=': /* (?=...) */
10218 RExC_seen_zerolen++;
10220 case '!': /* (?!...) */
10221 RExC_seen_zerolen++;
10222 /* check if we're really just a "FAIL" assertion */
10223 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10224 FALSE /* Don't force to /x */ );
10225 if (*RExC_parse == ')') {
10226 ret=reganode(pRExC_state, OPFAIL, 0);
10227 nextchar(pRExC_state);
10231 case '|': /* (?|...) */
10232 /* branch reset, behave like a (?:...) except that
10233 buffers in alternations share the same numbers */
10235 after_freeze = freeze_paren = RExC_npar;
10237 case ':': /* (?:...) */
10238 case '>': /* (?>...) */
10240 case '$': /* (?$...) */
10241 case '@': /* (?@...) */
10242 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10244 case '0' : /* (?0) */
10245 case 'R' : /* (?R) */
10246 if (*RExC_parse != ')')
10247 FAIL("Sequence (?R) not terminated");
10248 ret = reg_node(pRExC_state, GOSTART);
10249 RExC_seen |= REG_GOSTART_SEEN;
10250 *flagp |= POSTPONED;
10251 nextchar(pRExC_state);
10254 /* named and numeric backreferences */
10255 case '&': /* (?&NAME) */
10256 parse_start = RExC_parse - 1;
10259 SV *sv_dat = reg_scan_name(pRExC_state,
10260 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10261 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10263 if (RExC_parse == RExC_end || *RExC_parse != ')')
10264 vFAIL("Sequence (?&... not terminated");
10265 goto gen_recurse_regop;
10268 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10270 vFAIL("Illegal pattern");
10272 goto parse_recursion;
10274 case '-': /* (?-1) */
10275 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10276 RExC_parse--; /* rewind to let it be handled later */
10280 case '1': case '2': case '3': case '4': /* (?1) */
10281 case '5': case '6': case '7': case '8': case '9':
10285 bool is_neg = FALSE;
10287 parse_start = RExC_parse - 1; /* MJD */
10288 if (*RExC_parse == '-') {
10292 if (grok_atoUV(RExC_parse, &unum, &endptr)
10296 RExC_parse = (char*)endptr;
10300 /* Some limit for num? */
10304 if (*RExC_parse!=')')
10305 vFAIL("Expecting close bracket");
10308 if ( paren == '-' ) {
10310 Diagram of capture buffer numbering.
10311 Top line is the normal capture buffer numbers
10312 Bottom line is the negative indexing as from
10316 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10320 num = RExC_npar + num;
10323 vFAIL("Reference to nonexistent group");
10325 } else if ( paren == '+' ) {
10326 num = RExC_npar + num - 1;
10329 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10331 if (num > (I32)RExC_rx->nparens) {
10333 vFAIL("Reference to nonexistent group");
10335 RExC_recurse_count++;
10336 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10337 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10338 22, "| |", (int)(depth * 2 + 1), "",
10339 (UV)ARG(ret), (IV)ARG2L(ret)));
10341 RExC_seen |= REG_RECURSE_SEEN;
10342 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10343 Set_Node_Offset(ret, parse_start); /* MJD */
10345 *flagp |= POSTPONED;
10346 nextchar(pRExC_state);
10351 case '?': /* (??...) */
10353 if (*RExC_parse != '{') {
10354 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10355 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10357 "Sequence (%"UTF8f"...) not recognized",
10358 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10359 NOT_REACHED; /*NOTREACHED*/
10361 *flagp |= POSTPONED;
10362 paren = *RExC_parse++;
10364 case '{': /* (?{...}) */
10367 struct reg_code_block *cb;
10369 RExC_seen_zerolen++;
10371 if ( !pRExC_state->num_code_blocks
10372 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10373 || pRExC_state->code_blocks[pRExC_state->code_index].start
10374 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10377 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10378 FAIL("panic: Sequence (?{...}): no code block found\n");
10379 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10381 /* this is a pre-compiled code block (?{...}) */
10382 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10383 RExC_parse = RExC_start + cb->end;
10386 if (cb->src_regex) {
10387 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10388 RExC_rxi->data->data[n] =
10389 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10390 RExC_rxi->data->data[n+1] = (void*)o;
10393 n = add_data(pRExC_state,
10394 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10395 RExC_rxi->data->data[n] = (void*)o;
10398 pRExC_state->code_index++;
10399 nextchar(pRExC_state);
10403 ret = reg_node(pRExC_state, LOGICAL);
10405 eval = reg2Lanode(pRExC_state, EVAL,
10408 /* for later propagation into (??{})
10410 RExC_flags & RXf_PMf_COMPILETIME
10415 REGTAIL(pRExC_state, ret, eval);
10416 /* deal with the length of this later - MJD */
10419 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10420 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10421 Set_Node_Offset(ret, parse_start);
10424 case '(': /* (?(?{...})...) and (?(?=...)...) */
10427 const int DEFINE_len = sizeof("DEFINE") - 1;
10428 if (RExC_parse[0] == '?') { /* (?(?...)) */
10430 RExC_parse[1] == '=' ||
10431 RExC_parse[1] == '!' ||
10432 RExC_parse[1] == '<' ||
10433 RExC_parse[1] == '{'
10434 ) { /* Lookahead or eval. */
10438 ret = reg_node(pRExC_state, LOGICAL);
10442 tail = reg(pRExC_state, 1, &flag, depth+1);
10443 if (flag & (RESTART_PASS1|NEED_UTF8)) {
10444 *flagp = flag & (RESTART_PASS1|NEED_UTF8);
10447 REGTAIL(pRExC_state, ret, tail);
10450 /* Fall through to ‘Unknown switch condition’ at the
10451 end of the if/else chain. */
10453 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10454 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10456 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10457 char *name_start= RExC_parse++;
10459 SV *sv_dat=reg_scan_name(pRExC_state,
10460 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10461 if (RExC_parse == name_start || *RExC_parse != ch)
10462 vFAIL2("Sequence (?(%c... not terminated",
10463 (ch == '>' ? '<' : ch));
10466 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10467 RExC_rxi->data->data[num]=(void*)sv_dat;
10468 SvREFCNT_inc_simple_void(sv_dat);
10470 ret = reganode(pRExC_state,NGROUPP,num);
10471 goto insert_if_check_paren;
10473 else if (RExC_end - RExC_parse >= DEFINE_len
10474 && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10476 ret = reganode(pRExC_state,DEFINEP,0);
10477 RExC_parse += DEFINE_len;
10479 goto insert_if_check_paren;
10481 else if (RExC_parse[0] == 'R') {
10484 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10486 if (grok_atoUV(RExC_parse, &uv, &endptr)
10490 RExC_parse = (char*)endptr;
10492 /* else "Switch condition not recognized" below */
10493 } else if (RExC_parse[0] == '&') {
10496 sv_dat = reg_scan_name(pRExC_state,
10498 ? REG_RSN_RETURN_NULL
10499 : REG_RSN_RETURN_DATA);
10500 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10502 ret = reganode(pRExC_state,INSUBP,parno);
10503 goto insert_if_check_paren;
10505 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10509 if (grok_atoUV(RExC_parse, &uv, &endptr)
10513 RExC_parse = (char*)endptr;
10516 vFAIL("panic: grok_atoUV returned FALSE");
10518 ret = reganode(pRExC_state, GROUPP, parno);
10520 insert_if_check_paren:
10521 if (UCHARAT(RExC_parse) != ')') {
10522 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10523 vFAIL("Switch condition not recognized");
10525 nextchar(pRExC_state);
10527 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10528 br = regbranch(pRExC_state, &flags, 1,depth+1);
10530 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10531 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10534 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10537 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10539 c = UCHARAT(RExC_parse);
10540 nextchar(pRExC_state);
10541 if (flags&HASWIDTH)
10542 *flagp |= HASWIDTH;
10545 vFAIL("(?(DEFINE)....) does not allow branches");
10547 /* Fake one for optimizer. */
10548 lastbr = reganode(pRExC_state, IFTHEN, 0);
10550 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10551 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10552 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10555 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10558 REGTAIL(pRExC_state, ret, lastbr);
10559 if (flags&HASWIDTH)
10560 *flagp |= HASWIDTH;
10561 c = UCHARAT(RExC_parse);
10562 nextchar(pRExC_state);
10567 if (RExC_parse>RExC_end)
10568 vFAIL("Switch (?(condition)... not terminated");
10570 vFAIL("Switch (?(condition)... contains too many branches");
10572 ender = reg_node(pRExC_state, TAIL);
10573 REGTAIL(pRExC_state, br, ender);
10575 REGTAIL(pRExC_state, lastbr, ender);
10576 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10579 REGTAIL(pRExC_state, ret, ender);
10580 RExC_size++; /* XXX WHY do we need this?!!
10581 For large programs it seems to be required
10582 but I can't figure out why. -- dmq*/
10585 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10586 vFAIL("Unknown switch condition (?(...))");
10588 case '[': /* (?[ ... ]) */
10589 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10592 RExC_parse--; /* for vFAIL to print correctly */
10593 vFAIL("Sequence (? incomplete");
10595 default: /* e.g., (?i) */
10598 parse_lparen_question_flags(pRExC_state);
10599 if (UCHARAT(RExC_parse) != ':') {
10601 nextchar(pRExC_state);
10606 nextchar(pRExC_state);
10611 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
10616 ret = reganode(pRExC_state, OPEN, parno);
10618 if (!RExC_nestroot)
10619 RExC_nestroot = parno;
10620 if (RExC_seen & REG_RECURSE_SEEN
10621 && !RExC_open_parens[parno-1])
10623 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10624 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10625 22, "| |", (int)(depth * 2 + 1), "",
10626 (IV)parno, REG_NODE_NUM(ret)));
10627 RExC_open_parens[parno-1]= ret;
10630 Set_Node_Length(ret, 1); /* MJD */
10631 Set_Node_Offset(ret, RExC_parse); /* MJD */
10634 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10643 /* Pick up the branches, linking them together. */
10644 parse_start = RExC_parse; /* MJD */
10645 br = regbranch(pRExC_state, &flags, 1,depth+1);
10647 /* branch_len = (paren != 0); */
10650 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10651 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10654 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10656 if (*RExC_parse == '|') {
10657 if (!SIZE_ONLY && RExC_extralen) {
10658 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10661 reginsert(pRExC_state, BRANCH, br, depth+1);
10662 Set_Node_Length(br, paren != 0);
10663 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10667 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10669 else if (paren == ':') {
10670 *flagp |= flags&SIMPLE;
10672 if (is_open) { /* Starts with OPEN. */
10673 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10675 else if (paren != '?') /* Not Conditional */
10677 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10679 while (*RExC_parse == '|') {
10680 if (!SIZE_ONLY && RExC_extralen) {
10681 ender = reganode(pRExC_state, LONGJMP,0);
10683 /* Append to the previous. */
10684 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10687 RExC_extralen += 2; /* Account for LONGJMP. */
10688 nextchar(pRExC_state);
10689 if (freeze_paren) {
10690 if (RExC_npar > after_freeze)
10691 after_freeze = RExC_npar;
10692 RExC_npar = freeze_paren;
10694 br = regbranch(pRExC_state, &flags, 0, depth+1);
10697 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10698 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10701 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10703 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10705 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10708 if (have_branch || paren != ':') {
10709 /* Make a closing node, and hook it on the end. */
10712 ender = reg_node(pRExC_state, TAIL);
10715 ender = reganode(pRExC_state, CLOSE, parno);
10716 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10717 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10718 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10719 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10720 RExC_close_parens[parno-1]= ender;
10721 if (RExC_nestroot == parno)
10724 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10725 Set_Node_Length(ender,1); /* MJD */
10731 *flagp &= ~HASWIDTH;
10734 ender = reg_node(pRExC_state, SUCCEED);
10737 ender = reg_node(pRExC_state, END);
10739 assert(!RExC_opend); /* there can only be one! */
10740 RExC_opend = ender;
10744 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10745 DEBUG_PARSE_MSG("lsbr");
10746 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10747 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10748 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10749 SvPV_nolen_const(RExC_mysv1),
10750 (IV)REG_NODE_NUM(lastbr),
10751 SvPV_nolen_const(RExC_mysv2),
10752 (IV)REG_NODE_NUM(ender),
10753 (IV)(ender - lastbr)
10756 REGTAIL(pRExC_state, lastbr, ender);
10758 if (have_branch && !SIZE_ONLY) {
10759 char is_nothing= 1;
10761 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10763 /* Hook the tails of the branches to the closing node. */
10764 for (br = ret; br; br = regnext(br)) {
10765 const U8 op = PL_regkind[OP(br)];
10766 if (op == BRANCH) {
10767 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10768 if ( OP(NEXTOPER(br)) != NOTHING
10769 || regnext(NEXTOPER(br)) != ender)
10772 else if (op == BRANCHJ) {
10773 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10774 /* for now we always disable this optimisation * /
10775 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10776 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10782 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10783 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10784 DEBUG_PARSE_MSG("NADA");
10785 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10786 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10787 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10788 SvPV_nolen_const(RExC_mysv1),
10789 (IV)REG_NODE_NUM(ret),
10790 SvPV_nolen_const(RExC_mysv2),
10791 (IV)REG_NODE_NUM(ender),
10796 if (OP(ender) == TAIL) {
10801 for ( opt= br + 1; opt < ender ; opt++ )
10802 OP(opt)= OPTIMIZED;
10803 NEXT_OFF(br)= ender - br;
10811 static const char parens[] = "=!<,>";
10813 if (paren && (p = strchr(parens, paren))) {
10814 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10815 int flag = (p - parens) > 1;
10818 node = SUSPEND, flag = 0;
10819 reginsert(pRExC_state, node,ret, depth+1);
10820 Set_Node_Cur_Length(ret, parse_start);
10821 Set_Node_Offset(ret, parse_start + 1);
10823 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10827 /* Check for proper termination. */
10829 /* restore original flags, but keep (?p) and, if we've changed from /d
10830 * rules to /u, keep the /u */
10831 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10832 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
10833 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
10835 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
10836 RExC_parse = oregcomp_parse;
10837 vFAIL("Unmatched (");
10839 nextchar(pRExC_state);
10841 else if (!paren && RExC_parse < RExC_end) {
10842 if (*RExC_parse == ')') {
10844 vFAIL("Unmatched )");
10847 FAIL("Junk on end of regexp"); /* "Can't happen". */
10848 NOT_REACHED; /* NOTREACHED */
10851 if (RExC_in_lookbehind) {
10852 RExC_in_lookbehind--;
10854 if (after_freeze > RExC_npar)
10855 RExC_npar = after_freeze;
10860 - regbranch - one alternative of an | operator
10862 * Implements the concatenation operator.
10864 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10865 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10868 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10871 regnode *chain = NULL;
10873 I32 flags = 0, c = 0;
10874 GET_RE_DEBUG_FLAGS_DECL;
10876 PERL_ARGS_ASSERT_REGBRANCH;
10878 DEBUG_PARSE("brnc");
10883 if (!SIZE_ONLY && RExC_extralen)
10884 ret = reganode(pRExC_state, BRANCHJ,0);
10886 ret = reg_node(pRExC_state, BRANCH);
10887 Set_Node_Length(ret, 1);
10891 if (!first && SIZE_ONLY)
10892 RExC_extralen += 1; /* BRANCHJ */
10894 *flagp = WORST; /* Tentatively. */
10896 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
10897 FALSE /* Don't force to /x */ );
10898 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10899 flags &= ~TRYAGAIN;
10900 latest = regpiece(pRExC_state, &flags,depth+1);
10901 if (latest == NULL) {
10902 if (flags & TRYAGAIN)
10904 if (flags & (RESTART_PASS1|NEED_UTF8)) {
10905 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
10908 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10910 else if (ret == NULL)
10912 *flagp |= flags&(HASWIDTH|POSTPONED);
10913 if (chain == NULL) /* First piece. */
10914 *flagp |= flags&SPSTART;
10916 /* FIXME adding one for every branch after the first is probably
10917 * excessive now we have TRIE support. (hv) */
10919 REGTAIL(pRExC_state, chain, latest);
10924 if (chain == NULL) { /* Loop ran zero times. */
10925 chain = reg_node(pRExC_state, NOTHING);
10930 *flagp |= flags&SIMPLE;
10937 - regpiece - something followed by possible [*+?]
10939 * Note that the branching code sequences used for ? and the general cases
10940 * of * and + are somewhat optimized: they use the same NOTHING node as
10941 * both the endmarker for their branch list and the body of the last branch.
10942 * It might seem that this node could be dispensed with entirely, but the
10943 * endmarker role is not redundant.
10945 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10947 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
10948 * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
10951 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10957 const char * const origparse = RExC_parse;
10959 I32 max = REG_INFTY;
10960 #ifdef RE_TRACK_PATTERN_OFFSETS
10963 const char *maxpos = NULL;
10966 /* Save the original in case we change the emitted regop to a FAIL. */
10967 regnode * const orig_emit = RExC_emit;
10969 GET_RE_DEBUG_FLAGS_DECL;
10971 PERL_ARGS_ASSERT_REGPIECE;
10973 DEBUG_PARSE("piec");
10975 ret = regatom(pRExC_state, &flags,depth+1);
10977 if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
10978 *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
10980 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10986 if (op == '{' && regcurly(RExC_parse)) {
10988 #ifdef RE_TRACK_PATTERN_OFFSETS
10989 parse_start = RExC_parse; /* MJD */
10991 next = RExC_parse + 1;
10992 while (isDIGIT(*next) || *next == ',') {
10993 if (*next == ',') {
11001 if (*next == '}') { /* got one */
11002 const char* endptr;
11006 if (isDIGIT(*RExC_parse)) {
11007 if (!grok_atoUV(RExC_parse, &uv, &endptr))
11008 vFAIL("Invalid quantifier in {,}");
11009 if (uv >= REG_INFTY)
11010 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11015 if (*maxpos == ',')
11018 maxpos = RExC_parse;
11019 if (isDIGIT(*maxpos)) {
11020 if (!grok_atoUV(maxpos, &uv, &endptr))
11021 vFAIL("Invalid quantifier in {,}");
11022 if (uv >= REG_INFTY)
11023 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
11026 max = REG_INFTY; /* meaning "infinity" */
11029 nextchar(pRExC_state);
11030 if (max < min) { /* If can't match, warn and optimize to fail
11034 /* We can't back off the size because we have to reserve
11035 * enough space for all the things we are about to throw
11036 * away, but we can shrink it by the ammount we are about
11037 * to re-use here */
11038 RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
11041 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
11042 RExC_emit = orig_emit;
11044 ret = reganode(pRExC_state, OPFAIL, 0);
11047 else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
11050 ckWARN2reg(RExC_parse + 1,
11051 "Useless use of greediness modifier '%c'",
11057 if ((flags&SIMPLE)) {
11058 if (min == 0 && max == REG_INFTY) {
11059 reginsert(pRExC_state, STAR, ret, depth+1);
11062 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11065 if (min == 1 && max == REG_INFTY) {
11066 reginsert(pRExC_state, PLUS, ret, depth+1);
11069 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11072 MARK_NAUGHTY_EXP(2, 2);
11073 reginsert(pRExC_state, CURLY, ret, depth+1);
11074 Set_Node_Offset(ret, parse_start+1); /* MJD */
11075 Set_Node_Cur_Length(ret, parse_start);
11078 regnode * const w = reg_node(pRExC_state, WHILEM);
11081 REGTAIL(pRExC_state, ret, w);
11082 if (!SIZE_ONLY && RExC_extralen) {
11083 reginsert(pRExC_state, LONGJMP,ret, depth+1);
11084 reginsert(pRExC_state, NOTHING,ret, depth+1);
11085 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
11087 reginsert(pRExC_state, CURLYX,ret, depth+1);
11089 Set_Node_Offset(ret, parse_start+1);
11090 Set_Node_Length(ret,
11091 op == '{' ? (RExC_parse - parse_start) : 1);
11093 if (!SIZE_ONLY && RExC_extralen)
11094 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
11095 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
11097 RExC_whilem_seen++, RExC_extralen += 3;
11098 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
11105 *flagp |= HASWIDTH;
11107 ARG1_SET(ret, (U16)min);
11108 ARG2_SET(ret, (U16)max);
11110 if (max == REG_INFTY)
11111 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
11117 if (!ISMULT1(op)) {
11122 #if 0 /* Now runtime fix should be reliable. */
11124 /* if this is reinstated, don't forget to put this back into perldiag:
11126 =item Regexp *+ operand could be empty at {#} in regex m/%s/
11128 (F) The part of the regexp subject to either the * or + quantifier
11129 could match an empty string. The {#} shows in the regular
11130 expression about where the problem was discovered.
11134 if (!(flags&HASWIDTH) && op != '?')
11135 vFAIL("Regexp *+ operand could be empty");
11138 #ifdef RE_TRACK_PATTERN_OFFSETS
11139 parse_start = RExC_parse;
11141 nextchar(pRExC_state);
11143 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
11149 else if (op == '+') {
11153 else if (op == '?') {
11158 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
11159 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
11160 ckWARN2reg(RExC_parse,
11161 "%"UTF8f" matches null string many times",
11162 UTF8fARG(UTF, (RExC_parse >= origparse
11163 ? RExC_parse - origparse
11166 (void)ReREFCNT_inc(RExC_rx_sv);
11169 if (RExC_parse < RExC_end && *RExC_parse == '?') {
11170 nextchar(pRExC_state);
11171 reginsert(pRExC_state, MINMOD, ret, depth+1);
11172 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11175 if (RExC_parse < RExC_end && *RExC_parse == '+') {
11177 nextchar(pRExC_state);
11178 ender = reg_node(pRExC_state, SUCCEED);
11179 REGTAIL(pRExC_state, ret, ender);
11180 reginsert(pRExC_state, SUSPEND, ret, depth+1);
11182 ender = reg_node(pRExC_state, TAIL);
11183 REGTAIL(pRExC_state, ret, ender);
11186 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11188 vFAIL("Nested quantifiers");
11195 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11203 /* This routine teases apart the various meanings of \N and returns
11204 * accordingly. The input parameters constrain which meaning(s) is/are valid
11205 * in the current context.
11207 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11209 * If <code_point_p> is not NULL, the context is expecting the result to be a
11210 * single code point. If this \N instance turns out to a single code point,
11211 * the function returns TRUE and sets *code_point_p to that code point.
11213 * If <node_p> is not NULL, the context is expecting the result to be one of
11214 * the things representable by a regnode. If this \N instance turns out to be
11215 * one such, the function generates the regnode, returns TRUE and sets *node_p
11216 * to point to that regnode.
11218 * If this instance of \N isn't legal in any context, this function will
11219 * generate a fatal error and not return.
11221 * On input, RExC_parse should point to the first char following the \N at the
11222 * time of the call. On successful return, RExC_parse will have been updated
11223 * to point to just after the sequence identified by this routine. Also
11224 * *flagp has been updated as needed.
11226 * When there is some problem with the current context and this \N instance,
11227 * the function returns FALSE, without advancing RExC_parse, nor setting
11228 * *node_p, nor *code_point_p, nor *flagp.
11230 * If <cp_count> is not NULL, the caller wants to know the length (in code
11231 * points) that this \N sequence matches. This is set even if the function
11232 * returns FALSE, as detailed below.
11234 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11236 * Probably the most common case is for the \N to specify a single code point.
11237 * *cp_count will be set to 1, and *code_point_p will be set to that code
11240 * Another possibility is for the input to be an empty \N{}, which for
11241 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
11242 * will be set to a generated NOTHING node.
11244 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11245 * set to 0. *node_p will be set to a generated REG_ANY node.
11247 * The fourth possibility is that \N resolves to a sequence of more than one
11248 * code points. *cp_count will be set to the number of code points in the
11249 * sequence. *node_p * will be set to a generated node returned by this
11250 * function calling S_reg().
11252 * The final possibility is that it is premature to be calling this function;
11253 * that pass1 needs to be restarted. This can happen when this changes from
11254 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
11255 * latter occurs only when the fourth possibility would otherwise be in
11256 * effect, and is because one of those code points requires the pattern to be
11257 * recompiled as UTF-8. The function returns FALSE, and sets the
11258 * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
11259 * happens, the caller needs to desist from continuing parsing, and return
11260 * this information to its caller. This is not set for when there is only one
11261 * code point, as this can be called as part of an ANYOF node, and they can
11262 * store above-Latin1 code points without the pattern having to be in UTF-8.
11264 * For non-single-quoted regexes, the tokenizer has resolved character and
11265 * sequence names inside \N{...} into their Unicode values, normalizing the
11266 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11267 * hex-represented code points in the sequence. This is done there because
11268 * the names can vary based on what charnames pragma is in scope at the time,
11269 * so we need a way to take a snapshot of what they resolve to at the time of
11270 * the original parse. [perl #56444].
11272 * That parsing is skipped for single-quoted regexes, so we may here get
11273 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
11274 * parser. But if the single-quoted regex is something like '\N{U+41}', that
11275 * is legal and handled here. The code point is Unicode, and has to be
11276 * translated into the native character set for non-ASCII platforms.
11279 char * endbrace; /* points to '}' following the name */
11280 char *endchar; /* Points to '.' or '}' ending cur char in the input
11282 char* p = RExC_parse; /* Temporary */
11284 GET_RE_DEBUG_FLAGS_DECL;
11286 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11288 GET_RE_DEBUG_FLAGS;
11290 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
11291 assert(! (node_p && cp_count)); /* At most 1 should be set */
11293 if (cp_count) { /* Initialize return for the most common case */
11297 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11298 * modifier. The other meanings do not, so use a temporary until we find
11299 * out which we are being called with */
11300 skip_to_be_ignored_text(pRExC_state, &p,
11301 FALSE /* Don't force to /x */ );
11303 /* Disambiguate between \N meaning a named character versus \N meaning
11304 * [^\n]. The latter is assumed when the {...} following the \N is a legal
11305 * quantifier, or there is no '{' at all */
11306 if (*p != '{' || regcurly(p)) {
11316 *node_p = reg_node(pRExC_state, REG_ANY);
11317 *flagp |= HASWIDTH|SIMPLE;
11319 Set_Node_Length(*node_p, 1); /* MJD */
11323 /* Here, we have decided it should be a named character or sequence */
11325 /* The test above made sure that the next real character is a '{', but
11326 * under the /x modifier, it could be separated by space (or a comment and
11327 * \n) and this is not allowed (for consistency with \x{...} and the
11328 * tokenizer handling of \N{NAME}). */
11329 if (*RExC_parse != '{') {
11330 vFAIL("Missing braces on \\N{}");
11333 RExC_parse++; /* Skip past the '{' */
11335 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11336 || ! (endbrace == RExC_parse /* nothing between the {} */
11337 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11338 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11341 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11342 vFAIL("\\N{NAME} must be resolved by the lexer");
11345 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
11348 if (endbrace == RExC_parse) { /* empty: \N{} */
11352 nextchar(pRExC_state);
11357 *node_p = reg_node(pRExC_state,NOTHING);
11361 RExC_parse += 2; /* Skip past the 'U+' */
11363 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11365 /* Code points are separated by dots. If none, there is only one code
11366 * point, and is terminated by the brace */
11368 if (endchar >= endbrace) {
11369 STRLEN length_of_hex;
11370 I32 grok_hex_flags;
11372 /* Here, exactly one code point. If that isn't what is wanted, fail */
11373 if (! code_point_p) {
11378 /* Convert code point from hex */
11379 length_of_hex = (STRLEN)(endchar - RExC_parse);
11380 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11381 | PERL_SCAN_DISALLOW_PREFIX
11383 /* No errors in the first pass (See [perl
11384 * #122671].) We let the code below find the
11385 * errors when there are multiple chars. */
11387 ? PERL_SCAN_SILENT_ILLDIGIT
11390 /* This routine is the one place where both single- and double-quotish
11391 * \N{U+xxxx} are evaluated. The value is a Unicode code point which
11392 * must be converted to native. */
11393 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11398 /* The tokenizer should have guaranteed validity, but it's possible to
11399 * bypass it by using single quoting, so check. Don't do the check
11400 * here when there are multiple chars; we do it below anyway. */
11401 if (length_of_hex == 0
11402 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11404 RExC_parse += length_of_hex; /* Includes all the valid */
11405 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11406 ? UTF8SKIP(RExC_parse)
11408 /* Guard against malformed utf8 */
11409 if (RExC_parse >= endchar) {
11410 RExC_parse = endchar;
11412 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11415 RExC_parse = endbrace + 1;
11418 else { /* Is a multiple character sequence */
11419 SV * substitute_parse;
11421 char *orig_end = RExC_end;
11422 char *save_start = RExC_start;
11425 /* Count the code points, if desired, in the sequence */
11428 while (RExC_parse < endbrace) {
11429 /* Point to the beginning of the next character in the sequence. */
11430 RExC_parse = endchar + 1;
11431 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11436 /* Fail if caller doesn't want to handle a multi-code-point sequence.
11437 * But don't backup up the pointer if the caller want to know how many
11438 * code points there are (they can then handle things) */
11446 /* What is done here is to convert this to a sub-pattern of the form
11447 * \x{char1}\x{char2}... and then call reg recursively to parse it
11448 * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
11449 * while not having to worry about special handling that some code
11450 * points may have. */
11452 substitute_parse = newSVpvs("?:");
11454 while (RExC_parse < endbrace) {
11456 /* Convert to notation the rest of the code understands */
11457 sv_catpv(substitute_parse, "\\x{");
11458 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11459 sv_catpv(substitute_parse, "}");
11461 /* Point to the beginning of the next character in the sequence. */
11462 RExC_parse = endchar + 1;
11463 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11466 sv_catpv(substitute_parse, ")");
11468 RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
11471 /* Don't allow empty number */
11472 if (len < (STRLEN) 8) {
11473 RExC_parse = endbrace;
11474 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11476 RExC_end = RExC_parse + len;
11478 /* The values are Unicode, and therefore not subject to recoding, but
11479 * have to be converted to native on a non-Unicode (meaning non-ASCII)
11481 RExC_override_recoding = 1;
11483 RExC_recode_x_to_native = 1;
11487 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11488 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11489 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11492 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11495 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11498 /* Restore the saved values */
11499 RExC_start = RExC_adjusted_start = save_start;
11500 RExC_parse = endbrace;
11501 RExC_end = orig_end;
11502 RExC_override_recoding = 0;
11504 RExC_recode_x_to_native = 0;
11507 SvREFCNT_dec_NN(substitute_parse);
11508 nextchar(pRExC_state);
11518 * It returns the code point in utf8 for the value in *encp.
11519 * value: a code value in the source encoding
11520 * encp: a pointer to an Encode object
11522 * If the result from Encode is not a single character,
11523 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11526 S_reg_recode(pTHX_ const U8 value, SV **encp)
11529 SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
11530 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11531 const STRLEN newlen = SvCUR(sv);
11532 UV uv = UNICODE_REPLACEMENT;
11534 PERL_ARGS_ASSERT_REG_RECODE;
11538 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11541 if (!newlen || numlen != newlen) {
11542 uv = UNICODE_REPLACEMENT;
11548 PERL_STATIC_INLINE U8
11549 S_compute_EXACTish(RExC_state_t *pRExC_state)
11553 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11561 op = get_regex_charset(RExC_flags);
11562 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11563 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11564 been, so there is no hole */
11567 return op + EXACTF;
11570 PERL_STATIC_INLINE void
11571 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11572 regnode *node, I32* flagp, STRLEN len, UV code_point,
11575 /* This knows the details about sizing an EXACTish node, setting flags for
11576 * it (by setting <*flagp>, and potentially populating it with a single
11579 * If <len> (the length in bytes) is non-zero, this function assumes that
11580 * the node has already been populated, and just does the sizing. In this
11581 * case <code_point> should be the final code point that has already been
11582 * placed into the node. This value will be ignored except that under some
11583 * circumstances <*flagp> is set based on it.
11585 * If <len> is zero, the function assumes that the node is to contain only
11586 * the single character given by <code_point> and calculates what <len>
11587 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11588 * additionally will populate the node's STRING with <code_point> or its
11591 * In both cases <*flagp> is appropriately set
11593 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11594 * 255, must be folded (the former only when the rules indicate it can
11597 * When it does the populating, it looks at the flag 'downgradable'. If
11598 * true with a node that folds, it checks if the single code point
11599 * participates in a fold, and if not downgrades the node to an EXACT.
11600 * This helps the optimizer */
11602 bool len_passed_in = cBOOL(len != 0);
11603 U8 character[UTF8_MAXBYTES_CASE+1];
11605 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11607 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11608 * sizing difference, and is extra work that is thrown away */
11609 if (downgradable && ! PASS2) {
11610 downgradable = FALSE;
11613 if (! len_passed_in) {
11615 if (UVCHR_IS_INVARIANT(code_point)) {
11616 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11617 *character = (U8) code_point;
11619 else { /* Here is /i and not /l. (toFOLD() is defined on just
11620 ASCII, which isn't the same thing as INVARIANT on
11621 EBCDIC, but it works there, as the extra invariants
11622 fold to themselves) */
11623 *character = toFOLD((U8) code_point);
11625 /* We can downgrade to an EXACT node if this character
11626 * isn't a folding one. Note that this assumes that
11627 * nothing above Latin1 folds to some other invariant than
11628 * one of these alphabetics; otherwise we would also have
11630 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11631 * || ASCII_FOLD_RESTRICTED))
11633 if (downgradable && PL_fold[code_point] == code_point) {
11639 else if (FOLD && (! LOC
11640 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11641 { /* Folding, and ok to do so now */
11642 UV folded = _to_uni_fold_flags(
11646 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11647 ? FOLD_FLAGS_NOMIX_ASCII
11650 && folded == code_point /* This quickly rules out many
11651 cases, avoiding the
11652 _invlist_contains_cp() overhead
11654 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11661 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11663 /* Not folding this cp, and can output it directly */
11664 *character = UTF8_TWO_BYTE_HI(code_point);
11665 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11669 uvchr_to_utf8( character, code_point);
11670 len = UTF8SKIP(character);
11672 } /* Else pattern isn't UTF8. */
11674 *character = (U8) code_point;
11676 } /* Else is folded non-UTF8 */
11677 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11678 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11679 || UNICODE_DOT_DOT_VERSION > 0)
11680 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11684 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11685 * comments at join_exact()); */
11686 *character = (U8) code_point;
11689 /* Can turn into an EXACT node if we know the fold at compile time,
11690 * and it folds to itself and doesn't particpate in other folds */
11693 && PL_fold_latin1[code_point] == code_point
11694 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11695 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11699 } /* else is Sharp s. May need to fold it */
11700 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11702 *(character + 1) = 's';
11706 *character = LATIN_SMALL_LETTER_SHARP_S;
11712 RExC_size += STR_SZ(len);
11715 RExC_emit += STR_SZ(len);
11716 STR_LEN(node) = len;
11717 if (! len_passed_in) {
11718 Copy((char *) character, STRING(node), len, char);
11722 *flagp |= HASWIDTH;
11724 /* A single character node is SIMPLE, except for the special-cased SHARP S
11726 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
11727 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
11728 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
11729 || UNICODE_DOT_DOT_VERSION > 0)
11730 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
11731 || ! FOLD || ! DEPENDS_SEMANTICS)
11737 /* The OP may not be well defined in PASS1 */
11738 if (PASS2 && OP(node) == EXACTFL) {
11739 RExC_contains_locale = 1;
11744 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11745 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11748 S_backref_value(char *p)
11750 const char* endptr;
11752 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11759 - regatom - the lowest level
11761 Try to identify anything special at the start of the pattern. If there
11762 is, then handle it as required. This may involve generating a single regop,
11763 such as for an assertion; or it may involve recursing, such as to
11764 handle a () structure.
11766 If the string doesn't start with something special then we gobble up
11767 as much literal text as we can.
11769 Once we have been able to handle whatever type of thing started the
11770 sequence, we return.
11772 Note: we have to be careful with escapes, as they can be both literal
11773 and special, and in the case of \10 and friends, context determines which.
11775 A summary of the code structure is:
11777 switch (first_byte) {
11778 cases for each special:
11779 handle this special;
11782 switch (2nd byte) {
11783 cases for each unambiguous special:
11784 handle this special;
11786 cases for each ambigous special/literal:
11788 if (special) handle here
11790 default: // unambiguously literal:
11793 default: // is a literal char
11796 create EXACTish node for literal;
11797 while (more input and node isn't full) {
11798 switch (input_byte) {
11799 cases for each special;
11800 make sure parse pointer is set so that the next call to
11801 regatom will see this special first
11802 goto loopdone; // EXACTish node terminated by prev. char
11804 append char to EXACTISH node;
11806 get next input byte;
11810 return the generated node;
11812 Specifically there are two separate switches for handling
11813 escape sequences, with the one for handling literal escapes requiring
11814 a dummy entry for all of the special escapes that are actually handled
11817 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11819 Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
11820 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
11821 Otherwise does not return NULL.
11825 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11827 regnode *ret = NULL;
11834 GET_RE_DEBUG_FLAGS_DECL;
11836 *flagp = WORST; /* Tentatively. */
11838 DEBUG_PARSE("atom");
11840 PERL_ARGS_ASSERT_REGATOM;
11843 parse_start = RExC_parse;
11844 switch ((U8)*RExC_parse) {
11846 RExC_seen_zerolen++;
11847 nextchar(pRExC_state);
11848 if (RExC_flags & RXf_PMf_MULTILINE)
11849 ret = reg_node(pRExC_state, MBOL);
11851 ret = reg_node(pRExC_state, SBOL);
11852 Set_Node_Length(ret, 1); /* MJD */
11855 nextchar(pRExC_state);
11857 RExC_seen_zerolen++;
11858 if (RExC_flags & RXf_PMf_MULTILINE)
11859 ret = reg_node(pRExC_state, MEOL);
11861 ret = reg_node(pRExC_state, SEOL);
11862 Set_Node_Length(ret, 1); /* MJD */
11865 nextchar(pRExC_state);
11866 if (RExC_flags & RXf_PMf_SINGLELINE)
11867 ret = reg_node(pRExC_state, SANY);
11869 ret = reg_node(pRExC_state, REG_ANY);
11870 *flagp |= HASWIDTH|SIMPLE;
11872 Set_Node_Length(ret, 1); /* MJD */
11876 char * const oregcomp_parse = ++RExC_parse;
11877 ret = regclass(pRExC_state, flagp,depth+1,
11878 FALSE, /* means parse the whole char class */
11879 TRUE, /* allow multi-char folds */
11880 FALSE, /* don't silence non-portable warnings. */
11881 (bool) RExC_strict,
11882 TRUE, /* Allow an optimized regnode result */
11885 if (*flagp & (RESTART_PASS1|NEED_UTF8))
11887 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11890 if (*RExC_parse != ']') {
11891 RExC_parse = oregcomp_parse;
11892 vFAIL("Unmatched [");
11894 nextchar(pRExC_state);
11895 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11899 nextchar(pRExC_state);
11900 ret = reg(pRExC_state, 2, &flags,depth+1);
11902 if (flags & TRYAGAIN) {
11903 if (RExC_parse == RExC_end) {
11904 /* Make parent create an empty node if needed. */
11905 *flagp |= TRYAGAIN;
11910 if (flags & (RESTART_PASS1|NEED_UTF8)) {
11911 *flagp = flags & (RESTART_PASS1|NEED_UTF8);
11914 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11917 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11921 if (flags & TRYAGAIN) {
11922 *flagp |= TRYAGAIN;
11925 vFAIL("Internal urp");
11926 /* Supposed to be caught earlier. */
11932 vFAIL("Quantifier follows nothing");
11937 This switch handles escape sequences that resolve to some kind
11938 of special regop and not to literal text. Escape sequnces that
11939 resolve to literal text are handled below in the switch marked
11942 Every entry in this switch *must* have a corresponding entry
11943 in the literal escape switch. However, the opposite is not
11944 required, as the default for this switch is to jump to the
11945 literal text handling code.
11947 switch ((U8)*++RExC_parse) {
11948 /* Special Escapes */
11950 RExC_seen_zerolen++;
11951 ret = reg_node(pRExC_state, SBOL);
11952 /* SBOL is shared with /^/ so we set the flags so we can tell
11953 * /\A/ from /^/ in split. We check ret because first pass we
11954 * have no regop struct to set the flags on. */
11958 goto finish_meta_pat;
11960 ret = reg_node(pRExC_state, GPOS);
11961 RExC_seen |= REG_GPOS_SEEN;
11963 goto finish_meta_pat;
11965 RExC_seen_zerolen++;
11966 ret = reg_node(pRExC_state, KEEPS);
11968 /* XXX:dmq : disabling in-place substitution seems to
11969 * be necessary here to avoid cases of memory corruption, as
11970 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11972 RExC_seen |= REG_LOOKBEHIND_SEEN;
11973 goto finish_meta_pat;
11975 ret = reg_node(pRExC_state, SEOL);
11977 RExC_seen_zerolen++; /* Do not optimize RE away */
11978 goto finish_meta_pat;
11980 ret = reg_node(pRExC_state, EOS);
11982 RExC_seen_zerolen++; /* Do not optimize RE away */
11983 goto finish_meta_pat;
11985 vFAIL("\\C no longer supported");
11987 ret = reg_node(pRExC_state, CLUMP);
11988 *flagp |= HASWIDTH;
11989 goto finish_meta_pat;
11995 arg = ANYOF_WORDCHAR;
12003 regex_charset charset = get_regex_charset(RExC_flags);
12005 RExC_seen_zerolen++;
12006 RExC_seen |= REG_LOOKBEHIND_SEEN;
12007 op = BOUND + charset;
12009 if (op == BOUNDL) {
12010 RExC_contains_locale = 1;
12013 ret = reg_node(pRExC_state, op);
12015 if (*(RExC_parse + 1) != '{') {
12016 FLAGS(ret) = TRADITIONAL_BOUND;
12017 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
12023 char name = *RExC_parse;
12026 endbrace = strchr(RExC_parse, '}');
12029 vFAIL2("Missing right brace on \\%c{}", name);
12031 /* XXX Need to decide whether to take spaces or not. Should be
12032 * consistent with \p{}, but that currently is SPACE, which
12033 * means vertical too, which seems wrong
12034 * while (isBLANK(*RExC_parse)) {
12037 if (endbrace == RExC_parse) {
12038 RExC_parse++; /* After the '}' */
12039 vFAIL2("Empty \\%c{}", name);
12041 length = endbrace - RExC_parse;
12042 /*while (isBLANK(*(RExC_parse + length - 1))) {
12045 switch (*RExC_parse) {
12048 && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
12050 goto bad_bound_type;
12052 FLAGS(ret) = GCB_BOUND;
12055 if (length != 2 || *(RExC_parse + 1) != 'b') {
12056 goto bad_bound_type;
12058 FLAGS(ret) = LB_BOUND;
12061 if (length != 2 || *(RExC_parse + 1) != 'b') {
12062 goto bad_bound_type;
12064 FLAGS(ret) = SB_BOUND;
12067 if (length != 2 || *(RExC_parse + 1) != 'b') {
12068 goto bad_bound_type;
12070 FLAGS(ret) = WB_BOUND;
12074 RExC_parse = endbrace;
12076 "'%"UTF8f"' is an unknown bound type",
12077 UTF8fARG(UTF, length, endbrace - length));
12078 NOT_REACHED; /*NOTREACHED*/
12080 RExC_parse = endbrace;
12081 REQUIRE_UNI_RULES(flagp, NULL);
12083 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
12087 /* Don't have to worry about UTF-8, in this message because
12088 * to get here the contents of the \b must be ASCII */
12089 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
12090 "Using /u for '%.*s' instead of /%s",
12092 endbrace - length + 1,
12093 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
12094 ? ASCII_RESTRICT_PAT_MODS
12095 : ASCII_MORE_RESTRICT_PAT_MODS);
12099 if (PASS2 && invert) {
12100 OP(ret) += NBOUND - BOUND;
12102 goto finish_meta_pat;
12110 if (! DEPENDS_SEMANTICS) {
12114 /* \d doesn't have any matches in the upper Latin1 range, hence /d
12115 * is equivalent to /u. Changing to /u saves some branches at
12118 goto join_posix_op_known;
12121 ret = reg_node(pRExC_state, LNBREAK);
12122 *flagp |= HASWIDTH|SIMPLE;
12123 goto finish_meta_pat;
12131 goto join_posix_op_known;
12137 arg = ANYOF_VERTWS;
12139 goto join_posix_op_known;
12149 op = POSIXD + get_regex_charset(RExC_flags);
12150 if (op > POSIXA) { /* /aa is same as /a */
12153 else if (op == POSIXL) {
12154 RExC_contains_locale = 1;
12157 join_posix_op_known:
12160 op += NPOSIXD - POSIXD;
12163 ret = reg_node(pRExC_state, op);
12165 FLAGS(ret) = namedclass_to_classnum(arg);
12168 *flagp |= HASWIDTH|SIMPLE;
12172 nextchar(pRExC_state);
12173 Set_Node_Length(ret, 2); /* MJD */
12179 ret = regclass(pRExC_state, flagp,depth+1,
12180 TRUE, /* means just parse this element */
12181 FALSE, /* don't allow multi-char folds */
12182 FALSE, /* don't silence non-portable warnings. It
12183 would be a bug if these returned
12185 (bool) RExC_strict,
12186 TRUE, /* Allow an optimized regnode result */
12188 if (*flagp & RESTART_PASS1)
12190 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
12191 * multi-char folds are allowed. */
12193 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12198 Set_Node_Offset(ret, parse_start);
12199 Set_Node_Cur_Length(ret, parse_start - 2);
12200 nextchar(pRExC_state);
12203 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12204 * \N{...} evaluates to a sequence of more than one code points).
12205 * The function call below returns a regnode, which is our result.
12206 * The parameters cause it to fail if the \N{} evaluates to a
12207 * single code point; we handle those like any other literal. The
12208 * reason that the multicharacter case is handled here and not as
12209 * part of the EXACtish code is because of quantifiers. In
12210 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12211 * this way makes that Just Happen. dmq.
12212 * join_exact() will join this up with adjacent EXACTish nodes
12213 * later on, if appropriate. */
12215 if (grok_bslash_N(pRExC_state,
12216 &ret, /* Want a regnode returned */
12217 NULL, /* Fail if evaluates to a single code
12219 NULL, /* Don't need a count of how many code
12227 if (*flagp & RESTART_PASS1)
12230 /* Here, evaluates to a single code point. Go get that */
12231 RExC_parse = parse_start;
12234 case 'k': /* Handle \k<NAME> and \k'NAME' */
12237 char ch= RExC_parse[1];
12238 if (ch != '<' && ch != '\'' && ch != '{') {
12240 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12241 vFAIL2("Sequence %.2s... not terminated",parse_start);
12243 /* this pretty much dupes the code for (?P=...) in reg(), if
12244 you change this make sure you change that */
12245 char* name_start = (RExC_parse += 2);
12247 SV *sv_dat = reg_scan_name(pRExC_state,
12248 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12249 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12250 if (RExC_parse == name_start || *RExC_parse != ch)
12251 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12252 vFAIL2("Sequence %.3s... not terminated",parse_start);
12255 num = add_data( pRExC_state, STR_WITH_LEN("S"));
12256 RExC_rxi->data->data[num]=(void*)sv_dat;
12257 SvREFCNT_inc_simple_void(sv_dat);
12261 ret = reganode(pRExC_state,
12264 : (ASCII_FOLD_RESTRICTED)
12266 : (AT_LEAST_UNI_SEMANTICS)
12272 *flagp |= HASWIDTH;
12274 /* override incorrect value set in reganode MJD */
12275 Set_Node_Offset(ret, parse_start+1);
12276 Set_Node_Cur_Length(ret, parse_start);
12277 nextchar(pRExC_state);
12283 case '1': case '2': case '3': case '4':
12284 case '5': case '6': case '7': case '8': case '9':
12289 if (*RExC_parse == 'g') {
12293 if (*RExC_parse == '{') {
12297 if (*RExC_parse == '-') {
12301 if (hasbrace && !isDIGIT(*RExC_parse)) {
12302 if (isrel) RExC_parse--;
12304 goto parse_named_seq;
12307 num = S_backref_value(RExC_parse);
12309 vFAIL("Reference to invalid group 0");
12310 else if (num == I32_MAX) {
12311 if (isDIGIT(*RExC_parse))
12312 vFAIL("Reference to nonexistent group");
12314 vFAIL("Unterminated \\g... pattern");
12318 num = RExC_npar - num;
12320 vFAIL("Reference to nonexistent or unclosed group");
12324 num = S_backref_value(RExC_parse);
12325 /* bare \NNN might be backref or octal - if it is larger
12326 * than or equal RExC_npar then it is assumed to be an
12327 * octal escape. Note RExC_npar is +1 from the actual
12328 * number of parens. */
12329 /* Note we do NOT check if num == I32_MAX here, as that is
12330 * handled by the RExC_npar check */
12333 /* any numeric escape < 10 is always a backref */
12335 /* any numeric escape < RExC_npar is a backref */
12336 && num >= RExC_npar
12337 /* cannot be an octal escape if it starts with 8 */
12338 && *RExC_parse != '8'
12339 /* cannot be an octal escape it it starts with 9 */
12340 && *RExC_parse != '9'
12343 /* Probably not a backref, instead likely to be an
12344 * octal character escape, e.g. \35 or \777.
12345 * The above logic should make it obvious why using
12346 * octal escapes in patterns is problematic. - Yves */
12347 RExC_parse = parse_start;
12352 /* At this point RExC_parse points at a numeric escape like
12353 * \12 or \88 or something similar, which we should NOT treat
12354 * as an octal escape. It may or may not be a valid backref
12355 * escape. For instance \88888888 is unlikely to be a valid
12357 while (isDIGIT(*RExC_parse))
12360 if (*RExC_parse != '}')
12361 vFAIL("Unterminated \\g{...} pattern");
12365 if (num > (I32)RExC_rx->nparens)
12366 vFAIL("Reference to nonexistent group");
12369 ret = reganode(pRExC_state,
12372 : (ASCII_FOLD_RESTRICTED)
12374 : (AT_LEAST_UNI_SEMANTICS)
12380 *flagp |= HASWIDTH;
12382 /* override incorrect value set in reganode MJD */
12383 Set_Node_Offset(ret, parse_start);
12384 Set_Node_Cur_Length(ret, parse_start-1);
12385 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12386 FALSE /* Don't force to /x */ );
12390 if (RExC_parse >= RExC_end)
12391 FAIL("Trailing \\");
12394 /* Do not generate "unrecognized" warnings here, we fall
12395 back into the quick-grab loop below */
12396 RExC_parse = parse_start;
12398 } /* end of switch on a \foo sequence */
12403 /* '#' comments should have been spaced over before this function was
12405 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
12407 if (RExC_flags & RXf_PMf_EXTENDED) {
12408 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12409 if (RExC_parse < RExC_end)
12419 /* Here, we have determined that the next thing is probably a
12420 * literal character. RExC_parse points to the first byte of its
12421 * definition. (It still may be an escape sequence that evaluates
12422 * to a single character) */
12428 #define MAX_NODE_STRING_SIZE 127
12429 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12431 U8 upper_parse = MAX_NODE_STRING_SIZE;
12432 U8 node_type = compute_EXACTish(pRExC_state);
12433 bool next_is_quantifier;
12434 char * oldp = NULL;
12436 /* We can convert EXACTF nodes to EXACTFU if they contain only
12437 * characters that match identically regardless of the target
12438 * string's UTF8ness. The reason to do this is that EXACTF is not
12439 * trie-able, EXACTFU is.
12441 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
12442 * contain only above-Latin1 characters (hence must be in UTF8),
12443 * which don't participate in folds with Latin1-range characters,
12444 * as the latter's folds aren't known until runtime. (We don't
12445 * need to figure this out until pass 2) */
12446 bool maybe_exactfu = PASS2
12447 && (node_type == EXACTF || node_type == EXACTFL);
12449 /* If a folding node contains only code points that don't
12450 * participate in folds, it can be changed into an EXACT node,
12451 * which allows the optimizer more things to look for */
12454 ret = reg_node(pRExC_state, node_type);
12456 /* In pass1, folded, we use a temporary buffer instead of the
12457 * actual node, as the node doesn't exist yet */
12458 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12464 /* We look for the EXACTFish to EXACT node optimizaton only if
12465 * folding. (And we don't need to figure this out until pass 2).
12466 * XXX It might actually make sense to split the node into portions
12467 * that are exact and ones that aren't, so that we could later use
12468 * the exact ones to find the longest fixed and floating strings.
12469 * One would want to join them back into a larger node. One could
12470 * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
12471 maybe_exact = FOLD && PASS2;
12473 /* XXX The node can hold up to 255 bytes, yet this only goes to
12474 * 127. I (khw) do not know why. Keeping it somewhat less than
12475 * 255 allows us to not have to worry about overflow due to
12476 * converting to utf8 and fold expansion, but that value is
12477 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12478 * split up by this limit into a single one using the real max of
12479 * 255. Even at 127, this breaks under rare circumstances. If
12480 * folding, we do not want to split a node at a character that is a
12481 * non-final in a multi-char fold, as an input string could just
12482 * happen to want to match across the node boundary. The join
12483 * would solve that problem if the join actually happens. But a
12484 * series of more than two nodes in a row each of 127 would cause
12485 * the first join to succeed to get to 254, but then there wouldn't
12486 * be room for the next one, which could at be one of those split
12487 * multi-char folds. I don't know of any fool-proof solution. One
12488 * could back off to end with only a code point that isn't such a
12489 * non-final, but it is possible for there not to be any in the
12492 assert( ! UTF /* Is at the beginning of a character */
12493 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
12494 || UTF8_IS_START(UCHARAT(RExC_parse)));
12496 for (p = RExC_parse;
12497 len < upper_parse && p < RExC_end;
12502 /* White space has already been ignored */
12503 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
12504 || ! is_PATWS_safe((p), RExC_end, UTF));
12516 /* Literal Escapes Switch
12518 This switch is meant to handle escape sequences that
12519 resolve to a literal character.
12521 Every escape sequence that represents something
12522 else, like an assertion or a char class, is handled
12523 in the switch marked 'Special Escapes' above in this
12524 routine, but also has an entry here as anything that
12525 isn't explicitly mentioned here will be treated as
12526 an unescaped equivalent literal.
12529 switch ((U8)*++p) {
12530 /* These are all the special escapes. */
12531 case 'A': /* Start assertion */
12532 case 'b': case 'B': /* Word-boundary assertion*/
12533 case 'C': /* Single char !DANGEROUS! */
12534 case 'd': case 'D': /* digit class */
12535 case 'g': case 'G': /* generic-backref, pos assertion */
12536 case 'h': case 'H': /* HORIZWS */
12537 case 'k': case 'K': /* named backref, keep marker */
12538 case 'p': case 'P': /* Unicode property */
12539 case 'R': /* LNBREAK */
12540 case 's': case 'S': /* space class */
12541 case 'v': case 'V': /* VERTWS */
12542 case 'w': case 'W': /* word class */
12543 case 'X': /* eXtended Unicode "combining
12544 character sequence" */
12545 case 'z': case 'Z': /* End of line/string assertion */
12549 /* Anything after here is an escape that resolves to a
12550 literal. (Except digits, which may or may not)
12556 case 'N': /* Handle a single-code point named character. */
12557 RExC_parse = p + 1;
12558 if (! grok_bslash_N(pRExC_state,
12559 NULL, /* Fail if evaluates to
12560 anything other than a
12561 single code point */
12562 &ender, /* The returned single code
12564 NULL, /* Don't need a count of
12565 how many code points */
12569 if (*flagp & NEED_UTF8)
12570 FAIL("panic: grok_bslash_N set NEED_UTF8");
12571 if (*flagp & RESTART_PASS1)
12574 /* Here, it wasn't a single code point. Go close
12575 * up this EXACTish node. The switch() prior to
12576 * this switch handles the other cases */
12577 RExC_parse = p = oldp;
12581 if (ender > 0xff) {
12582 REQUIRE_UTF8(flagp);
12598 ender = ESC_NATIVE;
12608 const char* error_msg;
12610 bool valid = grok_bslash_o(&p,
12613 PASS2, /* out warnings */
12614 (bool) RExC_strict,
12615 TRUE, /* Output warnings
12620 RExC_parse = p; /* going to die anyway; point
12621 to exact spot of failure */
12625 if (IN_ENCODING && ender < 0x100) {
12626 goto recode_encoding;
12628 if (ender > 0xff) {
12629 REQUIRE_UTF8(flagp);
12635 UV result = UV_MAX; /* initialize to erroneous
12637 const char* error_msg;
12639 bool valid = grok_bslash_x(&p,
12642 PASS2, /* out warnings */
12643 (bool) RExC_strict,
12644 TRUE, /* Silence warnings
12649 RExC_parse = p; /* going to die anyway; point
12650 to exact spot of failure */
12655 if (ender < 0x100) {
12657 if (RExC_recode_x_to_native) {
12658 ender = LATIN1_TO_NATIVE(ender);
12663 goto recode_encoding;
12667 REQUIRE_UTF8(flagp);
12673 ender = grok_bslash_c(*p++, PASS2);
12675 case '8': case '9': /* must be a backreference */
12677 /* we have an escape like \8 which cannot be an octal escape
12678 * so we exit the loop, and let the outer loop handle this
12679 * escape which may or may not be a legitimate backref. */
12681 case '1': case '2': case '3':case '4':
12682 case '5': case '6': case '7':
12683 /* When we parse backslash escapes there is ambiguity
12684 * between backreferences and octal escapes. Any escape
12685 * from \1 - \9 is a backreference, any multi-digit
12686 * escape which does not start with 0 and which when
12687 * evaluated as decimal could refer to an already
12688 * parsed capture buffer is a back reference. Anything
12691 * Note this implies that \118 could be interpreted as
12692 * 118 OR as "\11" . "8" depending on whether there
12693 * were 118 capture buffers defined already in the
12696 /* NOTE, RExC_npar is 1 more than the actual number of
12697 * parens we have seen so far, hence the < RExC_npar below. */
12699 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12700 { /* Not to be treated as an octal constant, go
12708 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12710 ender = grok_oct(p, &numlen, &flags, NULL);
12711 if (ender > 0xff) {
12712 REQUIRE_UTF8(flagp);
12715 if (PASS2 /* like \08, \178 */
12718 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12720 reg_warn_non_literal_string(
12722 form_short_octal_warning(p, numlen));
12725 if (IN_ENCODING && ender < 0x100)
12726 goto recode_encoding;
12729 if (! RExC_override_recoding) {
12730 SV* enc = _get_encoding();
12731 ender = reg_recode((U8)ender, &enc);
12733 ckWARNreg(p, "Invalid escape in the specified encoding");
12734 REQUIRE_UTF8(flagp);
12739 FAIL("Trailing \\");
12742 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12743 /* Include any left brace following the alpha to emphasize
12744 * that it could be part of an escape at some point
12746 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12747 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12749 goto normal_default;
12750 } /* End of switch on '\' */
12753 /* Currently we don't warn when the lbrace is at the start
12754 * of a construct. This catches it in the middle of a
12755 * literal string, or when it's the first thing after
12756 * something like "\b" */
12758 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12760 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12763 default: /* A literal character */
12765 if (! UTF8_IS_INVARIANT(*p) && UTF) {
12767 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12768 &numlen, UTF8_ALLOW_DEFAULT);
12774 } /* End of switch on the literal */
12776 /* Here, have looked at the literal character and <ender>
12777 * contains its ordinal, <p> points to the character after it.
12778 * We need to check if the next non-ignored thing is a
12779 * quantifier. Move <p> to after anything that should be
12780 * ignored, which, as a side effect, positions <p> for the next
12781 * loop iteration */
12782 skip_to_be_ignored_text(pRExC_state, &p,
12783 FALSE /* Don't force to /x */ );
12785 /* If the next thing is a quantifier, it applies to this
12786 * character only, which means that this character has to be in
12787 * its own node and can't just be appended to the string in an
12788 * existing node, so if there are already other characters in
12789 * the node, close the node with just them, and set up to do
12790 * this character again next time through, when it will be the
12791 * only thing in its new node */
12792 if ((next_is_quantifier = ( LIKELY(p < RExC_end)
12793 && UNLIKELY(ISMULT2(p))))
12800 /* Ready to add 'ender' to the node */
12802 if (! FOLD) { /* The simple case, just append the literal */
12804 /* In the sizing pass, we need only the size of the
12805 * character we are appending, hence we can delay getting
12806 * its representation until PASS2. */
12809 const STRLEN unilen = UVCHR_SKIP(ender);
12812 /* We have to subtract 1 just below (and again in
12813 * the corresponding PASS2 code) because the loop
12814 * increments <len> each time, as all but this path
12815 * (and one other) through it add a single byte to
12816 * the EXACTish node. But these paths would change
12817 * len to be the correct final value, so cancel out
12818 * the increment that follows */
12824 } else { /* PASS2 */
12827 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12828 len += (char *) new_s - s - 1;
12829 s = (char *) new_s;
12832 *(s++) = (char) ender;
12836 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12838 /* Here are folding under /l, and the code point is
12839 * problematic. First, we know we can't simplify things */
12840 maybe_exact = FALSE;
12841 maybe_exactfu = FALSE;
12843 /* A problematic code point in this context means that its
12844 * fold isn't known until runtime, so we can't fold it now.
12845 * (The non-problematic code points are the above-Latin1
12846 * ones that fold to also all above-Latin1. Their folds
12847 * don't vary no matter what the locale is.) But here we
12848 * have characters whose fold depends on the locale.
12849 * Unlike the non-folding case above, we have to keep track
12850 * of these in the sizing pass, so that we can make sure we
12851 * don't split too-long nodes in the middle of a potential
12852 * multi-char fold. And unlike the regular fold case
12853 * handled in the else clauses below, we don't actually
12854 * fold and don't have special cases to consider. What we
12855 * do for both passes is the PASS2 code for non-folding */
12856 goto not_fold_common;
12858 else /* A regular FOLD code point */
12860 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12861 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12862 || UNICODE_DOT_DOT_VERSION > 0)
12863 /* See comments for join_exact() as to why we fold
12864 * this non-UTF at compile time */
12865 || ( node_type == EXACTFU
12866 && ender == LATIN_SMALL_LETTER_SHARP_S)
12869 /* Here, are folding and are not UTF-8 encoded; therefore
12870 * the character must be in the range 0-255, and is not /l
12871 * (Not /l because we already handled these under /l in
12872 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12873 if (IS_IN_SOME_FOLD_L1(ender)) {
12874 maybe_exact = FALSE;
12876 /* See if the character's fold differs between /d and
12877 * /u. This includes the multi-char fold SHARP S to
12879 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
12880 RExC_seen_unfolded_sharp_s = 1;
12881 maybe_exactfu = FALSE;
12883 else if (maybe_exactfu
12884 && (PL_fold[ender] != PL_fold_latin1[ender]
12885 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12886 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12887 || UNICODE_DOT_DOT_VERSION > 0)
12889 && isALPHA_FOLD_EQ(ender, 's')
12890 && isALPHA_FOLD_EQ(*(s-1), 's'))
12893 maybe_exactfu = FALSE;
12897 /* Even when folding, we store just the input character, as
12898 * we have an array that finds its fold quickly */
12899 *(s++) = (char) ender;
12901 else { /* FOLD, and UTF (or sharp s) */
12902 /* Unlike the non-fold case, we do actually have to
12903 * calculate the results here in pass 1. This is for two
12904 * reasons, the folded length may be longer than the
12905 * unfolded, and we have to calculate how many EXACTish
12906 * nodes it will take; and we may run out of room in a node
12907 * in the middle of a potential multi-char fold, and have
12908 * to back off accordingly. */
12911 if (isASCII_uni(ender)) {
12912 folded = toFOLD(ender);
12913 *(s)++ = (U8) folded;
12918 folded = _to_uni_fold_flags(
12922 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12923 ? FOLD_FLAGS_NOMIX_ASCII
12927 /* The loop increments <len> each time, as all but this
12928 * path (and one other) through it add a single byte to
12929 * the EXACTish node. But this one has changed len to
12930 * be the correct final value, so subtract one to
12931 * cancel out the increment that follows */
12932 len += foldlen - 1;
12934 /* If this node only contains non-folding code points so
12935 * far, see if this new one is also non-folding */
12937 if (folded != ender) {
12938 maybe_exact = FALSE;
12941 /* Here the fold is the original; we have to check
12942 * further to see if anything folds to it */
12943 if (_invlist_contains_cp(PL_utf8_foldable,
12946 maybe_exact = FALSE;
12953 if (next_is_quantifier) {
12955 /* Here, the next input is a quantifier, and to get here,
12956 * the current character is the only one in the node.
12957 * Also, here <len> doesn't include the final byte for this
12963 } /* End of loop through literal characters */
12965 /* Here we have either exhausted the input or ran out of room in
12966 * the node. (If we encountered a character that can't be in the
12967 * node, transfer is made directly to <loopdone>, and so we
12968 * wouldn't have fallen off the end of the loop.) In the latter
12969 * case, we artificially have to split the node into two, because
12970 * we just don't have enough space to hold everything. This
12971 * creates a problem if the final character participates in a
12972 * multi-character fold in the non-final position, as a match that
12973 * should have occurred won't, due to the way nodes are matched,
12974 * and our artificial boundary. So back off until we find a non-
12975 * problematic character -- one that isn't at the beginning or
12976 * middle of such a fold. (Either it doesn't participate in any
12977 * folds, or appears only in the final position of all the folds it
12978 * does participate in.) A better solution with far fewer false
12979 * positives, and that would fill the nodes more completely, would
12980 * be to actually have available all the multi-character folds to
12981 * test against, and to back-off only far enough to be sure that
12982 * this node isn't ending with a partial one. <upper_parse> is set
12983 * further below (if we need to reparse the node) to include just
12984 * up through that final non-problematic character that this code
12985 * identifies, so when it is set to less than the full node, we can
12986 * skip the rest of this */
12987 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12989 const STRLEN full_len = len;
12991 assert(len >= MAX_NODE_STRING_SIZE);
12993 /* Here, <s> points to the final byte of the final character.
12994 * Look backwards through the string until find a non-
12995 * problematic character */
12999 /* This has no multi-char folds to non-UTF characters */
13000 if (ASCII_FOLD_RESTRICTED) {
13004 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
13008 if (! PL_NonL1NonFinalFold) {
13009 PL_NonL1NonFinalFold = _new_invlist_C_array(
13010 NonL1_Perl_Non_Final_Folds_invlist);
13013 /* Point to the first byte of the final character */
13014 s = (char *) utf8_hop((U8 *) s, -1);
13016 while (s >= s0) { /* Search backwards until find
13017 non-problematic char */
13018 if (UTF8_IS_INVARIANT(*s)) {
13020 /* There are no ascii characters that participate
13021 * in multi-char folds under /aa. In EBCDIC, the
13022 * non-ascii invariants are all control characters,
13023 * so don't ever participate in any folds. */
13024 if (ASCII_FOLD_RESTRICTED
13025 || ! IS_NON_FINAL_FOLD(*s))
13030 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
13031 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
13037 else if (! _invlist_contains_cp(
13038 PL_NonL1NonFinalFold,
13039 valid_utf8_to_uvchr((U8 *) s, NULL)))
13044 /* Here, the current character is problematic in that
13045 * it does occur in the non-final position of some
13046 * fold, so try the character before it, but have to
13047 * special case the very first byte in the string, so
13048 * we don't read outside the string */
13049 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
13050 } /* End of loop backwards through the string */
13052 /* If there were only problematic characters in the string,
13053 * <s> will point to before s0, in which case the length
13054 * should be 0, otherwise include the length of the
13055 * non-problematic character just found */
13056 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
13059 /* Here, have found the final character, if any, that is
13060 * non-problematic as far as ending the node without splitting
13061 * it across a potential multi-char fold. <len> contains the
13062 * number of bytes in the node up-to and including that
13063 * character, or is 0 if there is no such character, meaning
13064 * the whole node contains only problematic characters. In
13065 * this case, give up and just take the node as-is. We can't
13070 /* If the node ends in an 's' we make sure it stays EXACTF,
13071 * as if it turns into an EXACTFU, it could later get
13072 * joined with another 's' that would then wrongly match
13074 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
13076 maybe_exactfu = FALSE;
13080 /* Here, the node does contain some characters that aren't
13081 * problematic. If one such is the final character in the
13082 * node, we are done */
13083 if (len == full_len) {
13086 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
13088 /* If the final character is problematic, but the
13089 * penultimate is not, back-off that last character to
13090 * later start a new node with it */
13095 /* Here, the final non-problematic character is earlier
13096 * in the input than the penultimate character. What we do
13097 * is reparse from the beginning, going up only as far as
13098 * this final ok one, thus guaranteeing that the node ends
13099 * in an acceptable character. The reason we reparse is
13100 * that we know how far in the character is, but we don't
13101 * know how to correlate its position with the input parse.
13102 * An alternate implementation would be to build that
13103 * correlation as we go along during the original parse,
13104 * but that would entail extra work for every node, whereas
13105 * this code gets executed only when the string is too
13106 * large for the node, and the final two characters are
13107 * problematic, an infrequent occurrence. Yet another
13108 * possible strategy would be to save the tail of the
13109 * string, and the next time regatom is called, initialize
13110 * with that. The problem with this is that unless you
13111 * back off one more character, you won't be guaranteed
13112 * regatom will get called again, unless regbranch,
13113 * regpiece ... are also changed. If you do back off that
13114 * extra character, so that there is input guaranteed to
13115 * force calling regatom, you can't handle the case where
13116 * just the first character in the node is acceptable. I
13117 * (khw) decided to try this method which doesn't have that
13118 * pitfall; if performance issues are found, we can do a
13119 * combination of the current approach plus that one */
13125 } /* End of verifying node ends with an appropriate char */
13127 loopdone: /* Jumped to when encounters something that shouldn't be
13130 /* I (khw) don't know if you can get here with zero length, but the
13131 * old code handled this situation by creating a zero-length EXACT
13132 * node. Might as well be NOTHING instead */
13138 /* If 'maybe_exact' is still set here, means there are no
13139 * code points in the node that participate in folds;
13140 * similarly for 'maybe_exactfu' and code points that match
13141 * differently depending on UTF8ness of the target string
13142 * (for /u), or depending on locale for /l */
13148 else if (maybe_exactfu) {
13154 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
13155 FALSE /* Don't look to see if could
13156 be turned into an EXACT
13157 node, as we have already
13162 RExC_parse = p - 1;
13163 Set_Node_Cur_Length(ret, parse_start);
13165 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13166 FALSE /* Don't force to /x */ );
13168 /* len is STRLEN which is unsigned, need to copy to signed */
13171 vFAIL("Internal disaster");
13174 } /* End of label 'defchar:' */
13176 } /* End of giant switch on input character */
13183 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
13185 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
13186 * sets up the bitmap and any flags, removing those code points from the
13187 * inversion list, setting it to NULL should it become completely empty */
13189 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
13190 assert(PL_regkind[OP(node)] == ANYOF);
13192 ANYOF_BITMAP_ZERO(node);
13193 if (*invlist_ptr) {
13195 /* This gets set if we actually need to modify things */
13196 bool change_invlist = FALSE;
13200 /* Start looking through *invlist_ptr */
13201 invlist_iterinit(*invlist_ptr);
13202 while (invlist_iternext(*invlist_ptr, &start, &end)) {
13206 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13207 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13210 /* Quit if are above what we should change */
13211 if (start >= NUM_ANYOF_CODE_POINTS) {
13215 change_invlist = TRUE;
13217 /* Set all the bits in the range, up to the max that we are doing */
13218 high = (end < NUM_ANYOF_CODE_POINTS - 1)
13220 : NUM_ANYOF_CODE_POINTS - 1;
13221 for (i = start; i <= (int) high; i++) {
13222 if (! ANYOF_BITMAP_TEST(node, i)) {
13223 ANYOF_BITMAP_SET(node, i);
13227 invlist_iterfinish(*invlist_ptr);
13229 /* Done with loop; remove any code points that are in the bitmap from
13230 * *invlist_ptr; similarly for code points above the bitmap if we have
13231 * a flag to match all of them anyways */
13232 if (change_invlist) {
13233 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13235 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13236 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13239 /* If have completely emptied it, remove it completely */
13240 if (_invlist_len(*invlist_ptr) == 0) {
13241 SvREFCNT_dec_NN(*invlist_ptr);
13242 *invlist_ptr = NULL;
13247 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13248 Character classes ([:foo:]) can also be negated ([:^foo:]).
13249 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13250 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13251 but trigger failures because they are currently unimplemented. */
13253 #define POSIXCC_DONE(c) ((c) == ':')
13254 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13255 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13257 PERL_STATIC_INLINE I32
13258 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13260 I32 namedclass = OOB_NAMEDCLASS;
13262 PERL_ARGS_ASSERT_REGPPOSIXCC;
13264 if (value == '[' && RExC_parse + 1 < RExC_end &&
13265 /* I smell either [: or [= or [. -- POSIX has been here, right? */
13266 POSIXCC(UCHARAT(RExC_parse)))
13268 const char c = UCHARAT(RExC_parse);
13269 char* const s = RExC_parse++;
13271 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13273 if (RExC_parse == RExC_end) {
13276 /* Try to give a better location for the error (than the end of
13277 * the string) by looking for the matching ']' */
13279 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13282 vFAIL2("Unmatched '%c' in POSIX class", c);
13284 /* Grandfather lone [:, [=, [. */
13288 const char* const t = RExC_parse++; /* skip over the c */
13291 if (UCHARAT(RExC_parse) == ']') {
13292 const char *posixcc = s + 1;
13293 RExC_parse++; /* skip over the ending ] */
13296 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13297 const I32 skip = t - posixcc;
13299 /* Initially switch on the length of the name. */
13302 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13303 this is the Perl \w
13305 namedclass = ANYOF_WORDCHAR;
13308 /* Names all of length 5. */
13309 /* alnum alpha ascii blank cntrl digit graph lower
13310 print punct space upper */
13311 /* Offset 4 gives the best switch position. */
13312 switch (posixcc[4]) {
13314 if (memEQ(posixcc, "alph", 4)) /* alpha */
13315 namedclass = ANYOF_ALPHA;
13318 if (memEQ(posixcc, "spac", 4)) /* space */
13319 namedclass = ANYOF_SPACE;
13322 if (memEQ(posixcc, "grap", 4)) /* graph */
13323 namedclass = ANYOF_GRAPH;
13326 if (memEQ(posixcc, "asci", 4)) /* ascii */
13327 namedclass = ANYOF_ASCII;
13330 if (memEQ(posixcc, "blan", 4)) /* blank */
13331 namedclass = ANYOF_BLANK;
13334 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13335 namedclass = ANYOF_CNTRL;
13338 if (memEQ(posixcc, "alnu", 4)) /* alnum */
13339 namedclass = ANYOF_ALPHANUMERIC;
13342 if (memEQ(posixcc, "lowe", 4)) /* lower */
13343 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13344 else if (memEQ(posixcc, "uppe", 4)) /* upper */
13345 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13348 if (memEQ(posixcc, "digi", 4)) /* digit */
13349 namedclass = ANYOF_DIGIT;
13350 else if (memEQ(posixcc, "prin", 4)) /* print */
13351 namedclass = ANYOF_PRINT;
13352 else if (memEQ(posixcc, "punc", 4)) /* punct */
13353 namedclass = ANYOF_PUNCT;
13358 if (memEQ(posixcc, "xdigit", 6))
13359 namedclass = ANYOF_XDIGIT;
13363 if (namedclass == OOB_NAMEDCLASS)
13365 "POSIX class [:%"UTF8f":] unknown",
13366 UTF8fARG(UTF, t - s - 1, s + 1));
13368 /* The #defines are structured so each complement is +1 to
13369 * the normal one */
13373 assert (posixcc[skip] == ':');
13374 assert (posixcc[skip+1] == ']');
13375 } else if (!SIZE_ONLY) {
13376 /* [[=foo=]] and [[.foo.]] are still future. */
13378 /* adjust RExC_parse so the warning shows after
13379 the class closes */
13380 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13382 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13385 /* Maternal grandfather:
13386 * "[:" ending in ":" but not in ":]" */
13388 vFAIL("Unmatched '[' in POSIX class");
13391 /* Grandfather lone [:, [=, [. */
13401 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13403 /* This applies some heuristics at the current parse position (which should
13404 * be at a '[') to see if what follows might be intended to be a [:posix:]
13405 * class. It returns true if it really is a posix class, of course, but it
13406 * also can return true if it thinks that what was intended was a posix
13407 * class that didn't quite make it.
13409 * It will return true for
13411 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13412 * ')' indicating the end of the (?[
13413 * [:any garbage including %^&$ punctuation:]
13415 * This is designed to be called only from S_handle_regex_sets; it could be
13416 * easily adapted to be called from the spot at the beginning of regclass()
13417 * that checks to see in a normal bracketed class if the surrounding []
13418 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13419 * change long-standing behavior, so I (khw) didn't do that */
13420 char* p = RExC_parse + 1;
13421 char first_char = *p;
13423 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13425 assert(*(p - 1) == '[');
13427 if (! POSIXCC(first_char)) {
13432 while (p < RExC_end && isWORDCHAR(*p)) p++;
13434 if (p >= RExC_end) {
13438 if (p - RExC_parse > 2 /* Got at least 1 word character */
13439 && (*p == first_char
13440 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13445 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13448 && p - RExC_parse > 2 /* [:] evaluates to colon;
13449 [::] is a bad posix class. */
13450 && first_char == *(p - 1));
13453 STATIC unsigned int
13454 S_regex_set_precedence(const U8 my_operator) {
13456 /* Returns the precedence in the (?[...]) construct of the input operator,
13457 * specified by its character representation. The precedence follows
13458 * general Perl rules, but it extends this so that ')' and ']' have (low)
13459 * precedence even though they aren't really operators */
13461 switch (my_operator) {
13477 NOT_REACHED; /* NOTREACHED */
13478 return 0; /* Silence compiler warning */
13482 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13483 I32 *flagp, U32 depth,
13484 char * const oregcomp_parse)
13486 /* Handle the (?[...]) construct to do set operations */
13488 U8 curchar; /* Current character being parsed */
13489 UV start, end; /* End points of code point ranges */
13490 SV* final = NULL; /* The end result inversion list */
13491 SV* result_string; /* 'final' stringified */
13492 AV* stack; /* stack of operators and operands not yet
13494 AV* fence_stack = NULL; /* A stack containing the positions in
13495 'stack' of where the undealt-with left
13496 parens would be if they were actually
13498 IV fence = 0; /* Position of where most recent undealt-
13499 with left paren in stack is; -1 if none.
13501 STRLEN len; /* Temporary */
13502 regnode* node; /* Temporary, and final regnode returned by
13504 const bool save_fold = FOLD; /* Temporary */
13505 char *save_end, *save_parse; /* Temporaries */
13506 const bool in_locale = LOC; /* we turn off /l during processing */
13508 GET_RE_DEBUG_FLAGS_DECL;
13510 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13513 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
13516 REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
13517 This is required so that the compile
13518 time values are valid in all runtime
13521 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13522 * (such as EXACT). Thus we can skip most everything if just sizing. We
13523 * call regclass to handle '[]' so as to not have to reinvent its parsing
13524 * rules here (throwing away the size it computes each time). And, we exit
13525 * upon an unescaped ']' that isn't one ending a regclass. To do both
13526 * these things, we need to realize that something preceded by a backslash
13527 * is escaped, so we have to keep track of backslashes */
13529 UV depth = 0; /* how many nested (?[...]) constructs */
13531 while (RExC_parse < RExC_end) {
13532 SV* current = NULL;
13534 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13535 TRUE /* Force /x */ );
13537 switch (*RExC_parse) {
13539 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13544 /* Skip the next byte (which could cause us to end up in
13545 * the middle of a UTF-8 character, but since none of those
13546 * are confusable with anything we currently handle in this
13547 * switch (invariants all), it's safe. We'll just hit the
13548 * default: case next time and keep on incrementing until
13549 * we find one of the invariants we do handle. */
13551 if (*RExC_parse == 'c') {
13552 /* Skip the \cX notation for control characters */
13553 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
13558 /* If this looks like it is a [:posix:] class, leave the
13559 * parse pointer at the '[' to fool regclass() into
13560 * thinking it is part of a '[[:posix:]]'. That function
13561 * will use strict checking to force a syntax error if it
13562 * doesn't work out to a legitimate class */
13563 bool is_posix_class
13564 = could_it_be_a_POSIX_class(pRExC_state);
13565 if (! is_posix_class) {
13569 /* regclass() can only return RESTART_PASS1 and NEED_UTF8
13570 * if multi-char folds are allowed. */
13571 if (!regclass(pRExC_state, flagp,depth+1,
13572 is_posix_class, /* parse the whole char
13573 class only if not a
13575 FALSE, /* don't allow multi-char folds */
13576 TRUE, /* silence non-portable warnings. */
13578 FALSE, /* Require return to be an ANYOF */
13581 FAIL2("panic: regclass returned NULL to handle_sets, "
13582 "flags=%#"UVxf"", (UV) *flagp);
13584 /* function call leaves parse pointing to the ']', except
13585 * if we faked it */
13586 if (is_posix_class) {
13590 SvREFCNT_dec(current); /* In case it returned something */
13595 if (depth--) break;
13597 if (RExC_parse < RExC_end
13598 && *RExC_parse == ')')
13600 node = reganode(pRExC_state, ANYOF, 0);
13601 RExC_size += ANYOF_SKIP;
13602 nextchar(pRExC_state);
13603 Set_Node_Length(node,
13604 RExC_parse - oregcomp_parse + 1); /* MJD */
13606 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
13614 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
13618 FAIL("Syntax error in (?[...])");
13621 /* Pass 2 only after this. */
13622 Perl_ck_warner_d(aTHX_
13623 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13624 "The regex_sets feature is experimental" REPORT_LOCATION,
13625 REPORT_LOCATION_ARGS(RExC_parse));
13627 /* Everything in this construct is a metacharacter. Operands begin with
13628 * either a '\' (for an escape sequence), or a '[' for a bracketed
13629 * character class. Any other character should be an operator, or
13630 * parenthesis for grouping. Both types of operands are handled by calling
13631 * regclass() to parse them. It is called with a parameter to indicate to
13632 * return the computed inversion list. The parsing here is implemented via
13633 * a stack. Each entry on the stack is a single character representing one
13634 * of the operators; or else a pointer to an operand inversion list. */
13636 #define IS_OPERATOR(a) SvIOK(a)
13637 #define IS_OPERAND(a) (! IS_OPERATOR(a))
13639 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
13640 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13641 * with pronouncing it called it Reverse Polish instead, but now that YOU
13642 * know how to pronounce it you can use the correct term, thus giving due
13643 * credit to the person who invented it, and impressing your geek friends.
13644 * Wikipedia says that the pronounciation of "Ł" has been changing so that
13645 * it is now more like an English initial W (as in wonk) than an L.)
13647 * This means that, for example, 'a | b & c' is stored on the stack as
13655 * where the numbers in brackets give the stack [array] element number.
13656 * In this implementation, parentheses are not stored on the stack.
13657 * Instead a '(' creates a "fence" so that the part of the stack below the
13658 * fence is invisible except to the corresponding ')' (this allows us to
13659 * replace testing for parens, by using instead subtraction of the fence
13660 * position). As new operands are processed they are pushed onto the stack
13661 * (except as noted in the next paragraph). New operators of higher
13662 * precedence than the current final one are inserted on the stack before
13663 * the lhs operand (so that when the rhs is pushed next, everything will be
13664 * in the correct positions shown above. When an operator of equal or
13665 * lower precedence is encountered in parsing, all the stacked operations
13666 * of equal or higher precedence are evaluated, leaving the result as the
13667 * top entry on the stack. This makes higher precedence operations
13668 * evaluate before lower precedence ones, and causes operations of equal
13669 * precedence to left associate.
13671 * The only unary operator '!' is immediately pushed onto the stack when
13672 * encountered. When an operand is encountered, if the top of the stack is
13673 * a '!", the complement is immediately performed, and the '!' popped. The
13674 * resulting value is treated as a new operand, and the logic in the
13675 * previous paragraph is executed. Thus in the expression
13677 * the stack looks like
13683 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13690 * A ')' is treated as an operator with lower precedence than all the
13691 * aforementioned ones, which causes all operations on the stack above the
13692 * corresponding '(' to be evaluated down to a single resultant operand.
13693 * Then the fence for the '(' is removed, and the operand goes through the
13694 * algorithm above, without the fence.
13696 * A separate stack is kept of the fence positions, so that the position of
13697 * the latest so-far unbalanced '(' is at the top of it.
13699 * The ']' ending the construct is treated as the lowest operator of all,
13700 * so that everything gets evaluated down to a single operand, which is the
13703 sv_2mortal((SV *)(stack = newAV()));
13704 sv_2mortal((SV *)(fence_stack = newAV()));
13706 while (RExC_parse < RExC_end) {
13707 I32 top_index; /* Index of top-most element in 'stack' */
13708 SV** top_ptr; /* Pointer to top 'stack' element */
13709 SV* current = NULL; /* To contain the current inversion list
13711 SV* only_to_avoid_leaks;
13713 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13714 TRUE /* Force /x */ );
13715 if (RExC_parse >= RExC_end) {
13716 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13719 curchar = UCHARAT(RExC_parse);
13723 top_index = av_tindex(stack);
13726 SV** stacked_ptr; /* Ptr to something already on 'stack' */
13727 char stacked_operator; /* The topmost operator on the 'stack'. */
13728 SV* lhs; /* Operand to the left of the operator */
13729 SV* rhs; /* Operand to the right of the operator */
13730 SV* fence_ptr; /* Pointer to top element of the fence
13735 if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13737 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13738 * This happens when we have some thing like
13740 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13742 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13744 * Here we would be handling the interpolated
13745 * '$thai_or_lao'. We handle this by a recursive call to
13746 * ourselves which returns the inversion list the
13747 * interpolated expression evaluates to. We use the flags
13748 * from the interpolated pattern. */
13749 U32 save_flags = RExC_flags;
13750 const char * save_parse;
13752 RExC_parse += 2; /* Skip past the '(?' */
13753 save_parse = RExC_parse;
13755 /* Parse any flags for the '(?' */
13756 parse_lparen_question_flags(pRExC_state);
13758 if (RExC_parse == save_parse /* Makes sure there was at
13759 least one flag (or else
13760 this embedding wasn't
13762 || RExC_parse >= RExC_end - 4
13763 || UCHARAT(RExC_parse) != ':'
13764 || UCHARAT(++RExC_parse) != '('
13765 || UCHARAT(++RExC_parse) != '?'
13766 || UCHARAT(++RExC_parse) != '[')
13769 /* In combination with the above, this moves the
13770 * pointer to the point just after the first erroneous
13771 * character (or if there are no flags, to where they
13772 * should have been) */
13773 if (RExC_parse >= RExC_end - 4) {
13774 RExC_parse = RExC_end;
13776 else if (RExC_parse != save_parse) {
13777 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13779 vFAIL("Expecting '(?flags:(?[...'");
13782 /* Recurse, with the meat of the embedded expression */
13784 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13785 depth+1, oregcomp_parse);
13787 /* Here, 'current' contains the embedded expression's
13788 * inversion list, and RExC_parse points to the trailing
13789 * ']'; the next character should be the ')' */
13791 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13793 /* Then the ')' matching the original '(' handled by this
13794 * case: statement */
13796 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13799 RExC_flags = save_flags;
13800 goto handle_operand;
13803 /* A regular '('. Look behind for illegal syntax */
13804 if (top_index - fence >= 0) {
13805 /* If the top entry on the stack is an operator, it had
13806 * better be a '!', otherwise the entry below the top
13807 * operand should be an operator */
13808 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
13809 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
13810 || ( IS_OPERAND(*top_ptr)
13811 && ( top_index - fence < 1
13812 || ! (stacked_ptr = av_fetch(stack,
13815 || ! IS_OPERATOR(*stacked_ptr))))
13818 vFAIL("Unexpected '(' with no preceding operator");
13822 /* Stack the position of this undealt-with left paren */
13823 fence = top_index + 1;
13824 av_push(fence_stack, newSViv(fence));
13828 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13829 * multi-char folds are allowed. */
13830 if (!regclass(pRExC_state, flagp,depth+1,
13831 TRUE, /* means parse just the next thing */
13832 FALSE, /* don't allow multi-char folds */
13833 FALSE, /* don't silence non-portable warnings. */
13835 FALSE, /* Require return to be an ANYOF */
13838 FAIL2("panic: regclass returned NULL to handle_sets, "
13839 "flags=%#"UVxf"", (UV) *flagp);
13842 /* regclass() will return with parsing just the \ sequence,
13843 * leaving the parse pointer at the next thing to parse */
13845 goto handle_operand;
13847 case '[': /* Is a bracketed character class */
13849 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13851 if (! is_posix_class) {
13855 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
13856 * multi-char folds are allowed. */
13857 if(!regclass(pRExC_state, flagp,depth+1,
13858 is_posix_class, /* parse the whole char class
13859 only if not a posix class */
13860 FALSE, /* don't allow multi-char folds */
13861 FALSE, /* don't silence non-portable warnings. */
13863 FALSE, /* Require return to be an ANYOF */
13867 FAIL2("panic: regclass returned NULL to handle_sets, "
13868 "flags=%#"UVxf"", (UV) *flagp);
13871 /* function call leaves parse pointing to the ']', except if we
13873 if (is_posix_class) {
13877 goto handle_operand;
13881 if (top_index >= 1) {
13882 goto join_operators;
13885 /* Only a single operand on the stack: are done */
13889 if (av_tindex(fence_stack) < 0) {
13891 vFAIL("Unexpected ')'");
13894 /* If at least two thing on the stack, treat this as an
13896 if (top_index - fence >= 1) {
13897 goto join_operators;
13900 /* Here only a single thing on the fenced stack, and there is a
13901 * fence. Get rid of it */
13902 fence_ptr = av_pop(fence_stack);
13904 fence = SvIV(fence_ptr) - 1;
13905 SvREFCNT_dec_NN(fence_ptr);
13912 /* Having gotten rid of the fence, we pop the operand at the
13913 * stack top and process it as a newly encountered operand */
13914 current = av_pop(stack);
13915 if (IS_OPERAND(current)) {
13916 goto handle_operand;
13928 /* These binary operators should have a left operand already
13930 if ( top_index - fence < 0
13931 || top_index - fence == 1
13932 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13933 || ! IS_OPERAND(*top_ptr))
13935 goto unexpected_binary;
13938 /* If only the one operand is on the part of the stack visible
13939 * to us, we just place this operator in the proper position */
13940 if (top_index - fence < 2) {
13942 /* Place the operator before the operand */
13944 SV* lhs = av_pop(stack);
13945 av_push(stack, newSVuv(curchar));
13946 av_push(stack, lhs);
13950 /* But if there is something else on the stack, we need to
13951 * process it before this new operator if and only if the
13952 * stacked operation has equal or higher precedence than the
13957 /* The operator on the stack is supposed to be below both its
13959 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13960 || IS_OPERAND(*stacked_ptr))
13962 /* But if not, it's legal and indicates we are completely
13963 * done if and only if we're currently processing a ']',
13964 * which should be the final thing in the expression */
13965 if (curchar == ']') {
13971 vFAIL2("Unexpected binary operator '%c' with no "
13972 "preceding operand", curchar);
13974 stacked_operator = (char) SvUV(*stacked_ptr);
13976 if (regex_set_precedence(curchar)
13977 > regex_set_precedence(stacked_operator))
13979 /* Here, the new operator has higher precedence than the
13980 * stacked one. This means we need to add the new one to
13981 * the stack to await its rhs operand (and maybe more
13982 * stuff). We put it before the lhs operand, leaving
13983 * untouched the stacked operator and everything below it
13985 lhs = av_pop(stack);
13986 assert(IS_OPERAND(lhs));
13988 av_push(stack, newSVuv(curchar));
13989 av_push(stack, lhs);
13993 /* Here, the new operator has equal or lower precedence than
13994 * what's already there. This means the operation already
13995 * there should be performed now, before the new one. */
13997 rhs = av_pop(stack);
13998 if (! IS_OPERAND(rhs)) {
14000 /* This can happen when a ! is not followed by an operand,
14001 * like in /(?[\t &!])/ */
14005 lhs = av_pop(stack);
14007 if (! IS_OPERAND(lhs)) {
14009 /* This can happen when there is an empty (), like in
14010 * /(?[[0]+()+])/ */
14014 switch (stacked_operator) {
14016 _invlist_intersection(lhs, rhs, &rhs);
14021 _invlist_union(lhs, rhs, &rhs);
14025 _invlist_subtract(lhs, rhs, &rhs);
14028 case '^': /* The union minus the intersection */
14034 _invlist_union(lhs, rhs, &u);
14035 _invlist_intersection(lhs, rhs, &i);
14036 /* _invlist_subtract will overwrite rhs
14037 without freeing what it already contains */
14039 _invlist_subtract(u, i, &rhs);
14040 SvREFCNT_dec_NN(i);
14041 SvREFCNT_dec_NN(u);
14042 SvREFCNT_dec_NN(element);
14048 /* Here, the higher precedence operation has been done, and the
14049 * result is in 'rhs'. We overwrite the stacked operator with
14050 * the result. Then we redo this code to either push the new
14051 * operator onto the stack or perform any higher precedence
14052 * stacked operation */
14053 only_to_avoid_leaks = av_pop(stack);
14054 SvREFCNT_dec(only_to_avoid_leaks);
14055 av_push(stack, rhs);
14058 case '!': /* Highest priority, right associative */
14060 /* If what's already at the top of the stack is another '!",
14061 * they just cancel each other out */
14062 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
14063 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
14065 only_to_avoid_leaks = av_pop(stack);
14066 SvREFCNT_dec(only_to_avoid_leaks);
14068 else { /* Otherwise, since it's right associative, just push
14070 av_push(stack, newSVuv(curchar));
14075 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14076 vFAIL("Unexpected character");
14080 /* Here 'current' is the operand. If something is already on the
14081 * stack, we have to check if it is a !. */
14082 top_index = av_tindex(stack); /* Code above may have altered the
14083 * stack in the time since we
14084 * earlier set 'top_index'. */
14085 if (top_index - fence >= 0) {
14086 /* If the top entry on the stack is an operator, it had better
14087 * be a '!', otherwise the entry below the top operand should
14088 * be an operator */
14089 top_ptr = av_fetch(stack, top_index, FALSE);
14091 if (IS_OPERATOR(*top_ptr)) {
14093 /* The only permissible operator at the top of the stack is
14094 * '!', which is applied immediately to this operand. */
14095 curchar = (char) SvUV(*top_ptr);
14096 if (curchar != '!') {
14097 SvREFCNT_dec(current);
14098 vFAIL2("Unexpected binary operator '%c' with no "
14099 "preceding operand", curchar);
14102 _invlist_invert(current);
14104 only_to_avoid_leaks = av_pop(stack);
14105 SvREFCNT_dec(only_to_avoid_leaks);
14106 top_index = av_tindex(stack);
14108 /* And we redo with the inverted operand. This allows
14109 * handling multiple ! in a row */
14110 goto handle_operand;
14112 /* Single operand is ok only for the non-binary ')'
14114 else if ((top_index - fence == 0 && curchar != ')')
14115 || (top_index - fence > 0
14116 && (! (stacked_ptr = av_fetch(stack,
14119 || IS_OPERAND(*stacked_ptr))))
14121 SvREFCNT_dec(current);
14122 vFAIL("Operand with no preceding operator");
14126 /* Here there was nothing on the stack or the top element was
14127 * another operand. Just add this new one */
14128 av_push(stack, current);
14130 } /* End of switch on next parse token */
14132 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14133 } /* End of loop parsing through the construct */
14136 if (av_tindex(fence_stack) >= 0) {
14137 vFAIL("Unmatched (");
14140 if (av_tindex(stack) < 0 /* Was empty */
14141 || ((final = av_pop(stack)) == NULL)
14142 || ! IS_OPERAND(final)
14143 || SvTYPE(final) != SVt_INVLIST
14144 || av_tindex(stack) >= 0) /* More left on stack */
14147 SvREFCNT_dec(final);
14148 vFAIL("Incomplete expression within '(?[ ])'");
14151 /* Here, 'final' is the resultant inversion list from evaluating the
14152 * expression. Return it if so requested */
14153 if (return_invlist) {
14154 *return_invlist = final;
14158 /* Otherwise generate a resultant node, based on 'final'. regclass() is
14159 * expecting a string of ranges and individual code points */
14160 invlist_iterinit(final);
14161 result_string = newSVpvs("");
14162 while (invlist_iternext(final, &start, &end)) {
14163 if (start == end) {
14164 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
14167 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
14172 /* About to generate an ANYOF (or similar) node from the inversion list we
14173 * have calculated */
14174 save_parse = RExC_parse;
14175 RExC_parse = SvPV(result_string, len);
14176 save_end = RExC_end;
14177 RExC_end = RExC_parse + len;
14179 /* We turn off folding around the call, as the class we have constructed
14180 * already has all folding taken into consideration, and we don't want
14181 * regclass() to add to that */
14182 RExC_flags &= ~RXf_PMf_FOLD;
14183 /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
14184 * folds are allowed. */
14185 node = regclass(pRExC_state, flagp,depth+1,
14186 FALSE, /* means parse the whole char class */
14187 FALSE, /* don't allow multi-char folds */
14188 TRUE, /* silence non-portable warnings. The above may very
14189 well have generated non-portable code points, but
14190 they're valid on this machine */
14191 FALSE, /* similarly, no need for strict */
14192 FALSE, /* Require return to be an ANYOF */
14196 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
14199 /* Fix up the node type if we are in locale. (We have pretended we are
14200 * under /u for the purposes of regclass(), as this construct will only
14201 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
14202 * as to cause any warnings about bad locales to be output in regexec.c),
14203 * and add the flag that indicates to check if not in a UTF-8 locale. The
14204 * reason we above forbid optimization into something other than an ANYOF
14205 * node is simply to minimize the number of code changes in regexec.c.
14206 * Otherwise we would have to create new EXACTish node types and deal with
14207 * them. This decision could be revisited should this construct become
14210 * (One might think we could look at the resulting ANYOF node and suppress
14211 * the flag if everything is above 255, as those would be UTF-8 only,
14212 * but this isn't true, as the components that led to that result could
14213 * have been locale-affected, and just happen to cancel each other out
14214 * under UTF-8 locales.) */
14216 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
14218 assert(OP(node) == ANYOF);
14222 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
14226 RExC_flags |= RXf_PMf_FOLD;
14229 RExC_parse = save_parse + 1;
14230 RExC_end = save_end;
14231 SvREFCNT_dec_NN(final);
14232 SvREFCNT_dec_NN(result_string);
14234 nextchar(pRExC_state);
14235 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
14242 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
14244 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
14245 * innocent-looking character class, like /[ks]/i won't have to go out to
14246 * disk to find the possible matches.
14248 * This should be called only for a Latin1-range code points, cp, which is
14249 * known to be involved in a simple fold with other code points above
14250 * Latin1. It would give false results if /aa has been specified.
14251 * Multi-char folds are outside the scope of this, and must be handled
14254 * XXX It would be better to generate these via regen, in case a new
14255 * version of the Unicode standard adds new mappings, though that is not
14256 * really likely, and may be caught by the default: case of the switch
14259 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
14261 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
14267 add_cp_to_invlist(*invlist, KELVIN_SIGN);
14271 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14274 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14275 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14277 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14278 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14279 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14281 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14282 *invlist = add_cp_to_invlist(*invlist,
14283 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14286 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
14288 case LATIN_SMALL_LETTER_SHARP_S:
14289 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14294 #if UNICODE_MAJOR_VERSION < 3 \
14295 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
14297 /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
14302 add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
14303 # if UNICODE_DOT_DOT_VERSION == 1
14304 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
14310 /* Use deprecated warning to increase the chances of this being
14313 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14320 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14322 /* This adds the string scalar <multi_string> to the array
14323 * <multi_char_matches>. <multi_string> is known to have exactly
14324 * <cp_count> code points in it. This is used when constructing a
14325 * bracketed character class and we find something that needs to match more
14326 * than a single character.
14328 * <multi_char_matches> is actually an array of arrays. Each top-level
14329 * element is an array that contains all the strings known so far that are
14330 * the same length. And that length (in number of code points) is the same
14331 * as the index of the top-level array. Hence, the [2] element is an
14332 * array, each element thereof is a string containing TWO code points;
14333 * while element [3] is for strings of THREE characters, and so on. Since
14334 * this is for multi-char strings there can never be a [0] nor [1] element.
14336 * When we rewrite the character class below, we will do so such that the
14337 * longest strings are written first, so that it prefers the longest
14338 * matching strings first. This is done even if it turns out that any
14339 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
14340 * Christiansen has agreed that this is ok. This makes the test for the
14341 * ligature 'ffi' come before the test for 'ff', for example */
14344 AV** this_array_ptr;
14346 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14348 if (! multi_char_matches) {
14349 multi_char_matches = newAV();
14352 if (av_exists(multi_char_matches, cp_count)) {
14353 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14354 this_array = *this_array_ptr;
14357 this_array = newAV();
14358 av_store(multi_char_matches, cp_count,
14361 av_push(this_array, multi_string);
14363 return multi_char_matches;
14366 /* The names of properties whose definitions are not known at compile time are
14367 * stored in this SV, after a constant heading. So if the length has been
14368 * changed since initialization, then there is a run-time definition. */
14369 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
14370 (SvCUR(listsv) != initial_listsv_len)
14372 /* There is a restricted set of white space characters that are legal when
14373 * ignoring white space in a bracketed character class. This generates the
14374 * code to skip them.
14376 * There is a line below that uses the same white space criteria but is outside
14377 * this macro. Both here and there must use the same definition */
14378 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
14381 while ( p < RExC_end \
14382 && isBLANK_A(UCHARAT(p))) \
14390 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14391 const bool stop_at_1, /* Just parse the next thing, don't
14392 look for a full character class */
14393 bool allow_multi_folds,
14394 const bool silence_non_portable, /* Don't output warnings
14398 bool optimizable, /* ? Allow a non-ANYOF return
14400 SV** ret_invlist /* Return an inversion list, not a node */
14403 /* parse a bracketed class specification. Most of these will produce an
14404 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14405 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
14406 * under /i with multi-character folds: it will be rewritten following the
14407 * paradigm of this example, where the <multi-fold>s are characters which
14408 * fold to multiple character sequences:
14409 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14410 * gets effectively rewritten as:
14411 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14412 * reg() gets called (recursively) on the rewritten version, and this
14413 * function will return what it constructs. (Actually the <multi-fold>s
14414 * aren't physically removed from the [abcdefghi], it's just that they are
14415 * ignored in the recursion by means of a flag:
14416 * <RExC_in_multi_char_class>.)
14418 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14419 * characters, with the corresponding bit set if that character is in the
14420 * list. For characters above this, a range list or swash is used. There
14421 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14422 * determinable at compile time
14424 * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
14425 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
14426 * to UTF-8. This can only happen if ret_invlist is non-NULL.
14429 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14431 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14434 IV namedclass = OOB_NAMEDCLASS;
14435 char *rangebegin = NULL;
14436 bool need_class = 0;
14438 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14439 than just initialized. */
14440 SV* properties = NULL; /* Code points that match \p{} \P{} */
14441 SV* posixes = NULL; /* Code points that match classes like [:word:],
14442 extended beyond the Latin1 range. These have to
14443 be kept separate from other code points for much
14444 of this function because their handling is
14445 different under /i, and for most classes under
14447 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
14448 separate for a while from the non-complemented
14449 versions because of complications with /d
14451 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14452 treated more simply than the general case,
14453 leading to less compilation and execution
14455 UV element_count = 0; /* Number of distinct elements in the class.
14456 Optimizations may be possible if this is tiny */
14457 AV * multi_char_matches = NULL; /* Code points that fold to more than one
14458 character; used under /i */
14460 char * stop_ptr = RExC_end; /* where to stop parsing */
14461 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14464 /* Unicode properties are stored in a swash; this holds the current one
14465 * being parsed. If this swash is the only above-latin1 component of the
14466 * character class, an optimization is to pass it directly on to the
14467 * execution engine. Otherwise, it is set to NULL to indicate that there
14468 * are other things in the class that have to be dealt with at execution
14470 SV* swash = NULL; /* Code points that match \p{} \P{} */
14472 /* Set if a component of this character class is user-defined; just passed
14473 * on to the engine */
14474 bool has_user_defined_property = FALSE;
14476 /* inversion list of code points this node matches only when the target
14477 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
14479 SV* has_upper_latin1_only_utf8_matches = NULL;
14481 /* Inversion list of code points this node matches regardless of things
14482 * like locale, folding, utf8ness of the target string */
14483 SV* cp_list = NULL;
14485 /* Like cp_list, but code points on this list need to be checked for things
14486 * that fold to/from them under /i */
14487 SV* cp_foldable_list = NULL;
14489 /* Like cp_list, but code points on this list are valid only when the
14490 * runtime locale is UTF-8 */
14491 SV* only_utf8_locale_list = NULL;
14493 /* In a range, if one of the endpoints is non-character-set portable,
14494 * meaning that it hard-codes a code point that may mean a different
14495 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14496 * mnemonic '\t' which each mean the same character no matter which
14497 * character set the platform is on. */
14498 unsigned int non_portable_endpoint = 0;
14500 /* Is the range unicode? which means on a platform that isn't 1-1 native
14501 * to Unicode (i.e. non-ASCII), each code point in it should be considered
14502 * to be a Unicode value. */
14503 bool unicode_range = FALSE;
14504 bool invert = FALSE; /* Is this class to be complemented */
14506 bool warn_super = ALWAYS_WARN_SUPER;
14508 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14509 case we need to change the emitted regop to an EXACT. */
14510 const char * orig_parse = RExC_parse;
14511 const SSize_t orig_size = RExC_size;
14512 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14513 GET_RE_DEBUG_FLAGS_DECL;
14515 PERL_ARGS_ASSERT_REGCLASS;
14517 PERL_UNUSED_ARG(depth);
14520 DEBUG_PARSE("clas");
14522 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
14523 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
14524 && UNICODE_DOT_DOT_VERSION == 0)
14525 allow_multi_folds = FALSE;
14528 /* Assume we are going to generate an ANYOF node. */
14529 ret = reganode(pRExC_state,
14536 RExC_size += ANYOF_SKIP;
14537 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14540 ANYOF_FLAGS(ret) = 0;
14542 RExC_emit += ANYOF_SKIP;
14543 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14544 initial_listsv_len = SvCUR(listsv);
14545 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
14548 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14550 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
14553 allow_multi_folds = FALSE;
14555 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14558 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14559 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14560 const char *s = RExC_parse;
14561 const char c = *s++;
14566 while (isWORDCHAR(*s))
14568 if (*s && c == *s && s[1] == ']') {
14569 SAVEFREESV(RExC_rx_sv);
14571 "POSIX syntax [%c %c] belongs inside character classes",
14573 (void)ReREFCNT_inc(RExC_rx_sv);
14577 /* If the caller wants us to just parse a single element, accomplish this
14578 * by faking the loop ending condition */
14579 if (stop_at_1 && RExC_end > RExC_parse) {
14580 stop_ptr = RExC_parse + 1;
14583 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14584 if (UCHARAT(RExC_parse) == ']')
14585 goto charclassloop;
14588 if (RExC_parse >= stop_ptr) {
14592 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
14594 if (UCHARAT(RExC_parse) == ']') {
14600 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14601 save_value = value;
14602 save_prevvalue = prevvalue;
14605 rangebegin = RExC_parse;
14607 non_portable_endpoint = 0;
14610 value = utf8n_to_uvchr((U8*)RExC_parse,
14611 RExC_end - RExC_parse,
14612 &numlen, UTF8_ALLOW_DEFAULT);
14613 RExC_parse += numlen;
14616 value = UCHARAT(RExC_parse++);
14619 && RExC_parse < RExC_end
14620 && POSIXCC(UCHARAT(RExC_parse)))
14622 namedclass = regpposixcc(pRExC_state, value, strict);
14624 else if (value == '\\') {
14625 /* Is a backslash; get the code point of the char after it */
14626 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14627 value = utf8n_to_uvchr((U8*)RExC_parse,
14628 RExC_end - RExC_parse,
14629 &numlen, UTF8_ALLOW_DEFAULT);
14630 RExC_parse += numlen;
14633 value = UCHARAT(RExC_parse++);
14635 /* Some compilers cannot handle switching on 64-bit integer
14636 * values, therefore value cannot be an UV. Yes, this will
14637 * be a problem later if we want switch on Unicode.
14638 * A similar issue a little bit later when switching on
14639 * namedclass. --jhi */
14641 /* If the \ is escaping white space when white space is being
14642 * skipped, it means that that white space is wanted literally, and
14643 * is already in 'value'. Otherwise, need to translate the escape
14644 * into what it signifies. */
14645 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
14647 case 'w': namedclass = ANYOF_WORDCHAR; break;
14648 case 'W': namedclass = ANYOF_NWORDCHAR; break;
14649 case 's': namedclass = ANYOF_SPACE; break;
14650 case 'S': namedclass = ANYOF_NSPACE; break;
14651 case 'd': namedclass = ANYOF_DIGIT; break;
14652 case 'D': namedclass = ANYOF_NDIGIT; break;
14653 case 'v': namedclass = ANYOF_VERTWS; break;
14654 case 'V': namedclass = ANYOF_NVERTWS; break;
14655 case 'h': namedclass = ANYOF_HORIZWS; break;
14656 case 'H': namedclass = ANYOF_NHORIZWS; break;
14657 case 'N': /* Handle \N{NAME} in class */
14659 const char * const backslash_N_beg = RExC_parse - 2;
14662 if (! grok_bslash_N(pRExC_state,
14663 NULL, /* No regnode */
14664 &value, /* Yes single value */
14665 &cp_count, /* Multiple code pt count */
14670 if (*flagp & NEED_UTF8)
14671 FAIL("panic: grok_bslash_N set NEED_UTF8");
14672 if (*flagp & RESTART_PASS1)
14675 if (cp_count < 0) {
14676 vFAIL("\\N in a character class must be a named character: \\N{...}");
14678 else if (cp_count == 0) {
14680 RExC_parse++; /* Position after the "}" */
14681 vFAIL("Zero length \\N{}");
14684 ckWARNreg(RExC_parse,
14685 "Ignoring zero length \\N{} in character class");
14688 else { /* cp_count > 1 */
14689 if (! RExC_in_multi_char_class) {
14690 if (invert || range || *RExC_parse == '-') {
14693 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14696 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14698 break; /* <value> contains the first code
14699 point. Drop out of the switch to
14703 SV * multi_char_N = newSVpvn(backslash_N_beg,
14704 RExC_parse - backslash_N_beg);
14706 = add_multi_match(multi_char_matches,
14711 } /* End of cp_count != 1 */
14713 /* This element should not be processed further in this
14716 value = save_value;
14717 prevvalue = save_prevvalue;
14718 continue; /* Back to top of loop to get next char */
14721 /* Here, is a single code point, and <value> contains it */
14722 unicode_range = TRUE; /* \N{} are Unicode */
14730 /* We will handle any undefined properties ourselves */
14731 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14732 /* And we actually would prefer to get
14733 * the straight inversion list of the
14734 * swash, since we will be accessing it
14735 * anyway, to save a little time */
14736 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14738 if (RExC_parse >= RExC_end)
14739 vFAIL2("Empty \\%c{}", (U8)value);
14740 if (*RExC_parse == '{') {
14741 const U8 c = (U8)value;
14742 e = strchr(RExC_parse, '}');
14745 vFAIL2("Missing right brace on \\%c{}", c);
14749 while (isSPACE(*RExC_parse)) {
14753 if (UCHARAT(RExC_parse) == '^') {
14755 /* toggle. (The rhs xor gets the single bit that
14756 * differs between P and p; the other xor inverts just
14758 value ^= 'P' ^ 'p';
14761 while (isSPACE(*RExC_parse)) {
14766 if (e == RExC_parse)
14767 vFAIL2("Empty \\%c{}", c);
14769 n = e - RExC_parse;
14770 while (isSPACE(*(RExC_parse + n - 1)))
14772 } /* The \p isn't immediately followed by a '{' */
14773 else if (! isALPHA(*RExC_parse)) {
14774 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14775 vFAIL2("Character following \\%c must be '{' or a "
14776 "single-character Unicode property name",
14786 char* base_name; /* name after any packages are stripped */
14787 const char * const colon_colon = "::";
14789 /* Try to get the definition of the property into
14790 * <invlist>. If /i is in effect, the effective property
14791 * will have its name be <__NAME_i>. The design is
14792 * discussed in commit
14793 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14794 name = savepv(Perl_form(aTHX_
14796 (FOLD) ? "__" : "",
14802 /* Look up the property name, and get its swash and
14803 * inversion list, if the property is found */
14804 if (swash) { /* Return any left-overs */
14805 SvREFCNT_dec_NN(swash);
14807 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14810 NULL, /* No inversion list */
14813 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14814 HV* curpkg = (IN_PERL_COMPILETIME)
14816 : CopSTASH(PL_curcop);
14820 if (swash) { /* Got a swash but no inversion list.
14821 Something is likely wrong that will
14822 be sorted-out later */
14823 SvREFCNT_dec_NN(swash);
14827 /* Here didn't find it. It could be a an error (like a
14828 * typo) in specifying a Unicode property, or it could
14829 * be a user-defined property that will be available at
14830 * run-time. The names of these must begin with 'In'
14831 * or 'Is' (after any packages are stripped off). So
14832 * if not one of those, or if we accept only
14833 * compile-time properties, is an error; otherwise add
14834 * it to the list for run-time look up. */
14835 if ((base_name = rninstr(name, name + n,
14836 colon_colon, colon_colon + 2)))
14837 { /* Has ::. We know this must be a user-defined
14840 final_n -= base_name - name;
14849 || base_name[0] != 'I'
14850 || (base_name[1] != 's' && base_name[1] != 'n')
14853 const char * const msg
14855 ? "Illegal user-defined property name"
14856 : "Can't find Unicode property definition";
14857 RExC_parse = e + 1;
14859 /* diag_listed_as: Can't find Unicode property definition "%s" */
14860 vFAIL3utf8f("%s \"%"UTF8f"\"",
14861 msg, UTF8fARG(UTF, n, name));
14864 /* If the property name doesn't already have a package
14865 * name, add the current one to it so that it can be
14866 * referred to outside it. [perl #121777] */
14867 if (! has_pkg && curpkg) {
14868 char* pkgname = HvNAME(curpkg);
14869 if (strNE(pkgname, "main")) {
14870 char* full_name = Perl_form(aTHX_
14874 n = strlen(full_name);
14876 name = savepvn(full_name, n);
14879 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14880 (value == 'p' ? '+' : '!'),
14881 UTF8fARG(UTF, n, name));
14882 has_user_defined_property = TRUE;
14883 optimizable = FALSE; /* Will have to leave this an
14886 /* We don't know yet what this matches, so have to flag
14888 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
14892 /* Here, did get the swash and its inversion list. If
14893 * the swash is from a user-defined property, then this
14894 * whole character class should be regarded as such */
14895 if (swash_init_flags
14896 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14898 has_user_defined_property = TRUE;
14901 /* We warn on matching an above-Unicode code point
14902 * if the match would return true, except don't
14903 * warn for \p{All}, which has exactly one element
14905 (_invlist_contains_cp(invlist, 0x110000)
14906 && (! (_invlist_len(invlist) == 1
14907 && *invlist_array(invlist) == 0)))
14913 /* Invert if asking for the complement */
14914 if (value == 'P') {
14915 _invlist_union_complement_2nd(properties,
14919 /* The swash can't be used as-is, because we've
14920 * inverted things; delay removing it to here after
14921 * have copied its invlist above */
14922 SvREFCNT_dec_NN(swash);
14926 _invlist_union(properties, invlist, &properties);
14931 RExC_parse = e + 1;
14932 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14935 /* \p means they want Unicode semantics */
14936 REQUIRE_UNI_RULES(flagp, NULL);
14939 case 'n': value = '\n'; break;
14940 case 'r': value = '\r'; break;
14941 case 't': value = '\t'; break;
14942 case 'f': value = '\f'; break;
14943 case 'b': value = '\b'; break;
14944 case 'e': value = ESC_NATIVE; break;
14945 case 'a': value = '\a'; break;
14947 RExC_parse--; /* function expects to be pointed at the 'o' */
14949 const char* error_msg;
14950 bool valid = grok_bslash_o(&RExC_parse,
14953 PASS2, /* warnings only in
14956 silence_non_portable,
14962 non_portable_endpoint++;
14963 if (IN_ENCODING && value < 0x100) {
14964 goto recode_encoding;
14968 RExC_parse--; /* function expects to be pointed at the 'x' */
14970 const char* error_msg;
14971 bool valid = grok_bslash_x(&RExC_parse,
14974 PASS2, /* Output warnings */
14976 silence_non_portable,
14982 non_portable_endpoint++;
14983 if (IN_ENCODING && value < 0x100)
14984 goto recode_encoding;
14987 value = grok_bslash_c(*RExC_parse++, PASS2);
14988 non_portable_endpoint++;
14990 case '0': case '1': case '2': case '3': case '4':
14991 case '5': case '6': case '7':
14993 /* Take 1-3 octal digits */
14994 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14995 numlen = (strict) ? 4 : 3;
14996 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14997 RExC_parse += numlen;
15000 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15001 vFAIL("Need exactly 3 octal digits");
15003 else if (! SIZE_ONLY /* like \08, \178 */
15005 && RExC_parse < RExC_end
15006 && isDIGIT(*RExC_parse)
15007 && ckWARN(WARN_REGEXP))
15009 SAVEFREESV(RExC_rx_sv);
15010 reg_warn_non_literal_string(
15012 form_short_octal_warning(RExC_parse, numlen));
15013 (void)ReREFCNT_inc(RExC_rx_sv);
15016 non_portable_endpoint++;
15017 if (IN_ENCODING && value < 0x100)
15018 goto recode_encoding;
15022 if (! RExC_override_recoding) {
15023 SV* enc = _get_encoding();
15024 value = reg_recode((U8)value, &enc);
15027 vFAIL("Invalid escape in the specified encoding");
15030 ckWARNreg(RExC_parse,
15031 "Invalid escape in the specified encoding");
15037 /* Allow \_ to not give an error */
15038 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
15040 vFAIL2("Unrecognized escape \\%c in character class",
15044 SAVEFREESV(RExC_rx_sv);
15045 ckWARN2reg(RExC_parse,
15046 "Unrecognized escape \\%c in character class passed through",
15048 (void)ReREFCNT_inc(RExC_rx_sv);
15052 } /* End of switch on char following backslash */
15053 } /* end of handling backslash escape sequences */
15055 /* Here, we have the current token in 'value' */
15057 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
15060 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
15061 * literal, as is the character that began the false range, i.e.
15062 * the 'a' in the examples */
15065 const int w = (RExC_parse >= rangebegin)
15066 ? RExC_parse - rangebegin
15070 "False [] range \"%"UTF8f"\"",
15071 UTF8fARG(UTF, w, rangebegin));
15074 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
15075 ckWARN2reg(RExC_parse,
15076 "False [] range \"%"UTF8f"\"",
15077 UTF8fARG(UTF, w, rangebegin));
15078 (void)ReREFCNT_inc(RExC_rx_sv);
15079 cp_list = add_cp_to_invlist(cp_list, '-');
15080 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
15085 range = 0; /* this was not a true range */
15086 element_count += 2; /* So counts for three values */
15089 classnum = namedclass_to_classnum(namedclass);
15091 if (LOC && namedclass < ANYOF_POSIXL_MAX
15092 #ifndef HAS_ISASCII
15093 && classnum != _CC_ASCII
15096 /* What the Posix classes (like \w, [:space:]) match in locale
15097 * isn't knowable under locale until actual match time. Room
15098 * must be reserved (one time per outer bracketed class) to
15099 * store such classes. The space will contain a bit for each
15100 * named class that is to be matched against. This isn't
15101 * needed for \p{} and pseudo-classes, as they are not affected
15102 * by locale, and hence are dealt with separately */
15103 if (! need_class) {
15106 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
15109 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
15111 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
15112 ANYOF_POSIXL_ZERO(ret);
15114 /* We can't change this into some other type of node
15115 * (unless this is the only element, in which case there
15116 * are nodes that mean exactly this) as has runtime
15118 optimizable = FALSE;
15121 /* Coverity thinks it is possible for this to be negative; both
15122 * jhi and khw think it's not, but be safer */
15123 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15124 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
15126 /* See if it already matches the complement of this POSIX
15128 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
15129 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
15133 posixl_matches_all = TRUE;
15134 break; /* No need to continue. Since it matches both
15135 e.g., \w and \W, it matches everything, and the
15136 bracketed class can be optimized into qr/./s */
15139 /* Add this class to those that should be checked at runtime */
15140 ANYOF_POSIXL_SET(ret, namedclass);
15142 /* The above-Latin1 characters are not subject to locale rules.
15143 * Just add them, in the second pass, to the
15144 * unconditionally-matched list */
15146 SV* scratch_list = NULL;
15148 /* Get the list of the above-Latin1 code points this
15150 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
15151 PL_XPosix_ptrs[classnum],
15153 /* Odd numbers are complements, like
15154 * NDIGIT, NASCII, ... */
15155 namedclass % 2 != 0,
15157 /* Checking if 'cp_list' is NULL first saves an extra
15158 * clone. Its reference count will be decremented at the
15159 * next union, etc, or if this is the only instance, at the
15160 * end of the routine */
15162 cp_list = scratch_list;
15165 _invlist_union(cp_list, scratch_list, &cp_list);
15166 SvREFCNT_dec_NN(scratch_list);
15168 continue; /* Go get next character */
15171 else if (! SIZE_ONLY) {
15173 /* Here, not in pass1 (in that pass we skip calculating the
15174 * contents of this class), and is /l, or is a POSIX class for
15175 * which /l doesn't matter (or is a Unicode property, which is
15176 * skipped here). */
15177 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
15178 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
15180 /* Here, should be \h, \H, \v, or \V. None of /d, /i
15181 * nor /l make a difference in what these match,
15182 * therefore we just add what they match to cp_list. */
15183 if (classnum != _CC_VERTSPACE) {
15184 assert( namedclass == ANYOF_HORIZWS
15185 || namedclass == ANYOF_NHORIZWS);
15187 /* It turns out that \h is just a synonym for
15189 classnum = _CC_BLANK;
15192 _invlist_union_maybe_complement_2nd(
15194 PL_XPosix_ptrs[classnum],
15195 namedclass % 2 != 0, /* Complement if odd
15196 (NHORIZWS, NVERTWS)
15201 else if (UNI_SEMANTICS
15202 || classnum == _CC_ASCII
15203 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
15204 || classnum == _CC_XDIGIT)))
15206 /* We usually have to worry about /d and /a affecting what
15207 * POSIX classes match, with special code needed for /d
15208 * because we won't know until runtime what all matches.
15209 * But there is no extra work needed under /u, and
15210 * [:ascii:] is unaffected by /a and /d; and :digit: and
15211 * :xdigit: don't have runtime differences under /d. So we
15212 * can special case these, and avoid some extra work below,
15213 * and at runtime. */
15214 _invlist_union_maybe_complement_2nd(
15216 PL_XPosix_ptrs[classnum],
15217 namedclass % 2 != 0,
15220 else { /* Garden variety class. If is NUPPER, NALPHA, ...
15221 complement and use nposixes */
15222 SV** posixes_ptr = namedclass % 2 == 0
15225 _invlist_union_maybe_complement_2nd(
15227 PL_XPosix_ptrs[classnum],
15228 namedclass % 2 != 0,
15232 } /* end of namedclass \blah */
15234 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
15236 /* If 'range' is set, 'value' is the ending of a range--check its
15237 * validity. (If value isn't a single code point in the case of a
15238 * range, we should have figured that out above in the code that
15239 * catches false ranges). Later, we will handle each individual code
15240 * point in the range. If 'range' isn't set, this could be the
15241 * beginning of a range, so check for that by looking ahead to see if
15242 * the next real character to be processed is the range indicator--the
15247 /* For unicode ranges, we have to test that the Unicode as opposed
15248 * to the native values are not decreasing. (Above 255, there is
15249 * no difference between native and Unicode) */
15250 if (unicode_range && prevvalue < 255 && value < 255) {
15251 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
15252 goto backwards_range;
15257 if (prevvalue > value) /* b-a */ {
15262 w = RExC_parse - rangebegin;
15264 "Invalid [] range \"%"UTF8f"\"",
15265 UTF8fARG(UTF, w, rangebegin));
15266 NOT_REACHED; /* NOTREACHED */
15270 prevvalue = value; /* save the beginning of the potential range */
15271 if (! stop_at_1 /* Can't be a range if parsing just one thing */
15272 && *RExC_parse == '-')
15274 char* next_char_ptr = RExC_parse + 1;
15276 /* Get the next real char after the '-' */
15277 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
15279 /* If the '-' is at the end of the class (just before the ']',
15280 * it is a literal minus; otherwise it is a range */
15281 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
15282 RExC_parse = next_char_ptr;
15284 /* a bad range like \w-, [:word:]- ? */
15285 if (namedclass > OOB_NAMEDCLASS) {
15286 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
15287 const int w = RExC_parse >= rangebegin
15288 ? RExC_parse - rangebegin
15291 vFAIL4("False [] range \"%*.*s\"",
15296 "False [] range \"%*.*s\"",
15301 cp_list = add_cp_to_invlist(cp_list, '-');
15305 range = 1; /* yeah, it's a range! */
15306 continue; /* but do it the next time */
15311 if (namedclass > OOB_NAMEDCLASS) {
15315 /* Here, we have a single value this time through the loop, and
15316 * <prevvalue> is the beginning of the range, if any; or <value> if
15319 /* non-Latin1 code point implies unicode semantics. Must be set in
15320 * pass1 so is there for the whole of pass 2 */
15322 REQUIRE_UNI_RULES(flagp, NULL);
15325 /* Ready to process either the single value, or the completed range.
15326 * For single-valued non-inverted ranges, we consider the possibility
15327 * of multi-char folds. (We made a conscious decision to not do this
15328 * for the other cases because it can often lead to non-intuitive
15329 * results. For example, you have the peculiar case that:
15330 * "s s" =~ /^[^\xDF]+$/i => Y
15331 * "ss" =~ /^[^\xDF]+$/i => N
15333 * See [perl #89750] */
15334 if (FOLD && allow_multi_folds && value == prevvalue) {
15335 if (value == LATIN_SMALL_LETTER_SHARP_S
15336 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
15339 /* Here <value> is indeed a multi-char fold. Get what it is */
15341 U8 foldbuf[UTF8_MAXBYTES_CASE];
15344 UV folded = _to_uni_fold_flags(
15348 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
15349 ? FOLD_FLAGS_NOMIX_ASCII
15353 /* Here, <folded> should be the first character of the
15354 * multi-char fold of <value>, with <foldbuf> containing the
15355 * whole thing. But, if this fold is not allowed (because of
15356 * the flags), <fold> will be the same as <value>, and should
15357 * be processed like any other character, so skip the special
15359 if (folded != value) {
15361 /* Skip if we are recursed, currently parsing the class
15362 * again. Otherwise add this character to the list of
15363 * multi-char folds. */
15364 if (! RExC_in_multi_char_class) {
15365 STRLEN cp_count = utf8_length(foldbuf,
15366 foldbuf + foldlen);
15367 SV* multi_fold = sv_2mortal(newSVpvs(""));
15369 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15372 = add_multi_match(multi_char_matches,
15378 /* This element should not be processed further in this
15381 value = save_value;
15382 prevvalue = save_prevvalue;
15388 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15391 /* If the range starts above 255, everything is portable and
15392 * likely to be so for any forseeable character set, so don't
15394 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15395 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15397 else if (prevvalue != value) {
15399 /* Under strict, ranges that stop and/or end in an ASCII
15400 * printable should have each end point be a portable value
15401 * for it (preferably like 'A', but we don't warn if it is
15402 * a (portable) Unicode name or code point), and the range
15403 * must be be all digits or all letters of the same case.
15404 * Otherwise, the range is non-portable and unclear as to
15405 * what it contains */
15406 if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15407 && (non_portable_endpoint
15408 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15409 || (isLOWER_A(prevvalue) && isLOWER_A(value))
15410 || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15412 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15414 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15416 /* But the nature of Unicode and languages mean we
15417 * can't do the same checks for above-ASCII ranges,
15418 * except in the case of digit ones. These should
15419 * contain only digits from the same group of 10. The
15420 * ASCII case is handled just above. 0x660 is the
15421 * first digit character beyond ASCII. Hence here, the
15422 * range could be a range of digits. Find out. */
15423 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15425 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15428 /* If the range start and final points are in the same
15429 * inversion list element, it means that either both
15430 * are not digits, or both are digits in a consecutive
15431 * sequence of digits. (So far, Unicode has kept all
15432 * such sequences as distinct groups of 10, but assert
15433 * to make sure). If the end points are not in the
15434 * same element, neither should be a digit. */
15435 if (index_start == index_final) {
15436 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15437 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15438 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15440 /* But actually Unicode did have one group of 11
15441 * 'digits' in 5.2, so in case we are operating
15442 * on that version, let that pass */
15443 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15444 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15446 && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15450 else if ((index_start >= 0
15451 && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15452 || (index_final >= 0
15453 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15455 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15460 if ((! range || prevvalue == value) && non_portable_endpoint) {
15461 if (isPRINT_A(value)) {
15464 if (isBACKSLASHED_PUNCT(value)) {
15465 literal[d++] = '\\';
15467 literal[d++] = (char) value;
15468 literal[d++] = '\0';
15471 "\"%.*s\" is more clearly written simply as \"%s\"",
15472 (int) (RExC_parse - rangebegin),
15477 else if isMNEMONIC_CNTRL(value) {
15479 "\"%.*s\" is more clearly written simply as \"%s\"",
15480 (int) (RExC_parse - rangebegin),
15482 cntrl_to_mnemonic((char) value)
15488 /* Deal with this element of the class */
15492 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15495 /* On non-ASCII platforms, for ranges that span all of 0..255, and
15496 * ones that don't require special handling, we can just add the
15497 * range like we do for ASCII platforms */
15498 if ((UNLIKELY(prevvalue == 0) && value >= 255)
15499 || ! (prevvalue < 256
15501 || (! non_portable_endpoint
15502 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15503 || (isUPPER_A(prevvalue)
15504 && isUPPER_A(value)))))))
15506 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15510 /* Here, requires special handling. This can be because it is
15511 * a range whose code points are considered to be Unicode, and
15512 * so must be individually translated into native, or because
15513 * its a subrange of 'A-Z' or 'a-z' which each aren't
15514 * contiguous in EBCDIC, but we have defined them to include
15515 * only the "expected" upper or lower case ASCII alphabetics.
15516 * Subranges above 255 are the same in native and Unicode, so
15517 * can be added as a range */
15518 U8 start = NATIVE_TO_LATIN1(prevvalue);
15520 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15521 for (j = start; j <= end; j++) {
15522 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15525 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15532 range = 0; /* this range (if it was one) is done now */
15533 } /* End of loop through all the text within the brackets */
15535 /* If anything in the class expands to more than one character, we have to
15536 * deal with them by building up a substitute parse string, and recursively
15537 * calling reg() on it, instead of proceeding */
15538 if (multi_char_matches) {
15539 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15542 char *save_end = RExC_end;
15543 char *save_parse = RExC_parse;
15544 char *save_start = RExC_start;
15545 STRLEN prefix_end = 0; /* We copy the character class after a
15546 prefix supplied here. This is the size
15547 + 1 of that prefix */
15548 bool first_time = TRUE; /* First multi-char occurrence doesn't get
15553 assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */
15555 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
15556 because too confusing */
15558 sv_catpv(substitute_parse, "(?:");
15562 /* Look at the longest folds first */
15563 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15565 if (av_exists(multi_char_matches, cp_count)) {
15566 AV** this_array_ptr;
15569 this_array_ptr = (AV**) av_fetch(multi_char_matches,
15571 while ((this_sequence = av_pop(*this_array_ptr)) !=
15574 if (! first_time) {
15575 sv_catpv(substitute_parse, "|");
15577 first_time = FALSE;
15579 sv_catpv(substitute_parse, SvPVX(this_sequence));
15584 /* If the character class contains anything else besides these
15585 * multi-character folds, have to include it in recursive parsing */
15586 if (element_count) {
15587 sv_catpv(substitute_parse, "|[");
15588 prefix_end = SvCUR(substitute_parse);
15589 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15590 sv_catpv(substitute_parse, "]");
15593 sv_catpv(substitute_parse, ")");
15596 /* This is a way to get the parse to skip forward a whole named
15597 * sequence instead of matching the 2nd character when it fails the
15599 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15603 /* Set up the data structure so that any errors will be properly
15604 * reported. See the comments at the definition of
15605 * REPORT_LOCATION_ARGS for details */
15606 RExC_precomp_adj = orig_parse - RExC_precomp;
15607 RExC_start = RExC_parse = SvPV(substitute_parse, len);
15608 RExC_adjusted_start = RExC_start + prefix_end;
15609 RExC_end = RExC_parse + len;
15610 RExC_in_multi_char_class = 1;
15611 RExC_override_recoding = 1;
15612 RExC_emit = (regnode *)orig_emit;
15614 ret = reg(pRExC_state, 1, ®_flags, depth+1);
15616 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
15618 /* And restore so can parse the rest of the pattern */
15619 RExC_parse = save_parse;
15620 RExC_start = RExC_adjusted_start = save_start;
15621 RExC_precomp_adj = 0;
15622 RExC_end = save_end;
15623 RExC_in_multi_char_class = 0;
15624 RExC_override_recoding = 0;
15625 SvREFCNT_dec_NN(multi_char_matches);
15629 /* Here, we've gone through the entire class and dealt with multi-char
15630 * folds. We are now in a position that we can do some checks to see if we
15631 * can optimize this ANYOF node into a simpler one, even in Pass 1.
15632 * Currently we only do two checks:
15633 * 1) is in the unlikely event that the user has specified both, eg. \w and
15634 * \W under /l, then the class matches everything. (This optimization
15635 * is done only to make the optimizer code run later work.)
15636 * 2) if the character class contains only a single element (including a
15637 * single range), we see if there is an equivalent node for it.
15638 * Other checks are possible */
15640 && ! ret_invlist /* Can't optimize if returning the constructed
15642 && (UNLIKELY(posixl_matches_all) || element_count == 1))
15647 if (UNLIKELY(posixl_matches_all)) {
15650 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15651 \w or [:digit:] or \p{foo}
15654 /* All named classes are mapped into POSIXish nodes, with its FLAG
15655 * argument giving which class it is */
15656 switch ((I32)namedclass) {
15657 case ANYOF_UNIPROP:
15660 /* These don't depend on the charset modifiers. They always
15661 * match under /u rules */
15662 case ANYOF_NHORIZWS:
15663 case ANYOF_HORIZWS:
15664 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15667 case ANYOF_NVERTWS:
15672 /* The actual POSIXish node for all the rest depends on the
15673 * charset modifier. The ones in the first set depend only on
15674 * ASCII or, if available on this platform, also locale */
15678 op = (LOC) ? POSIXL : POSIXA;
15684 /* The following don't have any matches in the upper Latin1
15685 * range, hence /d is equivalent to /u for them. Making it /u
15686 * saves some branches at runtime */
15690 case ANYOF_NXDIGIT:
15691 if (! DEPENDS_SEMANTICS) {
15692 goto treat_as_default;
15698 /* The following change to CASED under /i */
15704 namedclass = ANYOF_CASED + (namedclass % 2);
15708 /* The rest have more possibilities depending on the charset.
15709 * We take advantage of the enum ordering of the charset
15710 * modifiers to get the exact node type, */
15713 op = POSIXD + get_regex_charset(RExC_flags);
15714 if (op > POSIXA) { /* /aa is same as /a */
15719 /* The odd numbered ones are the complements of the
15720 * next-lower even number one */
15721 if (namedclass % 2 == 1) {
15725 arg = namedclass_to_classnum(namedclass);
15729 else if (value == prevvalue) {
15731 /* Here, the class consists of just a single code point */
15734 if (! LOC && value == '\n') {
15735 op = REG_ANY; /* Optimize [^\n] */
15736 *flagp |= HASWIDTH|SIMPLE;
15740 else if (value < 256 || UTF) {
15742 /* Optimize a single value into an EXACTish node, but not if it
15743 * would require converting the pattern to UTF-8. */
15744 op = compute_EXACTish(pRExC_state);
15746 } /* Otherwise is a range */
15747 else if (! LOC) { /* locale could vary these */
15748 if (prevvalue == '0') {
15749 if (value == '9') {
15754 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15755 /* We can optimize A-Z or a-z, but not if they could match
15756 * something like the KELVIN SIGN under /i. */
15757 if (prevvalue == 'A') {
15760 && ! non_portable_endpoint
15763 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15767 else if (prevvalue == 'a') {
15770 && ! non_portable_endpoint
15773 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15780 /* Here, we have changed <op> away from its initial value iff we found
15781 * an optimization */
15784 /* Throw away this ANYOF regnode, and emit the calculated one,
15785 * which should correspond to the beginning, not current, state of
15787 const char * cur_parse = RExC_parse;
15788 RExC_parse = (char *)orig_parse;
15792 /* To get locale nodes to not use the full ANYOF size would
15793 * require moving the code above that writes the portions
15794 * of it that aren't in other nodes to after this point.
15795 * e.g. ANYOF_POSIXL_SET */
15796 RExC_size = orig_size;
15800 RExC_emit = (regnode *)orig_emit;
15801 if (PL_regkind[op] == POSIXD) {
15802 if (op == POSIXL) {
15803 RExC_contains_locale = 1;
15806 op += NPOSIXD - POSIXD;
15811 ret = reg_node(pRExC_state, op);
15813 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15817 *flagp |= HASWIDTH|SIMPLE;
15819 else if (PL_regkind[op] == EXACT) {
15820 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15821 TRUE /* downgradable to EXACT */
15825 RExC_parse = (char *) cur_parse;
15827 SvREFCNT_dec(posixes);
15828 SvREFCNT_dec(nposixes);
15829 SvREFCNT_dec(simple_posixes);
15830 SvREFCNT_dec(cp_list);
15831 SvREFCNT_dec(cp_foldable_list);
15838 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15840 /* If folding, we calculate all characters that could fold to or from the
15841 * ones already on the list */
15842 if (cp_foldable_list) {
15844 UV start, end; /* End points of code point ranges */
15846 SV* fold_intersection = NULL;
15849 /* Our calculated list will be for Unicode rules. For locale
15850 * matching, we have to keep a separate list that is consulted at
15851 * runtime only when the locale indicates Unicode rules. For
15852 * non-locale, we just use the general list */
15854 use_list = &only_utf8_locale_list;
15857 use_list = &cp_list;
15860 /* Only the characters in this class that participate in folds need
15861 * be checked. Get the intersection of this class and all the
15862 * possible characters that are foldable. This can quickly narrow
15863 * down a large class */
15864 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15865 &fold_intersection);
15867 /* The folds for all the Latin1 characters are hard-coded into this
15868 * program, but we have to go out to disk to get the others. */
15869 if (invlist_highest(cp_foldable_list) >= 256) {
15871 /* This is a hash that for a particular fold gives all
15872 * characters that are involved in it */
15873 if (! PL_utf8_foldclosures) {
15874 _load_PL_utf8_foldclosures();
15878 /* Now look at the foldable characters in this class individually */
15879 invlist_iterinit(fold_intersection);
15880 while (invlist_iternext(fold_intersection, &start, &end)) {
15883 /* Look at every character in the range */
15884 for (j = start; j <= end; j++) {
15885 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15891 if (IS_IN_SOME_FOLD_L1(j)) {
15893 /* ASCII is always matched; non-ASCII is matched
15894 * only under Unicode rules (which could happen
15895 * under /l if the locale is a UTF-8 one */
15896 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15897 *use_list = add_cp_to_invlist(*use_list,
15898 PL_fold_latin1[j]);
15901 has_upper_latin1_only_utf8_matches
15902 = add_cp_to_invlist(
15903 has_upper_latin1_only_utf8_matches,
15904 PL_fold_latin1[j]);
15908 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15909 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15911 add_above_Latin1_folds(pRExC_state,
15918 /* Here is an above Latin1 character. We don't have the
15919 * rules hard-coded for it. First, get its fold. This is
15920 * the simple fold, as the multi-character folds have been
15921 * handled earlier and separated out */
15922 _to_uni_fold_flags(j, foldbuf, &foldlen,
15923 (ASCII_FOLD_RESTRICTED)
15924 ? FOLD_FLAGS_NOMIX_ASCII
15927 /* Single character fold of above Latin1. Add everything in
15928 * its fold closure to the list that this node should match.
15929 * The fold closures data structure is a hash with the keys
15930 * being the UTF-8 of every character that is folded to, like
15931 * 'k', and the values each an array of all code points that
15932 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
15933 * Multi-character folds are not included */
15934 if ((listp = hv_fetch(PL_utf8_foldclosures,
15935 (char *) foldbuf, foldlen, FALSE)))
15937 AV* list = (AV*) *listp;
15939 for (k = 0; k <= av_tindex(list); k++) {
15940 SV** c_p = av_fetch(list, k, FALSE);
15946 /* /aa doesn't allow folds between ASCII and non- */
15947 if ((ASCII_FOLD_RESTRICTED
15948 && (isASCII(c) != isASCII(j))))
15953 /* Folds under /l which cross the 255/256 boundary
15954 * are added to a separate list. (These are valid
15955 * only when the locale is UTF-8.) */
15956 if (c < 256 && LOC) {
15957 *use_list = add_cp_to_invlist(*use_list, c);
15961 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15963 cp_list = add_cp_to_invlist(cp_list, c);
15966 /* Similarly folds involving non-ascii Latin1
15967 * characters under /d are added to their list */
15968 has_upper_latin1_only_utf8_matches
15969 = add_cp_to_invlist(
15970 has_upper_latin1_only_utf8_matches,
15977 SvREFCNT_dec_NN(fold_intersection);
15980 /* Now that we have finished adding all the folds, there is no reason
15981 * to keep the foldable list separate */
15982 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15983 SvREFCNT_dec_NN(cp_foldable_list);
15986 /* And combine the result (if any) with any inversion list from posix
15987 * classes. The lists are kept separate up to now because we don't want to
15988 * fold the classes (folding of those is automatically handled by the swash
15989 * fetching code) */
15990 if (simple_posixes) {
15991 _invlist_union(cp_list, simple_posixes, &cp_list);
15992 SvREFCNT_dec_NN(simple_posixes);
15994 if (posixes || nposixes) {
15995 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15996 /* Under /a and /aa, nothing above ASCII matches these */
15997 _invlist_intersection(posixes,
15998 PL_XPosix_ptrs[_CC_ASCII],
16002 if (DEPENDS_SEMANTICS) {
16003 /* Under /d, everything in the upper half of the Latin1 range
16004 * matches these complements */
16005 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
16007 else if (AT_LEAST_ASCII_RESTRICTED) {
16008 /* Under /a and /aa, everything above ASCII matches these
16010 _invlist_union_complement_2nd(nposixes,
16011 PL_XPosix_ptrs[_CC_ASCII],
16015 _invlist_union(posixes, nposixes, &posixes);
16016 SvREFCNT_dec_NN(nposixes);
16019 posixes = nposixes;
16022 if (! DEPENDS_SEMANTICS) {
16024 _invlist_union(cp_list, posixes, &cp_list);
16025 SvREFCNT_dec_NN(posixes);
16032 /* Under /d, we put into a separate list the Latin1 things that
16033 * match only when the target string is utf8 */
16034 SV* nonascii_but_latin1_properties = NULL;
16035 _invlist_intersection(posixes, PL_UpperLatin1,
16036 &nonascii_but_latin1_properties);
16037 _invlist_subtract(posixes, nonascii_but_latin1_properties,
16040 _invlist_union(cp_list, posixes, &cp_list);
16041 SvREFCNT_dec_NN(posixes);
16047 if (has_upper_latin1_only_utf8_matches) {
16048 _invlist_union(has_upper_latin1_only_utf8_matches,
16049 nonascii_but_latin1_properties,
16050 &has_upper_latin1_only_utf8_matches);
16051 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
16054 has_upper_latin1_only_utf8_matches
16055 = nonascii_but_latin1_properties;
16060 /* And combine the result (if any) with any inversion list from properties.
16061 * The lists are kept separate up to now so that we can distinguish the two
16062 * in regards to matching above-Unicode. A run-time warning is generated
16063 * if a Unicode property is matched against a non-Unicode code point. But,
16064 * we allow user-defined properties to match anything, without any warning,
16065 * and we also suppress the warning if there is a portion of the character
16066 * class that isn't a Unicode property, and which matches above Unicode, \W
16067 * or [\x{110000}] for example.
16068 * (Note that in this case, unlike the Posix one above, there is no
16069 * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
16070 * forces Unicode semantics */
16074 /* If it matters to the final outcome, see if a non-property
16075 * component of the class matches above Unicode. If so, the
16076 * warning gets suppressed. This is true even if just a single
16077 * such code point is specified, as though not strictly correct if
16078 * another such code point is matched against, the fact that they
16079 * are using above-Unicode code points indicates they should know
16080 * the issues involved */
16082 warn_super = ! (invert
16083 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
16086 _invlist_union(properties, cp_list, &cp_list);
16087 SvREFCNT_dec_NN(properties);
16090 cp_list = properties;
16095 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
16097 /* Because an ANYOF node is the only one that warns, this node
16098 * can't be optimized into something else */
16099 optimizable = FALSE;
16103 /* Here, we have calculated what code points should be in the character
16106 * Now we can see about various optimizations. Fold calculation (which we
16107 * did above) needs to take place before inversion. Otherwise /[^k]/i
16108 * would invert to include K, which under /i would match k, which it
16109 * shouldn't. Therefore we can't invert folded locale now, as it won't be
16110 * folded until runtime */
16112 /* If we didn't do folding, it's because some information isn't available
16113 * until runtime; set the run-time fold flag for these. (We don't have to
16114 * worry about properties folding, as that is taken care of by the swash
16115 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
16116 * locales, or the class matches at least one 0-255 range code point */
16118 if (only_utf8_locale_list) {
16121 |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16123 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
16125 invlist_iterinit(cp_list);
16126 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
16127 ANYOF_FLAGS(ret) |= ANYOFL_FOLD;
16129 invlist_iterfinish(cp_list);
16133 #define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \
16134 ( DEPENDS_SEMANTICS \
16135 && ANYOF_FLAGS(ret) \
16136 & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
16138 /* See if we can simplify things under /d */
16139 if ( has_upper_latin1_only_utf8_matches
16140 || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
16142 if (has_upper_latin1_only_utf8_matches) {
16143 if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
16145 /* Here, we have two, almost opposite, constraints in effect
16146 * for upper latin1 characters. The macro means they all match
16147 * when the target string ISN'T in UTF-8.
16148 * 'has_upper_latin1_only_utf8_matches' contains the chars that
16149 * match only if the target string IS UTF-8. Therefore the
16150 * ones in 'has_upper_latin1_only_utf8_matches' match
16151 * regardless of UTF-8, so can be added to the regular list,
16152 * and 'has_upper_latin1_only_utf8_matches' cleared */
16153 _invlist_union(cp_list,
16154 has_upper_latin1_only_utf8_matches,
16156 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
16157 has_upper_latin1_only_utf8_matches = NULL;
16159 else if (cp_list) {
16161 /* Here, 'cp_list' gives chars that always match, and
16162 * 'has_upper_latin1_only_utf8_matches' gives chars that were
16163 * specified to match only if the target string is in UTF-8.
16164 * It may be that these overlap, so we can subtract the
16165 * unconditionally matching from the conditional ones, to make
16166 * the conditional list as small as possible, perhaps even
16167 * clearing it, in which case more optimizations are possible
16169 _invlist_subtract(has_upper_latin1_only_utf8_matches,
16171 &has_upper_latin1_only_utf8_matches);
16172 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
16173 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
16174 has_upper_latin1_only_utf8_matches = NULL;
16179 /* Similarly, if the unconditional matches include every upper latin1
16180 * character, we can clear that flag to permit later optimizations */
16181 if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
16182 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
16183 _invlist_subtract(only_non_utf8_list, cp_list, &only_non_utf8_list);
16184 if (_invlist_len(only_non_utf8_list) == 0) {
16185 ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
16187 SvREFCNT_dec_NN(only_non_utf8_list);
16188 only_non_utf8_list = NULL;;
16191 /* If we haven't gotten rid of all conditional matching, we change the
16192 * regnode type to indicate that */
16193 if ( has_upper_latin1_only_utf8_matches
16194 || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
16197 optimizable = FALSE;
16200 #undef MATCHES_ALL_NON_UTF8_NON_ASCII
16202 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
16203 * at compile time. Besides not inverting folded locale now, we can't
16204 * invert if there are things such as \w, which aren't known until runtime
16208 && OP(ret) != ANYOFD
16209 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
16210 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16212 _invlist_invert(cp_list);
16214 /* Any swash can't be used as-is, because we've inverted things */
16216 SvREFCNT_dec_NN(swash);
16220 /* Clear the invert flag since have just done it here */
16227 *ret_invlist = cp_list;
16228 SvREFCNT_dec(swash);
16230 /* Discard the generated node */
16232 RExC_size = orig_size;
16235 RExC_emit = orig_emit;
16240 /* Some character classes are equivalent to other nodes. Such nodes take
16241 * up less room and generally fewer operations to execute than ANYOF nodes.
16242 * Above, we checked for and optimized into some such equivalents for
16243 * certain common classes that are easy to test. Getting to this point in
16244 * the code means that the class didn't get optimized there. Since this
16245 * code is only executed in Pass 2, it is too late to save space--it has
16246 * been allocated in Pass 1, and currently isn't given back. But turning
16247 * things into an EXACTish node can allow the optimizer to join it to any
16248 * adjacent such nodes. And if the class is equivalent to things like /./,
16249 * expensive run-time swashes can be avoided. Now that we have more
16250 * complete information, we can find things necessarily missed by the
16251 * earlier code. Another possible "optimization" that isn't done is that
16252 * something like [Ee] could be changed into an EXACTFU. khw tried this
16253 * and found that the ANYOF is faster, including for code points not in the
16254 * bitmap. This still might make sense to do, provided it got joined with
16255 * an adjacent node(s) to create a longer EXACTFU one. This could be
16256 * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
16257 * routine would know is joinable. If that didn't happen, the node type
16258 * could then be made a straight ANYOF */
16260 if (optimizable && cp_list && ! invert) {
16262 U8 op = END; /* The optimzation node-type */
16263 int posix_class = -1; /* Illegal value */
16264 const char * cur_parse= RExC_parse;
16266 invlist_iterinit(cp_list);
16267 if (! invlist_iternext(cp_list, &start, &end)) {
16269 /* Here, the list is empty. This happens, for example, when a
16270 * Unicode property that doesn't match anything is the only element
16271 * in the character class (perluniprops.pod notes such properties).
16274 *flagp |= HASWIDTH|SIMPLE;
16276 else if (start == end) { /* The range is a single code point */
16277 if (! invlist_iternext(cp_list, &start, &end)
16279 /* Don't do this optimization if it would require changing
16280 * the pattern to UTF-8 */
16281 && (start < 256 || UTF))
16283 /* Here, the list contains a single code point. Can optimize
16284 * into an EXACTish node */
16295 /* A locale node under folding with one code point can be
16296 * an EXACTFL, as its fold won't be calculated until
16302 /* Here, we are generally folding, but there is only one
16303 * code point to match. If we have to, we use an EXACT
16304 * node, but it would be better for joining with adjacent
16305 * nodes in the optimization pass if we used the same
16306 * EXACTFish node that any such are likely to be. We can
16307 * do this iff the code point doesn't participate in any
16308 * folds. For example, an EXACTF of a colon is the same as
16309 * an EXACT one, since nothing folds to or from a colon. */
16311 if (IS_IN_SOME_FOLD_L1(value)) {
16316 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
16321 /* If we haven't found the node type, above, it means we
16322 * can use the prevailing one */
16324 op = compute_EXACTish(pRExC_state);
16328 } /* End of first range contains just a single code point */
16329 else if (start == 0) {
16330 if (end == UV_MAX) {
16332 *flagp |= HASWIDTH|SIMPLE;
16335 else if (end == '\n' - 1
16336 && invlist_iternext(cp_list, &start, &end)
16337 && start == '\n' + 1 && end == UV_MAX)
16340 *flagp |= HASWIDTH|SIMPLE;
16344 invlist_iterfinish(cp_list);
16347 const UV cp_list_len = _invlist_len(cp_list);
16348 const UV* cp_list_array = invlist_array(cp_list);
16350 /* Here, didn't find an optimization. See if this matches any of
16351 * the POSIX classes. These run slightly faster for above-Unicode
16352 * code points, so don't bother with POSIXA ones nor the 2 that
16353 * have no above-Unicode matches. We can avoid these checks unless
16354 * the ANYOF matches at least as high as the lowest POSIX one
16355 * (which was manually found to be \v. The actual code point may
16356 * increase in later Unicode releases, if a higher code point is
16357 * assigned to be \v, but this code will never break. It would
16358 * just mean we could execute the checks for posix optimizations
16359 * unnecessarily) */
16361 if (cp_list_array[cp_list_len-1] > 0x2029) {
16362 for (posix_class = 0;
16363 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
16367 if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
16370 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
16372 /* Check if matches normal or inverted */
16373 if (_invlistEQ(cp_list,
16374 PL_XPosix_ptrs[posix_class],
16377 op = (try_inverted)
16380 *flagp |= HASWIDTH|SIMPLE;
16390 RExC_parse = (char *)orig_parse;
16391 RExC_emit = (regnode *)orig_emit;
16393 if (regarglen[op]) {
16394 ret = reganode(pRExC_state, op, 0);
16396 ret = reg_node(pRExC_state, op);
16399 RExC_parse = (char *)cur_parse;
16401 if (PL_regkind[op] == EXACT) {
16402 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
16403 TRUE /* downgradable to EXACT */
16406 else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
16407 FLAGS(ret) = posix_class;
16410 SvREFCNT_dec_NN(cp_list);
16415 /* Here, <cp_list> contains all the code points we can determine at
16416 * compile time that match under all conditions. Go through it, and
16417 * for things that belong in the bitmap, put them there, and delete from
16418 * <cp_list>. While we are at it, see if everything above 255 is in the
16419 * list, and if so, set a flag to speed up execution */
16421 populate_ANYOF_from_invlist(ret, &cp_list);
16424 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
16427 /* Here, the bitmap has been populated with all the Latin1 code points that
16428 * always match. Can now add to the overall list those that match only
16429 * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
16431 if (has_upper_latin1_only_utf8_matches) {
16433 _invlist_union(cp_list,
16434 has_upper_latin1_only_utf8_matches,
16436 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
16439 cp_list = has_upper_latin1_only_utf8_matches;
16441 ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
16444 /* If there is a swash and more than one element, we can't use the swash in
16445 * the optimization below. */
16446 if (swash && element_count > 1) {
16447 SvREFCNT_dec_NN(swash);
16451 /* Note that the optimization of using 'swash' if it is the only thing in
16452 * the class doesn't have us change swash at all, so it can include things
16453 * that are also in the bitmap; otherwise we have purposely deleted that
16454 * duplicate information */
16455 set_ANYOF_arg(pRExC_state, ret, cp_list,
16456 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
16458 only_utf8_locale_list,
16459 swash, has_user_defined_property);
16461 *flagp |= HASWIDTH|SIMPLE;
16463 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
16464 RExC_contains_locale = 1;
16470 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
16473 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
16474 regnode* const node,
16476 SV* const runtime_defns,
16477 SV* const only_utf8_locale_list,
16479 const bool has_user_defined_property)
16481 /* Sets the arg field of an ANYOF-type node 'node', using information about
16482 * the node passed-in. If there is nothing outside the node's bitmap, the
16483 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
16484 * the count returned by add_data(), having allocated and stored an array,
16485 * av, that that count references, as follows:
16486 * av[0] stores the character class description in its textual form.
16487 * This is used later (regexec.c:Perl_regclass_swash()) to
16488 * initialize the appropriate swash, and is also useful for dumping
16489 * the regnode. This is set to &PL_sv_undef if the textual
16490 * description is not needed at run-time (as happens if the other
16491 * elements completely define the class)
16492 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
16493 * computed from av[0]. But if no further computation need be done,
16494 * the swash is stored here now (and av[0] is &PL_sv_undef).
16495 * av[2] stores the inversion list of code points that match only if the
16496 * current locale is UTF-8
16497 * av[3] stores the cp_list inversion list for use in addition or instead
16498 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16499 * (Otherwise everything needed is already in av[0] and av[1])
16500 * av[4] is set if any component of the class is from a user-defined
16501 * property; used only if av[3] exists */
16505 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16507 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16508 assert(! (ANYOF_FLAGS(node)
16509 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
16510 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16513 AV * const av = newAV();
16516 av_store(av, 0, (runtime_defns)
16517 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16520 av_store(av, 1, swash);
16521 SvREFCNT_dec_NN(cp_list);
16524 av_store(av, 1, &PL_sv_undef);
16526 av_store(av, 3, cp_list);
16527 av_store(av, 4, newSVuv(has_user_defined_property));
16531 if (only_utf8_locale_list) {
16532 av_store(av, 2, only_utf8_locale_list);
16535 av_store(av, 2, &PL_sv_undef);
16538 rv = newRV_noinc(MUTABLE_SV(av));
16539 n = add_data(pRExC_state, STR_WITH_LEN("s"));
16540 RExC_rxi->data->data[n] = (void*)rv;
16545 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16547 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16548 const regnode* node,
16551 SV** only_utf8_locale_ptr,
16555 /* For internal core use only.
16556 * Returns the swash for the input 'node' in the regex 'prog'.
16557 * If <doinit> is 'true', will attempt to create the swash if not already
16559 * If <listsvp> is non-null, will return the printable contents of the
16560 * swash. This can be used to get debugging information even before the
16561 * swash exists, by calling this function with 'doinit' set to false, in
16562 * which case the components that will be used to eventually create the
16563 * swash are returned (in a printable form).
16564 * If <exclude_list> is not NULL, it is an inversion list of things to
16565 * exclude from what's returned in <listsvp>.
16566 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
16567 * that, in spite of this function's name, the swash it returns may include
16568 * the bitmap data as well */
16571 SV *si = NULL; /* Input swash initialization string */
16572 SV* invlist = NULL;
16574 RXi_GET_DECL(prog,progi);
16575 const struct reg_data * const data = prog ? progi->data : NULL;
16577 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16579 if (data && data->count) {
16580 const U32 n = ARG(node);
16582 if (data->what[n] == 's') {
16583 SV * const rv = MUTABLE_SV(data->data[n]);
16584 AV * const av = MUTABLE_AV(SvRV(rv));
16585 SV **const ary = AvARRAY(av);
16586 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16588 si = *ary; /* ary[0] = the string to initialize the swash with */
16590 if (av_tindex(av) >= 2) {
16591 if (only_utf8_locale_ptr
16593 && ary[2] != &PL_sv_undef)
16595 *only_utf8_locale_ptr = ary[2];
16598 assert(only_utf8_locale_ptr);
16599 *only_utf8_locale_ptr = NULL;
16602 /* Elements 3 and 4 are either both present or both absent. [3]
16603 * is any inversion list generated at compile time; [4]
16604 * indicates if that inversion list has any user-defined
16605 * properties in it. */
16606 if (av_tindex(av) >= 3) {
16608 if (SvUV(ary[4])) {
16609 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16617 /* Element [1] is reserved for the set-up swash. If already there,
16618 * return it; if not, create it and store it there */
16619 if (ary[1] && SvROK(ary[1])) {
16622 else if (doinit && ((si && si != &PL_sv_undef)
16623 || (invlist && invlist != &PL_sv_undef))) {
16625 sw = _core_swash_init("utf8", /* the utf8 package */
16629 0, /* not from tr/// */
16631 &swash_init_flags);
16632 (void)av_store(av, 1, sw);
16637 /* If requested, return a printable version of what this swash matches */
16639 SV* matches_string = newSVpvs("");
16641 /* The swash should be used, if possible, to get the data, as it
16642 * contains the resolved data. But this function can be called at
16643 * compile-time, before everything gets resolved, in which case we
16644 * return the currently best available information, which is the string
16645 * that will eventually be used to do that resolving, 'si' */
16646 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16647 && (si && si != &PL_sv_undef))
16649 sv_catsv(matches_string, si);
16652 /* Add the inversion list to whatever we have. This may have come from
16653 * the swash, or from an input parameter */
16655 if (exclude_list) {
16656 SV* clone = invlist_clone(invlist);
16657 _invlist_subtract(clone, exclude_list, &clone);
16658 sv_catsv(matches_string, _invlist_contents(clone));
16659 SvREFCNT_dec_NN(clone);
16662 sv_catsv(matches_string, _invlist_contents(invlist));
16665 *listsvp = matches_string;
16670 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16672 /* reg_skipcomment()
16674 Absorbs an /x style # comment from the input stream,
16675 returning a pointer to the first character beyond the comment, or if the
16676 comment terminates the pattern without anything following it, this returns
16677 one past the final character of the pattern (in other words, RExC_end) and
16678 sets the REG_RUN_ON_COMMENT_SEEN flag.
16680 Note it's the callers responsibility to ensure that we are
16681 actually in /x mode
16685 PERL_STATIC_INLINE char*
16686 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16688 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16692 while (p < RExC_end) {
16693 if (*(++p) == '\n') {
16698 /* we ran off the end of the pattern without ending the comment, so we have
16699 * to add an \n when wrapping */
16700 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16705 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
16707 const bool force_to_xmod
16710 /* If the text at the current parse position '*p' is a '(?#...)' comment,
16711 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
16712 * is /x whitespace, advance '*p' so that on exit it points to the first
16713 * byte past all such white space and comments */
16715 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
16717 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
16719 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
16722 if (RExC_end - (*p) >= 3
16724 && *(*p + 1) == '?'
16725 && *(*p + 2) == '#')
16727 while (*(*p) != ')') {
16728 if ((*p) == RExC_end)
16729 FAIL("Sequence (?#... not terminated");
16737 const char * save_p = *p;
16738 while ((*p) < RExC_end) {
16740 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
16743 else if (*(*p) == '#') {
16744 (*p) = reg_skipcomment(pRExC_state, (*p));
16750 if (*p != save_p) {
16763 Advances the parse position by one byte, unless that byte is the beginning
16764 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
16765 those two cases, the parse position is advanced beyond all such comments and
16768 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
16772 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16774 PERL_ARGS_ASSERT_NEXTCHAR;
16777 || UTF8_IS_INVARIANT(*RExC_parse)
16778 || UTF8_IS_START(*RExC_parse));
16780 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16782 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16783 FALSE /* Don't assume /x */ );
16787 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16789 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16790 * space. In pass1, it aligns and increments RExC_size; in pass2,
16793 regnode * const ret = RExC_emit;
16794 GET_RE_DEBUG_FLAGS_DECL;
16796 PERL_ARGS_ASSERT_REGNODE_GUTS;
16798 assert(extra_size >= regarglen[op]);
16801 SIZE_ALIGN(RExC_size);
16802 RExC_size += 1 + extra_size;
16805 if (RExC_emit >= RExC_emit_bound)
16806 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16807 op, (void*)RExC_emit, (void*)RExC_emit_bound);
16809 NODE_ALIGN_FILL(ret);
16810 #ifndef RE_TRACK_PATTERN_OFFSETS
16811 PERL_UNUSED_ARG(name);
16813 if (RExC_offsets) { /* MJD */
16815 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16818 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16819 ? "Overwriting end of array!\n" : "OK",
16820 (UV)(RExC_emit - RExC_emit_start),
16821 (UV)(RExC_parse - RExC_start),
16822 (UV)RExC_offsets[0]));
16823 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16830 - reg_node - emit a node
16832 STATIC regnode * /* Location. */
16833 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16835 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16837 PERL_ARGS_ASSERT_REG_NODE;
16839 assert(regarglen[op] == 0);
16842 regnode *ptr = ret;
16843 FILL_ADVANCE_NODE(ptr, op);
16850 - reganode - emit a node with an argument
16852 STATIC regnode * /* Location. */
16853 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16855 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16857 PERL_ARGS_ASSERT_REGANODE;
16859 assert(regarglen[op] == 1);
16862 regnode *ptr = ret;
16863 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16870 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16872 /* emit a node with U32 and I32 arguments */
16874 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16876 PERL_ARGS_ASSERT_REG2LANODE;
16878 assert(regarglen[op] == 2);
16881 regnode *ptr = ret;
16882 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16889 - reginsert - insert an operator in front of already-emitted operand
16891 * Means relocating the operand.
16894 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16899 const int offset = regarglen[(U8)op];
16900 const int size = NODE_STEP_REGNODE + offset;
16901 GET_RE_DEBUG_FLAGS_DECL;
16903 PERL_ARGS_ASSERT_REGINSERT;
16904 PERL_UNUSED_CONTEXT;
16905 PERL_UNUSED_ARG(depth);
16906 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16907 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16916 if (RExC_open_parens) {
16918 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16919 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16920 if ( RExC_open_parens[paren] >= opnd ) {
16921 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16922 RExC_open_parens[paren] += size;
16924 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16926 if ( RExC_close_parens[paren] >= opnd ) {
16927 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16928 RExC_close_parens[paren] += size;
16930 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16935 while (src > opnd) {
16936 StructCopy(--src, --dst, regnode);
16937 #ifdef RE_TRACK_PATTERN_OFFSETS
16938 if (RExC_offsets) { /* MJD 20010112 */
16940 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16944 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16945 ? "Overwriting end of array!\n" : "OK",
16946 (UV)(src - RExC_emit_start),
16947 (UV)(dst - RExC_emit_start),
16948 (UV)RExC_offsets[0]));
16949 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16950 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16956 place = opnd; /* Op node, where operand used to be. */
16957 #ifdef RE_TRACK_PATTERN_OFFSETS
16958 if (RExC_offsets) { /* MJD */
16960 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16964 (UV)(place - RExC_emit_start) > RExC_offsets[0]
16965 ? "Overwriting end of array!\n" : "OK",
16966 (UV)(place - RExC_emit_start),
16967 (UV)(RExC_parse - RExC_start),
16968 (UV)RExC_offsets[0]));
16969 Set_Node_Offset(place, RExC_parse);
16970 Set_Node_Length(place, 1);
16973 src = NEXTOPER(place);
16974 FILL_ADVANCE_NODE(place, op);
16975 Zero(src, offset, regnode);
16979 - regtail - set the next-pointer at the end of a node chain of p to val.
16980 - SEE ALSO: regtail_study
16983 S_regtail(pTHX_ RExC_state_t * pRExC_state,
16984 const regnode * const p,
16985 const regnode * const val,
16989 GET_RE_DEBUG_FLAGS_DECL;
16991 PERL_ARGS_ASSERT_REGTAIL;
16993 PERL_UNUSED_ARG(depth);
16999 /* Find last node. */
17000 scan = (regnode *) p;
17002 regnode * const temp = regnext(scan);
17004 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
17005 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
17006 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
17007 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
17008 (temp == NULL ? "->" : ""),
17009 (temp == NULL ? PL_reg_name[OP(val)] : "")
17017 if (reg_off_by_arg[OP(scan)]) {
17018 ARG_SET(scan, val - scan);
17021 NEXT_OFF(scan) = val - scan;
17027 - regtail_study - set the next-pointer at the end of a node chain of p to val.
17028 - Look for optimizable sequences at the same time.
17029 - currently only looks for EXACT chains.
17031 This is experimental code. The idea is to use this routine to perform
17032 in place optimizations on branches and groups as they are constructed,
17033 with the long term intention of removing optimization from study_chunk so
17034 that it is purely analytical.
17036 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
17037 to control which is which.
17040 /* TODO: All four parms should be const */
17043 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
17044 const regnode *val,U32 depth)
17048 #ifdef EXPERIMENTAL_INPLACESCAN
17051 GET_RE_DEBUG_FLAGS_DECL;
17053 PERL_ARGS_ASSERT_REGTAIL_STUDY;
17059 /* Find last node. */
17063 regnode * const temp = regnext(scan);
17064 #ifdef EXPERIMENTAL_INPLACESCAN
17065 if (PL_regkind[OP(scan)] == EXACT) {
17066 bool unfolded_multi_char; /* Unexamined in this routine */
17067 if (join_exact(pRExC_state, scan, &min,
17068 &unfolded_multi_char, 1, val, depth+1))
17073 switch (OP(scan)) {
17077 case EXACTFA_NO_TRIE:
17083 if( exact == PSEUDO )
17085 else if ( exact != OP(scan) )
17094 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
17095 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
17096 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
17097 SvPV_nolen_const(RExC_mysv),
17098 REG_NODE_NUM(scan),
17099 PL_reg_name[exact]);
17106 DEBUG_PARSE_MSG("");
17107 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
17108 PerlIO_printf(Perl_debug_log,
17109 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
17110 SvPV_nolen_const(RExC_mysv),
17111 (IV)REG_NODE_NUM(val),
17115 if (reg_off_by_arg[OP(scan)]) {
17116 ARG_SET(scan, val - scan);
17119 NEXT_OFF(scan) = val - scan;
17127 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
17132 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
17137 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
17139 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
17140 if (flags & (1<<bit)) {
17141 if (!set++ && lead)
17142 PerlIO_printf(Perl_debug_log, "%s",lead);
17143 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
17148 PerlIO_printf(Perl_debug_log, "\n");
17150 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
17155 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
17161 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
17163 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
17164 if (flags & (1<<bit)) {
17165 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
17168 if (!set++ && lead)
17169 PerlIO_printf(Perl_debug_log, "%s",lead);
17170 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
17173 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
17174 if (!set++ && lead) {
17175 PerlIO_printf(Perl_debug_log, "%s",lead);
17178 case REGEX_UNICODE_CHARSET:
17179 PerlIO_printf(Perl_debug_log, "UNICODE");
17181 case REGEX_LOCALE_CHARSET:
17182 PerlIO_printf(Perl_debug_log, "LOCALE");
17184 case REGEX_ASCII_RESTRICTED_CHARSET:
17185 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
17187 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
17188 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
17191 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
17197 PerlIO_printf(Perl_debug_log, "\n");
17199 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
17205 Perl_regdump(pTHX_ const regexp *r)
17208 SV * const sv = sv_newmortal();
17209 SV *dsv= sv_newmortal();
17210 RXi_GET_DECL(r,ri);
17211 GET_RE_DEBUG_FLAGS_DECL;
17213 PERL_ARGS_ASSERT_REGDUMP;
17215 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
17217 /* Header fields of interest. */
17218 if (r->anchored_substr) {
17219 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
17220 RE_SV_DUMPLEN(r->anchored_substr), 30);
17221 PerlIO_printf(Perl_debug_log,
17222 "anchored %s%s at %"IVdf" ",
17223 s, RE_SV_TAIL(r->anchored_substr),
17224 (IV)r->anchored_offset);
17225 } else if (r->anchored_utf8) {
17226 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
17227 RE_SV_DUMPLEN(r->anchored_utf8), 30);
17228 PerlIO_printf(Perl_debug_log,
17229 "anchored utf8 %s%s at %"IVdf" ",
17230 s, RE_SV_TAIL(r->anchored_utf8),
17231 (IV)r->anchored_offset);
17233 if (r->float_substr) {
17234 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
17235 RE_SV_DUMPLEN(r->float_substr), 30);
17236 PerlIO_printf(Perl_debug_log,
17237 "floating %s%s at %"IVdf"..%"UVuf" ",
17238 s, RE_SV_TAIL(r->float_substr),
17239 (IV)r->float_min_offset, (UV)r->float_max_offset);
17240 } else if (r->float_utf8) {
17241 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
17242 RE_SV_DUMPLEN(r->float_utf8), 30);
17243 PerlIO_printf(Perl_debug_log,
17244 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
17245 s, RE_SV_TAIL(r->float_utf8),
17246 (IV)r->float_min_offset, (UV)r->float_max_offset);
17248 if (r->check_substr || r->check_utf8)
17249 PerlIO_printf(Perl_debug_log,
17251 (r->check_substr == r->float_substr
17252 && r->check_utf8 == r->float_utf8
17253 ? "(checking floating" : "(checking anchored"));
17254 if (r->intflags & PREGf_NOSCAN)
17255 PerlIO_printf(Perl_debug_log, " noscan");
17256 if (r->extflags & RXf_CHECK_ALL)
17257 PerlIO_printf(Perl_debug_log, " isall");
17258 if (r->check_substr || r->check_utf8)
17259 PerlIO_printf(Perl_debug_log, ") ");
17261 if (ri->regstclass) {
17262 regprop(r, sv, ri->regstclass, NULL, NULL);
17263 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
17265 if (r->intflags & PREGf_ANCH) {
17266 PerlIO_printf(Perl_debug_log, "anchored");
17267 if (r->intflags & PREGf_ANCH_MBOL)
17268 PerlIO_printf(Perl_debug_log, "(MBOL)");
17269 if (r->intflags & PREGf_ANCH_SBOL)
17270 PerlIO_printf(Perl_debug_log, "(SBOL)");
17271 if (r->intflags & PREGf_ANCH_GPOS)
17272 PerlIO_printf(Perl_debug_log, "(GPOS)");
17273 (void)PerlIO_putc(Perl_debug_log, ' ');
17275 if (r->intflags & PREGf_GPOS_SEEN)
17276 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
17277 if (r->intflags & PREGf_SKIP)
17278 PerlIO_printf(Perl_debug_log, "plus ");
17279 if (r->intflags & PREGf_IMPLICIT)
17280 PerlIO_printf(Perl_debug_log, "implicit ");
17281 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
17282 if (r->extflags & RXf_EVAL_SEEN)
17283 PerlIO_printf(Perl_debug_log, "with eval ");
17284 PerlIO_printf(Perl_debug_log, "\n");
17286 regdump_extflags("r->extflags: ",r->extflags);
17287 regdump_intflags("r->intflags: ",r->intflags);
17290 PERL_ARGS_ASSERT_REGDUMP;
17291 PERL_UNUSED_CONTEXT;
17292 PERL_UNUSED_ARG(r);
17293 #endif /* DEBUGGING */
17297 - regprop - printable representation of opcode, with run time support
17301 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
17306 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
17307 static const char * const anyofs[] = {
17308 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
17309 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
17310 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
17311 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
17312 || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
17313 #error Need to adjust order of anyofs[]
17348 RXi_GET_DECL(prog,progi);
17349 GET_RE_DEBUG_FLAGS_DECL;
17351 PERL_ARGS_ASSERT_REGPROP;
17353 sv_setpvn(sv, "", 0);
17355 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
17356 /* It would be nice to FAIL() here, but this may be called from
17357 regexec.c, and it would be hard to supply pRExC_state. */
17358 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17359 (int)OP(o), (int)REGNODE_MAX);
17360 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
17362 k = PL_regkind[OP(o)];
17365 sv_catpvs(sv, " ");
17366 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
17367 * is a crude hack but it may be the best for now since
17368 * we have no flag "this EXACTish node was UTF-8"
17370 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
17371 PERL_PV_ESCAPE_UNI_DETECT |
17372 PERL_PV_ESCAPE_NONASCII |
17373 PERL_PV_PRETTY_ELLIPSES |
17374 PERL_PV_PRETTY_LTGT |
17375 PERL_PV_PRETTY_NOCLEAR
17377 } else if (k == TRIE) {
17378 /* print the details of the trie in dumpuntil instead, as
17379 * progi->data isn't available here */
17380 const char op = OP(o);
17381 const U32 n = ARG(o);
17382 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
17383 (reg_ac_data *)progi->data->data[n] :
17385 const reg_trie_data * const trie
17386 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
17388 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
17389 DEBUG_TRIE_COMPILE_r(
17390 Perl_sv_catpvf(aTHX_ sv,
17391 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
17392 (UV)trie->startstate,
17393 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
17394 (UV)trie->wordcount,
17397 (UV)TRIE_CHARCOUNT(trie),
17398 (UV)trie->uniquecharcount
17401 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
17402 sv_catpvs(sv, "[");
17403 (void) put_charclass_bitmap_innards(sv,
17404 (IS_ANYOF_TRIE(op))
17406 : TRIE_BITMAP(trie),
17408 sv_catpvs(sv, "]");
17411 } else if (k == CURLY) {
17412 U32 lo = ARG1(o), hi = ARG2(o);
17413 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
17414 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
17415 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
17416 if (hi == REG_INFTY)
17417 sv_catpvs(sv, "INFTY");
17419 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
17420 sv_catpvs(sv, "}");
17422 else if (k == WHILEM && o->flags) /* Ordinal/of */
17423 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
17424 else if (k == REF || k == OPEN || k == CLOSE
17425 || k == GROUPP || OP(o)==ACCEPT)
17427 AV *name_list= NULL;
17428 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
17429 Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */
17430 if ( RXp_PAREN_NAMES(prog) ) {
17431 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17432 } else if ( pRExC_state ) {
17433 name_list= RExC_paren_name_list;
17436 if ( k != REF || (OP(o) < NREF)) {
17437 SV **name= av_fetch(name_list, parno, 0 );
17439 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17442 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
17443 I32 *nums=(I32*)SvPVX(sv_dat);
17444 SV **name= av_fetch(name_list, nums[0], 0 );
17447 for ( n=0; n<SvIVX(sv_dat); n++ ) {
17448 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
17449 (n ? "," : ""), (IV)nums[n]);
17451 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17455 if ( k == REF && reginfo) {
17456 U32 n = ARG(o); /* which paren pair */
17457 I32 ln = prog->offs[n].start;
17458 if (prog->lastparen < n || ln == -1)
17459 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
17460 else if (ln == prog->offs[n].end)
17461 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
17463 const char *s = reginfo->strbeg + ln;
17464 Perl_sv_catpvf(aTHX_ sv, ": ");
17465 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
17466 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
17469 } else if (k == GOSUB) {
17470 AV *name_list= NULL;
17471 if ( RXp_PAREN_NAMES(prog) ) {
17472 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
17473 } else if ( pRExC_state ) {
17474 name_list= RExC_paren_name_list;
17477 /* Paren and offset */
17478 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
17480 SV **name= av_fetch(name_list, ARG(o), 0 );
17482 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
17485 else if (k == LOGICAL)
17486 /* 2: embedded, otherwise 1 */
17487 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
17488 else if (k == ANYOF) {
17489 const U8 flags = ANYOF_FLAGS(o);
17491 SV* bitmap_invlist = NULL; /* Will hold what the bit map contains */
17494 if (OP(o) == ANYOFL) {
17495 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
17496 sv_catpvs(sv, "{utf8-loc}");
17499 sv_catpvs(sv, "{loc}");
17502 if (flags & ANYOFL_FOLD)
17503 sv_catpvs(sv, "{i}");
17504 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
17505 if (flags & ANYOF_INVERT)
17506 sv_catpvs(sv, "^");
17508 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
17510 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
17513 /* output any special charclass tests (used entirely under use
17515 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
17517 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
17518 if (ANYOF_POSIXL_TEST(o,i)) {
17519 sv_catpv(sv, anyofs[i]);
17525 if ( ARG(o) != ANYOF_ONLY_HAS_BITMAP
17527 & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
17528 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
17532 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17533 if (flags & ANYOF_INVERT)
17534 /*make sure the invert info is in each */
17535 sv_catpvs(sv, "^");
17538 if (OP(o) == ANYOFD
17539 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
17541 sv_catpvs(sv, "{non-utf8-latin1-all}");
17544 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17545 sv_catpvs(sv, "{above_bitmap_all}");
17547 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17548 SV *lv; /* Set if there is something outside the bit map. */
17549 bool byte_output = FALSE; /* If something has been output */
17550 SV *only_utf8_locale;
17552 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
17553 * is used to guarantee that nothing in the bitmap gets
17555 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17556 &lv, &only_utf8_locale,
17558 if (lv && lv != &PL_sv_undef) {
17559 char *s = savesvpv(lv);
17560 char * const origs = s;
17562 while (*s && *s != '\n')
17566 const char * const t = ++s;
17568 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) {
17569 if (OP(o) == ANYOFD) {
17570 sv_catpvs(sv, "{utf8}");
17573 sv_catpvs(sv, "{outside bitmap}");
17578 sv_catpvs(sv, " ");
17584 /* Truncate very long output */
17585 if (s - origs > 256) {
17586 Perl_sv_catpvf(aTHX_ sv,
17588 (int) (s - origs - 1),
17594 else if (*s == '\t') {
17608 SvREFCNT_dec_NN(lv);
17611 if ((flags & ANYOFL_FOLD)
17612 && only_utf8_locale
17613 && only_utf8_locale != &PL_sv_undef)
17616 int max_entries = 256;
17618 sv_catpvs(sv, "{utf8 locale}");
17619 invlist_iterinit(only_utf8_locale);
17620 while (invlist_iternext(only_utf8_locale,
17622 put_range(sv, start, end, FALSE);
17624 if (max_entries < 0) {
17625 sv_catpvs(sv, "...");
17629 invlist_iterfinish(only_utf8_locale);
17633 SvREFCNT_dec(bitmap_invlist);
17636 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17638 else if (k == POSIXD || k == NPOSIXD) {
17639 U8 index = FLAGS(o) * 2;
17640 if (index < C_ARRAY_LENGTH(anyofs)) {
17641 if (*anyofs[index] != '[') {
17644 sv_catpv(sv, anyofs[index]);
17645 if (*anyofs[index] != '[') {
17650 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17653 else if (k == BOUND || k == NBOUND) {
17654 /* Must be synced with order of 'bound_type' in regcomp.h */
17655 const char * const bounds[] = {
17656 "", /* Traditional */
17662 sv_catpv(sv, bounds[FLAGS(o)]);
17664 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17665 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17666 else if (OP(o) == SBOL)
17667 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17669 /* add on the verb argument if there is one */
17670 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
17671 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
17672 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
17675 PERL_UNUSED_CONTEXT;
17676 PERL_UNUSED_ARG(sv);
17677 PERL_UNUSED_ARG(o);
17678 PERL_UNUSED_ARG(prog);
17679 PERL_UNUSED_ARG(reginfo);
17680 PERL_UNUSED_ARG(pRExC_state);
17681 #endif /* DEBUGGING */
17687 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17688 { /* Assume that RE_INTUIT is set */
17689 struct regexp *const prog = ReANY(r);
17690 GET_RE_DEBUG_FLAGS_DECL;
17692 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17693 PERL_UNUSED_CONTEXT;
17697 const char * const s = SvPV_nolen_const(RX_UTF8(r)
17698 ? prog->check_utf8 : prog->check_substr);
17700 if (!PL_colorset) reginitcolors();
17701 PerlIO_printf(Perl_debug_log,
17702 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17704 RX_UTF8(r) ? "utf8 " : "",
17705 PL_colors[5],PL_colors[0],
17708 (strlen(s) > 60 ? "..." : ""));
17711 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17712 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17718 handles refcounting and freeing the perl core regexp structure. When
17719 it is necessary to actually free the structure the first thing it
17720 does is call the 'free' method of the regexp_engine associated to
17721 the regexp, allowing the handling of the void *pprivate; member
17722 first. (This routine is not overridable by extensions, which is why
17723 the extensions free is called first.)
17725 See regdupe and regdupe_internal if you change anything here.
17727 #ifndef PERL_IN_XSUB_RE
17729 Perl_pregfree(pTHX_ REGEXP *r)
17735 Perl_pregfree2(pTHX_ REGEXP *rx)
17737 struct regexp *const r = ReANY(rx);
17738 GET_RE_DEBUG_FLAGS_DECL;
17740 PERL_ARGS_ASSERT_PREGFREE2;
17742 if (r->mother_re) {
17743 ReREFCNT_dec(r->mother_re);
17745 CALLREGFREE_PVT(rx); /* free the private data */
17746 SvREFCNT_dec(RXp_PAREN_NAMES(r));
17747 Safefree(r->xpv_len_u.xpvlenu_pv);
17750 SvREFCNT_dec(r->anchored_substr);
17751 SvREFCNT_dec(r->anchored_utf8);
17752 SvREFCNT_dec(r->float_substr);
17753 SvREFCNT_dec(r->float_utf8);
17754 Safefree(r->substrs);
17756 RX_MATCH_COPY_FREE(rx);
17757 #ifdef PERL_ANY_COW
17758 SvREFCNT_dec(r->saved_copy);
17761 SvREFCNT_dec(r->qr_anoncv);
17762 rx->sv_u.svu_rx = 0;
17767 This is a hacky workaround to the structural issue of match results
17768 being stored in the regexp structure which is in turn stored in
17769 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17770 could be PL_curpm in multiple contexts, and could require multiple
17771 result sets being associated with the pattern simultaneously, such
17772 as when doing a recursive match with (??{$qr})
17774 The solution is to make a lightweight copy of the regexp structure
17775 when a qr// is returned from the code executed by (??{$qr}) this
17776 lightweight copy doesn't actually own any of its data except for
17777 the starp/end and the actual regexp structure itself.
17783 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17785 struct regexp *ret;
17786 struct regexp *const r = ReANY(rx);
17787 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17789 PERL_ARGS_ASSERT_REG_TEMP_COPY;
17792 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17794 SvOK_off((SV *)ret_x);
17796 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17797 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
17798 made both spots point to the same regexp body.) */
17799 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17800 assert(!SvPVX(ret_x));
17801 ret_x->sv_u.svu_rx = temp->sv_any;
17802 temp->sv_any = NULL;
17803 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17804 SvREFCNT_dec_NN(temp);
17805 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17806 ing below will not set it. */
17807 SvCUR_set(ret_x, SvCUR(rx));
17810 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17811 sv_force_normal(sv) is called. */
17813 ret = ReANY(ret_x);
17815 SvFLAGS(ret_x) |= SvUTF8(rx);
17816 /* We share the same string buffer as the original regexp, on which we
17817 hold a reference count, incremented when mother_re is set below.
17818 The string pointer is copied here, being part of the regexp struct.
17820 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17821 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17823 const I32 npar = r->nparens+1;
17824 Newx(ret->offs, npar, regexp_paren_pair);
17825 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17828 Newx(ret->substrs, 1, struct reg_substr_data);
17829 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17831 SvREFCNT_inc_void(ret->anchored_substr);
17832 SvREFCNT_inc_void(ret->anchored_utf8);
17833 SvREFCNT_inc_void(ret->float_substr);
17834 SvREFCNT_inc_void(ret->float_utf8);
17836 /* check_substr and check_utf8, if non-NULL, point to either their
17837 anchored or float namesakes, and don't hold a second reference. */
17839 RX_MATCH_COPIED_off(ret_x);
17840 #ifdef PERL_ANY_COW
17841 ret->saved_copy = NULL;
17843 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17844 SvREFCNT_inc_void(ret->qr_anoncv);
17850 /* regfree_internal()
17852 Free the private data in a regexp. This is overloadable by
17853 extensions. Perl takes care of the regexp structure in pregfree(),
17854 this covers the *pprivate pointer which technically perl doesn't
17855 know about, however of course we have to handle the
17856 regexp_internal structure when no extension is in use.
17858 Note this is called before freeing anything in the regexp
17863 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17865 struct regexp *const r = ReANY(rx);
17866 RXi_GET_DECL(r,ri);
17867 GET_RE_DEBUG_FLAGS_DECL;
17869 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17875 SV *dsv= sv_newmortal();
17876 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17877 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17878 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17879 PL_colors[4],PL_colors[5],s);
17882 #ifdef RE_TRACK_PATTERN_OFFSETS
17884 Safefree(ri->u.offsets); /* 20010421 MJD */
17886 if (ri->code_blocks) {
17888 for (n = 0; n < ri->num_code_blocks; n++)
17889 SvREFCNT_dec(ri->code_blocks[n].src_regex);
17890 Safefree(ri->code_blocks);
17894 int n = ri->data->count;
17897 /* If you add a ->what type here, update the comment in regcomp.h */
17898 switch (ri->data->what[n]) {
17904 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17907 Safefree(ri->data->data[n]);
17913 { /* Aho Corasick add-on structure for a trie node.
17914 Used in stclass optimization only */
17916 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17917 #ifdef USE_ITHREADS
17921 refcount = --aho->refcount;
17924 PerlMemShared_free(aho->states);
17925 PerlMemShared_free(aho->fail);
17926 /* do this last!!!! */
17927 PerlMemShared_free(ri->data->data[n]);
17928 /* we should only ever get called once, so
17929 * assert as much, and also guard the free
17930 * which /might/ happen twice. At the least
17931 * it will make code anlyzers happy and it
17932 * doesn't cost much. - Yves */
17933 assert(ri->regstclass);
17934 if (ri->regstclass) {
17935 PerlMemShared_free(ri->regstclass);
17936 ri->regstclass = 0;
17943 /* trie structure. */
17945 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17946 #ifdef USE_ITHREADS
17950 refcount = --trie->refcount;
17953 PerlMemShared_free(trie->charmap);
17954 PerlMemShared_free(trie->states);
17955 PerlMemShared_free(trie->trans);
17957 PerlMemShared_free(trie->bitmap);
17959 PerlMemShared_free(trie->jump);
17960 PerlMemShared_free(trie->wordinfo);
17961 /* do this last!!!! */
17962 PerlMemShared_free(ri->data->data[n]);
17967 Perl_croak(aTHX_ "panic: regfree data code '%c'",
17968 ri->data->what[n]);
17971 Safefree(ri->data->what);
17972 Safefree(ri->data);
17978 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17979 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17980 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17983 re_dup - duplicate a regexp.
17985 This routine is expected to clone a given regexp structure. It is only
17986 compiled under USE_ITHREADS.
17988 After all of the core data stored in struct regexp is duplicated
17989 the regexp_engine.dupe method is used to copy any private data
17990 stored in the *pprivate pointer. This allows extensions to handle
17991 any duplication it needs to do.
17993 See pregfree() and regfree_internal() if you change anything here.
17995 #if defined(USE_ITHREADS)
17996 #ifndef PERL_IN_XSUB_RE
17998 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
18002 const struct regexp *r = ReANY(sstr);
18003 struct regexp *ret = ReANY(dstr);
18005 PERL_ARGS_ASSERT_RE_DUP_GUTS;
18007 npar = r->nparens+1;
18008 Newx(ret->offs, npar, regexp_paren_pair);
18009 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
18011 if (ret->substrs) {
18012 /* Do it this way to avoid reading from *r after the StructCopy().
18013 That way, if any of the sv_dup_inc()s dislodge *r from the L1
18014 cache, it doesn't matter. */
18015 const bool anchored = r->check_substr
18016 ? r->check_substr == r->anchored_substr
18017 : r->check_utf8 == r->anchored_utf8;
18018 Newx(ret->substrs, 1, struct reg_substr_data);
18019 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
18021 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
18022 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
18023 ret->float_substr = sv_dup_inc(ret->float_substr, param);
18024 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
18026 /* check_substr and check_utf8, if non-NULL, point to either their
18027 anchored or float namesakes, and don't hold a second reference. */
18029 if (ret->check_substr) {
18031 assert(r->check_utf8 == r->anchored_utf8);
18032 ret->check_substr = ret->anchored_substr;
18033 ret->check_utf8 = ret->anchored_utf8;
18035 assert(r->check_substr == r->float_substr);
18036 assert(r->check_utf8 == r->float_utf8);
18037 ret->check_substr = ret->float_substr;
18038 ret->check_utf8 = ret->float_utf8;
18040 } else if (ret->check_utf8) {
18042 ret->check_utf8 = ret->anchored_utf8;
18044 ret->check_utf8 = ret->float_utf8;
18049 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
18050 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
18053 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
18055 if (RX_MATCH_COPIED(dstr))
18056 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
18058 ret->subbeg = NULL;
18059 #ifdef PERL_ANY_COW
18060 ret->saved_copy = NULL;
18063 /* Whether mother_re be set or no, we need to copy the string. We
18064 cannot refrain from copying it when the storage points directly to
18065 our mother regexp, because that's
18066 1: a buffer in a different thread
18067 2: something we no longer hold a reference on
18068 so we need to copy it locally. */
18069 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
18070 ret->mother_re = NULL;
18072 #endif /* PERL_IN_XSUB_RE */
18077 This is the internal complement to regdupe() which is used to copy
18078 the structure pointed to by the *pprivate pointer in the regexp.
18079 This is the core version of the extension overridable cloning hook.
18080 The regexp structure being duplicated will be copied by perl prior
18081 to this and will be provided as the regexp *r argument, however
18082 with the /old/ structures pprivate pointer value. Thus this routine
18083 may override any copying normally done by perl.
18085 It returns a pointer to the new regexp_internal structure.
18089 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
18092 struct regexp *const r = ReANY(rx);
18093 regexp_internal *reti;
18095 RXi_GET_DECL(r,ri);
18097 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
18101 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
18102 char, regexp_internal);
18103 Copy(ri->program, reti->program, len+1, regnode);
18105 reti->num_code_blocks = ri->num_code_blocks;
18106 if (ri->code_blocks) {
18108 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
18109 struct reg_code_block);
18110 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
18111 struct reg_code_block);
18112 for (n = 0; n < ri->num_code_blocks; n++)
18113 reti->code_blocks[n].src_regex = (REGEXP*)
18114 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
18117 reti->code_blocks = NULL;
18119 reti->regstclass = NULL;
18122 struct reg_data *d;
18123 const int count = ri->data->count;
18126 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
18127 char, struct reg_data);
18128 Newx(d->what, count, U8);
18131 for (i = 0; i < count; i++) {
18132 d->what[i] = ri->data->what[i];
18133 switch (d->what[i]) {
18134 /* see also regcomp.h and regfree_internal() */
18135 case 'a': /* actually an AV, but the dup function is identical. */
18139 case 'u': /* actually an HV, but the dup function is identical. */
18140 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
18143 /* This is cheating. */
18144 Newx(d->data[i], 1, regnode_ssc);
18145 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
18146 reti->regstclass = (regnode*)d->data[i];
18149 /* Trie stclasses are readonly and can thus be shared
18150 * without duplication. We free the stclass in pregfree
18151 * when the corresponding reg_ac_data struct is freed.
18153 reti->regstclass= ri->regstclass;
18157 ((reg_trie_data*)ri->data->data[i])->refcount++;
18162 d->data[i] = ri->data->data[i];
18165 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
18166 ri->data->what[i]);
18175 reti->name_list_idx = ri->name_list_idx;
18177 #ifdef RE_TRACK_PATTERN_OFFSETS
18178 if (ri->u.offsets) {
18179 Newx(reti->u.offsets, 2*len+1, U32);
18180 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
18183 SetProgLen(reti,len);
18186 return (void*)reti;
18189 #endif /* USE_ITHREADS */
18191 #ifndef PERL_IN_XSUB_RE
18194 - regnext - dig the "next" pointer out of a node
18197 Perl_regnext(pTHX_ regnode *p)
18204 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
18205 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
18206 (int)OP(p), (int)REGNODE_MAX);
18209 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
18218 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
18221 STRLEN l1 = strlen(pat1);
18222 STRLEN l2 = strlen(pat2);
18225 const char *message;
18227 PERL_ARGS_ASSERT_RE_CROAK2;
18233 Copy(pat1, buf, l1 , char);
18234 Copy(pat2, buf + l1, l2 , char);
18235 buf[l1 + l2] = '\n';
18236 buf[l1 + l2 + 1] = '\0';
18237 va_start(args, pat2);
18238 msv = vmess(buf, &args);
18240 message = SvPV_const(msv,l1);
18243 Copy(message, buf, l1 , char);
18244 /* l1-1 to avoid \n */
18245 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
18248 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
18250 #ifndef PERL_IN_XSUB_RE
18252 Perl_save_re_context(pTHX)
18257 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
18260 const REGEXP * const rx = PM_GETRE(PL_curpm);
18262 nparens = RX_NPARENS(rx);
18265 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
18266 * that PL_curpm will be null, but that utf8.pm and the modules it
18267 * loads will only use $1..$3.
18268 * The t/porting/re_context.t test file checks this assumption.
18273 for (i = 1; i <= nparens; i++) {
18274 char digits[TYPE_CHARS(long)];
18275 const STRLEN len = my_snprintf(digits, sizeof(digits),
18277 GV *const *const gvp
18278 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
18281 GV * const gv = *gvp;
18282 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
18292 S_put_code_point(pTHX_ SV *sv, UV c)
18294 PERL_ARGS_ASSERT_PUT_CODE_POINT;
18297 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
18299 else if (isPRINT(c)) {
18300 const char string = (char) c;
18301 if (isBACKSLASHED_PUNCT(c))
18302 sv_catpvs(sv, "\\");
18303 sv_catpvn(sv, &string, 1);
18306 const char * const mnemonic = cntrl_to_mnemonic((char) c);
18308 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
18311 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
18316 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
18319 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
18321 /* Appends to 'sv' a displayable version of the range of code points from
18322 * 'start' to 'end'. It assumes that only ASCII printables are displayable
18323 * as-is (though some of these will be escaped by put_code_point()). */
18325 const unsigned int min_range_count = 3;
18327 assert(start <= end);
18329 PERL_ARGS_ASSERT_PUT_RANGE;
18331 while (start <= end) {
18333 const char * format;
18335 if (end - start < min_range_count) {
18337 /* Individual chars in short ranges */
18338 for (; start <= end; start++) {
18339 put_code_point(sv, start);
18344 /* If permitted by the input options, and there is a possibility that
18345 * this range contains a printable literal, look to see if there is
18347 if (allow_literals && start <= MAX_PRINT_A) {
18349 /* If the range begin isn't an ASCII printable, effectively split
18350 * the range into two parts:
18351 * 1) the portion before the first such printable,
18353 * and output them separately. */
18354 if (! isPRINT_A(start)) {
18355 UV temp_end = start + 1;
18357 /* There is no point looking beyond the final possible
18358 * printable, in MAX_PRINT_A */
18359 UV max = MIN(end, MAX_PRINT_A);
18361 while (temp_end <= max && ! isPRINT_A(temp_end)) {
18365 /* Here, temp_end points to one beyond the first printable if
18366 * found, or to one beyond 'max' if not. If none found, make
18367 * sure that we use the entire range */
18368 if (temp_end > MAX_PRINT_A) {
18369 temp_end = end + 1;
18372 /* Output the first part of the split range, the part that
18373 * doesn't have printables, with no looking for literals
18374 * (otherwise we would infinitely recurse) */
18375 put_range(sv, start, temp_end - 1, FALSE);
18377 /* The 2nd part of the range (if any) starts here. */
18380 /* We continue instead of dropping down because even if the 2nd
18381 * part is non-empty, it could be so short that we want to
18382 * output it specially, as tested for at the top of this loop.
18387 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
18388 * output a sub-range of just the digits or letters, then process
18389 * the remaining portion as usual. */
18390 if (isALPHANUMERIC_A(start)) {
18391 UV mask = (isDIGIT_A(start))
18396 UV temp_end = start + 1;
18398 /* Find the end of the sub-range that includes just the
18399 * characters in the same class as the first character in it */
18400 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
18405 /* For short ranges, don't duplicate the code above to output
18406 * them; just call recursively */
18407 if (temp_end - start < min_range_count) {
18408 put_range(sv, start, temp_end, FALSE);
18410 else { /* Output as a range */
18411 put_code_point(sv, start);
18412 sv_catpvs(sv, "-");
18413 put_code_point(sv, temp_end);
18415 start = temp_end + 1;
18419 /* We output any other printables as individual characters */
18420 if (isPUNCT_A(start) || isSPACE_A(start)) {
18421 while (start <= end && (isPUNCT_A(start)
18422 || isSPACE_A(start)))
18424 put_code_point(sv, start);
18429 } /* End of looking for literals */
18431 /* Here is not to output as a literal. Some control characters have
18432 * mnemonic names. Split off any of those at the beginning and end of
18433 * the range to print mnemonically. It isn't possible for many of
18434 * these to be in a row, so this won't overwhelm with output */
18435 while (isMNEMONIC_CNTRL(start) && start <= end) {
18436 put_code_point(sv, start);
18439 if (start < end && isMNEMONIC_CNTRL(end)) {
18441 /* Here, the final character in the range has a mnemonic name.
18442 * Work backwards from the end to find the final non-mnemonic */
18443 UV temp_end = end - 1;
18444 while (isMNEMONIC_CNTRL(temp_end)) {
18448 /* And separately output the range that doesn't have mnemonics */
18449 put_range(sv, start, temp_end, FALSE);
18451 /* Then output the mnemonic trailing controls */
18452 start = temp_end + 1;
18453 while (start <= end) {
18454 put_code_point(sv, start);
18460 /* As a final resort, output the range or subrange as hex. */
18462 this_end = (end < NUM_ANYOF_CODE_POINTS)
18464 : NUM_ANYOF_CODE_POINTS - 1;
18465 #if NUM_ANYOF_CODE_POINTS > 256
18466 format = (this_end < 256)
18467 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
18468 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
18470 format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
18472 GCC_DIAG_IGNORE(-Wformat-nonliteral);
18473 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
18480 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
18482 /* Appends to 'sv' a displayable version of the innards of the bracketed
18483 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
18484 * output anything, and bitmap_invlist, if not NULL, will point to an
18485 * inversion list of what is in the bit map */
18489 unsigned int punct_count = 0;
18491 bool allow_literals = TRUE;
18492 bool inverted_for_output = FALSE;
18494 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
18496 /* Worst case is exactly every-other code point is in the list */
18497 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
18499 /* Convert the bit map to an inversion list, keeping track of how many
18500 * ASCII puncts are set, including an extra amount for the backslashed
18502 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
18503 if (BITMAP_TEST(bitmap, i)) {
18504 invlist = add_cp_to_invlist(invlist, i);
18505 if (isPUNCT_A(i)) {
18507 if isBACKSLASHED_PUNCT(i) {
18514 /* Nothing to output */
18515 if (_invlist_len(invlist) == 0) {
18516 SvREFCNT_dec_NN(invlist);
18520 /* Generally, it is more readable if printable characters are output as
18521 * literals, but if a range (nearly) spans all of them, it's best to output
18522 * it as a single range. This code will use a single range if all but 2
18523 * printables are in it */
18524 invlist_iterinit(invlist);
18525 while (invlist_iternext(invlist, &start, &end)) {
18527 /* If range starts beyond final printable, it doesn't have any in it */
18528 if (start > MAX_PRINT_A) {
18532 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
18533 * all but two, the range must start and end no later than 2 from
18535 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
18536 if (end > MAX_PRINT_A) {
18542 if (end - start >= MAX_PRINT_A - ' ' - 2) {
18543 allow_literals = FALSE;
18548 invlist_iterfinish(invlist);
18550 /* The legibility of the output depends mostly on how many punctuation
18551 * characters are output. There are 32 possible ASCII ones, and some have
18552 * an additional backslash, bringing it to currently 36, so if any more
18553 * than 18 are to be output, we can instead output it as its complement,
18554 * yielding fewer puncts, and making it more legible. But give some weight
18555 * to the fact that outputting it as a complement is less legible than a
18556 * straight output, so don't complement unless we are somewhat over the 18
18558 if (allow_literals && punct_count > 22) {
18559 sv_catpvs(sv, "^");
18561 /* Add everything remaining to the list, so when we invert it just
18562 * below, it will be excluded */
18563 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
18564 _invlist_invert(invlist);
18565 inverted_for_output = TRUE;
18568 /* Here we have figured things out. Output each range */
18569 invlist_iterinit(invlist);
18570 while (invlist_iternext(invlist, &start, &end)) {
18571 if (start >= NUM_ANYOF_CODE_POINTS) {
18574 put_range(sv, start, end, allow_literals);
18576 invlist_iterfinish(invlist);
18578 if (bitmap_invlist) {
18580 /* Here, wants the inversion list returned. If we inverted it, we have
18581 * to restore it to the original */
18582 if (inverted_for_output) {
18583 _invlist_invert(invlist);
18584 _invlist_intersection(invlist, PL_InBitmap, &invlist);
18587 *bitmap_invlist = invlist;
18590 SvREFCNT_dec_NN(invlist);
18596 #define CLEAR_OPTSTART \
18597 if (optstart) STMT_START { \
18598 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
18599 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18603 #define DUMPUNTIL(b,e) \
18605 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18607 STATIC const regnode *
18608 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18609 const regnode *last, const regnode *plast,
18610 SV* sv, I32 indent, U32 depth)
18612 U8 op = PSEUDO; /* Arbitrary non-END op. */
18613 const regnode *next;
18614 const regnode *optstart= NULL;
18616 RXi_GET_DECL(r,ri);
18617 GET_RE_DEBUG_FLAGS_DECL;
18619 PERL_ARGS_ASSERT_DUMPUNTIL;
18621 #ifdef DEBUG_DUMPUNTIL
18622 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18623 last ? last-start : 0,plast ? plast-start : 0);
18626 if (plast && plast < last)
18629 while (PL_regkind[op] != END && (!last || node < last)) {
18631 /* While that wasn't END last time... */
18634 if (op == CLOSE || op == WHILEM)
18636 next = regnext((regnode *)node);
18639 if (OP(node) == OPTIMIZED) {
18640 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18647 regprop(r, sv, node, NULL, NULL);
18648 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18649 (int)(2*indent + 1), "", SvPVX_const(sv));
18651 if (OP(node) != OPTIMIZED) {
18652 if (next == NULL) /* Next ptr. */
18653 PerlIO_printf(Perl_debug_log, " (0)");
18654 else if (PL_regkind[(U8)op] == BRANCH
18655 && PL_regkind[OP(next)] != BRANCH )
18656 PerlIO_printf(Perl_debug_log, " (FAIL)");
18658 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18659 (void)PerlIO_putc(Perl_debug_log, '\n');
18663 if (PL_regkind[(U8)op] == BRANCHJ) {
18666 const regnode *nnode = (OP(next) == LONGJMP
18667 ? regnext((regnode *)next)
18669 if (last && nnode > last)
18671 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18674 else if (PL_regkind[(U8)op] == BRANCH) {
18676 DUMPUNTIL(NEXTOPER(node), next);
18678 else if ( PL_regkind[(U8)op] == TRIE ) {
18679 const regnode *this_trie = node;
18680 const char op = OP(node);
18681 const U32 n = ARG(node);
18682 const reg_ac_data * const ac = op>=AHOCORASICK ?
18683 (reg_ac_data *)ri->data->data[n] :
18685 const reg_trie_data * const trie =
18686 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18688 AV *const trie_words
18689 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18691 const regnode *nextbranch= NULL;
18694 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18695 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18697 PerlIO_printf(Perl_debug_log, "%*s%s ",
18698 (int)(2*(indent+3)), "",
18700 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18701 SvCUR(*elem_ptr), 60,
18702 PL_colors[0], PL_colors[1],
18704 ? PERL_PV_ESCAPE_UNI
18706 | PERL_PV_PRETTY_ELLIPSES
18707 | PERL_PV_PRETTY_LTGT
18712 U16 dist= trie->jump[word_idx+1];
18713 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18714 (UV)((dist ? this_trie + dist : next) - start));
18717 nextbranch= this_trie + trie->jump[0];
18718 DUMPUNTIL(this_trie + dist, nextbranch);
18720 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18721 nextbranch= regnext((regnode *)nextbranch);
18723 PerlIO_printf(Perl_debug_log, "\n");
18726 if (last && next > last)
18731 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
18732 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18733 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18735 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18737 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18739 else if ( op == PLUS || op == STAR) {
18740 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18742 else if (PL_regkind[(U8)op] == ANYOF) {
18743 /* arglen 1 + class block */
18744 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18745 ? ANYOF_POSIXL_SKIP
18747 node = NEXTOPER(node);
18749 else if (PL_regkind[(U8)op] == EXACT) {
18750 /* Literal string, where present. */
18751 node += NODE_SZ_STR(node) - 1;
18752 node = NEXTOPER(node);
18755 node = NEXTOPER(node);
18756 node += regarglen[(U8)op];
18758 if (op == CURLYX || op == OPEN)
18762 #ifdef DEBUG_DUMPUNTIL
18763 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18768 #endif /* DEBUGGING */
18771 * ex: set ts=8 sts=4 sw=4 et: