5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_inline.h"
90 #include "invlist_inline.h"
91 #include "unicode_constants.h"
93 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
95 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
97 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
98 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define STATIC static
104 /* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
108 typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
114 struct scan_frame *this_prev_frame; /* this previous frame */
115 struct scan_frame *prev_frame; /* previous frame */
116 struct scan_frame *next_frame; /* next frame */
119 /* Certain characters are output as a sequence with the first being a
121 #define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
124 struct RExC_state_t {
125 U32 flags; /* RXf_* are we folding, multilining? */
126 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
127 char *precomp; /* uncompiled string. */
128 char *precomp_end; /* pointer to end of uncompiled string. */
129 REGEXP *rx_sv; /* The SV that is the regexp. */
130 regexp *rx; /* perl core regexp structure */
131 regexp_internal *rxi; /* internal data for regexp object
133 char *start; /* Start of input for compile */
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
136 char *copy_start; /* start of copy of input within
137 constructed parse string */
138 char *copy_start_in_input; /* Position in input string
139 corresponding to copy_start */
140 SSize_t whilem_seen; /* number of WHILEM in this expr */
141 regnode *emit_start; /* Start of emitted-code area */
142 regnode_offset emit; /* Code-emit pointer */
143 I32 naughty; /* How bad is this pattern? */
144 I32 sawback; /* Did we see \1, ...? */
146 SSize_t size; /* Number of regnode equivalents in
149 /* position beyond 'precomp' of the warning message furthest away from
150 * 'precomp'. During the parse, no warnings are raised for any problems
151 * earlier in the parse than this position. This works if warnings are
152 * raised the first time a given spot is parsed, and if only one
153 * independent warning is raised for any given spot */
154 Size_t latest_warn_offset;
156 I32 npar; /* Capture buffer count so far in the
157 parse, (OPEN) plus one. ("par" 0 is
159 I32 total_par; /* During initial parse, is either 0,
160 or -1; the latter indicating a
161 reparse is needed. After that pass,
162 it is what 'npar' became after the
163 pass. Hence, it being > 0 indicates
164 we are in a reparse situation */
165 I32 nestroot; /* root parens we are in - used by
168 regnode_offset *open_parens; /* offsets to open parens */
169 regnode_offset *close_parens; /* offsets to close parens */
170 regnode *end_op; /* END node in program */
171 I32 utf8; /* whether the pattern is utf8 or not */
172 I32 orig_utf8; /* whether the pattern was originally in utf8 */
173 /* XXX use this for future optimisation of case
174 * where pattern must be upgraded to utf8. */
175 I32 uni_semantics; /* If a d charset modifier should use unicode
176 rules, even if the pattern is not in
178 HV *paren_names; /* Paren names */
180 regnode **recurse; /* Recurse regops */
181 I32 recurse_count; /* Number of recurse regops we have generated */
182 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
184 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
187 I32 override_recoding;
189 I32 recode_x_to_native;
191 I32 in_multi_char_class;
192 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
194 int code_index; /* next code_blocks[] slot */
195 SSize_t maxlen; /* mininum possible number of chars in string to match */
196 scan_frame *frame_head;
197 scan_frame *frame_last;
200 #ifdef ADD_TO_REGEXEC
201 char *starttry; /* -Dr: where regtry was called. */
202 #define RExC_starttry (pRExC_state->starttry)
204 SV *runtime_code_qr; /* qr with the runtime code blocks */
206 const char *lastparse;
208 AV *paren_name_list; /* idx -> name */
209 U32 study_chunk_recursed_count;
213 #define RExC_lastparse (pRExC_state->lastparse)
214 #define RExC_lastnum (pRExC_state->lastnum)
215 #define RExC_paren_name_list (pRExC_state->paren_name_list)
216 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
217 #define RExC_mysv (pRExC_state->mysv1)
218 #define RExC_mysv1 (pRExC_state->mysv1)
219 #define RExC_mysv2 (pRExC_state->mysv2)
229 #define RExC_flags (pRExC_state->flags)
230 #define RExC_pm_flags (pRExC_state->pm_flags)
231 #define RExC_precomp (pRExC_state->precomp)
232 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
233 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
234 #define RExC_precomp_end (pRExC_state->precomp_end)
235 #define RExC_rx_sv (pRExC_state->rx_sv)
236 #define RExC_rx (pRExC_state->rx)
237 #define RExC_rxi (pRExC_state->rxi)
238 #define RExC_start (pRExC_state->start)
239 #define RExC_end (pRExC_state->end)
240 #define RExC_parse (pRExC_state->parse)
241 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
242 #define RExC_whilem_seen (pRExC_state->whilem_seen)
243 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
244 under /d from /u ? */
247 #ifdef RE_TRACK_PATTERN_OFFSETS
248 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
251 #define RExC_emit (pRExC_state->emit)
252 #define RExC_emit_start (pRExC_state->emit_start)
253 #define RExC_sawback (pRExC_state->sawback)
254 #define RExC_seen (pRExC_state->seen)
255 #define RExC_size (pRExC_state->size)
256 #define RExC_maxlen (pRExC_state->maxlen)
257 #define RExC_npar (pRExC_state->npar)
258 #define RExC_total_parens (pRExC_state->total_par)
259 #define RExC_nestroot (pRExC_state->nestroot)
260 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
261 #define RExC_utf8 (pRExC_state->utf8)
262 #define RExC_uni_semantics (pRExC_state->uni_semantics)
263 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
264 #define RExC_open_parens (pRExC_state->open_parens)
265 #define RExC_close_parens (pRExC_state->close_parens)
266 #define RExC_end_op (pRExC_state->end_op)
267 #define RExC_paren_names (pRExC_state->paren_names)
268 #define RExC_recurse (pRExC_state->recurse)
269 #define RExC_recurse_count (pRExC_state->recurse_count)
270 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
271 #define RExC_study_chunk_recursed_bytes \
272 (pRExC_state->study_chunk_recursed_bytes)
273 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
274 #define RExC_contains_locale (pRExC_state->contains_locale)
276 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
278 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
279 #define RExC_frame_head (pRExC_state->frame_head)
280 #define RExC_frame_last (pRExC_state->frame_last)
281 #define RExC_frame_count (pRExC_state->frame_count)
282 #define RExC_strict (pRExC_state->strict)
283 #define RExC_study_started (pRExC_state->study_started)
284 #define RExC_warn_text (pRExC_state->warn_text)
285 #define RExC_in_script_run (pRExC_state->in_script_run)
286 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
288 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
289 * a flag to disable back-off on the fixed/floating substrings - if it's
290 * a high complexity pattern we assume the benefit of avoiding a full match
291 * is worth the cost of checking for the substrings even if they rarely help.
293 #define RExC_naughty (pRExC_state->naughty)
294 #define TOO_NAUGHTY (10)
295 #define MARK_NAUGHTY(add) \
296 if (RExC_naughty < TOO_NAUGHTY) \
297 RExC_naughty += (add)
298 #define MARK_NAUGHTY_EXP(exp, add) \
299 if (RExC_naughty < TOO_NAUGHTY) \
300 RExC_naughty += RExC_naughty / (exp) + (add)
302 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
303 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
304 ((*s) == '{' && regcurly(s)))
307 * Flags to be passed up and down.
309 #define WORST 0 /* Worst case. */
310 #define HASWIDTH 0x01 /* Known to not match null strings, could match
313 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
314 * character. (There needs to be a case: in the switch statement in regexec.c
315 * for any node marked SIMPLE.) Note that this is not the same thing as
318 #define SPSTART 0x04 /* Starts with * or + */
319 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
320 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
321 #define RESTART_PARSE 0x20 /* Need to redo the parse */
322 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
323 calcuate sizes as UTF-8 */
325 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
327 /* whether trie related optimizations are enabled */
328 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
329 #define TRIE_STUDY_OPT
330 #define FULL_TRIE_STUDY
336 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
337 #define PBITVAL(paren) (1 << ((paren) & 7))
338 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
339 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
340 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
342 #define REQUIRE_UTF8(flagp) STMT_START { \
344 *flagp = RESTART_PARSE|NEED_UTF8; \
349 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
350 * a flag that indicates we've changed to /u during the parse. */
351 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
353 if (DEPENDS_SEMANTICS) { \
354 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
355 RExC_uni_semantics = 1; \
356 if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \
357 /* No need to restart the parse if we haven't seen \
358 * anything that differs between /u and /d, and no need \
359 * to restart immediately if we're going to reparse \
360 * anyway to count parens */ \
361 *flagp |= RESTART_PARSE; \
362 return restart_retval; \
367 #define BRANCH_MAX_OFFSET U16_MAX
368 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
370 RExC_use_BRANCHJ = 1; \
371 if (LIKELY(RExC_total_parens >= 0)) { \
372 /* No need to restart the parse immediately if we're \
373 * going to reparse anyway to count parens */ \
374 *flagp |= RESTART_PARSE; \
375 return restart_retval; \
379 #define REQUIRE_PARENS_PASS \
381 if (RExC_total_parens == 0) RExC_total_parens = -1; \
384 /* This is used to return failure (zero) early from the calling function if
385 * various flags in 'flags' are set. Two flags always cause a return:
386 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
387 * additional flags that should cause a return; 0 if none. If the return will
388 * be done, '*flagp' is first set to be all of the flags that caused the
390 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
392 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
393 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
398 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
400 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
401 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
402 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
403 if (MUST_RESTART(*(flagp))) return 0
405 /* This converts the named class defined in regcomp.h to its equivalent class
406 * number defined in handy.h. */
407 #define namedclass_to_classnum(class) ((int) ((class) / 2))
408 #define classnum_to_namedclass(classnum) ((classnum) * 2)
410 #define _invlist_union_complement_2nd(a, b, output) \
411 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
412 #define _invlist_intersection_complement_2nd(a, b, output) \
413 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
415 /* About scan_data_t.
417 During optimisation we recurse through the regexp program performing
418 various inplace (keyhole style) optimisations. In addition study_chunk
419 and scan_commit populate this data structure with information about
420 what strings MUST appear in the pattern. We look for the longest
421 string that must appear at a fixed location, and we look for the
422 longest string that may appear at a floating location. So for instance
427 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
428 strings (because they follow a .* construct). study_chunk will identify
429 both FOO and BAR as being the longest fixed and floating strings respectively.
431 The strings can be composites, for instance
435 will result in a composite fixed substring 'foo'.
437 For each string some basic information is maintained:
440 This is the position the string must appear at, or not before.
441 It also implicitly (when combined with minlenp) tells us how many
442 characters must match before the string we are searching for.
443 Likewise when combined with minlenp and the length of the string it
444 tells us how many characters must appear after the string we have
448 Only used for floating strings. This is the rightmost point that
449 the string can appear at. If set to SSize_t_MAX it indicates that the
450 string can occur infinitely far to the right.
451 For fixed strings, it is equal to min_offset.
454 A pointer to the minimum number of characters of the pattern that the
455 string was found inside. This is important as in the case of positive
456 lookahead or positive lookbehind we can have multiple patterns
461 The minimum length of the pattern overall is 3, the minimum length
462 of the lookahead part is 3, but the minimum length of the part that
463 will actually match is 1. So 'FOO's minimum length is 3, but the
464 minimum length for the F is 1. This is important as the minimum length
465 is used to determine offsets in front of and behind the string being
466 looked for. Since strings can be composites this is the length of the
467 pattern at the time it was committed with a scan_commit. Note that
468 the length is calculated by study_chunk, so that the minimum lengths
469 are not known until the full pattern has been compiled, thus the
470 pointer to the value.
474 In the case of lookbehind the string being searched for can be
475 offset past the start point of the final matching string.
476 If this value was just blithely removed from the min_offset it would
477 invalidate some of the calculations for how many chars must match
478 before or after (as they are derived from min_offset and minlen and
479 the length of the string being searched for).
480 When the final pattern is compiled and the data is moved from the
481 scan_data_t structure into the regexp structure the information
482 about lookbehind is factored in, with the information that would
483 have been lost precalculated in the end_shift field for the
486 The fields pos_min and pos_delta are used to store the minimum offset
487 and the delta to the maximum offset at the current point in the pattern.
491 struct scan_data_substrs {
492 SV *str; /* longest substring found in pattern */
493 SSize_t min_offset; /* earliest point in string it can appear */
494 SSize_t max_offset; /* latest point in string it can appear */
495 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
496 SSize_t lookbehind; /* is the pos of the string modified by LB */
497 I32 flags; /* per substring SF_* and SCF_* flags */
500 typedef struct scan_data_t {
501 /*I32 len_min; unused */
502 /*I32 len_delta; unused */
506 SSize_t last_end; /* min value, <0 unless valid. */
507 SSize_t last_start_min;
508 SSize_t last_start_max;
509 U8 cur_is_floating; /* whether the last_* values should be set as
510 * the next fixed (0) or floating (1)
513 /* [0] is longest fixed substring so far, [1] is longest float so far */
514 struct scan_data_substrs substrs[2];
516 I32 flags; /* common SF_* and SCF_* flags */
518 SSize_t *last_closep;
519 regnode_ssc *start_class;
523 * Forward declarations for pregcomp()'s friends.
526 static const scan_data_t zero_scan_data = {
527 0, 0, NULL, 0, 0, 0, 0,
529 { NULL, 0, 0, 0, 0, 0 },
530 { NULL, 0, 0, 0, 0, 0 },
537 #define SF_BEFORE_SEOL 0x0001
538 #define SF_BEFORE_MEOL 0x0002
539 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
541 #define SF_IS_INF 0x0040
542 #define SF_HAS_PAR 0x0080
543 #define SF_IN_PAR 0x0100
544 #define SF_HAS_EVAL 0x0200
547 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
548 * longest substring in the pattern. When it is not set the optimiser keeps
549 * track of position, but does not keep track of the actual strings seen,
551 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
554 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
555 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
556 * turned off because of the alternation (BRANCH). */
557 #define SCF_DO_SUBSTR 0x0400
559 #define SCF_DO_STCLASS_AND 0x0800
560 #define SCF_DO_STCLASS_OR 0x1000
561 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
562 #define SCF_WHILEM_VISITED_POS 0x2000
564 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
565 #define SCF_SEEN_ACCEPT 0x8000
566 #define SCF_TRIE_DOING_RESTUDY 0x10000
567 #define SCF_IN_DEFINE 0x20000
572 #define UTF cBOOL(RExC_utf8)
574 /* The enums for all these are ordered so things work out correctly */
575 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
576 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
577 == REGEX_DEPENDS_CHARSET)
578 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
579 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
580 >= REGEX_UNICODE_CHARSET)
581 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
582 == REGEX_ASCII_RESTRICTED_CHARSET)
583 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
584 >= REGEX_ASCII_RESTRICTED_CHARSET)
585 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
586 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
588 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
590 /* For programs that want to be strictly Unicode compatible by dying if any
591 * attempt is made to match a non-Unicode code point against a Unicode
593 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
595 #define OOB_NAMEDCLASS -1
597 /* There is no code point that is out-of-bounds, so this is problematic. But
598 * its only current use is to initialize a variable that is always set before
600 #define OOB_UNICODE 0xDEADBEEF
602 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
605 /* length of regex to show in messages that don't mark a position within */
606 #define RegexLengthToShowInErrorMessages 127
609 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
610 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
611 * op/pragma/warn/regcomp.
613 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
614 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
616 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
617 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
619 /* The code in this file in places uses one level of recursion with parsing
620 * rebased to an alternate string constructed by us in memory. This can take
621 * the form of something that is completely different from the input, or
622 * something that uses the input as part of the alternate. In the first case,
623 * there should be no possibility of an error, as we are in complete control of
624 * the alternate string. But in the second case we don't completely control
625 * the input portion, so there may be errors in that. Here's an example:
627 * is handled specially because \x{df} folds to a sequence of more than one
628 * character: 'ss'. What is done is to create and parse an alternate string,
629 * which looks like this:
630 * /(?:\x{DF}|[abc\x{DF}def])/ui
631 * where it uses the input unchanged in the middle of something it constructs,
632 * which is a branch for the DF outside the character class, and clustering
633 * parens around the whole thing. (It knows enough to skip the DF inside the
634 * class while in this substitute parse.) 'abc' and 'def' may have errors that
635 * need to be reported. The general situation looks like this:
637 * |<------- identical ------>|
639 * Input: ---------------------------------------------------------------
640 * Constructed: ---------------------------------------------------
642 * |<------- identical ------>|
644 * sI..eI is the portion of the input pattern we are concerned with here.
645 * sC..EC is the constructed substitute parse string.
646 * sC..tC is constructed by us
647 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
648 * In the diagram, these are vertically aligned.
649 * eC..EC is also constructed by us.
650 * xC is the position in the substitute parse string where we found a
652 * xI is the position in the original pattern corresponding to xC.
654 * We want to display a message showing the real input string. Thus we need to
655 * translate from xC to xI. We know that xC >= tC, since the portion of the
656 * string sC..tC has been constructed by us, and so shouldn't have errors. We
658 * xI = tI + (xC - tC)
660 * When the substitute parse is constructed, the code needs to set:
663 * RExC_copy_start_in_input (tI)
664 * RExC_copy_start_in_constructed (tC)
665 * and restore them when done.
667 * During normal processing of the input pattern, both
668 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
669 * sI, so that xC equals xI.
672 #define sI RExC_precomp
673 #define eI RExC_precomp_end
674 #define sC RExC_start
676 #define tI RExC_copy_start_in_input
677 #define tC RExC_copy_start_in_constructed
678 #define xI(xC) (tI + (xC - tC))
679 #define xI_offset(xC) (xI(xC) - sI)
681 #define REPORT_LOCATION_ARGS(xC) \
683 (xI(xC) > eI) /* Don't run off end */ \
684 ? eI - sI /* Length before the <--HERE */ \
685 : ((xI_offset(xC) >= 0) \
687 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
688 IVdf " trying to output message for " \
690 __FILE__, __LINE__, (IV) xI_offset(xC), \
691 ((int) (eC - sC)), sC), 0)), \
692 sI), /* The input pattern printed up to the <--HERE */ \
694 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
695 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
697 /* Used to point after bad bytes for an error message, but avoid skipping
698 * past a nul byte. */
699 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
701 /* Set up to clean up after our imminent demise */
702 #define PREPARE_TO_DIE \
705 SAVEFREESV(RExC_rx_sv); \
706 if (RExC_open_parens) \
707 SAVEFREEPV(RExC_open_parens); \
708 if (RExC_close_parens) \
709 SAVEFREEPV(RExC_close_parens); \
713 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
714 * arg. Show regex, up to a maximum length. If it's too long, chop and add
717 #define _FAIL(code) STMT_START { \
718 const char *ellipses = ""; \
719 IV len = RExC_precomp_end - RExC_precomp; \
722 if (len > RegexLengthToShowInErrorMessages) { \
723 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
724 len = RegexLengthToShowInErrorMessages - 10; \
730 #define FAIL(msg) _FAIL( \
731 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
732 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
734 #define FAIL2(msg,arg) _FAIL( \
735 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
736 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
739 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
741 #define Simple_vFAIL(m) STMT_START { \
742 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
743 m, REPORT_LOCATION_ARGS(RExC_parse)); \
747 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
749 #define vFAIL(m) STMT_START { \
755 * Like Simple_vFAIL(), but accepts two arguments.
757 #define Simple_vFAIL2(m,a1) STMT_START { \
758 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
759 REPORT_LOCATION_ARGS(RExC_parse)); \
763 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
765 #define vFAIL2(m,a1) STMT_START { \
767 Simple_vFAIL2(m, a1); \
772 * Like Simple_vFAIL(), but accepts three arguments.
774 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
775 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
776 REPORT_LOCATION_ARGS(RExC_parse)); \
780 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
782 #define vFAIL3(m,a1,a2) STMT_START { \
784 Simple_vFAIL3(m, a1, a2); \
788 * Like Simple_vFAIL(), but accepts four arguments.
790 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
791 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
792 REPORT_LOCATION_ARGS(RExC_parse)); \
795 #define vFAIL4(m,a1,a2,a3) STMT_START { \
797 Simple_vFAIL4(m, a1, a2, a3); \
800 /* A specialized version of vFAIL2 that works with UTF8f */
801 #define vFAIL2utf8f(m, a1) STMT_START { \
803 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
804 REPORT_LOCATION_ARGS(RExC_parse)); \
807 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
809 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
810 REPORT_LOCATION_ARGS(RExC_parse)); \
813 /* Setting this to NULL is a signal to not output warnings */
814 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
815 #define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
817 /* Since a warning can be generated multiple times as the input is reparsed, we
818 * output it the first time we come to that point in the parse, but suppress it
819 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
820 * generate any warnings */
821 #define TO_OUTPUT_WARNINGS(loc) \
822 ( RExC_copy_start_in_constructed \
823 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
825 /* After we've emitted a warning, we save the position in the input so we don't
827 #define UPDATE_WARNINGS_LOC(loc) \
829 if (TO_OUTPUT_WARNINGS(loc)) { \
830 RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
834 /* 'warns' is the output of the packWARNx macro used in 'code' */
835 #define _WARN_HELPER(loc, warns, code) \
837 if (! RExC_copy_start_in_constructed) { \
838 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
839 " expected at '%s'", \
840 __FILE__, __LINE__, loc); \
842 if (TO_OUTPUT_WARNINGS(loc)) { \
846 UPDATE_WARNINGS_LOC(loc); \
850 /* m is not necessarily a "literal string", in this macro */
851 #define reg_warn_non_literal_string(loc, m) \
852 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
853 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
854 "%s" REPORT_LOCATION, \
855 m, REPORT_LOCATION_ARGS(loc)))
857 #define ckWARNreg(loc,m) \
858 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
859 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
861 REPORT_LOCATION_ARGS(loc)))
863 #define vWARN(loc, m) \
864 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
865 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
867 REPORT_LOCATION_ARGS(loc))) \
869 #define vWARN_dep(loc, m) \
870 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
871 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
873 REPORT_LOCATION_ARGS(loc)))
875 #define ckWARNdep(loc,m) \
876 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
877 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
879 REPORT_LOCATION_ARGS(loc)))
881 #define ckWARNregdep(loc,m) \
882 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
883 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
886 REPORT_LOCATION_ARGS(loc)))
888 #define ckWARN2reg_d(loc,m, a1) \
889 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
890 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
892 a1, REPORT_LOCATION_ARGS(loc)))
894 #define ckWARN2reg(loc, m, a1) \
895 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
896 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
898 a1, REPORT_LOCATION_ARGS(loc)))
900 #define vWARN3(loc, m, a1, a2) \
901 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
902 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
904 a1, a2, REPORT_LOCATION_ARGS(loc)))
906 #define ckWARN3reg(loc, m, a1, a2) \
907 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
908 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
911 REPORT_LOCATION_ARGS(loc)))
913 #define vWARN4(loc, m, a1, a2, a3) \
914 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
915 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
918 REPORT_LOCATION_ARGS(loc)))
920 #define ckWARN4reg(loc, m, a1, a2, a3) \
921 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
922 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
925 REPORT_LOCATION_ARGS(loc)))
927 #define vWARN5(loc, m, a1, a2, a3, a4) \
928 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
929 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
932 REPORT_LOCATION_ARGS(loc)))
934 #define ckWARNexperimental(loc, class, m) \
935 _WARN_HELPER(loc, packWARN(class), \
936 Perl_ck_warner_d(aTHX_ packWARN(class), \
938 REPORT_LOCATION_ARGS(loc)))
940 /* Convert between a pointer to a node and its offset from the beginning of the
942 #define REGNODE_p(offset) (RExC_emit_start + (offset))
943 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
945 /* Macros for recording node offsets. 20001227 mjd@plover.com
946 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
947 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
948 * Element 0 holds the number n.
949 * Position is 1 indexed.
951 #ifndef RE_TRACK_PATTERN_OFFSETS
952 #define Set_Node_Offset_To_R(offset,byte)
953 #define Set_Node_Offset(node,byte)
954 #define Set_Cur_Node_Offset
955 #define Set_Node_Length_To_R(node,len)
956 #define Set_Node_Length(node,len)
957 #define Set_Node_Cur_Length(node,start)
958 #define Node_Offset(n)
959 #define Node_Length(n)
960 #define Set_Node_Offset_Length(node,offset,len)
961 #define ProgLen(ri) ri->u.proglen
962 #define SetProgLen(ri,x) ri->u.proglen = x
963 #define Track_Code(code)
965 #define ProgLen(ri) ri->u.offsets[0]
966 #define SetProgLen(ri,x) ri->u.offsets[0] = x
967 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
968 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
969 __LINE__, (int)(offset), (int)(byte))); \
971 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
974 RExC_offsets[2*(offset)-1] = (byte); \
978 #define Set_Node_Offset(node,byte) \
979 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
980 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
982 #define Set_Node_Length_To_R(node,len) STMT_START { \
983 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
984 __LINE__, (int)(node), (int)(len))); \
986 Perl_croak(aTHX_ "value of node is %d in Length macro", \
989 RExC_offsets[2*(node)] = (len); \
993 #define Set_Node_Length(node,len) \
994 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
995 #define Set_Node_Cur_Length(node, start) \
996 Set_Node_Length(node, RExC_parse - start)
998 /* Get offsets and lengths */
999 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1000 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1002 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1003 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1004 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1007 #define Track_Code(code) STMT_START { code } STMT_END
1010 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1011 #define EXPERIMENTAL_INPLACESCAN
1012 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1016 Perl_re_printf(pTHX_ const char *fmt, ...)
1020 PerlIO *f= Perl_debug_log;
1021 PERL_ARGS_ASSERT_RE_PRINTF;
1023 result = PerlIO_vprintf(f, fmt, ap);
1029 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1033 PerlIO *f= Perl_debug_log;
1034 PERL_ARGS_ASSERT_RE_INDENTF;
1035 va_start(ap, depth);
1036 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1037 result = PerlIO_vprintf(f, fmt, ap);
1041 #endif /* DEBUGGING */
1043 #define DEBUG_RExC_seen() \
1044 DEBUG_OPTIMISE_MORE_r({ \
1045 Perl_re_printf( aTHX_ "RExC_seen: "); \
1047 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1048 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1050 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1051 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1053 if (RExC_seen & REG_GPOS_SEEN) \
1054 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1056 if (RExC_seen & REG_RECURSE_SEEN) \
1057 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1059 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1060 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1062 if (RExC_seen & REG_VERBARG_SEEN) \
1063 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1065 if (RExC_seen & REG_CUTGROUP_SEEN) \
1066 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1068 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1069 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1071 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1072 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1074 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1075 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1077 Perl_re_printf( aTHX_ "\n"); \
1080 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1081 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1086 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1087 const char *close_str)
1092 Perl_re_printf( aTHX_ "%s", open_str);
1093 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1094 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1095 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1096 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1097 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1098 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1099 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1100 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1101 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1102 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1103 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1104 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1105 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1106 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1107 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1108 Perl_re_printf( aTHX_ "%s", close_str);
1113 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1114 U32 depth, int is_inf)
1116 GET_RE_DEBUG_FLAGS_DECL;
1118 DEBUG_OPTIMISE_MORE_r({
1121 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1125 (IV)data->pos_delta,
1129 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1131 Perl_re_printf( aTHX_
1132 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1134 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1135 is_inf ? "INF " : ""
1138 if (data->last_found) {
1140 Perl_re_printf(aTHX_
1141 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1142 SvPVX_const(data->last_found),
1144 (IV)data->last_start_min,
1145 (IV)data->last_start_max
1148 for (i = 0; i < 2; i++) {
1149 Perl_re_printf(aTHX_
1150 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1151 data->cur_is_floating == i ? "*" : "",
1152 i ? "Float" : "Fixed",
1153 SvPVX_const(data->substrs[i].str),
1154 (IV)data->substrs[i].min_offset,
1155 (IV)data->substrs[i].max_offset
1157 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1161 Perl_re_printf( aTHX_ "\n");
1167 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1168 regnode *scan, U32 depth, U32 flags)
1170 GET_RE_DEBUG_FLAGS_DECL;
1177 Next = regnext(scan);
1178 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1179 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1182 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1183 Next ? (REG_NODE_NUM(Next)) : 0 );
1184 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1185 Perl_re_printf( aTHX_ "\n");
1190 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1191 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1193 # define DEBUG_PEEP(str, scan, depth, flags) \
1194 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1197 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1198 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1202 /* =========================================================
1203 * BEGIN edit_distance stuff.
1205 * This calculates how many single character changes of any type are needed to
1206 * transform a string into another one. It is taken from version 3.1 of
1208 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1211 /* Our unsorted dictionary linked list. */
1212 /* Note we use UVs, not chars. */
1217 struct dictionary* next;
1219 typedef struct dictionary item;
1222 PERL_STATIC_INLINE item*
1223 push(UV key, item* curr)
1226 Newx(head, 1, item);
1234 PERL_STATIC_INLINE item*
1235 find(item* head, UV key)
1237 item* iterator = head;
1239 if (iterator->key == key){
1242 iterator = iterator->next;
1248 PERL_STATIC_INLINE item*
1249 uniquePush(item* head, UV key)
1251 item* iterator = head;
1254 if (iterator->key == key) {
1257 iterator = iterator->next;
1260 return push(key, head);
1263 PERL_STATIC_INLINE void
1264 dict_free(item* head)
1266 item* iterator = head;
1269 item* temp = iterator;
1270 iterator = iterator->next;
1277 /* End of Dictionary Stuff */
1279 /* All calculations/work are done here */
1281 S_edit_distance(const UV* src,
1283 const STRLEN x, /* length of src[] */
1284 const STRLEN y, /* length of tgt[] */
1285 const SSize_t maxDistance
1289 UV swapCount, swapScore, targetCharCount, i, j;
1291 UV score_ceil = x + y;
1293 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1295 /* intialize matrix start values */
1296 Newx(scores, ( (x + 2) * (y + 2)), UV);
1297 scores[0] = score_ceil;
1298 scores[1 * (y + 2) + 0] = score_ceil;
1299 scores[0 * (y + 2) + 1] = score_ceil;
1300 scores[1 * (y + 2) + 1] = 0;
1301 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1306 for (i=1;i<=x;i++) {
1308 head = uniquePush(head, src[i]);
1309 scores[(i+1) * (y + 2) + 1] = i;
1310 scores[(i+1) * (y + 2) + 0] = score_ceil;
1313 for (j=1;j<=y;j++) {
1316 head = uniquePush(head, tgt[j]);
1317 scores[1 * (y + 2) + (j + 1)] = j;
1318 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1321 targetCharCount = find(head, tgt[j-1])->value;
1322 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1324 if (src[i-1] != tgt[j-1]){
1325 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1329 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1333 find(head, src[i-1])->value = i;
1337 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1340 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1344 /* END of edit_distance() stuff
1345 * ========================================================= */
1347 /* is c a control character for which we have a mnemonic? */
1348 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1351 S_cntrl_to_mnemonic(const U8 c)
1353 /* Returns the mnemonic string that represents character 'c', if one
1354 * exists; NULL otherwise. The only ones that exist for the purposes of
1355 * this routine are a few control characters */
1358 case '\a': return "\\a";
1359 case '\b': return "\\b";
1360 case ESC_NATIVE: return "\\e";
1361 case '\f': return "\\f";
1362 case '\n': return "\\n";
1363 case '\r': return "\\r";
1364 case '\t': return "\\t";
1370 /* Mark that we cannot extend a found fixed substring at this point.
1371 Update the longest found anchored substring or the longest found
1372 floating substrings if needed. */
1375 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1376 SSize_t *minlenp, int is_inf)
1378 const STRLEN l = CHR_SVLEN(data->last_found);
1379 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1380 const STRLEN old_l = CHR_SVLEN(longest_sv);
1381 GET_RE_DEBUG_FLAGS_DECL;
1383 PERL_ARGS_ASSERT_SCAN_COMMIT;
1385 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1386 const U8 i = data->cur_is_floating;
1387 SvSetMagicSV(longest_sv, data->last_found);
1388 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1391 data->substrs[0].max_offset = data->substrs[0].min_offset;
1393 data->substrs[1].max_offset = (l
1394 ? data->last_start_max
1395 : (data->pos_delta > SSize_t_MAX - data->pos_min
1397 : data->pos_min + data->pos_delta));
1399 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1400 data->substrs[1].max_offset = SSize_t_MAX;
1403 if (data->flags & SF_BEFORE_EOL)
1404 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1406 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1407 data->substrs[i].minlenp = minlenp;
1408 data->substrs[i].lookbehind = 0;
1411 SvCUR_set(data->last_found, 0);
1413 SV * const sv = data->last_found;
1414 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1415 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1420 data->last_end = -1;
1421 data->flags &= ~SF_BEFORE_EOL;
1422 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1425 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1426 * list that describes which code points it matches */
1429 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1431 /* Set the SSC 'ssc' to match an empty string or any code point */
1433 PERL_ARGS_ASSERT_SSC_ANYTHING;
1435 assert(is_ANYOF_SYNTHETIC(ssc));
1437 /* mortalize so won't leak */
1438 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1439 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1443 S_ssc_is_anything(const regnode_ssc *ssc)
1445 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1446 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1447 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1448 * in any way, so there's no point in using it */
1453 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1455 assert(is_ANYOF_SYNTHETIC(ssc));
1457 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1461 /* See if the list consists solely of the range 0 - Infinity */
1462 invlist_iterinit(ssc->invlist);
1463 ret = invlist_iternext(ssc->invlist, &start, &end)
1467 invlist_iterfinish(ssc->invlist);
1473 /* If e.g., both \w and \W are set, matches everything */
1474 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1476 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1477 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1487 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1489 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1490 * string, any code point, or any posix class under locale */
1492 PERL_ARGS_ASSERT_SSC_INIT;
1494 Zero(ssc, 1, regnode_ssc);
1495 set_ANYOF_SYNTHETIC(ssc);
1496 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1499 /* If any portion of the regex is to operate under locale rules that aren't
1500 * fully known at compile time, initialization includes it. The reason
1501 * this isn't done for all regexes is that the optimizer was written under
1502 * the assumption that locale was all-or-nothing. Given the complexity and
1503 * lack of documentation in the optimizer, and that there are inadequate
1504 * test cases for locale, many parts of it may not work properly, it is
1505 * safest to avoid locale unless necessary. */
1506 if (RExC_contains_locale) {
1507 ANYOF_POSIXL_SETALL(ssc);
1510 ANYOF_POSIXL_ZERO(ssc);
1515 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1516 const regnode_ssc *ssc)
1518 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1519 * to the list of code points matched, and locale posix classes; hence does
1520 * not check its flags) */
1525 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1527 assert(is_ANYOF_SYNTHETIC(ssc));
1529 invlist_iterinit(ssc->invlist);
1530 ret = invlist_iternext(ssc->invlist, &start, &end)
1534 invlist_iterfinish(ssc->invlist);
1540 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1548 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1549 const regnode_charclass* const node)
1551 /* Returns a mortal inversion list defining which code points are matched
1552 * by 'node', which is of type ANYOF. Handles complementing the result if
1553 * appropriate. If some code points aren't knowable at this time, the
1554 * returned list must, and will, contain every code point that is a
1558 SV* only_utf8_locale_invlist = NULL;
1560 const U32 n = ARG(node);
1561 bool new_node_has_latin1 = FALSE;
1563 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1565 /* Look at the data structure created by S_set_ANYOF_arg() */
1566 if (n != ANYOF_ONLY_HAS_BITMAP) {
1567 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1568 AV * const av = MUTABLE_AV(SvRV(rv));
1569 SV **const ary = AvARRAY(av);
1570 assert(RExC_rxi->data->what[n] == 's');
1572 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1573 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
1575 else if (ary[0] && ary[0] != &PL_sv_undef) {
1577 /* Here, no compile-time swash, and there are things that won't be
1578 * known until runtime -- we have to assume it could be anything */
1579 invlist = sv_2mortal(_new_invlist(1));
1580 return _add_range_to_invlist(invlist, 0, UV_MAX);
1582 else if (ary[3] && ary[3] != &PL_sv_undef) {
1584 /* Here no compile-time swash, and no run-time only data. Use the
1585 * node's inversion list */
1586 invlist = sv_2mortal(invlist_clone(ary[3], NULL));
1589 /* Get the code points valid only under UTF-8 locales */
1590 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1591 && ary[2] && ary[2] != &PL_sv_undef)
1593 only_utf8_locale_invlist = ary[2];
1598 invlist = sv_2mortal(_new_invlist(0));
1601 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1602 * code points, and an inversion list for the others, but if there are code
1603 * points that should match only conditionally on the target string being
1604 * UTF-8, those are placed in the inversion list, and not the bitmap.
1605 * Since there are circumstances under which they could match, they are
1606 * included in the SSC. But if the ANYOF node is to be inverted, we have
1607 * to exclude them here, so that when we invert below, the end result
1608 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1609 * have to do this here before we add the unconditionally matched code
1611 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1612 _invlist_intersection_complement_2nd(invlist,
1617 /* Add in the points from the bit map */
1618 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1619 if (ANYOF_BITMAP_TEST(node, i)) {
1620 unsigned int start = i++;
1622 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1625 invlist = _add_range_to_invlist(invlist, start, i-1);
1626 new_node_has_latin1 = TRUE;
1630 /* If this can match all upper Latin1 code points, have to add them
1631 * as well. But don't add them if inverting, as when that gets done below,
1632 * it would exclude all these characters, including the ones it shouldn't
1633 * that were added just above */
1634 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
1635 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1637 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1640 /* Similarly for these */
1641 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1642 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1645 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1646 _invlist_invert(invlist);
1648 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1650 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1651 * locale. We can skip this if there are no 0-255 at all. */
1652 _invlist_union(invlist, PL_Latin1, &invlist);
1655 /* Similarly add the UTF-8 locale possible matches. These have to be
1656 * deferred until after the non-UTF-8 locale ones are taken care of just
1657 * above, or it leads to wrong results under ANYOF_INVERT */
1658 if (only_utf8_locale_invlist) {
1659 _invlist_union_maybe_complement_2nd(invlist,
1660 only_utf8_locale_invlist,
1661 ANYOF_FLAGS(node) & ANYOF_INVERT,
1668 /* These two functions currently do the exact same thing */
1669 #define ssc_init_zero ssc_init
1671 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1672 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1674 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1675 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1676 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1679 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1680 const regnode_charclass *and_with)
1682 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1683 * another SSC or a regular ANYOF class. Can create false positives. */
1688 PERL_ARGS_ASSERT_SSC_AND;
1690 assert(is_ANYOF_SYNTHETIC(ssc));
1692 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1693 * the code point inversion list and just the relevant flags */
1694 if (is_ANYOF_SYNTHETIC(and_with)) {
1695 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1696 anded_flags = ANYOF_FLAGS(and_with);
1698 /* XXX This is a kludge around what appears to be deficiencies in the
1699 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1700 * there are paths through the optimizer where it doesn't get weeded
1701 * out when it should. And if we don't make some extra provision for
1702 * it like the code just below, it doesn't get added when it should.
1703 * This solution is to add it only when AND'ing, which is here, and
1704 * only when what is being AND'ed is the pristine, original node
1705 * matching anything. Thus it is like adding it to ssc_anything() but
1706 * only when the result is to be AND'ed. Probably the same solution
1707 * could be adopted for the same problem we have with /l matching,
1708 * which is solved differently in S_ssc_init(), and that would lead to
1709 * fewer false positives than that solution has. But if this solution
1710 * creates bugs, the consequences are only that a warning isn't raised
1711 * that should be; while the consequences for having /l bugs is
1712 * incorrect matches */
1713 if (ssc_is_anything((regnode_ssc *)and_with)) {
1714 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1718 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1719 if (OP(and_with) == ANYOFD) {
1720 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1723 anded_flags = ANYOF_FLAGS(and_with)
1724 &( ANYOF_COMMON_FLAGS
1725 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1726 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1727 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1729 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1734 ANYOF_FLAGS(ssc) &= anded_flags;
1736 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1737 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1738 * 'and_with' may be inverted. When not inverted, we have the situation of
1740 * (C1 | P1) & (C2 | P2)
1741 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1742 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1743 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1744 * <= ((C1 & C2) | P1 | P2)
1745 * Alternatively, the last few steps could be:
1746 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1747 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1748 * <= (C1 | C2 | (P1 & P2))
1749 * We favor the second approach if either P1 or P2 is non-empty. This is
1750 * because these components are a barrier to doing optimizations, as what
1751 * they match cannot be known until the moment of matching as they are
1752 * dependent on the current locale, 'AND"ing them likely will reduce or
1754 * But we can do better if we know that C1,P1 are in their initial state (a
1755 * frequent occurrence), each matching everything:
1756 * (<everything>) & (C2 | P2) = C2 | P2
1757 * Similarly, if C2,P2 are in their initial state (again a frequent
1758 * occurrence), the result is a no-op
1759 * (C1 | P1) & (<everything>) = C1 | P1
1762 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1763 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1764 * <= (C1 & ~C2) | (P1 & ~P2)
1767 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1768 && ! is_ANYOF_SYNTHETIC(and_with))
1772 ssc_intersection(ssc,
1774 FALSE /* Has already been inverted */
1777 /* If either P1 or P2 is empty, the intersection will be also; can skip
1779 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1780 ANYOF_POSIXL_ZERO(ssc);
1782 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1784 /* Note that the Posix class component P from 'and_with' actually
1786 * P = Pa | Pb | ... | Pn
1787 * where each component is one posix class, such as in [\w\s].
1789 * ~P = ~(Pa | Pb | ... | Pn)
1790 * = ~Pa & ~Pb & ... & ~Pn
1791 * <= ~Pa | ~Pb | ... | ~Pn
1792 * The last is something we can easily calculate, but unfortunately
1793 * is likely to have many false positives. We could do better
1794 * in some (but certainly not all) instances if two classes in
1795 * P have known relationships. For example
1796 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1798 * :lower: & :print: = :lower:
1799 * And similarly for classes that must be disjoint. For example,
1800 * since \s and \w can have no elements in common based on rules in
1801 * the POSIX standard,
1802 * \w & ^\S = nothing
1803 * Unfortunately, some vendor locales do not meet the Posix
1804 * standard, in particular almost everything by Microsoft.
1805 * The loop below just changes e.g., \w into \W and vice versa */
1807 regnode_charclass_posixl temp;
1808 int add = 1; /* To calculate the index of the complement */
1810 Zero(&temp, 1, regnode_charclass_posixl);
1811 ANYOF_POSIXL_ZERO(&temp);
1812 for (i = 0; i < ANYOF_MAX; i++) {
1814 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1815 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1817 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1818 ANYOF_POSIXL_SET(&temp, i + add);
1820 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1822 ANYOF_POSIXL_AND(&temp, ssc);
1824 } /* else ssc already has no posixes */
1825 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1826 in its initial state */
1827 else if (! is_ANYOF_SYNTHETIC(and_with)
1828 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1830 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1831 * copy it over 'ssc' */
1832 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1833 if (is_ANYOF_SYNTHETIC(and_with)) {
1834 StructCopy(and_with, ssc, regnode_ssc);
1837 ssc->invlist = anded_cp_list;
1838 ANYOF_POSIXL_ZERO(ssc);
1839 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1840 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1844 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1845 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1847 /* One or the other of P1, P2 is non-empty. */
1848 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1849 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1851 ssc_union(ssc, anded_cp_list, FALSE);
1853 else { /* P1 = P2 = empty */
1854 ssc_intersection(ssc, anded_cp_list, FALSE);
1860 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1861 const regnode_charclass *or_with)
1863 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1864 * another SSC or a regular ANYOF class. Can create false positives if
1865 * 'or_with' is to be inverted. */
1870 PERL_ARGS_ASSERT_SSC_OR;
1872 assert(is_ANYOF_SYNTHETIC(ssc));
1874 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1875 * the code point inversion list and just the relevant flags */
1876 if (is_ANYOF_SYNTHETIC(or_with)) {
1877 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1878 ored_flags = ANYOF_FLAGS(or_with);
1881 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1882 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1883 if (OP(or_with) != ANYOFD) {
1885 |= ANYOF_FLAGS(or_with)
1886 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1887 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1888 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1890 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1895 ANYOF_FLAGS(ssc) |= ored_flags;
1897 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1898 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1899 * 'or_with' may be inverted. When not inverted, we have the simple
1900 * situation of computing:
1901 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1902 * If P1|P2 yields a situation with both a class and its complement are
1903 * set, like having both \w and \W, this matches all code points, and we
1904 * can delete these from the P component of the ssc going forward. XXX We
1905 * might be able to delete all the P components, but I (khw) am not certain
1906 * about this, and it is better to be safe.
1909 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1910 * <= (C1 | P1) | ~C2
1911 * <= (C1 | ~C2) | P1
1912 * (which results in actually simpler code than the non-inverted case)
1915 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1916 && ! is_ANYOF_SYNTHETIC(or_with))
1918 /* We ignore P2, leaving P1 going forward */
1919 } /* else Not inverted */
1920 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1921 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1922 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1924 for (i = 0; i < ANYOF_MAX; i += 2) {
1925 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1927 ssc_match_all_cp(ssc);
1928 ANYOF_POSIXL_CLEAR(ssc, i);
1929 ANYOF_POSIXL_CLEAR(ssc, i+1);
1937 FALSE /* Already has been inverted */
1941 PERL_STATIC_INLINE void
1942 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1944 PERL_ARGS_ASSERT_SSC_UNION;
1946 assert(is_ANYOF_SYNTHETIC(ssc));
1948 _invlist_union_maybe_complement_2nd(ssc->invlist,
1954 PERL_STATIC_INLINE void
1955 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1957 const bool invert2nd)
1959 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1961 assert(is_ANYOF_SYNTHETIC(ssc));
1963 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1969 PERL_STATIC_INLINE void
1970 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1972 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1974 assert(is_ANYOF_SYNTHETIC(ssc));
1976 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1979 PERL_STATIC_INLINE void
1980 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1982 /* AND just the single code point 'cp' into the SSC 'ssc' */
1984 SV* cp_list = _new_invlist(2);
1986 PERL_ARGS_ASSERT_SSC_CP_AND;
1988 assert(is_ANYOF_SYNTHETIC(ssc));
1990 cp_list = add_cp_to_invlist(cp_list, cp);
1991 ssc_intersection(ssc, cp_list,
1992 FALSE /* Not inverted */
1994 SvREFCNT_dec_NN(cp_list);
1997 PERL_STATIC_INLINE void
1998 S_ssc_clear_locale(regnode_ssc *ssc)
2000 /* Set the SSC 'ssc' to not match any locale things */
2001 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2003 assert(is_ANYOF_SYNTHETIC(ssc));
2005 ANYOF_POSIXL_ZERO(ssc);
2006 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2009 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2012 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2014 /* The synthetic start class is used to hopefully quickly winnow down
2015 * places where a pattern could start a match in the target string. If it
2016 * doesn't really narrow things down that much, there isn't much point to
2017 * having the overhead of using it. This function uses some very crude
2018 * heuristics to decide if to use the ssc or not.
2020 * It returns TRUE if 'ssc' rules out more than half what it considers to
2021 * be the "likely" possible matches, but of course it doesn't know what the
2022 * actual things being matched are going to be; these are only guesses
2024 * For /l matches, it assumes that the only likely matches are going to be
2025 * in the 0-255 range, uniformly distributed, so half of that is 127
2026 * For /a and /d matches, it assumes that the likely matches will be just
2027 * the ASCII range, so half of that is 63
2028 * For /u and there isn't anything matching above the Latin1 range, it
2029 * assumes that that is the only range likely to be matched, and uses
2030 * half that as the cut-off: 127. If anything matches above Latin1,
2031 * it assumes that all of Unicode could match (uniformly), except for
2032 * non-Unicode code points and things in the General Category "Other"
2033 * (unassigned, private use, surrogates, controls and formats). This
2034 * is a much large number. */
2036 U32 count = 0; /* Running total of number of code points matched by
2038 UV start, end; /* Start and end points of current range in inversion
2040 const U32 max_code_points = (LOC)
2042 : (( ! UNI_SEMANTICS
2043 || invlist_highest(ssc->invlist) < 256)
2046 const U32 max_match = max_code_points / 2;
2048 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2050 invlist_iterinit(ssc->invlist);
2051 while (invlist_iternext(ssc->invlist, &start, &end)) {
2052 if (start >= max_code_points) {
2055 end = MIN(end, max_code_points - 1);
2056 count += end - start + 1;
2057 if (count >= max_match) {
2058 invlist_iterfinish(ssc->invlist);
2068 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2070 /* The inversion list in the SSC is marked mortal; now we need a more
2071 * permanent copy, which is stored the same way that is done in a regular
2072 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2075 SV* invlist = invlist_clone(ssc->invlist, NULL);
2077 PERL_ARGS_ASSERT_SSC_FINALIZE;
2079 assert(is_ANYOF_SYNTHETIC(ssc));
2081 /* The code in this file assumes that all but these flags aren't relevant
2082 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2083 * by the time we reach here */
2084 assert(! (ANYOF_FLAGS(ssc)
2085 & ~( ANYOF_COMMON_FLAGS
2086 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2087 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2089 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2091 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
2092 NULL, NULL, NULL, FALSE);
2094 /* Make sure is clone-safe */
2095 ssc->invlist = NULL;
2097 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2098 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2099 OP(ssc) = ANYOFPOSIXL;
2101 else if (RExC_contains_locale) {
2105 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2108 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2109 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2110 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2111 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2112 ? (TRIE_LIST_CUR( idx ) - 1) \
2118 dump_trie(trie,widecharmap,revcharmap)
2119 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2120 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2122 These routines dump out a trie in a somewhat readable format.
2123 The _interim_ variants are used for debugging the interim
2124 tables that are used to generate the final compressed
2125 representation which is what dump_trie expects.
2127 Part of the reason for their existence is to provide a form
2128 of documentation as to how the different representations function.
2133 Dumps the final compressed table form of the trie to Perl_debug_log.
2134 Used for debugging make_trie().
2138 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2139 AV *revcharmap, U32 depth)
2142 SV *sv=sv_newmortal();
2143 int colwidth= widecharmap ? 6 : 4;
2145 GET_RE_DEBUG_FLAGS_DECL;
2147 PERL_ARGS_ASSERT_DUMP_TRIE;
2149 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2150 depth+1, "Match","Base","Ofs" );
2152 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2153 SV ** const tmp = av_fetch( revcharmap, state, 0);
2155 Perl_re_printf( aTHX_ "%*s",
2157 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2158 PL_colors[0], PL_colors[1],
2159 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2160 PERL_PV_ESCAPE_FIRSTCHAR
2165 Perl_re_printf( aTHX_ "\n");
2166 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2168 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2169 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2170 Perl_re_printf( aTHX_ "\n");
2172 for( state = 1 ; state < trie->statecount ; state++ ) {
2173 const U32 base = trie->states[ state ].trans.base;
2175 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2177 if ( trie->states[ state ].wordnum ) {
2178 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2180 Perl_re_printf( aTHX_ "%6s", "" );
2183 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2188 while( ( base + ofs < trie->uniquecharcount ) ||
2189 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2190 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2194 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2196 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2197 if ( ( base + ofs >= trie->uniquecharcount )
2198 && ( base + ofs - trie->uniquecharcount
2200 && trie->trans[ base + ofs
2201 - trie->uniquecharcount ].check == state )
2203 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2204 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2207 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2211 Perl_re_printf( aTHX_ "]");
2214 Perl_re_printf( aTHX_ "\n" );
2216 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2218 for (word=1; word <= trie->wordcount; word++) {
2219 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2220 (int)word, (int)(trie->wordinfo[word].prev),
2221 (int)(trie->wordinfo[word].len));
2223 Perl_re_printf( aTHX_ "\n" );
2226 Dumps a fully constructed but uncompressed trie in list form.
2227 List tries normally only are used for construction when the number of
2228 possible chars (trie->uniquecharcount) is very high.
2229 Used for debugging make_trie().
2232 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2233 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2237 SV *sv=sv_newmortal();
2238 int colwidth= widecharmap ? 6 : 4;
2239 GET_RE_DEBUG_FLAGS_DECL;
2241 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2243 /* print out the table precompression. */
2244 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2246 Perl_re_indentf( aTHX_ "%s",
2247 depth+1, "------:-----+-----------------\n" );
2249 for( state=1 ; state < next_alloc ; state ++ ) {
2252 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2253 depth+1, (UV)state );
2254 if ( ! trie->states[ state ].wordnum ) {
2255 Perl_re_printf( aTHX_ "%5s| ","");
2257 Perl_re_printf( aTHX_ "W%4x| ",
2258 trie->states[ state ].wordnum
2261 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2262 SV ** const tmp = av_fetch( revcharmap,
2263 TRIE_LIST_ITEM(state, charid).forid, 0);
2265 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2267 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2269 PL_colors[0], PL_colors[1],
2270 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2271 | PERL_PV_ESCAPE_FIRSTCHAR
2273 TRIE_LIST_ITEM(state, charid).forid,
2274 (UV)TRIE_LIST_ITEM(state, charid).newstate
2277 Perl_re_printf( aTHX_ "\n%*s| ",
2278 (int)((depth * 2) + 14), "");
2281 Perl_re_printf( aTHX_ "\n");
2286 Dumps a fully constructed but uncompressed trie in table form.
2287 This is the normal DFA style state transition table, with a few
2288 twists to facilitate compression later.
2289 Used for debugging make_trie().
2292 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2293 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2298 SV *sv=sv_newmortal();
2299 int colwidth= widecharmap ? 6 : 4;
2300 GET_RE_DEBUG_FLAGS_DECL;
2302 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2305 print out the table precompression so that we can do a visual check
2306 that they are identical.
2309 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2311 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2312 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2314 Perl_re_printf( aTHX_ "%*s",
2316 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2317 PL_colors[0], PL_colors[1],
2318 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2319 PERL_PV_ESCAPE_FIRSTCHAR
2325 Perl_re_printf( aTHX_ "\n");
2326 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2328 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2329 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2332 Perl_re_printf( aTHX_ "\n" );
2334 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2336 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2338 (UV)TRIE_NODENUM( state ) );
2340 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2341 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2343 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2345 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2347 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2348 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2349 (UV)trie->trans[ state ].check );
2351 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2352 (UV)trie->trans[ state ].check,
2353 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2361 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2362 startbranch: the first branch in the whole branch sequence
2363 first : start branch of sequence of branch-exact nodes.
2364 May be the same as startbranch
2365 last : Thing following the last branch.
2366 May be the same as tail.
2367 tail : item following the branch sequence
2368 count : words in the sequence
2369 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2370 depth : indent depth
2372 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2374 A trie is an N'ary tree where the branches are determined by digital
2375 decomposition of the key. IE, at the root node you look up the 1st character and
2376 follow that branch repeat until you find the end of the branches. Nodes can be
2377 marked as "accepting" meaning they represent a complete word. Eg:
2381 would convert into the following structure. Numbers represent states, letters
2382 following numbers represent valid transitions on the letter from that state, if
2383 the number is in square brackets it represents an accepting state, otherwise it
2384 will be in parenthesis.
2386 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2390 (1) +-i->(6)-+-s->[7]
2392 +-s->(3)-+-h->(4)-+-e->[5]
2394 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2396 This shows that when matching against the string 'hers' we will begin at state 1
2397 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2398 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2399 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2400 single traverse. We store a mapping from accepting to state to which word was
2401 matched, and then when we have multiple possibilities we try to complete the
2402 rest of the regex in the order in which they occurred in the alternation.
2404 The only prior NFA like behaviour that would be changed by the TRIE support is
2405 the silent ignoring of duplicate alternations which are of the form:
2407 / (DUPE|DUPE) X? (?{ ... }) Y /x
2409 Thus EVAL blocks following a trie may be called a different number of times with
2410 and without the optimisation. With the optimisations dupes will be silently
2411 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2412 the following demonstrates:
2414 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2416 which prints out 'word' three times, but
2418 'words'=~/(word|word|word)(?{ print $1 })S/
2420 which doesnt print it out at all. This is due to other optimisations kicking in.
2422 Example of what happens on a structural level:
2424 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2426 1: CURLYM[1] {1,32767}(18)
2437 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2438 and should turn into:
2440 1: CURLYM[1] {1,32767}(18)
2442 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2450 Cases where tail != last would be like /(?foo|bar)baz/:
2460 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2461 and would end up looking like:
2464 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2471 d = uvchr_to_utf8_flags(d, uv, 0);
2473 is the recommended Unicode-aware way of saying
2478 #define TRIE_STORE_REVCHAR(val) \
2481 SV *zlopp = newSV(UTF8_MAXBYTES); \
2482 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2483 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2484 SvCUR_set(zlopp, kapow - flrbbbbb); \
2487 av_push(revcharmap, zlopp); \
2489 char ooooff = (char)val; \
2490 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2494 /* This gets the next character from the input, folding it if not already
2496 #define TRIE_READ_CHAR STMT_START { \
2499 /* if it is UTF then it is either already folded, or does not need \
2501 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2503 else if (folder == PL_fold_latin1) { \
2504 /* This folder implies Unicode rules, which in the range expressible \
2505 * by not UTF is the lower case, with the two exceptions, one of \
2506 * which should have been taken care of before calling this */ \
2507 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2508 uvc = toLOWER_L1(*uc); \
2509 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2512 /* raw data, will be folded later if needed */ \
2520 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2521 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2522 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2523 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2524 TRIE_LIST_LEN( state ) = ging; \
2526 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2527 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2528 TRIE_LIST_CUR( state )++; \
2531 #define TRIE_LIST_NEW(state) STMT_START { \
2532 Newx( trie->states[ state ].trans.list, \
2533 4, reg_trie_trans_le ); \
2534 TRIE_LIST_CUR( state ) = 1; \
2535 TRIE_LIST_LEN( state ) = 4; \
2538 #define TRIE_HANDLE_WORD(state) STMT_START { \
2539 U16 dupe= trie->states[ state ].wordnum; \
2540 regnode * const noper_next = regnext( noper ); \
2543 /* store the word for dumping */ \
2545 if (OP(noper) != NOTHING) \
2546 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2548 tmp = newSVpvn_utf8( "", 0, UTF ); \
2549 av_push( trie_words, tmp ); \
2553 trie->wordinfo[curword].prev = 0; \
2554 trie->wordinfo[curword].len = wordlen; \
2555 trie->wordinfo[curword].accept = state; \
2557 if ( noper_next < tail ) { \
2559 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2561 trie->jump[curword] = (U16)(noper_next - convert); \
2563 jumper = noper_next; \
2565 nextbranch= regnext(cur); \
2569 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2570 /* chain, so that when the bits of chain are later */\
2571 /* linked together, the dups appear in the chain */\
2572 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2573 trie->wordinfo[dupe].prev = curword; \
2575 /* we haven't inserted this word yet. */ \
2576 trie->states[ state ].wordnum = curword; \
2581 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2582 ( ( base + charid >= ucharcount \
2583 && base + charid < ubound \
2584 && state == trie->trans[ base - ucharcount + charid ].check \
2585 && trie->trans[ base - ucharcount + charid ].next ) \
2586 ? trie->trans[ base - ucharcount + charid ].next \
2587 : ( state==1 ? special : 0 ) \
2590 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2592 TRIE_BITMAP_SET(trie, uvc); \
2593 /* store the folded codepoint */ \
2595 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2598 /* store first byte of utf8 representation of */ \
2599 /* variant codepoints */ \
2600 if (! UVCHR_IS_INVARIANT(uvc)) { \
2601 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2606 #define MADE_JUMP_TRIE 2
2607 #define MADE_EXACT_TRIE 4
2610 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2611 regnode *first, regnode *last, regnode *tail,
2612 U32 word_count, U32 flags, U32 depth)
2614 /* first pass, loop through and scan words */
2615 reg_trie_data *trie;
2616 HV *widecharmap = NULL;
2617 AV *revcharmap = newAV();
2623 regnode *jumper = NULL;
2624 regnode *nextbranch = NULL;
2625 regnode *convert = NULL;
2626 U32 *prev_states; /* temp array mapping each state to previous one */
2627 /* we just use folder as a flag in utf8 */
2628 const U8 * folder = NULL;
2630 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2631 * which stands for one trie structure, one hash, optionally followed
2634 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2635 AV *trie_words = NULL;
2636 /* along with revcharmap, this only used during construction but both are
2637 * useful during debugging so we store them in the struct when debugging.
2640 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2641 STRLEN trie_charcount=0;
2643 SV *re_trie_maxbuff;
2644 GET_RE_DEBUG_FLAGS_DECL;
2646 PERL_ARGS_ASSERT_MAKE_TRIE;
2648 PERL_UNUSED_ARG(depth);
2652 case EXACT: case EXACT_ONLY8: case EXACTL: break;
2656 case EXACTFLU8: folder = PL_fold_latin1; break;
2657 case EXACTF: folder = PL_fold; break;
2658 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2661 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2663 trie->startstate = 1;
2664 trie->wordcount = word_count;
2665 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2666 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2667 if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
2668 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2669 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2670 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2673 trie_words = newAV();
2676 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2677 assert(re_trie_maxbuff);
2678 if (!SvIOK(re_trie_maxbuff)) {
2679 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2681 DEBUG_TRIE_COMPILE_r({
2682 Perl_re_indentf( aTHX_
2683 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2685 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2686 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2689 /* Find the node we are going to overwrite */
2690 if ( first == startbranch && OP( last ) != BRANCH ) {
2691 /* whole branch chain */
2694 /* branch sub-chain */
2695 convert = NEXTOPER( first );
2698 /* -- First loop and Setup --
2700 We first traverse the branches and scan each word to determine if it
2701 contains widechars, and how many unique chars there are, this is
2702 important as we have to build a table with at least as many columns as we
2705 We use an array of integers to represent the character codes 0..255
2706 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2707 the native representation of the character value as the key and IV's for
2710 *TODO* If we keep track of how many times each character is used we can
2711 remap the columns so that the table compression later on is more
2712 efficient in terms of memory by ensuring the most common value is in the
2713 middle and the least common are on the outside. IMO this would be better
2714 than a most to least common mapping as theres a decent chance the most
2715 common letter will share a node with the least common, meaning the node
2716 will not be compressible. With a middle is most common approach the worst
2717 case is when we have the least common nodes twice.
2721 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2722 regnode *noper = NEXTOPER( cur );
2726 U32 wordlen = 0; /* required init */
2727 STRLEN minchars = 0;
2728 STRLEN maxchars = 0;
2729 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2732 if (OP(noper) == NOTHING) {
2733 /* skip past a NOTHING at the start of an alternation
2734 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2736 regnode *noper_next= regnext(noper);
2737 if (noper_next < tail)
2742 && ( OP(noper) == flags
2743 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2744 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
2745 || OP(noper) == EXACTFU_SS))) )
2747 uc= (U8*)STRING(noper);
2748 e= uc + STR_LEN(noper);
2755 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2756 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2757 regardless of encoding */
2758 if (OP( noper ) == EXACTFU_SS) {
2759 /* false positives are ok, so just set this */
2760 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2764 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2766 TRIE_CHARCOUNT(trie)++;
2769 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2770 * is in effect. Under /i, this character can match itself, or
2771 * anything that folds to it. If not under /i, it can match just
2772 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2773 * all fold to k, and all are single characters. But some folds
2774 * expand to more than one character, so for example LATIN SMALL
2775 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2776 * the string beginning at 'uc' is 'ffi', it could be matched by
2777 * three characters, or just by the one ligature character. (It
2778 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2779 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2780 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2781 * match.) The trie needs to know the minimum and maximum number
2782 * of characters that could match so that it can use size alone to
2783 * quickly reject many match attempts. The max is simple: it is
2784 * the number of folded characters in this branch (since a fold is
2785 * never shorter than what folds to it. */
2789 /* And the min is equal to the max if not under /i (indicated by
2790 * 'folder' being NULL), or there are no multi-character folds. If
2791 * there is a multi-character fold, the min is incremented just
2792 * once, for the character that folds to the sequence. Each
2793 * character in the sequence needs to be added to the list below of
2794 * characters in the trie, but we count only the first towards the
2795 * min number of characters needed. This is done through the
2796 * variable 'foldlen', which is returned by the macros that look
2797 * for these sequences as the number of bytes the sequence
2798 * occupies. Each time through the loop, we decrement 'foldlen' by
2799 * how many bytes the current char occupies. Only when it reaches
2800 * 0 do we increment 'minchars' or look for another multi-character
2802 if (folder == NULL) {
2805 else if (foldlen > 0) {
2806 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2811 /* See if *uc is the beginning of a multi-character fold. If
2812 * so, we decrement the length remaining to look at, to account
2813 * for the current character this iteration. (We can use 'uc'
2814 * instead of the fold returned by TRIE_READ_CHAR because for
2815 * non-UTF, the latin1_safe macro is smart enough to account
2816 * for all the unfolded characters, and because for UTF, the
2817 * string will already have been folded earlier in the
2818 * compilation process */
2820 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2821 foldlen -= UTF8SKIP(uc);
2824 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2829 /* The current character (and any potential folds) should be added
2830 * to the possible matching characters for this position in this
2834 U8 folded= folder[ (U8) uvc ];
2835 if ( !trie->charmap[ folded ] ) {
2836 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2837 TRIE_STORE_REVCHAR( folded );
2840 if ( !trie->charmap[ uvc ] ) {
2841 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2842 TRIE_STORE_REVCHAR( uvc );
2845 /* store the codepoint in the bitmap, and its folded
2847 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2848 set_bit = 0; /* We've done our bit :-) */
2852 /* XXX We could come up with the list of code points that fold
2853 * to this using PL_utf8_foldclosures, except not for
2854 * multi-char folds, as there may be multiple combinations
2855 * there that could work, which needs to wait until runtime to
2856 * resolve (The comment about LIGATURE FFI above is such an
2861 widecharmap = newHV();
2863 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2866 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2868 if ( !SvTRUE( *svpp ) ) {
2869 sv_setiv( *svpp, ++trie->uniquecharcount );
2870 TRIE_STORE_REVCHAR(uvc);
2873 } /* end loop through characters in this branch of the trie */
2875 /* We take the min and max for this branch and combine to find the min
2876 * and max for all branches processed so far */
2877 if( cur == first ) {
2878 trie->minlen = minchars;
2879 trie->maxlen = maxchars;
2880 } else if (minchars < trie->minlen) {
2881 trie->minlen = minchars;
2882 } else if (maxchars > trie->maxlen) {
2883 trie->maxlen = maxchars;
2885 } /* end first pass */
2886 DEBUG_TRIE_COMPILE_r(
2887 Perl_re_indentf( aTHX_
2888 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2890 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2891 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2892 (int)trie->minlen, (int)trie->maxlen )
2896 We now know what we are dealing with in terms of unique chars and
2897 string sizes so we can calculate how much memory a naive
2898 representation using a flat table will take. If it's over a reasonable
2899 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2900 conservative but potentially much slower representation using an array
2903 At the end we convert both representations into the same compressed
2904 form that will be used in regexec.c for matching with. The latter
2905 is a form that cannot be used to construct with but has memory
2906 properties similar to the list form and access properties similar
2907 to the table form making it both suitable for fast searches and
2908 small enough that its feasable to store for the duration of a program.
2910 See the comment in the code where the compressed table is produced
2911 inplace from the flat tabe representation for an explanation of how
2912 the compression works.
2917 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2920 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2921 > SvIV(re_trie_maxbuff) )
2924 Second Pass -- Array Of Lists Representation
2926 Each state will be represented by a list of charid:state records
2927 (reg_trie_trans_le) the first such element holds the CUR and LEN
2928 points of the allocated array. (See defines above).
2930 We build the initial structure using the lists, and then convert
2931 it into the compressed table form which allows faster lookups
2932 (but cant be modified once converted).
2935 STRLEN transcount = 1;
2937 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
2940 trie->states = (reg_trie_state *)
2941 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2942 sizeof(reg_trie_state) );
2946 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2948 regnode *noper = NEXTOPER( cur );
2949 U32 state = 1; /* required init */
2950 U16 charid = 0; /* sanity init */
2951 U32 wordlen = 0; /* required init */
2953 if (OP(noper) == NOTHING) {
2954 regnode *noper_next= regnext(noper);
2955 if (noper_next < tail)
2960 && ( OP(noper) == flags
2961 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2962 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
2963 || OP(noper) == EXACTFU_SS))) )
2965 const U8 *uc= (U8*)STRING(noper);
2966 const U8 *e= uc + STR_LEN(noper);
2968 for ( ; uc < e ; uc += len ) {
2973 charid = trie->charmap[ uvc ];
2975 SV** const svpp = hv_fetch( widecharmap,
2982 charid=(U16)SvIV( *svpp );
2985 /* charid is now 0 if we dont know the char read, or
2986 * nonzero if we do */
2993 if ( !trie->states[ state ].trans.list ) {
2994 TRIE_LIST_NEW( state );
2997 check <= TRIE_LIST_USED( state );
3000 if ( TRIE_LIST_ITEM( state, check ).forid
3003 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3008 newstate = next_alloc++;
3009 prev_states[newstate] = state;
3010 TRIE_LIST_PUSH( state, charid, newstate );
3015 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3019 TRIE_HANDLE_WORD(state);
3021 } /* end second pass */
3023 /* next alloc is the NEXT state to be allocated */
3024 trie->statecount = next_alloc;
3025 trie->states = (reg_trie_state *)
3026 PerlMemShared_realloc( trie->states,
3028 * sizeof(reg_trie_state) );
3030 /* and now dump it out before we compress it */
3031 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3032 revcharmap, next_alloc,
3036 trie->trans = (reg_trie_trans *)
3037 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3044 for( state=1 ; state < next_alloc ; state ++ ) {
3048 DEBUG_TRIE_COMPILE_MORE_r(
3049 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3053 if (trie->states[state].trans.list) {
3054 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3058 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3059 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3060 if ( forid < minid ) {
3062 } else if ( forid > maxid ) {
3066 if ( transcount < tp + maxid - minid + 1) {
3068 trie->trans = (reg_trie_trans *)
3069 PerlMemShared_realloc( trie->trans,
3071 * sizeof(reg_trie_trans) );
3072 Zero( trie->trans + (transcount / 2),
3076 base = trie->uniquecharcount + tp - minid;
3077 if ( maxid == minid ) {
3079 for ( ; zp < tp ; zp++ ) {
3080 if ( ! trie->trans[ zp ].next ) {
3081 base = trie->uniquecharcount + zp - minid;
3082 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3084 trie->trans[ zp ].check = state;
3090 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3092 trie->trans[ tp ].check = state;
3097 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3098 const U32 tid = base
3099 - trie->uniquecharcount
3100 + TRIE_LIST_ITEM( state, idx ).forid;
3101 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3103 trie->trans[ tid ].check = state;
3105 tp += ( maxid - minid + 1 );
3107 Safefree(trie->states[ state ].trans.list);
3110 DEBUG_TRIE_COMPILE_MORE_r(
3111 Perl_re_printf( aTHX_ " base: %d\n",base);
3114 trie->states[ state ].trans.base=base;
3116 trie->lasttrans = tp + 1;
3120 Second Pass -- Flat Table Representation.
3122 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3123 each. We know that we will need Charcount+1 trans at most to store
3124 the data (one row per char at worst case) So we preallocate both
3125 structures assuming worst case.
3127 We then construct the trie using only the .next slots of the entry
3130 We use the .check field of the first entry of the node temporarily
3131 to make compression both faster and easier by keeping track of how
3132 many non zero fields are in the node.
3134 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3137 There are two terms at use here: state as a TRIE_NODEIDX() which is
3138 a number representing the first entry of the node, and state as a
3139 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3140 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3141 if there are 2 entrys per node. eg:
3149 The table is internally in the right hand, idx form. However as we
3150 also have to deal with the states array which is indexed by nodenum
3151 we have to use TRIE_NODENUM() to convert.
3154 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3157 trie->trans = (reg_trie_trans *)
3158 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3159 * trie->uniquecharcount + 1,
3160 sizeof(reg_trie_trans) );
3161 trie->states = (reg_trie_state *)
3162 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3163 sizeof(reg_trie_state) );
3164 next_alloc = trie->uniquecharcount + 1;
3167 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3169 regnode *noper = NEXTOPER( cur );
3171 U32 state = 1; /* required init */
3173 U16 charid = 0; /* sanity init */
3174 U32 accept_state = 0; /* sanity init */
3176 U32 wordlen = 0; /* required init */
3178 if (OP(noper) == NOTHING) {
3179 regnode *noper_next= regnext(noper);
3180 if (noper_next < tail)
3185 && ( OP(noper) == flags
3186 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3187 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
3188 || OP(noper) == EXACTFU_SS))) )
3190 const U8 *uc= (U8*)STRING(noper);
3191 const U8 *e= uc + STR_LEN(noper);
3193 for ( ; uc < e ; uc += len ) {
3198 charid = trie->charmap[ uvc ];
3200 SV* const * const svpp = hv_fetch( widecharmap,
3204 charid = svpp ? (U16)SvIV(*svpp) : 0;
3208 if ( !trie->trans[ state + charid ].next ) {
3209 trie->trans[ state + charid ].next = next_alloc;
3210 trie->trans[ state ].check++;
3211 prev_states[TRIE_NODENUM(next_alloc)]
3212 = TRIE_NODENUM(state);
3213 next_alloc += trie->uniquecharcount;
3215 state = trie->trans[ state + charid ].next;
3217 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3219 /* charid is now 0 if we dont know the char read, or
3220 * nonzero if we do */
3223 accept_state = TRIE_NODENUM( state );
3224 TRIE_HANDLE_WORD(accept_state);
3226 } /* end second pass */
3228 /* and now dump it out before we compress it */
3229 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3231 next_alloc, depth+1));
3235 * Inplace compress the table.*
3237 For sparse data sets the table constructed by the trie algorithm will
3238 be mostly 0/FAIL transitions or to put it another way mostly empty.
3239 (Note that leaf nodes will not contain any transitions.)
3241 This algorithm compresses the tables by eliminating most such
3242 transitions, at the cost of a modest bit of extra work during lookup:
3244 - Each states[] entry contains a .base field which indicates the
3245 index in the state[] array wheres its transition data is stored.
3247 - If .base is 0 there are no valid transitions from that node.
3249 - If .base is nonzero then charid is added to it to find an entry in
3252 -If trans[states[state].base+charid].check!=state then the
3253 transition is taken to be a 0/Fail transition. Thus if there are fail
3254 transitions at the front of the node then the .base offset will point
3255 somewhere inside the previous nodes data (or maybe even into a node
3256 even earlier), but the .check field determines if the transition is
3260 The following process inplace converts the table to the compressed
3261 table: We first do not compress the root node 1,and mark all its
3262 .check pointers as 1 and set its .base pointer as 1 as well. This
3263 allows us to do a DFA construction from the compressed table later,
3264 and ensures that any .base pointers we calculate later are greater
3267 - We set 'pos' to indicate the first entry of the second node.
3269 - We then iterate over the columns of the node, finding the first and
3270 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3271 and set the .check pointers accordingly, and advance pos
3272 appropriately and repreat for the next node. Note that when we copy
3273 the next pointers we have to convert them from the original
3274 NODEIDX form to NODENUM form as the former is not valid post
3277 - If a node has no transitions used we mark its base as 0 and do not
3278 advance the pos pointer.
3280 - If a node only has one transition we use a second pointer into the
3281 structure to fill in allocated fail transitions from other states.
3282 This pointer is independent of the main pointer and scans forward
3283 looking for null transitions that are allocated to a state. When it
3284 finds one it writes the single transition into the "hole". If the
3285 pointer doesnt find one the single transition is appended as normal.
3287 - Once compressed we can Renew/realloc the structures to release the
3290 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3291 specifically Fig 3.47 and the associated pseudocode.
3295 const U32 laststate = TRIE_NODENUM( next_alloc );
3298 trie->statecount = laststate;
3300 for ( state = 1 ; state < laststate ; state++ ) {
3302 const U32 stateidx = TRIE_NODEIDX( state );
3303 const U32 o_used = trie->trans[ stateidx ].check;
3304 U32 used = trie->trans[ stateidx ].check;
3305 trie->trans[ stateidx ].check = 0;
3308 used && charid < trie->uniquecharcount;
3311 if ( flag || trie->trans[ stateidx + charid ].next ) {
3312 if ( trie->trans[ stateidx + charid ].next ) {
3314 for ( ; zp < pos ; zp++ ) {
3315 if ( ! trie->trans[ zp ].next ) {
3319 trie->states[ state ].trans.base
3321 + trie->uniquecharcount
3323 trie->trans[ zp ].next
3324 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3326 trie->trans[ zp ].check = state;
3327 if ( ++zp > pos ) pos = zp;
3334 trie->states[ state ].trans.base
3335 = pos + trie->uniquecharcount - charid ;
3337 trie->trans[ pos ].next
3338 = SAFE_TRIE_NODENUM(
3339 trie->trans[ stateidx + charid ].next );
3340 trie->trans[ pos ].check = state;
3345 trie->lasttrans = pos + 1;
3346 trie->states = (reg_trie_state *)
3347 PerlMemShared_realloc( trie->states, laststate
3348 * sizeof(reg_trie_state) );
3349 DEBUG_TRIE_COMPILE_MORE_r(
3350 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3352 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3356 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3359 } /* end table compress */
3361 DEBUG_TRIE_COMPILE_MORE_r(
3362 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3364 (UV)trie->statecount,
3365 (UV)trie->lasttrans)
3367 /* resize the trans array to remove unused space */
3368 trie->trans = (reg_trie_trans *)
3369 PerlMemShared_realloc( trie->trans, trie->lasttrans
3370 * sizeof(reg_trie_trans) );
3372 { /* Modify the program and insert the new TRIE node */
3373 U8 nodetype =(U8)(flags & 0xFF);
3377 regnode *optimize = NULL;
3378 #ifdef RE_TRACK_PATTERN_OFFSETS
3381 U32 mjd_nodelen = 0;
3382 #endif /* RE_TRACK_PATTERN_OFFSETS */
3383 #endif /* DEBUGGING */
3385 This means we convert either the first branch or the first Exact,
3386 depending on whether the thing following (in 'last') is a branch
3387 or not and whther first is the startbranch (ie is it a sub part of
3388 the alternation or is it the whole thing.)
3389 Assuming its a sub part we convert the EXACT otherwise we convert
3390 the whole branch sequence, including the first.
3392 /* Find the node we are going to overwrite */
3393 if ( first != startbranch || OP( last ) == BRANCH ) {
3394 /* branch sub-chain */
3395 NEXT_OFF( first ) = (U16)(last - first);
3396 #ifdef RE_TRACK_PATTERN_OFFSETS
3398 mjd_offset= Node_Offset((convert));
3399 mjd_nodelen= Node_Length((convert));
3402 /* whole branch chain */
3404 #ifdef RE_TRACK_PATTERN_OFFSETS
3407 const regnode *nop = NEXTOPER( convert );
3408 mjd_offset= Node_Offset((nop));
3409 mjd_nodelen= Node_Length((nop));
3413 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3415 (UV)mjd_offset, (UV)mjd_nodelen)
3418 /* But first we check to see if there is a common prefix we can
3419 split out as an EXACT and put in front of the TRIE node. */
3420 trie->startstate= 1;
3421 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3422 /* we want to find the first state that has more than
3423 * one transition, if that state is not the first state
3424 * then we have a common prefix which we can remove.
3427 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3429 I32 first_ofs = -1; /* keeps track of the ofs of the first
3430 transition, -1 means none */
3432 const U32 base = trie->states[ state ].trans.base;
3434 /* does this state terminate an alternation? */
3435 if ( trie->states[state].wordnum )
3438 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3439 if ( ( base + ofs >= trie->uniquecharcount ) &&
3440 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3441 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3443 if ( ++count > 1 ) {
3444 /* we have more than one transition */
3447 /* if this is the first state there is no common prefix
3448 * to extract, so we can exit */
3449 if ( state == 1 ) break;
3450 tmp = av_fetch( revcharmap, ofs, 0);
3451 ch = (U8*)SvPV_nolen_const( *tmp );
3453 /* if we are on count 2 then we need to initialize the
3454 * bitmap, and store the previous char if there was one
3457 /* clear the bitmap */
3458 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3460 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3463 if (first_ofs >= 0) {
3464 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3465 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3467 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3469 Perl_re_printf( aTHX_ "%s", (char*)ch)
3473 /* store the current firstchar in the bitmap */
3474 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3475 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3481 /* This state has only one transition, its transition is part
3482 * of a common prefix - we need to concatenate the char it
3483 * represents to what we have so far. */
3484 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3486 char *ch = SvPV( *tmp, len );
3488 SV *sv=sv_newmortal();
3489 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3491 (UV)state, (UV)first_ofs,
3492 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3493 PL_colors[0], PL_colors[1],
3494 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3495 PERL_PV_ESCAPE_FIRSTCHAR
3500 OP( convert ) = nodetype;
3501 str=STRING(convert);
3504 STR_LEN(convert) += len;
3510 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3515 trie->prefixlen = (state-1);
3517 regnode *n = convert+NODE_SZ_STR(convert);
3518 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3519 trie->startstate = state;
3520 trie->minlen -= (state - 1);
3521 trie->maxlen -= (state - 1);
3523 /* At least the UNICOS C compiler choked on this
3524 * being argument to DEBUG_r(), so let's just have
3527 #ifdef PERL_EXT_RE_BUILD
3533 regnode *fix = convert;
3534 U32 word = trie->wordcount;
3535 #ifdef RE_TRACK_PATTERN_OFFSETS
3538 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3539 while( ++fix < n ) {
3540 Set_Node_Offset_Length(fix, 0, 0);
3543 SV ** const tmp = av_fetch( trie_words, word, 0 );
3545 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3546 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3548 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3556 NEXT_OFF(convert) = (U16)(tail - convert);
3557 DEBUG_r(optimize= n);
3563 if ( trie->maxlen ) {
3564 NEXT_OFF( convert ) = (U16)(tail - convert);
3565 ARG_SET( convert, data_slot );
3566 /* Store the offset to the first unabsorbed branch in
3567 jump[0], which is otherwise unused by the jump logic.
3568 We use this when dumping a trie and during optimisation. */
3570 trie->jump[0] = (U16)(nextbranch - convert);
3572 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3573 * and there is a bitmap
3574 * and the first "jump target" node we found leaves enough room
3575 * then convert the TRIE node into a TRIEC node, with the bitmap
3576 * embedded inline in the opcode - this is hypothetically faster.
3578 if ( !trie->states[trie->startstate].wordnum
3580 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3582 OP( convert ) = TRIEC;
3583 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3584 PerlMemShared_free(trie->bitmap);
3587 OP( convert ) = TRIE;
3589 /* store the type in the flags */
3590 convert->flags = nodetype;
3594 + regarglen[ OP( convert ) ];
3596 /* XXX We really should free up the resource in trie now,
3597 as we won't use them - (which resources?) dmq */
3599 /* needed for dumping*/
3600 DEBUG_r(if (optimize) {
3601 regnode *opt = convert;
3603 while ( ++opt < optimize) {
3604 Set_Node_Offset_Length(opt, 0, 0);
3607 Try to clean up some of the debris left after the
3610 while( optimize < jumper ) {
3611 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3612 OP( optimize ) = OPTIMIZED;
3613 Set_Node_Offset_Length(optimize, 0, 0);
3616 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3618 } /* end node insert */
3620 /* Finish populating the prev field of the wordinfo array. Walk back
3621 * from each accept state until we find another accept state, and if
3622 * so, point the first word's .prev field at the second word. If the
3623 * second already has a .prev field set, stop now. This will be the
3624 * case either if we've already processed that word's accept state,
3625 * or that state had multiple words, and the overspill words were
3626 * already linked up earlier.
3633 for (word=1; word <= trie->wordcount; word++) {
3635 if (trie->wordinfo[word].prev)
3637 state = trie->wordinfo[word].accept;
3639 state = prev_states[state];
3642 prev = trie->states[state].wordnum;
3646 trie->wordinfo[word].prev = prev;
3648 Safefree(prev_states);
3652 /* and now dump out the compressed format */
3653 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3655 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3657 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3658 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3660 SvREFCNT_dec_NN(revcharmap);
3664 : trie->startstate>1
3670 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3672 /* The Trie is constructed and compressed now so we can build a fail array if
3675 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3677 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3681 We find the fail state for each state in the trie, this state is the longest
3682 proper suffix of the current state's 'word' that is also a proper prefix of
3683 another word in our trie. State 1 represents the word '' and is thus the
3684 default fail state. This allows the DFA not to have to restart after its
3685 tried and failed a word at a given point, it simply continues as though it
3686 had been matching the other word in the first place.
3688 'abcdgu'=~/abcdefg|cdgu/
3689 When we get to 'd' we are still matching the first word, we would encounter
3690 'g' which would fail, which would bring us to the state representing 'd' in
3691 the second word where we would try 'g' and succeed, proceeding to match
3694 /* add a fail transition */
3695 const U32 trie_offset = ARG(source);
3696 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3698 const U32 ucharcount = trie->uniquecharcount;
3699 const U32 numstates = trie->statecount;
3700 const U32 ubound = trie->lasttrans + ucharcount;
3704 U32 base = trie->states[ 1 ].trans.base;
3707 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3709 GET_RE_DEBUG_FLAGS_DECL;
3711 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3712 PERL_UNUSED_CONTEXT;
3714 PERL_UNUSED_ARG(depth);
3717 if ( OP(source) == TRIE ) {
3718 struct regnode_1 *op = (struct regnode_1 *)
3719 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3720 StructCopy(source, op, struct regnode_1);
3721 stclass = (regnode *)op;
3723 struct regnode_charclass *op = (struct regnode_charclass *)
3724 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3725 StructCopy(source, op, struct regnode_charclass);
3726 stclass = (regnode *)op;
3728 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3730 ARG_SET( stclass, data_slot );
3731 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3732 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3733 aho->trie=trie_offset;
3734 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3735 Copy( trie->states, aho->states, numstates, reg_trie_state );
3736 Newx( q, numstates, U32);
3737 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3740 /* initialize fail[0..1] to be 1 so that we always have
3741 a valid final fail state */
3742 fail[ 0 ] = fail[ 1 ] = 1;
3744 for ( charid = 0; charid < ucharcount ; charid++ ) {
3745 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3747 q[ q_write ] = newstate;
3748 /* set to point at the root */
3749 fail[ q[ q_write++ ] ]=1;
3752 while ( q_read < q_write) {
3753 const U32 cur = q[ q_read++ % numstates ];
3754 base = trie->states[ cur ].trans.base;
3756 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3757 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3759 U32 fail_state = cur;
3762 fail_state = fail[ fail_state ];
3763 fail_base = aho->states[ fail_state ].trans.base;
3764 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3766 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3767 fail[ ch_state ] = fail_state;
3768 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3770 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3772 q[ q_write++ % numstates] = ch_state;
3776 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3777 when we fail in state 1, this allows us to use the
3778 charclass scan to find a valid start char. This is based on the principle
3779 that theres a good chance the string being searched contains lots of stuff
3780 that cant be a start char.
3782 fail[ 0 ] = fail[ 1 ] = 0;
3783 DEBUG_TRIE_COMPILE_r({
3784 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3785 depth, (UV)numstates
3787 for( q_read=1; q_read<numstates; q_read++ ) {
3788 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3790 Perl_re_printf( aTHX_ "\n");
3793 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3798 /* The below joins as many adjacent EXACTish nodes as possible into a single
3799 * one. The regop may be changed if the node(s) contain certain sequences that
3800 * require special handling. The joining is only done if:
3801 * 1) there is room in the current conglomerated node to entirely contain the
3803 * 2) they are the exact same node type
3805 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3806 * these get optimized out
3808 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3809 * as possible, even if that means splitting an existing node so that its first
3810 * part is moved to the preceeding node. This would maximise the efficiency of
3811 * memEQ during matching.
3813 * If a node is to match under /i (folded), the number of characters it matches
3814 * can be different than its character length if it contains a multi-character
3815 * fold. *min_subtract is set to the total delta number of characters of the
3818 * And *unfolded_multi_char is set to indicate whether or not the node contains
3819 * an unfolded multi-char fold. This happens when it won't be known until
3820 * runtime whether the fold is valid or not; namely
3821 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3822 * target string being matched against turns out to be UTF-8 is that fold
3824 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3826 * (Multi-char folds whose components are all above the Latin1 range are not
3827 * run-time locale dependent, and have already been folded by the time this
3828 * function is called.)
3830 * This is as good a place as any to discuss the design of handling these
3831 * multi-character fold sequences. It's been wrong in Perl for a very long
3832 * time. There are three code points in Unicode whose multi-character folds
3833 * were long ago discovered to mess things up. The previous designs for
3834 * dealing with these involved assigning a special node for them. This
3835 * approach doesn't always work, as evidenced by this example:
3836 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3837 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3838 * would match just the \xDF, it won't be able to handle the case where a
3839 * successful match would have to cross the node's boundary. The new approach
3840 * that hopefully generally solves the problem generates an EXACTFU_SS node
3841 * that is "sss" in this case.
3843 * It turns out that there are problems with all multi-character folds, and not
3844 * just these three. Now the code is general, for all such cases. The
3845 * approach taken is:
3846 * 1) This routine examines each EXACTFish node that could contain multi-
3847 * character folded sequences. Since a single character can fold into
3848 * such a sequence, the minimum match length for this node is less than
3849 * the number of characters in the node. This routine returns in
3850 * *min_subtract how many characters to subtract from the the actual
3851 * length of the string to get a real minimum match length; it is 0 if
3852 * there are no multi-char foldeds. This delta is used by the caller to
3853 * adjust the min length of the match, and the delta between min and max,
3854 * so that the optimizer doesn't reject these possibilities based on size
3856 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3857 * is used for an EXACTFU node that contains at least one "ss" sequence in
3858 * it. For non-UTF-8 patterns and strings, this is the only case where
3859 * there is a possible fold length change. That means that a regular
3860 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3861 * with length changes, and so can be processed faster. regexec.c takes
3862 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3863 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3864 * known until runtime). This saves effort in regex matching. However,
3865 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3866 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3867 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3868 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3869 * possibilities for the non-UTF8 patterns are quite simple, except for
3870 * the sharp s. All the ones that don't involve a UTF-8 target string are
3871 * members of a fold-pair, and arrays are set up for all of them so that
3872 * the other member of the pair can be found quickly. Code elsewhere in
3873 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3874 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3875 * described in the next item.
3876 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3877 * validity of the fold won't be known until runtime, and so must remain
3878 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
3879 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3880 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3881 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3882 * The reason this is a problem is that the optimizer part of regexec.c
3883 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3884 * that a character in the pattern corresponds to at most a single
3885 * character in the target string. (And I do mean character, and not byte
3886 * here, unlike other parts of the documentation that have never been
3887 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3888 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3889 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
3890 * EXACTFL nodes, violate the assumption, and they are the only instances
3891 * where it is violated. I'm reluctant to try to change the assumption,
3892 * as the code involved is impenetrable to me (khw), so instead the code
3893 * here punts. This routine examines EXACTFL nodes, and (when the pattern
3894 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3895 * boolean indicating whether or not the node contains such a fold. When
3896 * it is true, the caller sets a flag that later causes the optimizer in
3897 * this file to not set values for the floating and fixed string lengths,
3898 * and thus avoids the optimizer code in regexec.c that makes the invalid
3899 * assumption. Thus, there is no optimization based on string lengths for
3900 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3901 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
3902 * assumption is wrong only in these cases is that all other non-UTF-8
3903 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3904 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3905 * EXACTF nodes because we don't know at compile time if it actually
3906 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3907 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3908 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
3909 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3910 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3911 * string would require the pattern to be forced into UTF-8, the overhead
3912 * of which we want to avoid. Similarly the unfolded multi-char folds in
3913 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3916 * Similarly, the code that generates tries doesn't currently handle
3917 * not-already-folded multi-char folds, and it looks like a pain to change
3918 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
3919 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
3920 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
3921 * using /iaa matching will be doing so almost entirely with ASCII
3922 * strings, so this should rarely be encountered in practice */
3924 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3925 if (PL_regkind[OP(scan)] == EXACT) \
3926 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
3929 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3930 UV *min_subtract, bool *unfolded_multi_char,
3931 U32 flags, regnode *val, U32 depth)
3933 /* Merge several consecutive EXACTish nodes into one. */
3934 regnode *n = regnext(scan);
3936 regnode *next = scan + NODE_SZ_STR(scan);
3940 regnode *stop = scan;
3941 GET_RE_DEBUG_FLAGS_DECL;
3943 PERL_UNUSED_ARG(depth);
3946 PERL_ARGS_ASSERT_JOIN_EXACT;
3947 #ifndef EXPERIMENTAL_INPLACESCAN
3948 PERL_UNUSED_ARG(flags);
3949 PERL_UNUSED_ARG(val);
3951 DEBUG_PEEP("join", scan, depth, 0);
3953 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3954 * EXACT ones that are mergeable to the current one. */
3956 && (PL_regkind[OP(n)] == NOTHING
3957 || (stringok && OP(n) == OP(scan)))
3959 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3962 if (OP(n) == TAIL || n > next)
3964 if (PL_regkind[OP(n)] == NOTHING) {
3965 DEBUG_PEEP("skip:", n, depth, 0);
3966 NEXT_OFF(scan) += NEXT_OFF(n);
3967 next = n + NODE_STEP_REGNODE;
3974 else if (stringok) {
3975 const unsigned int oldl = STR_LEN(scan);
3976 regnode * const nnext = regnext(n);
3978 /* XXX I (khw) kind of doubt that this works on platforms (should
3979 * Perl ever run on one) where U8_MAX is above 255 because of lots
3980 * of other assumptions */
3981 /* Don't join if the sum can't fit into a single node */
3982 if (oldl + STR_LEN(n) > U8_MAX)
3985 DEBUG_PEEP("merg", n, depth, 0);
3988 NEXT_OFF(scan) += NEXT_OFF(n);
3989 STR_LEN(scan) += STR_LEN(n);
3990 next = n + NODE_SZ_STR(n);
3991 /* Now we can overwrite *n : */
3992 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4000 #ifdef EXPERIMENTAL_INPLACESCAN
4001 if (flags && !NEXT_OFF(n)) {
4002 DEBUG_PEEP("atch", val, depth, 0);
4003 if (reg_off_by_arg[OP(n)]) {
4004 ARG_SET(n, val - n);
4007 NEXT_OFF(n) = val - n;
4015 *unfolded_multi_char = FALSE;
4017 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4018 * can now analyze for sequences of problematic code points. (Prior to
4019 * this final joining, sequences could have been split over boundaries, and
4020 * hence missed). The sequences only happen in folding, hence for any
4021 * non-EXACT EXACTish node */
4022 if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
4023 U8* s0 = (U8*) STRING(scan);
4025 U8* s_end = s0 + STR_LEN(scan);
4027 int total_count_delta = 0; /* Total delta number of characters that
4028 multi-char folds expand to */
4030 /* One pass is made over the node's string looking for all the
4031 * possibilities. To avoid some tests in the loop, there are two main
4032 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4037 if (OP(scan) == EXACTFL) {
4040 /* An EXACTFL node would already have been changed to another
4041 * node type unless there is at least one character in it that
4042 * is problematic; likely a character whose fold definition
4043 * won't be known until runtime, and so has yet to be folded.
4044 * For all but the UTF-8 locale, folds are 1-1 in length, but
4045 * to handle the UTF-8 case, we need to create a temporary
4046 * folded copy using UTF-8 locale rules in order to analyze it.
4047 * This is because our macros that look to see if a sequence is
4048 * a multi-char fold assume everything is folded (otherwise the
4049 * tests in those macros would be too complicated and slow).
4050 * Note that here, the non-problematic folds will have already
4051 * been done, so we can just copy such characters. We actually
4052 * don't completely fold the EXACTFL string. We skip the
4053 * unfolded multi-char folds, as that would just create work
4054 * below to figure out the size they already are */
4056 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4059 STRLEN s_len = UTF8SKIP(s);
4060 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4061 Copy(s, d, s_len, U8);
4064 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4065 *unfolded_multi_char = TRUE;
4066 Copy(s, d, s_len, U8);
4069 else if (isASCII(*s)) {
4070 *(d++) = toFOLD(*s);
4074 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4080 /* Point the remainder of the routine to look at our temporary
4084 } /* End of creating folded copy of EXACTFL string */
4086 /* Examine the string for a multi-character fold sequence. UTF-8
4087 * patterns have all characters pre-folded by the time this code is
4089 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4090 length sequence we are looking for is 2 */
4092 int count = 0; /* How many characters in a multi-char fold */
4093 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4094 if (! len) { /* Not a multi-char fold: get next char */
4099 /* Nodes with 'ss' require special handling, except for
4100 * EXACTFAA-ish for which there is no multi-char fold to this */
4101 if (len == 2 && *s == 's' && *(s+1) == 's'
4102 && OP(scan) != EXACTFAA
4103 && OP(scan) != EXACTFAA_NO_TRIE)
4106 if (OP(scan) != EXACTFL) {
4107 OP(scan) = EXACTFU_SS;
4111 else { /* Here is a generic multi-char fold. */
4112 U8* multi_end = s + len;
4114 /* Count how many characters are in it. In the case of
4115 * /aa, no folds which contain ASCII code points are
4116 * allowed, so check for those, and skip if found. */
4117 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4118 count = utf8_length(s, multi_end);
4122 while (s < multi_end) {
4125 goto next_iteration;
4135 /* The delta is how long the sequence is minus 1 (1 is how long
4136 * the character that folds to the sequence is) */
4137 total_count_delta += count - 1;
4141 /* We created a temporary folded copy of the string in EXACTFL
4142 * nodes. Therefore we need to be sure it doesn't go below zero,
4143 * as the real string could be shorter */
4144 if (OP(scan) == EXACTFL) {
4145 int total_chars = utf8_length((U8*) STRING(scan),
4146 (U8*) STRING(scan) + STR_LEN(scan));
4147 if (total_count_delta > total_chars) {
4148 total_count_delta = total_chars;
4152 *min_subtract += total_count_delta;
4155 else if (OP(scan) == EXACTFAA) {
4157 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4158 * fold to the ASCII range (and there are no existing ones in the
4159 * upper latin1 range). But, as outlined in the comments preceding
4160 * this function, we need to flag any occurrences of the sharp s.
4161 * This character forbids trie formation (because of added
4163 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4164 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4165 || UNICODE_DOT_DOT_VERSION > 0)
4167 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4168 OP(scan) = EXACTFAA_NO_TRIE;
4169 *unfolded_multi_char = TRUE;
4177 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4178 * folds that are all Latin1. As explained in the comments
4179 * preceding this function, we look also for the sharp s in EXACTF
4180 * and EXACTFL nodes; it can be in the final position. Otherwise
4181 * we can stop looking 1 byte earlier because have to find at least
4182 * two characters for a multi-fold */
4183 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4188 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4189 if (! len) { /* Not a multi-char fold. */
4190 if (*s == LATIN_SMALL_LETTER_SHARP_S
4191 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4193 *unfolded_multi_char = TRUE;
4200 && isALPHA_FOLD_EQ(*s, 's')
4201 && isALPHA_FOLD_EQ(*(s+1), 's'))
4204 /* EXACTF nodes need to know that the minimum length
4205 * changed so that a sharp s in the string can match this
4206 * ss in the pattern, but they remain EXACTF nodes, as they
4207 * won't match this unless the target string is is UTF-8,
4208 * which we don't know until runtime. EXACTFL nodes can't
4209 * transform into EXACTFU nodes */
4210 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4211 OP(scan) = EXACTFU_SS;
4215 *min_subtract += len - 1;
4223 /* Allow dumping but overwriting the collection of skipped
4224 * ops and/or strings with fake optimized ops */
4225 n = scan + NODE_SZ_STR(scan);
4233 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4237 /* REx optimizer. Converts nodes into quicker variants "in place".
4238 Finds fixed substrings. */
4240 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4241 to the position after last scanned or to NULL. */
4243 #define INIT_AND_WITHP \
4244 assert(!and_withp); \
4245 Newx(and_withp, 1, regnode_ssc); \
4246 SAVEFREEPV(and_withp)
4250 S_unwind_scan_frames(pTHX_ const void *p)
4252 scan_frame *f= (scan_frame *)p;
4254 scan_frame *n= f->next_frame;
4260 /* the return from this sub is the minimum length that could possibly match */
4262 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4263 SSize_t *minlenp, SSize_t *deltap,
4268 regnode_ssc *and_withp,
4269 U32 flags, U32 depth)
4270 /* scanp: Start here (read-write). */
4271 /* deltap: Write maxlen-minlen here. */
4272 /* last: Stop before this one. */
4273 /* data: string data about the pattern */
4274 /* stopparen: treat close N as END */
4275 /* recursed: which subroutines have we recursed into */
4276 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4278 /* There must be at least this number of characters to match */
4281 regnode *scan = *scanp, *next;
4283 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4284 int is_inf_internal = 0; /* The studied chunk is infinite */
4285 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4286 scan_data_t data_fake;
4287 SV *re_trie_maxbuff = NULL;
4288 regnode *first_non_open = scan;
4289 SSize_t stopmin = SSize_t_MAX;
4290 scan_frame *frame = NULL;
4291 GET_RE_DEBUG_FLAGS_DECL;
4293 PERL_ARGS_ASSERT_STUDY_CHUNK;
4294 RExC_study_started= 1;
4296 Zero(&data_fake, 1, scan_data_t);
4299 while (first_non_open && OP(first_non_open) == OPEN)
4300 first_non_open=regnext(first_non_open);
4306 RExC_study_chunk_recursed_count++;
4308 DEBUG_OPTIMISE_MORE_r(
4310 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4311 depth, (long)stopparen,
4312 (unsigned long)RExC_study_chunk_recursed_count,
4313 (unsigned long)depth, (unsigned long)recursed_depth,
4316 if (recursed_depth) {
4319 for ( j = 0 ; j < recursed_depth ; j++ ) {
4320 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4322 PAREN_TEST(RExC_study_chunk_recursed +
4323 ( j * RExC_study_chunk_recursed_bytes), i )
4326 !PAREN_TEST(RExC_study_chunk_recursed +
4327 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4330 Perl_re_printf( aTHX_ " %d",(int)i);
4334 if ( j + 1 < recursed_depth ) {
4335 Perl_re_printf( aTHX_ ",");
4339 Perl_re_printf( aTHX_ "\n");
4342 while ( scan && OP(scan) != END && scan < last ){
4343 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4344 node length to get a real minimum (because
4345 the folded version may be shorter) */
4346 bool unfolded_multi_char = FALSE;
4347 /* Peephole optimizer: */
4348 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4349 DEBUG_PEEP("Peep", scan, depth, flags);
4352 /* The reason we do this here is that we need to deal with things like
4353 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4354 * parsing code, as each (?:..) is handled by a different invocation of
4357 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4359 /* Follow the next-chain of the current node and optimize
4360 away all the NOTHINGs from it. */
4361 if (OP(scan) != CURLYX) {
4362 const int max = (reg_off_by_arg[OP(scan)]
4364 /* I32 may be smaller than U16 on CRAYs! */
4365 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4366 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4370 /* Skip NOTHING and LONGJMP. */
4371 while ((n = regnext(n))
4372 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4373 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4374 && off + noff < max)
4376 if (reg_off_by_arg[OP(scan)])
4379 NEXT_OFF(scan) = off;
4382 /* The principal pseudo-switch. Cannot be a switch, since we
4383 look into several different things. */
4384 if ( OP(scan) == DEFINEP ) {
4386 SSize_t deltanext = 0;
4387 SSize_t fake_last_close = 0;
4388 I32 f = SCF_IN_DEFINE;
4390 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4391 scan = regnext(scan);
4392 assert( OP(scan) == IFTHEN );
4393 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4395 data_fake.last_closep= &fake_last_close;
4397 next = regnext(scan);
4398 scan = NEXTOPER(NEXTOPER(scan));
4399 DEBUG_PEEP("scan", scan, depth, flags);
4400 DEBUG_PEEP("next", next, depth, flags);
4402 /* we suppose the run is continuous, last=next...
4403 * NOTE we dont use the return here! */
4404 /* DEFINEP study_chunk() recursion */
4405 (void)study_chunk(pRExC_state, &scan, &minlen,
4406 &deltanext, next, &data_fake, stopparen,
4407 recursed_depth, NULL, f, depth+1);
4412 OP(scan) == BRANCH ||
4413 OP(scan) == BRANCHJ ||
4416 next = regnext(scan);
4419 /* The op(next)==code check below is to see if we
4420 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4421 * IFTHEN is special as it might not appear in pairs.
4422 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4423 * we dont handle it cleanly. */
4424 if (OP(next) == code || code == IFTHEN) {
4425 /* NOTE - There is similar code to this block below for
4426 * handling TRIE nodes on a re-study. If you change stuff here
4427 * check there too. */
4428 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4430 regnode * const startbranch=scan;
4432 if (flags & SCF_DO_SUBSTR) {
4433 /* Cannot merge strings after this. */
4434 scan_commit(pRExC_state, data, minlenp, is_inf);
4437 if (flags & SCF_DO_STCLASS)
4438 ssc_init_zero(pRExC_state, &accum);
4440 while (OP(scan) == code) {
4441 SSize_t deltanext, minnext, fake;
4443 regnode_ssc this_class;
4445 DEBUG_PEEP("Branch", scan, depth, flags);
4448 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4450 data_fake.whilem_c = data->whilem_c;
4451 data_fake.last_closep = data->last_closep;
4454 data_fake.last_closep = &fake;
4456 data_fake.pos_delta = delta;
4457 next = regnext(scan);
4459 scan = NEXTOPER(scan); /* everything */
4460 if (code != BRANCH) /* everything but BRANCH */
4461 scan = NEXTOPER(scan);
4463 if (flags & SCF_DO_STCLASS) {
4464 ssc_init(pRExC_state, &this_class);
4465 data_fake.start_class = &this_class;
4466 f = SCF_DO_STCLASS_AND;
4468 if (flags & SCF_WHILEM_VISITED_POS)
4469 f |= SCF_WHILEM_VISITED_POS;
4471 /* we suppose the run is continuous, last=next...*/
4472 /* recurse study_chunk() for each BRANCH in an alternation */
4473 minnext = study_chunk(pRExC_state, &scan, minlenp,
4474 &deltanext, next, &data_fake, stopparen,
4475 recursed_depth, NULL, f, depth+1);
4479 if (deltanext == SSize_t_MAX) {
4480 is_inf = is_inf_internal = 1;
4482 } else if (max1 < minnext + deltanext)
4483 max1 = minnext + deltanext;
4485 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4487 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4488 if ( stopmin > minnext)
4489 stopmin = min + min1;
4490 flags &= ~SCF_DO_SUBSTR;
4492 data->flags |= SCF_SEEN_ACCEPT;
4495 if (data_fake.flags & SF_HAS_EVAL)
4496 data->flags |= SF_HAS_EVAL;
4497 data->whilem_c = data_fake.whilem_c;
4499 if (flags & SCF_DO_STCLASS)
4500 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4502 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4504 if (flags & SCF_DO_SUBSTR) {
4505 data->pos_min += min1;
4506 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4507 data->pos_delta = SSize_t_MAX;
4509 data->pos_delta += max1 - min1;
4510 if (max1 != min1 || is_inf)
4511 data->cur_is_floating = 1;
4514 if (delta == SSize_t_MAX
4515 || SSize_t_MAX - delta - (max1 - min1) < 0)
4516 delta = SSize_t_MAX;
4518 delta += max1 - min1;
4519 if (flags & SCF_DO_STCLASS_OR) {
4520 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4522 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4523 flags &= ~SCF_DO_STCLASS;
4526 else if (flags & SCF_DO_STCLASS_AND) {
4528 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4529 flags &= ~SCF_DO_STCLASS;
4532 /* Switch to OR mode: cache the old value of
4533 * data->start_class */
4535 StructCopy(data->start_class, and_withp, regnode_ssc);
4536 flags &= ~SCF_DO_STCLASS_AND;
4537 StructCopy(&accum, data->start_class, regnode_ssc);
4538 flags |= SCF_DO_STCLASS_OR;
4542 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4543 OP( startbranch ) == BRANCH )
4547 Assuming this was/is a branch we are dealing with: 'scan'
4548 now points at the item that follows the branch sequence,
4549 whatever it is. We now start at the beginning of the
4550 sequence and look for subsequences of
4556 which would be constructed from a pattern like
4559 If we can find such a subsequence we need to turn the first
4560 element into a trie and then add the subsequent branch exact
4561 strings to the trie.
4565 1. patterns where the whole set of branches can be
4568 2. patterns where only a subset can be converted.
4570 In case 1 we can replace the whole set with a single regop
4571 for the trie. In case 2 we need to keep the start and end
4574 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4575 becomes BRANCH TRIE; BRANCH X;
4577 There is an additional case, that being where there is a
4578 common prefix, which gets split out into an EXACT like node
4579 preceding the TRIE node.
4581 If x(1..n)==tail then we can do a simple trie, if not we make
4582 a "jump" trie, such that when we match the appropriate word
4583 we "jump" to the appropriate tail node. Essentially we turn
4584 a nested if into a case structure of sorts.
4589 if (!re_trie_maxbuff) {
4590 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4591 if (!SvIOK(re_trie_maxbuff))
4592 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4594 if ( SvIV(re_trie_maxbuff)>=0 ) {
4596 regnode *first = (regnode *)NULL;
4597 regnode *last = (regnode *)NULL;
4598 regnode *tail = scan;
4602 /* var tail is used because there may be a TAIL
4603 regop in the way. Ie, the exacts will point to the
4604 thing following the TAIL, but the last branch will
4605 point at the TAIL. So we advance tail. If we
4606 have nested (?:) we may have to move through several
4610 while ( OP( tail ) == TAIL ) {
4611 /* this is the TAIL generated by (?:) */
4612 tail = regnext( tail );
4616 DEBUG_TRIE_COMPILE_r({
4617 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4618 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4620 "Looking for TRIE'able sequences. Tail node is ",
4621 (UV) REGNODE_OFFSET(tail),
4622 SvPV_nolen_const( RExC_mysv )
4628 Step through the branches
4629 cur represents each branch,
4630 noper is the first thing to be matched as part
4632 noper_next is the regnext() of that node.
4634 We normally handle a case like this
4635 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4636 support building with NOJUMPTRIE, which restricts
4637 the trie logic to structures like /FOO|BAR/.
4639 If noper is a trieable nodetype then the branch is
4640 a possible optimization target. If we are building
4641 under NOJUMPTRIE then we require that noper_next is
4642 the same as scan (our current position in the regex
4645 Once we have two or more consecutive such branches
4646 we can create a trie of the EXACT's contents and
4647 stitch it in place into the program.
4649 If the sequence represents all of the branches in
4650 the alternation we replace the entire thing with a
4653 Otherwise when it is a subsequence we need to
4654 stitch it in place and replace only the relevant
4655 branches. This means the first branch has to remain
4656 as it is used by the alternation logic, and its
4657 next pointer, and needs to be repointed at the item
4658 on the branch chain following the last branch we
4659 have optimized away.
4661 This could be either a BRANCH, in which case the
4662 subsequence is internal, or it could be the item
4663 following the branch sequence in which case the
4664 subsequence is at the end (which does not
4665 necessarily mean the first node is the start of the
4668 TRIE_TYPE(X) is a define which maps the optype to a
4672 ----------------+-----------
4677 EXACTFU_ONLY8 | EXACTFU
4678 EXACTFU_SS | EXACTFU
4681 EXACTFLU8 | EXACTFLU8
4685 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4687 : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
4689 : ( EXACTFU == (X) \
4690 || EXACTFU_ONLY8 == (X) \
4691 || EXACTFU_SS == (X) ) \
4693 : ( EXACTFAA == (X) ) \
4695 : ( EXACTL == (X) ) \
4697 : ( EXACTFLU8 == (X) ) \
4701 /* dont use tail as the end marker for this traverse */
4702 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4703 regnode * const noper = NEXTOPER( cur );
4704 U8 noper_type = OP( noper );
4705 U8 noper_trietype = TRIE_TYPE( noper_type );
4706 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4707 regnode * const noper_next = regnext( noper );
4708 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4709 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4712 DEBUG_TRIE_COMPILE_r({
4713 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4714 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4716 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4718 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4719 Perl_re_printf( aTHX_ " -> %d:%s",
4720 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4723 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4724 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4725 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4727 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4728 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4729 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4733 /* Is noper a trieable nodetype that can be merged
4734 * with the current trie (if there is one)? */
4738 ( noper_trietype == NOTHING )
4739 || ( trietype == NOTHING )
4740 || ( trietype == noper_trietype )
4743 && noper_next >= tail
4747 /* Handle mergable triable node Either we are
4748 * the first node in a new trieable sequence,
4749 * in which case we do some bookkeeping,
4750 * otherwise we update the end pointer. */
4753 if ( noper_trietype == NOTHING ) {
4754 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4755 regnode * const noper_next = regnext( noper );
4756 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4757 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4760 if ( noper_next_trietype ) {
4761 trietype = noper_next_trietype;
4762 } else if (noper_next_type) {
4763 /* a NOTHING regop is 1 regop wide.
4764 * We need at least two for a trie
4765 * so we can't merge this in */
4769 trietype = noper_trietype;
4772 if ( trietype == NOTHING )
4773 trietype = noper_trietype;
4778 } /* end handle mergable triable node */
4780 /* handle unmergable node -
4781 * noper may either be a triable node which can
4782 * not be tried together with the current trie,
4783 * or a non triable node */
4785 /* If last is set and trietype is not
4786 * NOTHING then we have found at least two
4787 * triable branch sequences in a row of a
4788 * similar trietype so we can turn them
4789 * into a trie. If/when we allow NOTHING to
4790 * start a trie sequence this condition
4791 * will be required, and it isn't expensive
4792 * so we leave it in for now. */
4793 if ( trietype && trietype != NOTHING )
4794 make_trie( pRExC_state,
4795 startbranch, first, cur, tail,
4796 count, trietype, depth+1 );
4797 last = NULL; /* note: we clear/update
4798 first, trietype etc below,
4799 so we dont do it here */
4803 && noper_next >= tail
4806 /* noper is triable, so we can start a new
4810 trietype = noper_trietype;
4812 /* if we already saw a first but the
4813 * current node is not triable then we have
4814 * to reset the first information. */
4819 } /* end handle unmergable node */
4820 } /* loop over branches */
4821 DEBUG_TRIE_COMPILE_r({
4822 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4823 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
4824 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
4825 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
4826 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4827 PL_reg_name[trietype]
4831 if ( last && trietype ) {
4832 if ( trietype != NOTHING ) {
4833 /* the last branch of the sequence was part of
4834 * a trie, so we have to construct it here
4835 * outside of the loop */
4836 made= make_trie( pRExC_state, startbranch,
4837 first, scan, tail, count,
4838 trietype, depth+1 );
4839 #ifdef TRIE_STUDY_OPT
4840 if ( ((made == MADE_EXACT_TRIE &&
4841 startbranch == first)
4842 || ( first_non_open == first )) &&
4844 flags |= SCF_TRIE_RESTUDY;
4845 if ( startbranch == first
4848 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4853 /* at this point we know whatever we have is a
4854 * NOTHING sequence/branch AND if 'startbranch'
4855 * is 'first' then we can turn the whole thing
4858 if ( startbranch == first ) {
4860 /* the entire thing is a NOTHING sequence,
4861 * something like this: (?:|) So we can
4862 * turn it into a plain NOTHING op. */
4863 DEBUG_TRIE_COMPILE_r({
4864 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4865 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
4867 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
4870 OP(startbranch)= NOTHING;
4871 NEXT_OFF(startbranch)= tail - startbranch;
4872 for ( opt= startbranch + 1; opt < tail ; opt++ )
4876 } /* end if ( last) */
4877 } /* TRIE_MAXBUF is non zero */
4882 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4883 scan = NEXTOPER(NEXTOPER(scan));
4884 } else /* single branch is optimized. */
4885 scan = NEXTOPER(scan);
4887 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
4889 regnode *start = NULL;
4890 regnode *end = NULL;
4891 U32 my_recursed_depth= recursed_depth;
4893 if (OP(scan) != SUSPEND) { /* GOSUB */
4894 /* Do setup, note this code has side effects beyond
4895 * the rest of this block. Specifically setting
4896 * RExC_recurse[] must happen at least once during
4899 RExC_recurse[ARG2L(scan)] = scan;
4900 start = REGNODE_p(RExC_open_parens[paren]);
4901 end = REGNODE_p(RExC_close_parens[paren]);
4903 /* NOTE we MUST always execute the above code, even
4904 * if we do nothing with a GOSUB */
4906 ( flags & SCF_IN_DEFINE )
4909 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4911 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4914 /* no need to do anything here if we are in a define. */
4915 /* or we are after some kind of infinite construct
4916 * so we can skip recursing into this item.
4917 * Since it is infinite we will not change the maxlen
4918 * or delta, and if we miss something that might raise
4919 * the minlen it will merely pessimise a little.
4921 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4922 * might result in a minlen of 1 and not of 4,
4923 * but this doesn't make us mismatch, just try a bit
4924 * harder than we should.
4926 scan= regnext(scan);
4933 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4935 /* it is quite possible that there are more efficient ways
4936 * to do this. We maintain a bitmap per level of recursion
4937 * of which patterns we have entered so we can detect if a
4938 * pattern creates a possible infinite loop. When we
4939 * recurse down a level we copy the previous levels bitmap
4940 * down. When we are at recursion level 0 we zero the top
4941 * level bitmap. It would be nice to implement a different
4942 * more efficient way of doing this. In particular the top
4943 * level bitmap may be unnecessary.
4945 if (!recursed_depth) {
4946 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4948 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4949 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4950 RExC_study_chunk_recursed_bytes, U8);
4952 /* we havent recursed into this paren yet, so recurse into it */
4953 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
4954 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4955 my_recursed_depth= recursed_depth + 1;
4957 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
4958 /* some form of infinite recursion, assume infinite length
4960 if (flags & SCF_DO_SUBSTR) {
4961 scan_commit(pRExC_state, data, minlenp, is_inf);
4962 data->cur_is_floating = 1;
4964 is_inf = is_inf_internal = 1;
4965 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4966 ssc_anything(data->start_class);
4967 flags &= ~SCF_DO_STCLASS;
4969 start= NULL; /* reset start so we dont recurse later on. */
4974 end = regnext(scan);
4977 scan_frame *newframe;
4979 if (!RExC_frame_last) {
4980 Newxz(newframe, 1, scan_frame);
4981 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4982 RExC_frame_head= newframe;
4984 } else if (!RExC_frame_last->next_frame) {
4985 Newxz(newframe, 1, scan_frame);
4986 RExC_frame_last->next_frame= newframe;
4987 newframe->prev_frame= RExC_frame_last;
4990 newframe= RExC_frame_last->next_frame;
4992 RExC_frame_last= newframe;
4994 newframe->next_regnode = regnext(scan);
4995 newframe->last_regnode = last;
4996 newframe->stopparen = stopparen;
4997 newframe->prev_recursed_depth = recursed_depth;
4998 newframe->this_prev_frame= frame;
5000 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5001 DEBUG_PEEP("fnew", scan, depth, flags);
5008 recursed_depth= my_recursed_depth;
5013 else if ( OP(scan) == EXACT
5014 || OP(scan) == EXACT_ONLY8
5015 || OP(scan) == EXACTL)
5017 SSize_t l = STR_LEN(scan);
5021 const U8 * const s = (U8*)STRING(scan);
5022 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5023 l = utf8_length(s, s + l);
5025 uc = *((U8*)STRING(scan));
5028 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5029 /* The code below prefers earlier match for fixed
5030 offset, later match for variable offset. */
5031 if (data->last_end == -1) { /* Update the start info. */
5032 data->last_start_min = data->pos_min;
5033 data->last_start_max = is_inf
5034 ? SSize_t_MAX : data->pos_min + data->pos_delta;
5036 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5038 SvUTF8_on(data->last_found);
5040 SV * const sv = data->last_found;
5041 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5042 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5043 if (mg && mg->mg_len >= 0)
5044 mg->mg_len += utf8_length((U8*)STRING(scan),
5045 (U8*)STRING(scan)+STR_LEN(scan));
5047 data->last_end = data->pos_min + l;
5048 data->pos_min += l; /* As in the first entry. */
5049 data->flags &= ~SF_BEFORE_EOL;
5052 /* ANDing the code point leaves at most it, and not in locale, and
5053 * can't match null string */
5054 if (flags & SCF_DO_STCLASS_AND) {
5055 ssc_cp_and(data->start_class, uc);
5056 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5057 ssc_clear_locale(data->start_class);
5059 else if (flags & SCF_DO_STCLASS_OR) {
5060 ssc_add_cp(data->start_class, uc);
5061 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5063 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5064 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5066 flags &= ~SCF_DO_STCLASS;
5068 else if (PL_regkind[OP(scan)] == EXACT) {
5069 /* But OP != EXACT!, so is EXACTFish */
5070 SSize_t l = STR_LEN(scan);
5071 const U8 * s = (U8*)STRING(scan);
5073 /* Search for fixed substrings supports EXACT only. */
5074 if (flags & SCF_DO_SUBSTR) {
5076 scan_commit(pRExC_state, data, minlenp, is_inf);
5079 l = utf8_length(s, s + l);
5081 if (unfolded_multi_char) {
5082 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5084 min += l - min_subtract;
5086 delta += min_subtract;
5087 if (flags & SCF_DO_SUBSTR) {
5088 data->pos_min += l - min_subtract;
5089 if (data->pos_min < 0) {
5092 data->pos_delta += min_subtract;
5094 data->cur_is_floating = 1; /* float */
5098 if (flags & SCF_DO_STCLASS) {
5099 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
5101 assert(EXACTF_invlist);
5102 if (flags & SCF_DO_STCLASS_AND) {
5103 if (OP(scan) != EXACTFL)
5104 ssc_clear_locale(data->start_class);
5105 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5106 ANYOF_POSIXL_ZERO(data->start_class);
5107 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5109 else { /* SCF_DO_STCLASS_OR */
5110 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5111 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5113 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5114 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5116 flags &= ~SCF_DO_STCLASS;
5117 SvREFCNT_dec(EXACTF_invlist);
5120 else if (REGNODE_VARIES(OP(scan))) {
5121 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5122 I32 fl = 0, f = flags;
5123 regnode * const oscan = scan;
5124 regnode_ssc this_class;
5125 regnode_ssc *oclass = NULL;
5126 I32 next_is_eval = 0;
5128 switch (PL_regkind[OP(scan)]) {
5129 case WHILEM: /* End of (?:...)* . */
5130 scan = NEXTOPER(scan);
5133 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5134 next = NEXTOPER(scan);
5135 if ( OP(next) == EXACT
5136 || OP(next) == EXACT_ONLY8
5137 || OP(next) == EXACTL
5138 || (flags & SCF_DO_STCLASS))
5141 maxcount = REG_INFTY;
5142 next = regnext(scan);
5143 scan = NEXTOPER(scan);
5147 if (flags & SCF_DO_SUBSTR)
5152 if (flags & SCF_DO_STCLASS) {
5154 maxcount = REG_INFTY;
5155 next = regnext(scan);
5156 scan = NEXTOPER(scan);
5159 if (flags & SCF_DO_SUBSTR) {
5160 scan_commit(pRExC_state, data, minlenp, is_inf);
5161 /* Cannot extend fixed substrings */
5162 data->cur_is_floating = 1; /* float */
5164 is_inf = is_inf_internal = 1;
5165 scan = regnext(scan);
5166 goto optimize_curly_tail;
5168 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5169 && (scan->flags == stopparen))
5174 mincount = ARG1(scan);
5175 maxcount = ARG2(scan);
5177 next = regnext(scan);
5178 if (OP(scan) == CURLYX) {
5179 I32 lp = (data ? *(data->last_closep) : 0);
5180 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5182 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5183 next_is_eval = (OP(scan) == EVAL);
5185 if (flags & SCF_DO_SUBSTR) {
5187 scan_commit(pRExC_state, data, minlenp, is_inf);
5188 /* Cannot extend fixed substrings */
5189 pos_before = data->pos_min;
5193 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5195 data->flags |= SF_IS_INF;
5197 if (flags & SCF_DO_STCLASS) {
5198 ssc_init(pRExC_state, &this_class);
5199 oclass = data->start_class;
5200 data->start_class = &this_class;
5201 f |= SCF_DO_STCLASS_AND;
5202 f &= ~SCF_DO_STCLASS_OR;
5204 /* Exclude from super-linear cache processing any {n,m}
5205 regops for which the combination of input pos and regex
5206 pos is not enough information to determine if a match
5209 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5210 regex pos at the \s*, the prospects for a match depend not
5211 only on the input position but also on how many (bar\s*)
5212 repeats into the {4,8} we are. */
5213 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5214 f &= ~SCF_WHILEM_VISITED_POS;
5216 /* This will finish on WHILEM, setting scan, or on NULL: */
5217 /* recurse study_chunk() on loop bodies */
5218 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5219 last, data, stopparen, recursed_depth, NULL,
5221 ? (f & ~SCF_DO_SUBSTR)
5225 if (flags & SCF_DO_STCLASS)
5226 data->start_class = oclass;
5227 if (mincount == 0 || minnext == 0) {
5228 if (flags & SCF_DO_STCLASS_OR) {
5229 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5231 else if (flags & SCF_DO_STCLASS_AND) {
5232 /* Switch to OR mode: cache the old value of
5233 * data->start_class */
5235 StructCopy(data->start_class, and_withp, regnode_ssc);
5236 flags &= ~SCF_DO_STCLASS_AND;
5237 StructCopy(&this_class, data->start_class, regnode_ssc);
5238 flags |= SCF_DO_STCLASS_OR;
5239 ANYOF_FLAGS(data->start_class)
5240 |= SSC_MATCHES_EMPTY_STRING;
5242 } else { /* Non-zero len */
5243 if (flags & SCF_DO_STCLASS_OR) {
5244 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5245 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5247 else if (flags & SCF_DO_STCLASS_AND)
5248 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5249 flags &= ~SCF_DO_STCLASS;
5251 if (!scan) /* It was not CURLYX, but CURLY. */
5253 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5254 /* ? quantifier ok, except for (?{ ... }) */
5255 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5256 && (minnext == 0) && (deltanext == 0)
5257 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5258 && maxcount <= REG_INFTY/3) /* Complement check for big
5261 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5262 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5263 "Quantifier unexpected on zero-length expression "
5264 "in regex m/%" UTF8f "/",
5265 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5269 min += minnext * mincount;
5270 is_inf_internal |= deltanext == SSize_t_MAX
5271 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5272 is_inf |= is_inf_internal;
5274 delta = SSize_t_MAX;
5276 delta += (minnext + deltanext) * maxcount
5277 - minnext * mincount;
5279 /* Try powerful optimization CURLYX => CURLYN. */
5280 if ( OP(oscan) == CURLYX && data
5281 && data->flags & SF_IN_PAR
5282 && !(data->flags & SF_HAS_EVAL)
5283 && !deltanext && minnext == 1 ) {
5284 /* Try to optimize to CURLYN. */
5285 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5286 regnode * const nxt1 = nxt;
5293 if (!REGNODE_SIMPLE(OP(nxt))
5294 && !(PL_regkind[OP(nxt)] == EXACT
5295 && STR_LEN(nxt) == 1))
5301 if (OP(nxt) != CLOSE)
5303 if (RExC_open_parens) {
5306 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5309 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5311 /* Now we know that nxt2 is the only contents: */
5312 oscan->flags = (U8)ARG(nxt);
5314 OP(nxt1) = NOTHING; /* was OPEN. */
5317 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5318 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5319 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5320 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5321 OP(nxt + 1) = OPTIMIZED; /* was count. */
5322 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5327 /* Try optimization CURLYX => CURLYM. */
5328 if ( OP(oscan) == CURLYX && data
5329 && !(data->flags & SF_HAS_PAR)
5330 && !(data->flags & SF_HAS_EVAL)
5331 && !deltanext /* atom is fixed width */
5332 && minnext != 0 /* CURLYM can't handle zero width */
5334 /* Nor characters whose fold at run-time may be
5335 * multi-character */
5336 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5338 /* XXXX How to optimize if data == 0? */
5339 /* Optimize to a simpler form. */
5340 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5344 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5345 && (OP(nxt2) != WHILEM))
5347 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5348 /* Need to optimize away parenths. */
5349 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5350 /* Set the parenth number. */
5351 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5353 oscan->flags = (U8)ARG(nxt);
5354 if (RExC_open_parens) {
5356 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5359 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5362 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5363 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5366 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5367 OP(nxt + 1) = OPTIMIZED; /* was count. */
5368 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5369 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5372 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5373 regnode *nnxt = regnext(nxt1);
5375 if (reg_off_by_arg[OP(nxt1)])
5376 ARG_SET(nxt1, nxt2 - nxt1);
5377 else if (nxt2 - nxt1 < U16_MAX)
5378 NEXT_OFF(nxt1) = nxt2 - nxt1;
5380 OP(nxt) = NOTHING; /* Cannot beautify */
5385 /* Optimize again: */
5386 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5387 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5388 NULL, stopparen, recursed_depth, NULL, 0,
5394 else if ((OP(oscan) == CURLYX)
5395 && (flags & SCF_WHILEM_VISITED_POS)
5396 /* See the comment on a similar expression above.
5397 However, this time it's not a subexpression
5398 we care about, but the expression itself. */
5399 && (maxcount == REG_INFTY)
5401 /* This stays as CURLYX, we can put the count/of pair. */
5402 /* Find WHILEM (as in regexec.c) */
5403 regnode *nxt = oscan + NEXT_OFF(oscan);
5405 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5407 nxt = PREVOPER(nxt);
5408 if (nxt->flags & 0xf) {
5409 /* we've already set whilem count on this node */
5410 } else if (++data->whilem_c < 16) {
5411 assert(data->whilem_c <= RExC_whilem_seen);
5412 nxt->flags = (U8)(data->whilem_c
5413 | (RExC_whilem_seen << 4)); /* On WHILEM */
5416 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5418 if (flags & SCF_DO_SUBSTR) {
5419 SV *last_str = NULL;
5420 STRLEN last_chrs = 0;
5421 int counted = mincount != 0;
5423 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5425 SSize_t b = pos_before >= data->last_start_min
5426 ? pos_before : data->last_start_min;
5428 const char * const s = SvPV_const(data->last_found, l);
5429 SSize_t old = b - data->last_start_min;
5432 old = utf8_hop((U8*)s, old) - (U8*)s;
5434 /* Get the added string: */
5435 last_str = newSVpvn_utf8(s + old, l, UTF);
5436 last_chrs = UTF ? utf8_length((U8*)(s + old),
5437 (U8*)(s + old + l)) : l;
5438 if (deltanext == 0 && pos_before == b) {
5439 /* What was added is a constant string */
5442 SvGROW(last_str, (mincount * l) + 1);
5443 repeatcpy(SvPVX(last_str) + l,
5444 SvPVX_const(last_str), l,
5446 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5447 /* Add additional parts. */
5448 SvCUR_set(data->last_found,
5449 SvCUR(data->last_found) - l);
5450 sv_catsv(data->last_found, last_str);
5452 SV * sv = data->last_found;
5454 SvUTF8(sv) && SvMAGICAL(sv) ?
5455 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5456 if (mg && mg->mg_len >= 0)
5457 mg->mg_len += last_chrs * (mincount-1);
5459 last_chrs *= mincount;
5460 data->last_end += l * (mincount - 1);
5463 /* start offset must point into the last copy */
5464 data->last_start_min += minnext * (mincount - 1);
5465 data->last_start_max =
5468 : data->last_start_max +
5469 (maxcount - 1) * (minnext + data->pos_delta);
5472 /* It is counted once already... */
5473 data->pos_min += minnext * (mincount - counted);
5475 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5476 " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5477 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5478 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5480 if (deltanext != SSize_t_MAX)
5481 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5482 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5483 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5485 if (deltanext == SSize_t_MAX
5486 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5487 data->pos_delta = SSize_t_MAX;
5489 data->pos_delta += - counted * deltanext +
5490 (minnext + deltanext) * maxcount - minnext * mincount;
5491 if (mincount != maxcount) {
5492 /* Cannot extend fixed substrings found inside
5494 scan_commit(pRExC_state, data, minlenp, is_inf);
5495 if (mincount && last_str) {
5496 SV * const sv = data->last_found;
5497 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5498 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5502 sv_setsv(sv, last_str);
5503 data->last_end = data->pos_min;
5504 data->last_start_min = data->pos_min - last_chrs;
5505 data->last_start_max = is_inf
5507 : data->pos_min + data->pos_delta - last_chrs;
5509 data->cur_is_floating = 1; /* float */
5511 SvREFCNT_dec(last_str);
5513 if (data && (fl & SF_HAS_EVAL))
5514 data->flags |= SF_HAS_EVAL;
5515 optimize_curly_tail:
5516 if (OP(oscan) != CURLYX) {
5517 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5519 NEXT_OFF(oscan) += NEXT_OFF(next);
5525 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5530 if (flags & SCF_DO_SUBSTR) {
5531 /* Cannot expect anything... */
5532 scan_commit(pRExC_state, data, minlenp, is_inf);
5533 data->cur_is_floating = 1; /* float */
5535 is_inf = is_inf_internal = 1;
5536 if (flags & SCF_DO_STCLASS_OR) {
5537 if (OP(scan) == CLUMP) {
5538 /* Actually is any start char, but very few code points
5539 * aren't start characters */
5540 ssc_match_all_cp(data->start_class);
5543 ssc_anything(data->start_class);
5546 flags &= ~SCF_DO_STCLASS;
5550 else if (OP(scan) == LNBREAK) {
5551 if (flags & SCF_DO_STCLASS) {
5552 if (flags & SCF_DO_STCLASS_AND) {
5553 ssc_intersection(data->start_class,
5554 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5555 ssc_clear_locale(data->start_class);
5556 ANYOF_FLAGS(data->start_class)
5557 &= ~SSC_MATCHES_EMPTY_STRING;
5559 else if (flags & SCF_DO_STCLASS_OR) {
5560 ssc_union(data->start_class,
5561 PL_XPosix_ptrs[_CC_VERTSPACE],
5563 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5565 /* See commit msg for
5566 * 749e076fceedeb708a624933726e7989f2302f6a */
5567 ANYOF_FLAGS(data->start_class)
5568 &= ~SSC_MATCHES_EMPTY_STRING;
5570 flags &= ~SCF_DO_STCLASS;
5573 if (delta != SSize_t_MAX)
5574 delta++; /* Because of the 2 char string cr-lf */
5575 if (flags & SCF_DO_SUBSTR) {
5576 /* Cannot expect anything... */
5577 scan_commit(pRExC_state, data, minlenp, is_inf);
5579 if (data->pos_delta != SSize_t_MAX) {
5580 data->pos_delta += 1;
5582 data->cur_is_floating = 1; /* float */
5585 else if (REGNODE_SIMPLE(OP(scan))) {
5587 if (flags & SCF_DO_SUBSTR) {
5588 scan_commit(pRExC_state, data, minlenp, is_inf);
5592 if (flags & SCF_DO_STCLASS) {
5594 SV* my_invlist = NULL;
5597 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5598 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5600 /* Some of the logic below assumes that switching
5601 locale on will only add false positives. */
5606 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5610 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5611 ssc_match_all_cp(data->start_class);
5616 SV* REG_ANY_invlist = _new_invlist(2);
5617 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5619 if (flags & SCF_DO_STCLASS_OR) {
5620 ssc_union(data->start_class,
5622 TRUE /* TRUE => invert, hence all but \n
5626 else if (flags & SCF_DO_STCLASS_AND) {
5627 ssc_intersection(data->start_class,
5629 TRUE /* TRUE => invert */
5631 ssc_clear_locale(data->start_class);
5633 SvREFCNT_dec_NN(REG_ANY_invlist);
5641 if (flags & SCF_DO_STCLASS_AND)
5642 ssc_and(pRExC_state, data->start_class,
5643 (regnode_charclass *) scan);
5645 ssc_or(pRExC_state, data->start_class,
5646 (regnode_charclass *) scan);
5652 SV* cp_list = get_ANYOFM_contents(scan);
5654 if (flags & SCF_DO_STCLASS_OR) {
5655 ssc_union(data->start_class, cp_list, invert);
5657 else if (flags & SCF_DO_STCLASS_AND) {
5658 ssc_intersection(data->start_class, cp_list, invert);
5661 SvREFCNT_dec_NN(cp_list);
5670 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5671 if (flags & SCF_DO_STCLASS_AND) {
5672 bool was_there = cBOOL(
5673 ANYOF_POSIXL_TEST(data->start_class,
5675 ANYOF_POSIXL_ZERO(data->start_class);
5676 if (was_there) { /* Do an AND */
5677 ANYOF_POSIXL_SET(data->start_class, namedclass);
5679 /* No individual code points can now match */
5680 data->start_class->invlist
5681 = sv_2mortal(_new_invlist(0));
5684 int complement = namedclass + ((invert) ? -1 : 1);
5686 assert(flags & SCF_DO_STCLASS_OR);
5688 /* If the complement of this class was already there,
5689 * the result is that they match all code points,
5690 * (\d + \D == everything). Remove the classes from
5691 * future consideration. Locale is not relevant in
5693 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5694 ssc_match_all_cp(data->start_class);
5695 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5696 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5698 else { /* The usual case; just add this class to the
5700 ANYOF_POSIXL_SET(data->start_class, namedclass);
5709 my_invlist = invlist_clone(PL_Posix_ptrs[_CC_ASCII], NULL);
5711 /* This can be handled as a Posix class */
5712 goto join_posix_and_ascii;
5714 case NPOSIXA: /* For these, we always know the exact set of
5719 assert(FLAGS(scan) != _CC_ASCII);
5720 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5721 goto join_posix_and_ascii;
5729 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5731 /* NPOSIXD matches all upper Latin1 code points unless the
5732 * target string being matched is UTF-8, which is
5733 * unknowable until match time. Since we are going to
5734 * invert, we want to get rid of all of them so that the
5735 * inversion will match all */
5736 if (OP(scan) == NPOSIXD) {
5737 _invlist_subtract(my_invlist, PL_UpperLatin1,
5741 join_posix_and_ascii:
5743 if (flags & SCF_DO_STCLASS_AND) {
5744 ssc_intersection(data->start_class, my_invlist, invert);
5745 ssc_clear_locale(data->start_class);
5748 assert(flags & SCF_DO_STCLASS_OR);
5749 ssc_union(data->start_class, my_invlist, invert);
5751 SvREFCNT_dec(my_invlist);
5753 if (flags & SCF_DO_STCLASS_OR)
5754 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5755 flags &= ~SCF_DO_STCLASS;
5758 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5759 data->flags |= (OP(scan) == MEOL
5762 scan_commit(pRExC_state, data, minlenp, is_inf);
5765 else if ( PL_regkind[OP(scan)] == BRANCHJ
5766 /* Lookbehind, or need to calculate parens/evals/stclass: */
5767 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5768 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5770 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5771 || OP(scan) == UNLESSM )
5773 /* Negative Lookahead/lookbehind
5774 In this case we can't do fixed string optimisation.
5777 SSize_t deltanext, minnext, fake = 0;
5782 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5784 data_fake.whilem_c = data->whilem_c;
5785 data_fake.last_closep = data->last_closep;
5788 data_fake.last_closep = &fake;
5789 data_fake.pos_delta = delta;
5790 if ( flags & SCF_DO_STCLASS && !scan->flags
5791 && OP(scan) == IFMATCH ) { /* Lookahead */
5792 ssc_init(pRExC_state, &intrnl);
5793 data_fake.start_class = &intrnl;
5794 f |= SCF_DO_STCLASS_AND;
5796 if (flags & SCF_WHILEM_VISITED_POS)
5797 f |= SCF_WHILEM_VISITED_POS;
5798 next = regnext(scan);
5799 nscan = NEXTOPER(NEXTOPER(scan));
5801 /* recurse study_chunk() for lookahead body */
5802 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5803 last, &data_fake, stopparen,
5804 recursed_depth, NULL, f, depth+1);
5807 FAIL("Variable length lookbehind not implemented");
5809 else if (minnext > (I32)U8_MAX) {
5810 FAIL2("Lookbehind longer than %" UVuf " not implemented",
5813 scan->flags = (U8)minnext;
5816 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5818 if (data_fake.flags & SF_HAS_EVAL)
5819 data->flags |= SF_HAS_EVAL;
5820 data->whilem_c = data_fake.whilem_c;
5822 if (f & SCF_DO_STCLASS_AND) {
5823 if (flags & SCF_DO_STCLASS_OR) {
5824 /* OR before, AND after: ideally we would recurse with
5825 * data_fake to get the AND applied by study of the
5826 * remainder of the pattern, and then derecurse;
5827 * *** HACK *** for now just treat as "no information".
5828 * See [perl #56690].
5830 ssc_init(pRExC_state, data->start_class);
5832 /* AND before and after: combine and continue. These
5833 * assertions are zero-length, so can match an EMPTY
5835 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5836 ANYOF_FLAGS(data->start_class)
5837 |= SSC_MATCHES_EMPTY_STRING;
5841 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5843 /* Positive Lookahead/lookbehind
5844 In this case we can do fixed string optimisation,
5845 but we must be careful about it. Note in the case of
5846 lookbehind the positions will be offset by the minimum
5847 length of the pattern, something we won't know about
5848 until after the recurse.
5850 SSize_t deltanext, fake = 0;
5854 /* We use SAVEFREEPV so that when the full compile
5855 is finished perl will clean up the allocated
5856 minlens when it's all done. This way we don't
5857 have to worry about freeing them when we know
5858 they wont be used, which would be a pain.
5861 Newx( minnextp, 1, SSize_t );
5862 SAVEFREEPV(minnextp);
5865 StructCopy(data, &data_fake, scan_data_t);
5866 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5869 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5870 data_fake.last_found=newSVsv(data->last_found);
5874 data_fake.last_closep = &fake;
5875 data_fake.flags = 0;
5876 data_fake.substrs[0].flags = 0;
5877 data_fake.substrs[1].flags = 0;
5878 data_fake.pos_delta = delta;
5880 data_fake.flags |= SF_IS_INF;
5881 if ( flags & SCF_DO_STCLASS && !scan->flags
5882 && OP(scan) == IFMATCH ) { /* Lookahead */
5883 ssc_init(pRExC_state, &intrnl);
5884 data_fake.start_class = &intrnl;
5885 f |= SCF_DO_STCLASS_AND;
5887 if (flags & SCF_WHILEM_VISITED_POS)
5888 f |= SCF_WHILEM_VISITED_POS;
5889 next = regnext(scan);
5890 nscan = NEXTOPER(NEXTOPER(scan));
5892 /* positive lookahead study_chunk() recursion */
5893 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5894 &deltanext, last, &data_fake,
5895 stopparen, recursed_depth, NULL,
5899 FAIL("Variable length lookbehind not implemented");
5901 else if (*minnextp > (I32)U8_MAX) {
5902 FAIL2("Lookbehind longer than %" UVuf " not implemented",
5905 scan->flags = (U8)*minnextp;
5910 if (f & SCF_DO_STCLASS_AND) {
5911 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5912 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5915 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5917 if (data_fake.flags & SF_HAS_EVAL)
5918 data->flags |= SF_HAS_EVAL;
5919 data->whilem_c = data_fake.whilem_c;
5920 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5922 if (RExC_rx->minlen<*minnextp)
5923 RExC_rx->minlen=*minnextp;
5924 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5925 SvREFCNT_dec_NN(data_fake.last_found);
5927 for (i = 0; i < 2; i++) {
5928 if (data_fake.substrs[i].minlenp != minlenp) {
5929 data->substrs[i].min_offset =
5930 data_fake.substrs[i].min_offset;
5931 data->substrs[i].max_offset =
5932 data_fake.substrs[i].max_offset;
5933 data->substrs[i].minlenp =
5934 data_fake.substrs[i].minlenp;
5935 data->substrs[i].lookbehind += scan->flags;
5944 else if (OP(scan) == OPEN) {
5945 if (stopparen != (I32)ARG(scan))
5948 else if (OP(scan) == CLOSE) {
5949 if (stopparen == (I32)ARG(scan)) {
5952 if ((I32)ARG(scan) == is_par) {
5953 next = regnext(scan);
5955 if ( next && (OP(next) != WHILEM) && next < last)
5956 is_par = 0; /* Disable optimization */
5959 *(data->last_closep) = ARG(scan);
5961 else if (OP(scan) == EVAL) {
5963 data->flags |= SF_HAS_EVAL;
5965 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5966 if (flags & SCF_DO_SUBSTR) {
5967 scan_commit(pRExC_state, data, minlenp, is_inf);
5968 flags &= ~SCF_DO_SUBSTR;
5970 if (data && OP(scan)==ACCEPT) {
5971 data->flags |= SCF_SEEN_ACCEPT;
5976 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5978 if (flags & SCF_DO_SUBSTR) {
5979 scan_commit(pRExC_state, data, minlenp, is_inf);
5980 data->cur_is_floating = 1; /* float */
5982 is_inf = is_inf_internal = 1;
5983 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5984 ssc_anything(data->start_class);
5985 flags &= ~SCF_DO_STCLASS;
5987 else if (OP(scan) == GPOS) {
5988 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5989 !(delta || is_inf || (data && data->pos_delta)))
5991 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5992 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5993 if (RExC_rx->gofs < (STRLEN)min)
5994 RExC_rx->gofs = min;
5996 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6000 #ifdef TRIE_STUDY_OPT
6001 #ifdef FULL_TRIE_STUDY
6002 else if (PL_regkind[OP(scan)] == TRIE) {
6003 /* NOTE - There is similar code to this block above for handling
6004 BRANCH nodes on the initial study. If you change stuff here
6006 regnode *trie_node= scan;
6007 regnode *tail= regnext(scan);
6008 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6009 SSize_t max1 = 0, min1 = SSize_t_MAX;
6012 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6013 /* Cannot merge strings after this. */
6014 scan_commit(pRExC_state, data, minlenp, is_inf);
6016 if (flags & SCF_DO_STCLASS)
6017 ssc_init_zero(pRExC_state, &accum);
6023 const regnode *nextbranch= NULL;
6026 for ( word=1 ; word <= trie->wordcount ; word++)
6028 SSize_t deltanext=0, minnext=0, f = 0, fake;
6029 regnode_ssc this_class;
6031 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6033 data_fake.whilem_c = data->whilem_c;
6034 data_fake.last_closep = data->last_closep;
6037 data_fake.last_closep = &fake;
6038 data_fake.pos_delta = delta;
6039 if (flags & SCF_DO_STCLASS) {
6040 ssc_init(pRExC_state, &this_class);
6041 data_fake.start_class = &this_class;
6042 f = SCF_DO_STCLASS_AND;
6044 if (flags & SCF_WHILEM_VISITED_POS)
6045 f |= SCF_WHILEM_VISITED_POS;
6047 if (trie->jump[word]) {
6049 nextbranch = trie_node + trie->jump[0];
6050 scan= trie_node + trie->jump[word];
6051 /* We go from the jump point to the branch that follows
6052 it. Note this means we need the vestigal unused
6053 branches even though they arent otherwise used. */
6054 /* optimise study_chunk() for TRIE */
6055 minnext = study_chunk(pRExC_state, &scan, minlenp,
6056 &deltanext, (regnode *)nextbranch, &data_fake,
6057 stopparen, recursed_depth, NULL, f, depth+1);
6059 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6060 nextbranch= regnext((regnode*)nextbranch);
6062 if (min1 > (SSize_t)(minnext + trie->minlen))
6063 min1 = minnext + trie->minlen;
6064 if (deltanext == SSize_t_MAX) {
6065 is_inf = is_inf_internal = 1;
6067 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6068 max1 = minnext + deltanext + trie->maxlen;
6070 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6072 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6073 if ( stopmin > min + min1)
6074 stopmin = min + min1;
6075 flags &= ~SCF_DO_SUBSTR;
6077 data->flags |= SCF_SEEN_ACCEPT;
6080 if (data_fake.flags & SF_HAS_EVAL)
6081 data->flags |= SF_HAS_EVAL;
6082 data->whilem_c = data_fake.whilem_c;
6084 if (flags & SCF_DO_STCLASS)
6085 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6088 if (flags & SCF_DO_SUBSTR) {
6089 data->pos_min += min1;
6090 data->pos_delta += max1 - min1;
6091 if (max1 != min1 || is_inf)
6092 data->cur_is_floating = 1; /* float */
6095 if (delta != SSize_t_MAX) {
6096 if (SSize_t_MAX - (max1 - min1) >= delta)
6097 delta += max1 - min1;
6099 delta = SSize_t_MAX;
6101 if (flags & SCF_DO_STCLASS_OR) {
6102 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6104 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6105 flags &= ~SCF_DO_STCLASS;
6108 else if (flags & SCF_DO_STCLASS_AND) {
6110 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6111 flags &= ~SCF_DO_STCLASS;
6114 /* Switch to OR mode: cache the old value of
6115 * data->start_class */
6117 StructCopy(data->start_class, and_withp, regnode_ssc);
6118 flags &= ~SCF_DO_STCLASS_AND;
6119 StructCopy(&accum, data->start_class, regnode_ssc);
6120 flags |= SCF_DO_STCLASS_OR;
6127 else if (PL_regkind[OP(scan)] == TRIE) {
6128 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6131 min += trie->minlen;
6132 delta += (trie->maxlen - trie->minlen);
6133 flags &= ~SCF_DO_STCLASS; /* xxx */
6134 if (flags & SCF_DO_SUBSTR) {
6135 /* Cannot expect anything... */
6136 scan_commit(pRExC_state, data, minlenp, is_inf);
6137 data->pos_min += trie->minlen;
6138 data->pos_delta += (trie->maxlen - trie->minlen);
6139 if (trie->maxlen != trie->minlen)
6140 data->cur_is_floating = 1; /* float */
6142 if (trie->jump) /* no more substrings -- for now /grr*/
6143 flags &= ~SCF_DO_SUBSTR;
6145 #endif /* old or new */
6146 #endif /* TRIE_STUDY_OPT */
6148 /* Else: zero-length, ignore. */
6149 scan = regnext(scan);
6154 /* we need to unwind recursion. */
6157 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6158 DEBUG_PEEP("fend", scan, depth, flags);
6160 /* restore previous context */
6161 last = frame->last_regnode;
6162 scan = frame->next_regnode;
6163 stopparen = frame->stopparen;
6164 recursed_depth = frame->prev_recursed_depth;
6166 RExC_frame_last = frame->prev_frame;
6167 frame = frame->this_prev_frame;
6168 goto fake_study_recurse;
6172 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6175 *deltap = is_inf_internal ? SSize_t_MAX : delta;
6177 if (flags & SCF_DO_SUBSTR && is_inf)
6178 data->pos_delta = SSize_t_MAX - data->pos_min;
6179 if (is_par > (I32)U8_MAX)
6181 if (is_par && pars==1 && data) {
6182 data->flags |= SF_IN_PAR;
6183 data->flags &= ~SF_HAS_PAR;
6185 else if (pars && data) {
6186 data->flags |= SF_HAS_PAR;
6187 data->flags &= ~SF_IN_PAR;
6189 if (flags & SCF_DO_STCLASS_OR)
6190 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6191 if (flags & SCF_TRIE_RESTUDY)
6192 data->flags |= SCF_TRIE_RESTUDY;
6194 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6197 SSize_t final_minlen= min < stopmin ? min : stopmin;
6199 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6200 if (final_minlen > SSize_t_MAX - delta)
6201 RExC_maxlen = SSize_t_MAX;
6202 else if (RExC_maxlen < final_minlen + delta)
6203 RExC_maxlen = final_minlen + delta;
6205 return final_minlen;
6207 NOT_REACHED; /* NOTREACHED */
6211 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6213 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6215 PERL_ARGS_ASSERT_ADD_DATA;
6217 Renewc(RExC_rxi->data,
6218 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6219 char, struct reg_data);
6221 Renew(RExC_rxi->data->what, count + n, U8);
6223 Newx(RExC_rxi->data->what, n, U8);
6224 RExC_rxi->data->count = count + n;
6225 Copy(s, RExC_rxi->data->what + count, n, U8);
6229 /*XXX: todo make this not included in a non debugging perl, but appears to be
6230 * used anyway there, in 'use re' */
6231 #ifndef PERL_IN_XSUB_RE
6233 Perl_reginitcolors(pTHX)
6235 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6237 char *t = savepv(s);
6241 t = strchr(t, '\t');
6247 PL_colors[i] = t = (char *)"";
6252 PL_colors[i++] = (char *)"";
6259 #ifdef TRIE_STUDY_OPT
6260 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6263 (data.flags & SCF_TRIE_RESTUDY) \
6271 #define CHECK_RESTUDY_GOTO_butfirst
6275 * pregcomp - compile a regular expression into internal code
6277 * Decides which engine's compiler to call based on the hint currently in
6281 #ifndef PERL_IN_XSUB_RE
6283 /* return the currently in-scope regex engine (or the default if none) */
6285 regexp_engine const *
6286 Perl_current_re_engine(pTHX)
6288 if (IN_PERL_COMPILETIME) {
6289 HV * const table = GvHV(PL_hintgv);
6292 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6293 return &PL_core_reg_engine;
6294 ptr = hv_fetchs(table, "regcomp", FALSE);
6295 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6296 return &PL_core_reg_engine;
6297 return INT2PTR(regexp_engine*, SvIV(*ptr));
6301 if (!PL_curcop->cop_hints_hash)
6302 return &PL_core_reg_engine;
6303 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6304 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6305 return &PL_core_reg_engine;
6306 return INT2PTR(regexp_engine*, SvIV(ptr));
6312 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6314 regexp_engine const *eng = current_re_engine();
6315 GET_RE_DEBUG_FLAGS_DECL;
6317 PERL_ARGS_ASSERT_PREGCOMP;
6319 /* Dispatch a request to compile a regexp to correct regexp engine. */
6321 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6324 return CALLREGCOMP_ENG(eng, pattern, flags);
6328 /* public(ish) entry point for the perl core's own regex compiling code.
6329 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6330 * pattern rather than a list of OPs, and uses the internal engine rather
6331 * than the current one */
6334 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6336 SV *pat = pattern; /* defeat constness! */
6337 PERL_ARGS_ASSERT_RE_COMPILE;
6338 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6339 #ifdef PERL_IN_XSUB_RE
6342 &PL_core_reg_engine,
6344 NULL, NULL, rx_flags, 0);
6349 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6353 if (--cbs->refcnt > 0)
6355 for (n = 0; n < cbs->count; n++) {
6356 REGEXP *rx = cbs->cb[n].src_regex;
6358 cbs->cb[n].src_regex = NULL;
6359 SvREFCNT_dec_NN(rx);
6367 static struct reg_code_blocks *
6368 S_alloc_code_blocks(pTHX_ int ncode)
6370 struct reg_code_blocks *cbs;
6371 Newx(cbs, 1, struct reg_code_blocks);
6374 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6376 Newx(cbs->cb, ncode, struct reg_code_block);
6383 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6384 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6385 * point to the realloced string and length.
6387 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6391 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6392 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6394 U8 *const src = (U8*)*pat_p;
6399 GET_RE_DEBUG_FLAGS_DECL;
6401 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6402 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6404 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6405 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6408 while (s < *plen_p) {
6409 append_utf8_from_native_byte(src[s], &d);
6411 if (n < num_code_blocks) {
6412 assert(pRExC_state->code_blocks);
6413 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6414 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6415 assert(*(d - 1) == '(');
6418 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6419 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6420 assert(*(d - 1) == ')');
6429 *pat_p = (char*) dst;
6431 RExC_orig_utf8 = RExC_utf8 = 1;
6436 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6437 * while recording any code block indices, and handling overloading,
6438 * nested qr// objects etc. If pat is null, it will allocate a new
6439 * string, or just return the first arg, if there's only one.
6441 * Returns the malloced/updated pat.
6442 * patternp and pat_count is the array of SVs to be concatted;
6443 * oplist is the optional list of ops that generated the SVs;
6444 * recompile_p is a pointer to a boolean that will be set if
6445 * the regex will need to be recompiled.
6446 * delim, if non-null is an SV that will be inserted between each element
6450 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6451 SV *pat, SV ** const patternp, int pat_count,
6452 OP *oplist, bool *recompile_p, SV *delim)
6456 bool use_delim = FALSE;
6457 bool alloced = FALSE;
6459 /* if we know we have at least two args, create an empty string,
6460 * then concatenate args to that. For no args, return an empty string */
6461 if (!pat && pat_count != 1) {
6467 for (svp = patternp; svp < patternp + pat_count; svp++) {
6470 STRLEN orig_patlen = 0;
6472 SV *msv = use_delim ? delim : *svp;
6473 if (!msv) msv = &PL_sv_undef;
6475 /* if we've got a delimiter, we go round the loop twice for each
6476 * svp slot (except the last), using the delimiter the second
6485 if (SvTYPE(msv) == SVt_PVAV) {
6486 /* we've encountered an interpolated array within
6487 * the pattern, e.g. /...@a..../. Expand the list of elements,
6488 * then recursively append elements.
6489 * The code in this block is based on S_pushav() */
6491 AV *const av = (AV*)msv;
6492 const SSize_t maxarg = AvFILL(av) + 1;
6496 assert(oplist->op_type == OP_PADAV
6497 || oplist->op_type == OP_RV2AV);
6498 oplist = OpSIBLING(oplist);
6501 if (SvRMAGICAL(av)) {
6504 Newx(array, maxarg, SV*);
6506 for (i=0; i < maxarg; i++) {
6507 SV ** const svp = av_fetch(av, i, FALSE);
6508 array[i] = svp ? *svp : &PL_sv_undef;
6512 array = AvARRAY(av);
6514 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6515 array, maxarg, NULL, recompile_p,
6517 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6523 /* we make the assumption here that each op in the list of
6524 * op_siblings maps to one SV pushed onto the stack,
6525 * except for code blocks, with have both an OP_NULL and
6527 * This allows us to match up the list of SVs against the
6528 * list of OPs to find the next code block.
6530 * Note that PUSHMARK PADSV PADSV ..
6532 * PADRANGE PADSV PADSV ..
6533 * so the alignment still works. */
6536 if (oplist->op_type == OP_NULL
6537 && (oplist->op_flags & OPf_SPECIAL))
6539 assert(n < pRExC_state->code_blocks->count);
6540 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6541 pRExC_state->code_blocks->cb[n].block = oplist;
6542 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6545 oplist = OpSIBLING(oplist); /* skip CONST */
6548 oplist = OpSIBLING(oplist);;
6551 /* apply magic and QR overloading to arg */
6554 if (SvROK(msv) && SvAMAGIC(msv)) {
6555 SV *sv = AMG_CALLunary(msv, regexp_amg);
6559 if (SvTYPE(sv) != SVt_REGEXP)
6560 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6565 /* try concatenation overload ... */
6566 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6567 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6570 /* overloading involved: all bets are off over literal
6571 * code. Pretend we haven't seen it */
6573 pRExC_state->code_blocks->count -= n;
6577 /* ... or failing that, try "" overload */
6578 while (SvAMAGIC(msv)
6579 && (sv = AMG_CALLunary(msv, string_amg))
6583 && SvRV(msv) == SvRV(sv))
6588 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6592 /* this is a partially unrolled
6593 * sv_catsv_nomg(pat, msv);
6594 * that allows us to adjust code block indices if
6597 char *dst = SvPV_force_nomg(pat, dlen);
6599 if (SvUTF8(msv) && !SvUTF8(pat)) {
6600 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6601 sv_setpvn(pat, dst, dlen);
6604 sv_catsv_nomg(pat, msv);
6608 /* We have only one SV to process, but we need to verify
6609 * it is properly null terminated or we will fail asserts
6610 * later. In theory we probably shouldn't get such SV's,
6611 * but if we do we should handle it gracefully. */
6612 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6613 /* not a string, or a string with a trailing null */
6616 /* a string with no trailing null, we need to copy it
6617 * so it has a trailing null */
6618 pat = sv_2mortal(newSVsv(msv));
6623 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6626 /* extract any code blocks within any embedded qr//'s */
6627 if (rx && SvTYPE(rx) == SVt_REGEXP
6628 && RX_ENGINE((REGEXP*)rx)->op_comp)
6631 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6632 if (ri->code_blocks && ri->code_blocks->count) {
6634 /* the presence of an embedded qr// with code means
6635 * we should always recompile: the text of the
6636 * qr// may not have changed, but it may be a
6637 * different closure than last time */
6639 if (pRExC_state->code_blocks) {
6640 int new_count = pRExC_state->code_blocks->count
6641 + ri->code_blocks->count;
6642 Renew(pRExC_state->code_blocks->cb,
6643 new_count, struct reg_code_block);
6644 pRExC_state->code_blocks->count = new_count;
6647 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6648 ri->code_blocks->count);
6650 for (i=0; i < ri->code_blocks->count; i++) {
6651 struct reg_code_block *src, *dst;
6652 STRLEN offset = orig_patlen
6653 + ReANY((REGEXP *)rx)->pre_prefix;
6654 assert(n < pRExC_state->code_blocks->count);
6655 src = &ri->code_blocks->cb[i];
6656 dst = &pRExC_state->code_blocks->cb[n];
6657 dst->start = src->start + offset;
6658 dst->end = src->end + offset;
6659 dst->block = src->block;
6660 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6669 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6678 /* see if there are any run-time code blocks in the pattern.
6679 * False positives are allowed */
6682 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6683 char *pat, STRLEN plen)
6688 PERL_UNUSED_CONTEXT;
6690 for (s = 0; s < plen; s++) {
6691 if ( pRExC_state->code_blocks
6692 && n < pRExC_state->code_blocks->count
6693 && s == pRExC_state->code_blocks->cb[n].start)
6695 s = pRExC_state->code_blocks->cb[n].end;
6699 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6701 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6703 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6710 /* Handle run-time code blocks. We will already have compiled any direct
6711 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6712 * copy of it, but with any literal code blocks blanked out and
6713 * appropriate chars escaped; then feed it into
6715 * eval "qr'modified_pattern'"
6719 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6723 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6725 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6726 * and merge them with any code blocks of the original regexp.
6728 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6729 * instead, just save the qr and return FALSE; this tells our caller that
6730 * the original pattern needs upgrading to utf8.
6734 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6735 char *pat, STRLEN plen)
6739 GET_RE_DEBUG_FLAGS_DECL;
6741 if (pRExC_state->runtime_code_qr) {
6742 /* this is the second time we've been called; this should
6743 * only happen if the main pattern got upgraded to utf8
6744 * during compilation; re-use the qr we compiled first time
6745 * round (which should be utf8 too)
6747 qr = pRExC_state->runtime_code_qr;
6748 pRExC_state->runtime_code_qr = NULL;
6749 assert(RExC_utf8 && SvUTF8(qr));
6755 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6759 /* determine how many extra chars we need for ' and \ escaping */
6760 for (s = 0; s < plen; s++) {
6761 if (pat[s] == '\'' || pat[s] == '\\')
6765 Newx(newpat, newlen, char);
6767 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6769 for (s = 0; s < plen; s++) {
6770 if ( pRExC_state->code_blocks
6771 && n < pRExC_state->code_blocks->count
6772 && s == pRExC_state->code_blocks->cb[n].start)
6774 /* blank out literal code block so that they aren't
6775 * recompiled: eg change from/to:
6785 assert(pat[s] == '(');
6786 assert(pat[s+1] == '?');
6790 while (s < pRExC_state->code_blocks->cb[n].end) {
6798 if (pat[s] == '\'' || pat[s] == '\\')
6803 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
6805 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
6811 Perl_re_printf( aTHX_
6812 "%sre-parsing pattern for runtime code:%s %s\n",
6813 PL_colors[4], PL_colors[5], newpat);
6816 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6822 PUSHSTACKi(PERLSI_REQUIRE);
6823 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6824 * parsing qr''; normally only q'' does this. It also alters
6826 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6827 SvREFCNT_dec_NN(sv);
6832 SV * const errsv = ERRSV;
6833 if (SvTRUE_NN(errsv))
6834 /* use croak_sv ? */
6835 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
6837 assert(SvROK(qr_ref));
6839 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6840 /* the leaving below frees the tmp qr_ref.
6841 * Give qr a life of its own */
6849 if (!RExC_utf8 && SvUTF8(qr)) {
6850 /* first time through; the pattern got upgraded; save the
6851 * qr for the next time through */
6852 assert(!pRExC_state->runtime_code_qr);
6853 pRExC_state->runtime_code_qr = qr;
6858 /* extract any code blocks within the returned qr// */
6861 /* merge the main (r1) and run-time (r2) code blocks into one */
6863 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6864 struct reg_code_block *new_block, *dst;
6865 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6869 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
6871 SvREFCNT_dec_NN(qr);
6875 if (!r1->code_blocks)
6876 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
6878 r1c = r1->code_blocks->count;
6879 r2c = r2->code_blocks->count;
6881 Newx(new_block, r1c + r2c, struct reg_code_block);
6885 while (i1 < r1c || i2 < r2c) {
6886 struct reg_code_block *src;
6890 src = &r2->code_blocks->cb[i2++];
6894 src = &r1->code_blocks->cb[i1++];
6895 else if ( r1->code_blocks->cb[i1].start
6896 < r2->code_blocks->cb[i2].start)
6898 src = &r1->code_blocks->cb[i1++];
6899 assert(src->end < r2->code_blocks->cb[i2].start);
6902 assert( r1->code_blocks->cb[i1].start
6903 > r2->code_blocks->cb[i2].start);
6904 src = &r2->code_blocks->cb[i2++];
6906 assert(src->end < r1->code_blocks->cb[i1].start);
6909 assert(pat[src->start] == '(');
6910 assert(pat[src->end] == ')');
6911 dst->start = src->start;
6912 dst->end = src->end;
6913 dst->block = src->block;
6914 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6918 r1->code_blocks->count += r2c;
6919 Safefree(r1->code_blocks->cb);
6920 r1->code_blocks->cb = new_block;
6923 SvREFCNT_dec_NN(qr);
6929 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
6930 struct reg_substr_datum *rsd,
6931 struct scan_data_substrs *sub,
6932 STRLEN longest_length)
6934 /* This is the common code for setting up the floating and fixed length
6935 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6936 * as to whether succeeded or not */
6940 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
6941 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
6943 if (! (longest_length
6944 || (eol /* Can't have SEOL and MULTI */
6945 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6947 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6948 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6953 /* copy the information about the longest from the reg_scan_data
6954 over to the program. */
6955 if (SvUTF8(sub->str)) {
6957 rsd->utf8_substr = sub->str;
6959 rsd->substr = sub->str;
6960 rsd->utf8_substr = NULL;
6962 /* end_shift is how many chars that must be matched that
6963 follow this item. We calculate it ahead of time as once the
6964 lookbehind offset is added in we lose the ability to correctly
6966 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
6967 rsd->end_shift = ml - sub->min_offset
6969 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
6971 + (SvTAIL(sub->str) != 0)
6975 t = (eol/* Can't have SEOL and MULTI */
6976 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6977 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
6983 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
6985 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
6986 * properly wrapped with the right modifiers */
6988 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6989 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
6990 != REGEX_DEPENDS_CHARSET);
6992 /* The caret is output if there are any defaults: if not all the STD
6993 * flags are set, or if no character set specifier is needed */
6995 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6997 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6998 == REG_RUN_ON_COMMENT_SEEN);
6999 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7000 >> RXf_PMf_STD_PMMOD_SHIFT);
7001 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7003 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7005 /* We output all the necessary flags; we never output a minus, as all
7006 * those are defaults, so are
7007 * covered by the caret */
7008 const STRLEN wraplen = pat_len + has_p + has_runon
7009 + has_default /* If needs a caret */
7010 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7012 /* If needs a character set specifier */
7013 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7014 + (sizeof("(?:)") - 1);
7016 PERL_ARGS_ASSERT_SET_REGEX_PV;
7018 /* make sure PL_bitcount bounds not exceeded */
7019 assert(sizeof(STD_PAT_MODS) <= 8);
7021 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7024 SvFLAGS(Rx) |= SVf_UTF8;
7027 /* If a default, cover it using the caret */
7029 *p++= DEFAULT_PAT_MOD;
7035 name = get_regex_charset_name(RExC_rx->extflags, &len);
7036 if strEQ(name, DEPENDS_PAT_MODS) { /* /d under UTF-8 => /u */
7038 name = UNICODE_PAT_MODS;
7039 len = sizeof(UNICODE_PAT_MODS) - 1;
7041 Copy(name, p, len, char);
7045 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7048 while((ch = *fptr++)) {
7056 Copy(RExC_precomp, p, pat_len, char);
7057 assert ((RX_WRAPPED(Rx) - p) < 16);
7058 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7061 /* Adding a trailing \n causes this to compile properly:
7062 my $R = qr / A B C # D E/x; /($R)/
7063 Otherwise the parens are considered part of the comment */
7068 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7072 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7073 * regular expression into internal code.
7074 * The pattern may be passed either as:
7075 * a list of SVs (patternp plus pat_count)
7076 * a list of OPs (expr)
7077 * If both are passed, the SV list is used, but the OP list indicates
7078 * which SVs are actually pre-compiled code blocks
7080 * The SVs in the list have magic and qr overloading applied to them (and
7081 * the list may be modified in-place with replacement SVs in the latter
7084 * If the pattern hasn't changed from old_re, then old_re will be
7087 * eng is the current engine. If that engine has an op_comp method, then
7088 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7089 * do the initial concatenation of arguments and pass on to the external
7092 * If is_bare_re is not null, set it to a boolean indicating whether the
7093 * arg list reduced (after overloading) to a single bare regex which has
7094 * been returned (i.e. /$qr/).
7096 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7098 * pm_flags contains the PMf_* flags, typically based on those from the
7099 * pm_flags field of the related PMOP. Currently we're only interested in
7100 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7102 * For many years this code had an initial sizing pass that calculated
7103 * (sometimes incorrectly, leading to security holes) the size needed for the
7104 * compiled pattern. That was changed by commit
7105 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7106 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7107 * references to this sizing pass.
7109 * Now, an initial crude guess as to the size needed is made, based on the
7110 * length of the pattern. Patches welcome to improve that guess. That amount
7111 * of space is malloc'd and then immediately freed, and then clawed back node
7112 * by node. This design is to minimze, to the extent possible, memory churn
7113 * when doing the the reallocs.
7115 * A separate parentheses counting pass may be needed in some cases.
7116 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7119 * The existence of a sizing pass necessitated design decisions that are no
7120 * longer needed. There are potential areas of simplification.
7122 * Beware that the optimization-preparation code in here knows about some
7123 * of the structure of the compiled regexp. [I'll say.]
7127 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7128 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7129 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7131 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7139 SV** new_patternp = patternp;
7141 /* these are all flags - maybe they should be turned
7142 * into a single int with different bit masks */
7143 I32 sawlookahead = 0;
7148 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7150 bool runtime_code = 0;
7152 RExC_state_t RExC_state;
7153 RExC_state_t * const pRExC_state = &RExC_state;
7154 #ifdef TRIE_STUDY_OPT
7156 RExC_state_t copyRExC_state;
7158 GET_RE_DEBUG_FLAGS_DECL;
7160 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7162 DEBUG_r(if (!PL_colorset) reginitcolors());
7164 /* Initialize these here instead of as-needed, as is quick and avoids
7165 * having to test them each time otherwise */
7166 if (! PL_InBitmap) {
7168 char * dump_len_string;
7171 /* This is calculated here, because the Perl program that generates the
7172 * static global ones doesn't currently have access to
7173 * NUM_ANYOF_CODE_POINTS */
7174 PL_InBitmap = _new_invlist(2);
7175 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7176 NUM_ANYOF_CODE_POINTS - 1);
7178 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7179 if ( ! dump_len_string
7180 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7182 PL_dump_re_max_len = 60; /* A reasonable default */
7187 pRExC_state->warn_text = NULL;
7188 pRExC_state->code_blocks = NULL;
7191 *is_bare_re = FALSE;
7193 if (expr && (expr->op_type == OP_LIST ||
7194 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7195 /* allocate code_blocks if needed */
7199 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7200 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7201 ncode++; /* count of DO blocks */
7204 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7208 /* compile-time pattern with just OP_CONSTs and DO blocks */
7213 /* find how many CONSTs there are */
7216 if (expr->op_type == OP_CONST)
7219 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7220 if (o->op_type == OP_CONST)
7224 /* fake up an SV array */
7226 assert(!new_patternp);
7227 Newx(new_patternp, n, SV*);
7228 SAVEFREEPV(new_patternp);
7232 if (expr->op_type == OP_CONST)
7233 new_patternp[n] = cSVOPx_sv(expr);
7235 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7236 if (o->op_type == OP_CONST)
7237 new_patternp[n++] = cSVOPo_sv;
7242 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7243 "Assembling pattern from %d elements%s\n", pat_count,
7244 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7246 /* set expr to the first arg op */
7248 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7249 && expr->op_type != OP_CONST)
7251 expr = cLISTOPx(expr)->op_first;
7252 assert( expr->op_type == OP_PUSHMARK
7253 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7254 || expr->op_type == OP_PADRANGE);
7255 expr = OpSIBLING(expr);
7258 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7259 expr, &recompile, NULL);
7261 /* handle bare (possibly after overloading) regex: foo =~ $re */
7266 if (SvTYPE(re) == SVt_REGEXP) {
7270 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7271 "Precompiled pattern%s\n",
7272 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7278 exp = SvPV_nomg(pat, plen);
7280 if (!eng->op_comp) {
7281 if ((SvUTF8(pat) && IN_BYTES)
7282 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7284 /* make a temporary copy; either to convert to bytes,
7285 * or to avoid repeating get-magic / overloaded stringify */
7286 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7287 (IN_BYTES ? 0 : SvUTF8(pat)));
7289 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7292 /* ignore the utf8ness if the pattern is 0 length */
7293 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7295 RExC_uni_semantics = RExC_utf8; /* UTF-8 implies unicode semantics;
7296 otherwise we may find later this should
7298 RExC_contains_locale = 0;
7299 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7300 RExC_in_script_run = 0;
7301 RExC_study_started = 0;
7302 pRExC_state->runtime_code_qr = NULL;
7303 RExC_frame_head= NULL;
7304 RExC_frame_last= NULL;
7305 RExC_frame_count= 0;
7306 RExC_latest_warn_offset = 0;
7307 RExC_use_BRANCHJ = 0;
7308 RExC_total_parens = 0;
7309 RExC_open_parens = NULL;
7310 RExC_close_parens = NULL;
7311 RExC_paren_names = NULL;
7313 RExC_seen_d_op = FALSE;
7315 RExC_paren_name_list = NULL;
7319 RExC_mysv1= sv_newmortal();
7320 RExC_mysv2= sv_newmortal();
7324 SV *dsv= sv_newmortal();
7325 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7326 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7327 PL_colors[4], PL_colors[5], s);
7330 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7333 if ((pm_flags & PMf_USE_RE_EVAL)
7334 /* this second condition covers the non-regex literal case,
7335 * i.e. $foo =~ '(?{})'. */
7336 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7338 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7341 /* return old regex if pattern hasn't changed */
7342 /* XXX: note in the below we have to check the flags as well as the
7345 * Things get a touch tricky as we have to compare the utf8 flag
7346 * independently from the compile flags. */
7350 && !!RX_UTF8(old_re) == !!RExC_utf8
7351 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7352 && RX_PRECOMP(old_re)
7353 && RX_PRELEN(old_re) == plen
7354 && memEQ(RX_PRECOMP(old_re), exp, plen)
7355 && !runtime_code /* with runtime code, always recompile */ )
7360 /* Allocate the pattern's SV */
7361 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7362 RExC_rx = ReANY(Rx);
7363 if ( RExC_rx == NULL )
7364 FAIL("Regexp out of space");
7366 rx_flags = orig_rx_flags;
7368 if (initial_charset == REGEX_DEPENDS_CHARSET && RExC_uni_semantics) {
7370 /* Set to use unicode semantics if the pattern is in utf8 and has the
7371 * 'depends' charset specified, as it means unicode when utf8 */
7372 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7375 RExC_pm_flags = pm_flags;
7378 assert(TAINTING_get || !TAINT_get);
7380 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7382 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7383 /* whoops, we have a non-utf8 pattern, whilst run-time code
7384 * got compiled as utf8. Try again with a utf8 pattern */
7385 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7386 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7390 assert(!pRExC_state->runtime_code_qr);
7396 RExC_in_lookbehind = 0;
7397 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7399 RExC_recode_x_to_native = 0;
7401 RExC_in_multi_char_class = 0;
7403 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7404 RExC_precomp_end = RExC_end = exp + plen;
7406 RExC_whilem_seen = 0;
7408 RExC_recurse = NULL;
7409 RExC_study_chunk_recursed = NULL;
7410 RExC_study_chunk_recursed_bytes= 0;
7411 RExC_recurse_count = 0;
7412 pRExC_state->code_index = 0;
7414 /* Initialize the string in the compiled pattern. This is so that there is
7415 * something to output if necessary */
7416 set_regex_pv(pRExC_state, Rx);
7419 Perl_re_printf( aTHX_
7420 "Starting parse and generation\n");
7422 RExC_lastparse=NULL;
7425 /* Allocate space and zero-initialize. Note, the two step process
7426 of zeroing when in debug mode, thus anything assigned has to
7427 happen after that */
7430 /* On the first pass of the parse, we guess how big this will be. Then
7431 * we grow in one operation to that amount and then give it back. As
7432 * we go along, we re-allocate what we need.
7434 * XXX Currently the guess is essentially that the pattern will be an
7435 * EXACT node with one byte input, one byte output. This is crude, and
7436 * better heuristics are welcome.
7438 * On any subsequent passes, we guess what we actually computed in the
7439 * latest earlier pass. Such a pass probably didn't complete so is
7440 * missing stuff. We could improve those guesses by knowing where the
7441 * parse stopped, and use the length so far plus apply the above
7442 * assumption to what's left. */
7443 RExC_size = STR_SZ(RExC_end - RExC_start);
7446 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7447 if ( RExC_rxi == NULL )
7448 FAIL("Regexp out of space");
7450 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7451 RXi_SET( RExC_rx, RExC_rxi );
7453 /* We start from 0 (over from 0 in the case this is a reparse. The first
7454 * node parsed will give back any excess memory we have allocated so far).
7458 /* non-zero initialization begins here */
7459 RExC_rx->engine= eng;
7460 RExC_rx->extflags = rx_flags;
7461 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7463 if (pm_flags & PMf_IS_QR) {
7464 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7465 if (RExC_rxi->code_blocks) {
7466 RExC_rxi->code_blocks->refcnt++;
7470 RExC_rx->intflags = 0;
7472 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7475 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7476 * code makes sure the final byte is an uncounted NUL. But should this
7477 * ever not be the case, lots of things could read beyond the end of the
7478 * buffer: loops like
7479 * while(isFOO(*RExC_parse)) RExC_parse++;
7480 * strchr(RExC_parse, "foo");
7481 * etc. So it is worth noting. */
7482 assert(*RExC_end == '\0');
7486 RExC_emit_start = RExC_rxi->program;
7487 pRExC_state->code_index = 0;
7489 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7493 if (reg(pRExC_state, 0, &flags, 1)) {
7495 /* Success!, But if RExC_total_parens < 0, we need to redo the parse
7496 * knowing how many parens there actually are */
7497 if (RExC_total_parens < 0) {
7498 flags |= RESTART_PARSE;
7501 /* We have that number in RExC_npar */
7502 RExC_total_parens = RExC_npar;
7504 else if (! MUST_RESTART(flags)) {
7506 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7509 /* Here, we either have success, or we have to redo the parse for some reason */
7510 if (MUST_RESTART(flags)) {
7512 /* It's possible to write a regexp in ascii that represents Unicode
7513 codepoints outside of the byte range, such as via \x{100}. If we
7514 detect such a sequence we have to convert the entire pattern to utf8
7515 and then recompile, as our sizing calculation will have been based
7516 on 1 byte == 1 character, but we will need to use utf8 to encode
7517 at least some part of the pattern, and therefore must convert the whole
7520 if (flags & NEED_UTF8) {
7522 /* We have stored the offset of the final warning output so far.
7523 * That must be adjusted. Any variant characters between the start
7524 * of the pattern and this warning count for 2 bytes in the final,
7525 * so just add them again */
7526 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7527 RExC_latest_warn_offset +=
7528 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7529 + RExC_latest_warn_offset);
7531 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7532 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7533 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7536 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7539 if (RExC_total_parens > 0) {
7540 /* Make enough room for all the known parens, and zero it */
7541 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7542 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7543 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7545 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7546 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7548 else { /* Parse did not complete. Reinitialize the parentheses
7550 RExC_total_parens = 0;
7551 if (RExC_open_parens) {
7552 Safefree(RExC_open_parens);
7553 RExC_open_parens = NULL;
7555 if (RExC_close_parens) {
7556 Safefree(RExC_close_parens);
7557 RExC_close_parens = NULL;
7561 /* Clean up what we did in this parse */
7562 SvREFCNT_dec_NN(RExC_rx_sv);
7567 /* Here, we have successfully parsed and generated the pattern's program
7568 * for the regex engine. We are ready to finish things up and look for
7571 /* Update the string to compile, with correct modifiers, etc */
7572 set_regex_pv(pRExC_state, Rx);
7574 RExC_rx->nparens = RExC_total_parens - 1;
7576 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7577 if (RExC_whilem_seen > 15)
7578 RExC_whilem_seen = 15;
7581 Perl_re_printf( aTHX_
7582 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7584 RExC_lastparse=NULL;
7587 #ifdef RE_TRACK_PATTERN_OFFSETS
7588 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7589 "%s %" UVuf " bytes for offset annotations.\n",
7590 RExC_offsets ? "Got" : "Couldn't get",
7591 (UV)((RExC_offsets[0] * 2 + 1))));
7592 DEBUG_OFFSETS_r(if (RExC_offsets) {
7593 const STRLEN len = RExC_offsets[0];
7595 GET_RE_DEBUG_FLAGS_DECL;
7596 Perl_re_printf( aTHX_
7597 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7598 for (i = 1; i <= len; i++) {
7599 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7600 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7601 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7603 Perl_re_printf( aTHX_ "\n");
7607 SetProgLen(RExC_rxi,RExC_size);
7611 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7614 /* XXXX To minimize changes to RE engine we always allocate
7615 3-units-long substrs field. */
7616 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7617 if (RExC_recurse_count) {
7618 Newx(RExC_recurse, RExC_recurse_count, regnode *);
7619 SAVEFREEPV(RExC_recurse);
7622 if (RExC_seen & REG_RECURSE_SEEN) {
7623 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7624 * So its 1 if there are no parens. */
7625 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7626 ((RExC_total_parens & 0x07) != 0);
7627 Newx(RExC_study_chunk_recursed,
7628 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7629 SAVEFREEPV(RExC_study_chunk_recursed);
7633 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7635 RExC_study_chunk_recursed_count= 0;
7637 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7638 if (RExC_study_chunk_recursed) {
7639 Zero(RExC_study_chunk_recursed,
7640 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7644 #ifdef TRIE_STUDY_OPT
7646 StructCopy(&zero_scan_data, &data, scan_data_t);
7647 copyRExC_state = RExC_state;
7650 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7652 RExC_state = copyRExC_state;
7653 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7654 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7656 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7657 StructCopy(&zero_scan_data, &data, scan_data_t);
7660 StructCopy(&zero_scan_data, &data, scan_data_t);
7663 /* Dig out information for optimizations. */
7664 RExC_rx->extflags = RExC_flags; /* was pm_op */
7665 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7668 SvUTF8_on(Rx); /* Unicode in it? */
7669 RExC_rxi->regstclass = NULL;
7670 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7671 RExC_rx->intflags |= PREGf_NAUGHTY;
7672 scan = RExC_rxi->program + 1; /* First BRANCH. */
7674 /* testing for BRANCH here tells us whether there is "must appear"
7675 data in the pattern. If there is then we can use it for optimisations */
7676 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7679 STRLEN longest_length[2];
7680 regnode_ssc ch_class; /* pointed to by data */
7682 SSize_t last_close = 0; /* pointed to by data */
7683 regnode *first= scan;
7684 regnode *first_next= regnext(first);
7688 * Skip introductions and multiplicators >= 1
7689 * so that we can extract the 'meat' of the pattern that must
7690 * match in the large if() sequence following.
7691 * NOTE that EXACT is NOT covered here, as it is normally
7692 * picked up by the optimiser separately.
7694 * This is unfortunate as the optimiser isnt handling lookahead
7695 * properly currently.
7698 while ((OP(first) == OPEN && (sawopen = 1)) ||
7699 /* An OR of *one* alternative - should not happen now. */
7700 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7701 /* for now we can't handle lookbehind IFMATCH*/
7702 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7703 (OP(first) == PLUS) ||
7704 (OP(first) == MINMOD) ||
7705 /* An {n,m} with n>0 */
7706 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7707 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7710 * the only op that could be a regnode is PLUS, all the rest
7711 * will be regnode_1 or regnode_2.
7713 * (yves doesn't think this is true)
7715 if (OP(first) == PLUS)
7718 if (OP(first) == MINMOD)
7720 first += regarglen[OP(first)];
7722 first = NEXTOPER(first);
7723 first_next= regnext(first);
7726 /* Starting-point info. */
7728 DEBUG_PEEP("first:", first, 0, 0);
7729 /* Ignore EXACT as we deal with it later. */
7730 if (PL_regkind[OP(first)] == EXACT) {
7731 if ( OP(first) == EXACT
7732 || OP(first) == EXACT_ONLY8
7733 || OP(first) == EXACTL)
7735 NOOP; /* Empty, get anchored substr later. */
7738 RExC_rxi->regstclass = first;
7741 else if (PL_regkind[OP(first)] == TRIE &&
7742 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
7744 /* this can happen only on restudy */
7745 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7748 else if (REGNODE_SIMPLE(OP(first)))
7749 RExC_rxi->regstclass = first;
7750 else if (PL_regkind[OP(first)] == BOUND ||
7751 PL_regkind[OP(first)] == NBOUND)
7752 RExC_rxi->regstclass = first;
7753 else if (PL_regkind[OP(first)] == BOL) {
7754 RExC_rx->intflags |= (OP(first) == MBOL
7757 first = NEXTOPER(first);
7760 else if (OP(first) == GPOS) {
7761 RExC_rx->intflags |= PREGf_ANCH_GPOS;
7762 first = NEXTOPER(first);
7765 else if ((!sawopen || !RExC_sawback) &&
7767 (OP(first) == STAR &&
7768 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7769 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7771 /* turn .* into ^.* with an implied $*=1 */
7773 (OP(NEXTOPER(first)) == REG_ANY)
7776 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
7777 first = NEXTOPER(first);
7780 if (sawplus && !sawminmod && !sawlookahead
7781 && (!sawopen || !RExC_sawback)
7782 && !pRExC_state->code_blocks) /* May examine pos and $& */
7783 /* x+ must match at the 1st pos of run of x's */
7784 RExC_rx->intflags |= PREGf_SKIP;
7786 /* Scan is after the zeroth branch, first is atomic matcher. */
7787 #ifdef TRIE_STUDY_OPT
7790 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
7791 (IV)(first - scan + 1))
7795 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
7796 (IV)(first - scan + 1))
7802 * If there's something expensive in the r.e., find the
7803 * longest literal string that must appear and make it the
7804 * regmust. Resolve ties in favor of later strings, since
7805 * the regstart check works with the beginning of the r.e.
7806 * and avoiding duplication strengthens checking. Not a
7807 * strong reason, but sufficient in the absence of others.
7808 * [Now we resolve ties in favor of the earlier string if
7809 * it happens that c_offset_min has been invalidated, since the
7810 * earlier string may buy us something the later one won't.]
7813 data.substrs[0].str = newSVpvs("");
7814 data.substrs[1].str = newSVpvs("");
7815 data.last_found = newSVpvs("");
7816 data.cur_is_floating = 0; /* initially any found substring is fixed */
7817 ENTER_with_name("study_chunk");
7818 SAVEFREESV(data.substrs[0].str);
7819 SAVEFREESV(data.substrs[1].str);
7820 SAVEFREESV(data.last_found);
7822 if (!RExC_rxi->regstclass) {
7823 ssc_init(pRExC_state, &ch_class);
7824 data.start_class = &ch_class;
7825 stclass_flag = SCF_DO_STCLASS_AND;
7826 } else /* XXXX Check for BOUND? */
7828 data.last_closep = &last_close;
7832 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
7833 * (NO top level branches)
7835 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7836 scan + RExC_size, /* Up to end */
7838 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7839 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7843 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7846 if ( RExC_total_parens == 1 && !data.cur_is_floating
7847 && data.last_start_min == 0 && data.last_end > 0
7848 && !RExC_seen_zerolen
7849 && !(RExC_seen & REG_VERBARG_SEEN)
7850 && !(RExC_seen & REG_GPOS_SEEN)
7852 RExC_rx->extflags |= RXf_CHECK_ALL;
7854 scan_commit(pRExC_state, &data,&minlen, 0);
7857 /* XXX this is done in reverse order because that's the way the
7858 * code was before it was parameterised. Don't know whether it
7859 * actually needs doing in reverse order. DAPM */
7860 for (i = 1; i >= 0; i--) {
7861 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
7864 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
7865 && data.substrs[0].min_offset
7866 == data.substrs[1].min_offset
7867 && SvCUR(data.substrs[0].str)
7868 == SvCUR(data.substrs[1].str)
7870 && S_setup_longest (aTHX_ pRExC_state,
7871 &(RExC_rx->substrs->data[i]),
7875 RExC_rx->substrs->data[i].min_offset =
7876 data.substrs[i].min_offset - data.substrs[i].lookbehind;
7878 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
7879 /* Don't offset infinity */
7880 if (data.substrs[i].max_offset < SSize_t_MAX)
7881 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
7882 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
7885 RExC_rx->substrs->data[i].substr = NULL;
7886 RExC_rx->substrs->data[i].utf8_substr = NULL;
7887 longest_length[i] = 0;
7891 LEAVE_with_name("study_chunk");
7893 if (RExC_rxi->regstclass
7894 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
7895 RExC_rxi->regstclass = NULL;
7897 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
7898 || RExC_rx->substrs->data[0].min_offset)
7900 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7901 && is_ssc_worth_it(pRExC_state, data.start_class))
7903 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7905 ssc_finalize(pRExC_state, data.start_class);
7907 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7908 StructCopy(data.start_class,
7909 (regnode_ssc*)RExC_rxi->data->data[n],
7911 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
7912 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7913 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7914 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
7915 Perl_re_printf( aTHX_
7916 "synthetic stclass \"%s\".\n",
7917 SvPVX_const(sv));});
7918 data.start_class = NULL;
7921 /* A temporary algorithm prefers floated substr to fixed one of
7922 * same length to dig more info. */
7923 i = (longest_length[0] <= longest_length[1]);
7924 RExC_rx->substrs->check_ix = i;
7925 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
7926 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
7927 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
7928 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
7929 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
7930 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
7931 RExC_rx->intflags |= PREGf_NOSCAN;
7933 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
7934 RExC_rx->extflags |= RXf_USE_INTUIT;
7935 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
7936 RExC_rx->extflags |= RXf_INTUIT_TAIL;
7939 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7940 if ( (STRLEN)minlen < longest_length[1] )
7941 minlen= longest_length[1];
7942 if ( (STRLEN)minlen < longest_length[0] )
7943 minlen= longest_length[0];
7947 /* Several toplevels. Best we can is to set minlen. */
7949 regnode_ssc ch_class;
7950 SSize_t last_close = 0;
7952 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
7954 scan = RExC_rxi->program + 1;
7955 ssc_init(pRExC_state, &ch_class);
7956 data.start_class = &ch_class;
7957 data.last_closep = &last_close;
7961 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
7962 * (patterns WITH top level branches)
7964 minlen = study_chunk(pRExC_state,
7965 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7966 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7967 ? SCF_TRIE_DOING_RESTUDY
7971 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7973 RExC_rx->check_substr = NULL;
7974 RExC_rx->check_utf8 = NULL;
7975 RExC_rx->substrs->data[0].substr = NULL;
7976 RExC_rx->substrs->data[0].utf8_substr = NULL;
7977 RExC_rx->substrs->data[1].substr = NULL;
7978 RExC_rx->substrs->data[1].utf8_substr = NULL;
7980 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7981 && is_ssc_worth_it(pRExC_state, data.start_class))
7983 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7985 ssc_finalize(pRExC_state, data.start_class);
7987 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7988 StructCopy(data.start_class,
7989 (regnode_ssc*)RExC_rxi->data->data[n],
7991 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
7992 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7993 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7994 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
7995 Perl_re_printf( aTHX_
7996 "synthetic stclass \"%s\".\n",
7997 SvPVX_const(sv));});
7998 data.start_class = NULL;
8002 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8003 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8004 RExC_rx->maxlen = REG_INFTY;
8007 RExC_rx->maxlen = RExC_maxlen;
8010 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8011 the "real" pattern. */
8013 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8014 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8016 RExC_rx->minlenret = minlen;
8017 if (RExC_rx->minlen < minlen)
8018 RExC_rx->minlen = minlen;
8020 if (RExC_seen & REG_RECURSE_SEEN ) {
8021 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8022 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8024 if (RExC_seen & REG_GPOS_SEEN)
8025 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8026 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8027 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8029 if (pRExC_state->code_blocks)
8030 RExC_rx->extflags |= RXf_EVAL_SEEN;
8031 if (RExC_seen & REG_VERBARG_SEEN)
8033 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8034 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8036 if (RExC_seen & REG_CUTGROUP_SEEN)
8037 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8038 if (pm_flags & PMf_USE_RE_EVAL)
8039 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8040 if (RExC_paren_names)
8041 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8043 RXp_PAREN_NAMES(RExC_rx) = NULL;
8045 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8046 * so it can be used in pp.c */
8047 if (RExC_rx->intflags & PREGf_ANCH)
8048 RExC_rx->extflags |= RXf_IS_ANCHORED;
8052 /* this is used to identify "special" patterns that might result
8053 * in Perl NOT calling the regex engine and instead doing the match "itself",
8054 * particularly special cases in split//. By having the regex compiler
8055 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8056 * we avoid weird issues with equivalent patterns resulting in different behavior,
8057 * AND we allow non Perl engines to get the same optimizations by the setting the
8058 * flags appropriately - Yves */
8059 regnode *first = RExC_rxi->program + 1;
8061 regnode *next = regnext(first);
8064 if (PL_regkind[fop] == NOTHING && nop == END)
8065 RExC_rx->extflags |= RXf_NULL;
8066 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8067 /* when fop is SBOL first->flags will be true only when it was
8068 * produced by parsing /\A/, and not when parsing /^/. This is
8069 * very important for the split code as there we want to
8070 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8071 * See rt #122761 for more details. -- Yves */
8072 RExC_rx->extflags |= RXf_START_ONLY;
8073 else if (fop == PLUS
8074 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8076 RExC_rx->extflags |= RXf_WHITE;
8077 else if ( RExC_rx->extflags & RXf_SPLIT
8078 && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
8079 && STR_LEN(first) == 1
8080 && *(STRING(first)) == ' '
8082 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8086 if (RExC_contains_locale) {
8087 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8091 if (RExC_paren_names) {
8092 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8093 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8094 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8097 RExC_rxi->name_list_idx = 0;
8099 while ( RExC_recurse_count > 0 ) {
8100 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8102 * This data structure is set up in study_chunk() and is used
8103 * to calculate the distance between a GOSUB regopcode and
8104 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8107 * If for some reason someone writes code that optimises
8108 * away a GOSUB opcode then the assert should be changed to
8109 * an if(scan) to guard the ARG2L_SET() - Yves
8112 assert(scan && OP(scan) == GOSUB);
8113 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8116 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8117 /* assume we don't need to swap parens around before we match */
8119 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8120 (unsigned long)RExC_study_chunk_recursed_count);
8124 Perl_re_printf( aTHX_ "Final program:\n");
8128 if (RExC_open_parens) {
8129 Safefree(RExC_open_parens);
8130 RExC_open_parens = NULL;
8132 if (RExC_close_parens) {
8133 Safefree(RExC_close_parens);
8134 RExC_close_parens = NULL;
8138 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8139 * by setting the regexp SV to readonly-only instead. If the
8140 * pattern's been recompiled, the USEDness should remain. */
8141 if (old_re && SvREADONLY(old_re))
8149 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8152 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8154 PERL_UNUSED_ARG(value);
8156 if (flags & RXapif_FETCH) {
8157 return reg_named_buff_fetch(rx, key, flags);
8158 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8159 Perl_croak_no_modify();
8161 } else if (flags & RXapif_EXISTS) {
8162 return reg_named_buff_exists(rx, key, flags)
8165 } else if (flags & RXapif_REGNAMES) {
8166 return reg_named_buff_all(rx, flags);
8167 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8168 return reg_named_buff_scalar(rx, flags);
8170 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8176 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8179 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8180 PERL_UNUSED_ARG(lastkey);
8182 if (flags & RXapif_FIRSTKEY)
8183 return reg_named_buff_firstkey(rx, flags);
8184 else if (flags & RXapif_NEXTKEY)
8185 return reg_named_buff_nextkey(rx, flags);
8187 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8194 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8198 struct regexp *const rx = ReANY(r);
8200 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8202 if (rx && RXp_PAREN_NAMES(rx)) {
8203 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8206 SV* sv_dat=HeVAL(he_str);
8207 I32 *nums=(I32*)SvPVX(sv_dat);
8208 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8209 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8210 if ((I32)(rx->nparens) >= nums[i]
8211 && rx->offs[nums[i]].start != -1
8212 && rx->offs[nums[i]].end != -1)
8215 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8220 ret = newSVsv(&PL_sv_undef);
8223 av_push(retarray, ret);
8226 return newRV_noinc(MUTABLE_SV(retarray));
8233 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8236 struct regexp *const rx = ReANY(r);
8238 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8240 if (rx && RXp_PAREN_NAMES(rx)) {
8241 if (flags & RXapif_ALL) {
8242 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8244 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8246 SvREFCNT_dec_NN(sv);
8258 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8260 struct regexp *const rx = ReANY(r);
8262 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8264 if ( rx && RXp_PAREN_NAMES(rx) ) {
8265 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8267 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8274 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8276 struct regexp *const rx = ReANY(r);
8277 GET_RE_DEBUG_FLAGS_DECL;
8279 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8281 if (rx && RXp_PAREN_NAMES(rx)) {
8282 HV *hv = RXp_PAREN_NAMES(rx);
8284 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8287 SV* sv_dat = HeVAL(temphe);
8288 I32 *nums = (I32*)SvPVX(sv_dat);
8289 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8290 if ((I32)(rx->lastparen) >= nums[i] &&
8291 rx->offs[nums[i]].start != -1 &&
8292 rx->offs[nums[i]].end != -1)
8298 if (parno || flags & RXapif_ALL) {
8299 return newSVhek(HeKEY_hek(temphe));
8307 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8312 struct regexp *const rx = ReANY(r);
8314 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8316 if (rx && RXp_PAREN_NAMES(rx)) {
8317 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8318 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8319 } else if (flags & RXapif_ONE) {
8320 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8321 av = MUTABLE_AV(SvRV(ret));
8322 length = av_tindex(av);
8323 SvREFCNT_dec_NN(ret);
8324 return newSViv(length + 1);
8326 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8331 return &PL_sv_undef;
8335 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8337 struct regexp *const rx = ReANY(r);
8340 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8342 if (rx && RXp_PAREN_NAMES(rx)) {
8343 HV *hv= RXp_PAREN_NAMES(rx);
8345 (void)hv_iterinit(hv);
8346 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8349 SV* sv_dat = HeVAL(temphe);
8350 I32 *nums = (I32*)SvPVX(sv_dat);
8351 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8352 if ((I32)(rx->lastparen) >= nums[i] &&
8353 rx->offs[nums[i]].start != -1 &&
8354 rx->offs[nums[i]].end != -1)
8360 if (parno || flags & RXapif_ALL) {
8361 av_push(av, newSVhek(HeKEY_hek(temphe)));
8366 return newRV_noinc(MUTABLE_SV(av));
8370 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8373 struct regexp *const rx = ReANY(r);
8379 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8381 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8382 || n == RX_BUFF_IDX_CARET_FULLMATCH
8383 || n == RX_BUFF_IDX_CARET_POSTMATCH
8386 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8388 /* on something like
8391 * the KEEPCOPY is set on the PMOP rather than the regex */
8392 if (PL_curpm && r == PM_GETRE(PL_curpm))
8393 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8402 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8403 /* no need to distinguish between them any more */
8404 n = RX_BUFF_IDX_FULLMATCH;
8406 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8407 && rx->offs[0].start != -1)
8409 /* $`, ${^PREMATCH} */
8410 i = rx->offs[0].start;
8414 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8415 && rx->offs[0].end != -1)
8417 /* $', ${^POSTMATCH} */
8418 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8419 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8422 if ( 0 <= n && n <= (I32)rx->nparens &&
8423 (s1 = rx->offs[n].start) != -1 &&
8424 (t1 = rx->offs[n].end) != -1)
8426 /* $&, ${^MATCH}, $1 ... */
8428 s = rx->subbeg + s1 - rx->suboffset;
8433 assert(s >= rx->subbeg);
8434 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8436 #ifdef NO_TAINT_SUPPORT
8437 sv_setpvn(sv, s, i);
8439 const int oldtainted = TAINT_get;
8441 sv_setpvn(sv, s, i);
8442 TAINT_set(oldtainted);
8444 if (RXp_MATCH_UTF8(rx))
8449 if (RXp_MATCH_TAINTED(rx)) {
8450 if (SvTYPE(sv) >= SVt_PVMG) {
8451 MAGIC* const mg = SvMAGIC(sv);
8454 SvMAGIC_set(sv, mg->mg_moremagic);
8456 if ((mgt = SvMAGIC(sv))) {
8457 mg->mg_moremagic = mgt;
8458 SvMAGIC_set(sv, mg);
8475 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8476 SV const * const value)
8478 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8480 PERL_UNUSED_ARG(rx);
8481 PERL_UNUSED_ARG(paren);
8482 PERL_UNUSED_ARG(value);
8485 Perl_croak_no_modify();
8489 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8492 struct regexp *const rx = ReANY(r);
8496 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8498 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8499 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8500 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8503 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8505 /* on something like
8508 * the KEEPCOPY is set on the PMOP rather than the regex */
8509 if (PL_curpm && r == PM_GETRE(PL_curpm))
8510 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8516 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8518 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8519 case RX_BUFF_IDX_PREMATCH: /* $` */
8520 if (rx->offs[0].start != -1) {
8521 i = rx->offs[0].start;
8530 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8531 case RX_BUFF_IDX_POSTMATCH: /* $' */
8532 if (rx->offs[0].end != -1) {
8533 i = rx->sublen - rx->offs[0].end;
8535 s1 = rx->offs[0].end;
8542 default: /* $& / ${^MATCH}, $1, $2, ... */
8543 if (paren <= (I32)rx->nparens &&
8544 (s1 = rx->offs[paren].start) != -1 &&
8545 (t1 = rx->offs[paren].end) != -1)
8551 if (ckWARN(WARN_UNINITIALIZED))
8552 report_uninit((const SV *)sv);
8557 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8558 const char * const s = rx->subbeg - rx->suboffset + s1;
8563 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8570 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8572 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8573 PERL_UNUSED_ARG(rx);
8577 return newSVpvs("Regexp");
8580 /* Scans the name of a named buffer from the pattern.
8581 * If flags is REG_RSN_RETURN_NULL returns null.
8582 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8583 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8584 * to the parsed name as looked up in the RExC_paren_names hash.
8585 * If there is an error throws a vFAIL().. type exception.
8588 #define REG_RSN_RETURN_NULL 0
8589 #define REG_RSN_RETURN_NAME 1
8590 #define REG_RSN_RETURN_DATA 2
8593 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8595 char *name_start = RExC_parse;
8598 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8600 assert (RExC_parse <= RExC_end);
8601 if (RExC_parse == RExC_end) NOOP;
8602 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8603 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8604 * using do...while */
8607 RExC_parse += UTF8SKIP(RExC_parse);
8608 } while ( RExC_parse < RExC_end
8609 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8613 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8615 RExC_parse++; /* so the <- from the vFAIL is after the offending
8617 vFAIL("Group name must start with a non-digit word character");
8619 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8620 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8621 if ( flags == REG_RSN_RETURN_NAME)
8623 else if (flags==REG_RSN_RETURN_DATA) {
8626 if ( ! sv_name ) /* should not happen*/
8627 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8628 if (RExC_paren_names)
8629 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8631 sv_dat = HeVAL(he_str);
8632 if ( ! sv_dat ) { /* Didn't find group */
8634 /* It might be a forward reference; we can't fail until we
8635 * know, by completing the parse to get all the groups, and
8637 if (RExC_total_parens > 0) {
8638 vFAIL("Reference to nonexistent named group");
8641 REQUIRE_PARENS_PASS;
8647 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8648 (unsigned long) flags);
8651 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8653 if (RExC_lastparse!=RExC_parse) { \
8654 Perl_re_printf( aTHX_ "%s", \
8655 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8656 RExC_end - RExC_parse, 16, \
8658 PERL_PV_ESCAPE_UNI_DETECT | \
8659 PERL_PV_PRETTY_ELLIPSES | \
8660 PERL_PV_PRETTY_LTGT | \
8661 PERL_PV_ESCAPE_RE | \
8662 PERL_PV_PRETTY_EXACTSIZE \
8666 Perl_re_printf( aTHX_ "%16s",""); \
8668 num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
8669 if (RExC_lastnum!=num) \
8670 Perl_re_printf( aTHX_ "|%4d", num); \
8672 Perl_re_printf( aTHX_ "|%4s",""); \
8673 Perl_re_printf( aTHX_ "|%*s%-4s", \
8674 (int)((depth*2)), "", \
8678 RExC_lastparse=RExC_parse; \
8683 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8684 DEBUG_PARSE_MSG((funcname)); \
8685 Perl_re_printf( aTHX_ "%4s","\n"); \
8687 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
8688 DEBUG_PARSE_MSG((funcname)); \
8689 Perl_re_printf( aTHX_ fmt "\n",args); \
8692 /* This section of code defines the inversion list object and its methods. The
8693 * interfaces are highly subject to change, so as much as possible is static to
8694 * this file. An inversion list is here implemented as a malloc'd C UV array
8695 * as an SVt_INVLIST scalar.
8697 * An inversion list for Unicode is an array of code points, sorted by ordinal
8698 * number. Each element gives the code point that begins a range that extends
8699 * up-to but not including the code point given by the next element. The final
8700 * element gives the first code point of a range that extends to the platform's
8701 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
8702 * ...) give ranges whose code points are all in the inversion list. We say
8703 * that those ranges are in the set. The odd-numbered elements give ranges
8704 * whose code points are not in the inversion list, and hence not in the set.
8705 * Thus, element [0] is the first code point in the list. Element [1]
8706 * is the first code point beyond that not in the list; and element [2] is the
8707 * first code point beyond that that is in the list. In other words, the first
8708 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8709 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
8710 * all code points in that range are not in the inversion list. The third
8711 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8712 * list, and so forth. Thus every element whose index is divisible by two
8713 * gives the beginning of a range that is in the list, and every element whose
8714 * index is not divisible by two gives the beginning of a range not in the
8715 * list. If the final element's index is divisible by two, the inversion list
8716 * extends to the platform's infinity; otherwise the highest code point in the
8717 * inversion list is the contents of that element minus 1.
8719 * A range that contains just a single code point N will look like
8721 * invlist[i+1] == N+1
8723 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8724 * impossible to represent, so element [i+1] is omitted. The single element
8726 * invlist[0] == UV_MAX
8727 * contains just UV_MAX, but is interpreted as matching to infinity.
8729 * Taking the complement (inverting) an inversion list is quite simple, if the
8730 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8731 * This implementation reserves an element at the beginning of each inversion
8732 * list to always contain 0; there is an additional flag in the header which
8733 * indicates if the list begins at the 0, or is offset to begin at the next
8734 * element. This means that the inversion list can be inverted without any
8735 * copying; just flip the flag.
8737 * More about inversion lists can be found in "Unicode Demystified"
8738 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8740 * The inversion list data structure is currently implemented as an SV pointing
8741 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8742 * array of UV whose memory management is automatically handled by the existing
8743 * facilities for SV's.
8745 * Some of the methods should always be private to the implementation, and some
8746 * should eventually be made public */
8748 /* The header definitions are in F<invlist_inline.h> */
8750 #ifndef PERL_IN_XSUB_RE
8752 PERL_STATIC_INLINE UV*
8753 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8755 /* Returns a pointer to the first element in the inversion list's array.
8756 * This is called upon initialization of an inversion list. Where the
8757 * array begins depends on whether the list has the code point U+0000 in it
8758 * or not. The other parameter tells it whether the code that follows this
8759 * call is about to put a 0 in the inversion list or not. The first
8760 * element is either the element reserved for 0, if TRUE, or the element
8761 * after it, if FALSE */
8763 bool* offset = get_invlist_offset_addr(invlist);
8764 UV* zero_addr = (UV *) SvPVX(invlist);
8766 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8769 assert(! _invlist_len(invlist));
8773 /* 1^1 = 0; 1^0 = 1 */
8774 *offset = 1 ^ will_have_0;
8775 return zero_addr + *offset;
8778 PERL_STATIC_INLINE void
8779 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8781 /* Sets the current number of elements stored in the inversion list.
8782 * Updates SvCUR correspondingly */
8783 PERL_UNUSED_CONTEXT;
8784 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8786 assert(is_invlist(invlist));
8791 : TO_INTERNAL_SIZE(len + offset));
8792 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8796 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
8798 /* Replaces the inversion list in 'dest' with the one from 'src'. It
8799 * steals the list from 'src', so 'src' is made to have a NULL list. This
8800 * is similar to what SvSetMagicSV() would do, if it were implemented on
8801 * inversion lists, though this routine avoids a copy */
8803 const UV src_len = _invlist_len(src);
8804 const bool src_offset = *get_invlist_offset_addr(src);
8805 const STRLEN src_byte_len = SvLEN(src);
8806 char * array = SvPVX(src);
8808 const int oldtainted = TAINT_get;
8810 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
8812 assert(is_invlist(src));
8813 assert(is_invlist(dest));
8814 assert(! invlist_is_iterating(src));
8815 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
8817 /* Make sure it ends in the right place with a NUL, as our inversion list
8818 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
8820 array[src_byte_len - 1] = '\0';
8822 TAINT_NOT; /* Otherwise it breaks */
8823 sv_usepvn_flags(dest,
8827 /* This flag is documented to cause a copy to be avoided */
8828 SV_HAS_TRAILING_NUL);
8829 TAINT_set(oldtainted);
8834 /* Finish up copying over the other fields in an inversion list */
8835 *get_invlist_offset_addr(dest) = src_offset;
8836 invlist_set_len(dest, src_len, src_offset);
8837 *get_invlist_previous_index_addr(dest) = 0;
8838 invlist_iterfinish(dest);
8841 PERL_STATIC_INLINE IV*
8842 S_get_invlist_previous_index_addr(SV* invlist)
8844 /* Return the address of the IV that is reserved to hold the cached index
8846 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8848 assert(is_invlist(invlist));
8850 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8853 PERL_STATIC_INLINE IV
8854 S_invlist_previous_index(SV* const invlist)
8856 /* Returns cached index of previous search */
8858 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8860 return *get_invlist_previous_index_addr(invlist);
8863 PERL_STATIC_INLINE void
8864 S_invlist_set_previous_index(SV* const invlist, const IV index)
8866 /* Caches <index> for later retrieval */
8868 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8870 assert(index == 0 || index < (int) _invlist_len(invlist));
8872 *get_invlist_previous_index_addr(invlist) = index;
8875 PERL_STATIC_INLINE void
8876 S_invlist_trim(SV* invlist)
8878 /* Free the not currently-being-used space in an inversion list */
8880 /* But don't free up the space needed for the 0 UV that is always at the
8881 * beginning of the list, nor the trailing NUL */
8882 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
8884 PERL_ARGS_ASSERT_INVLIST_TRIM;
8886 assert(is_invlist(invlist));
8888 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
8891 PERL_STATIC_INLINE void
8892 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
8894 PERL_ARGS_ASSERT_INVLIST_CLEAR;
8896 assert(is_invlist(invlist));
8898 invlist_set_len(invlist, 0, 0);
8899 invlist_trim(invlist);
8902 #endif /* ifndef PERL_IN_XSUB_RE */
8904 PERL_STATIC_INLINE bool
8905 S_invlist_is_iterating(SV* const invlist)
8907 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8909 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8912 #ifndef PERL_IN_XSUB_RE
8914 PERL_STATIC_INLINE UV
8915 S_invlist_max(SV* const invlist)
8917 /* Returns the maximum number of elements storable in the inversion list's
8918 * array, without having to realloc() */
8920 PERL_ARGS_ASSERT_INVLIST_MAX;
8922 assert(is_invlist(invlist));
8924 /* Assumes worst case, in which the 0 element is not counted in the
8925 * inversion list, so subtracts 1 for that */
8926 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8927 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8928 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8932 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
8934 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
8936 /* First 1 is in case the zero element isn't in the list; second 1 is for
8938 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8939 invlist_set_len(invlist, 0, 0);
8941 /* Force iterinit() to be used to get iteration to work */
8942 invlist_iterfinish(invlist);
8944 *get_invlist_previous_index_addr(invlist) = 0;
8948 Perl__new_invlist(pTHX_ IV initial_size)
8951 /* Return a pointer to a newly constructed inversion list, with enough
8952 * space to store 'initial_size' elements. If that number is negative, a
8953 * system default is used instead */
8957 if (initial_size < 0) {
8961 /* Allocate the initial space */
8962 new_list = newSV_type(SVt_INVLIST);
8964 initialize_invlist_guts(new_list, initial_size);
8970 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8972 /* Return a pointer to a newly constructed inversion list, initialized to
8973 * point to <list>, which has to be in the exact correct inversion list
8974 * form, including internal fields. Thus this is a dangerous routine that
8975 * should not be used in the wrong hands. The passed in 'list' contains
8976 * several header fields at the beginning that are not part of the
8977 * inversion list body proper */
8979 const STRLEN length = (STRLEN) list[0];
8980 const UV version_id = list[1];
8981 const bool offset = cBOOL(list[2]);
8982 #define HEADER_LENGTH 3
8983 /* If any of the above changes in any way, you must change HEADER_LENGTH
8984 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8985 * perl -E 'say int(rand 2**31-1)'
8987 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8988 data structure type, so that one being
8989 passed in can be validated to be an
8990 inversion list of the correct vintage.
8993 SV* invlist = newSV_type(SVt_INVLIST);
8995 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8997 if (version_id != INVLIST_VERSION_ID) {
8998 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9001 /* The generated array passed in includes header elements that aren't part
9002 * of the list proper, so start it just after them */
9003 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9005 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9006 shouldn't touch it */
9008 *(get_invlist_offset_addr(invlist)) = offset;
9010 /* The 'length' passed to us is the physical number of elements in the
9011 * inversion list. But if there is an offset the logical number is one
9013 invlist_set_len(invlist, length - offset, offset);
9015 invlist_set_previous_index(invlist, 0);
9017 /* Initialize the iteration pointer. */
9018 invlist_iterfinish(invlist);
9020 SvREADONLY_on(invlist);
9026 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
9028 /* Grow the maximum size of an inversion list */
9030 PERL_ARGS_ASSERT_INVLIST_EXTEND;
9032 assert(is_invlist(invlist));
9034 /* Add one to account for the zero element at the beginning which may not
9035 * be counted by the calling parameters */
9036 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
9040 S__append_range_to_invlist(pTHX_ SV* const invlist,
9041 const UV start, const UV end)
9043 /* Subject to change or removal. Append the range from 'start' to 'end' at
9044 * the end of the inversion list. The range must be above any existing
9048 UV max = invlist_max(invlist);
9049 UV len = _invlist_len(invlist);
9052 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9054 if (len == 0) { /* Empty lists must be initialized */
9055 offset = start != 0;
9056 array = _invlist_array_init(invlist, ! offset);
9059 /* Here, the existing list is non-empty. The current max entry in the
9060 * list is generally the first value not in the set, except when the
9061 * set extends to the end of permissible values, in which case it is
9062 * the first entry in that final set, and so this call is an attempt to
9063 * append out-of-order */
9065 UV final_element = len - 1;
9066 array = invlist_array(invlist);
9067 if ( array[final_element] > start
9068 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9070 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9071 array[final_element], start,
9072 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9075 /* Here, it is a legal append. If the new range begins 1 above the end
9076 * of the range below it, it is extending the range below it, so the
9077 * new first value not in the set is one greater than the newly
9078 * extended range. */
9079 offset = *get_invlist_offset_addr(invlist);
9080 if (array[final_element] == start) {
9081 if (end != UV_MAX) {
9082 array[final_element] = end + 1;
9085 /* But if the end is the maximum representable on the machine,
9086 * assume that infinity was actually what was meant. Just let
9087 * the range that this would extend to have no end */
9088 invlist_set_len(invlist, len - 1, offset);
9094 /* Here the new range doesn't extend any existing set. Add it */
9096 len += 2; /* Includes an element each for the start and end of range */
9098 /* If wll overflow the existing space, extend, which may cause the array to
9101 invlist_extend(invlist, len);
9103 /* Have to set len here to avoid assert failure in invlist_array() */
9104 invlist_set_len(invlist, len, offset);
9106 array = invlist_array(invlist);
9109 invlist_set_len(invlist, len, offset);
9112 /* The next item on the list starts the range, the one after that is
9113 * one past the new range. */
9114 array[len - 2] = start;
9115 if (end != UV_MAX) {
9116 array[len - 1] = end + 1;
9119 /* But if the end is the maximum representable on the machine, just let
9120 * the range have no end */
9121 invlist_set_len(invlist, len - 1, offset);
9126 Perl__invlist_search(SV* const invlist, const UV cp)
9128 /* Searches the inversion list for the entry that contains the input code
9129 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9130 * return value is the index into the list's array of the range that
9131 * contains <cp>, that is, 'i' such that
9132 * array[i] <= cp < array[i+1]
9137 IV high = _invlist_len(invlist);
9138 const IV highest_element = high - 1;
9141 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9143 /* If list is empty, return failure. */
9148 /* (We can't get the array unless we know the list is non-empty) */
9149 array = invlist_array(invlist);
9151 mid = invlist_previous_index(invlist);
9153 if (mid > highest_element) {
9154 mid = highest_element;
9157 /* <mid> contains the cache of the result of the previous call to this
9158 * function (0 the first time). See if this call is for the same result,
9159 * or if it is for mid-1. This is under the theory that calls to this
9160 * function will often be for related code points that are near each other.
9161 * And benchmarks show that caching gives better results. We also test
9162 * here if the code point is within the bounds of the list. These tests
9163 * replace others that would have had to be made anyway to make sure that
9164 * the array bounds were not exceeded, and these give us extra information
9165 * at the same time */
9166 if (cp >= array[mid]) {
9167 if (cp >= array[highest_element]) {
9168 return highest_element;
9171 /* Here, array[mid] <= cp < array[highest_element]. This means that
9172 * the final element is not the answer, so can exclude it; it also
9173 * means that <mid> is not the final element, so can refer to 'mid + 1'
9175 if (cp < array[mid + 1]) {
9181 else { /* cp < aray[mid] */
9182 if (cp < array[0]) { /* Fail if outside the array */
9186 if (cp >= array[mid - 1]) {
9191 /* Binary search. What we are looking for is <i> such that
9192 * array[i] <= cp < array[i+1]
9193 * The loop below converges on the i+1. Note that there may not be an
9194 * (i+1)th element in the array, and things work nonetheless */
9195 while (low < high) {
9196 mid = (low + high) / 2;
9197 assert(mid <= highest_element);
9198 if (array[mid] <= cp) { /* cp >= array[mid] */
9201 /* We could do this extra test to exit the loop early.
9202 if (cp < array[low]) {
9207 else { /* cp < array[mid] */
9214 invlist_set_previous_index(invlist, high);
9219 Perl__invlist_populate_swatch(SV* const invlist,
9220 const UV start, const UV end, U8* swatch)
9222 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
9223 * but is used when the swash has an inversion list. This makes this much
9224 * faster, as it uses a binary search instead of a linear one. This is
9225 * intimately tied to that function, and perhaps should be in utf8.c,
9226 * except it is intimately tied to inversion lists as well. It assumes
9227 * that <swatch> is all 0's on input */
9230 const IV len = _invlist_len(invlist);
9234 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
9236 if (len == 0) { /* Empty inversion list */
9240 array = invlist_array(invlist);
9242 /* Find which element it is */
9243 i = _invlist_search(invlist, start);
9245 /* We populate from <start> to <end> */
9246 while (current < end) {
9249 /* The inversion list gives the results for every possible code point
9250 * after the first one in the list. Only those ranges whose index is
9251 * even are ones that the inversion list matches. For the odd ones,
9252 * and if the initial code point is not in the list, we have to skip
9253 * forward to the next element */
9254 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
9256 if (i >= len) { /* Finished if beyond the end of the array */
9260 if (current >= end) { /* Finished if beyond the end of what we
9262 if (LIKELY(end < UV_MAX)) {
9266 /* We get here when the upper bound is the maximum
9267 * representable on the machine, and we are looking for just
9268 * that code point. Have to special case it */
9270 goto join_end_of_list;
9273 assert(current >= start);
9275 /* The current range ends one below the next one, except don't go past
9278 upper = (i < len && array[i] < end) ? array[i] : end;
9280 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
9281 * for each code point in it */
9282 for (; current < upper; current++) {
9283 const STRLEN offset = (STRLEN)(current - start);
9284 swatch[offset >> 3] |= 1 << (offset & 7);
9289 /* Quit if at the end of the list */
9292 /* But first, have to deal with the highest possible code point on
9293 * the platform. The previous code assumes that <end> is one
9294 * beyond where we want to populate, but that is impossible at the
9295 * platform's infinity, so have to handle it specially */
9296 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
9298 const STRLEN offset = (STRLEN)(end - start);
9299 swatch[offset >> 3] |= 1 << (offset & 7);
9304 /* Advance to the next range, which will be for code points not in the
9313 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9314 const bool complement_b, SV** output)
9316 /* Take the union of two inversion lists and point '*output' to it. On
9317 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9318 * even 'a' or 'b'). If to an inversion list, the contents of the original
9319 * list will be replaced by the union. The first list, 'a', may be
9320 * NULL, in which case a copy of the second list is placed in '*output'.
9321 * If 'complement_b' is TRUE, the union is taken of the complement
9322 * (inversion) of 'b' instead of b itself.
9324 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9325 * Richard Gillam, published by Addison-Wesley, and explained at some
9326 * length there. The preface says to incorporate its examples into your
9327 * code at your own risk.
9329 * The algorithm is like a merge sort. */
9331 const UV* array_a; /* a's array */
9333 UV len_a; /* length of a's array */
9336 SV* u; /* the resulting union */
9340 UV i_a = 0; /* current index into a's array */
9344 /* running count, as explained in the algorithm source book; items are
9345 * stopped accumulating and are output when the count changes to/from 0.
9346 * The count is incremented when we start a range that's in an input's set,
9347 * and decremented when we start a range that's not in a set. So this
9348 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9349 * and hence nothing goes into the union; 1, just one of the inputs is in
9350 * its set (and its current range gets added to the union); and 2 when both
9351 * inputs are in their sets. */
9354 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9356 assert(*output == NULL || is_invlist(*output));
9358 len_b = _invlist_len(b);
9361 /* Here, 'b' is empty, hence it's complement is all possible code
9362 * points. So if the union includes the complement of 'b', it includes
9363 * everything, and we need not even look at 'a'. It's easiest to
9364 * create a new inversion list that matches everything. */
9366 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9368 if (*output == NULL) { /* If the output didn't exist, just point it
9370 *output = everything;
9372 else { /* Otherwise, replace its contents with the new list */
9373 invlist_replace_list_destroys_src(*output, everything);
9374 SvREFCNT_dec_NN(everything);
9380 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9381 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9382 * output will be empty */
9384 if (a == NULL || _invlist_len(a) == 0) {
9385 if (*output == NULL) {
9386 *output = _new_invlist(0);
9389 invlist_clear(*output);
9394 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9395 * union. We can just return a copy of 'a' if '*output' doesn't point
9396 * to an existing list */
9397 if (*output == NULL) {
9398 *output = invlist_clone(a, NULL);
9402 /* If the output is to overwrite 'a', we have a no-op, as it's
9408 /* Here, '*output' is to be overwritten by 'a' */
9409 u = invlist_clone(a, NULL);
9410 invlist_replace_list_destroys_src(*output, u);
9416 /* Here 'b' is not empty. See about 'a' */
9418 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9420 /* Here, 'a' is empty (and b is not). That means the union will come
9421 * entirely from 'b'. If '*output' is NULL, we can directly return a
9422 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9425 SV ** dest = (*output == NULL) ? output : &u;
9426 *dest = invlist_clone(b, NULL);
9428 _invlist_invert(*dest);
9432 invlist_replace_list_destroys_src(*output, u);
9439 /* Here both lists exist and are non-empty */
9440 array_a = invlist_array(a);
9441 array_b = invlist_array(b);
9443 /* If are to take the union of 'a' with the complement of b, set it
9444 * up so are looking at b's complement. */
9447 /* To complement, we invert: if the first element is 0, remove it. To
9448 * do this, we just pretend the array starts one later */
9449 if (array_b[0] == 0) {
9455 /* But if the first element is not zero, we pretend the list starts
9456 * at the 0 that is always stored immediately before the array. */
9462 /* Size the union for the worst case: that the sets are completely
9464 u = _new_invlist(len_a + len_b);
9466 /* Will contain U+0000 if either component does */
9467 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9468 || (len_b > 0 && array_b[0] == 0));
9470 /* Go through each input list item by item, stopping when have exhausted
9472 while (i_a < len_a && i_b < len_b) {
9473 UV cp; /* The element to potentially add to the union's array */
9474 bool cp_in_set; /* is it in the the input list's set or not */
9476 /* We need to take one or the other of the two inputs for the union.
9477 * Since we are merging two sorted lists, we take the smaller of the
9478 * next items. In case of a tie, we take first the one that is in its
9479 * set. If we first took the one not in its set, it would decrement
9480 * the count, possibly to 0 which would cause it to be output as ending
9481 * the range, and the next time through we would take the same number,
9482 * and output it again as beginning the next range. By doing it the
9483 * opposite way, there is no possibility that the count will be
9484 * momentarily decremented to 0, and thus the two adjoining ranges will
9485 * be seamlessly merged. (In a tie and both are in the set or both not
9486 * in the set, it doesn't matter which we take first.) */
9487 if ( array_a[i_a] < array_b[i_b]
9488 || ( array_a[i_a] == array_b[i_b]
9489 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9491 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9492 cp = array_a[i_a++];
9495 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9496 cp = array_b[i_b++];
9499 /* Here, have chosen which of the two inputs to look at. Only output
9500 * if the running count changes to/from 0, which marks the
9501 * beginning/end of a range that's in the set */
9504 array_u[i_u++] = cp;
9511 array_u[i_u++] = cp;
9517 /* The loop above increments the index into exactly one of the input lists
9518 * each iteration, and ends when either index gets to its list end. That
9519 * means the other index is lower than its end, and so something is
9520 * remaining in that one. We decrement 'count', as explained below, if
9521 * that list is in its set. (i_a and i_b each currently index the element
9522 * beyond the one we care about.) */
9523 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9524 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9529 /* Above we decremented 'count' if the list that had unexamined elements in
9530 * it was in its set. This has made it so that 'count' being non-zero
9531 * means there isn't anything left to output; and 'count' equal to 0 means
9532 * that what is left to output is precisely that which is left in the
9533 * non-exhausted input list.
9535 * To see why, note first that the exhausted input obviously has nothing
9536 * left to add to the union. If it was in its set at its end, that means
9537 * the set extends from here to the platform's infinity, and hence so does
9538 * the union and the non-exhausted set is irrelevant. The exhausted set
9539 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9540 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9541 * 'count' remains at 1. This is consistent with the decremented 'count'
9542 * != 0 meaning there's nothing left to add to the union.
9544 * But if the exhausted input wasn't in its set, it contributed 0 to
9545 * 'count', and the rest of the union will be whatever the other input is.
9546 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9547 * otherwise it gets decremented to 0. This is consistent with 'count'
9548 * == 0 meaning the remainder of the union is whatever is left in the
9549 * non-exhausted list. */
9554 IV copy_count = len_a - i_a;
9555 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9556 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9558 else { /* The non-exhausted input is b */
9559 copy_count = len_b - i_b;
9560 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9562 len_u = i_u + copy_count;
9565 /* Set the result to the final length, which can change the pointer to
9566 * array_u, so re-find it. (Note that it is unlikely that this will
9567 * change, as we are shrinking the space, not enlarging it) */
9568 if (len_u != _invlist_len(u)) {
9569 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9571 array_u = invlist_array(u);
9574 if (*output == NULL) { /* Simply return the new inversion list */
9578 /* Otherwise, overwrite the inversion list that was in '*output'. We
9579 * could instead free '*output', and then set it to 'u', but experience
9580 * has shown [perl #127392] that if the input is a mortal, we can get a
9581 * huge build-up of these during regex compilation before they get
9583 invlist_replace_list_destroys_src(*output, u);
9591 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9592 const bool complement_b, SV** i)
9594 /* Take the intersection of two inversion lists and point '*i' to it. On
9595 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9596 * even 'a' or 'b'). If to an inversion list, the contents of the original
9597 * list will be replaced by the intersection. The first list, 'a', may be
9598 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9599 * TRUE, the result will be the intersection of 'a' and the complement (or
9600 * inversion) of 'b' instead of 'b' directly.
9602 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9603 * Richard Gillam, published by Addison-Wesley, and explained at some
9604 * length there. The preface says to incorporate its examples into your
9605 * code at your own risk. In fact, it had bugs
9607 * The algorithm is like a merge sort, and is essentially the same as the
9611 const UV* array_a; /* a's array */
9613 UV len_a; /* length of a's array */
9616 SV* r; /* the resulting intersection */
9620 UV i_a = 0; /* current index into a's array */
9624 /* running count of how many of the two inputs are postitioned at ranges
9625 * that are in their sets. As explained in the algorithm source book,
9626 * items are stopped accumulating and are output when the count changes
9627 * to/from 2. The count is incremented when we start a range that's in an
9628 * input's set, and decremented when we start a range that's not in a set.
9629 * Only when it is 2 are we in the intersection. */
9632 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9634 assert(*i == NULL || is_invlist(*i));
9636 /* Special case if either one is empty */
9637 len_a = (a == NULL) ? 0 : _invlist_len(a);
9638 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9639 if (len_a != 0 && complement_b) {
9641 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9642 * must be empty. Here, also we are using 'b's complement, which
9643 * hence must be every possible code point. Thus the intersection
9646 if (*i == a) { /* No-op */
9651 *i = invlist_clone(a, NULL);
9655 r = invlist_clone(a, NULL);
9656 invlist_replace_list_destroys_src(*i, r);
9661 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9662 * intersection must be empty */
9664 *i = _new_invlist(0);
9672 /* Here both lists exist and are non-empty */
9673 array_a = invlist_array(a);
9674 array_b = invlist_array(b);
9676 /* If are to take the intersection of 'a' with the complement of b, set it
9677 * up so are looking at b's complement. */
9680 /* To complement, we invert: if the first element is 0, remove it. To
9681 * do this, we just pretend the array starts one later */
9682 if (array_b[0] == 0) {
9688 /* But if the first element is not zero, we pretend the list starts
9689 * at the 0 that is always stored immediately before the array. */
9695 /* Size the intersection for the worst case: that the intersection ends up
9696 * fragmenting everything to be completely disjoint */
9697 r= _new_invlist(len_a + len_b);
9699 /* Will contain U+0000 iff both components do */
9700 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9701 && len_b > 0 && array_b[0] == 0);
9703 /* Go through each list item by item, stopping when have exhausted one of
9705 while (i_a < len_a && i_b < len_b) {
9706 UV cp; /* The element to potentially add to the intersection's
9708 bool cp_in_set; /* Is it in the input list's set or not */
9710 /* We need to take one or the other of the two inputs for the
9711 * intersection. Since we are merging two sorted lists, we take the
9712 * smaller of the next items. In case of a tie, we take first the one
9713 * that is not in its set (a difference from the union algorithm). If
9714 * we first took the one in its set, it would increment the count,
9715 * possibly to 2 which would cause it to be output as starting a range
9716 * in the intersection, and the next time through we would take that
9717 * same number, and output it again as ending the set. By doing the
9718 * opposite of this, there is no possibility that the count will be
9719 * momentarily incremented to 2. (In a tie and both are in the set or
9720 * both not in the set, it doesn't matter which we take first.) */
9721 if ( array_a[i_a] < array_b[i_b]
9722 || ( array_a[i_a] == array_b[i_b]
9723 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9725 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9726 cp = array_a[i_a++];
9729 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9733 /* Here, have chosen which of the two inputs to look at. Only output
9734 * if the running count changes to/from 2, which marks the
9735 * beginning/end of a range that's in the intersection */
9739 array_r[i_r++] = cp;
9744 array_r[i_r++] = cp;
9751 /* The loop above increments the index into exactly one of the input lists
9752 * each iteration, and ends when either index gets to its list end. That
9753 * means the other index is lower than its end, and so something is
9754 * remaining in that one. We increment 'count', as explained below, if the
9755 * exhausted list was in its set. (i_a and i_b each currently index the
9756 * element beyond the one we care about.) */
9757 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9758 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9763 /* Above we incremented 'count' if the exhausted list was in its set. This
9764 * has made it so that 'count' being below 2 means there is nothing left to
9765 * output; otheriwse what's left to add to the intersection is precisely
9766 * that which is left in the non-exhausted input list.
9768 * To see why, note first that the exhausted input obviously has nothing
9769 * left to affect the intersection. If it was in its set at its end, that
9770 * means the set extends from here to the platform's infinity, and hence
9771 * anything in the non-exhausted's list will be in the intersection, and
9772 * anything not in it won't be. Hence, the rest of the intersection is
9773 * precisely what's in the non-exhausted list The exhausted set also
9774 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
9775 * it means 'count' is now at least 2. This is consistent with the
9776 * incremented 'count' being >= 2 means to add the non-exhausted list to
9779 * But if the exhausted input wasn't in its set, it contributed 0 to
9780 * 'count', and the intersection can't include anything further; the
9781 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
9782 * incremented. This is consistent with 'count' being < 2 meaning nothing
9783 * further to add to the intersection. */
9784 if (count < 2) { /* Nothing left to put in the intersection. */
9787 else { /* copy the non-exhausted list, unchanged. */
9788 IV copy_count = len_a - i_a;
9789 if (copy_count > 0) { /* a is the one with stuff left */
9790 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9792 else { /* b is the one with stuff left */
9793 copy_count = len_b - i_b;
9794 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9796 len_r = i_r + copy_count;
9799 /* Set the result to the final length, which can change the pointer to
9800 * array_r, so re-find it. (Note that it is unlikely that this will
9801 * change, as we are shrinking the space, not enlarging it) */
9802 if (len_r != _invlist_len(r)) {
9803 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9805 array_r = invlist_array(r);
9808 if (*i == NULL) { /* Simply return the calculated intersection */
9811 else { /* Otherwise, replace the existing inversion list in '*i'. We could
9812 instead free '*i', and then set it to 'r', but experience has
9813 shown [perl #127392] that if the input is a mortal, we can get a
9814 huge build-up of these during regex compilation before they get
9817 invlist_replace_list_destroys_src(*i, r);
9829 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9831 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9832 * set. A pointer to the inversion list is returned. This may actually be
9833 * a new list, in which case the passed in one has been destroyed. The
9834 * passed-in inversion list can be NULL, in which case a new one is created
9835 * with just the one range in it. The new list is not necessarily
9836 * NUL-terminated. Space is not freed if the inversion list shrinks as a
9837 * result of this function. The gain would not be large, and in many
9838 * cases, this is called multiple times on a single inversion list, so
9839 * anything freed may almost immediately be needed again.
9841 * This used to mostly call the 'union' routine, but that is much more
9842 * heavyweight than really needed for a single range addition */
9844 UV* array; /* The array implementing the inversion list */
9845 UV len; /* How many elements in 'array' */
9846 SSize_t i_s; /* index into the invlist array where 'start'
9848 SSize_t i_e = 0; /* And the index where 'end' should go */
9849 UV cur_highest; /* The highest code point in the inversion list
9850 upon entry to this function */
9852 /* This range becomes the whole inversion list if none already existed */
9853 if (invlist == NULL) {
9854 invlist = _new_invlist(2);
9855 _append_range_to_invlist(invlist, start, end);
9859 /* Likewise, if the inversion list is currently empty */
9860 len = _invlist_len(invlist);
9862 _append_range_to_invlist(invlist, start, end);
9866 /* Starting here, we have to know the internals of the list */
9867 array = invlist_array(invlist);
9869 /* If the new range ends higher than the current highest ... */
9870 cur_highest = invlist_highest(invlist);
9871 if (end > cur_highest) {
9873 /* If the whole range is higher, we can just append it */
9874 if (start > cur_highest) {
9875 _append_range_to_invlist(invlist, start, end);
9879 /* Otherwise, add the portion that is higher ... */
9880 _append_range_to_invlist(invlist, cur_highest + 1, end);
9882 /* ... and continue on below to handle the rest. As a result of the
9883 * above append, we know that the index of the end of the range is the
9884 * final even numbered one of the array. Recall that the final element
9885 * always starts a range that extends to infinity. If that range is in
9886 * the set (meaning the set goes from here to infinity), it will be an
9887 * even index, but if it isn't in the set, it's odd, and the final
9888 * range in the set is one less, which is even. */
9889 if (end == UV_MAX) {
9897 /* We have dealt with appending, now see about prepending. If the new
9898 * range starts lower than the current lowest ... */
9899 if (start < array[0]) {
9901 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
9902 * Let the union code handle it, rather than having to know the
9903 * trickiness in two code places. */
9904 if (UNLIKELY(start == 0)) {
9907 range_invlist = _new_invlist(2);
9908 _append_range_to_invlist(range_invlist, start, end);
9910 _invlist_union(invlist, range_invlist, &invlist);
9912 SvREFCNT_dec_NN(range_invlist);
9917 /* If the whole new range comes before the first entry, and doesn't
9918 * extend it, we have to insert it as an additional range */
9919 if (end < array[0] - 1) {
9921 goto splice_in_new_range;
9924 /* Here the new range adjoins the existing first range, extending it
9928 /* And continue on below to handle the rest. We know that the index of
9929 * the beginning of the range is the first one of the array */
9932 else { /* Not prepending any part of the new range to the existing list.
9933 * Find where in the list it should go. This finds i_s, such that:
9934 * invlist[i_s] <= start < array[i_s+1]
9936 i_s = _invlist_search(invlist, start);
9939 /* At this point, any extending before the beginning of the inversion list
9940 * and/or after the end has been done. This has made it so that, in the
9941 * code below, each endpoint of the new range is either in a range that is
9942 * in the set, or is in a gap between two ranges that are. This means we
9943 * don't have to worry about exceeding the array bounds.
9945 * Find where in the list the new range ends (but we can skip this if we
9946 * have already determined what it is, or if it will be the same as i_s,
9947 * which we already have computed) */
9949 i_e = (start == end)
9951 : _invlist_search(invlist, end);
9954 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
9955 * is a range that goes to infinity there is no element at invlist[i_e+1],
9956 * so only the first relation holds. */
9958 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
9960 /* Here, the ranges on either side of the beginning of the new range
9961 * are in the set, and this range starts in the gap between them.
9963 * The new range extends the range above it downwards if the new range
9964 * ends at or above that range's start */
9965 const bool extends_the_range_above = ( end == UV_MAX
9966 || end + 1 >= array[i_s+1]);
9968 /* The new range extends the range below it upwards if it begins just
9969 * after where that range ends */
9970 if (start == array[i_s]) {
9972 /* If the new range fills the entire gap between the other ranges,
9973 * they will get merged together. Other ranges may also get
9974 * merged, depending on how many of them the new range spans. In
9975 * the general case, we do the merge later, just once, after we
9976 * figure out how many to merge. But in the case where the new
9977 * range exactly spans just this one gap (possibly extending into
9978 * the one above), we do the merge here, and an early exit. This
9979 * is done here to avoid having to special case later. */
9980 if (i_e - i_s <= 1) {
9982 /* If i_e - i_s == 1, it means that the new range terminates
9983 * within the range above, and hence 'extends_the_range_above'
9984 * must be true. (If the range above it extends to infinity,
9985 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
9986 * will be 0, so no harm done.) */
9987 if (extends_the_range_above) {
9988 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
9989 invlist_set_len(invlist,
9991 *(get_invlist_offset_addr(invlist)));
9995 /* Here, i_e must == i_s. We keep them in sync, as they apply
9996 * to the same range, and below we are about to decrement i_s
10001 /* Here, the new range is adjacent to the one below. (It may also
10002 * span beyond the range above, but that will get resolved later.)
10003 * Extend the range below to include this one. */
10004 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10006 start = array[i_s];
10008 else if (extends_the_range_above) {
10010 /* Here the new range only extends the range above it, but not the
10011 * one below. It merges with the one above. Again, we keep i_e
10012 * and i_s in sync if they point to the same range */
10017 array[i_s] = start;
10021 /* Here, we've dealt with the new range start extending any adjoining
10024 * If the new range extends to infinity, it is now the final one,
10025 * regardless of what was there before */
10026 if (UNLIKELY(end == UV_MAX)) {
10027 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10031 /* If i_e started as == i_s, it has also been dealt with,
10032 * and been updated to the new i_s, which will fail the following if */
10033 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10035 /* Here, the ranges on either side of the end of the new range are in
10036 * the set, and this range ends in the gap between them.
10038 * If this range is adjacent to (hence extends) the range above it, it
10039 * becomes part of that range; likewise if it extends the range below,
10040 * it becomes part of that range */
10041 if (end + 1 == array[i_e+1]) {
10043 array[i_e] = start;
10045 else if (start <= array[i_e]) {
10046 array[i_e] = end + 1;
10053 /* If the range fits entirely in an existing range (as possibly already
10054 * extended above), it doesn't add anything new */
10055 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10059 /* Here, no part of the range is in the list. Must add it. It will
10060 * occupy 2 more slots */
10061 splice_in_new_range:
10063 invlist_extend(invlist, len + 2);
10064 array = invlist_array(invlist);
10065 /* Move the rest of the array down two slots. Don't include any
10067 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10069 /* Do the actual splice */
10070 array[i_e+1] = start;
10071 array[i_e+2] = end + 1;
10072 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10076 /* Here the new range crossed the boundaries of a pre-existing range. The
10077 * code above has adjusted things so that both ends are in ranges that are
10078 * in the set. This means everything in between must also be in the set.
10079 * Just squash things together */
10080 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10081 invlist_set_len(invlist,
10083 *(get_invlist_offset_addr(invlist)));
10089 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10090 UV** other_elements_ptr)
10092 /* Create and return an inversion list whose contents are to be populated
10093 * by the caller. The caller gives the number of elements (in 'size') and
10094 * the very first element ('element0'). This function will set
10095 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10096 * are to be placed.
10098 * Obviously there is some trust involved that the caller will properly
10099 * fill in the other elements of the array.
10101 * (The first element needs to be passed in, as the underlying code does
10102 * things differently depending on whether it is zero or non-zero) */
10104 SV* invlist = _new_invlist(size);
10107 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10109 invlist = add_cp_to_invlist(invlist, element0);
10110 offset = *get_invlist_offset_addr(invlist);
10112 invlist_set_len(invlist, size, offset);
10113 *other_elements_ptr = invlist_array(invlist) + 1;
10119 PERL_STATIC_INLINE SV*
10120 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
10121 return _add_range_to_invlist(invlist, cp, cp);
10124 #ifndef PERL_IN_XSUB_RE
10126 Perl__invlist_invert(pTHX_ SV* const invlist)
10128 /* Complement the input inversion list. This adds a 0 if the list didn't
10129 * have a zero; removes it otherwise. As described above, the data
10130 * structure is set up so that this is very efficient */
10132 PERL_ARGS_ASSERT__INVLIST_INVERT;
10134 assert(! invlist_is_iterating(invlist));
10136 /* The inverse of matching nothing is matching everything */
10137 if (_invlist_len(invlist) == 0) {
10138 _append_range_to_invlist(invlist, 0, UV_MAX);
10142 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10146 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10149 /* Return a new inversion list that is a copy of the input one, which is
10150 * unchanged. The new list will not be mortal even if the old one was. */
10152 const STRLEN nominal_length = _invlist_len(invlist); /* Why not +1 XXX */
10153 const STRLEN physical_length = SvCUR(invlist);
10154 const bool offset = *(get_invlist_offset_addr(invlist));
10156 PERL_ARGS_ASSERT_INVLIST_CLONE;
10158 /* Need to allocate extra space to accommodate Perl's addition of a
10159 * trailing NUL to SvPV's, since it thinks they are always strings */
10160 if (new_invlist == NULL) {
10161 new_invlist = _new_invlist(nominal_length);
10164 sv_upgrade(new_invlist, SVt_INVLIST);
10165 initialize_invlist_guts(new_invlist, nominal_length);
10168 *(get_invlist_offset_addr(new_invlist)) = offset;
10169 invlist_set_len(new_invlist, nominal_length, offset);
10170 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10172 return new_invlist;
10177 PERL_STATIC_INLINE STRLEN*
10178 S_get_invlist_iter_addr(SV* invlist)
10180 /* Return the address of the UV that contains the current iteration
10183 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
10185 assert(is_invlist(invlist));
10187 return &(((XINVLIST*) SvANY(invlist))->iterator);
10190 PERL_STATIC_INLINE void
10191 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
10193 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
10195 *get_invlist_iter_addr(invlist) = 0;
10198 PERL_STATIC_INLINE void
10199 S_invlist_iterfinish(SV* invlist)
10201 /* Terminate iterator for invlist. This is to catch development errors.
10202 * Any iteration that is interrupted before completed should call this
10203 * function. Functions that add code points anywhere else but to the end
10204 * of an inversion list assert that they are not in the middle of an
10205 * iteration. If they were, the addition would make the iteration
10206 * problematical: if the iteration hadn't reached the place where things
10207 * were being added, it would be ok */
10209 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
10211 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
10215 S_invlist_iternext(SV* invlist, UV* start, UV* end)
10217 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
10218 * This call sets in <*start> and <*end>, the next range in <invlist>.
10219 * Returns <TRUE> if successful and the next call will return the next
10220 * range; <FALSE> if was already at the end of the list. If the latter,
10221 * <*start> and <*end> are unchanged, and the next call to this function
10222 * will start over at the beginning of the list */
10224 STRLEN* pos = get_invlist_iter_addr(invlist);
10225 UV len = _invlist_len(invlist);
10228 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
10231 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
10235 array = invlist_array(invlist);
10237 *start = array[(*pos)++];
10243 *end = array[(*pos)++] - 1;
10249 PERL_STATIC_INLINE UV
10250 S_invlist_highest(SV* const invlist)
10252 /* Returns the highest code point that matches an inversion list. This API
10253 * has an ambiguity, as it returns 0 under either the highest is actually
10254 * 0, or if the list is empty. If this distinction matters to you, check
10255 * for emptiness before calling this function */
10257 UV len = _invlist_len(invlist);
10260 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10266 array = invlist_array(invlist);
10268 /* The last element in the array in the inversion list always starts a
10269 * range that goes to infinity. That range may be for code points that are
10270 * matched in the inversion list, or it may be for ones that aren't
10271 * matched. In the latter case, the highest code point in the set is one
10272 * less than the beginning of this range; otherwise it is the final element
10273 * of this range: infinity */
10274 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10276 : array[len - 1] - 1;
10280 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10282 /* Get the contents of an inversion list into a string SV so that they can
10283 * be printed out. If 'traditional_style' is TRUE, it uses the format
10284 * traditionally done for debug tracing; otherwise it uses a format
10285 * suitable for just copying to the output, with blanks between ranges and
10286 * a dash between range components */
10290 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10291 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10293 if (traditional_style) {
10294 output = newSVpvs("\n");
10297 output = newSVpvs("");
10300 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10302 assert(! invlist_is_iterating(invlist));
10304 invlist_iterinit(invlist);
10305 while (invlist_iternext(invlist, &start, &end)) {
10306 if (end == UV_MAX) {
10307 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
10308 start, intra_range_delimiter,
10309 inter_range_delimiter);
10311 else if (end != start) {
10312 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10314 intra_range_delimiter,
10315 end, inter_range_delimiter);
10318 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10319 start, inter_range_delimiter);
10323 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10324 SvCUR_set(output, SvCUR(output) - 1);
10330 #ifndef PERL_IN_XSUB_RE
10332 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10333 const char * const indent, SV* const invlist)
10335 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10336 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10337 * the string 'indent'. The output looks like this:
10338 [0] 0x000A .. 0x000D
10340 [4] 0x2028 .. 0x2029
10341 [6] 0x3104 .. INFINITY
10342 * This means that the first range of code points matched by the list are
10343 * 0xA through 0xD; the second range contains only the single code point
10344 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10345 * are used to define each range (except if the final range extends to
10346 * infinity, only a single element is needed). The array index of the
10347 * first element for the corresponding range is given in brackets. */
10352 PERL_ARGS_ASSERT__INVLIST_DUMP;
10354 if (invlist_is_iterating(invlist)) {
10355 Perl_dump_indent(aTHX_ level, file,
10356 "%sCan't dump inversion list because is in middle of iterating\n",
10361 invlist_iterinit(invlist);
10362 while (invlist_iternext(invlist, &start, &end)) {
10363 if (end == UV_MAX) {
10364 Perl_dump_indent(aTHX_ level, file,
10365 "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
10366 indent, (UV)count, start);
10368 else if (end != start) {
10369 Perl_dump_indent(aTHX_ level, file,
10370 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10371 indent, (UV)count, start, end);
10374 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10375 indent, (UV)count, start);
10383 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10385 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10387 /* Return a boolean as to if the two passed in inversion lists are
10388 * identical. The final argument, if TRUE, says to take the complement of
10389 * the second inversion list before doing the comparison */
10391 const UV* array_a = invlist_array(a);
10392 const UV* array_b = invlist_array(b);
10393 UV len_a = _invlist_len(a);
10394 UV len_b = _invlist_len(b);
10396 PERL_ARGS_ASSERT__INVLISTEQ;
10398 /* If are to compare 'a' with the complement of b, set it
10399 * up so are looking at b's complement. */
10400 if (complement_b) {
10402 /* The complement of nothing is everything, so <a> would have to have
10403 * just one element, starting at zero (ending at infinity) */
10405 return (len_a == 1 && array_a[0] == 0);
10407 else if (array_b[0] == 0) {
10409 /* Otherwise, to complement, we invert. Here, the first element is
10410 * 0, just remove it. To do this, we just pretend the array starts
10418 /* But if the first element is not zero, we pretend the list starts
10419 * at the 0 that is always stored immediately before the array. */
10425 return len_a == len_b
10426 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10432 * As best we can, determine the characters that can match the start of
10433 * the given EXACTF-ish node.
10435 * Returns the invlist as a new SV*; it is the caller's responsibility to
10436 * call SvREFCNT_dec() when done with it.
10439 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10441 const U8 * s = (U8*)STRING(node);
10442 SSize_t bytelen = STR_LEN(node);
10444 /* Start out big enough for 2 separate code points */
10445 SV* invlist = _new_invlist(4);
10447 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10452 /* We punt and assume can match anything if the node begins
10453 * with a multi-character fold. Things are complicated. For
10454 * example, /ffi/i could match any of:
10455 * "\N{LATIN SMALL LIGATURE FFI}"
10456 * "\N{LATIN SMALL LIGATURE FF}I"
10457 * "F\N{LATIN SMALL LIGATURE FI}"
10458 * plus several other things; and making sure we have all the
10459 * possibilities is hard. */
10460 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10461 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10464 /* Any Latin1 range character can potentially match any
10465 * other depending on the locale */
10466 if (OP(node) == EXACTFL) {
10467 _invlist_union(invlist, PL_Latin1, &invlist);
10470 /* But otherwise, it matches at least itself. We can
10471 * quickly tell if it has a distinct fold, and if so,
10472 * it matches that as well */
10473 invlist = add_cp_to_invlist(invlist, uc);
10474 if (IS_IN_SOME_FOLD_L1(uc))
10475 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10478 /* Some characters match above-Latin1 ones under /i. This
10479 * is true of EXACTFL ones when the locale is UTF-8 */
10480 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10481 && (! isASCII(uc) || (OP(node) != EXACTFAA
10482 && OP(node) != EXACTFAA_NO_TRIE)))
10484 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10488 else { /* Pattern is UTF-8 */
10489 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10490 const U8* e = s + bytelen;
10493 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10495 /* The only code points that aren't folded in a UTF EXACTFish
10496 * node are are the problematic ones in EXACTFL nodes */
10497 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10498 /* We need to check for the possibility that this EXACTFL
10499 * node begins with a multi-char fold. Therefore we fold
10500 * the first few characters of it so that we can make that
10506 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10508 *(d++) = (U8) toFOLD(*s);
10509 if (fc < 0) { /* Save the first fold */
10516 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10517 if (fc < 0) { /* Save the first fold */
10525 /* And set up so the code below that looks in this folded
10526 * buffer instead of the node's string */
10531 /* When we reach here 's' points to the fold of the first
10532 * character(s) of the node; and 'e' points to far enough along
10533 * the folded string to be just past any possible multi-char
10536 * Unlike the non-UTF-8 case, the macro for determining if a
10537 * string is a multi-char fold requires all the characters to
10538 * already be folded. This is because of all the complications
10539 * if not. Note that they are folded anyway, except in EXACTFL
10540 * nodes. Like the non-UTF case above, we punt if the node
10541 * begins with a multi-char fold */
10543 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10544 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10546 else { /* Single char fold */
10548 unsigned int first_folds_to;
10549 const unsigned int * remaining_folds_to_list;
10550 Size_t folds_to_count;
10552 /* It matches itself */
10553 invlist = add_cp_to_invlist(invlist, fc);
10555 /* ... plus all the things that fold to it, which are found in
10556 * PL_utf8_foldclosures */
10557 folds_to_count = _inverse_folds(fc, &first_folds_to,
10558 &remaining_folds_to_list);
10559 for (k = 0; k < folds_to_count; k++) {
10560 UV c = (k == 0) ? first_folds_to : remaining_folds_to_list[k-1];
10562 /* /aa doesn't allow folds between ASCII and non- */
10563 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10564 && isASCII(c) != isASCII(fc))
10569 invlist = add_cp_to_invlist(invlist, c);
10577 #undef HEADER_LENGTH
10578 #undef TO_INTERNAL_SIZE
10579 #undef FROM_INTERNAL_SIZE
10580 #undef INVLIST_VERSION_ID
10582 /* End of inversion list object */
10585 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10587 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10588 * constructs, and updates RExC_flags with them. On input, RExC_parse
10589 * should point to the first flag; it is updated on output to point to the
10590 * final ')' or ':'. There needs to be at least one flag, or this will
10593 /* for (?g), (?gc), and (?o) warnings; warning
10594 about (?c) will warn about (?g) -- japhy */
10596 #define WASTED_O 0x01
10597 #define WASTED_G 0x02
10598 #define WASTED_C 0x04
10599 #define WASTED_GC (WASTED_G|WASTED_C)
10600 I32 wastedflags = 0x00;
10601 U32 posflags = 0, negflags = 0;
10602 U32 *flagsp = &posflags;
10603 char has_charset_modifier = '\0';
10605 bool has_use_defaults = FALSE;
10606 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10607 int x_mod_count = 0;
10609 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10611 /* '^' as an initial flag sets certain defaults */
10612 if (UCHARAT(RExC_parse) == '^') {
10614 has_use_defaults = TRUE;
10615 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10616 set_regex_charset(&RExC_flags, (RExC_uni_semantics)
10617 ? REGEX_UNICODE_CHARSET
10618 : REGEX_DEPENDS_CHARSET);
10621 cs = get_regex_charset(RExC_flags);
10622 if (cs == REGEX_DEPENDS_CHARSET
10623 && (RExC_uni_semantics))
10625 cs = REGEX_UNICODE_CHARSET;
10628 while (RExC_parse < RExC_end) {
10629 /* && strchr("iogcmsx", *RExC_parse) */
10630 /* (?g), (?gc) and (?o) are useless here
10631 and must be globally applied -- japhy */
10632 switch (*RExC_parse) {
10634 /* Code for the imsxn flags */
10635 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10637 case LOCALE_PAT_MOD:
10638 if (has_charset_modifier) {
10639 goto excess_modifier;
10641 else if (flagsp == &negflags) {
10644 cs = REGEX_LOCALE_CHARSET;
10645 has_charset_modifier = LOCALE_PAT_MOD;
10647 case UNICODE_PAT_MOD:
10648 if (has_charset_modifier) {
10649 goto excess_modifier;
10651 else if (flagsp == &negflags) {
10654 cs = REGEX_UNICODE_CHARSET;
10655 has_charset_modifier = UNICODE_PAT_MOD;
10657 case ASCII_RESTRICT_PAT_MOD:
10658 if (flagsp == &negflags) {
10661 if (has_charset_modifier) {
10662 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10663 goto excess_modifier;
10665 /* Doubled modifier implies more restricted */
10666 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10669 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10671 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10673 case DEPENDS_PAT_MOD:
10674 if (has_use_defaults) {
10675 goto fail_modifiers;
10677 else if (flagsp == &negflags) {
10680 else if (has_charset_modifier) {
10681 goto excess_modifier;
10684 /* The dual charset means unicode semantics if the
10685 * pattern (or target, not known until runtime) are
10686 * utf8, or something in the pattern indicates unicode
10688 cs = (RExC_uni_semantics)
10689 ? REGEX_UNICODE_CHARSET
10690 : REGEX_DEPENDS_CHARSET;
10691 has_charset_modifier = DEPENDS_PAT_MOD;
10695 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10696 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10698 else if (has_charset_modifier == *(RExC_parse - 1)) {
10699 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10700 *(RExC_parse - 1));
10703 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10705 NOT_REACHED; /*NOTREACHED*/
10708 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10709 *(RExC_parse - 1));
10710 NOT_REACHED; /*NOTREACHED*/
10711 case ONCE_PAT_MOD: /* 'o' */
10712 case GLOBAL_PAT_MOD: /* 'g' */
10713 if (ckWARN(WARN_REGEXP)) {
10714 const I32 wflagbit = *RExC_parse == 'o'
10717 if (! (wastedflags & wflagbit) ) {
10718 wastedflags |= wflagbit;
10719 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10722 "Useless (%s%c) - %suse /%c modifier",
10723 flagsp == &negflags ? "?-" : "?",
10725 flagsp == &negflags ? "don't " : "",
10732 case CONTINUE_PAT_MOD: /* 'c' */
10733 if (ckWARN(WARN_REGEXP)) {
10734 if (! (wastedflags & WASTED_C) ) {
10735 wastedflags |= WASTED_GC;
10736 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10739 "Useless (%sc) - %suse /gc modifier",
10740 flagsp == &negflags ? "?-" : "?",
10741 flagsp == &negflags ? "don't " : ""
10746 case KEEPCOPY_PAT_MOD: /* 'p' */
10747 if (flagsp == &negflags) {
10748 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10750 *flagsp |= RXf_PMf_KEEPCOPY;
10754 /* A flag is a default iff it is following a minus, so
10755 * if there is a minus, it means will be trying to
10756 * re-specify a default which is an error */
10757 if (has_use_defaults || flagsp == &negflags) {
10758 goto fail_modifiers;
10760 flagsp = &negflags;
10761 wastedflags = 0; /* reset so (?g-c) warns twice */
10767 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10768 negflags |= RXf_PMf_EXTENDED_MORE;
10770 RExC_flags |= posflags;
10772 if (negflags & RXf_PMf_EXTENDED) {
10773 negflags |= RXf_PMf_EXTENDED_MORE;
10775 RExC_flags &= ~negflags;
10776 set_regex_charset(&RExC_flags, cs);
10781 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10782 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10783 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10784 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10785 NOT_REACHED; /*NOTREACHED*/
10788 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10791 vFAIL("Sequence (?... not terminated");
10795 - reg - regular expression, i.e. main body or parenthesized thing
10797 * Caller must absorb opening parenthesis.
10799 * Combining parenthesis handling with the base level of regular expression
10800 * is a trifle forced, but the need to tie the tails of the branches to what
10801 * follows makes it hard to avoid.
10803 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10805 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10807 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10810 PERL_STATIC_INLINE regnode_offset
10811 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10813 char * parse_start,
10817 regnode_offset ret;
10818 char* name_start = RExC_parse;
10820 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
10821 GET_RE_DEBUG_FLAGS_DECL;
10823 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10825 if (RExC_parse == name_start || *RExC_parse != ch) {
10826 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10827 vFAIL2("Sequence %.3s... not terminated", parse_start);
10831 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10832 RExC_rxi->data->data[num]=(void*)sv_dat;
10833 SvREFCNT_inc_simple_void_NN(sv_dat);
10836 ret = reganode(pRExC_state,
10839 : (ASCII_FOLD_RESTRICTED)
10841 : (AT_LEAST_UNI_SEMANTICS)
10847 *flagp |= HASWIDTH;
10849 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
10850 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
10852 nextchar(pRExC_state);
10856 /* On success, returns the offset at which any next node should be placed into
10857 * the regex engine program being compiled.
10859 * Returns 0 otherwise, with *flagp set to indicate why:
10860 * TRYAGAIN at the end of (?) that only sets flags.
10861 * RESTART_PARSE if the parse needs to be restarted, or'd with
10862 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
10863 * Otherwise would only return 0 if regbranch() returns 0, which cannot
10865 STATIC regnode_offset
10866 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
10867 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10868 * 2 is like 1, but indicates that nextchar() has been called to advance
10869 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
10870 * this flag alerts us to the need to check for that */
10872 regnode_offset ret = 0; /* Will be the head of the group. */
10874 regnode_offset lastbr;
10875 regnode_offset ender = 0;
10878 U32 oregflags = RExC_flags;
10879 bool have_branch = 0;
10881 I32 freeze_paren = 0;
10882 I32 after_freeze = 0;
10883 I32 num; /* numeric backreferences */
10885 char * parse_start = RExC_parse; /* MJD */
10886 char * const oregcomp_parse = RExC_parse;
10888 GET_RE_DEBUG_FLAGS_DECL;
10890 PERL_ARGS_ASSERT_REG;
10891 DEBUG_PARSE("reg ");
10893 *flagp = 0; /* Tentatively. */
10895 /* Having this true makes it feasible to have a lot fewer tests for the
10896 * parse pointer being in scope. For example, we can write
10897 * while(isFOO(*RExC_parse)) RExC_parse++;
10899 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
10901 assert(*RExC_end == '\0');
10903 /* Make an OPEN node, if parenthesized. */
10906 /* Under /x, space and comments can be gobbled up between the '(' and
10907 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
10908 * intervening space, as the sequence is a token, and a token should be
10910 bool has_intervening_patws = (paren == 2)
10911 && *(RExC_parse - 1) != '(';
10913 if (RExC_parse >= RExC_end) {
10914 vFAIL("Unmatched (");
10917 if (paren == 'r') { /* Atomic script run */
10921 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
10922 char *start_verb = RExC_parse + 1;
10924 char *start_arg = NULL;
10925 unsigned char op = 0;
10926 int arg_required = 0;
10927 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
10928 bool has_upper = FALSE;
10930 if (has_intervening_patws) {
10931 RExC_parse++; /* past the '*' */
10933 /* For strict backwards compatibility, don't change the message
10934 * now that we also have lowercase operands */
10935 if (isUPPER(*RExC_parse)) {
10936 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
10939 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
10942 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
10943 if ( *RExC_parse == ':' ) {
10944 start_arg = RExC_parse + 1;
10948 if (isUPPER(*RExC_parse)) {
10954 RExC_parse += UTF8SKIP(RExC_parse);
10957 verb_len = RExC_parse - start_verb;
10959 if (RExC_parse >= RExC_end) {
10960 goto unterminated_verb_pattern;
10963 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10964 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
10965 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10967 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
10968 unterminated_verb_pattern:
10970 vFAIL("Unterminated verb pattern argument");
10973 vFAIL("Unterminated '(*...' argument");
10977 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
10979 vFAIL("Unterminated verb pattern");
10982 vFAIL("Unterminated '(*...' construct");
10987 /* Here, we know that RExC_parse < RExC_end */
10989 switch ( *start_verb ) {
10990 case 'A': /* (*ACCEPT) */
10991 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
10993 internal_argval = RExC_nestroot;
10996 case 'C': /* (*COMMIT) */
10997 if ( memEQs(start_verb, verb_len,"COMMIT") )
11000 case 'F': /* (*FAIL) */
11001 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11005 case ':': /* (*:NAME) */
11006 case 'M': /* (*MARK:NAME) */
11007 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11012 case 'P': /* (*PRUNE) */
11013 if ( memEQs(start_verb, verb_len,"PRUNE") )
11016 case 'S': /* (*SKIP) */
11017 if ( memEQs(start_verb, verb_len,"SKIP") )
11020 case 'T': /* (*THEN) */
11021 /* [19:06] <TimToady> :: is then */
11022 if ( memEQs(start_verb, verb_len,"THEN") ) {
11024 RExC_seen |= REG_CUTGROUP_SEEN;
11028 if ( memEQs(start_verb, verb_len, "asr")
11029 || memEQs(start_verb, verb_len, "atomic_script_run"))
11031 paren = 'r'; /* Mnemonic: recursed run */
11034 else if (memEQs(start_verb, verb_len, "atomic")) {
11035 paren = 't'; /* AtOMIC */
11036 goto alpha_assertions;
11040 if ( memEQs(start_verb, verb_len, "plb")
11041 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11044 goto lookbehind_alpha_assertions;
11046 else if ( memEQs(start_verb, verb_len, "pla")
11047 || memEQs(start_verb, verb_len, "positive_lookahead"))
11050 goto alpha_assertions;
11054 if ( memEQs(start_verb, verb_len, "nlb")
11055 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11058 goto lookbehind_alpha_assertions;
11060 else if ( memEQs(start_verb, verb_len, "nla")
11061 || memEQs(start_verb, verb_len, "negative_lookahead"))
11064 goto alpha_assertions;
11068 if ( memEQs(start_verb, verb_len, "sr")
11069 || memEQs(start_verb, verb_len, "script_run"))
11071 regnode_offset atomic;
11077 /* This indicates Unicode rules. */
11078 REQUIRE_UNI_RULES(flagp, 0);
11084 RExC_parse = start_arg;
11086 if (RExC_in_script_run) {
11088 /* Nested script runs are treated as no-ops, because
11089 * if the nested one fails, the outer one must as
11090 * well. It could fail sooner, and avoid (??{} with
11091 * side effects, but that is explicitly documented as
11092 * undefined behavior. */
11096 if (paren == 's') {
11101 /* But, the atomic part of a nested atomic script run
11102 * isn't a no-op, but can be treated just like a '(?>'
11108 /* By doing this here, we avoid extra warnings for nested
11110 ckWARNexperimental(RExC_parse,
11111 WARN_EXPERIMENTAL__SCRIPT_RUN,
11112 "The script_run feature is experimental");
11114 if (paren == 's') {
11115 /* Here, we're starting a new regular script run */
11116 ret = reg_node(pRExC_state, SROPEN);
11117 RExC_in_script_run = 1;
11122 /* Here, we are starting an atomic script run. This is
11123 * handled by recursing to deal with the atomic portion
11124 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11126 ret = reg_node(pRExC_state, SROPEN);
11128 RExC_in_script_run = 1;
11130 atomic = reg(pRExC_state, 'r', &flags, depth);
11131 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11132 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11136 REGTAIL(pRExC_state, ret, atomic);
11138 REGTAIL(pRExC_state, atomic,
11139 reg_node(pRExC_state, SRCLOSE));
11141 RExC_in_script_run = 0;
11147 lookbehind_alpha_assertions:
11148 RExC_seen |= REG_LOOKBEHIND_SEEN;
11149 RExC_in_lookbehind++;
11153 ckWARNexperimental(RExC_parse,
11154 WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
11155 "The alpha_assertions feature is experimental");
11157 RExC_seen_zerolen++;
11163 /* An empty negative lookahead assertion simply is failure */
11164 if (paren == 'A' && RExC_parse == start_arg) {
11165 ret=reganode(pRExC_state, OPFAIL, 0);
11166 nextchar(pRExC_state);
11170 RExC_parse = start_arg;
11175 "'(*%" UTF8f "' requires a terminating ':'",
11176 UTF8fARG(UTF, verb_len, start_verb));
11177 NOT_REACHED; /*NOTREACHED*/
11179 } /* End of switch */
11181 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11182 if (has_upper || verb_len == 0) {
11184 "Unknown verb pattern '%" UTF8f "'",
11185 UTF8fARG(UTF, verb_len, start_verb));
11189 "Unknown '(*...)' construct '%" UTF8f "'",
11190 UTF8fARG(UTF, verb_len, start_verb));
11193 if ( RExC_parse == start_arg ) {
11196 if ( arg_required && !start_arg ) {
11197 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11198 verb_len, start_verb);
11200 if (internal_argval == -1) {
11201 ret = reganode(pRExC_state, op, 0);
11203 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11205 RExC_seen |= REG_VERBARG_SEEN;
11207 SV *sv = newSVpvn( start_arg,
11208 RExC_parse - start_arg);
11209 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11210 STR_WITH_LEN("S"));
11211 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11212 FLAGS(REGNODE_p(ret)) = 1;
11214 FLAGS(REGNODE_p(ret)) = 0;
11216 if ( internal_argval != -1 )
11217 ARG2L_SET(REGNODE_p(ret), internal_argval);
11218 nextchar(pRExC_state);
11221 else if (*RExC_parse == '?') { /* (?...) */
11222 bool is_logical = 0;
11223 const char * const seqstart = RExC_parse;
11224 const char * endptr;
11225 if (has_intervening_patws) {
11227 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11230 RExC_parse++; /* past the '?' */
11231 paren = *RExC_parse; /* might be a trailing NUL, if not
11233 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11234 if (RExC_parse > RExC_end) {
11237 ret = 0; /* For look-ahead/behind. */
11240 case 'P': /* (?P...) variants for those used to PCRE/Python */
11241 paren = *RExC_parse;
11242 if ( paren == '<') { /* (?P<...>) named capture */
11244 if (RExC_parse >= RExC_end) {
11245 vFAIL("Sequence (?P<... not terminated");
11247 goto named_capture;
11249 else if (paren == '>') { /* (?P>name) named recursion */
11251 if (RExC_parse >= RExC_end) {
11252 vFAIL("Sequence (?P>... not terminated");
11254 goto named_recursion;
11256 else if (paren == '=') { /* (?P=...) named backref */
11258 return handle_named_backref(pRExC_state, flagp,
11261 RExC_parse += SKIP_IF_CHAR(RExC_parse);
11262 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11263 vFAIL3("Sequence (%.*s...) not recognized",
11264 RExC_parse-seqstart, seqstart);
11265 NOT_REACHED; /*NOTREACHED*/
11266 case '<': /* (?<...) */
11267 if (*RExC_parse == '!')
11269 else if (*RExC_parse != '=')
11276 case '\'': /* (?'...') */
11277 name_start = RExC_parse;
11278 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11279 if ( RExC_parse == name_start
11280 || RExC_parse >= RExC_end
11281 || *RExC_parse != paren)
11283 vFAIL2("Sequence (?%c... not terminated",
11284 paren=='>' ? '<' : paren);
11289 if (!svname) /* shouldn't happen */
11291 "panic: reg_scan_name returned NULL");
11292 if (!RExC_paren_names) {
11293 RExC_paren_names= newHV();
11294 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11296 RExC_paren_name_list= newAV();
11297 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11300 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11302 sv_dat = HeVAL(he_str);
11304 /* croak baby croak */
11306 "panic: paren_name hash element allocation failed");
11307 } else if ( SvPOK(sv_dat) ) {
11308 /* (?|...) can mean we have dupes so scan to check
11309 its already been stored. Maybe a flag indicating
11310 we are inside such a construct would be useful,
11311 but the arrays are likely to be quite small, so
11312 for now we punt -- dmq */
11313 IV count = SvIV(sv_dat);
11314 I32 *pv = (I32*)SvPVX(sv_dat);
11316 for ( i = 0 ; i < count ; i++ ) {
11317 if ( pv[i] == RExC_npar ) {
11323 pv = (I32*)SvGROW(sv_dat,
11324 SvCUR(sv_dat) + sizeof(I32)+1);
11325 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11326 pv[count] = RExC_npar;
11327 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11330 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11331 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11334 SvIV_set(sv_dat, 1);
11337 /* Yes this does cause a memory leak in debugging Perls
11339 if (!av_store(RExC_paren_name_list,
11340 RExC_npar, SvREFCNT_inc_NN(svname)))
11341 SvREFCNT_dec_NN(svname);
11344 /*sv_dump(sv_dat);*/
11346 nextchar(pRExC_state);
11348 goto capturing_parens;
11351 RExC_seen |= REG_LOOKBEHIND_SEEN;
11352 RExC_in_lookbehind++;
11354 if (RExC_parse >= RExC_end) {
11355 vFAIL("Sequence (?... not terminated");
11359 case '=': /* (?=...) */
11360 RExC_seen_zerolen++;
11362 case '!': /* (?!...) */
11363 RExC_seen_zerolen++;
11364 /* check if we're really just a "FAIL" assertion */
11365 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11366 FALSE /* Don't force to /x */ );
11367 if (*RExC_parse == ')') {
11368 ret=reganode(pRExC_state, OPFAIL, 0);
11369 nextchar(pRExC_state);
11373 case '|': /* (?|...) */
11374 /* branch reset, behave like a (?:...) except that
11375 buffers in alternations share the same numbers */
11377 after_freeze = freeze_paren = RExC_npar;
11379 /* XXX This construct currently requires an extra pass.
11380 * Investigation would be required to see if that could be
11382 REQUIRE_PARENS_PASS;
11384 case ':': /* (?:...) */
11385 case '>': /* (?>...) */
11387 case '$': /* (?$...) */
11388 case '@': /* (?@...) */
11389 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11391 case '0' : /* (?0) */
11392 case 'R' : /* (?R) */
11393 if (RExC_parse == RExC_end || *RExC_parse != ')')
11394 FAIL("Sequence (?R) not terminated");
11396 RExC_seen |= REG_RECURSE_SEEN;
11398 /* XXX These constructs currently require an extra pass.
11399 * It probably could be changed */
11400 REQUIRE_PARENS_PASS;
11402 *flagp |= POSTPONED;
11403 goto gen_recurse_regop;
11405 /* named and numeric backreferences */
11406 case '&': /* (?&NAME) */
11407 parse_start = RExC_parse - 1;
11410 SV *sv_dat = reg_scan_name(pRExC_state,
11411 REG_RSN_RETURN_DATA);
11412 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11414 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11415 vFAIL("Sequence (?&... not terminated");
11416 goto gen_recurse_regop;
11419 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11421 vFAIL("Illegal pattern");
11423 goto parse_recursion;
11425 case '-': /* (?-1) */
11426 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
11427 RExC_parse--; /* rewind to let it be handled later */
11431 case '1': case '2': case '3': case '4': /* (?1) */
11432 case '5': case '6': case '7': case '8': case '9':
11433 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11436 bool is_neg = FALSE;
11438 parse_start = RExC_parse - 1; /* MJD */
11439 if (*RExC_parse == '-') {
11444 if (grok_atoUV(RExC_parse, &unum, &endptr)
11448 RExC_parse = (char*)endptr;
11452 /* Some limit for num? */
11456 if (*RExC_parse!=')')
11457 vFAIL("Expecting close bracket");
11460 if ( paren == '-' ) {
11462 Diagram of capture buffer numbering.
11463 Top line is the normal capture buffer numbers
11464 Bottom line is the negative indexing as from
11468 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11472 num = RExC_npar + num;
11475 /* It might be a forward reference; we can't fail until
11476 * we know, by completing the parse to get all the
11477 * groups, and then reparsing */
11478 if (RExC_total_parens > 0) {
11480 vFAIL("Reference to nonexistent group");
11483 REQUIRE_PARENS_PASS;
11486 } else if ( paren == '+' ) {
11487 num = RExC_npar + num - 1;
11489 /* We keep track how many GOSUB items we have produced.
11490 To start off the ARG2L() of the GOSUB holds its "id",
11491 which is used later in conjunction with RExC_recurse
11492 to calculate the offset we need to jump for the GOSUB,
11493 which it will store in the final representation.
11494 We have to defer the actual calculation until much later
11495 as the regop may move.
11498 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11499 if (num >= RExC_npar) {
11501 /* It might be a forward reference; we can't fail until we
11502 * know, by completing the parse to get all the groups, and
11503 * then reparsing */
11504 if (RExC_total_parens > 0) {
11505 if (num >= RExC_total_parens) {
11507 vFAIL("Reference to nonexistent group");
11511 REQUIRE_PARENS_PASS;
11514 RExC_recurse_count++;
11515 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11516 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11517 22, "| |", (int)(depth * 2 + 1), "",
11518 (UV)ARG(REGNODE_p(ret)),
11519 (IV)ARG2L(REGNODE_p(ret))));
11520 RExC_seen |= REG_RECURSE_SEEN;
11522 Set_Node_Length(REGNODE_p(ret),
11523 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11524 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11526 *flagp |= POSTPONED;
11527 assert(*RExC_parse == ')');
11528 nextchar(pRExC_state);
11533 case '?': /* (??...) */
11535 if (*RExC_parse != '{') {
11536 RExC_parse += SKIP_IF_CHAR(RExC_parse);
11537 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11539 "Sequence (%" UTF8f "...) not recognized",
11540 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11541 NOT_REACHED; /*NOTREACHED*/
11543 *flagp |= POSTPONED;
11547 case '{': /* (?{...}) */
11550 struct reg_code_block *cb;
11553 RExC_seen_zerolen++;
11555 if ( !pRExC_state->code_blocks
11556 || pRExC_state->code_index
11557 >= pRExC_state->code_blocks->count
11558 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11559 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11562 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11563 FAIL("panic: Sequence (?{...}): no code block found\n");
11564 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11566 /* this is a pre-compiled code block (?{...}) */
11567 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11568 RExC_parse = RExC_start + cb->end;
11570 if (cb->src_regex) {
11571 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11572 RExC_rxi->data->data[n] =
11573 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11574 RExC_rxi->data->data[n+1] = (void*)o;
11577 n = add_data(pRExC_state,
11578 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11579 RExC_rxi->data->data[n] = (void*)o;
11581 pRExC_state->code_index++;
11582 nextchar(pRExC_state);
11585 regnode_offset eval;
11586 ret = reg_node(pRExC_state, LOGICAL);
11588 eval = reg2Lanode(pRExC_state, EVAL,
11591 /* for later propagation into (??{})
11593 RExC_flags & RXf_PMf_COMPILETIME
11595 FLAGS(REGNODE_p(ret)) = 2;
11596 REGTAIL(pRExC_state, ret, eval);
11597 /* deal with the length of this later - MJD */
11600 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11601 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11602 Set_Node_Offset(REGNODE_p(ret), parse_start);
11605 case '(': /* (?(?{...})...) and (?(?=...)...) */
11608 const int DEFINE_len = sizeof("DEFINE") - 1;
11609 if ( RExC_parse < RExC_end - 1
11610 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11611 && ( RExC_parse[1] == '='
11612 || RExC_parse[1] == '!'
11613 || RExC_parse[1] == '<'
11614 || RExC_parse[1] == '{'))
11615 || ( RExC_parse[0] == '*' /* (?(*...)) */
11616 && ( memBEGINs(RExC_parse + 1,
11617 (Size_t) (RExC_end - (RExC_parse + 1)),
11619 || memBEGINs(RExC_parse + 1,
11620 (Size_t) (RExC_end - (RExC_parse + 1)),
11622 || memBEGINs(RExC_parse + 1,
11623 (Size_t) (RExC_end - (RExC_parse + 1)),
11625 || memBEGINs(RExC_parse + 1,
11626 (Size_t) (RExC_end - (RExC_parse + 1)),
11628 || memBEGINs(RExC_parse + 1,
11629 (Size_t) (RExC_end - (RExC_parse + 1)),
11630 "positive_lookahead:")
11631 || memBEGINs(RExC_parse + 1,
11632 (Size_t) (RExC_end - (RExC_parse + 1)),
11633 "positive_lookbehind:")
11634 || memBEGINs(RExC_parse + 1,
11635 (Size_t) (RExC_end - (RExC_parse + 1)),
11636 "negative_lookahead:")
11637 || memBEGINs(RExC_parse + 1,
11638 (Size_t) (RExC_end - (RExC_parse + 1)),
11639 "negative_lookbehind:"))))
11640 ) { /* Lookahead or eval. */
11642 regnode_offset tail;
11644 ret = reg_node(pRExC_state, LOGICAL);
11645 FLAGS(REGNODE_p(ret)) = 1;
11647 tail = reg(pRExC_state, 1, &flag, depth+1);
11648 RETURN_FAIL_ON_RESTART(flag, flagp);
11649 REGTAIL(pRExC_state, ret, tail);
11652 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11653 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11655 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11656 char *name_start= RExC_parse++;
11658 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11659 if ( RExC_parse == name_start
11660 || RExC_parse >= RExC_end
11661 || *RExC_parse != ch)
11663 vFAIL2("Sequence (?(%c... not terminated",
11664 (ch == '>' ? '<' : ch));
11668 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11669 RExC_rxi->data->data[num]=(void*)sv_dat;
11670 SvREFCNT_inc_simple_void_NN(sv_dat);
11672 ret = reganode(pRExC_state, NGROUPP, num);
11673 goto insert_if_check_paren;
11675 else if (memBEGINs(RExC_parse,
11676 (STRLEN) (RExC_end - RExC_parse),
11679 ret = reganode(pRExC_state, DEFINEP, 0);
11680 RExC_parse += DEFINE_len;
11682 goto insert_if_check_paren;
11684 else if (RExC_parse[0] == 'R') {
11686 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11687 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11688 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11691 if (RExC_parse[0] == '0') {
11695 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11698 if (grok_atoUV(RExC_parse, &uv, &endptr)
11701 parno = (I32)uv + 1;
11702 RExC_parse = (char*)endptr;
11704 /* else "Switch condition not recognized" below */
11705 } else if (RExC_parse[0] == '&') {
11708 sv_dat = reg_scan_name(pRExC_state,
11709 REG_RSN_RETURN_DATA);
11711 parno = 1 + *((I32 *)SvPVX(sv_dat));
11713 ret = reganode(pRExC_state, INSUBP, parno);
11714 goto insert_if_check_paren;
11716 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
11721 if (grok_atoUV(RExC_parse, &uv, &endptr)
11725 RExC_parse = (char*)endptr;
11728 vFAIL("panic: grok_atoUV returned FALSE");
11730 ret = reganode(pRExC_state, GROUPP, parno);
11732 insert_if_check_paren:
11733 if (UCHARAT(RExC_parse) != ')') {
11734 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11735 vFAIL("Switch condition not recognized");
11737 nextchar(pRExC_state);
11739 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11740 br = regbranch(pRExC_state, &flags, 1, depth+1);
11742 RETURN_FAIL_ON_RESTART(flags,flagp);
11743 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11746 REGTAIL(pRExC_state, br, reganode(pRExC_state,
11748 c = UCHARAT(RExC_parse);
11749 nextchar(pRExC_state);
11750 if (flags&HASWIDTH)
11751 *flagp |= HASWIDTH;
11754 vFAIL("(?(DEFINE)....) does not allow branches");
11756 /* Fake one for optimizer. */
11757 lastbr = reganode(pRExC_state, IFTHEN, 0);
11759 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11760 RETURN_FAIL_ON_RESTART(flags, flagp);
11761 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11764 REGTAIL(pRExC_state, ret, lastbr);
11765 if (flags&HASWIDTH)
11766 *flagp |= HASWIDTH;
11767 c = UCHARAT(RExC_parse);
11768 nextchar(pRExC_state);
11773 if (RExC_parse >= RExC_end)
11774 vFAIL("Switch (?(condition)... not terminated");
11776 vFAIL("Switch (?(condition)... contains too many branches");
11778 ender = reg_node(pRExC_state, TAIL);
11779 REGTAIL(pRExC_state, br, ender);
11781 REGTAIL(pRExC_state, lastbr, ender);
11782 REGTAIL(pRExC_state, REGNODE_OFFSET(
11784 NEXTOPER(REGNODE_p(lastbr)))),
11788 REGTAIL(pRExC_state, ret, ender);
11789 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
11790 RExC_size++; /* XXX WHY do we need this?!!
11791 For large programs it seems to be required
11792 but I can't figure out why. -- dmq*/
11796 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11797 vFAIL("Unknown switch condition (?(...))");
11799 case '[': /* (?[ ... ]) */
11800 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11802 case 0: /* A NUL */
11803 RExC_parse--; /* for vFAIL to print correctly */
11804 vFAIL("Sequence (? incomplete");
11806 default: /* e.g., (?i) */
11807 RExC_parse = (char *) seqstart + 1;
11809 parse_lparen_question_flags(pRExC_state);
11810 if (UCHARAT(RExC_parse) != ':') {
11811 if (RExC_parse < RExC_end)
11812 nextchar(pRExC_state);
11817 nextchar(pRExC_state);
11823 if (*RExC_parse == '{') {
11824 ckWARNregdep(RExC_parse + 1,
11825 "Unescaped left brace in regex is "
11826 "deprecated here (and will be fatal "
11827 "in Perl 5.32), passed through");
11829 /* Not bothering to indent here, as the above 'else' is temporary
11831 if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
11835 if (RExC_total_parens <= 0) {
11836 /* If we are in our first pass through (and maybe only pass),
11837 * we need to allocate memory for the capturing parentheses
11838 * data structures. Since we start at npar=1, when it reaches
11839 * 2, for the first time it has something to put in it. Above
11840 * 2 means we extend what we already have */
11841 if (RExC_npar == 2) {
11842 /* setup RExC_open_parens, which holds the address of each
11843 * OPEN tag, and to make things simpler for the 0 index the
11844 * start of the program - this is used later for offsets */
11845 Newxz(RExC_open_parens, RExC_npar, regnode_offset);
11846 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
11848 /* setup RExC_close_parens, which holds the address of each
11849 * CLOSE tag, and to make things simpler for the 0 index
11850 * the end of the program - this is used later for offsets
11852 Newxz(RExC_close_parens, RExC_npar, regnode_offset);
11853 /* we dont know where end op starts yet, so we dont need to
11854 * set RExC_close_parens[0] like we do RExC_open_parens[0]
11858 Renew(RExC_open_parens, RExC_npar, regnode_offset);
11859 Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
11861 Renew(RExC_close_parens, RExC_npar, regnode_offset);
11862 Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
11866 ret = reganode(pRExC_state, OPEN, parno);
11867 if (!RExC_nestroot)
11868 RExC_nestroot = parno;
11869 if (RExC_open_parens && !RExC_open_parens[parno])
11871 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11872 "%*s%*s Setting open paren #%" IVdf " to %d\n",
11873 22, "| |", (int)(depth * 2 + 1), "",
11874 (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
11875 RExC_open_parens[parno]= ret;
11878 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
11879 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
11882 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
11892 /* Pick up the branches, linking them together. */
11893 parse_start = RExC_parse; /* MJD */
11894 br = regbranch(pRExC_state, &flags, 1, depth+1);
11896 /* branch_len = (paren != 0); */
11899 RETURN_FAIL_ON_RESTART(flags, flagp);
11900 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
11902 if (*RExC_parse == '|') {
11903 if (RExC_use_BRANCHJ) {
11904 reginsert(pRExC_state, BRANCHJ, br, depth+1);
11907 reginsert(pRExC_state, BRANCH, br, depth+1);
11908 Set_Node_Length(REGNODE_p(br), paren != 0);
11909 Set_Node_Offset_To_R(br, parse_start-RExC_start);
11913 else if (paren == ':') {
11914 *flagp |= flags&SIMPLE;
11916 if (is_open) { /* Starts with OPEN. */
11917 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
11919 else if (paren != '?') /* Not Conditional */
11921 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11923 while (*RExC_parse == '|') {
11924 if (RExC_use_BRANCHJ) {
11925 ender = reganode(pRExC_state, LONGJMP, 0);
11927 /* Append to the previous. */
11928 REGTAIL(pRExC_state,
11929 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
11932 nextchar(pRExC_state);
11933 if (freeze_paren) {
11934 if (RExC_npar > after_freeze)
11935 after_freeze = RExC_npar;
11936 RExC_npar = freeze_paren;
11938 br = regbranch(pRExC_state, &flags, 0, depth+1);
11941 RETURN_FAIL_ON_RESTART(flags, flagp);
11942 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
11944 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
11946 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
11949 if (have_branch || paren != ':') {
11952 /* Make a closing node, and hook it on the end. */
11955 ender = reg_node(pRExC_state, TAIL);
11958 ender = reganode(pRExC_state, CLOSE, parno);
11959 if ( RExC_close_parens ) {
11960 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11961 "%*s%*s Setting close paren #%" IVdf " to %d\n",
11962 22, "| |", (int)(depth * 2 + 1), "",
11963 (IV)parno, REG_NODE_NUM(REGNODE_p(ender))));
11964 RExC_close_parens[parno]= ender;
11965 if (RExC_nestroot == parno)
11968 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
11969 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
11972 ender = reg_node(pRExC_state, SRCLOSE);
11973 RExC_in_script_run = 0;
11983 *flagp &= ~HASWIDTH;
11985 case 't': /* aTomic */
11987 ender = reg_node(pRExC_state, SUCCEED);
11990 ender = reg_node(pRExC_state, END);
11991 assert(!RExC_end_op); /* there can only be one! */
11992 RExC_end_op = REGNODE_p(ender);
11993 if (RExC_close_parens) {
11994 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11995 "%*s%*s Setting close paren #0 (END) to %d\n",
11996 22, "| |", (int)(depth * 2 + 1), "",
11997 REG_NODE_NUM(REGNODE_p(ender))));
11999 RExC_close_parens[0]= ender;
12004 DEBUG_PARSE_MSG("lsbr");
12005 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12006 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12007 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12008 SvPV_nolen_const(RExC_mysv1),
12009 (IV)REG_NODE_NUM(REGNODE_p(lastbr)),
12010 SvPV_nolen_const(RExC_mysv2),
12011 (IV)REG_NODE_NUM(REGNODE_p(ender)),
12012 (IV)(ender - lastbr)
12015 REGTAIL(pRExC_state, lastbr, ender);
12018 char is_nothing= 1;
12020 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12022 /* Hook the tails of the branches to the closing node. */
12023 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12024 const U8 op = PL_regkind[OP(br)];
12025 if (op == BRANCH) {
12026 REGTAIL_STUDY(pRExC_state,
12027 REGNODE_OFFSET(NEXTOPER(br)),
12029 if ( OP(NEXTOPER(br)) != NOTHING
12030 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12033 else if (op == BRANCHJ) {
12034 REGTAIL_STUDY(pRExC_state,
12035 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12037 /* for now we always disable this optimisation * /
12038 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12039 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12045 regnode * ret_as_regnode = REGNODE_p(ret);
12046 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12047 ? regnext(ret_as_regnode)
12050 DEBUG_PARSE_MSG("NADA");
12051 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12052 NULL, pRExC_state);
12053 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12054 NULL, pRExC_state);
12055 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12056 SvPV_nolen_const(RExC_mysv1),
12057 (IV)REG_NODE_NUM(ret_as_regnode),
12058 SvPV_nolen_const(RExC_mysv2),
12059 (IV)REG_NODE_NUM(REGNODE_p(ender)),
12064 if (OP(REGNODE_p(ender)) == TAIL) {
12066 RExC_emit= REGNODE_OFFSET(br) + 1;
12069 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12070 OP(opt)= OPTIMIZED;
12071 NEXT_OFF(br)= REGNODE_p(ender) - br;
12079 /* Even/odd or x=don't care: 010101x10x */
12080 static const char parens[] = "=!aA<,>Bbt";
12081 /* flag below is set to 0 up through 'A'; 1 for larger */
12083 if (paren && (p = strchr(parens, paren))) {
12084 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12085 int flag = (p - parens) > 3;
12087 if (paren == '>' || paren == 't') {
12088 node = SUSPEND, flag = 0;
12091 reginsert(pRExC_state, node, ret, depth+1);
12092 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12093 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12094 FLAGS(REGNODE_p(ret)) = flag;
12095 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
12099 /* Check for proper termination. */
12101 /* restore original flags, but keep (?p) and, if we've changed from /d
12102 * rules to /u, keep the /u */
12103 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12104 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12105 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12107 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12108 RExC_parse = oregcomp_parse;
12109 vFAIL("Unmatched (");
12111 nextchar(pRExC_state);
12113 else if (!paren && RExC_parse < RExC_end) {
12114 if (*RExC_parse == ')') {
12116 vFAIL("Unmatched )");
12119 FAIL("Junk on end of regexp"); /* "Can't happen". */
12120 NOT_REACHED; /* NOTREACHED */
12123 if (RExC_in_lookbehind) {
12124 RExC_in_lookbehind--;
12126 if (after_freeze > RExC_npar)
12127 RExC_npar = after_freeze;
12132 - regbranch - one alternative of an | operator
12134 * Implements the concatenation operator.
12136 * On success, returns the offset at which any next node should be placed into
12137 * the regex engine program being compiled.
12139 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12140 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12143 STATIC regnode_offset
12144 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12146 regnode_offset ret;
12147 regnode_offset chain = 0;
12148 regnode_offset latest;
12149 I32 flags = 0, c = 0;
12150 GET_RE_DEBUG_FLAGS_DECL;
12152 PERL_ARGS_ASSERT_REGBRANCH;
12154 DEBUG_PARSE("brnc");
12159 if (RExC_use_BRANCHJ)
12160 ret = reganode(pRExC_state, BRANCHJ, 0);
12162 ret = reg_node(pRExC_state, BRANCH);
12163 Set_Node_Length(REGNODE_p(ret), 1);
12167 *flagp = WORST; /* Tentatively. */
12169 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12170 FALSE /* Don't force to /x */ );
12171 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12172 flags &= ~TRYAGAIN;
12173 latest = regpiece(pRExC_state, &flags, depth+1);
12175 if (flags & TRYAGAIN)
12177 RETURN_FAIL_ON_RESTART(flags, flagp);
12178 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12182 *flagp |= flags&(HASWIDTH|POSTPONED);
12183 if (chain == 0) /* First piece. */
12184 *flagp |= flags&SPSTART;
12186 /* FIXME adding one for every branch after the first is probably
12187 * excessive now we have TRIE support. (hv) */
12189 if ( chain > (SSize_t) BRANCH_MAX_OFFSET
12190 && ! RExC_use_BRANCHJ)
12192 /* XXX We could just redo this branch, but figuring out what
12193 * bookkeeping needs to be reset is a pain */
12194 REQUIRE_BRANCHJ(flagp, 0);
12196 REGTAIL(pRExC_state, chain, latest);
12201 if (chain == 0) { /* Loop ran zero times. */
12202 chain = reg_node(pRExC_state, NOTHING);
12207 *flagp |= flags&SIMPLE;
12214 - regpiece - something followed by possible quantifier * + ? {n,m}
12216 * Note that the branching code sequences used for ? and the general cases
12217 * of * and + are somewhat optimized: they use the same NOTHING node as
12218 * both the endmarker for their branch list and the body of the last branch.
12219 * It might seem that this node could be dispensed with entirely, but the
12220 * endmarker role is not redundant.
12222 * On success, returns the offset at which any next node should be placed into
12223 * the regex engine program being compiled.
12225 * Returns 0 otherwise, with *flagp set to indicate why:
12226 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12227 * RESTART_PARSE if the parse needs to be restarted, or'd with
12228 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12230 STATIC regnode_offset
12231 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12233 regnode_offset ret;
12237 const char * const origparse = RExC_parse;
12239 I32 max = REG_INFTY;
12240 #ifdef RE_TRACK_PATTERN_OFFSETS
12243 const char *maxpos = NULL;
12246 /* Save the original in case we change the emitted regop to a FAIL. */
12247 const regnode_offset orig_emit = RExC_emit;
12249 GET_RE_DEBUG_FLAGS_DECL;
12251 PERL_ARGS_ASSERT_REGPIECE;
12253 DEBUG_PARSE("piec");
12255 ret = regatom(pRExC_state, &flags, depth+1);
12257 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12258 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12263 if (op == '{' && regcurly(RExC_parse)) {
12265 #ifdef RE_TRACK_PATTERN_OFFSETS
12266 parse_start = RExC_parse; /* MJD */
12268 next = RExC_parse + 1;
12269 while (isDIGIT(*next) || *next == ',') {
12270 if (*next == ',') {
12278 if (*next == '}') { /* got one */
12279 const char* endptr;
12283 if (isDIGIT(*RExC_parse)) {
12285 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12286 vFAIL("Invalid quantifier in {,}");
12287 if (uv >= REG_INFTY)
12288 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12293 if (*maxpos == ',')
12296 maxpos = RExC_parse;
12297 if (isDIGIT(*maxpos)) {
12299 if (!grok_atoUV(maxpos, &uv, &endptr))
12300 vFAIL("Invalid quantifier in {,}");
12301 if (uv >= REG_INFTY)
12302 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12305 max = REG_INFTY; /* meaning "infinity" */
12308 nextchar(pRExC_state);
12309 if (max < min) { /* If can't match, warn and optimize to fail
12311 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12312 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12313 NEXT_OFF(REGNODE_p(orig_emit)) =
12314 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12317 else if (min == max && *RExC_parse == '?')
12319 ckWARN2reg(RExC_parse + 1,
12320 "Useless use of greediness modifier '%c'",
12325 if ((flags&SIMPLE)) {
12326 if (min == 0 && max == REG_INFTY) {
12327 reginsert(pRExC_state, STAR, ret, depth+1);
12329 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12332 if (min == 1 && max == REG_INFTY) {
12333 reginsert(pRExC_state, PLUS, ret, depth+1);
12335 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12338 MARK_NAUGHTY_EXP(2, 2);
12339 reginsert(pRExC_state, CURLY, ret, depth+1);
12340 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12341 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12344 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12346 FLAGS(REGNODE_p(w)) = 0;
12347 REGTAIL(pRExC_state, ret, w);
12348 if (RExC_use_BRANCHJ) {
12349 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12350 reginsert(pRExC_state, NOTHING, ret, depth+1);
12351 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12353 reginsert(pRExC_state, CURLYX, ret, depth+1);
12355 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12356 Set_Node_Length(REGNODE_p(ret),
12357 op == '{' ? (RExC_parse - parse_start) : 1);
12359 if (RExC_use_BRANCHJ)
12360 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12362 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
12363 RExC_whilem_seen++;
12364 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12366 FLAGS(REGNODE_p(ret)) = 0;
12371 *flagp |= HASWIDTH;
12372 ARG1_SET(REGNODE_p(ret), (U16)min);
12373 ARG2_SET(REGNODE_p(ret), (U16)max);
12374 if (max == REG_INFTY)
12375 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12381 if (!ISMULT1(op)) {
12386 #if 0 /* Now runtime fix should be reliable. */
12388 /* if this is reinstated, don't forget to put this back into perldiag:
12390 =item Regexp *+ operand could be empty at {#} in regex m/%s/
12392 (F) The part of the regexp subject to either the * or + quantifier
12393 could match an empty string. The {#} shows in the regular
12394 expression about where the problem was discovered.
12398 if (!(flags&HASWIDTH) && op != '?')
12399 vFAIL("Regexp *+ operand could be empty");
12402 #ifdef RE_TRACK_PATTERN_OFFSETS
12403 parse_start = RExC_parse;
12405 nextchar(pRExC_state);
12407 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12413 else if (op == '+') {
12417 else if (op == '?') {
12422 if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12423 ckWARN2reg(RExC_parse,
12424 "%" UTF8f " matches null string many times",
12425 UTF8fARG(UTF, (RExC_parse >= origparse
12426 ? RExC_parse - origparse
12431 if (*RExC_parse == '?') {
12432 nextchar(pRExC_state);
12433 reginsert(pRExC_state, MINMOD, ret, depth+1);
12434 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
12436 else if (*RExC_parse == '+') {
12437 regnode_offset ender;
12438 nextchar(pRExC_state);
12439 ender = reg_node(pRExC_state, SUCCEED);
12440 REGTAIL(pRExC_state, ret, ender);
12441 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12442 ender = reg_node(pRExC_state, TAIL);
12443 REGTAIL(pRExC_state, ret, ender);
12446 if (ISMULT2(RExC_parse)) {
12448 vFAIL("Nested quantifiers");
12455 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12456 regnode_offset * node_p,
12464 /* This routine teases apart the various meanings of \N and returns
12465 * accordingly. The input parameters constrain which meaning(s) is/are valid
12466 * in the current context.
12468 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12470 * If <code_point_p> is not NULL, the context is expecting the result to be a
12471 * single code point. If this \N instance turns out to a single code point,
12472 * the function returns TRUE and sets *code_point_p to that code point.
12474 * If <node_p> is not NULL, the context is expecting the result to be one of
12475 * the things representable by a regnode. If this \N instance turns out to be
12476 * one such, the function generates the regnode, returns TRUE and sets *node_p
12477 * to point to the offset of that regnode into the regex engine program being
12480 * If this instance of \N isn't legal in any context, this function will
12481 * generate a fatal error and not return.
12483 * On input, RExC_parse should point to the first char following the \N at the
12484 * time of the call. On successful return, RExC_parse will have been updated
12485 * to point to just after the sequence identified by this routine. Also
12486 * *flagp has been updated as needed.
12488 * When there is some problem with the current context and this \N instance,
12489 * the function returns FALSE, without advancing RExC_parse, nor setting
12490 * *node_p, nor *code_point_p, nor *flagp.
12492 * If <cp_count> is not NULL, the caller wants to know the length (in code
12493 * points) that this \N sequence matches. This is set, and the input is
12494 * parsed for errors, even if the function returns FALSE, as detailed below.
12496 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
12498 * Probably the most common case is for the \N to specify a single code point.
12499 * *cp_count will be set to 1, and *code_point_p will be set to that code
12502 * Another possibility is for the input to be an empty \N{}, which for
12503 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
12504 * will be set to a generated NOTHING node.
12506 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12507 * set to 0. *node_p will be set to a generated REG_ANY node.
12509 * The fourth possibility is that \N resolves to a sequence of more than one
12510 * code points. *cp_count will be set to the number of code points in the
12511 * sequence. *node_p will be set to a generated node returned by this
12512 * function calling S_reg().
12514 * The final possibility is that it is premature to be calling this function;
12515 * the parse needs to be restarted. This can happen when this changes from
12516 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12517 * latter occurs only when the fourth possibility would otherwise be in
12518 * effect, and is because one of those code points requires the pattern to be
12519 * recompiled as UTF-8. The function returns FALSE, and sets the
12520 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12521 * happens, the caller needs to desist from continuing parsing, and return
12522 * this information to its caller. This is not set for when there is only one
12523 * code point, as this can be called as part of an ANYOF node, and they can
12524 * store above-Latin1 code points without the pattern having to be in UTF-8.
12526 * For non-single-quoted regexes, the tokenizer has resolved character and
12527 * sequence names inside \N{...} into their Unicode values, normalizing the
12528 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12529 * hex-represented code points in the sequence. This is done there because
12530 * the names can vary based on what charnames pragma is in scope at the time,
12531 * so we need a way to take a snapshot of what they resolve to at the time of
12532 * the original parse. [perl #56444].
12534 * That parsing is skipped for single-quoted regexes, so we may here get
12535 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
12536 * parser. But if the single-quoted regex is something like '\N{U+41}', that
12537 * is legal and handled here. The code point is Unicode, and has to be
12538 * translated into the native character set for non-ASCII platforms.
12541 char * endbrace; /* points to '}' following the name */
12542 char* p = RExC_parse; /* Temporary */
12544 SV * substitute_parse = NULL;
12548 Size_t count = 0; /* code point count kept internally by this function */
12550 GET_RE_DEBUG_FLAGS_DECL;
12552 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12554 GET_RE_DEBUG_FLAGS;
12556 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12557 assert(! (node_p && cp_count)); /* At most 1 should be set */
12559 if (cp_count) { /* Initialize return for the most common case */
12563 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12564 * modifier. The other meanings do not, so use a temporary until we find
12565 * out which we are being called with */
12566 skip_to_be_ignored_text(pRExC_state, &p,
12567 FALSE /* Don't force to /x */ );
12569 /* Disambiguate between \N meaning a named character versus \N meaning
12570 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12571 * quantifier, or there is no '{' at all */
12572 if (*p != '{' || regcurly(p)) {
12582 *node_p = reg_node(pRExC_state, REG_ANY);
12583 *flagp |= HASWIDTH|SIMPLE;
12585 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12589 /* The test above made sure that the next real character is a '{', but
12590 * under the /x modifier, it could be separated by space (or a comment and
12591 * \n) and this is not allowed (for consistency with \x{...} and the
12592 * tokenizer handling of \N{NAME}). */
12593 if (*RExC_parse != '{') {
12594 vFAIL("Missing braces on \\N{}");
12597 RExC_parse++; /* Skip past the '{' */
12599 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12600 if (! endbrace) { /* no trailing brace */
12601 vFAIL2("Missing right brace on \\%c{}", 'N');
12604 /* Here, we have decided it should be a named character or sequence */
12605 REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
12608 if (endbrace == RExC_parse) { /* empty: \N{} */
12610 RExC_parse++; /* Position after the "}" */
12611 vFAIL("Zero length \\N{}");
12616 nextchar(pRExC_state);
12621 *node_p = reg_node(pRExC_state, NOTHING);
12625 /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
12626 if ( endbrace - RExC_parse < 2
12627 || strnNE(RExC_parse, "U+", 2))
12629 RExC_parse = endbrace; /* position msg's '<--HERE' */
12630 vFAIL("\\N{NAME} must be resolved by the lexer");
12633 /* This code purposely indented below because of future changes coming */
12635 /* We can get to here when the input is \N{U+...} or when toke.c has
12636 * converted a name to the \N{U+...} form. This include changing a
12637 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12639 RExC_parse += 2; /* Skip past the 'U+' */
12641 /* Code points are separated by dots. The '}' terminates the whole
12644 do { /* Loop until the ending brace */
12646 char * start_digit; /* The first of the current code point */
12647 if (! isXDIGIT(*RExC_parse)) {
12649 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12652 start_digit = RExC_parse;
12655 /* Loop through the hex digits of the current code point */
12657 /* Adding this digit will shift the result 4 bits. If that
12658 * result would be above the legal max, it's overflow */
12659 if (cp > MAX_LEGAL_CP >> 4) {
12661 /* Find the end of the code point */
12664 } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12666 /* Be sure to synchronize this message with the similar one
12668 vFAIL4("Use of code point 0x%.*s is not allowed; the"
12669 " permissible max is 0x%" UVxf,
12670 (int) (RExC_parse - start_digit), start_digit,
12674 /* Accumulate this (valid) digit into the running total */
12675 cp = (cp << 4) + READ_XDIGIT(RExC_parse);
12677 /* READ_XDIGIT advanced the input pointer. Ignore a single
12678 * underscore separator */
12679 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
12682 } while (isXDIGIT(*RExC_parse));
12684 /* Here, have accumulated the next code point */
12685 if (RExC_parse >= endbrace) { /* If done ... */
12690 /* Here, is a single code point; fail if doesn't want that */
12691 if (! code_point_p) {
12696 /* A single code point is easy to handle; just return it */
12697 *code_point_p = UNI_TO_NATIVE(cp);
12698 RExC_parse = endbrace;
12699 nextchar(pRExC_state);
12703 /* Here, the only legal thing would be a multiple character
12704 * sequence (of the form "\N{U+c1.c2. ... }". So the next
12705 * character must be a dot (and the one after that can't be the
12706 * endbrace, or we'd have something like \N{U+100.} ) */
12707 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
12708 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
12709 ? UTF8SKIP(RExC_parse)
12711 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
12712 RExC_parse = endbrace;
12714 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12717 /* Here, looks like its really a multiple character sequence. Fail
12718 * if that's not what the caller wants. But continue with counting
12719 * and error checking if they still want a count */
12720 if (! node_p && ! cp_count) {
12724 /* What is done here is to convert this to a sub-pattern of the
12725 * form \x{char1}\x{char2}... and then call reg recursively to
12726 * parse it (enclosing in "(?: ... )" ). That way, it retains its
12727 * atomicness, while not having to worry about special handling
12728 * that some code points may have. We don't create a subpattern,
12729 * but go through the motions of code point counting and error
12730 * checking, if the caller doesn't want a node returned. */
12732 if (node_p && count == 1) {
12733 substitute_parse = newSVpvs("?:");
12739 /* Convert to notation the rest of the code understands */
12740 sv_catpvs(substitute_parse, "\\x{");
12741 sv_catpvn(substitute_parse, start_digit,
12742 RExC_parse - start_digit);
12743 sv_catpvs(substitute_parse, "}");
12746 /* Move to after the dot (or ending brace the final time through.)
12751 } while (RExC_parse < endbrace);
12753 if (! node_p) { /* Doesn't want the node */
12760 sv_catpvs(substitute_parse, ")");
12763 /* The values are Unicode, and therefore have to be converted to native
12764 * on a non-Unicode (meaning non-ASCII) platform. */
12765 RExC_recode_x_to_native = 1;
12768 /* Here, we have the string the name evaluates to, ready to be parsed,
12769 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
12770 * constructs. This can be called from within a substitute parse already.
12771 * The error reporting mechanism doesn't work for 2 levels of this, but the
12772 * code above has validated this new construct, so there should be no
12773 * errors generated by the below. And this isn' an exact copy, so the
12774 * mechanism to seamlessly deal with this won't work, so turn off warnings
12776 save_start = RExC_start;
12777 orig_end = RExC_end;
12779 RExC_parse = RExC_start = SvPVX(substitute_parse);
12780 RExC_end = RExC_parse + SvCUR(substitute_parse);
12781 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
12783 *node_p = reg(pRExC_state, 1, &flags, depth+1);
12785 /* Restore the saved values */
12787 RExC_start = save_start;
12788 RExC_parse = endbrace;
12789 RExC_end = orig_end;
12791 RExC_recode_x_to_native = 0;
12794 SvREFCNT_dec_NN(substitute_parse);
12797 RETURN_FAIL_ON_RESTART(flags, flagp);
12798 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
12801 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
12803 nextchar(pRExC_state);
12809 PERL_STATIC_INLINE U8
12810 S_compute_EXACTish(RExC_state_t *pRExC_state)
12814 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
12822 op = get_regex_charset(RExC_flags);
12823 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
12824 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
12825 been, so there is no hole */
12828 return op + EXACTF;
12831 PERL_STATIC_INLINE void
12832 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
12833 regnode_offset node, I32* flagp, STRLEN len,
12834 UV code_point, bool downgradable)
12836 /* This knows the details about sizing an EXACTish node, setting flags for
12837 * it (by setting <*flagp>, and potentially populating it with a single
12840 * If <len> (the length in bytes) is non-zero, this function assumes that
12841 * the node has already been populated, and just does the sizing. In this
12842 * case <code_point> should be the final code point that has already been
12843 * placed into the node. This value will be ignored except that under some
12844 * circumstances <*flagp> is set based on it.
12846 * If <len> is zero, the function assumes that the node is to contain only
12847 * the single character given by <code_point> and calculates what <len>
12848 * should be. It populates the node's STRING with <code_point> or its
12851 * In both cases <*flagp> is appropriately set
12853 * It knows that under FOLD, the Latin Sharp S and UTF characters above
12854 * 255, must be folded (the former only when the rules indicate it can
12857 * When it does the populating, it looks at the flag 'downgradable'. If
12858 * true with a node that folds, it checks if the single code point
12859 * participates in a fold, and if not downgrades the node to an EXACT.
12860 * This helps the optimizer */
12862 bool len_passed_in = cBOOL(len != 0);
12863 U8 character[UTF8_MAXBYTES_CASE+1];
12865 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
12867 if (! len_passed_in) {
12869 if (UVCHR_IS_INVARIANT(code_point)) {
12870 if (LOC || ! FOLD) { /* /l defers folding until runtime */
12871 *character = (U8) code_point;
12873 else { /* Here is /i and not /l. */
12874 *character = toFOLD((U8) code_point);
12876 /* We can downgrade to an EXACT node if this character
12877 * isn't a folding one. Note that this assumes that
12878 * nothing above Latin1 folds to some other invariant than
12879 * one of these alphabetics; otherwise we would also have
12881 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12882 * || ASCII_FOLD_RESTRICTED))
12884 if (downgradable && PL_fold[code_point] == code_point) {
12885 OP(REGNODE_p(node)) = EXACT;
12890 else if (FOLD && ( ! LOC
12891 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
12892 { /* Folding, and ok to do so now */
12893 UV folded = _to_uni_fold_flags(
12897 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12898 ? FOLD_FLAGS_NOMIX_ASCII
12901 && folded == code_point /* This quickly rules out many
12902 cases, avoiding the
12903 _invlist_contains_cp() overhead
12905 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
12907 OP(REGNODE_p(node)) = (LOC)
12912 else if (code_point <= MAX_UTF8_TWO_BYTE) {
12914 /* Not folding this cp, and can output it directly */
12915 *character = UTF8_TWO_BYTE_HI(code_point);
12916 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
12920 uvchr_to_utf8( character, code_point);
12921 len = UTF8SKIP(character);
12923 } /* Else pattern isn't UTF8. */
12925 *character = (U8) code_point;
12927 } /* Else is folded non-UTF8 */
12928 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12929 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12930 || UNICODE_DOT_DOT_VERSION > 0)
12931 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
12935 /* We don't fold any non-UTF8 except possibly the Sharp s (see
12936 * comments at join_exact()); */
12937 *character = (U8) code_point;
12940 /* Can turn into an EXACT node if we know the fold at compile time,
12941 * and it folds to itself and doesn't particpate in other folds */
12944 && PL_fold_latin1[code_point] == code_point
12945 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
12946 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
12948 OP(REGNODE_p(node)) = EXACT;
12950 } /* else is Sharp s. May need to fold it */
12951 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
12953 *(character + 1) = 's';
12957 *character = LATIN_SMALL_LETTER_SHARP_S;
12962 if (downgradable) {
12963 change_engine_size(pRExC_state, STR_SZ(len));
12966 RExC_emit += STR_SZ(len);
12967 STR_LEN(REGNODE_p(node)) = len;
12968 if (! len_passed_in) {
12969 Copy((char *) character, STRING(REGNODE_p(node)), len, char);
12972 *flagp |= HASWIDTH;
12974 /* A single character node is SIMPLE, except for the special-cased SHARP S
12976 if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
12977 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
12978 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
12979 || UNICODE_DOT_DOT_VERSION > 0)
12980 && ( code_point != LATIN_SMALL_LETTER_SHARP_S
12981 || ! FOLD || ! DEPENDS_SEMANTICS)
12987 if (OP(REGNODE_p(node)) == EXACTFL) {
12988 RExC_contains_locale = 1;
12993 S_new_regcurly(const char *s, const char *e)
12995 /* This is a temporary function designed to match the most lenient form of
12996 * a {m,n} quantifier we ever envision, with either number omitted, and
12997 * spaces anywhere between/before/after them.
12999 * If this function fails, then the string it matches is very unlikely to
13000 * ever be considered a valid quantifier, so we can allow the '{' that
13001 * begins it to be considered as a literal */
13003 bool has_min = FALSE;
13004 bool has_max = FALSE;
13006 PERL_ARGS_ASSERT_NEW_REGCURLY;
13008 if (s >= e || *s++ != '{')
13011 while (s < e && isSPACE(*s)) {
13014 while (s < e && isDIGIT(*s)) {
13018 while (s < e && isSPACE(*s)) {
13024 while (s < e && isSPACE(*s)) {
13027 while (s < e && isDIGIT(*s)) {
13031 while (s < e && isSPACE(*s)) {
13036 return s < e && *s == '}' && (has_min || has_max);
13039 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13040 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13043 S_backref_value(char *p, char *e)
13045 const char* endptr = e;
13047 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13054 - regatom - the lowest level
13056 Try to identify anything special at the start of the current parse position.
13057 If there is, then handle it as required. This may involve generating a
13058 single regop, such as for an assertion; or it may involve recursing, such as
13059 to handle a () structure.
13061 If the string doesn't start with something special then we gobble up
13062 as much literal text as we can. If we encounter a quantifier, we have to
13063 back off the final literal character, as that quantifier applies to just it
13064 and not to the whole string of literals.
13066 Once we have been able to handle whatever type of thing started the
13067 sequence, we return the offset into the regex engine program being compiled
13068 at which any next regnode should be placed.
13070 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13071 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13072 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13073 Otherwise does not return 0.
13075 Note: we have to be careful with escapes, as they can be both literal
13076 and special, and in the case of \10 and friends, context determines which.
13078 A summary of the code structure is:
13080 switch (first_byte) {
13081 cases for each special:
13082 handle this special;
13085 switch (2nd byte) {
13086 cases for each unambiguous special:
13087 handle this special;
13089 cases for each ambigous special/literal:
13091 if (special) handle here
13093 default: // unambiguously literal:
13096 default: // is a literal char
13099 create EXACTish node for literal;
13100 while (more input and node isn't full) {
13101 switch (input_byte) {
13102 cases for each special;
13103 make sure parse pointer is set so that the next call to
13104 regatom will see this special first
13105 goto loopdone; // EXACTish node terminated by prev. char
13107 append char to EXACTISH node;
13109 get next input byte;
13113 return the generated node;
13115 Specifically there are two separate switches for handling
13116 escape sequences, with the one for handling literal escapes requiring
13117 a dummy entry for all of the special escapes that are actually handled
13122 STATIC regnode_offset
13123 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13125 regnode_offset ret = 0;
13132 GET_RE_DEBUG_FLAGS_DECL;
13134 *flagp = WORST; /* Tentatively. */
13136 DEBUG_PARSE("atom");
13138 PERL_ARGS_ASSERT_REGATOM;
13141 parse_start = RExC_parse;
13142 assert(RExC_parse < RExC_end);
13143 switch ((U8)*RExC_parse) {
13145 RExC_seen_zerolen++;
13146 nextchar(pRExC_state);
13147 if (RExC_flags & RXf_PMf_MULTILINE)
13148 ret = reg_node(pRExC_state, MBOL);
13150 ret = reg_node(pRExC_state, SBOL);
13151 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13154 nextchar(pRExC_state);
13156 RExC_seen_zerolen++;
13157 if (RExC_flags & RXf_PMf_MULTILINE)
13158 ret = reg_node(pRExC_state, MEOL);
13160 ret = reg_node(pRExC_state, SEOL);
13161 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13164 nextchar(pRExC_state);
13165 if (RExC_flags & RXf_PMf_SINGLELINE)
13166 ret = reg_node(pRExC_state, SANY);
13168 ret = reg_node(pRExC_state, REG_ANY);
13169 *flagp |= HASWIDTH|SIMPLE;
13171 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13175 char * const oregcomp_parse = ++RExC_parse;
13176 ret = regclass(pRExC_state, flagp, depth+1,
13177 FALSE, /* means parse the whole char class */
13178 TRUE, /* allow multi-char folds */
13179 FALSE, /* don't silence non-portable warnings. */
13180 (bool) RExC_strict,
13181 TRUE, /* Allow an optimized regnode result */
13184 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13185 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13188 if (*RExC_parse != ']') {
13189 RExC_parse = oregcomp_parse;
13190 vFAIL("Unmatched [");
13192 nextchar(pRExC_state);
13193 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13197 nextchar(pRExC_state);
13198 ret = reg(pRExC_state, 2, &flags, depth+1);
13200 if (flags & TRYAGAIN) {
13201 if (RExC_parse >= RExC_end) {
13202 /* Make parent create an empty node if needed. */
13203 *flagp |= TRYAGAIN;
13208 RETURN_FAIL_ON_RESTART(flags, flagp);
13209 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13212 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13216 if (flags & TRYAGAIN) {
13217 *flagp |= TRYAGAIN;
13220 vFAIL("Internal urp");
13221 /* Supposed to be caught earlier. */
13227 vFAIL("Quantifier follows nothing");
13232 This switch handles escape sequences that resolve to some kind
13233 of special regop and not to literal text. Escape sequences that
13234 resolve to literal text are handled below in the switch marked
13237 Every entry in this switch *must* have a corresponding entry
13238 in the literal escape switch. However, the opposite is not
13239 required, as the default for this switch is to jump to the
13240 literal text handling code.
13243 switch ((U8)*RExC_parse) {
13244 /* Special Escapes */
13246 RExC_seen_zerolen++;
13247 ret = reg_node(pRExC_state, SBOL);
13248 /* SBOL is shared with /^/ so we set the flags so we can tell
13249 * /\A/ from /^/ in split. */
13250 FLAGS(REGNODE_p(ret)) = 1;
13252 goto finish_meta_pat;
13254 ret = reg_node(pRExC_state, GPOS);
13255 RExC_seen |= REG_GPOS_SEEN;
13257 goto finish_meta_pat;
13259 RExC_seen_zerolen++;
13260 ret = reg_node(pRExC_state, KEEPS);
13262 /* XXX:dmq : disabling in-place substitution seems to
13263 * be necessary here to avoid cases of memory corruption, as
13264 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13266 RExC_seen |= REG_LOOKBEHIND_SEEN;
13267 goto finish_meta_pat;
13269 ret = reg_node(pRExC_state, SEOL);
13271 RExC_seen_zerolen++; /* Do not optimize RE away */
13272 goto finish_meta_pat;
13274 ret = reg_node(pRExC_state, EOS);
13276 RExC_seen_zerolen++; /* Do not optimize RE away */
13277 goto finish_meta_pat;
13279 vFAIL("\\C no longer supported");
13281 ret = reg_node(pRExC_state, CLUMP);
13282 *flagp |= HASWIDTH;
13283 goto finish_meta_pat;
13289 arg = ANYOF_WORDCHAR;
13297 regex_charset charset = get_regex_charset(RExC_flags);
13299 RExC_seen_zerolen++;
13300 RExC_seen |= REG_LOOKBEHIND_SEEN;
13301 op = BOUND + charset;
13304 RExC_seen_d_op = TRUE;
13306 else if (op == BOUNDL) {
13307 RExC_contains_locale = 1;
13310 ret = reg_node(pRExC_state, op);
13312 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13313 FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND;
13314 if (op > BOUNDA) { /* /aa is same as /a */
13315 OP(REGNODE_p(ret)) = BOUNDA;
13320 char name = *RExC_parse;
13321 char * endbrace = NULL;
13323 if (RExC_parse < RExC_end) {
13324 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13328 vFAIL2("Missing right brace on \\%c{}", name);
13330 /* XXX Need to decide whether to take spaces or not. Should be
13331 * consistent with \p{}, but that currently is SPACE, which
13332 * means vertical too, which seems wrong
13333 * while (isBLANK(*RExC_parse)) {
13336 if (endbrace == RExC_parse) {
13337 RExC_parse++; /* After the '}' */
13338 vFAIL2("Empty \\%c{}", name);
13340 length = endbrace - RExC_parse;
13341 /*while (isBLANK(*(RExC_parse + length - 1))) {
13344 switch (*RExC_parse) {
13347 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13349 goto bad_bound_type;
13351 FLAGS(REGNODE_p(ret)) = GCB_BOUND;
13354 if (length != 2 || *(RExC_parse + 1) != 'b') {
13355 goto bad_bound_type;
13357 FLAGS(REGNODE_p(ret)) = LB_BOUND;
13360 if (length != 2 || *(RExC_parse + 1) != 'b') {
13361 goto bad_bound_type;
13363 FLAGS(REGNODE_p(ret)) = SB_BOUND;
13366 if (length != 2 || *(RExC_parse + 1) != 'b') {
13367 goto bad_bound_type;
13369 FLAGS(REGNODE_p(ret)) = WB_BOUND;
13373 RExC_parse = endbrace;
13375 "'%" UTF8f "' is an unknown bound type",
13376 UTF8fARG(UTF, length, endbrace - length));
13377 NOT_REACHED; /*NOTREACHED*/
13379 RExC_parse = endbrace;
13380 REQUIRE_UNI_RULES(flagp, 0);
13382 if (op >= BOUNDA) { /* /aa is same as /a */
13383 OP(REGNODE_p(ret)) = BOUNDU;
13386 /* Don't have to worry about UTF-8, in this message because
13387 * to get here the contents of the \b must be ASCII */
13388 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13389 "Using /u for '%.*s' instead of /%s",
13391 endbrace - length + 1,
13392 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13393 ? ASCII_RESTRICT_PAT_MODS
13394 : ASCII_MORE_RESTRICT_PAT_MODS);
13399 OP(REGNODE_p(ret)) += NBOUND - BOUND;
13401 goto finish_meta_pat;
13409 if (! DEPENDS_SEMANTICS) {
13413 /* \d doesn't have any matches in the upper Latin1 range, hence /d
13414 * is equivalent to /u. Changing to /u saves some branches at
13417 goto join_posix_op_known;
13420 ret = reg_node(pRExC_state, LNBREAK);
13421 *flagp |= HASWIDTH|SIMPLE;
13422 goto finish_meta_pat;
13430 goto join_posix_op_known;
13436 arg = ANYOF_VERTWS;
13438 goto join_posix_op_known;
13448 op = POSIXD + get_regex_charset(RExC_flags);
13449 if (op > POSIXA) { /* /aa is same as /a */
13452 else if (op == POSIXL) {
13453 RExC_contains_locale = 1;
13455 else if (op == POSIXD) {
13456 RExC_seen_d_op = TRUE;
13459 join_posix_op_known:
13462 op += NPOSIXD - POSIXD;
13465 ret = reg_node(pRExC_state, op);
13466 FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
13468 *flagp |= HASWIDTH|SIMPLE;
13472 if ( UCHARAT(RExC_parse + 1) == '{'
13473 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13476 vFAIL("Unescaped left brace in regex is illegal here");
13478 nextchar(pRExC_state);
13479 Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
13485 ret = regclass(pRExC_state, flagp, depth+1,
13486 TRUE, /* means just parse this element */
13487 FALSE, /* don't allow multi-char folds */
13488 FALSE, /* don't silence non-portable warnings. It
13489 would be a bug if these returned
13491 (bool) RExC_strict,
13492 TRUE, /* Allow an optimized regnode result */
13494 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13495 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13496 * multi-char folds are allowed. */
13498 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13503 Set_Node_Offset(REGNODE_p(ret), parse_start);
13504 Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
13505 nextchar(pRExC_state);
13508 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13509 * \N{...} evaluates to a sequence of more than one code points).
13510 * The function call below returns a regnode, which is our result.
13511 * The parameters cause it to fail if the \N{} evaluates to a
13512 * single code point; we handle those like any other literal. The
13513 * reason that the multicharacter case is handled here and not as
13514 * part of the EXACtish code is because of quantifiers. In
13515 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13516 * this way makes that Just Happen. dmq.
13517 * join_exact() will join this up with adjacent EXACTish nodes
13518 * later on, if appropriate. */
13520 if (grok_bslash_N(pRExC_state,
13521 &ret, /* Want a regnode returned */
13522 NULL, /* Fail if evaluates to a single code
13524 NULL, /* Don't need a count of how many code
13533 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13535 /* Here, evaluates to a single code point. Go get that */
13536 RExC_parse = parse_start;
13539 case 'k': /* Handle \k<NAME> and \k'NAME' */
13543 if ( RExC_parse >= RExC_end - 1
13544 || (( ch = RExC_parse[1]) != '<'
13549 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13550 vFAIL2("Sequence %.2s... not terminated", parse_start);
13553 ret = handle_named_backref(pRExC_state,
13565 case '1': case '2': case '3': case '4':
13566 case '5': case '6': case '7': case '8': case '9':
13571 if (*RExC_parse == 'g') {
13575 if (*RExC_parse == '{') {
13579 if (*RExC_parse == '-') {
13583 if (hasbrace && !isDIGIT(*RExC_parse)) {
13584 if (isrel) RExC_parse--;
13586 goto parse_named_seq;
13589 if (RExC_parse >= RExC_end) {
13590 goto unterminated_g;
13592 num = S_backref_value(RExC_parse, RExC_end);
13594 vFAIL("Reference to invalid group 0");
13595 else if (num == I32_MAX) {
13596 if (isDIGIT(*RExC_parse))
13597 vFAIL("Reference to nonexistent group");
13600 vFAIL("Unterminated \\g... pattern");
13604 num = RExC_npar - num;
13606 vFAIL("Reference to nonexistent or unclosed group");
13610 num = S_backref_value(RExC_parse, RExC_end);
13611 /* bare \NNN might be backref or octal - if it is larger
13612 * than or equal RExC_npar then it is assumed to be an
13613 * octal escape. Note RExC_npar is +1 from the actual
13614 * number of parens. */
13615 /* Note we do NOT check if num == I32_MAX here, as that is
13616 * handled by the RExC_npar check */
13619 /* any numeric escape < 10 is always a backref */
13621 /* any numeric escape < RExC_npar is a backref */
13622 && num >= RExC_npar
13623 /* cannot be an octal escape if it starts with 8 */
13624 && *RExC_parse != '8'
13625 /* cannot be an octal escape it it starts with 9 */
13626 && *RExC_parse != '9'
13628 /* Probably not meant to be a backref, instead likely
13629 * to be an octal character escape, e.g. \35 or \777.
13630 * The above logic should make it obvious why using
13631 * octal escapes in patterns is problematic. - Yves */
13632 RExC_parse = parse_start;
13637 /* At this point RExC_parse points at a numeric escape like
13638 * \12 or \88 or something similar, which we should NOT treat
13639 * as an octal escape. It may or may not be a valid backref
13640 * escape. For instance \88888888 is unlikely to be a valid
13642 while (isDIGIT(*RExC_parse))
13645 if (*RExC_parse != '}')
13646 vFAIL("Unterminated \\g{...} pattern");
13649 if (num >= (I32)RExC_npar) {
13651 /* It might be a forward reference; we can't fail until we
13652 * know, by completing the parse to get all the groups, and
13653 * then reparsing */
13654 if (RExC_total_parens > 0) {
13655 if (num >= RExC_total_parens) {
13656 vFAIL("Reference to nonexistent group");
13660 REQUIRE_PARENS_PASS;
13664 ret = reganode(pRExC_state,
13667 : (ASCII_FOLD_RESTRICTED)
13669 : (AT_LEAST_UNI_SEMANTICS)
13675 if (OP(REGNODE_p(ret)) == REFF) {
13676 RExC_seen_d_op = TRUE;
13678 *flagp |= HASWIDTH;
13680 /* override incorrect value set in reganode MJD */
13681 Set_Node_Offset(REGNODE_p(ret), parse_start);
13682 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13683 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13684 FALSE /* Don't force to /x */ );
13688 if (RExC_parse >= RExC_end)
13689 FAIL("Trailing \\");
13692 /* Do not generate "unrecognized" warnings here, we fall
13693 back into the quick-grab loop below */
13694 RExC_parse = parse_start;
13696 } /* end of switch on a \foo sequence */
13701 /* '#' comments should have been spaced over before this function was
13703 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13705 if (RExC_flags & RXf_PMf_EXTENDED) {
13706 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13707 if (RExC_parse < RExC_end)
13717 /* Here, we have determined that the next thing is probably a
13718 * literal character. RExC_parse points to the first byte of its
13719 * definition. (It still may be an escape sequence that evaluates
13720 * to a single character) */
13727 /* This allows us to fill a node with just enough spare so that if the final
13728 * character folds, its expansion is guaranteed to fit */
13729 #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
13732 U8 upper_parse = MAX_NODE_STRING_SIZE;
13734 /* We start out as an EXACT node, even if under /i, until we find a
13735 * character which is in a fold. The algorithm now segregates into
13736 * separate nodes, characters that fold from those that don't under
13737 * /i. (This hopefully will create nodes that are fixed strings
13738 * even under /i, giving the optimizer something to grab on to.)
13739 * So, if a node has something in it and the next character is in
13740 * the opposite category, that node is closed up, and the function
13741 * returns. Then regatom is called again, and a new node is
13742 * created for the new category. */
13743 U8 node_type = EXACT;
13745 /* Assume the node will be fully used; the excess is given back at
13746 * the end. We can't make any other length assumptions, as a byte
13747 * input sequence could shrink down. */
13748 Ptrdiff_t initial_size = STR_SZ(256);
13750 bool next_is_quantifier;
13751 char * oldp = NULL;
13753 /* We can convert EXACTF nodes to EXACTFU if they contain only
13754 * characters that match identically regardless of the target
13755 * string's UTF8ness. The reason to do this is that EXACTF is not
13756 * trie-able, EXACTFU is.
13758 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13759 * contain only above-Latin1 characters (hence must be in UTF8),
13760 * which don't participate in folds with Latin1-range characters,
13761 * as the latter's folds aren't known until runtime. */
13762 bool maybe_exactfu = FOLD;
13764 /* Does this node contain something that can't match unless the
13765 * target string is (also) in UTF-8 */
13766 bool requires_utf8_target = FALSE;
13768 bool has_micro_sign = FALSE;
13770 /* Allocate an EXACT node. The node_type may change below to
13771 * another EXACTish node, but since the size of the node doesn't
13772 * change, it works */
13773 ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
13774 FILL_NODE(ret, node_type);
13777 s = STRING(REGNODE_p(ret));
13783 /* This breaks under rare circumstances. If folding, we do not
13784 * want to split a node at a character that is a non-final in a
13785 * multi-char fold, as an input string could just happen to want to
13786 * match across the node boundary. The code at the end of the loop
13787 * looks for this, and backs off until it finds not such a
13788 * character, but it is possible (though extremely, extremely
13789 * unlikely) for all characters in the node to be non-final fold
13790 * ones, in which case we just leave the node fully filled, and
13791 * hope that it doesn't match the string in just the wrong place */
13793 assert( ! UTF /* Is at the beginning of a character */
13794 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13795 || UTF8_IS_START(UCHARAT(RExC_parse)));
13798 /* Here, we have a literal character. Find the maximal string of
13799 * them in the input that we can fit into a single EXACTish node.
13800 * We quit at the first non-literal or when the node gets full, or
13801 * under /i the categorization of folding/non-folding character
13803 for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
13805 /* In most cases each iteration adds one byte to the output.
13806 * The exceptions override this */
13807 Size_t added_len = 1;
13811 /* White space has already been ignored */
13812 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
13813 || ! is_PATWS_safe((p), RExC_end, UTF));
13825 /* Literal Escapes Switch
13827 This switch is meant to handle escape sequences that
13828 resolve to a literal character.
13830 Every escape sequence that represents something
13831 else, like an assertion or a char class, is handled
13832 in the switch marked 'Special Escapes' above in this
13833 routine, but also has an entry here as anything that
13834 isn't explicitly mentioned here will be treated as
13835 an unescaped equivalent literal.
13838 switch ((U8)*++p) {
13840 /* These are all the special escapes. */
13841 case 'A': /* Start assertion */
13842 case 'b': case 'B': /* Word-boundary assertion*/
13843 case 'C': /* Single char !DANGEROUS! */
13844 case 'd': case 'D': /* digit class */
13845 case 'g': case 'G': /* generic-backref, pos assertion */
13846 case 'h': case 'H': /* HORIZWS */
13847 case 'k': case 'K': /* named backref, keep marker */
13848 case 'p': case 'P': /* Unicode property */
13849 case 'R': /* LNBREAK */
13850 case 's': case 'S': /* space class */
13851 case 'v': case 'V': /* VERTWS */
13852 case 'w': case 'W': /* word class */
13853 case 'X': /* eXtended Unicode "combining
13854 character sequence" */
13855 case 'z': case 'Z': /* End of line/string assertion */
13859 /* Anything after here is an escape that resolves to a
13860 literal. (Except digits, which may or may not)
13866 case 'N': /* Handle a single-code point named character. */
13867 RExC_parse = p + 1;
13868 if (! grok_bslash_N(pRExC_state,
13869 NULL, /* Fail if evaluates to
13870 anything other than a
13871 single code point */
13872 &ender, /* The returned single code
13874 NULL, /* Don't need a count of
13875 how many code points */
13880 if (*flagp & NEED_UTF8)
13881 FAIL("panic: grok_bslash_N set NEED_UTF8");
13882 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13884 /* Here, it wasn't a single code point. Go close
13885 * up this EXACTish node. The switch() prior to
13886 * this switch handles the other cases */
13887 RExC_parse = p = oldp;
13891 RExC_parse = parse_start;
13893 /* The \N{} means the pattern, if previously /d,
13894 * becomes /u. That means it can't be an EXACTF node,
13895 * but an EXACTFU */
13896 if (node_type == EXACTF) {
13897 node_type = EXACTFU;
13899 /* If the node already contains something that
13900 * differs between EXACTF and EXACTFU, reparse it
13902 if (! maybe_exactfu) {
13905 maybe_exactfu = FOLD; /* Prob. unnecessary */
13924 ender = ESC_NATIVE;
13934 const char* error_msg;
13936 bool valid = grok_bslash_o(&p,
13940 TO_OUTPUT_WARNINGS(p),
13941 (bool) RExC_strict,
13942 TRUE, /* Output warnings
13947 RExC_parse = p; /* going to die anyway; point
13948 to exact spot of failure */
13951 UPDATE_WARNINGS_LOC(p - 1);
13957 UV result = UV_MAX; /* initialize to erroneous
13959 const char* error_msg;
13961 bool valid = grok_bslash_x(&p,
13965 TO_OUTPUT_WARNINGS(p),
13966 (bool) RExC_strict,
13967 TRUE, /* Silence warnings
13972 RExC_parse = p; /* going to die anyway; point
13973 to exact spot of failure */
13976 UPDATE_WARNINGS_LOC(p - 1);
13979 if (ender < 0x100) {
13981 if (RExC_recode_x_to_native) {
13982 ender = LATIN1_TO_NATIVE(ender);
13990 ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
13991 UPDATE_WARNINGS_LOC(p);
13994 case '8': case '9': /* must be a backreference */
13996 /* we have an escape like \8 which cannot be an octal escape
13997 * so we exit the loop, and let the outer loop handle this
13998 * escape which may or may not be a legitimate backref. */
14000 case '1': case '2': case '3':case '4':
14001 case '5': case '6': case '7':
14002 /* When we parse backslash escapes there is ambiguity
14003 * between backreferences and octal escapes. Any escape
14004 * from \1 - \9 is a backreference, any multi-digit
14005 * escape which does not start with 0 and which when
14006 * evaluated as decimal could refer to an already
14007 * parsed capture buffer is a back reference. Anything
14010 * Note this implies that \118 could be interpreted as
14011 * 118 OR as "\11" . "8" depending on whether there
14012 * were 118 capture buffers defined already in the
14015 /* NOTE, RExC_npar is 1 more than the actual number of
14016 * parens we have seen so far, hence the "<" as opposed
14018 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14019 { /* Not to be treated as an octal constant, go
14027 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14029 ender = grok_oct(p, &numlen, &flags, NULL);
14031 if ( isDIGIT(*p) /* like \08, \178 */
14032 && ckWARN(WARN_REGEXP)
14035 reg_warn_non_literal_string(
14037 form_short_octal_warning(p, numlen));
14043 FAIL("Trailing \\");
14046 if (isALPHANUMERIC(*p)) {
14047 /* An alpha followed by '{' is going to fail next
14048 * iteration, so don't output this warning in that
14050 if (! isALPHA(*p) || *(p + 1) != '{') {
14051 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14052 " passed through", p);
14055 goto normal_default;
14056 } /* End of switch on '\' */
14059 /* Trying to gain new uses for '{' without breaking too
14060 * much existing code is hard. The solution currently
14062 * 1) If there is no ambiguity that a '{' should always
14063 * be taken literally, at the start of a construct, we
14065 * 2) If the literal '{' conflicts with our desired use
14066 * of it as a metacharacter, we die. The deprecation
14067 * cycles for this have come and gone.
14068 * 3) If there is ambiguity, we raise a simple warning.
14069 * This could happen, for example, if the user
14070 * intended it to introduce a quantifier, but slightly
14071 * misspelled the quantifier. Without this warning,
14072 * the quantifier would silently be taken as a literal
14073 * string of characters instead of a meta construct */
14074 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14076 || ( p > parse_start + 1
14077 && isALPHA_A(*(p - 1))
14078 && *(p - 2) == '\\')
14079 || new_regcurly(p, RExC_end))
14081 RExC_parse = p + 1;
14082 vFAIL("Unescaped left brace in regex is "
14085 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14086 " passed through");
14088 goto normal_default;
14091 if (p > RExC_parse && RExC_strict) {
14092 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14095 default: /* A literal character */
14097 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14099 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14100 &numlen, UTF8_ALLOW_DEFAULT);
14106 } /* End of switch on the literal */
14108 /* Here, have looked at the literal character, and <ender>
14109 * contains its ordinal; <p> points to the character after it.
14113 REQUIRE_UTF8(flagp);
14116 /* We need to check if the next non-ignored thing is a
14117 * quantifier. Move <p> to after anything that should be
14118 * ignored, which, as a side effect, positions <p> for the next
14119 * loop iteration */
14120 skip_to_be_ignored_text(pRExC_state, &p,
14121 FALSE /* Don't force to /x */ );
14123 /* If the next thing is a quantifier, it applies to this
14124 * character only, which means that this character has to be in
14125 * its own node and can't just be appended to the string in an
14126 * existing node, so if there are already other characters in
14127 * the node, close the node with just them, and set up to do
14128 * this character again next time through, when it will be the
14129 * only thing in its new node */
14131 next_is_quantifier = LIKELY(p < RExC_end)
14132 && UNLIKELY(ISMULT2(p));
14134 if (next_is_quantifier && LIKELY(len)) {
14139 /* Ready to add 'ender' to the node */
14141 if (! FOLD) { /* The simple case, just append the literal */
14144 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14145 *(s++) = (char) ender;
14148 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14149 added_len = (char *) new_s - s;
14150 s = (char *) new_s;
14153 requires_utf8_target = TRUE;
14157 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14159 /* Here are folding under /l, and the code point is
14160 * problematic. If this is the first character in the
14161 * node, change the node type to folding. Otherwise, if
14162 * this is the first problematic character, close up the
14163 * existing node, so can start a new node with this one */
14165 node_type = EXACTFL;
14167 else if (node_type == EXACT) {
14172 /* This code point means we can't simplify things */
14173 maybe_exactfu = FALSE;
14175 /* Here, we are adding a problematic fold character.
14176 * "Problematic" in this context means that its fold isn't
14177 * known until runtime. (The non-problematic code points
14178 * are the above-Latin1 ones that fold to also all
14179 * above-Latin1. Their folds don't vary no matter what the
14180 * locale is.) But here we have characters whose fold
14181 * depends on the locale. We just add in the unfolded
14182 * character, and wait until runtime to fold it */
14183 goto not_fold_common;
14185 else /* regular fold; see if actually is in a fold */
14186 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14188 && ! _invlist_contains_cp(PL_utf8_foldable, ender)))
14190 /* Here, folding, but the character isn't in a fold.
14192 * Start a new node if previous characters in the node were
14194 if (len && node_type != EXACT) {
14199 /* Here, continuing a node with non-folded characters. Add
14201 goto not_fold_common;
14203 else { /* Here, does participate in some fold */
14205 /* If this is the first character in the node, change its
14206 * type to folding. Otherwise, if this is the first
14207 * folding character in the node, close up the existing
14208 * node, so can start a new node with this one. */
14210 node_type = compute_EXACTish(pRExC_state);
14212 else if (node_type == EXACT) {
14217 if (UTF) { /* For UTF-8, we add the folded value */
14218 if (UVCHR_IS_INVARIANT(ender)) {
14219 *(s)++ = (U8) toFOLD(ender);
14222 ender = _to_uni_fold_flags(
14226 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14227 ? FOLD_FLAGS_NOMIX_ASCII
14232 requires_utf8_target = TRUE;
14233 if (UNLIKELY(ender == GREEK_SMALL_LETTER_MU)) {
14234 has_micro_sign = TRUE;
14241 /* Here is non-UTF8; we don't normally store the folded
14242 * value. First, see if the character's fold differs
14243 * between /d and /u. */
14244 if (PL_fold[ender] != PL_fold_latin1[ender]) {
14245 maybe_exactfu = FALSE;
14248 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14249 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14250 || UNICODE_DOT_DOT_VERSION > 0)
14252 /* On non-ancient Unicode versions, this includes the
14253 * multi-char fold SHARP S to 'ss' */
14255 else if (UNLIKELY( ender == LATIN_SMALL_LETTER_SHARP_S
14257 && isALPHA_FOLD_EQ(ender, 's')
14258 && isALPHA_FOLD_EQ(*(s-1), 's'))))
14261 if (node_type == EXACTFU) {
14262 /* See comments for join_exact() as to why we
14263 * fold this non-UTF at compile time */
14264 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14267 /* Let the code below add in the extra 's' */
14273 maybe_exactfu = FALSE;
14278 else if (UNLIKELY(ender == MICRO_SIGN)) {
14279 has_micro_sign = TRUE;
14282 /* Even when folding, we store just the input
14283 * character, as we have an array that finds its fold
14285 *(s++) = (char) ender;
14287 } /* End of adding current character to the node */
14291 if (next_is_quantifier) {
14293 /* Here, the next input is a quantifier, and to get here,
14294 * the current character is the only one in the node. */
14298 } /* End of loop through literal characters */
14300 /* Here we have either exhausted the input or ran out of room in
14301 * the node. (If we encountered a character that can't be in the
14302 * node, transfer is made directly to <loopdone>, and so we
14303 * wouldn't have fallen off the end of the loop.) In the latter
14304 * case, we artificially have to split the node into two, because
14305 * we just don't have enough space to hold everything. This
14306 * creates a problem if the final character participates in a
14307 * multi-character fold in the non-final position, as a match that
14308 * should have occurred won't, due to the way nodes are matched,
14309 * and our artificial boundary. So back off until we find a non-
14310 * problematic character -- one that isn't at the beginning or
14311 * middle of such a fold. (Either it doesn't participate in any
14312 * folds, or appears only in the final position of all the folds it
14313 * does participate in.) A better solution with far fewer false
14314 * positives, and that would fill the nodes more completely, would
14315 * be to actually have available all the multi-character folds to
14316 * test against, and to back-off only far enough to be sure that
14317 * this node isn't ending with a partial one. <upper_parse> is set
14318 * further below (if we need to reparse the node) to include just
14319 * up through that final non-problematic character that this code
14320 * identifies, so when it is set to less than the full node, we can
14321 * skip the rest of this */
14322 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
14324 const STRLEN full_len = len;
14326 assert(len >= MAX_NODE_STRING_SIZE);
14328 /* Here, <s> points to the final byte of the final character.
14329 * Look backwards through the string until find a non-
14330 * problematic character */
14334 /* This has no multi-char folds to non-UTF characters */
14335 if (ASCII_FOLD_RESTRICTED) {
14339 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
14344 /* Point to the first byte of the final character */
14345 s = (char *) utf8_hop((U8 *) s, -1);
14347 while (s >= s0) { /* Search backwards until find
14348 a non-problematic char */
14349 if (UTF8_IS_INVARIANT(*s)) {
14351 /* There are no ascii characters that participate
14352 * in multi-char folds under /aa. In EBCDIC, the
14353 * non-ascii invariants are all control characters,
14354 * so don't ever participate in any folds. */
14355 if (ASCII_FOLD_RESTRICTED
14356 || ! IS_NON_FINAL_FOLD(*s))
14361 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
14362 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
14368 else if (! _invlist_contains_cp(
14369 PL_NonL1NonFinalFold,
14370 valid_utf8_to_uvchr((U8 *) s, NULL)))
14375 /* Here, the current character is problematic in that
14376 * it does occur in the non-final position of some
14377 * fold, so try the character before it, but have to
14378 * special case the very first byte in the string, so
14379 * we don't read outside the string */
14380 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
14381 } /* End of loop backwards through the string */
14383 /* If there were only problematic characters in the string,
14384 * <s> will point to before s0, in which case the length
14385 * should be 0, otherwise include the length of the
14386 * non-problematic character just found */
14387 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
14390 /* Here, have found the final character, if any, that is
14391 * non-problematic as far as ending the node without splitting
14392 * it across a potential multi-char fold. <len> contains the
14393 * number of bytes in the node up-to and including that
14394 * character, or is 0 if there is no such character, meaning
14395 * the whole node contains only problematic characters. In
14396 * this case, give up and just take the node as-is. We can't
14401 /* If the node ends in an 's' we make sure it stays EXACTF,
14402 * as if it turns into an EXACTFU, it could later get
14403 * joined with another 's' that would then wrongly match
14405 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
14407 maybe_exactfu = FALSE;
14411 /* Here, the node does contain some characters that aren't
14412 * problematic. If one such is the final character in the
14413 * node, we are done */
14414 if (len == full_len) {
14417 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
14419 /* If the final character is problematic, but the
14420 * penultimate is not, back-off that last character to
14421 * later start a new node with it */
14426 /* Here, the final non-problematic character is earlier
14427 * in the input than the penultimate character. What we do
14428 * is reparse from the beginning, going up only as far as
14429 * this final ok one, thus guaranteeing that the node ends
14430 * in an acceptable character. The reason we reparse is
14431 * that we know how far in the character is, but we don't
14432 * know how to correlate its position with the input parse.
14433 * An alternate implementation would be to build that
14434 * correlation as we go along during the original parse,
14435 * but that would entail extra work for every node, whereas
14436 * this code gets executed only when the string is too
14437 * large for the node, and the final two characters are
14438 * problematic, an infrequent occurrence. Yet another
14439 * possible strategy would be to save the tail of the
14440 * string, and the next time regatom is called, initialize
14441 * with that. The problem with this is that unless you
14442 * back off one more character, you won't be guaranteed
14443 * regatom will get called again, unless regbranch,
14444 * regpiece ... are also changed. If you do back off that
14445 * extra character, so that there is input guaranteed to
14446 * force calling regatom, you can't handle the case where
14447 * just the first character in the node is acceptable. I
14448 * (khw) decided to try this method which doesn't have that
14449 * pitfall; if performance issues are found, we can do a
14450 * combination of the current approach plus that one */
14456 } /* End of verifying node ends with an appropriate char */
14458 loopdone: /* Jumped to when encounters something that shouldn't be
14461 /* Free up any over-allocated space */
14462 change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
14464 /* I (khw) don't know if you can get here with zero length, but the
14465 * old code handled this situation by creating a zero-length EXACT
14466 * node. Might as well be NOTHING instead */
14468 OP(REGNODE_p(ret)) = NOTHING;
14471 OP(REGNODE_p(ret)) = node_type;
14473 /* If the node type is EXACT here, check to see if it
14474 * should be EXACTL, or EXACT_ONLY8. */
14475 if (node_type == EXACT) {
14477 OP(REGNODE_p(ret)) = EXACTL;
14479 else if (requires_utf8_target) {
14480 OP(REGNODE_p(ret)) = EXACT_ONLY8;
14485 /* If 'maybe_exactfu' is set, then there are no code points
14486 * that match differently depending on UTF8ness of the
14487 * target string (for /u), or depending on locale for /l */
14488 if (maybe_exactfu) {
14489 if (node_type == EXACTF) {
14490 OP(REGNODE_p(ret)) = EXACTFU;
14492 else if (node_type == EXACTFL) {
14493 OP(REGNODE_p(ret)) = EXACTFLU8;
14496 else if (node_type == EXACTF) {
14497 RExC_seen_d_op = TRUE;
14500 /* The micro sign is the only below 256 character that
14501 * folds to above 255 */
14502 if ( OP(REGNODE_p(ret)) == EXACTFU
14503 && requires_utf8_target
14504 && LIKELY(! has_micro_sign))
14506 OP(REGNODE_p(ret)) = EXACTFU_ONLY8;
14511 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len,
14512 UV_MAX, /* unused here */
14513 FALSE /* Don't look to see if could
14514 be turned into an EXACT
14515 node, as we have already
14520 RExC_parse = p - 1;
14521 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
14524 /* len is STRLEN which is unsigned, need to copy to signed */
14527 vFAIL("Internal disaster");
14530 } /* End of label 'defchar:' */
14532 } /* End of giant switch on input character */
14534 /* Position parse to next real character */
14535 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14536 FALSE /* Don't force to /x */ );
14537 if ( *RExC_parse == '{'
14538 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
14540 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
14542 vFAIL("Unescaped left brace in regex is illegal here");
14544 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
14545 " passed through");
14553 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14555 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
14556 * sets up the bitmap and any flags, removing those code points from the
14557 * inversion list, setting it to NULL should it become completely empty */
14559 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14560 assert(PL_regkind[OP(node)] == ANYOF);
14562 ANYOF_BITMAP_ZERO(node);
14563 if (*invlist_ptr) {
14565 /* This gets set if we actually need to modify things */
14566 bool change_invlist = FALSE;
14570 /* Start looking through *invlist_ptr */
14571 invlist_iterinit(*invlist_ptr);
14572 while (invlist_iternext(*invlist_ptr, &start, &end)) {
14576 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14577 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14580 /* Quit if are above what we should change */
14581 if (start >= NUM_ANYOF_CODE_POINTS) {
14585 change_invlist = TRUE;
14587 /* Set all the bits in the range, up to the max that we are doing */
14588 high = (end < NUM_ANYOF_CODE_POINTS - 1)
14590 : NUM_ANYOF_CODE_POINTS - 1;
14591 for (i = start; i <= (int) high; i++) {
14592 if (! ANYOF_BITMAP_TEST(node, i)) {
14593 ANYOF_BITMAP_SET(node, i);
14597 invlist_iterfinish(*invlist_ptr);
14599 /* Done with loop; remove any code points that are in the bitmap from
14600 * *invlist_ptr; similarly for code points above the bitmap if we have
14601 * a flag to match all of them anyways */
14602 if (change_invlist) {
14603 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14605 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14606 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14609 /* If have completely emptied it, remove it completely */
14610 if (_invlist_len(*invlist_ptr) == 0) {
14611 SvREFCNT_dec_NN(*invlist_ptr);
14612 *invlist_ptr = NULL;
14617 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14618 Character classes ([:foo:]) can also be negated ([:^foo:]).
14619 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14620 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14621 but trigger failures because they are currently unimplemented. */
14623 #define POSIXCC_DONE(c) ((c) == ':')
14624 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14625 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14626 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14628 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
14629 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
14630 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
14632 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14634 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14636 #define ADD_POSIX_WARNING(p, text) STMT_START { \
14637 if (posix_warnings) { \
14638 if (! RExC_warn_text ) RExC_warn_text = \
14639 (AV *) sv_2mortal((SV *) newAV()); \
14640 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
14644 REPORT_LOCATION_ARGS(p))); \
14647 #define CLEAR_POSIX_WARNINGS() \
14649 if (posix_warnings && RExC_warn_text) \
14650 av_clear(RExC_warn_text); \
14653 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
14655 CLEAR_POSIX_WARNINGS(); \
14660 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14662 const char * const s, /* Where the putative posix class begins.
14663 Normally, this is one past the '['. This
14664 parameter exists so it can be somewhere
14665 besides RExC_parse. */
14666 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14668 AV ** posix_warnings, /* Where to place any generated warnings, or
14670 const bool check_only /* Don't die if error */
14673 /* This parses what the caller thinks may be one of the three POSIX
14675 * 1) a character class, like [:blank:]
14676 * 2) a collating symbol, like [. .]
14677 * 3) an equivalence class, like [= =]
14678 * In the latter two cases, it croaks if it finds a syntactically legal
14679 * one, as these are not handled by Perl.
14681 * The main purpose is to look for a POSIX character class. It returns:
14682 * a) the class number
14683 * if it is a completely syntactically and semantically legal class.
14684 * 'updated_parse_ptr', if not NULL, is set to point to just after the
14685 * closing ']' of the class
14686 * b) OOB_NAMEDCLASS
14687 * if it appears that one of the three POSIX constructs was meant, but
14688 * its specification was somehow defective. 'updated_parse_ptr', if
14689 * not NULL, is set to point to the character just after the end
14690 * character of the class. See below for handling of warnings.
14691 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
14692 * if it doesn't appear that a POSIX construct was intended.
14693 * 'updated_parse_ptr' is not changed. No warnings nor errors are
14696 * In b) there may be errors or warnings generated. If 'check_only' is
14697 * TRUE, then any errors are discarded. Warnings are returned to the
14698 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
14699 * instead it is NULL, warnings are suppressed.
14701 * The reason for this function, and its complexity is that a bracketed
14702 * character class can contain just about anything. But it's easy to
14703 * mistype the very specific posix class syntax but yielding a valid
14704 * regular bracketed class, so it silently gets compiled into something
14705 * quite unintended.
14707 * The solution adopted here maintains backward compatibility except that
14708 * it adds a warning if it looks like a posix class was intended but
14709 * improperly specified. The warning is not raised unless what is input
14710 * very closely resembles one of the 14 legal posix classes. To do this,
14711 * it uses fuzzy parsing. It calculates how many single-character edits it
14712 * would take to transform what was input into a legal posix class. Only
14713 * if that number is quite small does it think that the intention was a
14714 * posix class. Obviously these are heuristics, and there will be cases
14715 * where it errs on one side or another, and they can be tweaked as
14716 * experience informs.
14718 * The syntax for a legal posix class is:
14720 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
14722 * What this routine considers syntactically to be an intended posix class
14723 * is this (the comments indicate some restrictions that the pattern
14726 * qr/(?x: \[? # The left bracket, possibly
14728 * \h* # possibly followed by blanks
14729 * (?: \^ \h* )? # possibly a misplaced caret
14730 * [:;]? # The opening class character,
14731 * # possibly omitted. A typo
14732 * # semi-colon can also be used.
14734 * \^? # possibly a correctly placed
14735 * # caret, but not if there was also
14736 * # a misplaced one
14738 * .{3,15} # The class name. If there are
14739 * # deviations from the legal syntax,
14740 * # its edit distance must be close
14741 * # to a real class name in order
14742 * # for it to be considered to be
14743 * # an intended posix class.
14745 * [[:punct:]]? # The closing class character,
14746 * # possibly omitted. If not a colon
14747 * # nor semi colon, the class name
14748 * # must be even closer to a valid
14751 * \]? # The right bracket, possibly
14755 * In the above, \h must be ASCII-only.
14757 * These are heuristics, and can be tweaked as field experience dictates.
14758 * There will be cases when someone didn't intend to specify a posix class
14759 * that this warns as being so. The goal is to minimize these, while
14760 * maximizing the catching of things intended to be a posix class that
14761 * aren't parsed as such.
14765 const char * const e = RExC_end;
14766 unsigned complement = 0; /* If to complement the class */
14767 bool found_problem = FALSE; /* Assume OK until proven otherwise */
14768 bool has_opening_bracket = FALSE;
14769 bool has_opening_colon = FALSE;
14770 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
14772 const char * possible_end = NULL; /* used for a 2nd parse pass */
14773 const char* name_start; /* ptr to class name first char */
14775 /* If the number of single-character typos the input name is away from a
14776 * legal name is no more than this number, it is considered to have meant
14777 * the legal name */
14778 int max_distance = 2;
14780 /* to store the name. The size determines the maximum length before we
14781 * decide that no posix class was intended. Should be at least
14782 * sizeof("alphanumeric") */
14784 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
14786 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
14788 CLEAR_POSIX_WARNINGS();
14791 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
14794 if (*(p - 1) != '[') {
14795 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
14796 found_problem = TRUE;
14799 has_opening_bracket = TRUE;
14802 /* They could be confused and think you can put spaces between the
14805 found_problem = TRUE;
14809 } while (p < e && isBLANK(*p));
14811 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14814 /* For [. .] and [= =]. These are quite different internally from [: :],
14815 * so they are handled separately. */
14816 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
14817 and 1 for at least one char in it
14820 const char open_char = *p;
14821 const char * temp_ptr = p + 1;
14823 /* These two constructs are not handled by perl, and if we find a
14824 * syntactically valid one, we croak. khw, who wrote this code, finds
14825 * this explanation of them very unclear:
14826 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
14827 * And searching the rest of the internet wasn't very helpful either.
14828 * It looks like just about any byte can be in these constructs,
14829 * depending on the locale. But unless the pattern is being compiled
14830 * under /l, which is very rare, Perl runs under the C or POSIX locale.
14831 * In that case, it looks like [= =] isn't allowed at all, and that
14832 * [. .] could be any single code point, but for longer strings the
14833 * constituent characters would have to be the ASCII alphabetics plus
14834 * the minus-hyphen. Any sensible locale definition would limit itself
14835 * to these. And any portable one definitely should. Trying to parse
14836 * the general case is a nightmare (see [perl #127604]). So, this code
14837 * looks only for interiors of these constructs that match:
14839 * Using \w relaxes the apparent rules a little, without adding much
14840 * danger of mistaking something else for one of these constructs.
14842 * [. .] in some implementations described on the internet is usable to
14843 * escape a character that otherwise is special in bracketed character
14844 * classes. For example [.].] means a literal right bracket instead of
14845 * the ending of the class
14847 * [= =] can legitimately contain a [. .] construct, but we don't
14848 * handle this case, as that [. .] construct will later get parsed
14849 * itself and croak then. And [= =] is checked for even when not under
14850 * /l, as Perl has long done so.
14852 * The code below relies on there being a trailing NUL, so it doesn't
14853 * have to keep checking if the parse ptr < e.
14855 if (temp_ptr[1] == open_char) {
14858 else while ( temp_ptr < e
14859 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
14864 if (*temp_ptr == open_char) {
14866 if (*temp_ptr == ']') {
14868 if (! found_problem && ! check_only) {
14869 RExC_parse = (char *) temp_ptr;
14870 vFAIL3("POSIX syntax [%c %c] is reserved for future "
14871 "extensions", open_char, open_char);
14874 /* Here, the syntax wasn't completely valid, or else the call
14875 * is to check-only */
14876 if (updated_parse_ptr) {
14877 *updated_parse_ptr = (char *) temp_ptr;
14880 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
14884 /* If we find something that started out to look like one of these
14885 * constructs, but isn't, we continue below so that it can be checked
14886 * for being a class name with a typo of '.' or '=' instead of a colon.
14890 /* Here, we think there is a possibility that a [: :] class was meant, and
14891 * we have the first real character. It could be they think the '^' comes
14894 found_problem = TRUE;
14895 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
14900 found_problem = TRUE;
14904 } while (p < e && isBLANK(*p));
14906 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14910 /* But the first character should be a colon, which they could have easily
14911 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
14912 * distinguish from a colon, so treat that as a colon). */
14915 has_opening_colon = TRUE;
14917 else if (*p == ';') {
14918 found_problem = TRUE;
14920 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14921 has_opening_colon = TRUE;
14924 found_problem = TRUE;
14925 ADD_POSIX_WARNING(p, "there must be a starting ':'");
14927 /* Consider an initial punctuation (not one of the recognized ones) to
14928 * be a left terminator */
14929 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
14934 /* They may think that you can put spaces between the components */
14936 found_problem = TRUE;
14940 } while (p < e && isBLANK(*p));
14942 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14947 /* We consider something like [^:^alnum:]] to not have been intended to
14948 * be a posix class, but XXX maybe we should */
14950 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14957 /* Again, they may think that you can put spaces between the components */
14959 found_problem = TRUE;
14963 } while (p < e && isBLANK(*p));
14965 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
14970 /* XXX This ']' may be a typo, and something else was meant. But
14971 * treating it as such creates enough complications, that that
14972 * possibility isn't currently considered here. So we assume that the
14973 * ']' is what is intended, and if we've already found an initial '[',
14974 * this leaves this construct looking like [:] or [:^], which almost
14975 * certainly weren't intended to be posix classes */
14976 if (has_opening_bracket) {
14977 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14980 /* But this function can be called when we parse the colon for
14981 * something like qr/[alpha:]]/, so we back up to look for the
14986 found_problem = TRUE;
14987 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
14989 else if (*p != ':') {
14991 /* XXX We are currently very restrictive here, so this code doesn't
14992 * consider the possibility that, say, /[alpha.]]/ was intended to
14993 * be a posix class. */
14994 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
14997 /* Here we have something like 'foo:]'. There was no initial colon,
14998 * and we back up over 'foo. XXX Unlike the going forward case, we
14999 * don't handle typos of non-word chars in the middle */
15000 has_opening_colon = FALSE;
15003 while (p > RExC_start && isWORDCHAR(*p)) {
15008 /* Here, we have positioned ourselves to where we think the first
15009 * character in the potential class is */
15012 /* Now the interior really starts. There are certain key characters that
15013 * can end the interior, or these could just be typos. To catch both
15014 * cases, we may have to do two passes. In the first pass, we keep on
15015 * going unless we come to a sequence that matches
15016 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15017 * This means it takes a sequence to end the pass, so two typos in a row if
15018 * that wasn't what was intended. If the class is perfectly formed, just
15019 * this one pass is needed. We also stop if there are too many characters
15020 * being accumulated, but this number is deliberately set higher than any
15021 * real class. It is set high enough so that someone who thinks that
15022 * 'alphanumeric' is a correct name would get warned that it wasn't.
15023 * While doing the pass, we keep track of where the key characters were in
15024 * it. If we don't find an end to the class, and one of the key characters
15025 * was found, we redo the pass, but stop when we get to that character.
15026 * Thus the key character was considered a typo in the first pass, but a
15027 * terminator in the second. If two key characters are found, we stop at
15028 * the second one in the first pass. Again this can miss two typos, but
15029 * catches a single one
15031 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15032 * point to the first key character. For the second pass, it starts as -1.
15038 bool has_blank = FALSE;
15039 bool has_upper = FALSE;
15040 bool has_terminating_colon = FALSE;
15041 bool has_terminating_bracket = FALSE;
15042 bool has_semi_colon = FALSE;
15043 unsigned int name_len = 0;
15044 int punct_count = 0;
15048 /* Squeeze out blanks when looking up the class name below */
15049 if (isBLANK(*p) ) {
15051 found_problem = TRUE;
15056 /* The name will end with a punctuation */
15058 const char * peek = p + 1;
15060 /* Treat any non-']' punctuation followed by a ']' (possibly
15061 * with intervening blanks) as trying to terminate the class.
15062 * ']]' is very likely to mean a class was intended (but
15063 * missing the colon), but the warning message that gets
15064 * generated shows the error position better if we exit the
15065 * loop at the bottom (eventually), so skip it here. */
15067 if (peek < e && isBLANK(*peek)) {
15069 found_problem = TRUE;
15072 } while (peek < e && isBLANK(*peek));
15075 if (peek < e && *peek == ']') {
15076 has_terminating_bracket = TRUE;
15078 has_terminating_colon = TRUE;
15080 else if (*p == ';') {
15081 has_semi_colon = TRUE;
15082 has_terminating_colon = TRUE;
15085 found_problem = TRUE;
15092 /* Here we have punctuation we thought didn't end the class.
15093 * Keep track of the position of the key characters that are
15094 * more likely to have been class-enders */
15095 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15097 /* Allow just one such possible class-ender not actually
15098 * ending the class. */
15099 if (possible_end) {
15105 /* If we have too many punctuation characters, no use in
15107 if (++punct_count > max_distance) {
15111 /* Treat the punctuation as a typo. */
15112 input_text[name_len++] = *p;
15115 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15116 input_text[name_len++] = toLOWER(*p);
15118 found_problem = TRUE;
15120 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15121 input_text[name_len++] = *p;
15125 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15129 /* The declaration of 'input_text' is how long we allow a potential
15130 * class name to be, before saying they didn't mean a class name at
15132 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15137 /* We get to here when the possible class name hasn't been properly
15138 * terminated before:
15139 * 1) we ran off the end of the pattern; or
15140 * 2) found two characters, each of which might have been intended to
15141 * be the name's terminator
15142 * 3) found so many punctuation characters in the purported name,
15143 * that the edit distance to a valid one is exceeded
15144 * 4) we decided it was more characters than anyone could have
15145 * intended to be one. */
15147 found_problem = TRUE;
15149 /* In the final two cases, we know that looking up what we've
15150 * accumulated won't lead to a match, even a fuzzy one. */
15151 if ( name_len >= C_ARRAY_LENGTH(input_text)
15152 || punct_count > max_distance)
15154 /* If there was an intermediate key character that could have been
15155 * an intended end, redo the parse, but stop there */
15156 if (possible_end && possible_end != (char *) -1) {
15157 possible_end = (char *) -1; /* Special signal value to say
15158 we've done a first pass */
15163 /* Otherwise, it can't have meant to have been a class */
15164 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15167 /* If we ran off the end, and the final character was a punctuation
15168 * one, back up one, to look at that final one just below. Later, we
15169 * will restore the parse pointer if appropriate */
15170 if (name_len && p == e && isPUNCT(*(p-1))) {
15175 if (p < e && isPUNCT(*p)) {
15177 has_terminating_bracket = TRUE;
15179 /* If this is a 2nd ']', and the first one is just below this
15180 * one, consider that to be the real terminator. This gives a
15181 * uniform and better positioning for the warning message */
15183 && possible_end != (char *) -1
15184 && *possible_end == ']'
15185 && name_len && input_text[name_len - 1] == ']')
15190 /* And this is actually equivalent to having done the 2nd
15191 * pass now, so set it to not try again */
15192 possible_end = (char *) -1;
15197 has_terminating_colon = TRUE;
15199 else if (*p == ';') {
15200 has_semi_colon = TRUE;
15201 has_terminating_colon = TRUE;
15209 /* Here, we have a class name to look up. We can short circuit the
15210 * stuff below for short names that can't possibly be meant to be a
15211 * class name. (We can do this on the first pass, as any second pass
15212 * will yield an even shorter name) */
15213 if (name_len < 3) {
15214 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15217 /* Find which class it is. Initially switch on the length of the name.
15219 switch (name_len) {
15221 if (memEQs(name_start, 4, "word")) {
15222 /* this is not POSIX, this is the Perl \w */
15223 class_number = ANYOF_WORDCHAR;
15227 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15228 * graph lower print punct space upper
15229 * Offset 4 gives the best switch position. */
15230 switch (name_start[4]) {
15232 if (memBEGINs(name_start, 5, "alph")) /* alpha */
15233 class_number = ANYOF_ALPHA;
15236 if (memBEGINs(name_start, 5, "spac")) /* space */
15237 class_number = ANYOF_SPACE;
15240 if (memBEGINs(name_start, 5, "grap")) /* graph */
15241 class_number = ANYOF_GRAPH;
15244 if (memBEGINs(name_start, 5, "asci")) /* ascii */
15245 class_number = ANYOF_ASCII;
15248 if (memBEGINs(name_start, 5, "blan")) /* blank */
15249 class_number = ANYOF_BLANK;
15252 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15253 class_number = ANYOF_CNTRL;
15256 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15257 class_number = ANYOF_ALPHANUMERIC;
15260 if (memBEGINs(name_start, 5, "lowe")) /* lower */
15261 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15262 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15263 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15266 if (memBEGINs(name_start, 5, "digi")) /* digit */
15267 class_number = ANYOF_DIGIT;
15268 else if (memBEGINs(name_start, 5, "prin")) /* print */
15269 class_number = ANYOF_PRINT;
15270 else if (memBEGINs(name_start, 5, "punc")) /* punct */
15271 class_number = ANYOF_PUNCT;
15276 if (memEQs(name_start, 6, "xdigit"))
15277 class_number = ANYOF_XDIGIT;
15281 /* If the name exactly matches a posix class name the class number will
15282 * here be set to it, and the input almost certainly was meant to be a
15283 * posix class, so we can skip further checking. If instead the syntax
15284 * is exactly correct, but the name isn't one of the legal ones, we
15285 * will return that as an error below. But if neither of these apply,
15286 * it could be that no posix class was intended at all, or that one
15287 * was, but there was a typo. We tease these apart by doing fuzzy
15288 * matching on the name */
15289 if (class_number == OOB_NAMEDCLASS && found_problem) {
15290 const UV posix_names[][6] = {
15291 { 'a', 'l', 'n', 'u', 'm' },
15292 { 'a', 'l', 'p', 'h', 'a' },
15293 { 'a', 's', 'c', 'i', 'i' },
15294 { 'b', 'l', 'a', 'n', 'k' },
15295 { 'c', 'n', 't', 'r', 'l' },
15296 { 'd', 'i', 'g', 'i', 't' },
15297 { 'g', 'r', 'a', 'p', 'h' },
15298 { 'l', 'o', 'w', 'e', 'r' },
15299 { 'p', 'r', 'i', 'n', 't' },
15300 { 'p', 'u', 'n', 'c', 't' },
15301 { 's', 'p', 'a', 'c', 'e' },
15302 { 'u', 'p', 'p', 'e', 'r' },
15303 { 'w', 'o', 'r', 'd' },
15304 { 'x', 'd', 'i', 'g', 'i', 't' }
15306 /* The names of the above all have added NULs to make them the same
15307 * size, so we need to also have the real lengths */
15308 const UV posix_name_lengths[] = {
15309 sizeof("alnum") - 1,
15310 sizeof("alpha") - 1,
15311 sizeof("ascii") - 1,
15312 sizeof("blank") - 1,
15313 sizeof("cntrl") - 1,
15314 sizeof("digit") - 1,
15315 sizeof("graph") - 1,
15316 sizeof("lower") - 1,
15317 sizeof("print") - 1,
15318 sizeof("punct") - 1,
15319 sizeof("space") - 1,
15320 sizeof("upper") - 1,
15321 sizeof("word") - 1,
15322 sizeof("xdigit")- 1
15325 int temp_max = max_distance; /* Use a temporary, so if we
15326 reparse, we haven't changed the
15329 /* Use a smaller max edit distance if we are missing one of the
15331 if ( has_opening_bracket + has_opening_colon < 2
15332 || has_terminating_bracket + has_terminating_colon < 2)
15337 /* See if the input name is close to a legal one */
15338 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15340 /* Short circuit call if the lengths are too far apart to be
15342 if (abs( (int) (name_len - posix_name_lengths[i]))
15348 if (edit_distance(input_text,
15351 posix_name_lengths[i],
15355 { /* If it is close, it probably was intended to be a class */
15356 goto probably_meant_to_be;
15360 /* Here the input name is not close enough to a valid class name
15361 * for us to consider it to be intended to be a posix class. If
15362 * we haven't already done so, and the parse found a character that
15363 * could have been terminators for the name, but which we absorbed
15364 * as typos during the first pass, repeat the parse, signalling it
15365 * to stop at that character */
15366 if (possible_end && possible_end != (char *) -1) {
15367 possible_end = (char *) -1;
15372 /* Here neither pass found a close-enough class name */
15373 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15376 probably_meant_to_be:
15378 /* Here we think that a posix specification was intended. Update any
15380 if (updated_parse_ptr) {
15381 *updated_parse_ptr = (char *) p;
15384 /* If a posix class name was intended but incorrectly specified, we
15385 * output or return the warnings */
15386 if (found_problem) {
15388 /* We set flags for these issues in the parse loop above instead of
15389 * adding them to the list of warnings, because we can parse it
15390 * twice, and we only want one warning instance */
15392 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15395 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15397 if (has_semi_colon) {
15398 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15400 else if (! has_terminating_colon) {
15401 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15403 if (! has_terminating_bracket) {
15404 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15407 if ( posix_warnings
15409 && av_top_index(RExC_warn_text) > -1)
15411 *posix_warnings = RExC_warn_text;
15414 else if (class_number != OOB_NAMEDCLASS) {
15415 /* If it is a known class, return the class. The class number
15416 * #defines are structured so each complement is +1 to the normal
15418 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15420 else if (! check_only) {
15422 /* Here, it is an unrecognized class. This is an error (unless the
15423 * call is to check only, which we've already handled above) */
15424 const char * const complement_string = (complement)
15427 RExC_parse = (char *) p;
15428 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15430 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15434 return OOB_NAMEDCLASS;
15436 #undef ADD_POSIX_WARNING
15438 STATIC unsigned int
15439 S_regex_set_precedence(const U8 my_operator) {
15441 /* Returns the precedence in the (?[...]) construct of the input operator,
15442 * specified by its character representation. The precedence follows
15443 * general Perl rules, but it extends this so that ')' and ']' have (low)
15444 * precedence even though they aren't really operators */
15446 switch (my_operator) {
15462 NOT_REACHED; /* NOTREACHED */
15463 return 0; /* Silence compiler warning */
15466 STATIC regnode_offset
15467 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
15468 I32 *flagp, U32 depth,
15469 char * const oregcomp_parse)
15471 /* Handle the (?[...]) construct to do set operations */
15473 U8 curchar; /* Current character being parsed */
15474 UV start, end; /* End points of code point ranges */
15475 SV* final = NULL; /* The end result inversion list */
15476 SV* result_string; /* 'final' stringified */
15477 AV* stack; /* stack of operators and operands not yet
15479 AV* fence_stack = NULL; /* A stack containing the positions in
15480 'stack' of where the undealt-with left
15481 parens would be if they were actually
15483 /* The 'volatile' is a workaround for an optimiser bug
15484 * in Solaris Studio 12.3. See RT #127455 */
15485 volatile IV fence = 0; /* Position of where most recent undealt-
15486 with left paren in stack is; -1 if none.
15488 STRLEN len; /* Temporary */
15489 regnode_offset node; /* Temporary, and final regnode returned by
15491 const bool save_fold = FOLD; /* Temporary */
15492 char *save_end, *save_parse; /* Temporaries */
15493 const bool in_locale = LOC; /* we turn off /l during processing */
15495 GET_RE_DEBUG_FLAGS_DECL;
15497 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
15499 DEBUG_PARSE("xcls");
15502 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
15505 /* The use of this operator implies /u. This is required so that the
15506 * compile time values are valid in all runtime cases */
15507 REQUIRE_UNI_RULES(flagp, 0);
15509 ckWARNexperimental(RExC_parse,
15510 WARN_EXPERIMENTAL__REGEX_SETS,
15511 "The regex_sets feature is experimental");
15513 /* Everything in this construct is a metacharacter. Operands begin with
15514 * either a '\' (for an escape sequence), or a '[' for a bracketed
15515 * character class. Any other character should be an operator, or
15516 * parenthesis for grouping. Both types of operands are handled by calling
15517 * regclass() to parse them. It is called with a parameter to indicate to
15518 * return the computed inversion list. The parsing here is implemented via
15519 * a stack. Each entry on the stack is a single character representing one
15520 * of the operators; or else a pointer to an operand inversion list. */
15522 #define IS_OPERATOR(a) SvIOK(a)
15523 #define IS_OPERAND(a) (! IS_OPERATOR(a))
15525 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
15526 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15527 * with pronouncing it called it Reverse Polish instead, but now that YOU
15528 * know how to pronounce it you can use the correct term, thus giving due
15529 * credit to the person who invented it, and impressing your geek friends.
15530 * Wikipedia says that the pronounciation of "Ł" has been changing so that
15531 * it is now more like an English initial W (as in wonk) than an L.)
15533 * This means that, for example, 'a | b & c' is stored on the stack as
15541 * where the numbers in brackets give the stack [array] element number.
15542 * In this implementation, parentheses are not stored on the stack.
15543 * Instead a '(' creates a "fence" so that the part of the stack below the
15544 * fence is invisible except to the corresponding ')' (this allows us to
15545 * replace testing for parens, by using instead subtraction of the fence
15546 * position). As new operands are processed they are pushed onto the stack
15547 * (except as noted in the next paragraph). New operators of higher
15548 * precedence than the current final one are inserted on the stack before
15549 * the lhs operand (so that when the rhs is pushed next, everything will be
15550 * in the correct positions shown above. When an operator of equal or
15551 * lower precedence is encountered in parsing, all the stacked operations
15552 * of equal or higher precedence are evaluated, leaving the result as the
15553 * top entry on the stack. This makes higher precedence operations
15554 * evaluate before lower precedence ones, and causes operations of equal
15555 * precedence to left associate.
15557 * The only unary operator '!' is immediately pushed onto the stack when
15558 * encountered. When an operand is encountered, if the top of the stack is
15559 * a '!", the complement is immediately performed, and the '!' popped. The
15560 * resulting value is treated as a new operand, and the logic in the
15561 * previous paragraph is executed. Thus in the expression
15563 * the stack looks like
15569 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15576 * A ')' is treated as an operator with lower precedence than all the
15577 * aforementioned ones, which causes all operations on the stack above the
15578 * corresponding '(' to be evaluated down to a single resultant operand.
15579 * Then the fence for the '(' is removed, and the operand goes through the
15580 * algorithm above, without the fence.
15582 * A separate stack is kept of the fence positions, so that the position of
15583 * the latest so-far unbalanced '(' is at the top of it.
15585 * The ']' ending the construct is treated as the lowest operator of all,
15586 * so that everything gets evaluated down to a single operand, which is the
15589 sv_2mortal((SV *)(stack = newAV()));
15590 sv_2mortal((SV *)(fence_stack = newAV()));
15592 while (RExC_parse < RExC_end) {
15593 I32 top_index; /* Index of top-most element in 'stack' */
15594 SV** top_ptr; /* Pointer to top 'stack' element */
15595 SV* current = NULL; /* To contain the current inversion list
15597 SV* only_to_avoid_leaks;
15599 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15600 TRUE /* Force /x */ );
15601 if (RExC_parse >= RExC_end) { /* Fail */
15605 curchar = UCHARAT(RExC_parse);
15609 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15610 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15611 DEBUG_U(dump_regex_sets_structures(pRExC_state,
15612 stack, fence, fence_stack));
15615 top_index = av_tindex_skip_len_mg(stack);
15618 SV** stacked_ptr; /* Ptr to something already on 'stack' */
15619 char stacked_operator; /* The topmost operator on the 'stack'. */
15620 SV* lhs; /* Operand to the left of the operator */
15621 SV* rhs; /* Operand to the right of the operator */
15622 SV* fence_ptr; /* Pointer to top element of the fence
15627 if ( RExC_parse < RExC_end - 1
15628 && (UCHARAT(RExC_parse + 1) == '?'))
15630 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
15631 * This happens when we have some thing like
15633 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15635 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
15637 * Here we would be handling the interpolated
15638 * '$thai_or_lao'. We handle this by a recursive call to
15639 * ourselves which returns the inversion list the
15640 * interpolated expression evaluates to. We use the flags
15641 * from the interpolated pattern. */
15642 U32 save_flags = RExC_flags;
15643 const char * save_parse;
15645 RExC_parse += 2; /* Skip past the '(?' */
15646 save_parse = RExC_parse;
15648 /* Parse any flags for the '(?' */
15649 parse_lparen_question_flags(pRExC_state);
15651 if (RExC_parse == save_parse /* Makes sure there was at
15652 least one flag (or else
15653 this embedding wasn't
15655 || RExC_parse >= RExC_end - 4
15656 || UCHARAT(RExC_parse) != ':'
15657 || UCHARAT(++RExC_parse) != '('
15658 || UCHARAT(++RExC_parse) != '?'
15659 || UCHARAT(++RExC_parse) != '[')
15662 /* In combination with the above, this moves the
15663 * pointer to the point just after the first erroneous
15664 * character (or if there are no flags, to where they
15665 * should have been) */
15666 if (RExC_parse >= RExC_end - 4) {
15667 RExC_parse = RExC_end;
15669 else if (RExC_parse != save_parse) {
15670 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15672 vFAIL("Expecting '(?flags:(?[...'");
15675 /* Recurse, with the meat of the embedded expression */
15677 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
15678 depth+1, oregcomp_parse);
15680 /* Here, 'current' contains the embedded expression's
15681 * inversion list, and RExC_parse points to the trailing
15682 * ']'; the next character should be the ')' */
15684 if (UCHARAT(RExC_parse) != ')')
15685 vFAIL("Expecting close paren for nested extended charclass");
15687 /* Then the ')' matching the original '(' handled by this
15688 * case: statement */
15690 if (UCHARAT(RExC_parse) != ')')
15691 vFAIL("Expecting close paren for wrapper for nested extended charclass");
15694 RExC_flags = save_flags;
15695 goto handle_operand;
15698 /* A regular '('. Look behind for illegal syntax */
15699 if (top_index - fence >= 0) {
15700 /* If the top entry on the stack is an operator, it had
15701 * better be a '!', otherwise the entry below the top
15702 * operand should be an operator */
15703 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
15704 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15705 || ( IS_OPERAND(*top_ptr)
15706 && ( top_index - fence < 1
15707 || ! (stacked_ptr = av_fetch(stack,
15710 || ! IS_OPERATOR(*stacked_ptr))))
15713 vFAIL("Unexpected '(' with no preceding operator");
15717 /* Stack the position of this undealt-with left paren */
15718 av_push(fence_stack, newSViv(fence));
15719 fence = top_index + 1;
15723 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15724 * multi-char folds are allowed. */
15725 if (!regclass(pRExC_state, flagp, depth+1,
15726 TRUE, /* means parse just the next thing */
15727 FALSE, /* don't allow multi-char folds */
15728 FALSE, /* don't silence non-portable warnings. */
15730 FALSE, /* Require return to be an ANYOF */
15733 FAIL2("panic: regclass returned failure to handle_sets, "
15734 "flags=%#" UVxf, (UV) *flagp);
15737 /* regclass() will return with parsing just the \ sequence,
15738 * leaving the parse pointer at the next thing to parse */
15740 goto handle_operand;
15742 case '[': /* Is a bracketed character class */
15744 /* See if this is a [:posix:] class. */
15745 bool is_posix_class = (OOB_NAMEDCLASS
15746 < handle_possible_posix(pRExC_state,
15750 TRUE /* checking only */));
15751 /* If it is a posix class, leave the parse pointer at the '['
15752 * to fool regclass() into thinking it is part of a
15753 * '[[:posix:]]'. */
15754 if (! is_posix_class) {
15758 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15759 * multi-char folds are allowed. */
15760 if (!regclass(pRExC_state, flagp, depth+1,
15761 is_posix_class, /* parse the whole char
15762 class only if not a
15764 FALSE, /* don't allow multi-char folds */
15765 TRUE, /* silence non-portable warnings. */
15767 FALSE, /* Require return to be an ANYOF */
15770 FAIL2("panic: regclass returned failure to handle_sets, "
15771 "flags=%#" UVxf, (UV) *flagp);
15778 /* function call leaves parse pointing to the ']', except if we
15780 if (is_posix_class) {
15784 goto handle_operand;
15788 if (top_index >= 1) {
15789 goto join_operators;
15792 /* Only a single operand on the stack: are done */
15796 if (av_tindex_skip_len_mg(fence_stack) < 0) {
15797 if (UCHARAT(RExC_parse - 1) == ']') {
15801 vFAIL("Unexpected ')'");
15804 /* If nothing after the fence, is missing an operand */
15805 if (top_index - fence < 0) {
15809 /* If at least two things on the stack, treat this as an
15811 if (top_index - fence >= 1) {
15812 goto join_operators;
15815 /* Here only a single thing on the fenced stack, and there is a
15816 * fence. Get rid of it */
15817 fence_ptr = av_pop(fence_stack);
15819 fence = SvIV(fence_ptr);
15820 SvREFCNT_dec_NN(fence_ptr);
15827 /* Having gotten rid of the fence, we pop the operand at the
15828 * stack top and process it as a newly encountered operand */
15829 current = av_pop(stack);
15830 if (IS_OPERAND(current)) {
15831 goto handle_operand;
15843 /* These binary operators should have a left operand already
15845 if ( top_index - fence < 0
15846 || top_index - fence == 1
15847 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
15848 || ! IS_OPERAND(*top_ptr))
15850 goto unexpected_binary;
15853 /* If only the one operand is on the part of the stack visible
15854 * to us, we just place this operator in the proper position */
15855 if (top_index - fence < 2) {
15857 /* Place the operator before the operand */
15859 SV* lhs = av_pop(stack);
15860 av_push(stack, newSVuv(curchar));
15861 av_push(stack, lhs);
15865 /* But if there is something else on the stack, we need to
15866 * process it before this new operator if and only if the
15867 * stacked operation has equal or higher precedence than the
15872 /* The operator on the stack is supposed to be below both its
15874 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
15875 || IS_OPERAND(*stacked_ptr))
15877 /* But if not, it's legal and indicates we are completely
15878 * done if and only if we're currently processing a ']',
15879 * which should be the final thing in the expression */
15880 if (curchar == ']') {
15886 vFAIL2("Unexpected binary operator '%c' with no "
15887 "preceding operand", curchar);
15889 stacked_operator = (char) SvUV(*stacked_ptr);
15891 if (regex_set_precedence(curchar)
15892 > regex_set_precedence(stacked_operator))
15894 /* Here, the new operator has higher precedence than the
15895 * stacked one. This means we need to add the new one to
15896 * the stack to await its rhs operand (and maybe more
15897 * stuff). We put it before the lhs operand, leaving
15898 * untouched the stacked operator and everything below it
15900 lhs = av_pop(stack);
15901 assert(IS_OPERAND(lhs));
15903 av_push(stack, newSVuv(curchar));
15904 av_push(stack, lhs);
15908 /* Here, the new operator has equal or lower precedence than
15909 * what's already there. This means the operation already
15910 * there should be performed now, before the new one. */
15912 rhs = av_pop(stack);
15913 if (! IS_OPERAND(rhs)) {
15915 /* This can happen when a ! is not followed by an operand,
15916 * like in /(?[\t &!])/ */
15920 lhs = av_pop(stack);
15922 if (! IS_OPERAND(lhs)) {
15924 /* This can happen when there is an empty (), like in
15925 * /(?[[0]+()+])/ */
15929 switch (stacked_operator) {
15931 _invlist_intersection(lhs, rhs, &rhs);
15936 _invlist_union(lhs, rhs, &rhs);
15940 _invlist_subtract(lhs, rhs, &rhs);
15943 case '^': /* The union minus the intersection */
15948 _invlist_union(lhs, rhs, &u);
15949 _invlist_intersection(lhs, rhs, &i);
15950 _invlist_subtract(u, i, &rhs);
15951 SvREFCNT_dec_NN(i);
15952 SvREFCNT_dec_NN(u);
15958 /* Here, the higher precedence operation has been done, and the
15959 * result is in 'rhs'. We overwrite the stacked operator with
15960 * the result. Then we redo this code to either push the new
15961 * operator onto the stack or perform any higher precedence
15962 * stacked operation */
15963 only_to_avoid_leaks = av_pop(stack);
15964 SvREFCNT_dec(only_to_avoid_leaks);
15965 av_push(stack, rhs);
15968 case '!': /* Highest priority, right associative */
15970 /* If what's already at the top of the stack is another '!",
15971 * they just cancel each other out */
15972 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
15973 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
15975 only_to_avoid_leaks = av_pop(stack);
15976 SvREFCNT_dec(only_to_avoid_leaks);
15978 else { /* Otherwise, since it's right associative, just push
15980 av_push(stack, newSVuv(curchar));
15985 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
15986 if (RExC_parse >= RExC_end) {
15989 vFAIL("Unexpected character");
15993 /* Here 'current' is the operand. If something is already on the
15994 * stack, we have to check if it is a !. But first, the code above
15995 * may have altered the stack in the time since we earlier set
15998 top_index = av_tindex_skip_len_mg(stack);
15999 if (top_index - fence >= 0) {
16000 /* If the top entry on the stack is an operator, it had better
16001 * be a '!', otherwise the entry below the top operand should
16002 * be an operator */
16003 top_ptr = av_fetch(stack, top_index, FALSE);
16005 if (IS_OPERATOR(*top_ptr)) {
16007 /* The only permissible operator at the top of the stack is
16008 * '!', which is applied immediately to this operand. */
16009 curchar = (char) SvUV(*top_ptr);
16010 if (curchar != '!') {
16011 SvREFCNT_dec(current);
16012 vFAIL2("Unexpected binary operator '%c' with no "
16013 "preceding operand", curchar);
16016 _invlist_invert(current);
16018 only_to_avoid_leaks = av_pop(stack);
16019 SvREFCNT_dec(only_to_avoid_leaks);
16021 /* And we redo with the inverted operand. This allows
16022 * handling multiple ! in a row */
16023 goto handle_operand;
16025 /* Single operand is ok only for the non-binary ')'
16027 else if ((top_index - fence == 0 && curchar != ')')
16028 || (top_index - fence > 0
16029 && (! (stacked_ptr = av_fetch(stack,
16032 || IS_OPERAND(*stacked_ptr))))
16034 SvREFCNT_dec(current);
16035 vFAIL("Operand with no preceding operator");
16039 /* Here there was nothing on the stack or the top element was
16040 * another operand. Just add this new one */
16041 av_push(stack, current);
16043 } /* End of switch on next parse token */
16045 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16046 } /* End of loop parsing through the construct */
16048 vFAIL("Syntax error in (?[...])");
16052 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16053 if (RExC_parse < RExC_end) {
16057 vFAIL("Unexpected ']' with no following ')' in (?[...");
16060 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16061 vFAIL("Unmatched (");
16064 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16065 || ((final = av_pop(stack)) == NULL)
16066 || ! IS_OPERAND(final)
16067 || ! is_invlist(final)
16068 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16071 SvREFCNT_dec(final);
16072 vFAIL("Incomplete expression within '(?[ ])'");
16075 /* Here, 'final' is the resultant inversion list from evaluating the
16076 * expression. Return it if so requested */
16077 if (return_invlist) {
16078 *return_invlist = final;
16082 /* Otherwise generate a resultant node, based on 'final'. regclass() is
16083 * expecting a string of ranges and individual code points */
16084 invlist_iterinit(final);
16085 result_string = newSVpvs("");
16086 while (invlist_iternext(final, &start, &end)) {
16087 if (start == end) {
16088 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16091 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16096 /* About to generate an ANYOF (or similar) node from the inversion list we
16097 * have calculated */
16098 save_parse = RExC_parse;
16099 RExC_parse = SvPV(result_string, len);
16100 save_end = RExC_end;
16101 RExC_end = RExC_parse + len;
16102 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16104 /* We turn off folding around the call, as the class we have constructed
16105 * already has all folding taken into consideration, and we don't want
16106 * regclass() to add to that */
16107 RExC_flags &= ~RXf_PMf_FOLD;
16108 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16109 * folds are allowed. */
16110 node = regclass(pRExC_state, flagp, depth+1,
16111 FALSE, /* means parse the whole char class */
16112 FALSE, /* don't allow multi-char folds */
16113 TRUE, /* silence non-portable warnings. The above may very
16114 well have generated non-portable code points, but
16115 they're valid on this machine */
16116 FALSE, /* similarly, no need for strict */
16117 FALSE, /* Require return to be an ANYOF */
16122 RExC_parse = save_parse + 1;
16123 RExC_end = save_end;
16124 SvREFCNT_dec_NN(final);
16125 SvREFCNT_dec_NN(result_string);
16128 RExC_flags |= RXf_PMf_FOLD;
16132 FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
16135 /* Fix up the node type if we are in locale. (We have pretended we are
16136 * under /u for the purposes of regclass(), as this construct will only
16137 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
16138 * as to cause any warnings about bad locales to be output in regexec.c),
16139 * and add the flag that indicates to check if not in a UTF-8 locale. The
16140 * reason we above forbid optimization into something other than an ANYOF
16141 * node is simply to minimize the number of code changes in regexec.c.
16142 * Otherwise we would have to create new EXACTish node types and deal with
16143 * them. This decision could be revisited should this construct become
16146 * (One might think we could look at the resulting ANYOF node and suppress
16147 * the flag if everything is above 255, as those would be UTF-8 only,
16148 * but this isn't true, as the components that led to that result could
16149 * have been locale-affected, and just happen to cancel each other out
16150 * under UTF-8 locales.) */
16152 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16154 assert(OP(REGNODE_p(node)) == ANYOF);
16156 OP(REGNODE_p(node)) = ANYOFL;
16157 ANYOF_FLAGS(REGNODE_p(node))
16158 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16161 nextchar(pRExC_state);
16162 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16166 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16169 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16170 AV * stack, const IV fence, AV * fence_stack)
16171 { /* Dumps the stacks in handle_regex_sets() */
16173 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16174 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16177 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16179 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16181 if (stack_top < 0) {
16182 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16185 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16186 for (i = stack_top; i >= 0; i--) {
16187 SV ** element_ptr = av_fetch(stack, i, FALSE);
16188 if (! element_ptr) {
16191 if (IS_OPERATOR(*element_ptr)) {
16192 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16193 (int) i, (int) SvIV(*element_ptr));
16196 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16197 sv_dump(*element_ptr);
16202 if (fence_stack_top < 0) {
16203 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16206 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16207 for (i = fence_stack_top; i >= 0; i--) {
16208 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16209 if (! element_ptr) {
16212 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16213 (int) i, (int) SvIV(*element_ptr));
16224 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16226 /* This adds the Latin1/above-Latin1 folding rules.
16228 * This should be called only for a Latin1-range code points, cp, which is
16229 * known to be involved in a simple fold with other code points above
16230 * Latin1. It would give false results if /aa has been specified.
16231 * Multi-char folds are outside the scope of this, and must be handled
16234 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16236 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16238 /* The rules that are valid for all Unicode versions are hard-coded in */
16243 add_cp_to_invlist(*invlist, KELVIN_SIGN);
16247 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16250 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16251 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16253 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16254 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16255 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16257 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16258 *invlist = add_cp_to_invlist(*invlist,
16259 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16262 default: /* Other code points are checked against the data for the
16263 current Unicode version */
16265 Size_t folds_to_count;
16266 unsigned int first_folds_to;
16267 const unsigned int * remaining_folds_to_list;
16271 folded_cp = toFOLD(cp);
16274 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16276 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16279 if (folded_cp > 255) {
16280 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16283 folds_to_count = _inverse_folds(folded_cp, &first_folds_to,
16284 &remaining_folds_to_list);
16285 if (folds_to_count == 0) {
16287 /* Use deprecated warning to increase the chances of this being
16289 ckWARN2reg_d(RExC_parse,
16290 "Perl folding rules are not up-to-date for 0x%02X;"
16291 " please use the perlbug utility to report;", cp);
16296 if (first_folds_to > 255) {
16297 *invlist = add_cp_to_invlist(*invlist, first_folds_to);
16299 for (i = 0; i < folds_to_count - 1; i++) {
16300 if (remaining_folds_to_list[i] > 255) {
16301 *invlist = add_cp_to_invlist(*invlist,
16302 remaining_folds_to_list[i]);
16312 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16314 /* Output the elements of the array given by '*posix_warnings' as REGEXP
16318 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16320 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16322 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16326 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16327 if (first_is_fatal) { /* Avoid leaking this */
16328 av_undef(posix_warnings); /* This isn't necessary if the
16329 array is mortal, but is a
16331 (void) sv_2mortal(msg);
16334 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16335 SvREFCNT_dec_NN(msg);
16338 UPDATE_WARNINGS_LOC(RExC_parse);
16342 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16344 /* This adds the string scalar <multi_string> to the array
16345 * <multi_char_matches>. <multi_string> is known to have exactly
16346 * <cp_count> code points in it. This is used when constructing a
16347 * bracketed character class and we find something that needs to match more
16348 * than a single character.
16350 * <multi_char_matches> is actually an array of arrays. Each top-level
16351 * element is an array that contains all the strings known so far that are
16352 * the same length. And that length (in number of code points) is the same
16353 * as the index of the top-level array. Hence, the [2] element is an
16354 * array, each element thereof is a string containing TWO code points;
16355 * while element [3] is for strings of THREE characters, and so on. Since
16356 * this is for multi-char strings there can never be a [0] nor [1] element.
16358 * When we rewrite the character class below, we will do so such that the
16359 * longest strings are written first, so that it prefers the longest
16360 * matching strings first. This is done even if it turns out that any
16361 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
16362 * Christiansen has agreed that this is ok. This makes the test for the
16363 * ligature 'ffi' come before the test for 'ff', for example */
16366 AV** this_array_ptr;
16368 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16370 if (! multi_char_matches) {
16371 multi_char_matches = newAV();
16374 if (av_exists(multi_char_matches, cp_count)) {
16375 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16376 this_array = *this_array_ptr;
16379 this_array = newAV();
16380 av_store(multi_char_matches, cp_count,
16383 av_push(this_array, multi_string);
16385 return multi_char_matches;
16388 /* The names of properties whose definitions are not known at compile time are
16389 * stored in this SV, after a constant heading. So if the length has been
16390 * changed since initialization, then there is a run-time definition. */
16391 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
16392 (SvCUR(listsv) != initial_listsv_len)
16394 /* There is a restricted set of white space characters that are legal when
16395 * ignoring white space in a bracketed character class. This generates the
16396 * code to skip them.
16398 * There is a line below that uses the same white space criteria but is outside
16399 * this macro. Both here and there must use the same definition */
16400 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
16403 while (isBLANK_A(UCHARAT(p))) \
16410 STATIC regnode_offset
16411 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
16412 const bool stop_at_1, /* Just parse the next thing, don't
16413 look for a full character class */
16414 bool allow_multi_folds,
16415 const bool silence_non_portable, /* Don't output warnings
16419 bool optimizable, /* ? Allow a non-ANYOF return
16421 SV** ret_invlist /* Return an inversion list, not a node */
16424 /* parse a bracketed class specification. Most of these will produce an
16425 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16426 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
16427 * under /i with multi-character folds: it will be rewritten following the
16428 * paradigm of this example, where the <multi-fold>s are characters which
16429 * fold to multiple character sequences:
16430 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16431 * gets effectively rewritten as:
16432 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16433 * reg() gets called (recursively) on the rewritten version, and this
16434 * function will return what it constructs. (Actually the <multi-fold>s
16435 * aren't physically removed from the [abcdefghi], it's just that they are
16436 * ignored in the recursion by means of a flag:
16437 * <RExC_in_multi_char_class>.)
16439 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16440 * characters, with the corresponding bit set if that character is in the
16441 * list. For characters above this, a range list or swash is used. There
16442 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16443 * determinable at compile time
16445 * On success, returns the offset at which any next node should be placed
16446 * into the regex engine program being compiled.
16448 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
16449 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
16453 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16455 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16456 regnode_offset ret;
16458 int namedclass = OOB_NAMEDCLASS;
16459 char *rangebegin = NULL;
16460 bool need_class = 0;
16462 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16463 than just initialized. */
16464 SV* properties = NULL; /* Code points that match \p{} \P{} */
16465 SV* posixes = NULL; /* Code points that match classes like [:word:],
16466 extended beyond the Latin1 range. These have to
16467 be kept separate from other code points for much
16468 of this function because their handling is
16469 different under /i, and for most classes under
16471 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
16472 separate for a while from the non-complemented
16473 versions because of complications with /d
16475 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16476 treated more simply than the general case,
16477 leading to less compilation and execution
16479 UV element_count = 0; /* Number of distinct elements in the class.
16480 Optimizations may be possible if this is tiny */
16481 AV * multi_char_matches = NULL; /* Code points that fold to more than one
16482 character; used under /i */
16484 char * stop_ptr = RExC_end; /* where to stop parsing */
16486 /* ignore unescaped whitespace? */
16487 const bool skip_white = cBOOL( ret_invlist
16488 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16490 /* Unicode properties are stored in a swash; this holds the current one
16491 * being parsed. If this swash is the only above-latin1 component of the
16492 * character class, an optimization is to pass it directly on to the
16493 * execution engine. Otherwise, it is set to NULL to indicate that there
16494 * are other things in the class that have to be dealt with at execution
16496 SV* swash = NULL; /* Code points that match \p{} \P{} */
16498 /* Set if a component of this character class is user-defined; just passed
16499 * on to the engine */
16500 bool has_user_defined_property = FALSE;
16502 /* inversion list of code points this node matches only when the target
16503 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
16505 SV* has_upper_latin1_only_utf8_matches = NULL;
16507 /* Inversion list of code points this node matches regardless of things
16508 * like locale, folding, utf8ness of the target string */
16509 SV* cp_list = NULL;
16511 /* Like cp_list, but code points on this list need to be checked for things
16512 * that fold to/from them under /i */
16513 SV* cp_foldable_list = NULL;
16515 /* Like cp_list, but code points on this list are valid only when the
16516 * runtime locale is UTF-8 */
16517 SV* only_utf8_locale_list = NULL;
16519 /* In a range, if one of the endpoints is non-character-set portable,
16520 * meaning that it hard-codes a code point that may mean a different
16521 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16522 * mnemonic '\t' which each mean the same character no matter which
16523 * character set the platform is on. */
16524 unsigned int non_portable_endpoint = 0;
16526 /* Is the range unicode? which means on a platform that isn't 1-1 native
16527 * to Unicode (i.e. non-ASCII), each code point in it should be considered
16528 * to be a Unicode value. */
16529 bool unicode_range = FALSE;
16530 bool invert = FALSE; /* Is this class to be complemented */
16532 bool warn_super = ALWAYS_WARN_SUPER;
16534 const char * orig_parse = RExC_parse;
16535 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
16537 /* This variable is used to mark where the end in the input is of something
16538 * that looks like a POSIX construct but isn't. During the parse, when
16539 * something looks like it could be such a construct is encountered, it is
16540 * checked for being one, but not if we've already checked this area of the
16541 * input. Only after this position is reached do we check again */
16542 char *not_posix_region_end = RExC_parse - 1;
16544 AV* posix_warnings = NULL;
16545 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
16546 U8 op = END; /* The returned node-type, initialized to an impossible
16548 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
16549 U32 posixl = 0; /* bit field of posix classes matched under /l */
16550 bool use_anyofd = FALSE; /* ? Is this to be an ANYOFD node */
16552 GET_RE_DEBUG_FLAGS_DECL;
16554 PERL_ARGS_ASSERT_REGCLASS;
16556 PERL_UNUSED_ARG(depth);
16560 /* If wants an inversion list returned, we can't optimize to something
16563 optimizable = FALSE;
16566 DEBUG_PARSE("clas");
16568 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
16569 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
16570 && UNICODE_DOT_DOT_VERSION == 0)
16571 allow_multi_folds = FALSE;
16574 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
16575 initial_listsv_len = SvCUR(listsv);
16576 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
16578 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16580 assert(RExC_parse <= RExC_end);
16582 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
16585 allow_multi_folds = FALSE;
16587 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16590 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16591 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16592 int maybe_class = handle_possible_posix(pRExC_state,
16594 ¬_posix_region_end,
16596 TRUE /* checking only */);
16597 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16598 ckWARN4reg(not_posix_region_end,
16599 "POSIX syntax [%c %c] belongs inside character classes%s",
16600 *RExC_parse, *RExC_parse,
16601 (maybe_class == OOB_NAMEDCLASS)
16602 ? ((POSIXCC_NOTYET(*RExC_parse))
16603 ? " (but this one isn't implemented)"
16604 : " (but this one isn't fully valid)")
16610 /* If the caller wants us to just parse a single element, accomplish this
16611 * by faking the loop ending condition */
16612 if (stop_at_1 && RExC_end > RExC_parse) {
16613 stop_ptr = RExC_parse + 1;
16616 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16617 if (UCHARAT(RExC_parse) == ']')
16618 goto charclassloop;
16622 if ( posix_warnings
16623 && av_tindex_skip_len_mg(posix_warnings) >= 0
16624 && RExC_parse > not_posix_region_end)
16626 /* Warnings about posix class issues are considered tentative until
16627 * we are far enough along in the parse that we can no longer
16628 * change our mind, at which point we output them. This is done
16629 * each time through the loop so that a later class won't zap them
16630 * before they have been dealt with. */
16631 output_posix_warnings(pRExC_state, posix_warnings);
16634 if (RExC_parse >= stop_ptr) {
16638 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16640 if (UCHARAT(RExC_parse) == ']') {
16646 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16647 save_value = value;
16648 save_prevvalue = prevvalue;
16651 rangebegin = RExC_parse;
16653 non_portable_endpoint = 0;
16655 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16656 value = utf8n_to_uvchr((U8*)RExC_parse,
16657 RExC_end - RExC_parse,
16658 &numlen, UTF8_ALLOW_DEFAULT);
16659 RExC_parse += numlen;
16662 value = UCHARAT(RExC_parse++);
16664 if (value == '[') {
16665 char * posix_class_end;
16666 namedclass = handle_possible_posix(pRExC_state,
16669 do_posix_warnings ? &posix_warnings : NULL,
16670 FALSE /* die if error */);
16671 if (namedclass > OOB_NAMEDCLASS) {
16673 /* If there was an earlier attempt to parse this particular
16674 * posix class, and it failed, it was a false alarm, as this
16675 * successful one proves */
16676 if ( posix_warnings
16677 && av_tindex_skip_len_mg(posix_warnings) >= 0
16678 && not_posix_region_end >= RExC_parse
16679 && not_posix_region_end <= posix_class_end)
16681 av_undef(posix_warnings);
16684 RExC_parse = posix_class_end;
16686 else if (namedclass == OOB_NAMEDCLASS) {
16687 not_posix_region_end = posix_class_end;
16690 namedclass = OOB_NAMEDCLASS;
16693 else if ( RExC_parse - 1 > not_posix_region_end
16694 && MAYBE_POSIXCC(value))
16696 (void) handle_possible_posix(
16698 RExC_parse - 1, /* -1 because parse has already been
16700 ¬_posix_region_end,
16701 do_posix_warnings ? &posix_warnings : NULL,
16702 TRUE /* checking only */);
16704 else if ( strict && ! skip_white
16705 && ( _generic_isCC(value, _CC_VERTSPACE)
16706 || is_VERTWS_cp_high(value)))
16708 vFAIL("Literal vertical space in [] is illegal except under /x");
16710 else if (value == '\\') {
16711 /* Is a backslash; get the code point of the char after it */
16713 if (RExC_parse >= RExC_end) {
16714 vFAIL("Unmatched [");
16717 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16718 value = utf8n_to_uvchr((U8*)RExC_parse,
16719 RExC_end - RExC_parse,
16720 &numlen, UTF8_ALLOW_DEFAULT);
16721 RExC_parse += numlen;
16724 value = UCHARAT(RExC_parse++);
16726 /* Some compilers cannot handle switching on 64-bit integer
16727 * values, therefore value cannot be an UV. Yes, this will
16728 * be a problem later if we want switch on Unicode.
16729 * A similar issue a little bit later when switching on
16730 * namedclass. --jhi */
16732 /* If the \ is escaping white space when white space is being
16733 * skipped, it means that that white space is wanted literally, and
16734 * is already in 'value'. Otherwise, need to translate the escape
16735 * into what it signifies. */
16736 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16738 case 'w': namedclass = ANYOF_WORDCHAR; break;
16739 case 'W': namedclass = ANYOF_NWORDCHAR; break;
16740 case 's': namedclass = ANYOF_SPACE; break;
16741 case 'S': namedclass = ANYOF_NSPACE; break;
16742 case 'd': namedclass = ANYOF_DIGIT; break;
16743 case 'D': namedclass = ANYOF_NDIGIT; break;
16744 case 'v': namedclass = ANYOF_VERTWS; break;
16745 case 'V': namedclass = ANYOF_NVERTWS; break;
16746 case 'h': namedclass = ANYOF_HORIZWS; break;
16747 case 'H': namedclass = ANYOF_NHORIZWS; break;
16748 case 'N': /* Handle \N{NAME} in class */
16750 const char * const backslash_N_beg = RExC_parse - 2;
16753 if (! grok_bslash_N(pRExC_state,
16754 NULL, /* No regnode */
16755 &value, /* Yes single value */
16756 &cp_count, /* Multiple code pt count */
16762 if (*flagp & NEED_UTF8)
16763 FAIL("panic: grok_bslash_N set NEED_UTF8");
16765 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
16767 if (cp_count < 0) {
16768 vFAIL("\\N in a character class must be a named character: \\N{...}");
16770 else if (cp_count == 0) {
16771 ckWARNreg(RExC_parse,
16772 "Ignoring zero length \\N{} in character class");
16774 else { /* cp_count > 1 */
16775 if (! RExC_in_multi_char_class) {
16776 if (invert || range || *RExC_parse == '-') {
16779 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
16781 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
16782 break; /* <value> contains the first code
16783 point. Drop out of the switch to
16787 SV * multi_char_N = newSVpvn(backslash_N_beg,
16788 RExC_parse - backslash_N_beg);
16790 = add_multi_match(multi_char_matches,
16795 } /* End of cp_count != 1 */
16797 /* This element should not be processed further in this
16800 value = save_value;
16801 prevvalue = save_prevvalue;
16802 continue; /* Back to top of loop to get next char */
16805 /* Here, is a single code point, and <value> contains it */
16806 unicode_range = TRUE; /* \N{} are Unicode */
16815 /* We will handle any undefined properties ourselves */
16816 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
16817 /* And we actually would prefer to get
16818 * the straight inversion list of the
16819 * swash, since we will be accessing it
16820 * anyway, to save a little time */
16821 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
16823 SvREFCNT_dec(swash); /* Free any left-overs */
16825 /* \p means they want Unicode semantics */
16826 REQUIRE_UNI_RULES(flagp, 0);
16828 if (RExC_parse >= RExC_end)
16829 vFAIL2("Empty \\%c", (U8)value);
16830 if (*RExC_parse == '{') {
16831 const U8 c = (U8)value;
16832 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
16835 vFAIL2("Missing right brace on \\%c{}", c);
16840 /* White space is allowed adjacent to the braces and after
16841 * any '^', even when not under /x */
16842 while (isSPACE(*RExC_parse)) {
16846 if (UCHARAT(RExC_parse) == '^') {
16848 /* toggle. (The rhs xor gets the single bit that
16849 * differs between P and p; the other xor inverts just
16851 value ^= 'P' ^ 'p';
16854 while (isSPACE(*RExC_parse)) {
16859 if (e == RExC_parse)
16860 vFAIL2("Empty \\%c{}", c);
16862 n = e - RExC_parse;
16863 while (isSPACE(*(RExC_parse + n - 1)))
16866 } /* The \p isn't immediately followed by a '{' */
16867 else if (! isALPHA(*RExC_parse)) {
16868 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16869 vFAIL2("Character following \\%c must be '{' or a "
16870 "single-character Unicode property name",
16878 char* name = RExC_parse;
16879 char* base_name; /* name after any packages are stripped */
16880 char* lookup_name = NULL;
16881 const char * const colon_colon = "::";
16886 /* Temporary workaround for [perl #133136]. For this
16887 * precise input that is in the .t that is failing, load
16888 * utf8.pm, which is what the test wants, so that that
16890 if ( memEQs(RExC_start, e + 1 - RExC_start,
16892 && ! hv_common(GvHVn(PL_incgv),
16894 "utf8.pm", sizeof("utf8.pm") - 1,
16895 0, HV_FETCH_ISEXISTS, NULL, 0))
16897 require_pv("utf8.pm");
16899 invlist = parse_uniprop_string(name, n, FOLD, &invert);
16902 value ^= 'P' ^ 'p';
16907 /* Try to get the definition of the property into
16908 * <invlist>. If /i is in effect, the effective property
16909 * will have its name be <__NAME_i>. The design is
16910 * discussed in commit
16911 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
16912 name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse));
16915 for (i = RExC_parse; i < RExC_parse + n; i++) {
16916 if (isCNTRL(*i) && *i != '\t') {
16917 RExC_parse = e + 1;
16918 vFAIL2("Can't find Unicode property definition \"%s\"", name);
16923 lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name));
16925 /* The function call just below that uses this can fail
16926 * to return, leaking memory if we don't do this */
16927 SAVEFREEPV(lookup_name);
16930 /* Look up the property name, and get its swash and
16931 * inversion list, if the property is found */
16932 swash = _core_swash_init("utf8",
16939 NULL, /* No inversion list */
16942 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
16943 HV* curpkg = (IN_PERL_COMPILETIME)
16945 : CopSTASH(PL_curcop);
16949 if (swash) { /* Got a swash but no inversion list.
16950 Something is likely wrong that will
16951 be sorted-out later */
16952 SvREFCNT_dec_NN(swash);
16956 /* Here didn't find it. It could be a an error (like a
16957 * typo) in specifying a Unicode property, or it could
16958 * be a user-defined property that will be available at
16959 * run-time. The names of these must begin with 'In'
16960 * or 'Is' (after any packages are stripped off). So
16961 * if not one of those, or if we accept only
16962 * compile-time properties, is an error; otherwise add
16963 * it to the list for run-time look up. */
16964 if ((base_name = rninstr(name, name + n,
16965 colon_colon, colon_colon + 2)))
16966 { /* Has ::. We know this must be a user-defined
16969 final_n -= base_name - name;
16978 || base_name[0] != 'I'
16979 || (base_name[1] != 's' && base_name[1] != 'n')
16982 const char * const msg
16984 ? "Illegal user-defined property name"
16985 : "Can't find Unicode property definition";
16986 RExC_parse = e + 1;
16988 /* diag_listed_as: Can't find Unicode property definition "%s" */
16989 vFAIL3utf8f("%s \"%" UTF8f "\"",
16990 msg, UTF8fARG(UTF, n, name));
16993 /* If the property name doesn't already have a package
16994 * name, add the current one to it so that it can be
16995 * referred to outside it. [perl #121777] */
16996 if (! has_pkg && curpkg) {
16997 char* pkgname = HvNAME(curpkg);
16998 if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
16999 char* full_name = Perl_form(aTHX_
17003 n = strlen(full_name);
17004 name = savepvn(full_name, n);
17008 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n",
17009 (value == 'p' ? '+' : '!'),
17010 (FOLD) ? "__" : "",
17011 UTF8fARG(UTF, n, name),
17012 (FOLD) ? "_i" : "");
17013 has_user_defined_property = TRUE;
17014 optimizable = FALSE; /* Will have to leave this an
17017 /* We don't know yet what this matches, so have to flag
17019 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17023 /* Here, did get the swash and its inversion list. If
17024 * the swash is from a user-defined property, then this
17025 * whole character class should be regarded as such */
17026 if (swash_init_flags
17027 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
17029 has_user_defined_property = TRUE;
17034 if (! has_user_defined_property &&
17035 /* We warn on matching an above-Unicode code point
17036 * if the match would return true, except don't
17037 * warn for \p{All}, which has exactly one element
17039 (_invlist_contains_cp(invlist, 0x110000)
17040 && (! (_invlist_len(invlist) == 1
17041 && *invlist_array(invlist) == 0))))
17046 /* Invert if asking for the complement */
17047 if (value == 'P') {
17048 _invlist_union_complement_2nd(properties,
17052 /* The swash can't be used as-is, because we've
17053 * inverted things; delay removing it to here after
17054 * have copied its invlist above */
17056 SvREFCNT_dec_NN(invlist);
17058 SvREFCNT_dec(swash);
17062 _invlist_union(properties, invlist, &properties);
17064 SvREFCNT_dec_NN(invlist);
17070 RExC_parse = e + 1;
17071 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17075 case 'n': value = '\n'; break;
17076 case 'r': value = '\r'; break;
17077 case 't': value = '\t'; break;
17078 case 'f': value = '\f'; break;
17079 case 'b': value = '\b'; break;
17080 case 'e': value = ESC_NATIVE; break;
17081 case 'a': value = '\a'; break;
17083 RExC_parse--; /* function expects to be pointed at the 'o' */
17085 const char* error_msg;
17086 bool valid = grok_bslash_o(&RExC_parse,
17090 TO_OUTPUT_WARNINGS(RExC_parse),
17092 silence_non_portable,
17097 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17099 non_portable_endpoint++;
17102 RExC_parse--; /* function expects to be pointed at the 'x' */
17104 const char* error_msg;
17105 bool valid = grok_bslash_x(&RExC_parse,
17109 TO_OUTPUT_WARNINGS(RExC_parse),
17111 silence_non_portable,
17116 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17118 non_portable_endpoint++;
17121 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17122 UPDATE_WARNINGS_LOC(RExC_parse);
17124 non_portable_endpoint++;
17126 case '0': case '1': case '2': case '3': case '4':
17127 case '5': case '6': case '7':
17129 /* Take 1-3 octal digits */
17130 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17131 numlen = (strict) ? 4 : 3;
17132 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17133 RExC_parse += numlen;
17136 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
17137 vFAIL("Need exactly 3 octal digits");
17139 else if ( numlen < 3 /* like \08, \178 */
17140 && RExC_parse < RExC_end
17141 && isDIGIT(*RExC_parse)
17142 && ckWARN(WARN_REGEXP))
17144 reg_warn_non_literal_string(
17146 form_short_octal_warning(RExC_parse, numlen));
17149 non_portable_endpoint++;
17153 /* Allow \_ to not give an error */
17154 if (isWORDCHAR(value) && value != '_') {
17156 vFAIL2("Unrecognized escape \\%c in character class",
17160 ckWARN2reg(RExC_parse,
17161 "Unrecognized escape \\%c in character class passed through",
17166 } /* End of switch on char following backslash */
17167 } /* end of handling backslash escape sequences */
17169 /* Here, we have the current token in 'value' */
17171 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17174 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
17175 * literal, as is the character that began the false range, i.e.
17176 * the 'a' in the examples */
17178 const int w = (RExC_parse >= rangebegin)
17179 ? RExC_parse - rangebegin
17183 "False [] range \"%" UTF8f "\"",
17184 UTF8fARG(UTF, w, rangebegin));
17187 ckWARN2reg(RExC_parse,
17188 "False [] range \"%" UTF8f "\"",
17189 UTF8fARG(UTF, w, rangebegin));
17190 cp_list = add_cp_to_invlist(cp_list, '-');
17191 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17195 range = 0; /* this was not a true range */
17196 element_count += 2; /* So counts for three values */
17199 classnum = namedclass_to_classnum(namedclass);
17201 if (LOC && namedclass < ANYOF_POSIXL_MAX
17202 #ifndef HAS_ISASCII
17203 && classnum != _CC_ASCII
17206 SV* scratch_list = NULL;
17208 /* What the Posix classes (like \w, [:space:]) match in locale
17209 * isn't knowable under locale until actual match time. Room
17210 * must be reserved (one time per outer bracketed class) to
17211 * store such classes. The space will contain a bit for each
17212 * named class that is to be matched against. This isn't
17213 * needed for \p{} and pseudo-classes, as they are not affected
17214 * by locale, and hence are dealt with separately */
17215 if (! need_class) {
17217 anyof_flags |= ANYOF_MATCHES_POSIXL;
17219 /* We can't change this into some other type of node
17220 * (unless this is the only element, in which case there
17221 * are nodes that mean exactly this) as has runtime
17223 optimizable = FALSE;
17226 /* Coverity thinks it is possible for this to be negative; both
17227 * jhi and khw think it's not, but be safer */
17228 assert(! (anyof_flags & ANYOF_MATCHES_POSIXL)
17229 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
17231 /* See if it already matches the complement of this POSIX
17233 if ( (anyof_flags & ANYOF_MATCHES_POSIXL)
17234 && POSIXL_TEST(posixl, namedclass + ((namedclass % 2)
17238 posixl_matches_all = TRUE;
17239 break; /* No need to continue. Since it matches both
17240 e.g., \w and \W, it matches everything, and the
17241 bracketed class can be optimized into qr/./s */
17244 /* Add this class to those that should be checked at runtime */
17245 POSIXL_SET(posixl, namedclass);
17247 /* The above-Latin1 characters are not subject to locale rules.
17248 * Just add them to the unconditionally-matched list */
17250 /* Get the list of the above-Latin1 code points this matches */
17251 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17252 PL_XPosix_ptrs[classnum],
17254 /* Odd numbers are complements, like
17255 * NDIGIT, NASCII, ... */
17256 namedclass % 2 != 0,
17258 /* Checking if 'cp_list' is NULL first saves an extra clone.
17259 * Its reference count will be decremented at the next union,
17260 * etc, or if this is the only instance, at the end of the
17263 cp_list = scratch_list;
17266 _invlist_union(cp_list, scratch_list, &cp_list);
17267 SvREFCNT_dec_NN(scratch_list);
17269 continue; /* Go get next character */
17273 /* Here, is not /l, or is a POSIX class for which /l doesn't
17274 * matter (or is a Unicode property, which is skipped here). */
17275 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
17276 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17278 /* Here, should be \h, \H, \v, or \V. None of /d, /i
17279 * nor /l make a difference in what these match,
17280 * therefore we just add what they match to cp_list. */
17281 if (classnum != _CC_VERTSPACE) {
17282 assert( namedclass == ANYOF_HORIZWS
17283 || namedclass == ANYOF_NHORIZWS);
17285 /* It turns out that \h is just a synonym for
17287 classnum = _CC_BLANK;
17290 _invlist_union_maybe_complement_2nd(
17292 PL_XPosix_ptrs[classnum],
17293 namedclass % 2 != 0, /* Complement if odd
17294 (NHORIZWS, NVERTWS)
17299 else if ( UNI_SEMANTICS
17300 || AT_LEAST_ASCII_RESTRICTED
17301 || classnum == _CC_ASCII
17302 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
17303 || classnum == _CC_XDIGIT)))
17305 /* We usually have to worry about /d affecting what POSIX
17306 * classes match, with special code needed because we won't
17307 * know until runtime what all matches. But there is no
17308 * extra work needed under /u and /a; and [:ascii:] is
17309 * unaffected by /d; and :digit: and :xdigit: don't have
17310 * runtime differences under /d. So we can special case
17311 * these, and avoid some extra work below, and at runtime.
17313 _invlist_union_maybe_complement_2nd(
17315 ((AT_LEAST_ASCII_RESTRICTED)
17316 ? PL_Posix_ptrs[classnum]
17317 : PL_XPosix_ptrs[classnum]),
17318 namedclass % 2 != 0,
17321 else { /* Garden variety class. If is NUPPER, NALPHA, ...
17322 complement and use nposixes */
17323 SV** posixes_ptr = namedclass % 2 == 0
17326 _invlist_union_maybe_complement_2nd(
17328 PL_XPosix_ptrs[classnum],
17329 namedclass % 2 != 0,
17333 } /* end of namedclass \blah */
17335 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17337 /* If 'range' is set, 'value' is the ending of a range--check its
17338 * validity. (If value isn't a single code point in the case of a
17339 * range, we should have figured that out above in the code that
17340 * catches false ranges). Later, we will handle each individual code
17341 * point in the range. If 'range' isn't set, this could be the
17342 * beginning of a range, so check for that by looking ahead to see if
17343 * the next real character to be processed is the range indicator--the
17348 /* For unicode ranges, we have to test that the Unicode as opposed
17349 * to the native values are not decreasing. (Above 255, there is
17350 * no difference between native and Unicode) */
17351 if (unicode_range && prevvalue < 255 && value < 255) {
17352 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17353 goto backwards_range;
17358 if (prevvalue > value) /* b-a */ {
17363 w = RExC_parse - rangebegin;
17365 "Invalid [] range \"%" UTF8f "\"",
17366 UTF8fARG(UTF, w, rangebegin));
17367 NOT_REACHED; /* NOTREACHED */
17371 prevvalue = value; /* save the beginning of the potential range */
17372 if (! stop_at_1 /* Can't be a range if parsing just one thing */
17373 && *RExC_parse == '-')
17375 char* next_char_ptr = RExC_parse + 1;
17377 /* Get the next real char after the '-' */
17378 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17380 /* If the '-' is at the end of the class (just before the ']',
17381 * it is a literal minus; otherwise it is a range */
17382 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17383 RExC_parse = next_char_ptr;
17385 /* a bad range like \w-, [:word:]- ? */
17386 if (namedclass > OOB_NAMEDCLASS) {
17387 if (strict || ckWARN(WARN_REGEXP)) {
17388 const int w = RExC_parse >= rangebegin
17389 ? RExC_parse - rangebegin
17392 vFAIL4("False [] range \"%*.*s\"",
17397 "False [] range \"%*.*s\"",
17401 cp_list = add_cp_to_invlist(cp_list, '-');
17404 range = 1; /* yeah, it's a range! */
17405 continue; /* but do it the next time */
17410 if (namedclass > OOB_NAMEDCLASS) {
17414 /* Here, we have a single value this time through the loop, and
17415 * <prevvalue> is the beginning of the range, if any; or <value> if
17418 /* non-Latin1 code point implies unicode semantics. */
17420 REQUIRE_UNI_RULES(flagp, 0);
17423 /* Ready to process either the single value, or the completed range.
17424 * For single-valued non-inverted ranges, we consider the possibility
17425 * of multi-char folds. (We made a conscious decision to not do this
17426 * for the other cases because it can often lead to non-intuitive
17427 * results. For example, you have the peculiar case that:
17428 * "s s" =~ /^[^\xDF]+$/i => Y
17429 * "ss" =~ /^[^\xDF]+$/i => N
17431 * See [perl #89750] */
17432 if (FOLD && allow_multi_folds && value == prevvalue) {
17433 if (value == LATIN_SMALL_LETTER_SHARP_S
17434 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17437 /* Here <value> is indeed a multi-char fold. Get what it is */
17439 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17442 UV folded = _to_uni_fold_flags(
17446 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17447 ? FOLD_FLAGS_NOMIX_ASCII
17451 /* Here, <folded> should be the first character of the
17452 * multi-char fold of <value>, with <foldbuf> containing the
17453 * whole thing. But, if this fold is not allowed (because of
17454 * the flags), <fold> will be the same as <value>, and should
17455 * be processed like any other character, so skip the special
17457 if (folded != value) {
17459 /* Skip if we are recursed, currently parsing the class
17460 * again. Otherwise add this character to the list of
17461 * multi-char folds. */
17462 if (! RExC_in_multi_char_class) {
17463 STRLEN cp_count = utf8_length(foldbuf,
17464 foldbuf + foldlen);
17465 SV* multi_fold = sv_2mortal(newSVpvs(""));
17467 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17470 = add_multi_match(multi_char_matches,
17476 /* This element should not be processed further in this
17479 value = save_value;
17480 prevvalue = save_prevvalue;
17486 if (strict && ckWARN(WARN_REGEXP)) {
17489 /* If the range starts above 255, everything is portable and
17490 * likely to be so for any forseeable character set, so don't
17492 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17493 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17495 else if (prevvalue != value) {
17497 /* Under strict, ranges that stop and/or end in an ASCII
17498 * printable should have each end point be a portable value
17499 * for it (preferably like 'A', but we don't warn if it is
17500 * a (portable) Unicode name or code point), and the range
17501 * must be be all digits or all letters of the same case.
17502 * Otherwise, the range is non-portable and unclear as to
17503 * what it contains */
17504 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
17505 && ( non_portable_endpoint
17506 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17507 || (isLOWER_A(prevvalue) && isLOWER_A(value))
17508 || (isUPPER_A(prevvalue) && isUPPER_A(value))
17510 vWARN(RExC_parse, "Ranges of ASCII printables should"
17511 " be some subset of \"0-9\","
17512 " \"A-Z\", or \"a-z\"");
17514 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17515 SSize_t index_start;
17516 SSize_t index_final;
17518 /* But the nature of Unicode and languages mean we
17519 * can't do the same checks for above-ASCII ranges,
17520 * except in the case of digit ones. These should
17521 * contain only digits from the same group of 10. The
17522 * ASCII case is handled just above. Hence here, the
17523 * range could be a range of digits. First some
17524 * unlikely special cases. Grandfather in that a range
17525 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17526 * if its starting value is one of the 10 digits prior
17527 * to it. This is because it is an alternate way of
17528 * writing 19D1, and some people may expect it to be in
17529 * that group. But it is bad, because it won't give
17530 * the expected results. In Unicode 5.2 it was
17531 * considered to be in that group (of 11, hence), but
17532 * this was fixed in the next version */
17534 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17535 goto warn_bad_digit_range;
17537 else if (UNLIKELY( prevvalue >= 0x1D7CE
17538 && value <= 0x1D7FF))
17540 /* This is the only other case currently in Unicode
17541 * where the algorithm below fails. The code
17542 * points just above are the end points of a single
17543 * range containing only decimal digits. It is 5
17544 * different series of 0-9. All other ranges of
17545 * digits currently in Unicode are just a single
17546 * series. (And mktables will notify us if a later
17547 * Unicode version breaks this.)
17549 * If the range being checked is at most 9 long,
17550 * and the digit values represented are in
17551 * numerical order, they are from the same series.
17553 if ( value - prevvalue > 9
17554 || ((( value - 0x1D7CE) % 10)
17555 <= (prevvalue - 0x1D7CE) % 10))
17557 goto warn_bad_digit_range;
17562 /* For all other ranges of digits in Unicode, the
17563 * algorithm is just to check if both end points
17564 * are in the same series, which is the same range.
17566 index_start = _invlist_search(
17567 PL_XPosix_ptrs[_CC_DIGIT],
17570 /* Warn if the range starts and ends with a digit,
17571 * and they are not in the same group of 10. */
17572 if ( index_start >= 0
17573 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17575 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17576 value)) != index_start
17577 && index_final >= 0
17578 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17580 warn_bad_digit_range:
17581 vWARN(RExC_parse, "Ranges of digits should be"
17582 " from the same group of"
17589 if ((! range || prevvalue == value) && non_portable_endpoint) {
17590 if (isPRINT_A(value)) {
17593 if (isBACKSLASHED_PUNCT(value)) {
17594 literal[d++] = '\\';
17596 literal[d++] = (char) value;
17597 literal[d++] = '\0';
17600 "\"%.*s\" is more clearly written simply as \"%s\"",
17601 (int) (RExC_parse - rangebegin),
17606 else if isMNEMONIC_CNTRL(value) {
17608 "\"%.*s\" is more clearly written simply as \"%s\"",
17609 (int) (RExC_parse - rangebegin),
17611 cntrl_to_mnemonic((U8) value)
17617 /* Deal with this element of the class */
17620 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17623 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
17624 * that don't require special handling, we can just add the range like
17625 * we do for ASCII platforms */
17626 if ((UNLIKELY(prevvalue == 0) && value >= 255)
17627 || ! (prevvalue < 256
17629 || (! non_portable_endpoint
17630 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17631 || (isUPPER_A(prevvalue)
17632 && isUPPER_A(value)))))))
17634 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17638 /* Here, requires special handling. This can be because it is a
17639 * range whose code points are considered to be Unicode, and so
17640 * must be individually translated into native, or because its a
17641 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
17642 * EBCDIC, but we have defined them to include only the "expected"
17643 * upper or lower case ASCII alphabetics. Subranges above 255 are
17644 * the same in native and Unicode, so can be added as a range */
17645 U8 start = NATIVE_TO_LATIN1(prevvalue);
17647 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17648 for (j = start; j <= end; j++) {
17649 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17652 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17658 range = 0; /* this range (if it was one) is done now */
17659 } /* End of loop through all the text within the brackets */
17661 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17662 output_posix_warnings(pRExC_state, posix_warnings);
17665 /* If anything in the class expands to more than one character, we have to
17666 * deal with them by building up a substitute parse string, and recursively
17667 * calling reg() on it, instead of proceeding */
17668 if (multi_char_matches) {
17669 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17672 char *save_end = RExC_end;
17673 char *save_parse = RExC_parse;
17674 char *save_start = RExC_start;
17675 Size_t constructed_prefix_len = 0; /* This gives the length of the
17676 constructed portion of the
17677 substitute parse. */
17678 bool first_time = TRUE; /* First multi-char occurrence doesn't get
17683 /* Only one level of recursion allowed */
17684 assert(RExC_copy_start_in_constructed == RExC_precomp);
17686 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
17687 because too confusing */
17689 sv_catpvs(substitute_parse, "(?:");
17693 /* Look at the longest folds first */
17694 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17699 if (av_exists(multi_char_matches, cp_count)) {
17700 AV** this_array_ptr;
17703 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17705 while ((this_sequence = av_pop(*this_array_ptr)) !=
17708 if (! first_time) {
17709 sv_catpvs(substitute_parse, "|");
17711 first_time = FALSE;
17713 sv_catpv(substitute_parse, SvPVX(this_sequence));
17718 /* If the character class contains anything else besides these
17719 * multi-character folds, have to include it in recursive parsing */
17720 if (element_count) {
17721 sv_catpvs(substitute_parse, "|[");
17722 constructed_prefix_len = SvCUR(substitute_parse);
17723 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17725 /* Put in a closing ']' only if not going off the end, as otherwise
17726 * we are adding something that really isn't there */
17727 if (RExC_parse < RExC_end) {
17728 sv_catpvs(substitute_parse, "]");
17732 sv_catpvs(substitute_parse, ")");
17735 /* This is a way to get the parse to skip forward a whole named
17736 * sequence instead of matching the 2nd character when it fails the
17738 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17742 /* Set up the data structure so that any errors will be properly
17743 * reported. See the comments at the definition of
17744 * REPORT_LOCATION_ARGS for details */
17745 RExC_copy_start_in_input = (char *) orig_parse;
17746 RExC_start = RExC_parse = SvPV(substitute_parse, len);
17747 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
17748 RExC_end = RExC_parse + len;
17749 RExC_in_multi_char_class = 1;
17751 ret = reg(pRExC_state, 1, ®_flags, depth+1);
17753 *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
17755 /* And restore so can parse the rest of the pattern */
17756 RExC_parse = save_parse;
17757 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
17758 RExC_end = save_end;
17759 RExC_in_multi_char_class = 0;
17760 SvREFCNT_dec_NN(multi_char_matches);
17764 /* If folding, we calculate all characters that could fold to or from the
17765 * ones already on the list */
17766 if (cp_foldable_list) {
17768 UV start, end; /* End points of code point ranges */
17770 SV* fold_intersection = NULL;
17773 /* Our calculated list will be for Unicode rules. For locale
17774 * matching, we have to keep a separate list that is consulted at
17775 * runtime only when the locale indicates Unicode rules. For
17776 * non-locale, we just use the general list */
17778 use_list = &only_utf8_locale_list;
17781 use_list = &cp_list;
17784 /* Only the characters in this class that participate in folds need
17785 * be checked. Get the intersection of this class and all the
17786 * possible characters that are foldable. This can quickly narrow
17787 * down a large class */
17788 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
17789 &fold_intersection);
17791 /* Now look at the foldable characters in this class individually */
17792 invlist_iterinit(fold_intersection);
17793 while (invlist_iternext(fold_intersection, &start, &end)) {
17797 /* Look at every character in the range */
17798 for (j = start; j <= end; j++) {
17799 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17802 Size_t folds_to_count;
17803 unsigned int first_folds_to;
17804 const unsigned int * remaining_folds_to_list;
17808 if (IS_IN_SOME_FOLD_L1(j)) {
17810 /* ASCII is always matched; non-ASCII is matched
17811 * only under Unicode rules (which could happen
17812 * under /l if the locale is a UTF-8 one */
17813 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17814 *use_list = add_cp_to_invlist(*use_list,
17815 PL_fold_latin1[j]);
17818 has_upper_latin1_only_utf8_matches
17819 = add_cp_to_invlist(
17820 has_upper_latin1_only_utf8_matches,
17821 PL_fold_latin1[j]);
17825 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17826 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17828 add_above_Latin1_folds(pRExC_state,
17835 /* Here is an above Latin1 character. We don't have the
17836 * rules hard-coded for it. First, get its fold. This is
17837 * the simple fold, as the multi-character folds have been
17838 * handled earlier and separated out */
17839 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
17840 (ASCII_FOLD_RESTRICTED)
17841 ? FOLD_FLAGS_NOMIX_ASCII
17844 /* Single character fold of above Latin1. Add everything
17845 * in its fold closure to the list that this node should
17847 folds_to_count = _inverse_folds(folded, &first_folds_to,
17848 &remaining_folds_to_list);
17849 for (k = 0; k <= folds_to_count; k++) {
17850 UV c = (k == 0) /* First time through use itself */
17852 : (k == 1) /* 2nd time use, the first fold */
17855 /* Then the remaining ones */
17856 : remaining_folds_to_list[k-2];
17858 /* /aa doesn't allow folds between ASCII and non- */
17859 if (( ASCII_FOLD_RESTRICTED
17860 && (isASCII(c) != isASCII(j))))
17865 /* Folds under /l which cross the 255/256 boundary are
17866 * added to a separate list. (These are valid only
17867 * when the locale is UTF-8.) */
17868 if (c < 256 && LOC) {
17869 *use_list = add_cp_to_invlist(*use_list, c);
17873 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
17875 cp_list = add_cp_to_invlist(cp_list, c);
17878 /* Similarly folds involving non-ascii Latin1
17879 * characters under /d are added to their list */
17880 has_upper_latin1_only_utf8_matches
17881 = add_cp_to_invlist(
17882 has_upper_latin1_only_utf8_matches,
17888 SvREFCNT_dec_NN(fold_intersection);
17891 /* Now that we have finished adding all the folds, there is no reason
17892 * to keep the foldable list separate */
17893 _invlist_union(cp_list, cp_foldable_list, &cp_list);
17894 SvREFCNT_dec_NN(cp_foldable_list);
17897 /* And combine the result (if any) with any inversion lists from posix
17898 * classes. The lists are kept separate up to now because we don't want to
17899 * fold the classes (folding of those is automatically handled by the swash
17900 * fetching code) */
17901 if (simple_posixes) { /* These are the classes known to be unaffected by
17904 _invlist_union(cp_list, simple_posixes, &cp_list);
17905 SvREFCNT_dec_NN(simple_posixes);
17908 cp_list = simple_posixes;
17911 if (posixes || nposixes) {
17912 if (! DEPENDS_SEMANTICS) {
17914 /* For everything but /d, we can just add the current 'posixes' and
17915 * 'nposixes' to the main list */
17918 _invlist_union(cp_list, posixes, &cp_list);
17919 SvREFCNT_dec_NN(posixes);
17927 _invlist_union(cp_list, nposixes, &cp_list);
17928 SvREFCNT_dec_NN(nposixes);
17931 cp_list = nposixes;
17936 /* Under /d, things like \w match upper Latin1 characters only if
17937 * the target string is in UTF-8. But things like \W match all the
17938 * upper Latin1 characters if the target string is not in UTF-8.
17940 * Handle the case where there something like \W separately */
17942 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
17944 /* A complemented posix class matches all upper Latin1
17945 * characters if not in UTF-8. And it matches just certain
17946 * ones when in UTF-8. That means those certain ones are
17947 * matched regardless, so can just be added to the
17948 * unconditional list */
17950 _invlist_union(cp_list, nposixes, &cp_list);
17951 SvREFCNT_dec_NN(nposixes);
17955 cp_list = nposixes;
17958 /* Likewise for 'posixes' */
17959 _invlist_union(posixes, cp_list, &cp_list);
17961 /* Likewise for anything else in the range that matched only
17963 if (has_upper_latin1_only_utf8_matches) {
17964 _invlist_union(cp_list,
17965 has_upper_latin1_only_utf8_matches,
17967 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
17968 has_upper_latin1_only_utf8_matches = NULL;
17971 /* If we don't match all the upper Latin1 characters regardless
17972 * of UTF-8ness, we have to set a flag to match the rest when
17974 _invlist_subtract(only_non_utf8_list, cp_list,
17975 &only_non_utf8_list);
17976 if (_invlist_len(only_non_utf8_list) != 0) {
17977 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
17979 SvREFCNT_dec_NN(only_non_utf8_list);
17982 /* Here there were no complemented posix classes. That means
17983 * the upper Latin1 characters in 'posixes' match only when the
17984 * target string is in UTF-8. So we have to add them to the
17985 * list of those types of code points, while adding the
17986 * remainder to the unconditional list.
17988 * First calculate what they are */
17989 SV* nonascii_but_latin1_properties = NULL;
17990 _invlist_intersection(posixes, PL_UpperLatin1,
17991 &nonascii_but_latin1_properties);
17993 /* And add them to the final list of such characters. */
17994 _invlist_union(has_upper_latin1_only_utf8_matches,
17995 nonascii_but_latin1_properties,
17996 &has_upper_latin1_only_utf8_matches);
17998 /* Remove them from what now becomes the unconditional list */
17999 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18002 /* And add those unconditional ones to the final list */
18004 _invlist_union(cp_list, posixes, &cp_list);
18005 SvREFCNT_dec_NN(posixes);
18012 SvREFCNT_dec(nonascii_but_latin1_properties);
18014 /* Get rid of any characters that we now know are matched
18015 * unconditionally from the conditional list, which may make
18016 * that list empty */
18017 _invlist_subtract(has_upper_latin1_only_utf8_matches,
18019 &has_upper_latin1_only_utf8_matches);
18020 if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
18021 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
18022 has_upper_latin1_only_utf8_matches = NULL;
18028 /* And combine the result (if any) with any inversion list from properties.
18029 * The lists are kept separate up to now so that we can distinguish the two
18030 * in regards to matching above-Unicode. A run-time warning is generated
18031 * if a Unicode property is matched against a non-Unicode code point. But,
18032 * we allow user-defined properties to match anything, without any warning,
18033 * and we also suppress the warning if there is a portion of the character
18034 * class that isn't a Unicode property, and which matches above Unicode, \W
18035 * or [\x{110000}] for example.
18036 * (Note that in this case, unlike the Posix one above, there is no
18037 * <has_upper_latin1_only_utf8_matches>, because having a Unicode property
18038 * forces Unicode semantics */
18042 /* If it matters to the final outcome, see if a non-property
18043 * component of the class matches above Unicode. If so, the
18044 * warning gets suppressed. This is true even if just a single
18045 * such code point is specified, as, though not strictly correct if
18046 * another such code point is matched against, the fact that they
18047 * are using above-Unicode code points indicates they should know
18048 * the issues involved */
18050 warn_super = ! (invert
18051 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18054 _invlist_union(properties, cp_list, &cp_list);
18055 SvREFCNT_dec_NN(properties);
18058 cp_list = properties;
18063 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18065 /* Because an ANYOF node is the only one that warns, this node
18066 * can't be optimized into something else */
18067 optimizable = FALSE;
18071 /* Here, we have calculated what code points should be in the character
18074 * Now we can see about various optimizations. Fold calculation (which we
18075 * did above) needs to take place before inversion. Otherwise /[^k]/i
18076 * would invert to include K, which under /i would match k, which it
18077 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18078 * folded until runtime */
18080 /* If we didn't do folding, it's because some information isn't available
18081 * until runtime; set the run-time fold flag for these. (We don't have to
18082 * worry about properties folding, as that is taken care of by the swash
18083 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
18084 * locales, or the class matches at least one 0-255 range code point */
18087 /* Some things on the list might be unconditionally included because of
18088 * other components. Remove them, and clean up the list if it goes to
18090 if (only_utf8_locale_list && cp_list) {
18091 _invlist_subtract(only_utf8_locale_list, cp_list,
18092 &only_utf8_locale_list);
18094 if (_invlist_len(only_utf8_locale_list) == 0) {
18095 SvREFCNT_dec_NN(only_utf8_locale_list);
18096 only_utf8_locale_list = NULL;
18099 if (only_utf8_locale_list) {
18102 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18104 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
18106 invlist_iterinit(cp_list);
18107 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
18108 anyof_flags |= ANYOFL_FOLD;
18110 invlist_iterfinish(cp_list);
18113 else if ( DEPENDS_SEMANTICS
18114 && ( has_upper_latin1_only_utf8_matches
18115 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18118 RExC_seen_d_op = TRUE;
18119 optimizable = FALSE;
18122 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
18123 * at compile time. Besides not inverting folded locale now, we can't
18124 * invert if there are things such as \w, which aren't known until runtime
18129 && ! (anyof_flags & (ANYOF_LOCALE_FLAGS))
18130 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
18132 _invlist_invert(cp_list);
18134 /* Any swash can't be used as-is, because we've inverted things */
18136 SvREFCNT_dec_NN(swash);
18140 /* Clear the invert flag since have just done it here */
18145 *ret_invlist = cp_list;
18146 SvREFCNT_dec(swash);
18151 /* Some character classes are equivalent to other nodes. Such nodes take
18152 * up less room and generally fewer operations to execute than ANYOF nodes.
18156 int posix_class = -1; /* Illegal value */
18157 U8 ANYOFM_mask = 0xFF;
18161 if (UNLIKELY(posixl_matches_all)) {
18164 else if (cp_list && ! invert) {
18166 invlist_iterinit(cp_list);
18167 if (! invlist_iternext(cp_list, &start, &end)) {
18169 /* Here, the list is empty. This happens, for example, when a
18170 * Unicode property that doesn't match anything is the only
18171 * element in the character class (perluniprops.pod notes such
18174 *flagp |= HASWIDTH|SIMPLE;
18176 else if (start == end) { /* The range is a single code point */
18177 if (! invlist_iternext(cp_list, &start, &end)
18179 /* Don't do this optimization if it would require
18180 * changing the pattern to UTF-8 */
18181 && (start < 256 || UTF))
18183 /* Here, the list contains a single code point. Can
18184 * optimize into an EXACTish node */
18195 /* A locale node under folding with one code point can
18196 * be an EXACTFL, as its fold won't be calculated until
18202 /* Here, we are generally folding, but there is only
18203 * one code point to match. If we have to, we use an
18204 * EXACT node, but it would be better for joining with
18205 * adjacent nodes in the optimization phase if we used
18206 * the same EXACTFish node that any such are likely to
18207 * be. We can do this iff the code point doesn't
18208 * participate in any folds. For example, an EXACTF of
18209 * a colon is the same as an EXACT one, since nothing
18210 * folds to or from a colon. */
18212 if (IS_IN_SOME_FOLD_L1(value)) {
18217 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
18222 /* If we haven't found the node type, above, it means
18223 * we can use the prevailing one */
18225 op = compute_EXACTish(pRExC_state);
18229 } /* End of first range contains just a single code point */
18230 else if (start == 0) {
18231 if (end == UV_MAX) {
18233 *flagp |= HASWIDTH|SIMPLE;
18236 else if (end == '\n' - 1
18237 && invlist_iternext(cp_list, &start, &end)
18238 && start == '\n' + 1 && end == UV_MAX)
18241 *flagp |= HASWIDTH|SIMPLE;
18245 invlist_iterfinish(cp_list);
18249 /* Here, didn't find an optimization. See if this matches any
18250 * of the POSIX classes. First try ASCII */
18252 if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
18254 *flagp |= HASWIDTH|SIMPLE;
18256 else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
18258 *flagp |= HASWIDTH|SIMPLE;
18262 /* Then try the other POSIX classes. The POSIXA ones are
18263 * about the same speed as ANYOF ops, but take less room;
18264 * the ones that have above-Latin1 code point matches are
18265 * somewhat faster than ANYOF. */
18267 for (posix_class = 0;
18268 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
18273 for (try_inverted = 0; try_inverted < 2; try_inverted++)
18276 /* Check if matches POSIXA, normal or inverted */
18277 if (PL_Posix_ptrs[posix_class]) {
18278 if (_invlistEQ(cp_list,
18279 PL_Posix_ptrs[posix_class],
18282 op = (try_inverted)
18285 *flagp |= HASWIDTH|SIMPLE;
18290 /* Check if matches POSIXU, normal or inverted */
18291 if (_invlistEQ(cp_list,
18292 PL_XPosix_ptrs[posix_class],
18295 op = (try_inverted)
18298 *flagp |= HASWIDTH|SIMPLE;
18306 /* If it didn't match a POSIX class, it might be able to be
18307 * turned into an ANYOFM node. Compare two different bytes,
18308 * bit-by-bit. In some positions, the bits in each will be 1;
18309 * and in other positions both will be 0; and in some positions
18310 * the bit will be 1 in one byte, and 0 in the other. Let 'n'
18311 * be the number of positions where the bits differ. We create
18312 * a mask which has exactly 'n' 0 bits, each in a position
18313 * where the two bytes differ. Now take the set of all bytes
18314 * that when ANDed with the mask yield the same result. That
18315 * set has 2**n elements, and is representable by just two 8
18316 * bit numbers: the result and the mask. Importantly, matching
18317 * the set can be vectorized by creating a word full of the
18318 * result bytes, and a word full of the mask bytes, yielding a
18319 * significant speed up. Here, see if this node matches such a
18320 * set. As a concrete example consider [01], and the byte
18321 * representing '0' which is 0x30 on ASCII machines. It has
18322 * the bits 0011 0000. Take the mask 1111 1110. If we AND
18323 * 0x31 and 0x30 with that mask we get 0x30. Any other bytes
18324 * ANDed yield something else. So [01], which is a common
18325 * usage, is optimizable into ANYOFM, and can benefit from the
18326 * speed up. We can only do this on UTF-8 invariant bytes,
18327 * because the variance would throw this off. */
18329 PERL_UINT_FAST8_T inverted = 0;
18331 const PERL_UINT_FAST8_T max_permissible = 0xFF;
18333 const PERL_UINT_FAST8_T max_permissible = 0x7F;
18335 if (invlist_highest(cp_list) > max_permissible) {
18336 _invlist_invert(cp_list);
18340 if (invlist_highest(cp_list) <= max_permissible) {
18341 Size_t cp_count = 0;
18342 bool first_time = TRUE;
18343 unsigned int lowest_cp = 0xFF;
18344 U8 bits_differing = 0;
18346 /* Only needed on EBCDIC, as there, variants and non- are mixed
18347 * together. Could #ifdef it out on ASCII, but probably the
18348 * compiler will optimize it out */
18349 bool has_variant = FALSE;
18351 /* Go through the bytes and find the bit positions that differ */
18352 invlist_iterinit(cp_list);
18353 while (invlist_iternext(cp_list, &start, &end)) {
18354 unsigned int i = start;
18356 cp_count += end - start + 1;
18359 if (! UVCHR_IS_INVARIANT(i)) {
18360 has_variant = TRUE;
18364 first_time = FALSE;
18370 /* Find the bit positions that differ from the lowest
18371 * code point in the node. Keep track of all such
18372 * positions by OR'ing */
18373 for (; i <= end; i++) {
18374 if (! UVCHR_IS_INVARIANT(i)) {
18375 has_variant = TRUE;
18379 bits_differing |= i ^ lowest_cp;
18382 invlist_iterfinish(cp_list);
18384 /* At the end of the loop, we count how many bits differ
18385 * from the bits in lowest code point, call the count 'd'.
18386 * If the set we found contains 2**d elements, it is the
18387 * closure of all code points that differ only in those bit
18388 * positions. To convince yourself of that, first note
18389 * that the number in the closure must be a power of 2,
18390 * which we test for. The only way we could have that
18391 * count and it be some differing set, is if we got some
18392 * code points that don't differ from the lowest code point
18393 * in any position, but do differ from each other in some
18394 * other position. That means one code point has a 1 in
18395 * that position, and another has a 0. But that would mean
18396 * that one of them differs from the lowest code point in
18397 * that position, which possibility we've already excluded.
18400 && cp_count == 1U << PL_bitcount[bits_differing])
18402 assert(inverted || cp_count > 1);
18403 op = ANYOFM + inverted;;
18405 /* We need to make the bits that differ be 0's */
18406 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS
18409 /* The argument is the lowest code point */
18410 anode_arg = lowest_cp;
18411 *flagp |= HASWIDTH|SIMPLE;
18415 _invlist_invert(cp_list);
18422 if (regarglen[op]) {
18423 ret = reganode(pRExC_state, op, anode_arg);
18425 ret = reg_node(pRExC_state, op);
18427 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
18428 RExC_parse - orig_parse);;
18430 if (PL_regkind[op] == EXACT) {
18431 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
18432 TRUE /* downgradable to EXACT */
18435 else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
18436 FLAGS(REGNODE_p(ret)) = posix_class;
18438 else if (PL_regkind[op] == ANYOFM) {
18439 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
18442 SvREFCNT_dec_NN(cp_list);
18445 } /* End of seeing if can optimize it into a different node */
18447 /* It's going to be an ANYOF node. */
18455 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
18456 FILL_NODE(ret, op); /* We set the argument later */
18457 RExC_emit += 1 + regarglen[op];
18458 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
18460 /* Here, <cp_list> contains all the code points we can determine at
18461 * compile time that match under all conditions. Go through it, and
18462 * for things that belong in the bitmap, put them there, and delete from
18463 * <cp_list>. While we are at it, see if everything above 255 is in the
18464 * list, and if so, set a flag to speed up execution */
18466 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
18469 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
18473 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
18476 /* Here, the bitmap has been populated with all the Latin1 code points that
18477 * always match. Can now add to the overall list those that match only
18478 * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>).
18480 if (has_upper_latin1_only_utf8_matches) {
18482 _invlist_union(cp_list,
18483 has_upper_latin1_only_utf8_matches,
18485 SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
18488 cp_list = has_upper_latin1_only_utf8_matches;
18490 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
18493 /* If there is a swash and more than one element, we can't use the swash in
18494 * the optimization below. */
18495 if (swash && element_count > 1) {
18496 SvREFCNT_dec_NN(swash);
18500 /* Note that the optimization of using 'swash' if it is the only thing in
18501 * the class doesn't have us change swash at all, so it can include things
18502 * that are also in the bitmap; otherwise we have purposely deleted that
18503 * duplicate information */
18504 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
18505 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
18507 only_utf8_locale_list,
18508 swash, has_user_defined_property);
18510 *flagp |= HASWIDTH|SIMPLE;
18512 if (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_LOCALE_FLAGS) {
18513 RExC_contains_locale = 1;
18519 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
18522 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
18523 regnode* const node,
18525 SV* const runtime_defns,
18526 SV* const only_utf8_locale_list,
18528 const bool has_user_defined_property)
18530 /* Sets the arg field of an ANYOF-type node 'node', using information about
18531 * the node passed-in. If there is nothing outside the node's bitmap, the
18532 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
18533 * the count returned by add_data(), having allocated and stored an array,
18534 * av, that that count references, as follows:
18535 * av[0] stores the character class description in its textual form.
18536 * This is used later (regexec.c:Perl_regclass_swash()) to
18537 * initialize the appropriate swash, and is also useful for dumping
18538 * the regnode. This is set to &PL_sv_undef if the textual
18539 * description is not needed at run-time (as happens if the other
18540 * elements completely define the class)
18541 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
18542 * computed from av[0]. But if no further computation need be done,
18543 * the swash is stored here now (and av[0] is &PL_sv_undef).
18544 * av[2] stores the inversion list of code points that match only if the
18545 * current locale is UTF-8
18546 * av[3] stores the cp_list inversion list for use in addition or instead
18547 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
18548 * (Otherwise everything needed is already in av[0] and av[1])
18549 * av[4] is set if any component of the class is from a user-defined
18550 * property; used only if av[3] exists */
18554 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
18556 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
18557 assert(! (ANYOF_FLAGS(node)
18558 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
18559 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
18562 AV * const av = newAV();
18565 av_store(av, 0, (runtime_defns)
18566 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
18569 av_store(av, 1, swash);
18570 SvREFCNT_dec_NN(cp_list);
18573 av_store(av, 1, &PL_sv_undef);
18575 av_store(av, 3, cp_list);
18576 av_store(av, 4, newSVuv(has_user_defined_property));
18580 if (only_utf8_locale_list) {
18581 av_store(av, 2, only_utf8_locale_list);
18584 av_store(av, 2, &PL_sv_undef);
18587 rv = newRV_noinc(MUTABLE_SV(av));
18588 n = add_data(pRExC_state, STR_WITH_LEN("s"));
18589 RExC_rxi->data->data[n] = (void*)rv;
18594 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
18596 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
18597 const regnode* node,
18600 SV** only_utf8_locale_ptr,
18601 SV** output_invlist)
18604 /* For internal core use only.
18605 * Returns the swash for the input 'node' in the regex 'prog'.
18606 * If <doinit> is 'true', will attempt to create the swash if not already
18608 * If <listsvp> is non-null, will return the printable contents of the
18609 * swash. This can be used to get debugging information even before the
18610 * swash exists, by calling this function with 'doinit' set to false, in
18611 * which case the components that will be used to eventually create the
18612 * swash are returned (in a printable form).
18613 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
18614 * store an inversion list of code points that should match only if the
18615 * execution-time locale is a UTF-8 one.
18616 * If <output_invlist> is not NULL, it is where this routine is to store an
18617 * inversion list of the code points that would be instead returned in
18618 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
18619 * when this parameter is used, is just the non-code point data that
18620 * will go into creating the swash. This currently should be just
18621 * user-defined properties whose definitions were not known at compile
18622 * time. Using this parameter allows for easier manipulation of the
18623 * swash's data by the caller. It is illegal to call this function with
18624 * this parameter set, but not <listsvp>
18626 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
18627 * that, in spite of this function's name, the swash it returns may include
18628 * the bitmap data as well */
18631 SV *si = NULL; /* Input swash initialization string */
18632 SV* invlist = NULL;
18634 RXi_GET_DECL(prog, progi);
18635 const struct reg_data * const data = prog ? progi->data : NULL;
18637 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
18638 assert(! output_invlist || listsvp);
18640 if (data && data->count) {
18641 const U32 n = ARG(node);
18643 if (data->what[n] == 's') {
18644 SV * const rv = MUTABLE_SV(data->data[n]);
18645 AV * const av = MUTABLE_AV(SvRV(rv));
18646 SV **const ary = AvARRAY(av);
18647 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
18649 si = *ary; /* ary[0] = the string to initialize the swash with */
18651 if (av_tindex_skip_len_mg(av) >= 2) {
18652 if (only_utf8_locale_ptr
18654 && ary[2] != &PL_sv_undef)
18656 *only_utf8_locale_ptr = ary[2];
18659 assert(only_utf8_locale_ptr);
18660 *only_utf8_locale_ptr = NULL;
18663 /* Elements 3 and 4 are either both present or both absent. [3]
18664 * is any inversion list generated at compile time; [4]
18665 * indicates if that inversion list has any user-defined
18666 * properties in it. */
18667 if (av_tindex_skip_len_mg(av) >= 3) {
18669 if (SvUV(ary[4])) {
18670 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
18678 /* Element [1] is reserved for the set-up swash. If already there,
18679 * return it; if not, create it and store it there */
18680 if (ary[1] && SvROK(ary[1])) {
18683 else if (doinit && ((si && si != &PL_sv_undef)
18684 || (invlist && invlist != &PL_sv_undef))) {
18686 sw = _core_swash_init("utf8", /* the utf8 package */
18690 0, /* not from tr/// */
18692 &swash_init_flags);
18693 (void)av_store(av, 1, sw);
18698 /* If requested, return a printable version of what this swash matches */
18700 SV* matches_string = NULL;
18702 /* The swash should be used, if possible, to get the data, as it
18703 * contains the resolved data. But this function can be called at
18704 * compile-time, before everything gets resolved, in which case we
18705 * return the currently best available information, which is the string
18706 * that will eventually be used to do that resolving, 'si' */
18707 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
18708 && (si && si != &PL_sv_undef))
18710 /* Here, we only have 'si' (and possibly some passed-in data in
18711 * 'invlist', which is handled below) If the caller only wants
18712 * 'si', use that. */
18713 if (! output_invlist) {
18714 matches_string = newSVsv(si);
18717 /* But if the caller wants an inversion list of the node, we
18718 * need to parse 'si' and place as much as possible in the
18719 * desired output inversion list, making 'matches_string' only
18720 * contain the currently unresolvable things */
18721 const char *si_string = SvPVX(si);
18722 STRLEN remaining = SvCUR(si);
18726 /* Ignore everything before the first new-line */
18727 while (*si_string != '\n' && remaining > 0) {
18731 assert(remaining > 0);
18736 while (remaining > 0) {
18738 /* The data consists of just strings defining user-defined
18739 * property names, but in prior incarnations, and perhaps
18740 * somehow from pluggable regex engines, it could still
18741 * hold hex code point definitions. Each component of a
18742 * range would be separated by a tab, and each range by a
18743 * new-line. If these are found, instead add them to the
18744 * inversion list */
18745 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
18746 |PERL_SCAN_SILENT_NON_PORTABLE;
18747 STRLEN len = remaining;
18748 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
18750 /* If the hex decode routine found something, it should go
18751 * up to the next \n */
18752 if ( *(si_string + len) == '\n') {
18753 if (count) { /* 2nd code point on line */
18754 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
18757 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
18760 goto prepare_for_next_iteration;
18763 /* If the hex decode was instead for the lower range limit,
18764 * save it, and go parse the upper range limit */
18765 if (*(si_string + len) == '\t') {
18766 assert(count == 0);
18770 prepare_for_next_iteration:
18771 si_string += len + 1;
18772 remaining -= len + 1;
18776 /* Here, didn't find a legal hex number. Just add it from
18777 * here to the next \n */
18780 while (*(si_string + len) != '\n' && remaining > 0) {
18784 if (*(si_string + len) == '\n') {
18788 if (matches_string) {
18789 sv_catpvn(matches_string, si_string, len - 1);
18792 matches_string = newSVpvn(si_string, len - 1);
18795 sv_catpvs(matches_string, " ");
18796 } /* end of loop through the text */
18798 assert(matches_string);
18799 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
18800 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
18802 } /* end of has an 'si' but no swash */
18805 /* If we have a swash in place, its equivalent inversion list was above
18806 * placed into 'invlist'. If not, this variable may contain a stored
18807 * inversion list which is information beyond what is in 'si' */
18810 /* Again, if the caller doesn't want the output inversion list, put
18811 * everything in 'matches-string' */
18812 if (! output_invlist) {
18813 if ( ! matches_string) {
18814 matches_string = newSVpvs("\n");
18816 sv_catsv(matches_string, invlist_contents(invlist,
18817 TRUE /* traditional style */
18820 else if (! *output_invlist) {
18821 *output_invlist = invlist_clone(invlist, NULL);
18824 _invlist_union(*output_invlist, invlist, output_invlist);
18828 *listsvp = matches_string;
18833 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
18835 /* reg_skipcomment()
18837 Absorbs an /x style # comment from the input stream,
18838 returning a pointer to the first character beyond the comment, or if the
18839 comment terminates the pattern without anything following it, this returns
18840 one past the final character of the pattern (in other words, RExC_end) and
18841 sets the REG_RUN_ON_COMMENT_SEEN flag.
18843 Note it's the callers responsibility to ensure that we are
18844 actually in /x mode
18848 PERL_STATIC_INLINE char*
18849 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
18851 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
18855 while (p < RExC_end) {
18856 if (*(++p) == '\n') {
18861 /* we ran off the end of the pattern without ending the comment, so we have
18862 * to add an \n when wrapping */
18863 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
18868 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
18870 const bool force_to_xmod
18873 /* If the text at the current parse position '*p' is a '(?#...)' comment,
18874 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
18875 * is /x whitespace, advance '*p' so that on exit it points to the first
18876 * byte past all such white space and comments */
18878 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
18880 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
18882 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
18885 if (RExC_end - (*p) >= 3
18887 && *(*p + 1) == '?'
18888 && *(*p + 2) == '#')
18890 while (*(*p) != ')') {
18891 if ((*p) == RExC_end)
18892 FAIL("Sequence (?#... not terminated");
18900 const char * save_p = *p;
18901 while ((*p) < RExC_end) {
18903 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
18906 else if (*(*p) == '#') {
18907 (*p) = reg_skipcomment(pRExC_state, (*p));
18913 if (*p != save_p) {
18926 Advances the parse position by one byte, unless that byte is the beginning
18927 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
18928 those two cases, the parse position is advanced beyond all such comments and
18931 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
18935 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
18937 PERL_ARGS_ASSERT_NEXTCHAR;
18939 if (RExC_parse < RExC_end) {
18941 || UTF8_IS_INVARIANT(*RExC_parse)
18942 || UTF8_IS_START(*RExC_parse));
18944 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
18946 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
18947 FALSE /* Don't force /x */ );
18952 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
18954 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
18959 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
18960 /* +1 for REG_MAGIC */
18963 if ( RExC_rxi == NULL )
18964 FAIL("Regexp out of space");
18965 RXi_SET(RExC_rx, RExC_rxi);
18967 RExC_emit_start = RExC_rxi->program;
18969 Zero(REGNODE_p(RExC_emit), size, regnode);
18972 #ifdef RE_TRACK_PATTERN_OFFSETS
18973 Renew(RExC_offsets, 2*RExC_size+1, U32);
18975 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
18977 RExC_offsets[0] = RExC_size;
18981 STATIC regnode_offset
18982 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
18984 /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns
18985 * and increments RExC_size and RExC_emit
18987 * It returns the regnode's offset into the regex engine program */
18989 const regnode_offset ret = RExC_emit;
18991 GET_RE_DEBUG_FLAGS_DECL;
18993 PERL_ARGS_ASSERT_REGNODE_GUTS;
18995 SIZE_ALIGN(RExC_size);
18996 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
18997 NODE_ALIGN_FILL(REGNODE_p(ret));
18998 #ifndef RE_TRACK_PATTERN_OFFSETS
18999 PERL_UNUSED_ARG(name);
19000 PERL_UNUSED_ARG(op);
19002 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
19004 if (RExC_offsets) { /* MJD */
19006 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
19009 (UV)(RExC_emit) > RExC_offsets[0]
19010 ? "Overwriting end of array!\n" : "OK",
19012 (UV)(RExC_parse - RExC_start),
19013 (UV)RExC_offsets[0]));
19014 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
19021 - reg_node - emit a node
19023 STATIC regnode_offset /* Location. */
19024 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
19026 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
19027 regnode_offset ptr = ret;
19029 PERL_ARGS_ASSERT_REG_NODE;
19031 assert(regarglen[op] == 0);
19033 FILL_ADVANCE_NODE(ptr, op);
19039 - reganode - emit a node with an argument
19041 STATIC regnode_offset /* Location. */
19042 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
19044 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
19045 regnode_offset ptr = ret;
19047 PERL_ARGS_ASSERT_REGANODE;
19049 /* ANYOF are special cased to allow non-length 1 args */
19050 assert(regarglen[op] == 1);
19052 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
19057 STATIC regnode_offset
19058 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
19060 /* emit a node with U32 and I32 arguments */
19062 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
19063 regnode_offset ptr = ret;
19065 PERL_ARGS_ASSERT_REG2LANODE;
19067 assert(regarglen[op] == 2);
19069 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
19075 - reginsert - insert an operator in front of already-emitted operand
19077 * That means that on exit 'operand' is the offset of the newly inserted
19078 * operator, and the original operand has been relocated.
19080 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
19081 * set up NEXT_OFF() of the inserted node if needed. Something like this:
19083 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
19084 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
19086 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
19089 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
19090 const regnode_offset operand, const U32 depth)
19095 const int offset = regarglen[(U8)op];
19096 const int size = NODE_STEP_REGNODE + offset;
19097 GET_RE_DEBUG_FLAGS_DECL;
19099 PERL_ARGS_ASSERT_REGINSERT;
19100 PERL_UNUSED_CONTEXT;
19101 PERL_UNUSED_ARG(depth);
19102 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
19103 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
19104 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
19105 studying. If this is wrong then we need to adjust RExC_recurse
19106 below like we do with RExC_open_parens/RExC_close_parens. */
19107 change_engine_size(pRExC_state, (Ptrdiff_t) size);
19108 src = REGNODE_p(RExC_emit);
19110 dst = REGNODE_p(RExC_emit);
19111 if (RExC_open_parens) {
19113 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
19114 /* remember that RExC_npar is rex->nparens + 1,
19115 * iow it is 1 more than the number of parens seen in
19116 * the pattern so far. */
19117 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
19118 /* note, RExC_open_parens[0] is the start of the
19119 * regex, it can't move. RExC_close_parens[0] is the end
19120 * of the regex, it *can* move. */
19121 if ( paren && RExC_open_parens[paren] >= operand ) {
19122 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
19123 RExC_open_parens[paren] += size;
19125 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
19127 if ( RExC_close_parens[paren] >= operand ) {
19128 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
19129 RExC_close_parens[paren] += size;
19131 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
19136 RExC_end_op += size;
19138 while (src > REGNODE_p(operand)) {
19139 StructCopy(--src, --dst, regnode);
19140 #ifdef RE_TRACK_PATTERN_OFFSETS
19141 if (RExC_offsets) { /* MJD 20010112 */
19143 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
19147 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
19148 ? "Overwriting end of array!\n" : "OK",
19149 (UV)REGNODE_OFFSET(src),
19150 (UV)REGNODE_OFFSET(dst),
19151 (UV)RExC_offsets[0]));
19152 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
19153 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
19158 place = REGNODE_p(operand); /* Op node, where operand used to be. */
19159 #ifdef RE_TRACK_PATTERN_OFFSETS
19160 if (RExC_offsets) { /* MJD */
19162 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
19166 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
19167 ? "Overwriting end of array!\n" : "OK",
19168 (UV)REGNODE_OFFSET(place),
19169 (UV)(RExC_parse - RExC_start),
19170 (UV)RExC_offsets[0]));
19171 Set_Node_Offset(place, RExC_parse);
19172 Set_Node_Length(place, 1);
19175 src = NEXTOPER(place);
19177 FILL_NODE(operand, op);
19179 /* Zero out any arguments in the new node */
19180 Zero(src, offset, regnode);
19184 - regtail - set the next-pointer at the end of a node chain of p to val.
19185 - SEE ALSO: regtail_study
19188 S_regtail(pTHX_ RExC_state_t * pRExC_state,
19189 const regnode_offset p,
19190 const regnode_offset val,
19193 regnode_offset scan;
19194 GET_RE_DEBUG_FLAGS_DECL;
19196 PERL_ARGS_ASSERT_REGTAIL;
19198 PERL_UNUSED_ARG(depth);
19201 /* Find last node. */
19202 scan = (regnode_offset) p;
19204 regnode * const temp = regnext(REGNODE_p(scan));
19206 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
19207 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19208 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
19209 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(REGNODE_p(scan)),
19210 (temp == NULL ? "->" : ""),
19211 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
19216 scan = REGNODE_OFFSET(temp);
19219 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19220 ARG_SET(REGNODE_p(scan), val - scan);
19223 NEXT_OFF(REGNODE_p(scan)) = val - scan;
19229 - regtail_study - set the next-pointer at the end of a node chain of p to val.
19230 - Look for optimizable sequences at the same time.
19231 - currently only looks for EXACT chains.
19233 This is experimental code. The idea is to use this routine to perform
19234 in place optimizations on branches and groups as they are constructed,
19235 with the long term intention of removing optimization from study_chunk so
19236 that it is purely analytical.
19238 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
19239 to control which is which.
19242 /* TODO: All four parms should be const */
19245 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19246 const regnode_offset val, U32 depth)
19248 regnode_offset scan;
19250 #ifdef EXPERIMENTAL_INPLACESCAN
19253 GET_RE_DEBUG_FLAGS_DECL;
19255 PERL_ARGS_ASSERT_REGTAIL_STUDY;
19258 /* Find last node. */
19262 regnode * const temp = regnext(REGNODE_p(scan));
19263 #ifdef EXPERIMENTAL_INPLACESCAN
19264 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
19265 bool unfolded_multi_char; /* Unexamined in this routine */
19266 if (join_exact(pRExC_state, scan, &min,
19267 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
19272 switch (OP(REGNODE_p(scan))) {
19277 case EXACTFAA_NO_TRIE:
19280 case EXACTFU_ONLY8:
19284 if( exact == PSEUDO )
19285 exact= OP(REGNODE_p(scan));
19286 else if ( exact != OP(REGNODE_p(scan)) )
19295 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
19296 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19297 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
19298 SvPV_nolen_const(RExC_mysv),
19299 REG_NODE_NUM(REGNODE_p(scan)),
19300 PL_reg_name[exact]);
19304 scan = REGNODE_OFFSET(temp);
19307 DEBUG_PARSE_MSG("");
19308 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
19309 Perl_re_printf( aTHX_
19310 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
19311 SvPV_nolen_const(RExC_mysv),
19312 (IV)REG_NODE_NUM(REGNODE_p(val)),
19316 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19317 ARG_SET(REGNODE_p(scan), val - scan);
19320 NEXT_OFF(REGNODE_p(scan)) = val - scan;
19328 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
19330 /* Returns an inversion list of all the code points matched by the
19331 * ANYOFM/NANYOFM node 'n' */
19333 SV * cp_list = _new_invlist(-1);
19334 const U8 lowest = (U8) ARG(n);
19337 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
19339 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
19341 /* Starting with the lowest code point, any code point that ANDed with the
19342 * mask yields the lowest code point is in the set */
19343 for (i = lowest; i <= 0xFF; i++) {
19344 if ((i & FLAGS(n)) == ARG(n)) {
19345 cp_list = add_cp_to_invlist(cp_list, i);
19348 /* We know how many code points (a power of two) that are in the
19349 * set. No use looking once we've got that number */
19350 if (count >= needed) break;
19354 if (OP(n) == NANYOFM) {
19355 _invlist_invert(cp_list);
19361 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
19366 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
19371 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19373 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
19374 if (flags & (1<<bit)) {
19375 if (!set++ && lead)
19376 Perl_re_printf( aTHX_ "%s", lead);
19377 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
19382 Perl_re_printf( aTHX_ "\n");
19384 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
19389 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
19395 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
19397 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
19398 if (flags & (1<<bit)) {
19399 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
19402 if (!set++ && lead)
19403 Perl_re_printf( aTHX_ "%s", lead);
19404 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
19407 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
19408 if (!set++ && lead) {
19409 Perl_re_printf( aTHX_ "%s", lead);
19412 case REGEX_UNICODE_CHARSET:
19413 Perl_re_printf( aTHX_ "UNICODE");
19415 case REGEX_LOCALE_CHARSET:
19416 Perl_re_printf( aTHX_ "LOCALE");
19418 case REGEX_ASCII_RESTRICTED_CHARSET:
19419 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
19421 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
19422 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
19425 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
19431 Perl_re_printf( aTHX_ "\n");
19433 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
19439 Perl_regdump(pTHX_ const regexp *r)
19443 SV * const sv = sv_newmortal();
19444 SV *dsv= sv_newmortal();
19445 RXi_GET_DECL(r, ri);
19446 GET_RE_DEBUG_FLAGS_DECL;
19448 PERL_ARGS_ASSERT_REGDUMP;
19450 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
19452 /* Header fields of interest. */
19453 for (i = 0; i < 2; i++) {
19454 if (r->substrs->data[i].substr) {
19455 RE_PV_QUOTED_DECL(s, 0, dsv,
19456 SvPVX_const(r->substrs->data[i].substr),
19457 RE_SV_DUMPLEN(r->substrs->data[i].substr),
19458 PL_dump_re_max_len);
19459 Perl_re_printf( aTHX_
19460 "%s %s%s at %" IVdf "..%" UVuf " ",
19461 i ? "floating" : "anchored",
19463 RE_SV_TAIL(r->substrs->data[i].substr),
19464 (IV)r->substrs->data[i].min_offset,
19465 (UV)r->substrs->data[i].max_offset);
19467 else if (r->substrs->data[i].utf8_substr) {
19468 RE_PV_QUOTED_DECL(s, 1, dsv,
19469 SvPVX_const(r->substrs->data[i].utf8_substr),
19470 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
19472 Perl_re_printf( aTHX_
19473 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
19474 i ? "floating" : "anchored",
19476 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
19477 (IV)r->substrs->data[i].min_offset,
19478 (UV)r->substrs->data[i].max_offset);
19482 if (r->check_substr || r->check_utf8)
19483 Perl_re_printf( aTHX_
19485 ( r->check_substr == r->substrs->data[1].substr
19486 && r->check_utf8 == r->substrs->data[1].utf8_substr
19487 ? "(checking floating" : "(checking anchored"));
19488 if (r->intflags & PREGf_NOSCAN)
19489 Perl_re_printf( aTHX_ " noscan");
19490 if (r->extflags & RXf_CHECK_ALL)
19491 Perl_re_printf( aTHX_ " isall");
19492 if (r->check_substr || r->check_utf8)
19493 Perl_re_printf( aTHX_ ") ");
19495 if (ri->regstclass) {
19496 regprop(r, sv, ri->regstclass, NULL, NULL);
19497 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
19499 if (r->intflags & PREGf_ANCH) {
19500 Perl_re_printf( aTHX_ "anchored");
19501 if (r->intflags & PREGf_ANCH_MBOL)
19502 Perl_re_printf( aTHX_ "(MBOL)");
19503 if (r->intflags & PREGf_ANCH_SBOL)
19504 Perl_re_printf( aTHX_ "(SBOL)");
19505 if (r->intflags & PREGf_ANCH_GPOS)
19506 Perl_re_printf( aTHX_ "(GPOS)");
19507 Perl_re_printf( aTHX_ " ");
19509 if (r->intflags & PREGf_GPOS_SEEN)
19510 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
19511 if (r->intflags & PREGf_SKIP)
19512 Perl_re_printf( aTHX_ "plus ");
19513 if (r->intflags & PREGf_IMPLICIT)
19514 Perl_re_printf( aTHX_ "implicit ");
19515 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
19516 if (r->extflags & RXf_EVAL_SEEN)
19517 Perl_re_printf( aTHX_ "with eval ");
19518 Perl_re_printf( aTHX_ "\n");
19520 regdump_extflags("r->extflags: ", r->extflags);
19521 regdump_intflags("r->intflags: ", r->intflags);
19524 PERL_ARGS_ASSERT_REGDUMP;
19525 PERL_UNUSED_CONTEXT;
19526 PERL_UNUSED_ARG(r);
19527 #endif /* DEBUGGING */
19530 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
19533 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
19534 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
19535 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
19536 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
19537 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
19538 || _CC_VERTSPACE != 15
19539 # error Need to adjust order of anyofs[]
19541 static const char * const anyofs[] = {
19578 - regprop - printable representation of opcode, with run time support
19582 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
19586 RXi_GET_DECL(prog, progi);
19587 GET_RE_DEBUG_FLAGS_DECL;
19589 PERL_ARGS_ASSERT_REGPROP;
19593 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
19594 /* It would be nice to FAIL() here, but this may be called from
19595 regexec.c, and it would be hard to supply pRExC_state. */
19596 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
19597 (int)OP(o), (int)REGNODE_MAX);
19598 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
19600 k = PL_regkind[OP(o)];
19603 sv_catpvs(sv, " ");
19604 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
19605 * is a crude hack but it may be the best for now since
19606 * we have no flag "this EXACTish node was UTF-8"
19608 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
19609 PL_colors[0], PL_colors[1],
19610 PERL_PV_ESCAPE_UNI_DETECT |
19611 PERL_PV_ESCAPE_NONASCII |
19612 PERL_PV_PRETTY_ELLIPSES |
19613 PERL_PV_PRETTY_LTGT |
19614 PERL_PV_PRETTY_NOCLEAR
19616 } else if (k == TRIE) {
19617 /* print the details of the trie in dumpuntil instead, as
19618 * progi->data isn't available here */
19619 const char op = OP(o);
19620 const U32 n = ARG(o);
19621 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
19622 (reg_ac_data *)progi->data->data[n] :
19624 const reg_trie_data * const trie
19625 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
19627 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
19628 DEBUG_TRIE_COMPILE_r({
19630 sv_catpvs(sv, "(JUMP)");
19631 Perl_sv_catpvf(aTHX_ sv,
19632 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
19633 (UV)trie->startstate,
19634 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
19635 (UV)trie->wordcount,
19638 (UV)TRIE_CHARCOUNT(trie),
19639 (UV)trie->uniquecharcount
19642 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
19643 sv_catpvs(sv, "[");
19644 (void) put_charclass_bitmap_innards(sv,
19645 ((IS_ANYOF_TRIE(op))
19647 : TRIE_BITMAP(trie)),
19653 sv_catpvs(sv, "]");
19655 } else if (k == CURLY) {
19656 U32 lo = ARG1(o), hi = ARG2(o);
19657 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
19658 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
19659 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
19660 if (hi == REG_INFTY)
19661 sv_catpvs(sv, "INFTY");
19663 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
19664 sv_catpvs(sv, "}");
19666 else if (k == WHILEM && o->flags) /* Ordinal/of */
19667 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
19668 else if (k == REF || k == OPEN || k == CLOSE
19669 || k == GROUPP || OP(o)==ACCEPT)
19671 AV *name_list= NULL;
19672 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
19673 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
19674 if ( RXp_PAREN_NAMES(prog) ) {
19675 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19676 } else if ( pRExC_state ) {
19677 name_list= RExC_paren_name_list;
19680 if ( k != REF || (OP(o) < NREF)) {
19681 SV **name= av_fetch(name_list, parno, 0 );
19683 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19686 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
19687 I32 *nums=(I32*)SvPVX(sv_dat);
19688 SV **name= av_fetch(name_list, nums[0], 0 );
19691 for ( n=0; n<SvIVX(sv_dat); n++ ) {
19692 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
19693 (n ? "," : ""), (IV)nums[n]);
19695 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19699 if ( k == REF && reginfo) {
19700 U32 n = ARG(o); /* which paren pair */
19701 I32 ln = prog->offs[n].start;
19702 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
19703 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
19704 else if (ln == prog->offs[n].end)
19705 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
19707 const char *s = reginfo->strbeg + ln;
19708 Perl_sv_catpvf(aTHX_ sv, ": ");
19709 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
19710 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
19713 } else if (k == GOSUB) {
19714 AV *name_list= NULL;
19715 if ( RXp_PAREN_NAMES(prog) ) {
19716 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
19717 } else if ( pRExC_state ) {
19718 name_list= RExC_paren_name_list;
19721 /* Paren and offset */
19722 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
19723 (int)((o + (int)ARG2L(o)) - progi->program) );
19725 SV **name= av_fetch(name_list, ARG(o), 0 );
19727 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
19730 else if (k == LOGICAL)
19731 /* 2: embedded, otherwise 1 */
19732 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
19733 else if (k == ANYOF) {
19734 const U8 flags = ANYOF_FLAGS(o);
19735 bool do_sep = FALSE; /* Do we need to separate various components of
19737 /* Set if there is still an unresolved user-defined property */
19738 SV *unresolved = NULL;
19740 /* Things that are ignored except when the runtime locale is UTF-8 */
19741 SV *only_utf8_locale_invlist = NULL;
19743 /* Code points that don't fit in the bitmap */
19744 SV *nonbitmap_invlist = NULL;
19746 /* And things that aren't in the bitmap, but are small enough to be */
19747 SV* bitmap_range_not_in_bitmap = NULL;
19749 const bool inverted = flags & ANYOF_INVERT;
19751 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
19752 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
19753 sv_catpvs(sv, "{utf8-locale-reqd}");
19755 if (flags & ANYOFL_FOLD) {
19756 sv_catpvs(sv, "{i}");
19760 /* If there is stuff outside the bitmap, get it */
19761 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
19762 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
19764 &only_utf8_locale_invlist,
19765 &nonbitmap_invlist);
19766 /* The non-bitmap data may contain stuff that could fit in the
19767 * bitmap. This could come from a user-defined property being
19768 * finally resolved when this call was done; or much more likely
19769 * because there are matches that require UTF-8 to be valid, and so
19770 * aren't in the bitmap. This is teased apart later */
19771 _invlist_intersection(nonbitmap_invlist,
19773 &bitmap_range_not_in_bitmap);
19774 /* Leave just the things that don't fit into the bitmap */
19775 _invlist_subtract(nonbitmap_invlist,
19777 &nonbitmap_invlist);
19780 /* Obey this flag to add all above-the-bitmap code points */
19781 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
19782 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
19783 NUM_ANYOF_CODE_POINTS,
19787 /* Ready to start outputting. First, the initial left bracket */
19788 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19790 /* Then all the things that could fit in the bitmap */
19791 do_sep = put_charclass_bitmap_innards(sv,
19793 bitmap_range_not_in_bitmap,
19794 only_utf8_locale_invlist,
19797 /* Can't try inverting for a
19798 * better display if there are
19799 * things that haven't been
19801 unresolved != NULL);
19802 SvREFCNT_dec(bitmap_range_not_in_bitmap);
19804 /* If there are user-defined properties which haven't been defined yet,
19805 * output them. If the result is not to be inverted, it is clearest to
19806 * output them in a separate [] from the bitmap range stuff. If the
19807 * result is to be complemented, we have to show everything in one [],
19808 * as the inversion applies to the whole thing. Use {braces} to
19809 * separate them from anything in the bitmap and anything above the
19813 if (! do_sep) { /* If didn't output anything in the bitmap */
19814 sv_catpvs(sv, "^");
19816 sv_catpvs(sv, "{");
19819 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
19821 sv_catsv(sv, unresolved);
19823 sv_catpvs(sv, "}");
19825 do_sep = ! inverted;
19828 /* And, finally, add the above-the-bitmap stuff */
19829 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
19832 /* See if truncation size is overridden */
19833 const STRLEN dump_len = (PL_dump_re_max_len > 256)
19834 ? PL_dump_re_max_len
19837 /* This is output in a separate [] */
19839 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
19842 /* And, for easy of understanding, it is shown in the
19843 * uncomplemented form if possible. The one exception being if
19844 * there are unresolved items, where the inversion has to be
19845 * delayed until runtime */
19846 if (inverted && ! unresolved) {
19847 _invlist_invert(nonbitmap_invlist);
19848 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
19851 contents = invlist_contents(nonbitmap_invlist,
19852 FALSE /* output suitable for catsv */
19855 /* If the output is shorter than the permissible maximum, just do it. */
19856 if (SvCUR(contents) <= dump_len) {
19857 sv_catsv(sv, contents);
19860 const char * contents_string = SvPVX(contents);
19861 STRLEN i = dump_len;
19863 /* Otherwise, start at the permissible max and work back to the
19864 * first break possibility */
19865 while (i > 0 && contents_string[i] != ' ') {
19868 if (i == 0) { /* Fail-safe. Use the max if we couldn't
19869 find a legal break */
19873 sv_catpvn(sv, contents_string, i);
19874 sv_catpvs(sv, "...");
19877 SvREFCNT_dec_NN(contents);
19878 SvREFCNT_dec_NN(nonbitmap_invlist);
19881 /* And finally the matching, closing ']' */
19882 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
19884 SvREFCNT_dec(unresolved);
19886 else if (k == ANYOFM) {
19887 SV * cp_list = get_ANYOFM_contents(o);
19889 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19890 if (OP(o) == NANYOFM) {
19891 _invlist_invert(cp_list);
19894 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
19895 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
19897 SvREFCNT_dec(cp_list);
19899 else if (k == POSIXD || k == NPOSIXD) {
19900 U8 index = FLAGS(o) * 2;
19901 if (index < C_ARRAY_LENGTH(anyofs)) {
19902 if (*anyofs[index] != '[') {
19903 sv_catpvs(sv, "[");
19905 sv_catpv(sv, anyofs[index]);
19906 if (*anyofs[index] != '[') {
19907 sv_catpvs(sv, "]");
19911 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
19914 else if (k == BOUND || k == NBOUND) {
19915 /* Must be synced with order of 'bound_type' in regcomp.h */
19916 const char * const bounds[] = {
19917 "", /* Traditional */
19923 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
19924 sv_catpv(sv, bounds[FLAGS(o)]);
19926 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
19927 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
19928 else if (OP(o) == SBOL)
19929 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
19931 /* add on the verb argument if there is one */
19932 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
19934 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
19935 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
19937 sv_catpvs(sv, ":NULL");
19940 PERL_UNUSED_CONTEXT;
19941 PERL_UNUSED_ARG(sv);
19942 PERL_UNUSED_ARG(o);
19943 PERL_UNUSED_ARG(prog);
19944 PERL_UNUSED_ARG(reginfo);
19945 PERL_UNUSED_ARG(pRExC_state);
19946 #endif /* DEBUGGING */
19952 Perl_re_intuit_string(pTHX_ REGEXP * const r)
19953 { /* Assume that RE_INTUIT is set */
19954 struct regexp *const prog = ReANY(r);
19955 GET_RE_DEBUG_FLAGS_DECL;
19957 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
19958 PERL_UNUSED_CONTEXT;
19962 const char * const s = SvPV_nolen_const(RX_UTF8(r)
19963 ? prog->check_utf8 : prog->check_substr);
19965 if (!PL_colorset) reginitcolors();
19966 Perl_re_printf( aTHX_
19967 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
19969 RX_UTF8(r) ? "utf8 " : "",
19970 PL_colors[5], PL_colors[0],
19973 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
19976 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
19977 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
19983 handles refcounting and freeing the perl core regexp structure. When
19984 it is necessary to actually free the structure the first thing it
19985 does is call the 'free' method of the regexp_engine associated to
19986 the regexp, allowing the handling of the void *pprivate; member
19987 first. (This routine is not overridable by extensions, which is why
19988 the extensions free is called first.)
19990 See regdupe and regdupe_internal if you change anything here.
19992 #ifndef PERL_IN_XSUB_RE
19994 Perl_pregfree(pTHX_ REGEXP *r)
20000 Perl_pregfree2(pTHX_ REGEXP *rx)
20002 struct regexp *const r = ReANY(rx);
20003 GET_RE_DEBUG_FLAGS_DECL;
20005 PERL_ARGS_ASSERT_PREGFREE2;
20010 if (r->mother_re) {
20011 ReREFCNT_dec(r->mother_re);
20013 CALLREGFREE_PVT(rx); /* free the private data */
20014 SvREFCNT_dec(RXp_PAREN_NAMES(r));
20018 for (i = 0; i < 2; i++) {
20019 SvREFCNT_dec(r->substrs->data[i].substr);
20020 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
20022 Safefree(r->substrs);
20024 RX_MATCH_COPY_FREE(rx);
20025 #ifdef PERL_ANY_COW
20026 SvREFCNT_dec(r->saved_copy);
20029 SvREFCNT_dec(r->qr_anoncv);
20030 if (r->recurse_locinput)
20031 Safefree(r->recurse_locinput);
20037 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
20038 except that dsv will be created if NULL.
20040 This function is used in two main ways. First to implement
20041 $r = qr/....; $s = $$r;
20043 Secondly, it is used as a hacky workaround to the structural issue of
20045 being stored in the regexp structure which is in turn stored in
20046 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
20047 could be PL_curpm in multiple contexts, and could require multiple
20048 result sets being associated with the pattern simultaneously, such
20049 as when doing a recursive match with (??{$qr})
20051 The solution is to make a lightweight copy of the regexp structure
20052 when a qr// is returned from the code executed by (??{$qr}) this
20053 lightweight copy doesn't actually own any of its data except for
20054 the starp/end and the actual regexp structure itself.
20060 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
20062 struct regexp *drx;
20063 struct regexp *const srx = ReANY(ssv);
20064 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
20066 PERL_ARGS_ASSERT_REG_TEMP_COPY;
20069 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
20071 SvOK_off((SV *)dsv);
20073 /* For PVLVs, the head (sv_any) points to an XPVLV, while
20074 * the LV's xpvlenu_rx will point to a regexp body, which
20075 * we allocate here */
20076 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
20077 assert(!SvPVX(dsv));
20078 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
20079 temp->sv_any = NULL;
20080 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
20081 SvREFCNT_dec_NN(temp);
20082 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
20083 ing below will not set it. */
20084 SvCUR_set(dsv, SvCUR(ssv));
20087 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
20088 sv_force_normal(sv) is called. */
20092 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
20093 SvPV_set(dsv, RX_WRAPPED(ssv));
20094 /* We share the same string buffer as the original regexp, on which we
20095 hold a reference count, incremented when mother_re is set below.
20096 The string pointer is copied here, being part of the regexp struct.
20098 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
20099 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
20103 const I32 npar = srx->nparens+1;
20104 Newx(drx->offs, npar, regexp_paren_pair);
20105 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
20107 if (srx->substrs) {
20109 Newx(drx->substrs, 1, struct reg_substr_data);
20110 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
20112 for (i = 0; i < 2; i++) {
20113 SvREFCNT_inc_void(drx->substrs->data[i].substr);
20114 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
20117 /* check_substr and check_utf8, if non-NULL, point to either their
20118 anchored or float namesakes, and don't hold a second reference. */
20120 RX_MATCH_COPIED_off(dsv);
20121 #ifdef PERL_ANY_COW
20122 drx->saved_copy = NULL;
20124 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
20125 SvREFCNT_inc_void(drx->qr_anoncv);
20126 if (srx->recurse_locinput)
20127 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
20134 /* regfree_internal()
20136 Free the private data in a regexp. This is overloadable by
20137 extensions. Perl takes care of the regexp structure in pregfree(),
20138 this covers the *pprivate pointer which technically perl doesn't
20139 know about, however of course we have to handle the
20140 regexp_internal structure when no extension is in use.
20142 Note this is called before freeing anything in the regexp
20147 Perl_regfree_internal(pTHX_ REGEXP * const rx)
20149 struct regexp *const r = ReANY(rx);
20150 RXi_GET_DECL(r, ri);
20151 GET_RE_DEBUG_FLAGS_DECL;
20153 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
20163 SV *dsv= sv_newmortal();
20164 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
20165 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
20166 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
20167 PL_colors[4], PL_colors[5], s);
20171 #ifdef RE_TRACK_PATTERN_OFFSETS
20173 Safefree(ri->u.offsets); /* 20010421 MJD */
20175 if (ri->code_blocks)
20176 S_free_codeblocks(aTHX_ ri->code_blocks);
20179 int n = ri->data->count;
20182 /* If you add a ->what type here, update the comment in regcomp.h */
20183 switch (ri->data->what[n]) {
20189 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
20192 Safefree(ri->data->data[n]);
20198 { /* Aho Corasick add-on structure for a trie node.
20199 Used in stclass optimization only */
20201 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
20202 #ifdef USE_ITHREADS
20206 refcount = --aho->refcount;
20209 PerlMemShared_free(aho->states);
20210 PerlMemShared_free(aho->fail);
20211 /* do this last!!!! */
20212 PerlMemShared_free(ri->data->data[n]);
20213 /* we should only ever get called once, so
20214 * assert as much, and also guard the free
20215 * which /might/ happen twice. At the least
20216 * it will make code anlyzers happy and it
20217 * doesn't cost much. - Yves */
20218 assert(ri->regstclass);
20219 if (ri->regstclass) {
20220 PerlMemShared_free(ri->regstclass);
20221 ri->regstclass = 0;
20228 /* trie structure. */
20230 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
20231 #ifdef USE_ITHREADS
20235 refcount = --trie->refcount;
20238 PerlMemShared_free(trie->charmap);
20239 PerlMemShared_free(trie->states);
20240 PerlMemShared_free(trie->trans);
20242 PerlMemShared_free(trie->bitmap);
20244 PerlMemShared_free(trie->jump);
20245 PerlMemShared_free(trie->wordinfo);
20246 /* do this last!!!! */
20247 PerlMemShared_free(ri->data->data[n]);
20252 Perl_croak(aTHX_ "panic: regfree data code '%c'",
20253 ri->data->what[n]);
20256 Safefree(ri->data->what);
20257 Safefree(ri->data);
20263 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
20264 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
20265 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
20268 re_dup_guts - duplicate a regexp.
20270 This routine is expected to clone a given regexp structure. It is only
20271 compiled under USE_ITHREADS.
20273 After all of the core data stored in struct regexp is duplicated
20274 the regexp_engine.dupe method is used to copy any private data
20275 stored in the *pprivate pointer. This allows extensions to handle
20276 any duplication it needs to do.
20278 See pregfree() and regfree_internal() if you change anything here.
20280 #if defined(USE_ITHREADS)
20281 #ifndef PERL_IN_XSUB_RE
20283 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
20287 const struct regexp *r = ReANY(sstr);
20288 struct regexp *ret = ReANY(dstr);
20290 PERL_ARGS_ASSERT_RE_DUP_GUTS;
20292 npar = r->nparens+1;
20293 Newx(ret->offs, npar, regexp_paren_pair);
20294 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
20296 if (ret->substrs) {
20297 /* Do it this way to avoid reading from *r after the StructCopy().
20298 That way, if any of the sv_dup_inc()s dislodge *r from the L1
20299 cache, it doesn't matter. */
20301 const bool anchored = r->check_substr
20302 ? r->check_substr == r->substrs->data[0].substr
20303 : r->check_utf8 == r->substrs->data[0].utf8_substr;
20304 Newx(ret->substrs, 1, struct reg_substr_data);
20305 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
20307 for (i = 0; i < 2; i++) {
20308 ret->substrs->data[i].substr =
20309 sv_dup_inc(ret->substrs->data[i].substr, param);
20310 ret->substrs->data[i].utf8_substr =
20311 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
20314 /* check_substr and check_utf8, if non-NULL, point to either their
20315 anchored or float namesakes, and don't hold a second reference. */
20317 if (ret->check_substr) {
20319 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
20321 ret->check_substr = ret->substrs->data[0].substr;
20322 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
20324 assert(r->check_substr == r->substrs->data[1].substr);
20325 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
20327 ret->check_substr = ret->substrs->data[1].substr;
20328 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
20330 } else if (ret->check_utf8) {
20332 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
20334 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
20339 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
20340 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
20341 if (r->recurse_locinput)
20342 Newx(ret->recurse_locinput, r->nparens + 1, char *);
20345 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
20347 if (RX_MATCH_COPIED(dstr))
20348 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
20350 ret->subbeg = NULL;
20351 #ifdef PERL_ANY_COW
20352 ret->saved_copy = NULL;
20355 /* Whether mother_re be set or no, we need to copy the string. We
20356 cannot refrain from copying it when the storage points directly to
20357 our mother regexp, because that's
20358 1: a buffer in a different thread
20359 2: something we no longer hold a reference on
20360 so we need to copy it locally. */
20361 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
20362 ret->mother_re = NULL;
20364 #endif /* PERL_IN_XSUB_RE */
20369 This is the internal complement to regdupe() which is used to copy
20370 the structure pointed to by the *pprivate pointer in the regexp.
20371 This is the core version of the extension overridable cloning hook.
20372 The regexp structure being duplicated will be copied by perl prior
20373 to this and will be provided as the regexp *r argument, however
20374 with the /old/ structures pprivate pointer value. Thus this routine
20375 may override any copying normally done by perl.
20377 It returns a pointer to the new regexp_internal structure.
20381 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
20384 struct regexp *const r = ReANY(rx);
20385 regexp_internal *reti;
20387 RXi_GET_DECL(r, ri);
20389 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
20393 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
20394 char, regexp_internal);
20395 Copy(ri->program, reti->program, len+1, regnode);
20398 if (ri->code_blocks) {
20400 Newx(reti->code_blocks, 1, struct reg_code_blocks);
20401 Newx(reti->code_blocks->cb, ri->code_blocks->count,
20402 struct reg_code_block);
20403 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
20404 ri->code_blocks->count, struct reg_code_block);
20405 for (n = 0; n < ri->code_blocks->count; n++)
20406 reti->code_blocks->cb[n].src_regex = (REGEXP*)
20407 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
20408 reti->code_blocks->count = ri->code_blocks->count;
20409 reti->code_blocks->refcnt = 1;
20412 reti->code_blocks = NULL;
20414 reti->regstclass = NULL;
20417 struct reg_data *d;
20418 const int count = ri->data->count;
20421 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
20422 char, struct reg_data);
20423 Newx(d->what, count, U8);
20426 for (i = 0; i < count; i++) {
20427 d->what[i] = ri->data->what[i];
20428 switch (d->what[i]) {
20429 /* see also regcomp.h and regfree_internal() */
20430 case 'a': /* actually an AV, but the dup function is identical.
20431 values seem to be "plain sv's" generally. */
20432 case 'r': /* a compiled regex (but still just another SV) */
20433 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
20434 this use case should go away, the code could have used
20435 'a' instead - see S_set_ANYOF_arg() for array contents. */
20436 case 'S': /* actually an SV, but the dup function is identical. */
20437 case 'u': /* actually an HV, but the dup function is identical.
20438 values are "plain sv's" */
20439 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
20442 /* Synthetic Start Class - "Fake" charclass we generate to optimize
20443 * patterns which could start with several different things. Pre-TRIE
20444 * this was more important than it is now, however this still helps
20445 * in some places, for instance /x?a+/ might produce a SSC equivalent
20446 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
20449 /* This is cheating. */
20450 Newx(d->data[i], 1, regnode_ssc);
20451 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
20452 reti->regstclass = (regnode*)d->data[i];
20455 /* AHO-CORASICK fail table */
20456 /* Trie stclasses are readonly and can thus be shared
20457 * without duplication. We free the stclass in pregfree
20458 * when the corresponding reg_ac_data struct is freed.
20460 reti->regstclass= ri->regstclass;
20463 /* TRIE transition table */
20465 ((reg_trie_data*)ri->data->data[i])->refcount++;
20468 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
20469 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
20470 is not from another regexp */
20471 d->data[i] = ri->data->data[i];
20474 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
20475 ri->data->what[i]);
20484 reti->name_list_idx = ri->name_list_idx;
20486 #ifdef RE_TRACK_PATTERN_OFFSETS
20487 if (ri->u.offsets) {
20488 Newx(reti->u.offsets, 2*len+1, U32);
20489 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
20492 SetProgLen(reti, len);
20495 return (void*)reti;
20498 #endif /* USE_ITHREADS */
20500 #ifndef PERL_IN_XSUB_RE
20503 - regnext - dig the "next" pointer out of a node
20506 Perl_regnext(pTHX_ regnode *p)
20513 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
20514 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20515 (int)OP(p), (int)REGNODE_MAX);
20518 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
20528 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
20531 STRLEN l1 = strlen(pat1);
20532 STRLEN l2 = strlen(pat2);
20535 const char *message;
20537 PERL_ARGS_ASSERT_RE_CROAK2;
20543 Copy(pat1, buf, l1 , char);
20544 Copy(pat2, buf + l1, l2 , char);
20545 buf[l1 + l2] = '\n';
20546 buf[l1 + l2 + 1] = '\0';
20547 va_start(args, pat2);
20548 msv = vmess(buf, &args);
20550 message = SvPV_const(msv, l1);
20553 Copy(message, buf, l1 , char);
20554 /* l1-1 to avoid \n */
20555 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
20558 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
20560 #ifndef PERL_IN_XSUB_RE
20562 Perl_save_re_context(pTHX)
20567 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
20570 const REGEXP * const rx = PM_GETRE(PL_curpm);
20572 nparens = RX_NPARENS(rx);
20575 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
20576 * that PL_curpm will be null, but that utf8.pm and the modules it
20577 * loads will only use $1..$3.
20578 * The t/porting/re_context.t test file checks this assumption.
20583 for (i = 1; i <= nparens; i++) {
20584 char digits[TYPE_CHARS(long)];
20585 const STRLEN len = my_snprintf(digits, sizeof(digits),
20587 GV *const *const gvp
20588 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
20591 GV * const gv = *gvp;
20592 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
20602 S_put_code_point(pTHX_ SV *sv, UV c)
20604 PERL_ARGS_ASSERT_PUT_CODE_POINT;
20607 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
20609 else if (isPRINT(c)) {
20610 const char string = (char) c;
20612 /* We use {phrase} as metanotation in the class, so also escape literal
20614 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
20615 sv_catpvs(sv, "\\");
20616 sv_catpvn(sv, &string, 1);
20618 else if (isMNEMONIC_CNTRL(c)) {
20619 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
20622 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
20626 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
20629 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
20631 /* Appends to 'sv' a displayable version of the range of code points from
20632 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
20633 * that have them, when they occur at the beginning or end of the range.
20634 * It uses hex to output the remaining code points, unless 'allow_literals'
20635 * is true, in which case the printable ASCII ones are output as-is (though
20636 * some of these will be escaped by put_code_point()).
20638 * NOTE: This is designed only for printing ranges of code points that fit
20639 * inside an ANYOF bitmap. Higher code points are simply suppressed
20642 const unsigned int min_range_count = 3;
20644 assert(start <= end);
20646 PERL_ARGS_ASSERT_PUT_RANGE;
20648 while (start <= end) {
20650 const char * format;
20652 if (end - start < min_range_count) {
20654 /* Output chars individually when they occur in short ranges */
20655 for (; start <= end; start++) {
20656 put_code_point(sv, start);
20661 /* If permitted by the input options, and there is a possibility that
20662 * this range contains a printable literal, look to see if there is
20664 if (allow_literals && start <= MAX_PRINT_A) {
20666 /* If the character at the beginning of the range isn't an ASCII
20667 * printable, effectively split the range into two parts:
20668 * 1) the portion before the first such printable,
20670 * and output them separately. */
20671 if (! isPRINT_A(start)) {
20672 UV temp_end = start + 1;
20674 /* There is no point looking beyond the final possible
20675 * printable, in MAX_PRINT_A */
20676 UV max = MIN(end, MAX_PRINT_A);
20678 while (temp_end <= max && ! isPRINT_A(temp_end)) {
20682 /* Here, temp_end points to one beyond the first printable if
20683 * found, or to one beyond 'max' if not. If none found, make
20684 * sure that we use the entire range */
20685 if (temp_end > MAX_PRINT_A) {
20686 temp_end = end + 1;
20689 /* Output the first part of the split range: the part that
20690 * doesn't have printables, with the parameter set to not look
20691 * for literals (otherwise we would infinitely recurse) */
20692 put_range(sv, start, temp_end - 1, FALSE);
20694 /* The 2nd part of the range (if any) starts here. */
20697 /* We do a continue, instead of dropping down, because even if
20698 * the 2nd part is non-empty, it could be so short that we want
20699 * to output it as individual characters, as tested for at the
20700 * top of this loop. */
20704 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
20705 * output a sub-range of just the digits or letters, then process
20706 * the remaining portion as usual. */
20707 if (isALPHANUMERIC_A(start)) {
20708 UV mask = (isDIGIT_A(start))
20713 UV temp_end = start + 1;
20715 /* Find the end of the sub-range that includes just the
20716 * characters in the same class as the first character in it */
20717 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
20722 /* For short ranges, don't duplicate the code above to output
20723 * them; just call recursively */
20724 if (temp_end - start < min_range_count) {
20725 put_range(sv, start, temp_end, FALSE);
20727 else { /* Output as a range */
20728 put_code_point(sv, start);
20729 sv_catpvs(sv, "-");
20730 put_code_point(sv, temp_end);
20732 start = temp_end + 1;
20736 /* We output any other printables as individual characters */
20737 if (isPUNCT_A(start) || isSPACE_A(start)) {
20738 while (start <= end && (isPUNCT_A(start)
20739 || isSPACE_A(start)))
20741 put_code_point(sv, start);
20746 } /* End of looking for literals */
20748 /* Here is not to output as a literal. Some control characters have
20749 * mnemonic names. Split off any of those at the beginning and end of
20750 * the range to print mnemonically. It isn't possible for many of
20751 * these to be in a row, so this won't overwhelm with output */
20753 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
20755 while (isMNEMONIC_CNTRL(start) && start <= end) {
20756 put_code_point(sv, start);
20760 /* If this didn't take care of the whole range ... */
20761 if (start <= end) {
20763 /* Look backwards from the end to find the final non-mnemonic
20766 while (isMNEMONIC_CNTRL(temp_end)) {
20770 /* And separately output the interior range that doesn't start
20771 * or end with mnemonics */
20772 put_range(sv, start, temp_end, FALSE);
20774 /* Then output the mnemonic trailing controls */
20775 start = temp_end + 1;
20776 while (start <= end) {
20777 put_code_point(sv, start);
20784 /* As a final resort, output the range or subrange as hex. */
20786 this_end = (end < NUM_ANYOF_CODE_POINTS)
20788 : NUM_ANYOF_CODE_POINTS - 1;
20789 #if NUM_ANYOF_CODE_POINTS > 256
20790 format = (this_end < 256)
20791 ? "\\x%02" UVXf "-\\x%02" UVXf
20792 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
20794 format = "\\x%02" UVXf "-\\x%02" UVXf;
20796 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
20797 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
20798 GCC_DIAG_RESTORE_STMT;
20804 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
20806 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
20810 bool allow_literals = TRUE;
20812 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
20814 /* Generally, it is more readable if printable characters are output as
20815 * literals, but if a range (nearly) spans all of them, it's best to output
20816 * it as a single range. This code will use a single range if all but 2
20817 * ASCII printables are in it */
20818 invlist_iterinit(invlist);
20819 while (invlist_iternext(invlist, &start, &end)) {
20821 /* If the range starts beyond the final printable, it doesn't have any
20823 if (start > MAX_PRINT_A) {
20827 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
20828 * all but two, the range must start and end no later than 2 from
20830 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
20831 if (end > MAX_PRINT_A) {
20837 if (end - start >= MAX_PRINT_A - ' ' - 2) {
20838 allow_literals = FALSE;
20843 invlist_iterfinish(invlist);
20845 /* Here we have figured things out. Output each range */
20846 invlist_iterinit(invlist);
20847 while (invlist_iternext(invlist, &start, &end)) {
20848 if (start >= NUM_ANYOF_CODE_POINTS) {
20851 put_range(sv, start, end, allow_literals);
20853 invlist_iterfinish(invlist);
20859 S_put_charclass_bitmap_innards_common(pTHX_
20860 SV* invlist, /* The bitmap */
20861 SV* posixes, /* Under /l, things like [:word:], \S */
20862 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
20863 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
20864 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
20865 const bool invert /* Is the result to be inverted? */
20868 /* Create and return an SV containing a displayable version of the bitmap
20869 * and associated information determined by the input parameters. If the
20870 * output would have been only the inversion indicator '^', NULL is instead
20875 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
20878 output = newSVpvs("^");
20881 output = newSVpvs("");
20884 /* First, the code points in the bitmap that are unconditionally there */
20885 put_charclass_bitmap_innards_invlist(output, invlist);
20887 /* Traditionally, these have been placed after the main code points */
20889 sv_catsv(output, posixes);
20892 if (only_utf8 && _invlist_len(only_utf8)) {
20893 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
20894 put_charclass_bitmap_innards_invlist(output, only_utf8);
20897 if (not_utf8 && _invlist_len(not_utf8)) {
20898 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
20899 put_charclass_bitmap_innards_invlist(output, not_utf8);
20902 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
20903 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
20904 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
20906 /* This is the only list in this routine that can legally contain code
20907 * points outside the bitmap range. The call just above to
20908 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
20909 * output them here. There's about a half-dozen possible, and none in
20910 * contiguous ranges longer than 2 */
20911 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
20913 SV* above_bitmap = NULL;
20915 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
20917 invlist_iterinit(above_bitmap);
20918 while (invlist_iternext(above_bitmap, &start, &end)) {
20921 for (i = start; i <= end; i++) {
20922 put_code_point(output, i);
20925 invlist_iterfinish(above_bitmap);
20926 SvREFCNT_dec_NN(above_bitmap);
20930 if (invert && SvCUR(output) == 1) {
20938 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
20940 SV *nonbitmap_invlist,
20941 SV *only_utf8_locale_invlist,
20942 const regnode * const node,
20943 const bool force_as_is_display)
20945 /* Appends to 'sv' a displayable version of the innards of the bracketed
20946 * character class defined by the other arguments:
20947 * 'bitmap' points to the bitmap, or NULL if to ignore that.
20948 * 'nonbitmap_invlist' is an inversion list of the code points that are in
20949 * the bitmap range, but for some reason aren't in the bitmap; NULL if
20950 * none. The reasons for this could be that they require some
20951 * condition such as the target string being or not being in UTF-8
20952 * (under /d), or because they came from a user-defined property that
20953 * was not resolved at the time of the regex compilation (under /u)
20954 * 'only_utf8_locale_invlist' is an inversion list of the code points that
20955 * are valid only if the runtime locale is a UTF-8 one; NULL if none
20956 * 'node' is the regex pattern ANYOF node. It is needed only when the
20957 * above two parameters are not null, and is passed so that this
20958 * routine can tease apart the various reasons for them.
20959 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
20960 * to invert things to see if that leads to a cleaner display. If
20961 * FALSE, this routine is free to use its judgment about doing this.
20963 * It returns TRUE if there was actually something output. (It may be that
20964 * the bitmap, etc is empty.)
20966 * When called for outputting the bitmap of a non-ANYOF node, just pass the
20967 * bitmap, with the succeeding parameters set to NULL, and the final one to
20971 /* In general, it tries to display the 'cleanest' representation of the
20972 * innards, choosing whether to display them inverted or not, regardless of
20973 * whether the class itself is to be inverted. However, there are some
20974 * cases where it can't try inverting, as what actually matches isn't known
20975 * until runtime, and hence the inversion isn't either. */
20976 bool inverting_allowed = ! force_as_is_display;
20979 STRLEN orig_sv_cur = SvCUR(sv);
20981 SV* invlist; /* Inversion list we accumulate of code points that
20982 are unconditionally matched */
20983 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
20985 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
20987 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
20988 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
20991 SV* as_is_display; /* The output string when we take the inputs
20993 SV* inverted_display; /* The output string when we invert the inputs */
20995 U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
20997 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
20999 /* We are biased in favor of displaying things without them being inverted,
21000 * as that is generally easier to understand */
21001 const int bias = 5;
21003 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
21005 /* Start off with whatever code points are passed in. (We clone, so we
21006 * don't change the caller's list) */
21007 if (nonbitmap_invlist) {
21008 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
21009 invlist = invlist_clone(nonbitmap_invlist, NULL);
21011 else { /* Worst case size is every other code point is matched */
21012 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
21016 if (OP(node) == ANYOFD) {
21018 /* This flag indicates that the code points below 0x100 in the
21019 * nonbitmap list are precisely the ones that match only when the
21020 * target is UTF-8 (they should all be non-ASCII). */
21021 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
21023 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
21024 _invlist_subtract(invlist, only_utf8, &invlist);
21027 /* And this flag for matching all non-ASCII 0xFF and below */
21028 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
21030 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
21033 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
21035 /* If either of these flags are set, what matches isn't
21036 * determinable except during execution, so don't know enough here
21038 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
21039 inverting_allowed = FALSE;
21042 /* What the posix classes match also varies at runtime, so these
21043 * will be output symbolically. */
21044 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
21047 posixes = newSVpvs("");
21048 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
21049 if (ANYOF_POSIXL_TEST(node, i)) {
21050 sv_catpv(posixes, anyofs[i]);
21057 /* Accumulate the bit map into the unconditional match list */
21059 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
21060 if (BITMAP_TEST(bitmap, i)) {
21063 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
21066 invlist = _add_range_to_invlist(invlist, start, i-1);
21071 /* Make sure that the conditional match lists don't have anything in them
21072 * that match unconditionally; otherwise the output is quite confusing.
21073 * This could happen if the code that populates these misses some
21076 _invlist_subtract(only_utf8, invlist, &only_utf8);
21079 _invlist_subtract(not_utf8, invlist, ¬_utf8);
21082 if (only_utf8_locale_invlist) {
21084 /* Since this list is passed in, we have to make a copy before
21086 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
21088 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
21090 /* And, it can get really weird for us to try outputting an inverted
21091 * form of this list when it has things above the bitmap, so don't even
21093 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21094 inverting_allowed = FALSE;
21098 /* Calculate what the output would be if we take the input as-is */
21099 as_is_display = put_charclass_bitmap_innards_common(invlist,
21106 /* If have to take the output as-is, just do that */
21107 if (! inverting_allowed) {
21108 if (as_is_display) {
21109 sv_catsv(sv, as_is_display);
21110 SvREFCNT_dec_NN(as_is_display);
21113 else { /* But otherwise, create the output again on the inverted input, and
21114 use whichever version is shorter */
21116 int inverted_bias, as_is_bias;
21118 /* We will apply our bias to whichever of the the results doesn't have
21128 inverted_bias = bias;
21131 /* Now invert each of the lists that contribute to the output,
21132 * excluding from the result things outside the possible range */
21134 /* For the unconditional inversion list, we have to add in all the
21135 * conditional code points, so that when inverted, they will be gone
21137 _invlist_union(only_utf8, invlist, &invlist);
21138 _invlist_union(not_utf8, invlist, &invlist);
21139 _invlist_union(only_utf8_locale, invlist, &invlist);
21140 _invlist_invert(invlist);
21141 _invlist_intersection(invlist, PL_InBitmap, &invlist);
21144 _invlist_invert(only_utf8);
21145 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
21147 else if (not_utf8) {
21149 /* If a code point matches iff the target string is not in UTF-8,
21150 * then complementing the result has it not match iff not in UTF-8,
21151 * which is the same thing as matching iff it is UTF-8. */
21152 only_utf8 = not_utf8;
21156 if (only_utf8_locale) {
21157 _invlist_invert(only_utf8_locale);
21158 _invlist_intersection(only_utf8_locale,
21160 &only_utf8_locale);
21163 inverted_display = put_charclass_bitmap_innards_common(
21168 only_utf8_locale, invert);
21170 /* Use the shortest representation, taking into account our bias
21171 * against showing it inverted */
21172 if ( inverted_display
21173 && ( ! as_is_display
21174 || ( SvCUR(inverted_display) + inverted_bias
21175 < SvCUR(as_is_display) + as_is_bias)))
21177 sv_catsv(sv, inverted_display);
21179 else if (as_is_display) {
21180 sv_catsv(sv, as_is_display);
21183 SvREFCNT_dec(as_is_display);
21184 SvREFCNT_dec(inverted_display);
21187 SvREFCNT_dec_NN(invlist);
21188 SvREFCNT_dec(only_utf8);
21189 SvREFCNT_dec(not_utf8);
21190 SvREFCNT_dec(posixes);
21191 SvREFCNT_dec(only_utf8_locale);
21193 return SvCUR(sv) > orig_sv_cur;
21196 #define CLEAR_OPTSTART \
21197 if (optstart) STMT_START { \
21198 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
21199 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
21203 #define DUMPUNTIL(b,e) \
21205 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
21207 STATIC const regnode *
21208 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
21209 const regnode *last, const regnode *plast,
21210 SV* sv, I32 indent, U32 depth)
21212 U8 op = PSEUDO; /* Arbitrary non-END op. */
21213 const regnode *next;
21214 const regnode *optstart= NULL;
21216 RXi_GET_DECL(r, ri);
21217 GET_RE_DEBUG_FLAGS_DECL;
21219 PERL_ARGS_ASSERT_DUMPUNTIL;
21221 #ifdef DEBUG_DUMPUNTIL
21222 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
21223 last ? last-start : 0, plast ? plast-start : 0);
21226 if (plast && plast < last)
21229 while (PL_regkind[op] != END && (!last || node < last)) {
21231 /* While that wasn't END last time... */
21234 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
21236 next = regnext((regnode *)node);
21239 if (OP(node) == OPTIMIZED) {
21240 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
21247 regprop(r, sv, node, NULL, NULL);
21248 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
21249 (int)(2*indent + 1), "", SvPVX_const(sv));
21251 if (OP(node) != OPTIMIZED) {
21252 if (next == NULL) /* Next ptr. */
21253 Perl_re_printf( aTHX_ " (0)");
21254 else if (PL_regkind[(U8)op] == BRANCH
21255 && PL_regkind[OP(next)] != BRANCH )
21256 Perl_re_printf( aTHX_ " (FAIL)");
21258 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
21259 Perl_re_printf( aTHX_ "\n");
21263 if (PL_regkind[(U8)op] == BRANCHJ) {
21266 const regnode *nnode = (OP(next) == LONGJMP
21267 ? regnext((regnode *)next)
21269 if (last && nnode > last)
21271 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
21274 else if (PL_regkind[(U8)op] == BRANCH) {
21276 DUMPUNTIL(NEXTOPER(node), next);
21278 else if ( PL_regkind[(U8)op] == TRIE ) {
21279 const regnode *this_trie = node;
21280 const char op = OP(node);
21281 const U32 n = ARG(node);
21282 const reg_ac_data * const ac = op>=AHOCORASICK ?
21283 (reg_ac_data *)ri->data->data[n] :
21285 const reg_trie_data * const trie =
21286 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
21288 AV *const trie_words
21289 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
21291 const regnode *nextbranch= NULL;
21294 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
21295 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
21297 Perl_re_indentf( aTHX_ "%s ",
21300 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
21301 SvCUR(*elem_ptr), PL_dump_re_max_len,
21302 PL_colors[0], PL_colors[1],
21304 ? PERL_PV_ESCAPE_UNI
21306 | PERL_PV_PRETTY_ELLIPSES
21307 | PERL_PV_PRETTY_LTGT
21312 U16 dist= trie->jump[word_idx+1];
21313 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
21314 (UV)((dist ? this_trie + dist : next) - start));
21317 nextbranch= this_trie + trie->jump[0];
21318 DUMPUNTIL(this_trie + dist, nextbranch);
21320 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
21321 nextbranch= regnext((regnode *)nextbranch);
21323 Perl_re_printf( aTHX_ "\n");
21326 if (last && next > last)
21331 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
21332 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
21333 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
21335 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
21337 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
21339 else if ( op == PLUS || op == STAR) {
21340 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
21342 else if (PL_regkind[(U8)op] == EXACT) {
21343 /* Literal string, where present. */
21344 node += NODE_SZ_STR(node) - 1;
21345 node = NEXTOPER(node);
21348 node = NEXTOPER(node);
21349 node += regarglen[(U8)op];
21351 if (op == CURLYX || op == OPEN || op == SROPEN)
21355 #ifdef DEBUG_DUMPUNTIL
21356 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
21361 #endif /* DEBUGGING */
21363 #ifndef PERL_IN_XSUB_RE
21365 #include "uni_keywords.h"
21368 Perl_init_uniprops(pTHX)
21370 /* Set up the inversion list global variables */
21372 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21373 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
21374 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
21375 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
21376 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
21377 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
21378 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
21379 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
21380 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
21381 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
21382 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
21383 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
21384 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
21385 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
21386 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
21387 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
21389 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
21390 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
21391 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
21392 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
21393 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
21394 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
21395 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
21396 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
21397 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
21398 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
21399 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
21400 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
21401 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
21402 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
21403 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
21404 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
21406 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
21407 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
21408 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
21409 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
21410 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
21412 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
21413 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
21414 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
21416 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
21418 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
21419 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
21421 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
21422 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
21424 PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
21425 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
21426 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
21427 PL_NonL1NonFinalFold = _new_invlist_C_array(
21428 NonL1_Perl_Non_Final_Folds_invlist);
21430 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
21431 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
21432 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
21433 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
21434 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
21435 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
21436 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
21438 /* The below are used only by deprecated functions. They could be removed */
21439 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
21440 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
21441 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
21445 Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len,
21446 const bool to_fold, bool * invert)
21448 /* Parse the interior meat of \p{} passed to this in 'name' with length
21449 * 'name_len', and return an inversion list if a property with 'name' is
21450 * found, or NULL if not. 'name' point to the input with leading and
21451 * trailing space trimmed. 'to_fold' indicates if /i is in effect.
21453 * When the return is an inversion list, '*invert' will be set to a boolean
21454 * indicating if it should be inverted or not
21456 * This currently doesn't handle all cases. A NULL return indicates the
21457 * caller should try a different approach
21461 bool stricter = FALSE;
21462 bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one
21463 of the cjk numeric properties (though
21464 it requires extra effort to compile
21467 unsigned int j = 0, lookup_len;
21468 int equals_pos = -1; /* Where the '=' is found, or negative if none */
21469 int slash_pos = -1; /* Where the '/' is found, or negative if none */
21470 int table_index = 0;
21471 bool starts_with_In_or_Is = FALSE;
21472 Size_t lookup_offset = 0;
21474 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
21476 /* The input will be modified into 'lookup_name' */
21477 Newx(lookup_name, name_len, char);
21478 SAVEFREEPV(lookup_name);
21480 /* Parse the input. */
21481 for (i = 0; i < name_len; i++) {
21482 char cur = name[i];
21484 /* These characters can be freely ignored in most situations. Later it
21485 * may turn out we shouldn't have ignored them, and we have to reparse,
21486 * but we don't have enough information yet to make that decision */
21487 if (cur == '-' || cur == '_' || isSPACE_A(cur)) {
21491 /* Case differences are also ignored. Our lookup routine assumes
21492 * everything is lowercase */
21493 if (isUPPER_A(cur)) {
21494 lookup_name[j++] = toLOWER(cur);
21498 /* A double colon is either an error, or a package qualifier to a
21499 * subroutine user-defined property; neither of which do we currently
21502 * But a single colon is a synonym for '=' */
21504 if (i < name_len - 1 && name[i+1] == ':') {
21510 /* Otherwise, this character is part of the name. */
21511 lookup_name[j++] = cur;
21513 /* Only the equals sign needs further processing */
21515 equals_pos = j; /* Note where it occurred in the input */
21520 /* Here, we are either done with the whole property name, if it was simple;
21521 * or are positioned just after the '=' if it is compound. */
21523 if (equals_pos >= 0) {
21524 assert(! stricter); /* We shouldn't have set this yet */
21526 /* Space immediately after the '=' is ignored */
21528 for (; i < name_len; i++) {
21529 if (! isSPACE_A(name[i])) {
21534 /* Certain properties need special handling. They may optionally be
21535 * prefixed by 'is'. Ignore that prefix for the purposes of checking
21536 * if this is one of those properties */
21537 if (memBEGINPs(lookup_name, name_len, "is")) {
21541 /* Then check if it is one of these properties. This is hard-coded
21542 * because easier this way, and the list is unlikely to change. There
21543 * are several properties like this in the Unihan DB, which is unlikely
21544 * to be compiled, and they all end with 'numeric'. The interiors
21545 * aren't checked for the precise property. This would stop working if
21546 * a cjk property were to be created that ended with 'numeric' and
21547 * wasn't a numeric type */
21548 is_nv_type = memEQs(lookup_name + lookup_offset,
21549 j - 1 - lookup_offset, "numericvalue")
21550 || memEQs(lookup_name + lookup_offset,
21551 j - 1 - lookup_offset, "nv")
21552 || ( memENDPs(lookup_name + lookup_offset,
21553 j - 1 - lookup_offset, "numeric")
21554 && ( memBEGINPs(lookup_name + lookup_offset,
21555 j - 1 - lookup_offset, "cjk")
21556 || memBEGINPs(lookup_name + lookup_offset,
21557 j - 1 - lookup_offset, "k")));
21559 || memEQs(lookup_name + lookup_offset,
21560 j - 1 - lookup_offset, "canonicalcombiningclass")
21561 || memEQs(lookup_name + lookup_offset,
21562 j - 1 - lookup_offset, "ccc")
21563 || memEQs(lookup_name + lookup_offset,
21564 j - 1 - lookup_offset, "age")
21565 || memEQs(lookup_name + lookup_offset,
21566 j - 1 - lookup_offset, "in")
21567 || memEQs(lookup_name + lookup_offset,
21568 j - 1 - lookup_offset, "presentin"))
21572 /* What makes these properties special is that the stuff after the
21573 * '=' is a number. Therefore, we can't throw away '-'
21574 * willy-nilly, as those could be a minus sign. Other stricter
21575 * rules also apply. However, these properties all can have the
21576 * rhs not be a number, in which case they contain at least one
21577 * alphabetic. In those cases, the stricter rules don't apply.
21578 * But the numeric type properties can have the alphas [Ee] to
21579 * signify an exponent, and it is still a number with stricter
21580 * rules. So look for an alpha that signifys not-strict */
21582 for (k = i; k < name_len; k++) {
21583 if ( isALPHA_A(name[k])
21584 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
21594 /* A number may have a leading '+' or '-'. The latter is retained
21596 if (name[i] == '+') {
21599 else if (name[i] == '-') {
21600 lookup_name[j++] = '-';
21604 /* Skip leading zeros including single underscores separating the
21605 * zeros, or between the final leading zero and the first other
21607 for (; i < name_len - 1; i++) {
21608 if ( name[i] != '0'
21609 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
21616 else { /* No '=' */
21618 /* We are now in a position to determine if this property should have
21619 * been parsed using stricter rules. Only a few are like that, and
21620 * unlikely to change. */
21621 if ( memBEGINPs(lookup_name, j, "perl")
21622 && memNEs(lookup_name + 4, j - 4, "space")
21623 && memNEs(lookup_name + 4, j - 4, "word"))
21627 /* We set the inputs back to 0 and the code below will reparse,
21633 /* Here, we have either finished the property, or are positioned to parse
21634 * the remainder, and we know if stricter rules apply. Finish out, if not
21636 for (; i < name_len; i++) {
21637 char cur = name[i];
21639 /* In all instances, case differences are ignored, and we normalize to
21641 if (isUPPER_A(cur)) {
21642 lookup_name[j++] = toLOWER(cur);
21646 /* An underscore is skipped, but not under strict rules unless it
21647 * separates two digits */
21650 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
21651 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
21653 lookup_name[j++] = '_';
21658 /* Hyphens are skipped except under strict */
21659 if (cur == '-' && ! stricter) {
21663 /* XXX Bug in documentation. It says white space skipped adjacent to
21664 * non-word char. Maybe we should, but shouldn't skip it next to a dot
21666 if (isSPACE_A(cur) && ! stricter) {
21670 lookup_name[j++] = cur;
21672 /* Unless this is a non-trailing slash, we are done with it */
21673 if (i >= name_len - 1 || cur != '/') {
21679 /* A slash in the 'numeric value' property indicates that what follows
21680 * is a denominator. It can have a leading '+' and '0's that should be
21681 * skipped. But we have never allowed a negative denominator, so treat
21682 * a minus like every other character. (No need to rule out a second
21683 * '/', as that won't match anything anyway */
21686 if (i < name_len && name[i] == '+') {
21690 /* Skip leading zeros including underscores separating digits */
21691 for (; i < name_len - 1; i++) {
21692 if ( name[i] != '0'
21693 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
21699 /* Store the first real character in the denominator */
21700 lookup_name[j++] = name[i];
21704 /* Here are completely done parsing the input 'name', and 'lookup_name'
21705 * contains a copy, normalized.
21707 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
21708 * different from without the underscores. */
21709 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
21710 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
21711 && UNLIKELY(name[name_len-1] == '_'))
21713 lookup_name[j++] = '&';
21715 else if (name_len > 2 && name[0] == 'I' && ( name[1] == 'n'
21716 || name[1] == 's'))
21719 /* Also, if the original input began with 'In' or 'Is', it could be a
21720 * subroutine call instead of a property names, which currently isn't
21721 * handled by this function. Subroutine calls can't happen if there is
21722 * an '=' in the name */
21723 if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL)
21728 starts_with_In_or_Is = TRUE;
21731 lookup_len = j; /* Use a more mnemonic name starting here */
21733 /* Get the index into our pointer table of the inversion list corresponding
21734 * to the property */
21735 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
21737 /* If it didn't find the property */
21738 if (table_index == 0) {
21740 /* If didn't find the property, we try again stripping off any initial
21742 if (starts_with_In_or_Is) {
21748 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
21751 if (table_index == 0) {
21754 /* If not found, and not a numeric type property, isn't a legal
21756 if (! is_nv_type) {
21760 /* But the numeric type properties need more work to decide. What
21761 * we do is make sure we have the number in canonical form and look
21764 if (slash_pos < 0) { /* No slash */
21766 /* When it isn't a rational, take the input, convert it to a
21767 * NV, then create a canonical string representation of that
21772 /* Get the value */
21773 if (my_atof3(lookup_name + equals_pos, &value,
21774 lookup_len - equals_pos)
21775 != lookup_name + lookup_len)
21780 /* If the value is an integer, the canonical value is integral */
21781 if (Perl_ceil(value) == value) {
21782 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
21783 equals_pos, lookup_name, value);
21785 else { /* Otherwise, it is %e with a known precision */
21788 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
21789 equals_pos, lookup_name,
21790 PL_E_FORMAT_PRECISION, value);
21792 /* The exponent generated is expecting two digits, whereas
21793 * %e on some systems will generate three. Remove leading
21794 * zeros in excess of 2 from the exponent. We start
21795 * looking for them after the '=' */
21796 exp_ptr = strchr(canonical + equals_pos, 'e');
21798 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
21799 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
21801 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
21803 if (excess_exponent_len > 0) {
21804 SSize_t leading_zeros = strspn(cur_ptr, "0");
21805 SSize_t excess_leading_zeros
21806 = MIN(leading_zeros, excess_exponent_len);
21807 if (excess_leading_zeros > 0) {
21808 Move(cur_ptr + excess_leading_zeros,
21810 strlen(cur_ptr) - excess_leading_zeros
21811 + 1, /* Copy the NUL as well */
21818 else { /* Has a slash. Create a rational in canonical form */
21819 UV numerator, denominator, gcd, trial;
21820 const char * end_ptr;
21821 const char * sign = "";
21823 /* We can't just find the numerator, denominator, and do the
21824 * division, then use the method above, because that is
21825 * inexact. And the input could be a rational that is within
21826 * epsilon (given our precision) of a valid rational, and would
21827 * then incorrectly compare valid.
21829 * We're only interested in the part after the '=' */
21830 const char * this_lookup_name = lookup_name + equals_pos;
21831 lookup_len -= equals_pos;
21832 slash_pos -= equals_pos;
21834 /* Handle any leading minus */
21835 if (this_lookup_name[0] == '-') {
21837 this_lookup_name++;
21842 /* Convert the numerator to numeric */
21843 end_ptr = this_lookup_name + slash_pos;
21844 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
21848 /* It better have included all characters before the slash */
21849 if (*end_ptr != '/') {
21853 /* Set to look at just the denominator */
21854 this_lookup_name += slash_pos;
21855 lookup_len -= slash_pos;
21856 end_ptr = this_lookup_name + lookup_len;
21858 /* Convert the denominator to numeric */
21859 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
21863 /* It better be the rest of the characters, and don't divide by
21865 if ( end_ptr != this_lookup_name + lookup_len
21866 || denominator == 0)
21871 /* Get the greatest common denominator using
21872 http://en.wikipedia.org/wiki/Euclidean_algorithm */
21874 trial = denominator;
21875 while (trial != 0) {
21877 trial = gcd % trial;
21881 /* If already in lowest possible terms, we have already tried
21882 * looking this up */
21887 /* Reduce the rational, which should put it in canonical form.
21888 * Then look it up */
21890 denominator /= gcd;
21892 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
21893 equals_pos, lookup_name, sign, numerator, denominator);
21896 /* Here, we have the number in canonical form. Try that */
21897 table_index = match_uniprop((U8 *) canonical, strlen(canonical));
21898 if (table_index == 0) {
21904 /* The return is an index into a table of ptrs. A negative return
21905 * signifies that the real index is the absolute value, but the result
21906 * needs to be inverted */
21907 if (table_index < 0) {
21909 table_index = -table_index;
21915 /* Out-of band indices indicate a deprecated property. The proper index is
21916 * modulo it with the table size. And dividing by the table size yields
21917 * an offset into a table constructed to contain the corresponding warning
21919 if (table_index > MAX_UNI_KEYWORD_INDEX) {
21920 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
21921 table_index %= MAX_UNI_KEYWORD_INDEX;
21922 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
21923 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
21924 (int) name_len, name, deprecated_property_msgs[warning_offset]);
21927 /* In a few properties, a different property is used under /i. These are
21928 * unlikely to change, so are hard-coded here. */
21930 if ( table_index == UNI_XPOSIXUPPER
21931 || table_index == UNI_XPOSIXLOWER
21932 || table_index == UNI_TITLE)
21934 table_index = UNI_CASED;
21936 else if ( table_index == UNI_UPPERCASELETTER
21937 || table_index == UNI_LOWERCASELETTER
21938 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
21939 || table_index == UNI_TITLECASELETTER
21942 table_index = UNI_CASEDLETTER;
21944 else if ( table_index == UNI_POSIXUPPER
21945 || table_index == UNI_POSIXLOWER)
21947 table_index = UNI_POSIXALPHA;
21951 /* Create and return the inversion list */
21952 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
21958 * ex: set ts=8 sts=4 sw=4 et: