5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
78 #ifdef PERL_IN_XSUB_RE
80 EXTERN_C const struct regexp_engine my_reg_engine;
85 #include "dquote_inline.h"
86 #include "invlist_inline.h"
87 #include "unicode_constants.h"
89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
100 /* this is a chain of data about sub patterns we are processing that
101 need to be handled separately/specially in study_chunk. Its so
102 we can simulate recursion without losing state. */
104 typedef struct scan_frame {
105 regnode *last_regnode; /* last node to process in this frame */
106 regnode *next_regnode; /* next node to process when last is reached */
107 U32 prev_recursed_depth;
108 I32 stopparen; /* what stopparen do we use */
110 struct scan_frame *this_prev_frame; /* this previous frame */
111 struct scan_frame *prev_frame; /* previous frame */
112 struct scan_frame *next_frame; /* next frame */
115 /* Certain characters are output as a sequence with the first being a
117 #define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
120 struct RExC_state_t {
121 U32 flags; /* RXf_* are we folding, multilining? */
122 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
123 char *precomp; /* uncompiled string. */
124 char *precomp_end; /* pointer to end of uncompiled string. */
125 REGEXP *rx_sv; /* The SV that is the regexp. */
126 regexp *rx; /* perl core regexp structure */
127 regexp_internal *rxi; /* internal data for regexp object
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 char *copy_start; /* start of copy of input within
133 constructed parse string */
134 char *save_copy_start; /* Provides one level of saving
135 and restoring 'copy_start' */
136 char *copy_start_in_input; /* Position in input string
137 corresponding to copy_start */
138 SSize_t whilem_seen; /* number of WHILEM in this expr */
139 regnode *emit_start; /* Start of emitted-code area */
140 regnode_offset emit; /* Code-emit pointer */
141 I32 naughty; /* How bad is this pattern? */
142 I32 sawback; /* Did we see \1, ...? */
144 SSize_t size; /* Number of regnode equivalents in
147 /* position beyond 'precomp' of the warning message furthest away from
148 * 'precomp'. During the parse, no warnings are raised for any problems
149 * earlier in the parse than this position. This works if warnings are
150 * raised the first time a given spot is parsed, and if only one
151 * independent warning is raised for any given spot */
152 Size_t latest_warn_offset;
154 I32 npar; /* Capture buffer count so far in the
155 parse, (OPEN) plus one. ("par" 0 is
157 I32 total_par; /* During initial parse, is either 0,
158 or -1; the latter indicating a
159 reparse is needed. After that pass,
160 it is what 'npar' became after the
161 pass. Hence, it being > 0 indicates
162 we are in a reparse situation */
163 I32 nestroot; /* root parens we are in - used by
166 regnode_offset *open_parens; /* offsets to open parens */
167 regnode_offset *close_parens; /* offsets to close parens */
168 I32 parens_buf_size; /* #slots malloced open/close_parens */
169 regnode *end_op; /* END node in program */
170 I32 utf8; /* whether the pattern is utf8 or not */
171 I32 orig_utf8; /* whether the pattern was originally in utf8 */
172 /* XXX use this for future optimisation of case
173 * where pattern must be upgraded to utf8. */
174 I32 uni_semantics; /* If a d charset modifier should use unicode
175 rules, even if the pattern is not in
177 HV *paren_names; /* Paren names */
179 regnode **recurse; /* Recurse regops */
180 I32 recurse_count; /* Number of recurse regops we have generated */
181 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
183 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
187 I32 override_recoding;
188 I32 recode_x_to_native;
189 I32 in_multi_char_class;
190 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
192 int code_index; /* next code_blocks[] slot */
193 SSize_t maxlen; /* mininum possible number of chars in string to match */
194 scan_frame *frame_head;
195 scan_frame *frame_last;
199 #ifdef ADD_TO_REGEXEC
200 char *starttry; /* -Dr: where regtry was called. */
201 #define RExC_starttry (pRExC_state->starttry)
203 SV *runtime_code_qr; /* qr with the runtime code blocks */
205 const char *lastparse;
207 AV *paren_name_list; /* idx -> name */
208 U32 study_chunk_recursed_count;
212 #define RExC_lastparse (pRExC_state->lastparse)
213 #define RExC_lastnum (pRExC_state->lastnum)
214 #define RExC_paren_name_list (pRExC_state->paren_name_list)
215 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
216 #define RExC_mysv (pRExC_state->mysv1)
217 #define RExC_mysv1 (pRExC_state->mysv1)
218 #define RExC_mysv2 (pRExC_state->mysv2)
228 #define RExC_flags (pRExC_state->flags)
229 #define RExC_pm_flags (pRExC_state->pm_flags)
230 #define RExC_precomp (pRExC_state->precomp)
231 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
232 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
233 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
234 #define RExC_precomp_end (pRExC_state->precomp_end)
235 #define RExC_rx_sv (pRExC_state->rx_sv)
236 #define RExC_rx (pRExC_state->rx)
237 #define RExC_rxi (pRExC_state->rxi)
238 #define RExC_start (pRExC_state->start)
239 #define RExC_end (pRExC_state->end)
240 #define RExC_parse (pRExC_state->parse)
241 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
242 #define RExC_whilem_seen (pRExC_state->whilem_seen)
243 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
244 under /d from /u ? */
246 #ifdef RE_TRACK_PATTERN_OFFSETS
247 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
250 #define RExC_emit (pRExC_state->emit)
251 #define RExC_emit_start (pRExC_state->emit_start)
252 #define RExC_sawback (pRExC_state->sawback)
253 #define RExC_seen (pRExC_state->seen)
254 #define RExC_size (pRExC_state->size)
255 #define RExC_maxlen (pRExC_state->maxlen)
256 #define RExC_npar (pRExC_state->npar)
257 #define RExC_total_parens (pRExC_state->total_par)
258 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
259 #define RExC_nestroot (pRExC_state->nestroot)
260 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
261 #define RExC_utf8 (pRExC_state->utf8)
262 #define RExC_uni_semantics (pRExC_state->uni_semantics)
263 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
264 #define RExC_open_parens (pRExC_state->open_parens)
265 #define RExC_close_parens (pRExC_state->close_parens)
266 #define RExC_end_op (pRExC_state->end_op)
267 #define RExC_paren_names (pRExC_state->paren_names)
268 #define RExC_recurse (pRExC_state->recurse)
269 #define RExC_recurse_count (pRExC_state->recurse_count)
270 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
271 #define RExC_study_chunk_recursed_bytes \
272 (pRExC_state->study_chunk_recursed_bytes)
273 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
274 #define RExC_in_lookahead (pRExC_state->in_lookahead)
275 #define RExC_contains_locale (pRExC_state->contains_locale)
276 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
279 # define SET_recode_x_to_native(x) \
280 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
282 # define SET_recode_x_to_native(x) NOOP
285 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
286 #define RExC_frame_head (pRExC_state->frame_head)
287 #define RExC_frame_last (pRExC_state->frame_last)
288 #define RExC_frame_count (pRExC_state->frame_count)
289 #define RExC_strict (pRExC_state->strict)
290 #define RExC_study_started (pRExC_state->study_started)
291 #define RExC_warn_text (pRExC_state->warn_text)
292 #define RExC_in_script_run (pRExC_state->in_script_run)
293 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
294 #define RExC_unlexed_names (pRExC_state->unlexed_names)
296 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
297 * a flag to disable back-off on the fixed/floating substrings - if it's
298 * a high complexity pattern we assume the benefit of avoiding a full match
299 * is worth the cost of checking for the substrings even if they rarely help.
301 #define RExC_naughty (pRExC_state->naughty)
302 #define TOO_NAUGHTY (10)
303 #define MARK_NAUGHTY(add) \
304 if (RExC_naughty < TOO_NAUGHTY) \
305 RExC_naughty += (add)
306 #define MARK_NAUGHTY_EXP(exp, add) \
307 if (RExC_naughty < TOO_NAUGHTY) \
308 RExC_naughty += RExC_naughty / (exp) + (add)
310 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
311 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
312 ((*s) == '{' && regcurly(s)))
315 * Flags to be passed up and down.
317 #define WORST 0 /* Worst case. */
318 #define HASWIDTH 0x01 /* Known to not match null strings, could match
321 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
322 * character. (There needs to be a case: in the switch statement in regexec.c
323 * for any node marked SIMPLE.) Note that this is not the same thing as
326 #define SPSTART 0x04 /* Starts with * or + */
327 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
328 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
329 #define RESTART_PARSE 0x20 /* Need to redo the parse */
330 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
331 calcuate sizes as UTF-8 */
333 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
335 /* whether trie related optimizations are enabled */
336 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
337 #define TRIE_STUDY_OPT
338 #define FULL_TRIE_STUDY
344 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
345 #define PBITVAL(paren) (1 << ((paren) & 7))
346 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
347 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
348 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
350 #define REQUIRE_UTF8(flagp) STMT_START { \
352 *flagp = RESTART_PARSE|NEED_UTF8; \
357 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
358 * a flag that indicates we need to override /d with /u as a result of
359 * something in the pattern. It should only be used in regards to calling
360 * set_regex_charset() or get_regex_charset() */
361 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
363 if (DEPENDS_SEMANTICS) { \
364 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
365 RExC_uni_semantics = 1; \
366 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
367 /* No need to restart the parse if we haven't seen \
368 * anything that differs between /u and /d, and no need \
369 * to restart immediately if we're going to reparse \
370 * anyway to count parens */ \
371 *flagp |= RESTART_PARSE; \
372 return restart_retval; \
377 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
379 RExC_use_BRANCHJ = 1; \
380 *flagp |= RESTART_PARSE; \
381 return restart_retval; \
384 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
385 * less. After that, it must always be positive, because the whole re is
386 * considered to be surrounded by virtual parens. Setting it to negative
387 * indicates there is some construct that needs to know the actual number of
388 * parens to be properly handled. And that means an extra pass will be
389 * required after we've counted them all */
390 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
391 #define REQUIRE_PARENS_PASS \
392 STMT_START { /* No-op if have completed a pass */ \
393 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
395 #define IN_PARENS_PASS (RExC_total_parens < 0)
398 /* This is used to return failure (zero) early from the calling function if
399 * various flags in 'flags' are set. Two flags always cause a return:
400 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
401 * additional flags that should cause a return; 0 if none. If the return will
402 * be done, '*flagp' is first set to be all of the flags that caused the
404 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
406 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
407 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
412 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
414 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
415 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
416 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
417 if (MUST_RESTART(*(flagp))) return 0
419 /* This converts the named class defined in regcomp.h to its equivalent class
420 * number defined in handy.h. */
421 #define namedclass_to_classnum(class) ((int) ((class) / 2))
422 #define classnum_to_namedclass(classnum) ((classnum) * 2)
424 #define _invlist_union_complement_2nd(a, b, output) \
425 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
426 #define _invlist_intersection_complement_2nd(a, b, output) \
427 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
429 /* About scan_data_t.
431 During optimisation we recurse through the regexp program performing
432 various inplace (keyhole style) optimisations. In addition study_chunk
433 and scan_commit populate this data structure with information about
434 what strings MUST appear in the pattern. We look for the longest
435 string that must appear at a fixed location, and we look for the
436 longest string that may appear at a floating location. So for instance
441 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
442 strings (because they follow a .* construct). study_chunk will identify
443 both FOO and BAR as being the longest fixed and floating strings respectively.
445 The strings can be composites, for instance
449 will result in a composite fixed substring 'foo'.
451 For each string some basic information is maintained:
454 This is the position the string must appear at, or not before.
455 It also implicitly (when combined with minlenp) tells us how many
456 characters must match before the string we are searching for.
457 Likewise when combined with minlenp and the length of the string it
458 tells us how many characters must appear after the string we have
462 Only used for floating strings. This is the rightmost point that
463 the string can appear at. If set to SSize_t_MAX it indicates that the
464 string can occur infinitely far to the right.
465 For fixed strings, it is equal to min_offset.
468 A pointer to the minimum number of characters of the pattern that the
469 string was found inside. This is important as in the case of positive
470 lookahead or positive lookbehind we can have multiple patterns
475 The minimum length of the pattern overall is 3, the minimum length
476 of the lookahead part is 3, but the minimum length of the part that
477 will actually match is 1. So 'FOO's minimum length is 3, but the
478 minimum length for the F is 1. This is important as the minimum length
479 is used to determine offsets in front of and behind the string being
480 looked for. Since strings can be composites this is the length of the
481 pattern at the time it was committed with a scan_commit. Note that
482 the length is calculated by study_chunk, so that the minimum lengths
483 are not known until the full pattern has been compiled, thus the
484 pointer to the value.
488 In the case of lookbehind the string being searched for can be
489 offset past the start point of the final matching string.
490 If this value was just blithely removed from the min_offset it would
491 invalidate some of the calculations for how many chars must match
492 before or after (as they are derived from min_offset and minlen and
493 the length of the string being searched for).
494 When the final pattern is compiled and the data is moved from the
495 scan_data_t structure into the regexp structure the information
496 about lookbehind is factored in, with the information that would
497 have been lost precalculated in the end_shift field for the
500 The fields pos_min and pos_delta are used to store the minimum offset
501 and the delta to the maximum offset at the current point in the pattern.
505 struct scan_data_substrs {
506 SV *str; /* longest substring found in pattern */
507 SSize_t min_offset; /* earliest point in string it can appear */
508 SSize_t max_offset; /* latest point in string it can appear */
509 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
510 SSize_t lookbehind; /* is the pos of the string modified by LB */
511 I32 flags; /* per substring SF_* and SCF_* flags */
514 typedef struct scan_data_t {
515 /*I32 len_min; unused */
516 /*I32 len_delta; unused */
520 SSize_t last_end; /* min value, <0 unless valid. */
521 SSize_t last_start_min;
522 SSize_t last_start_max;
523 U8 cur_is_floating; /* whether the last_* values should be set as
524 * the next fixed (0) or floating (1)
527 /* [0] is longest fixed substring so far, [1] is longest float so far */
528 struct scan_data_substrs substrs[2];
530 I32 flags; /* common SF_* and SCF_* flags */
532 SSize_t *last_closep;
533 regnode_ssc *start_class;
537 * Forward declarations for pregcomp()'s friends.
540 static const scan_data_t zero_scan_data = {
541 0, 0, NULL, 0, 0, 0, 0,
543 { NULL, 0, 0, 0, 0, 0 },
544 { NULL, 0, 0, 0, 0, 0 },
551 #define SF_BEFORE_SEOL 0x0001
552 #define SF_BEFORE_MEOL 0x0002
553 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
555 #define SF_IS_INF 0x0040
556 #define SF_HAS_PAR 0x0080
557 #define SF_IN_PAR 0x0100
558 #define SF_HAS_EVAL 0x0200
561 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
562 * longest substring in the pattern. When it is not set the optimiser keeps
563 * track of position, but does not keep track of the actual strings seen,
565 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
568 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
569 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
570 * turned off because of the alternation (BRANCH). */
571 #define SCF_DO_SUBSTR 0x0400
573 #define SCF_DO_STCLASS_AND 0x0800
574 #define SCF_DO_STCLASS_OR 0x1000
575 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
576 #define SCF_WHILEM_VISITED_POS 0x2000
578 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
579 #define SCF_SEEN_ACCEPT 0x8000
580 #define SCF_TRIE_DOING_RESTUDY 0x10000
581 #define SCF_IN_DEFINE 0x20000
586 #define UTF cBOOL(RExC_utf8)
588 /* The enums for all these are ordered so things work out correctly */
589 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
590 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
591 == REGEX_DEPENDS_CHARSET)
592 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
593 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
594 >= REGEX_UNICODE_CHARSET)
595 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
596 == REGEX_ASCII_RESTRICTED_CHARSET)
597 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
598 >= REGEX_ASCII_RESTRICTED_CHARSET)
599 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
600 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
602 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
604 /* For programs that want to be strictly Unicode compatible by dying if any
605 * attempt is made to match a non-Unicode code point against a Unicode
607 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
609 #define OOB_NAMEDCLASS -1
611 /* There is no code point that is out-of-bounds, so this is problematic. But
612 * its only current use is to initialize a variable that is always set before
614 #define OOB_UNICODE 0xDEADBEEF
616 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
619 /* length of regex to show in messages that don't mark a position within */
620 #define RegexLengthToShowInErrorMessages 127
623 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
624 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
625 * op/pragma/warn/regcomp.
627 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
628 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
630 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
631 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
633 /* The code in this file in places uses one level of recursion with parsing
634 * rebased to an alternate string constructed by us in memory. This can take
635 * the form of something that is completely different from the input, or
636 * something that uses the input as part of the alternate. In the first case,
637 * there should be no possibility of an error, as we are in complete control of
638 * the alternate string. But in the second case we don't completely control
639 * the input portion, so there may be errors in that. Here's an example:
641 * is handled specially because \x{df} folds to a sequence of more than one
642 * character: 'ss'. What is done is to create and parse an alternate string,
643 * which looks like this:
644 * /(?:\x{DF}|[abc\x{DF}def])/ui
645 * where it uses the input unchanged in the middle of something it constructs,
646 * which is a branch for the DF outside the character class, and clustering
647 * parens around the whole thing. (It knows enough to skip the DF inside the
648 * class while in this substitute parse.) 'abc' and 'def' may have errors that
649 * need to be reported. The general situation looks like this:
651 * |<------- identical ------>|
653 * Input: ---------------------------------------------------------------
654 * Constructed: ---------------------------------------------------
656 * |<------- identical ------>|
658 * sI..eI is the portion of the input pattern we are concerned with here.
659 * sC..EC is the constructed substitute parse string.
660 * sC..tC is constructed by us
661 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
662 * In the diagram, these are vertically aligned.
663 * eC..EC is also constructed by us.
664 * xC is the position in the substitute parse string where we found a
666 * xI is the position in the original pattern corresponding to xC.
668 * We want to display a message showing the real input string. Thus we need to
669 * translate from xC to xI. We know that xC >= tC, since the portion of the
670 * string sC..tC has been constructed by us, and so shouldn't have errors. We
672 * xI = tI + (xC - tC)
674 * When the substitute parse is constructed, the code needs to set:
677 * RExC_copy_start_in_input (tI)
678 * RExC_copy_start_in_constructed (tC)
679 * and restore them when done.
681 * During normal processing of the input pattern, both
682 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
683 * sI, so that xC equals xI.
686 #define sI RExC_precomp
687 #define eI RExC_precomp_end
688 #define sC RExC_start
690 #define tI RExC_copy_start_in_input
691 #define tC RExC_copy_start_in_constructed
692 #define xI(xC) (tI + (xC - tC))
693 #define xI_offset(xC) (xI(xC) - sI)
695 #define REPORT_LOCATION_ARGS(xC) \
697 (xI(xC) > eI) /* Don't run off end */ \
698 ? eI - sI /* Length before the <--HERE */ \
699 : ((xI_offset(xC) >= 0) \
701 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
702 IVdf " trying to output message for " \
704 __FILE__, __LINE__, (IV) xI_offset(xC), \
705 ((int) (eC - sC)), sC), 0)), \
706 sI), /* The input pattern printed up to the <--HERE */ \
708 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
709 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
711 /* Used to point after bad bytes for an error message, but avoid skipping
712 * past a nul byte. */
713 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
715 /* Set up to clean up after our imminent demise */
716 #define PREPARE_TO_DIE \
719 SAVEFREESV(RExC_rx_sv); \
720 if (RExC_open_parens) \
721 SAVEFREEPV(RExC_open_parens); \
722 if (RExC_close_parens) \
723 SAVEFREEPV(RExC_close_parens); \
727 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
728 * arg. Show regex, up to a maximum length. If it's too long, chop and add
731 #define _FAIL(code) STMT_START { \
732 const char *ellipses = ""; \
733 IV len = RExC_precomp_end - RExC_precomp; \
736 if (len > RegexLengthToShowInErrorMessages) { \
737 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
738 len = RegexLengthToShowInErrorMessages - 10; \
744 #define FAIL(msg) _FAIL( \
745 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
746 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
748 #define FAIL2(msg,arg) _FAIL( \
749 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
750 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
752 #define FAIL3(msg,arg1,arg2) _FAIL( \
753 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
754 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
757 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
759 #define Simple_vFAIL(m) STMT_START { \
760 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
761 m, REPORT_LOCATION_ARGS(RExC_parse)); \
765 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
767 #define vFAIL(m) STMT_START { \
773 * Like Simple_vFAIL(), but accepts two arguments.
775 #define Simple_vFAIL2(m,a1) STMT_START { \
776 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
777 REPORT_LOCATION_ARGS(RExC_parse)); \
781 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
783 #define vFAIL2(m,a1) STMT_START { \
785 Simple_vFAIL2(m, a1); \
790 * Like Simple_vFAIL(), but accepts three arguments.
792 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
793 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
794 REPORT_LOCATION_ARGS(RExC_parse)); \
798 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
800 #define vFAIL3(m,a1,a2) STMT_START { \
802 Simple_vFAIL3(m, a1, a2); \
806 * Like Simple_vFAIL(), but accepts four arguments.
808 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
809 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
810 REPORT_LOCATION_ARGS(RExC_parse)); \
813 #define vFAIL4(m,a1,a2,a3) STMT_START { \
815 Simple_vFAIL4(m, a1, a2, a3); \
818 /* A specialized version of vFAIL2 that works with UTF8f */
819 #define vFAIL2utf8f(m, a1) STMT_START { \
821 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
822 REPORT_LOCATION_ARGS(RExC_parse)); \
825 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
827 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
828 REPORT_LOCATION_ARGS(RExC_parse)); \
831 /* Setting this to NULL is a signal to not output warnings */
832 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
834 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
835 RExC_copy_start_in_constructed = NULL; \
837 #define RESTORE_WARNINGS \
838 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
840 /* Since a warning can be generated multiple times as the input is reparsed, we
841 * output it the first time we come to that point in the parse, but suppress it
842 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
843 * generate any warnings */
844 #define TO_OUTPUT_WARNINGS(loc) \
845 ( RExC_copy_start_in_constructed \
846 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
848 /* After we've emitted a warning, we save the position in the input so we don't
850 #define UPDATE_WARNINGS_LOC(loc) \
852 if (TO_OUTPUT_WARNINGS(loc)) { \
853 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
858 /* 'warns' is the output of the packWARNx macro used in 'code' */
859 #define _WARN_HELPER(loc, warns, code) \
861 if (! RExC_copy_start_in_constructed) { \
862 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
863 " expected at '%s'", \
864 __FILE__, __LINE__, loc); \
866 if (TO_OUTPUT_WARNINGS(loc)) { \
870 UPDATE_WARNINGS_LOC(loc); \
874 /* m is not necessarily a "literal string", in this macro */
875 #define reg_warn_non_literal_string(loc, m) \
876 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
877 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
878 "%s" REPORT_LOCATION, \
879 m, REPORT_LOCATION_ARGS(loc)))
881 #define ckWARNreg(loc,m) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
885 REPORT_LOCATION_ARGS(loc)))
887 #define vWARN(loc, m) \
888 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
889 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
891 REPORT_LOCATION_ARGS(loc))) \
893 #define vWARN_dep(loc, m) \
894 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
895 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
897 REPORT_LOCATION_ARGS(loc)))
899 #define ckWARNdep(loc,m) \
900 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
901 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
903 REPORT_LOCATION_ARGS(loc)))
905 #define ckWARNregdep(loc,m) \
906 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
907 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
910 REPORT_LOCATION_ARGS(loc)))
912 #define ckWARN2reg_d(loc,m, a1) \
913 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
914 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
916 a1, REPORT_LOCATION_ARGS(loc)))
918 #define ckWARN2reg(loc, m, a1) \
919 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
920 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
922 a1, REPORT_LOCATION_ARGS(loc)))
924 #define vWARN3(loc, m, a1, a2) \
925 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
926 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
928 a1, a2, REPORT_LOCATION_ARGS(loc)))
930 #define ckWARN3reg(loc, m, a1, a2) \
931 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
932 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
935 REPORT_LOCATION_ARGS(loc)))
937 #define vWARN4(loc, m, a1, a2, a3) \
938 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
939 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
942 REPORT_LOCATION_ARGS(loc)))
944 #define ckWARN4reg(loc, m, a1, a2, a3) \
945 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
946 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
949 REPORT_LOCATION_ARGS(loc)))
951 #define vWARN5(loc, m, a1, a2, a3, a4) \
952 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
953 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
956 REPORT_LOCATION_ARGS(loc)))
958 #define ckWARNexperimental(loc, class, m) \
959 _WARN_HELPER(loc, packWARN(class), \
960 Perl_ck_warner_d(aTHX_ packWARN(class), \
962 REPORT_LOCATION_ARGS(loc)))
964 /* Convert between a pointer to a node and its offset from the beginning of the
966 #define REGNODE_p(offset) (RExC_emit_start + (offset))
967 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
969 /* Macros for recording node offsets. 20001227 mjd@plover.com
970 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
971 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
972 * Element 0 holds the number n.
973 * Position is 1 indexed.
975 #ifndef RE_TRACK_PATTERN_OFFSETS
976 #define Set_Node_Offset_To_R(offset,byte)
977 #define Set_Node_Offset(node,byte)
978 #define Set_Cur_Node_Offset
979 #define Set_Node_Length_To_R(node,len)
980 #define Set_Node_Length(node,len)
981 #define Set_Node_Cur_Length(node,start)
982 #define Node_Offset(n)
983 #define Node_Length(n)
984 #define Set_Node_Offset_Length(node,offset,len)
985 #define ProgLen(ri) ri->u.proglen
986 #define SetProgLen(ri,x) ri->u.proglen = x
987 #define Track_Code(code)
989 #define ProgLen(ri) ri->u.offsets[0]
990 #define SetProgLen(ri,x) ri->u.offsets[0] = x
991 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
992 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
993 __LINE__, (int)(offset), (int)(byte))); \
995 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
998 RExC_offsets[2*(offset)-1] = (byte); \
1002 #define Set_Node_Offset(node,byte) \
1003 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1004 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1006 #define Set_Node_Length_To_R(node,len) STMT_START { \
1007 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
1008 __LINE__, (int)(node), (int)(len))); \
1010 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1013 RExC_offsets[2*(node)] = (len); \
1017 #define Set_Node_Length(node,len) \
1018 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1019 #define Set_Node_Cur_Length(node, start) \
1020 Set_Node_Length(node, RExC_parse - start)
1022 /* Get offsets and lengths */
1023 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1024 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1026 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1027 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1028 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1031 #define Track_Code(code) STMT_START { code } STMT_END
1034 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1035 #define EXPERIMENTAL_INPLACESCAN
1036 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1040 Perl_re_printf(pTHX_ const char *fmt, ...)
1044 PerlIO *f= Perl_debug_log;
1045 PERL_ARGS_ASSERT_RE_PRINTF;
1047 result = PerlIO_vprintf(f, fmt, ap);
1053 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1057 PerlIO *f= Perl_debug_log;
1058 PERL_ARGS_ASSERT_RE_INDENTF;
1059 va_start(ap, depth);
1060 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1061 result = PerlIO_vprintf(f, fmt, ap);
1065 #endif /* DEBUGGING */
1067 #define DEBUG_RExC_seen() \
1068 DEBUG_OPTIMISE_MORE_r({ \
1069 Perl_re_printf( aTHX_ "RExC_seen: "); \
1071 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1072 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1074 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1075 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1077 if (RExC_seen & REG_GPOS_SEEN) \
1078 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1080 if (RExC_seen & REG_RECURSE_SEEN) \
1081 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1083 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1084 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1086 if (RExC_seen & REG_VERBARG_SEEN) \
1087 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1089 if (RExC_seen & REG_CUTGROUP_SEEN) \
1090 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1092 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1093 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1095 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1096 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1098 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1099 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1101 Perl_re_printf( aTHX_ "\n"); \
1104 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1105 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1110 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1111 const char *close_str)
1116 Perl_re_printf( aTHX_ "%s", open_str);
1117 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1118 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1119 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1120 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1121 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1122 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1123 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1124 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1125 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1126 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1127 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1128 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1129 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1130 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1131 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1132 Perl_re_printf( aTHX_ "%s", close_str);
1137 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1138 U32 depth, int is_inf)
1140 GET_RE_DEBUG_FLAGS_DECL;
1142 DEBUG_OPTIMISE_MORE_r({
1145 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1149 (IV)data->pos_delta,
1153 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1155 Perl_re_printf( aTHX_
1156 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1158 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1159 is_inf ? "INF " : ""
1162 if (data->last_found) {
1164 Perl_re_printf(aTHX_
1165 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1166 SvPVX_const(data->last_found),
1168 (IV)data->last_start_min,
1169 (IV)data->last_start_max
1172 for (i = 0; i < 2; i++) {
1173 Perl_re_printf(aTHX_
1174 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1175 data->cur_is_floating == i ? "*" : "",
1176 i ? "Float" : "Fixed",
1177 SvPVX_const(data->substrs[i].str),
1178 (IV)data->substrs[i].min_offset,
1179 (IV)data->substrs[i].max_offset
1181 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1185 Perl_re_printf( aTHX_ "\n");
1191 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1192 regnode *scan, U32 depth, U32 flags)
1194 GET_RE_DEBUG_FLAGS_DECL;
1201 Next = regnext(scan);
1202 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1203 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1206 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1207 Next ? (REG_NODE_NUM(Next)) : 0 );
1208 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1209 Perl_re_printf( aTHX_ "\n");
1214 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1215 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1217 # define DEBUG_PEEP(str, scan, depth, flags) \
1218 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1221 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1222 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1226 /* =========================================================
1227 * BEGIN edit_distance stuff.
1229 * This calculates how many single character changes of any type are needed to
1230 * transform a string into another one. It is taken from version 3.1 of
1232 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1235 /* Our unsorted dictionary linked list. */
1236 /* Note we use UVs, not chars. */
1241 struct dictionary* next;
1243 typedef struct dictionary item;
1246 PERL_STATIC_INLINE item*
1247 push(UV key, item* curr)
1250 Newx(head, 1, item);
1258 PERL_STATIC_INLINE item*
1259 find(item* head, UV key)
1261 item* iterator = head;
1263 if (iterator->key == key){
1266 iterator = iterator->next;
1272 PERL_STATIC_INLINE item*
1273 uniquePush(item* head, UV key)
1275 item* iterator = head;
1278 if (iterator->key == key) {
1281 iterator = iterator->next;
1284 return push(key, head);
1287 PERL_STATIC_INLINE void
1288 dict_free(item* head)
1290 item* iterator = head;
1293 item* temp = iterator;
1294 iterator = iterator->next;
1301 /* End of Dictionary Stuff */
1303 /* All calculations/work are done here */
1305 S_edit_distance(const UV* src,
1307 const STRLEN x, /* length of src[] */
1308 const STRLEN y, /* length of tgt[] */
1309 const SSize_t maxDistance
1313 UV swapCount, swapScore, targetCharCount, i, j;
1315 UV score_ceil = x + y;
1317 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1319 /* intialize matrix start values */
1320 Newx(scores, ( (x + 2) * (y + 2)), UV);
1321 scores[0] = score_ceil;
1322 scores[1 * (y + 2) + 0] = score_ceil;
1323 scores[0 * (y + 2) + 1] = score_ceil;
1324 scores[1 * (y + 2) + 1] = 0;
1325 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1330 for (i=1;i<=x;i++) {
1332 head = uniquePush(head, src[i]);
1333 scores[(i+1) * (y + 2) + 1] = i;
1334 scores[(i+1) * (y + 2) + 0] = score_ceil;
1337 for (j=1;j<=y;j++) {
1340 head = uniquePush(head, tgt[j]);
1341 scores[1 * (y + 2) + (j + 1)] = j;
1342 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1345 targetCharCount = find(head, tgt[j-1])->value;
1346 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1348 if (src[i-1] != tgt[j-1]){
1349 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1353 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1357 find(head, src[i-1])->value = i;
1361 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1364 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1368 /* END of edit_distance() stuff
1369 * ========================================================= */
1371 /* is c a control character for which we have a mnemonic? */
1372 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1375 S_cntrl_to_mnemonic(const U8 c)
1377 /* Returns the mnemonic string that represents character 'c', if one
1378 * exists; NULL otherwise. The only ones that exist for the purposes of
1379 * this routine are a few control characters */
1382 case '\a': return "\\a";
1383 case '\b': return "\\b";
1384 case ESC_NATIVE: return "\\e";
1385 case '\f': return "\\f";
1386 case '\n': return "\\n";
1387 case '\r': return "\\r";
1388 case '\t': return "\\t";
1394 /* Mark that we cannot extend a found fixed substring at this point.
1395 Update the longest found anchored substring or the longest found
1396 floating substrings if needed. */
1399 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1400 SSize_t *minlenp, int is_inf)
1402 const STRLEN l = CHR_SVLEN(data->last_found);
1403 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1404 const STRLEN old_l = CHR_SVLEN(longest_sv);
1405 GET_RE_DEBUG_FLAGS_DECL;
1407 PERL_ARGS_ASSERT_SCAN_COMMIT;
1409 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1410 const U8 i = data->cur_is_floating;
1411 SvSetMagicSV(longest_sv, data->last_found);
1412 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1415 data->substrs[0].max_offset = data->substrs[0].min_offset;
1417 data->substrs[1].max_offset = (l
1418 ? data->last_start_max
1419 : (data->pos_delta > SSize_t_MAX - data->pos_min
1421 : data->pos_min + data->pos_delta));
1423 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1424 data->substrs[1].max_offset = SSize_t_MAX;
1427 if (data->flags & SF_BEFORE_EOL)
1428 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1430 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1431 data->substrs[i].minlenp = minlenp;
1432 data->substrs[i].lookbehind = 0;
1435 SvCUR_set(data->last_found, 0);
1437 SV * const sv = data->last_found;
1438 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1439 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1444 data->last_end = -1;
1445 data->flags &= ~SF_BEFORE_EOL;
1446 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1449 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1450 * list that describes which code points it matches */
1453 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1455 /* Set the SSC 'ssc' to match an empty string or any code point */
1457 PERL_ARGS_ASSERT_SSC_ANYTHING;
1459 assert(is_ANYOF_SYNTHETIC(ssc));
1461 /* mortalize so won't leak */
1462 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1463 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1467 S_ssc_is_anything(const regnode_ssc *ssc)
1469 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1470 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1471 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1472 * in any way, so there's no point in using it */
1477 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1479 assert(is_ANYOF_SYNTHETIC(ssc));
1481 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1485 /* See if the list consists solely of the range 0 - Infinity */
1486 invlist_iterinit(ssc->invlist);
1487 ret = invlist_iternext(ssc->invlist, &start, &end)
1491 invlist_iterfinish(ssc->invlist);
1497 /* If e.g., both \w and \W are set, matches everything */
1498 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1500 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1501 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1511 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1513 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1514 * string, any code point, or any posix class under locale */
1516 PERL_ARGS_ASSERT_SSC_INIT;
1518 Zero(ssc, 1, regnode_ssc);
1519 set_ANYOF_SYNTHETIC(ssc);
1520 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1523 /* If any portion of the regex is to operate under locale rules that aren't
1524 * fully known at compile time, initialization includes it. The reason
1525 * this isn't done for all regexes is that the optimizer was written under
1526 * the assumption that locale was all-or-nothing. Given the complexity and
1527 * lack of documentation in the optimizer, and that there are inadequate
1528 * test cases for locale, many parts of it may not work properly, it is
1529 * safest to avoid locale unless necessary. */
1530 if (RExC_contains_locale) {
1531 ANYOF_POSIXL_SETALL(ssc);
1534 ANYOF_POSIXL_ZERO(ssc);
1539 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1540 const regnode_ssc *ssc)
1542 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1543 * to the list of code points matched, and locale posix classes; hence does
1544 * not check its flags) */
1549 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1551 assert(is_ANYOF_SYNTHETIC(ssc));
1553 invlist_iterinit(ssc->invlist);
1554 ret = invlist_iternext(ssc->invlist, &start, &end)
1558 invlist_iterfinish(ssc->invlist);
1564 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1571 #define INVLIST_INDEX 0
1572 #define ONLY_LOCALE_MATCHES_INDEX 1
1573 #define DEFERRED_USER_DEFINED_INDEX 2
1576 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1577 const regnode_charclass* const node)
1579 /* Returns a mortal inversion list defining which code points are matched
1580 * by 'node', which is of type ANYOF. Handles complementing the result if
1581 * appropriate. If some code points aren't knowable at this time, the
1582 * returned list must, and will, contain every code point that is a
1587 SV* only_utf8_locale_invlist = NULL;
1589 const U32 n = ARG(node);
1590 bool new_node_has_latin1 = FALSE;
1591 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1593 : ANYOF_FLAGS(node);
1595 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1597 /* Look at the data structure created by S_set_ANYOF_arg() */
1598 if (n != ANYOF_ONLY_HAS_BITMAP) {
1599 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1600 AV * const av = MUTABLE_AV(SvRV(rv));
1601 SV **const ary = AvARRAY(av);
1602 assert(RExC_rxi->data->what[n] == 's');
1604 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1606 /* Here there are things that won't be known until runtime -- we
1607 * have to assume it could be anything */
1608 invlist = sv_2mortal(_new_invlist(1));
1609 return _add_range_to_invlist(invlist, 0, UV_MAX);
1611 else if (ary[INVLIST_INDEX]) {
1613 /* Use the node's inversion list */
1614 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1617 /* Get the code points valid only under UTF-8 locales */
1618 if ( (flags & ANYOFL_FOLD)
1619 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1621 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1626 invlist = sv_2mortal(_new_invlist(0));
1629 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1630 * code points, and an inversion list for the others, but if there are code
1631 * points that should match only conditionally on the target string being
1632 * UTF-8, those are placed in the inversion list, and not the bitmap.
1633 * Since there are circumstances under which they could match, they are
1634 * included in the SSC. But if the ANYOF node is to be inverted, we have
1635 * to exclude them here, so that when we invert below, the end result
1636 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1637 * have to do this here before we add the unconditionally matched code
1639 if (flags & ANYOF_INVERT) {
1640 _invlist_intersection_complement_2nd(invlist,
1645 /* Add in the points from the bit map */
1646 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1647 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1648 if (ANYOF_BITMAP_TEST(node, i)) {
1649 unsigned int start = i++;
1651 for (; i < NUM_ANYOF_CODE_POINTS
1652 && ANYOF_BITMAP_TEST(node, i); ++i)
1656 invlist = _add_range_to_invlist(invlist, start, i-1);
1657 new_node_has_latin1 = TRUE;
1662 /* If this can match all upper Latin1 code points, have to add them
1663 * as well. But don't add them if inverting, as when that gets done below,
1664 * it would exclude all these characters, including the ones it shouldn't
1665 * that were added just above */
1666 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1667 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1669 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1672 /* Similarly for these */
1673 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1674 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1677 if (flags & ANYOF_INVERT) {
1678 _invlist_invert(invlist);
1680 else if (flags & ANYOFL_FOLD) {
1681 if (new_node_has_latin1) {
1683 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1684 * the locale. We can skip this if there are no 0-255 at all. */
1685 _invlist_union(invlist, PL_Latin1, &invlist);
1687 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1688 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1691 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1692 invlist = add_cp_to_invlist(invlist, 'I');
1694 if (_invlist_contains_cp(invlist,
1695 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1697 invlist = add_cp_to_invlist(invlist, 'i');
1702 /* Similarly add the UTF-8 locale possible matches. These have to be
1703 * deferred until after the non-UTF-8 locale ones are taken care of just
1704 * above, or it leads to wrong results under ANYOF_INVERT */
1705 if (only_utf8_locale_invlist) {
1706 _invlist_union_maybe_complement_2nd(invlist,
1707 only_utf8_locale_invlist,
1708 flags & ANYOF_INVERT,
1715 /* These two functions currently do the exact same thing */
1716 #define ssc_init_zero ssc_init
1718 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1719 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1721 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1722 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1723 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1726 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1727 const regnode_charclass *and_with)
1729 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1730 * another SSC or a regular ANYOF class. Can create false positives. */
1733 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1735 : ANYOF_FLAGS(and_with);
1738 PERL_ARGS_ASSERT_SSC_AND;
1740 assert(is_ANYOF_SYNTHETIC(ssc));
1742 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1743 * the code point inversion list and just the relevant flags */
1744 if (is_ANYOF_SYNTHETIC(and_with)) {
1745 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1746 anded_flags = and_with_flags;
1748 /* XXX This is a kludge around what appears to be deficiencies in the
1749 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1750 * there are paths through the optimizer where it doesn't get weeded
1751 * out when it should. And if we don't make some extra provision for
1752 * it like the code just below, it doesn't get added when it should.
1753 * This solution is to add it only when AND'ing, which is here, and
1754 * only when what is being AND'ed is the pristine, original node
1755 * matching anything. Thus it is like adding it to ssc_anything() but
1756 * only when the result is to be AND'ed. Probably the same solution
1757 * could be adopted for the same problem we have with /l matching,
1758 * which is solved differently in S_ssc_init(), and that would lead to
1759 * fewer false positives than that solution has. But if this solution
1760 * creates bugs, the consequences are only that a warning isn't raised
1761 * that should be; while the consequences for having /l bugs is
1762 * incorrect matches */
1763 if (ssc_is_anything((regnode_ssc *)and_with)) {
1764 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1768 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1769 if (OP(and_with) == ANYOFD) {
1770 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1773 anded_flags = and_with_flags
1774 &( ANYOF_COMMON_FLAGS
1775 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1776 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1777 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1779 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1784 ANYOF_FLAGS(ssc) &= anded_flags;
1786 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1787 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1788 * 'and_with' may be inverted. When not inverted, we have the situation of
1790 * (C1 | P1) & (C2 | P2)
1791 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1792 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1793 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1794 * <= ((C1 & C2) | P1 | P2)
1795 * Alternatively, the last few steps could be:
1796 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1797 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1798 * <= (C1 | C2 | (P1 & P2))
1799 * We favor the second approach if either P1 or P2 is non-empty. This is
1800 * because these components are a barrier to doing optimizations, as what
1801 * they match cannot be known until the moment of matching as they are
1802 * dependent on the current locale, 'AND"ing them likely will reduce or
1804 * But we can do better if we know that C1,P1 are in their initial state (a
1805 * frequent occurrence), each matching everything:
1806 * (<everything>) & (C2 | P2) = C2 | P2
1807 * Similarly, if C2,P2 are in their initial state (again a frequent
1808 * occurrence), the result is a no-op
1809 * (C1 | P1) & (<everything>) = C1 | P1
1812 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1813 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1814 * <= (C1 & ~C2) | (P1 & ~P2)
1817 if ((and_with_flags & ANYOF_INVERT)
1818 && ! is_ANYOF_SYNTHETIC(and_with))
1822 ssc_intersection(ssc,
1824 FALSE /* Has already been inverted */
1827 /* If either P1 or P2 is empty, the intersection will be also; can skip
1829 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1830 ANYOF_POSIXL_ZERO(ssc);
1832 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1834 /* Note that the Posix class component P from 'and_with' actually
1836 * P = Pa | Pb | ... | Pn
1837 * where each component is one posix class, such as in [\w\s].
1839 * ~P = ~(Pa | Pb | ... | Pn)
1840 * = ~Pa & ~Pb & ... & ~Pn
1841 * <= ~Pa | ~Pb | ... | ~Pn
1842 * The last is something we can easily calculate, but unfortunately
1843 * is likely to have many false positives. We could do better
1844 * in some (but certainly not all) instances if two classes in
1845 * P have known relationships. For example
1846 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1848 * :lower: & :print: = :lower:
1849 * And similarly for classes that must be disjoint. For example,
1850 * since \s and \w can have no elements in common based on rules in
1851 * the POSIX standard,
1852 * \w & ^\S = nothing
1853 * Unfortunately, some vendor locales do not meet the Posix
1854 * standard, in particular almost everything by Microsoft.
1855 * The loop below just changes e.g., \w into \W and vice versa */
1857 regnode_charclass_posixl temp;
1858 int add = 1; /* To calculate the index of the complement */
1860 Zero(&temp, 1, regnode_charclass_posixl);
1861 ANYOF_POSIXL_ZERO(&temp);
1862 for (i = 0; i < ANYOF_MAX; i++) {
1864 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1865 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1867 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1868 ANYOF_POSIXL_SET(&temp, i + add);
1870 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1872 ANYOF_POSIXL_AND(&temp, ssc);
1874 } /* else ssc already has no posixes */
1875 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1876 in its initial state */
1877 else if (! is_ANYOF_SYNTHETIC(and_with)
1878 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1880 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1881 * copy it over 'ssc' */
1882 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1883 if (is_ANYOF_SYNTHETIC(and_with)) {
1884 StructCopy(and_with, ssc, regnode_ssc);
1887 ssc->invlist = anded_cp_list;
1888 ANYOF_POSIXL_ZERO(ssc);
1889 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1890 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1894 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1895 || (and_with_flags & ANYOF_MATCHES_POSIXL))
1897 /* One or the other of P1, P2 is non-empty. */
1898 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1899 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1901 ssc_union(ssc, anded_cp_list, FALSE);
1903 else { /* P1 = P2 = empty */
1904 ssc_intersection(ssc, anded_cp_list, FALSE);
1910 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1911 const regnode_charclass *or_with)
1913 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1914 * another SSC or a regular ANYOF class. Can create false positives if
1915 * 'or_with' is to be inverted. */
1919 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1921 : ANYOF_FLAGS(or_with);
1923 PERL_ARGS_ASSERT_SSC_OR;
1925 assert(is_ANYOF_SYNTHETIC(ssc));
1927 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1928 * the code point inversion list and just the relevant flags */
1929 if (is_ANYOF_SYNTHETIC(or_with)) {
1930 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1931 ored_flags = or_with_flags;
1934 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1935 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
1936 if (OP(or_with) != ANYOFD) {
1939 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1940 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1941 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
1943 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1948 ANYOF_FLAGS(ssc) |= ored_flags;
1950 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1951 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1952 * 'or_with' may be inverted. When not inverted, we have the simple
1953 * situation of computing:
1954 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1955 * If P1|P2 yields a situation with both a class and its complement are
1956 * set, like having both \w and \W, this matches all code points, and we
1957 * can delete these from the P component of the ssc going forward. XXX We
1958 * might be able to delete all the P components, but I (khw) am not certain
1959 * about this, and it is better to be safe.
1962 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1963 * <= (C1 | P1) | ~C2
1964 * <= (C1 | ~C2) | P1
1965 * (which results in actually simpler code than the non-inverted case)
1968 if ((or_with_flags & ANYOF_INVERT)
1969 && ! is_ANYOF_SYNTHETIC(or_with))
1971 /* We ignore P2, leaving P1 going forward */
1972 } /* else Not inverted */
1973 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
1974 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1975 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1977 for (i = 0; i < ANYOF_MAX; i += 2) {
1978 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1980 ssc_match_all_cp(ssc);
1981 ANYOF_POSIXL_CLEAR(ssc, i);
1982 ANYOF_POSIXL_CLEAR(ssc, i+1);
1990 FALSE /* Already has been inverted */
1994 PERL_STATIC_INLINE void
1995 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1997 PERL_ARGS_ASSERT_SSC_UNION;
1999 assert(is_ANYOF_SYNTHETIC(ssc));
2001 _invlist_union_maybe_complement_2nd(ssc->invlist,
2007 PERL_STATIC_INLINE void
2008 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2010 const bool invert2nd)
2012 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2014 assert(is_ANYOF_SYNTHETIC(ssc));
2016 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2022 PERL_STATIC_INLINE void
2023 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2025 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2027 assert(is_ANYOF_SYNTHETIC(ssc));
2029 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2032 PERL_STATIC_INLINE void
2033 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2035 /* AND just the single code point 'cp' into the SSC 'ssc' */
2037 SV* cp_list = _new_invlist(2);
2039 PERL_ARGS_ASSERT_SSC_CP_AND;
2041 assert(is_ANYOF_SYNTHETIC(ssc));
2043 cp_list = add_cp_to_invlist(cp_list, cp);
2044 ssc_intersection(ssc, cp_list,
2045 FALSE /* Not inverted */
2047 SvREFCNT_dec_NN(cp_list);
2050 PERL_STATIC_INLINE void
2051 S_ssc_clear_locale(regnode_ssc *ssc)
2053 /* Set the SSC 'ssc' to not match any locale things */
2054 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2056 assert(is_ANYOF_SYNTHETIC(ssc));
2058 ANYOF_POSIXL_ZERO(ssc);
2059 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2062 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2065 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2067 /* The synthetic start class is used to hopefully quickly winnow down
2068 * places where a pattern could start a match in the target string. If it
2069 * doesn't really narrow things down that much, there isn't much point to
2070 * having the overhead of using it. This function uses some very crude
2071 * heuristics to decide if to use the ssc or not.
2073 * It returns TRUE if 'ssc' rules out more than half what it considers to
2074 * be the "likely" possible matches, but of course it doesn't know what the
2075 * actual things being matched are going to be; these are only guesses
2077 * For /l matches, it assumes that the only likely matches are going to be
2078 * in the 0-255 range, uniformly distributed, so half of that is 127
2079 * For /a and /d matches, it assumes that the likely matches will be just
2080 * the ASCII range, so half of that is 63
2081 * For /u and there isn't anything matching above the Latin1 range, it
2082 * assumes that that is the only range likely to be matched, and uses
2083 * half that as the cut-off: 127. If anything matches above Latin1,
2084 * it assumes that all of Unicode could match (uniformly), except for
2085 * non-Unicode code points and things in the General Category "Other"
2086 * (unassigned, private use, surrogates, controls and formats). This
2087 * is a much large number. */
2089 U32 count = 0; /* Running total of number of code points matched by
2091 UV start, end; /* Start and end points of current range in inversion
2092 XXX outdated. UTF-8 locales are common, what about invert? list */
2093 const U32 max_code_points = (LOC)
2095 : (( ! UNI_SEMANTICS
2096 || invlist_highest(ssc->invlist) < 256)
2099 const U32 max_match = max_code_points / 2;
2101 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2103 invlist_iterinit(ssc->invlist);
2104 while (invlist_iternext(ssc->invlist, &start, &end)) {
2105 if (start >= max_code_points) {
2108 end = MIN(end, max_code_points - 1);
2109 count += end - start + 1;
2110 if (count >= max_match) {
2111 invlist_iterfinish(ssc->invlist);
2121 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2123 /* The inversion list in the SSC is marked mortal; now we need a more
2124 * permanent copy, which is stored the same way that is done in a regular
2125 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2128 SV* invlist = invlist_clone(ssc->invlist, NULL);
2130 PERL_ARGS_ASSERT_SSC_FINALIZE;
2132 assert(is_ANYOF_SYNTHETIC(ssc));
2134 /* The code in this file assumes that all but these flags aren't relevant
2135 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2136 * by the time we reach here */
2137 assert(! (ANYOF_FLAGS(ssc)
2138 & ~( ANYOF_COMMON_FLAGS
2139 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2140 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2142 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2144 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2145 SvREFCNT_dec(invlist);
2147 /* Make sure is clone-safe */
2148 ssc->invlist = NULL;
2150 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2151 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2152 OP(ssc) = ANYOFPOSIXL;
2154 else if (RExC_contains_locale) {
2158 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2161 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2162 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2163 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2164 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2165 ? (TRIE_LIST_CUR( idx ) - 1) \
2171 dump_trie(trie,widecharmap,revcharmap)
2172 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2173 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2175 These routines dump out a trie in a somewhat readable format.
2176 The _interim_ variants are used for debugging the interim
2177 tables that are used to generate the final compressed
2178 representation which is what dump_trie expects.
2180 Part of the reason for their existence is to provide a form
2181 of documentation as to how the different representations function.
2186 Dumps the final compressed table form of the trie to Perl_debug_log.
2187 Used for debugging make_trie().
2191 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2192 AV *revcharmap, U32 depth)
2195 SV *sv=sv_newmortal();
2196 int colwidth= widecharmap ? 6 : 4;
2198 GET_RE_DEBUG_FLAGS_DECL;
2200 PERL_ARGS_ASSERT_DUMP_TRIE;
2202 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2203 depth+1, "Match","Base","Ofs" );
2205 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2206 SV ** const tmp = av_fetch( revcharmap, state, 0);
2208 Perl_re_printf( aTHX_ "%*s",
2210 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2211 PL_colors[0], PL_colors[1],
2212 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2213 PERL_PV_ESCAPE_FIRSTCHAR
2218 Perl_re_printf( aTHX_ "\n");
2219 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2221 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2222 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2223 Perl_re_printf( aTHX_ "\n");
2225 for( state = 1 ; state < trie->statecount ; state++ ) {
2226 const U32 base = trie->states[ state ].trans.base;
2228 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2230 if ( trie->states[ state ].wordnum ) {
2231 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2233 Perl_re_printf( aTHX_ "%6s", "" );
2236 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2241 while( ( base + ofs < trie->uniquecharcount ) ||
2242 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2243 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2247 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2249 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2250 if ( ( base + ofs >= trie->uniquecharcount )
2251 && ( base + ofs - trie->uniquecharcount
2253 && trie->trans[ base + ofs
2254 - trie->uniquecharcount ].check == state )
2256 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2257 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2260 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2264 Perl_re_printf( aTHX_ "]");
2267 Perl_re_printf( aTHX_ "\n" );
2269 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2271 for (word=1; word <= trie->wordcount; word++) {
2272 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2273 (int)word, (int)(trie->wordinfo[word].prev),
2274 (int)(trie->wordinfo[word].len));
2276 Perl_re_printf( aTHX_ "\n" );
2279 Dumps a fully constructed but uncompressed trie in list form.
2280 List tries normally only are used for construction when the number of
2281 possible chars (trie->uniquecharcount) is very high.
2282 Used for debugging make_trie().
2285 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2286 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2290 SV *sv=sv_newmortal();
2291 int colwidth= widecharmap ? 6 : 4;
2292 GET_RE_DEBUG_FLAGS_DECL;
2294 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2296 /* print out the table precompression. */
2297 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2299 Perl_re_indentf( aTHX_ "%s",
2300 depth+1, "------:-----+-----------------\n" );
2302 for( state=1 ; state < next_alloc ; state ++ ) {
2305 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2306 depth+1, (UV)state );
2307 if ( ! trie->states[ state ].wordnum ) {
2308 Perl_re_printf( aTHX_ "%5s| ","");
2310 Perl_re_printf( aTHX_ "W%4x| ",
2311 trie->states[ state ].wordnum
2314 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2315 SV ** const tmp = av_fetch( revcharmap,
2316 TRIE_LIST_ITEM(state, charid).forid, 0);
2318 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2320 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2322 PL_colors[0], PL_colors[1],
2323 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2324 | PERL_PV_ESCAPE_FIRSTCHAR
2326 TRIE_LIST_ITEM(state, charid).forid,
2327 (UV)TRIE_LIST_ITEM(state, charid).newstate
2330 Perl_re_printf( aTHX_ "\n%*s| ",
2331 (int)((depth * 2) + 14), "");
2334 Perl_re_printf( aTHX_ "\n");
2339 Dumps a fully constructed but uncompressed trie in table form.
2340 This is the normal DFA style state transition table, with a few
2341 twists to facilitate compression later.
2342 Used for debugging make_trie().
2345 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2346 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2351 SV *sv=sv_newmortal();
2352 int colwidth= widecharmap ? 6 : 4;
2353 GET_RE_DEBUG_FLAGS_DECL;
2355 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2358 print out the table precompression so that we can do a visual check
2359 that they are identical.
2362 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2364 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2365 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2367 Perl_re_printf( aTHX_ "%*s",
2369 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2370 PL_colors[0], PL_colors[1],
2371 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2372 PERL_PV_ESCAPE_FIRSTCHAR
2378 Perl_re_printf( aTHX_ "\n");
2379 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2381 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2382 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2385 Perl_re_printf( aTHX_ "\n" );
2387 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2389 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2391 (UV)TRIE_NODENUM( state ) );
2393 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2394 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2396 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2398 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2400 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2401 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2402 (UV)trie->trans[ state ].check );
2404 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2405 (UV)trie->trans[ state ].check,
2406 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2414 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2415 startbranch: the first branch in the whole branch sequence
2416 first : start branch of sequence of branch-exact nodes.
2417 May be the same as startbranch
2418 last : Thing following the last branch.
2419 May be the same as tail.
2420 tail : item following the branch sequence
2421 count : words in the sequence
2422 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2423 depth : indent depth
2425 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2427 A trie is an N'ary tree where the branches are determined by digital
2428 decomposition of the key. IE, at the root node you look up the 1st character and
2429 follow that branch repeat until you find the end of the branches. Nodes can be
2430 marked as "accepting" meaning they represent a complete word. Eg:
2434 would convert into the following structure. Numbers represent states, letters
2435 following numbers represent valid transitions on the letter from that state, if
2436 the number is in square brackets it represents an accepting state, otherwise it
2437 will be in parenthesis.
2439 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2443 (1) +-i->(6)-+-s->[7]
2445 +-s->(3)-+-h->(4)-+-e->[5]
2447 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2449 This shows that when matching against the string 'hers' we will begin at state 1
2450 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2451 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2452 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2453 single traverse. We store a mapping from accepting to state to which word was
2454 matched, and then when we have multiple possibilities we try to complete the
2455 rest of the regex in the order in which they occurred in the alternation.
2457 The only prior NFA like behaviour that would be changed by the TRIE support is
2458 the silent ignoring of duplicate alternations which are of the form:
2460 / (DUPE|DUPE) X? (?{ ... }) Y /x
2462 Thus EVAL blocks following a trie may be called a different number of times with
2463 and without the optimisation. With the optimisations dupes will be silently
2464 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2465 the following demonstrates:
2467 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2469 which prints out 'word' three times, but
2471 'words'=~/(word|word|word)(?{ print $1 })S/
2473 which doesnt print it out at all. This is due to other optimisations kicking in.
2475 Example of what happens on a structural level:
2477 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2479 1: CURLYM[1] {1,32767}(18)
2490 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2491 and should turn into:
2493 1: CURLYM[1] {1,32767}(18)
2495 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2503 Cases where tail != last would be like /(?foo|bar)baz/:
2513 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2514 and would end up looking like:
2517 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2524 d = uvchr_to_utf8_flags(d, uv, 0);
2526 is the recommended Unicode-aware way of saying
2531 #define TRIE_STORE_REVCHAR(val) \
2534 SV *zlopp = newSV(UTF8_MAXBYTES); \
2535 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2536 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2538 SvCUR_set(zlopp, kapow - flrbbbbb); \
2541 av_push(revcharmap, zlopp); \
2543 char ooooff = (char)val; \
2544 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2548 /* This gets the next character from the input, folding it if not already
2550 #define TRIE_READ_CHAR STMT_START { \
2553 /* if it is UTF then it is either already folded, or does not need \
2555 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2557 else if (folder == PL_fold_latin1) { \
2558 /* This folder implies Unicode rules, which in the range expressible \
2559 * by not UTF is the lower case, with the two exceptions, one of \
2560 * which should have been taken care of before calling this */ \
2561 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2562 uvc = toLOWER_L1(*uc); \
2563 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2566 /* raw data, will be folded later if needed */ \
2574 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2575 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2576 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2577 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2578 TRIE_LIST_LEN( state ) = ging; \
2580 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2581 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2582 TRIE_LIST_CUR( state )++; \
2585 #define TRIE_LIST_NEW(state) STMT_START { \
2586 Newx( trie->states[ state ].trans.list, \
2587 4, reg_trie_trans_le ); \
2588 TRIE_LIST_CUR( state ) = 1; \
2589 TRIE_LIST_LEN( state ) = 4; \
2592 #define TRIE_HANDLE_WORD(state) STMT_START { \
2593 U16 dupe= trie->states[ state ].wordnum; \
2594 regnode * const noper_next = regnext( noper ); \
2597 /* store the word for dumping */ \
2599 if (OP(noper) != NOTHING) \
2600 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2602 tmp = newSVpvn_utf8( "", 0, UTF ); \
2603 av_push( trie_words, tmp ); \
2607 trie->wordinfo[curword].prev = 0; \
2608 trie->wordinfo[curword].len = wordlen; \
2609 trie->wordinfo[curword].accept = state; \
2611 if ( noper_next < tail ) { \
2613 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2615 trie->jump[curword] = (U16)(noper_next - convert); \
2617 jumper = noper_next; \
2619 nextbranch= regnext(cur); \
2623 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2624 /* chain, so that when the bits of chain are later */\
2625 /* linked together, the dups appear in the chain */\
2626 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2627 trie->wordinfo[dupe].prev = curword; \
2629 /* we haven't inserted this word yet. */ \
2630 trie->states[ state ].wordnum = curword; \
2635 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2636 ( ( base + charid >= ucharcount \
2637 && base + charid < ubound \
2638 && state == trie->trans[ base - ucharcount + charid ].check \
2639 && trie->trans[ base - ucharcount + charid ].next ) \
2640 ? trie->trans[ base - ucharcount + charid ].next \
2641 : ( state==1 ? special : 0 ) \
2644 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2646 TRIE_BITMAP_SET(trie, uvc); \
2647 /* store the folded codepoint */ \
2649 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2652 /* store first byte of utf8 representation of */ \
2653 /* variant codepoints */ \
2654 if (! UVCHR_IS_INVARIANT(uvc)) { \
2655 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2660 #define MADE_JUMP_TRIE 2
2661 #define MADE_EXACT_TRIE 4
2664 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2665 regnode *first, regnode *last, regnode *tail,
2666 U32 word_count, U32 flags, U32 depth)
2668 /* first pass, loop through and scan words */
2669 reg_trie_data *trie;
2670 HV *widecharmap = NULL;
2671 AV *revcharmap = newAV();
2677 regnode *jumper = NULL;
2678 regnode *nextbranch = NULL;
2679 regnode *convert = NULL;
2680 U32 *prev_states; /* temp array mapping each state to previous one */
2681 /* we just use folder as a flag in utf8 */
2682 const U8 * folder = NULL;
2684 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2685 * which stands for one trie structure, one hash, optionally followed
2688 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2689 AV *trie_words = NULL;
2690 /* along with revcharmap, this only used during construction but both are
2691 * useful during debugging so we store them in the struct when debugging.
2694 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2695 STRLEN trie_charcount=0;
2697 SV *re_trie_maxbuff;
2698 GET_RE_DEBUG_FLAGS_DECL;
2700 PERL_ARGS_ASSERT_MAKE_TRIE;
2702 PERL_UNUSED_ARG(depth);
2706 case EXACT: case EXACT_REQ8: case EXACTL: break;
2710 case EXACTFLU8: folder = PL_fold_latin1; break;
2711 case EXACTF: folder = PL_fold; break;
2712 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2715 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2717 trie->startstate = 1;
2718 trie->wordcount = word_count;
2719 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2720 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2721 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2722 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2723 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2724 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2727 trie_words = newAV();
2730 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2731 assert(re_trie_maxbuff);
2732 if (!SvIOK(re_trie_maxbuff)) {
2733 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2735 DEBUG_TRIE_COMPILE_r({
2736 Perl_re_indentf( aTHX_
2737 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2739 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2740 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2743 /* Find the node we are going to overwrite */
2744 if ( first == startbranch && OP( last ) != BRANCH ) {
2745 /* whole branch chain */
2748 /* branch sub-chain */
2749 convert = NEXTOPER( first );
2752 /* -- First loop and Setup --
2754 We first traverse the branches and scan each word to determine if it
2755 contains widechars, and how many unique chars there are, this is
2756 important as we have to build a table with at least as many columns as we
2759 We use an array of integers to represent the character codes 0..255
2760 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2761 the native representation of the character value as the key and IV's for
2764 *TODO* If we keep track of how many times each character is used we can
2765 remap the columns so that the table compression later on is more
2766 efficient in terms of memory by ensuring the most common value is in the
2767 middle and the least common are on the outside. IMO this would be better
2768 than a most to least common mapping as theres a decent chance the most
2769 common letter will share a node with the least common, meaning the node
2770 will not be compressible. With a middle is most common approach the worst
2771 case is when we have the least common nodes twice.
2775 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2776 regnode *noper = NEXTOPER( cur );
2780 U32 wordlen = 0; /* required init */
2781 STRLEN minchars = 0;
2782 STRLEN maxchars = 0;
2783 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2786 if (OP(noper) == NOTHING) {
2787 /* skip past a NOTHING at the start of an alternation
2788 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2790 regnode *noper_next= regnext(noper);
2791 if (noper_next < tail)
2796 && ( OP(noper) == flags
2797 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2798 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
2799 || OP(noper) == EXACTFUP))))
2801 uc= (U8*)STRING(noper);
2802 e= uc + STR_LEN(noper);
2809 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2810 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2811 regardless of encoding */
2812 if (OP( noper ) == EXACTFUP) {
2813 /* false positives are ok, so just set this */
2814 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2818 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2820 TRIE_CHARCOUNT(trie)++;
2823 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2824 * is in effect. Under /i, this character can match itself, or
2825 * anything that folds to it. If not under /i, it can match just
2826 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2827 * all fold to k, and all are single characters. But some folds
2828 * expand to more than one character, so for example LATIN SMALL
2829 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2830 * the string beginning at 'uc' is 'ffi', it could be matched by
2831 * three characters, or just by the one ligature character. (It
2832 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2833 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2834 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2835 * match.) The trie needs to know the minimum and maximum number
2836 * of characters that could match so that it can use size alone to
2837 * quickly reject many match attempts. The max is simple: it is
2838 * the number of folded characters in this branch (since a fold is
2839 * never shorter than what folds to it. */
2843 /* And the min is equal to the max if not under /i (indicated by
2844 * 'folder' being NULL), or there are no multi-character folds. If
2845 * there is a multi-character fold, the min is incremented just
2846 * once, for the character that folds to the sequence. Each
2847 * character in the sequence needs to be added to the list below of
2848 * characters in the trie, but we count only the first towards the
2849 * min number of characters needed. This is done through the
2850 * variable 'foldlen', which is returned by the macros that look
2851 * for these sequences as the number of bytes the sequence
2852 * occupies. Each time through the loop, we decrement 'foldlen' by
2853 * how many bytes the current char occupies. Only when it reaches
2854 * 0 do we increment 'minchars' or look for another multi-character
2856 if (folder == NULL) {
2859 else if (foldlen > 0) {
2860 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2865 /* See if *uc is the beginning of a multi-character fold. If
2866 * so, we decrement the length remaining to look at, to account
2867 * for the current character this iteration. (We can use 'uc'
2868 * instead of the fold returned by TRIE_READ_CHAR because for
2869 * non-UTF, the latin1_safe macro is smart enough to account
2870 * for all the unfolded characters, and because for UTF, the
2871 * string will already have been folded earlier in the
2872 * compilation process */
2874 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2875 foldlen -= UTF8SKIP(uc);
2878 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2883 /* The current character (and any potential folds) should be added
2884 * to the possible matching characters for this position in this
2888 U8 folded= folder[ (U8) uvc ];
2889 if ( !trie->charmap[ folded ] ) {
2890 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2891 TRIE_STORE_REVCHAR( folded );
2894 if ( !trie->charmap[ uvc ] ) {
2895 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2896 TRIE_STORE_REVCHAR( uvc );
2899 /* store the codepoint in the bitmap, and its folded
2901 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2902 set_bit = 0; /* We've done our bit :-) */
2906 /* XXX We could come up with the list of code points that fold
2907 * to this using PL_utf8_foldclosures, except not for
2908 * multi-char folds, as there may be multiple combinations
2909 * there that could work, which needs to wait until runtime to
2910 * resolve (The comment about LIGATURE FFI above is such an
2915 widecharmap = newHV();
2917 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2920 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2922 if ( !SvTRUE( *svpp ) ) {
2923 sv_setiv( *svpp, ++trie->uniquecharcount );
2924 TRIE_STORE_REVCHAR(uvc);
2927 } /* end loop through characters in this branch of the trie */
2929 /* We take the min and max for this branch and combine to find the min
2930 * and max for all branches processed so far */
2931 if( cur == first ) {
2932 trie->minlen = minchars;
2933 trie->maxlen = maxchars;
2934 } else if (minchars < trie->minlen) {
2935 trie->minlen = minchars;
2936 } else if (maxchars > trie->maxlen) {
2937 trie->maxlen = maxchars;
2939 } /* end first pass */
2940 DEBUG_TRIE_COMPILE_r(
2941 Perl_re_indentf( aTHX_
2942 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2944 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2945 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2946 (int)trie->minlen, (int)trie->maxlen )
2950 We now know what we are dealing with in terms of unique chars and
2951 string sizes so we can calculate how much memory a naive
2952 representation using a flat table will take. If it's over a reasonable
2953 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2954 conservative but potentially much slower representation using an array
2957 At the end we convert both representations into the same compressed
2958 form that will be used in regexec.c for matching with. The latter
2959 is a form that cannot be used to construct with but has memory
2960 properties similar to the list form and access properties similar
2961 to the table form making it both suitable for fast searches and
2962 small enough that its feasable to store for the duration of a program.
2964 See the comment in the code where the compressed table is produced
2965 inplace from the flat tabe representation for an explanation of how
2966 the compression works.
2971 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2974 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2975 > SvIV(re_trie_maxbuff) )
2978 Second Pass -- Array Of Lists Representation
2980 Each state will be represented by a list of charid:state records
2981 (reg_trie_trans_le) the first such element holds the CUR and LEN
2982 points of the allocated array. (See defines above).
2984 We build the initial structure using the lists, and then convert
2985 it into the compressed table form which allows faster lookups
2986 (but cant be modified once converted).
2989 STRLEN transcount = 1;
2991 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
2994 trie->states = (reg_trie_state *)
2995 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2996 sizeof(reg_trie_state) );
3000 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3002 regnode *noper = NEXTOPER( cur );
3003 U32 state = 1; /* required init */
3004 U16 charid = 0; /* sanity init */
3005 U32 wordlen = 0; /* required init */
3007 if (OP(noper) == NOTHING) {
3008 regnode *noper_next= regnext(noper);
3009 if (noper_next < tail)
3014 && ( OP(noper) == flags
3015 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3016 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3017 || OP(noper) == EXACTFUP))))
3019 const U8 *uc= (U8*)STRING(noper);
3020 const U8 *e= uc + STR_LEN(noper);
3022 for ( ; uc < e ; uc += len ) {
3027 charid = trie->charmap[ uvc ];
3029 SV** const svpp = hv_fetch( widecharmap,
3036 charid=(U16)SvIV( *svpp );
3039 /* charid is now 0 if we dont know the char read, or
3040 * nonzero if we do */
3047 if ( !trie->states[ state ].trans.list ) {
3048 TRIE_LIST_NEW( state );
3051 check <= TRIE_LIST_USED( state );
3054 if ( TRIE_LIST_ITEM( state, check ).forid
3057 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3062 newstate = next_alloc++;
3063 prev_states[newstate] = state;
3064 TRIE_LIST_PUSH( state, charid, newstate );
3069 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3073 TRIE_HANDLE_WORD(state);
3075 } /* end second pass */
3077 /* next alloc is the NEXT state to be allocated */
3078 trie->statecount = next_alloc;
3079 trie->states = (reg_trie_state *)
3080 PerlMemShared_realloc( trie->states,
3082 * sizeof(reg_trie_state) );
3084 /* and now dump it out before we compress it */
3085 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3086 revcharmap, next_alloc,
3090 trie->trans = (reg_trie_trans *)
3091 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3098 for( state=1 ; state < next_alloc ; state ++ ) {
3102 DEBUG_TRIE_COMPILE_MORE_r(
3103 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3107 if (trie->states[state].trans.list) {
3108 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3112 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3113 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3114 if ( forid < minid ) {
3116 } else if ( forid > maxid ) {
3120 if ( transcount < tp + maxid - minid + 1) {
3122 trie->trans = (reg_trie_trans *)
3123 PerlMemShared_realloc( trie->trans,
3125 * sizeof(reg_trie_trans) );
3126 Zero( trie->trans + (transcount / 2),
3130 base = trie->uniquecharcount + tp - minid;
3131 if ( maxid == minid ) {
3133 for ( ; zp < tp ; zp++ ) {
3134 if ( ! trie->trans[ zp ].next ) {
3135 base = trie->uniquecharcount + zp - minid;
3136 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3138 trie->trans[ zp ].check = state;
3144 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3146 trie->trans[ tp ].check = state;
3151 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3152 const U32 tid = base
3153 - trie->uniquecharcount
3154 + TRIE_LIST_ITEM( state, idx ).forid;
3155 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3157 trie->trans[ tid ].check = state;
3159 tp += ( maxid - minid + 1 );
3161 Safefree(trie->states[ state ].trans.list);
3164 DEBUG_TRIE_COMPILE_MORE_r(
3165 Perl_re_printf( aTHX_ " base: %d\n",base);
3168 trie->states[ state ].trans.base=base;
3170 trie->lasttrans = tp + 1;
3174 Second Pass -- Flat Table Representation.
3176 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3177 each. We know that we will need Charcount+1 trans at most to store
3178 the data (one row per char at worst case) So we preallocate both
3179 structures assuming worst case.
3181 We then construct the trie using only the .next slots of the entry
3184 We use the .check field of the first entry of the node temporarily
3185 to make compression both faster and easier by keeping track of how
3186 many non zero fields are in the node.
3188 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3191 There are two terms at use here: state as a TRIE_NODEIDX() which is
3192 a number representing the first entry of the node, and state as a
3193 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3194 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3195 if there are 2 entrys per node. eg:
3203 The table is internally in the right hand, idx form. However as we
3204 also have to deal with the states array which is indexed by nodenum
3205 we have to use TRIE_NODENUM() to convert.
3208 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3211 trie->trans = (reg_trie_trans *)
3212 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3213 * trie->uniquecharcount + 1,
3214 sizeof(reg_trie_trans) );
3215 trie->states = (reg_trie_state *)
3216 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3217 sizeof(reg_trie_state) );
3218 next_alloc = trie->uniquecharcount + 1;
3221 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3223 regnode *noper = NEXTOPER( cur );
3225 U32 state = 1; /* required init */
3227 U16 charid = 0; /* sanity init */
3228 U32 accept_state = 0; /* sanity init */
3230 U32 wordlen = 0; /* required init */
3232 if (OP(noper) == NOTHING) {
3233 regnode *noper_next= regnext(noper);
3234 if (noper_next < tail)
3239 && ( OP(noper) == flags
3240 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3241 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3242 || OP(noper) == EXACTFUP))))
3244 const U8 *uc= (U8*)STRING(noper);
3245 const U8 *e= uc + STR_LEN(noper);
3247 for ( ; uc < e ; uc += len ) {
3252 charid = trie->charmap[ uvc ];
3254 SV* const * const svpp = hv_fetch( widecharmap,
3258 charid = svpp ? (U16)SvIV(*svpp) : 0;
3262 if ( !trie->trans[ state + charid ].next ) {
3263 trie->trans[ state + charid ].next = next_alloc;
3264 trie->trans[ state ].check++;
3265 prev_states[TRIE_NODENUM(next_alloc)]
3266 = TRIE_NODENUM(state);
3267 next_alloc += trie->uniquecharcount;
3269 state = trie->trans[ state + charid ].next;
3271 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3273 /* charid is now 0 if we dont know the char read, or
3274 * nonzero if we do */
3277 accept_state = TRIE_NODENUM( state );
3278 TRIE_HANDLE_WORD(accept_state);
3280 } /* end second pass */
3282 /* and now dump it out before we compress it */
3283 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3285 next_alloc, depth+1));
3289 * Inplace compress the table.*
3291 For sparse data sets the table constructed by the trie algorithm will
3292 be mostly 0/FAIL transitions or to put it another way mostly empty.
3293 (Note that leaf nodes will not contain any transitions.)
3295 This algorithm compresses the tables by eliminating most such
3296 transitions, at the cost of a modest bit of extra work during lookup:
3298 - Each states[] entry contains a .base field which indicates the
3299 index in the state[] array wheres its transition data is stored.
3301 - If .base is 0 there are no valid transitions from that node.
3303 - If .base is nonzero then charid is added to it to find an entry in
3306 -If trans[states[state].base+charid].check!=state then the
3307 transition is taken to be a 0/Fail transition. Thus if there are fail
3308 transitions at the front of the node then the .base offset will point
3309 somewhere inside the previous nodes data (or maybe even into a node
3310 even earlier), but the .check field determines if the transition is
3314 The following process inplace converts the table to the compressed
3315 table: We first do not compress the root node 1,and mark all its
3316 .check pointers as 1 and set its .base pointer as 1 as well. This
3317 allows us to do a DFA construction from the compressed table later,
3318 and ensures that any .base pointers we calculate later are greater
3321 - We set 'pos' to indicate the first entry of the second node.
3323 - We then iterate over the columns of the node, finding the first and
3324 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3325 and set the .check pointers accordingly, and advance pos
3326 appropriately and repreat for the next node. Note that when we copy
3327 the next pointers we have to convert them from the original
3328 NODEIDX form to NODENUM form as the former is not valid post
3331 - If a node has no transitions used we mark its base as 0 and do not
3332 advance the pos pointer.
3334 - If a node only has one transition we use a second pointer into the
3335 structure to fill in allocated fail transitions from other states.
3336 This pointer is independent of the main pointer and scans forward
3337 looking for null transitions that are allocated to a state. When it
3338 finds one it writes the single transition into the "hole". If the
3339 pointer doesnt find one the single transition is appended as normal.
3341 - Once compressed we can Renew/realloc the structures to release the
3344 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3345 specifically Fig 3.47 and the associated pseudocode.
3349 const U32 laststate = TRIE_NODENUM( next_alloc );
3352 trie->statecount = laststate;
3354 for ( state = 1 ; state < laststate ; state++ ) {
3356 const U32 stateidx = TRIE_NODEIDX( state );
3357 const U32 o_used = trie->trans[ stateidx ].check;
3358 U32 used = trie->trans[ stateidx ].check;
3359 trie->trans[ stateidx ].check = 0;
3362 used && charid < trie->uniquecharcount;
3365 if ( flag || trie->trans[ stateidx + charid ].next ) {
3366 if ( trie->trans[ stateidx + charid ].next ) {
3368 for ( ; zp < pos ; zp++ ) {
3369 if ( ! trie->trans[ zp ].next ) {
3373 trie->states[ state ].trans.base
3375 + trie->uniquecharcount
3377 trie->trans[ zp ].next
3378 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3380 trie->trans[ zp ].check = state;
3381 if ( ++zp > pos ) pos = zp;
3388 trie->states[ state ].trans.base
3389 = pos + trie->uniquecharcount - charid ;
3391 trie->trans[ pos ].next
3392 = SAFE_TRIE_NODENUM(
3393 trie->trans[ stateidx + charid ].next );
3394 trie->trans[ pos ].check = state;
3399 trie->lasttrans = pos + 1;
3400 trie->states = (reg_trie_state *)
3401 PerlMemShared_realloc( trie->states, laststate
3402 * sizeof(reg_trie_state) );
3403 DEBUG_TRIE_COMPILE_MORE_r(
3404 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3406 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3410 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3413 } /* end table compress */
3415 DEBUG_TRIE_COMPILE_MORE_r(
3416 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3418 (UV)trie->statecount,
3419 (UV)trie->lasttrans)
3421 /* resize the trans array to remove unused space */
3422 trie->trans = (reg_trie_trans *)
3423 PerlMemShared_realloc( trie->trans, trie->lasttrans
3424 * sizeof(reg_trie_trans) );
3426 { /* Modify the program and insert the new TRIE node */
3427 U8 nodetype =(U8)(flags & 0xFF);
3431 regnode *optimize = NULL;
3432 #ifdef RE_TRACK_PATTERN_OFFSETS
3435 U32 mjd_nodelen = 0;
3436 #endif /* RE_TRACK_PATTERN_OFFSETS */
3437 #endif /* DEBUGGING */
3439 This means we convert either the first branch or the first Exact,
3440 depending on whether the thing following (in 'last') is a branch
3441 or not and whther first is the startbranch (ie is it a sub part of
3442 the alternation or is it the whole thing.)
3443 Assuming its a sub part we convert the EXACT otherwise we convert
3444 the whole branch sequence, including the first.
3446 /* Find the node we are going to overwrite */
3447 if ( first != startbranch || OP( last ) == BRANCH ) {
3448 /* branch sub-chain */
3449 NEXT_OFF( first ) = (U16)(last - first);
3450 #ifdef RE_TRACK_PATTERN_OFFSETS
3452 mjd_offset= Node_Offset((convert));
3453 mjd_nodelen= Node_Length((convert));
3456 /* whole branch chain */
3458 #ifdef RE_TRACK_PATTERN_OFFSETS
3461 const regnode *nop = NEXTOPER( convert );
3462 mjd_offset= Node_Offset((nop));
3463 mjd_nodelen= Node_Length((nop));
3467 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3469 (UV)mjd_offset, (UV)mjd_nodelen)
3472 /* But first we check to see if there is a common prefix we can
3473 split out as an EXACT and put in front of the TRIE node. */
3474 trie->startstate= 1;
3475 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3476 /* we want to find the first state that has more than
3477 * one transition, if that state is not the first state
3478 * then we have a common prefix which we can remove.
3481 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3483 I32 first_ofs = -1; /* keeps track of the ofs of the first
3484 transition, -1 means none */
3486 const U32 base = trie->states[ state ].trans.base;
3488 /* does this state terminate an alternation? */
3489 if ( trie->states[state].wordnum )
3492 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3493 if ( ( base + ofs >= trie->uniquecharcount ) &&
3494 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3495 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3497 if ( ++count > 1 ) {
3498 /* we have more than one transition */
3501 /* if this is the first state there is no common prefix
3502 * to extract, so we can exit */
3503 if ( state == 1 ) break;
3504 tmp = av_fetch( revcharmap, ofs, 0);
3505 ch = (U8*)SvPV_nolen_const( *tmp );
3507 /* if we are on count 2 then we need to initialize the
3508 * bitmap, and store the previous char if there was one
3511 /* clear the bitmap */
3512 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3514 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3517 if (first_ofs >= 0) {
3518 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3519 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3521 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3523 Perl_re_printf( aTHX_ "%s", (char*)ch)
3527 /* store the current firstchar in the bitmap */
3528 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3529 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3535 /* This state has only one transition, its transition is part
3536 * of a common prefix - we need to concatenate the char it
3537 * represents to what we have so far. */
3538 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3540 char *ch = SvPV( *tmp, len );
3542 SV *sv=sv_newmortal();
3543 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3545 (UV)state, (UV)first_ofs,
3546 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3547 PL_colors[0], PL_colors[1],
3548 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3549 PERL_PV_ESCAPE_FIRSTCHAR
3554 OP( convert ) = nodetype;
3555 str=STRING(convert);
3556 setSTR_LEN(convert, 0);
3558 setSTR_LEN(convert, STR_LEN(convert) + len);
3564 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3569 trie->prefixlen = (state-1);
3571 regnode *n = convert+NODE_SZ_STR(convert);
3572 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3573 trie->startstate = state;
3574 trie->minlen -= (state - 1);
3575 trie->maxlen -= (state - 1);
3577 /* At least the UNICOS C compiler choked on this
3578 * being argument to DEBUG_r(), so let's just have
3581 #ifdef PERL_EXT_RE_BUILD
3587 regnode *fix = convert;
3588 U32 word = trie->wordcount;
3589 #ifdef RE_TRACK_PATTERN_OFFSETS
3592 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3593 while( ++fix < n ) {
3594 Set_Node_Offset_Length(fix, 0, 0);
3597 SV ** const tmp = av_fetch( trie_words, word, 0 );
3599 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3600 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3602 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3610 NEXT_OFF(convert) = (U16)(tail - convert);
3611 DEBUG_r(optimize= n);
3617 if ( trie->maxlen ) {
3618 NEXT_OFF( convert ) = (U16)(tail - convert);
3619 ARG_SET( convert, data_slot );
3620 /* Store the offset to the first unabsorbed branch in
3621 jump[0], which is otherwise unused by the jump logic.
3622 We use this when dumping a trie and during optimisation. */
3624 trie->jump[0] = (U16)(nextbranch - convert);
3626 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3627 * and there is a bitmap
3628 * and the first "jump target" node we found leaves enough room
3629 * then convert the TRIE node into a TRIEC node, with the bitmap
3630 * embedded inline in the opcode - this is hypothetically faster.
3632 if ( !trie->states[trie->startstate].wordnum
3634 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3636 OP( convert ) = TRIEC;
3637 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3638 PerlMemShared_free(trie->bitmap);
3641 OP( convert ) = TRIE;
3643 /* store the type in the flags */
3644 convert->flags = nodetype;
3648 + regarglen[ OP( convert ) ];
3650 /* XXX We really should free up the resource in trie now,
3651 as we won't use them - (which resources?) dmq */
3653 /* needed for dumping*/
3654 DEBUG_r(if (optimize) {
3655 regnode *opt = convert;
3657 while ( ++opt < optimize) {
3658 Set_Node_Offset_Length(opt, 0, 0);
3661 Try to clean up some of the debris left after the
3664 while( optimize < jumper ) {
3665 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3666 OP( optimize ) = OPTIMIZED;
3667 Set_Node_Offset_Length(optimize, 0, 0);
3670 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3672 } /* end node insert */
3674 /* Finish populating the prev field of the wordinfo array. Walk back
3675 * from each accept state until we find another accept state, and if
3676 * so, point the first word's .prev field at the second word. If the
3677 * second already has a .prev field set, stop now. This will be the
3678 * case either if we've already processed that word's accept state,
3679 * or that state had multiple words, and the overspill words were
3680 * already linked up earlier.
3687 for (word=1; word <= trie->wordcount; word++) {
3689 if (trie->wordinfo[word].prev)
3691 state = trie->wordinfo[word].accept;
3693 state = prev_states[state];
3696 prev = trie->states[state].wordnum;
3700 trie->wordinfo[word].prev = prev;
3702 Safefree(prev_states);
3706 /* and now dump out the compressed format */
3707 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3709 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3711 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3712 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3714 SvREFCNT_dec_NN(revcharmap);
3718 : trie->startstate>1
3724 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3726 /* The Trie is constructed and compressed now so we can build a fail array if
3729 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3731 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3735 We find the fail state for each state in the trie, this state is the longest
3736 proper suffix of the current state's 'word' that is also a proper prefix of
3737 another word in our trie. State 1 represents the word '' and is thus the
3738 default fail state. This allows the DFA not to have to restart after its
3739 tried and failed a word at a given point, it simply continues as though it
3740 had been matching the other word in the first place.
3742 'abcdgu'=~/abcdefg|cdgu/
3743 When we get to 'd' we are still matching the first word, we would encounter
3744 'g' which would fail, which would bring us to the state representing 'd' in
3745 the second word where we would try 'g' and succeed, proceeding to match
3748 /* add a fail transition */
3749 const U32 trie_offset = ARG(source);
3750 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3752 const U32 ucharcount = trie->uniquecharcount;
3753 const U32 numstates = trie->statecount;
3754 const U32 ubound = trie->lasttrans + ucharcount;
3758 U32 base = trie->states[ 1 ].trans.base;
3761 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3763 GET_RE_DEBUG_FLAGS_DECL;
3765 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3766 PERL_UNUSED_CONTEXT;
3768 PERL_UNUSED_ARG(depth);
3771 if ( OP(source) == TRIE ) {
3772 struct regnode_1 *op = (struct regnode_1 *)
3773 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3774 StructCopy(source, op, struct regnode_1);
3775 stclass = (regnode *)op;
3777 struct regnode_charclass *op = (struct regnode_charclass *)
3778 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3779 StructCopy(source, op, struct regnode_charclass);
3780 stclass = (regnode *)op;
3782 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3784 ARG_SET( stclass, data_slot );
3785 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3786 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3787 aho->trie=trie_offset;
3788 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3789 Copy( trie->states, aho->states, numstates, reg_trie_state );
3790 Newx( q, numstates, U32);
3791 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3794 /* initialize fail[0..1] to be 1 so that we always have
3795 a valid final fail state */
3796 fail[ 0 ] = fail[ 1 ] = 1;
3798 for ( charid = 0; charid < ucharcount ; charid++ ) {
3799 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3801 q[ q_write ] = newstate;
3802 /* set to point at the root */
3803 fail[ q[ q_write++ ] ]=1;
3806 while ( q_read < q_write) {
3807 const U32 cur = q[ q_read++ % numstates ];
3808 base = trie->states[ cur ].trans.base;
3810 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3811 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3813 U32 fail_state = cur;
3816 fail_state = fail[ fail_state ];
3817 fail_base = aho->states[ fail_state ].trans.base;
3818 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3820 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3821 fail[ ch_state ] = fail_state;
3822 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3824 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3826 q[ q_write++ % numstates] = ch_state;
3830 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3831 when we fail in state 1, this allows us to use the
3832 charclass scan to find a valid start char. This is based on the principle
3833 that theres a good chance the string being searched contains lots of stuff
3834 that cant be a start char.
3836 fail[ 0 ] = fail[ 1 ] = 0;
3837 DEBUG_TRIE_COMPILE_r({
3838 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3839 depth, (UV)numstates
3841 for( q_read=1; q_read<numstates; q_read++ ) {
3842 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3844 Perl_re_printf( aTHX_ "\n");
3847 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3852 /* The below joins as many adjacent EXACTish nodes as possible into a single
3853 * one. The regop may be changed if the node(s) contain certain sequences that
3854 * require special handling. The joining is only done if:
3855 * 1) there is room in the current conglomerated node to entirely contain the
3857 * 2) they are compatible node types
3859 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3860 * these get optimized out
3862 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3863 * as possible, even if that means splitting an existing node so that its first
3864 * part is moved to the preceeding node. This would maximise the efficiency of
3865 * memEQ during matching.
3867 * If a node is to match under /i (folded), the number of characters it matches
3868 * can be different than its character length if it contains a multi-character
3869 * fold. *min_subtract is set to the total delta number of characters of the
3872 * And *unfolded_multi_char is set to indicate whether or not the node contains
3873 * an unfolded multi-char fold. This happens when it won't be known until
3874 * runtime whether the fold is valid or not; namely
3875 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3876 * target string being matched against turns out to be UTF-8 is that fold
3878 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3880 * (Multi-char folds whose components are all above the Latin1 range are not
3881 * run-time locale dependent, and have already been folded by the time this
3882 * function is called.)
3884 * This is as good a place as any to discuss the design of handling these
3885 * multi-character fold sequences. It's been wrong in Perl for a very long
3886 * time. There are three code points in Unicode whose multi-character folds
3887 * were long ago discovered to mess things up. The previous designs for
3888 * dealing with these involved assigning a special node for them. This
3889 * approach doesn't always work, as evidenced by this example:
3890 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3891 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3892 * would match just the \xDF, it won't be able to handle the case where a
3893 * successful match would have to cross the node's boundary. The new approach
3894 * that hopefully generally solves the problem generates an EXACTFUP node
3895 * that is "sss" in this case.
3897 * It turns out that there are problems with all multi-character folds, and not
3898 * just these three. Now the code is general, for all such cases. The
3899 * approach taken is:
3900 * 1) This routine examines each EXACTFish node that could contain multi-
3901 * character folded sequences. Since a single character can fold into
3902 * such a sequence, the minimum match length for this node is less than
3903 * the number of characters in the node. This routine returns in
3904 * *min_subtract how many characters to subtract from the the actual
3905 * length of the string to get a real minimum match length; it is 0 if
3906 * there are no multi-char foldeds. This delta is used by the caller to
3907 * adjust the min length of the match, and the delta between min and max,
3908 * so that the optimizer doesn't reject these possibilities based on size
3911 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3912 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
3913 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3914 * EXACTFU nodes. The node type of such nodes is then changed to
3915 * EXACTFUP, indicating it is problematic, and needs careful handling.
3916 * (The procedures in step 1) above are sufficient to handle this case in
3917 * UTF-8 encoded nodes.) The reason this is problematic is that this is
3918 * the only case where there is a possible fold length change in non-UTF-8
3919 * patterns. By reserving a special node type for problematic cases, the
3920 * far more common regular EXACTFU nodes can be processed faster.
3921 * regexec.c takes advantage of this.
3923 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3924 * problematic cases. These all only occur when the pattern is not
3925 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
3926 * length change, it handles the situation where the string cannot be
3927 * entirely folded. The strings in an EXACTFish node are folded as much
3928 * as possible during compilation in regcomp.c. This saves effort in
3929 * regex matching. By using an EXACTFUP node when it is not possible to
3930 * fully fold at compile time, regexec.c can know that everything in an
3931 * EXACTFU node is folded, so folding can be skipped at runtime. The only
3932 * case where folding in EXACTFU nodes can't be done at compile time is
3933 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
3934 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
3935 * handle two very different cases. Alternatively, there could have been
3936 * a node type where there are length changes, one for unfolded, and one
3937 * for both. If yet another special case needed to be created, the number
3938 * of required node types would have to go to 7. khw figures that even
3939 * though there are plenty of node types to spare, that the maintenance
3940 * cost wasn't worth the small speedup of doing it that way, especially
3941 * since he thinks the MICRO SIGN is rarely encountered in practice.
3943 * There are other cases where folding isn't done at compile time, but
3944 * none of them are under /u, and hence not for EXACTFU nodes. The folds
3945 * in EXACTFL nodes aren't known until runtime, and vary as the locale
3946 * changes. Some folds in EXACTF depend on if the runtime target string
3947 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
3948 * when no fold in it depends on the UTF-8ness of the target string.)
3950 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3951 * validity of the fold won't be known until runtime, and so must remain
3952 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
3953 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3954 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3955 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3956 * The reason this is a problem is that the optimizer part of regexec.c
3957 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3958 * that a character in the pattern corresponds to at most a single
3959 * character in the target string. (And I do mean character, and not byte
3960 * here, unlike other parts of the documentation that have never been
3961 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
3962 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3963 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
3964 * EXACTFL nodes, violate the assumption, and they are the only instances
3965 * where it is violated. I'm reluctant to try to change the assumption,
3966 * as the code involved is impenetrable to me (khw), so instead the code
3967 * here punts. This routine examines EXACTFL nodes, and (when the pattern
3968 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3969 * boolean indicating whether or not the node contains such a fold. When
3970 * it is true, the caller sets a flag that later causes the optimizer in
3971 * this file to not set values for the floating and fixed string lengths,
3972 * and thus avoids the optimizer code in regexec.c that makes the invalid
3973 * assumption. Thus, there is no optimization based on string lengths for
3974 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3975 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
3976 * assumption is wrong only in these cases is that all other non-UTF-8
3977 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3978 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3979 * EXACTF nodes because we don't know at compile time if it actually
3980 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3981 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3982 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
3983 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3984 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3985 * string would require the pattern to be forced into UTF-8, the overhead
3986 * of which we want to avoid. Similarly the unfolded multi-char folds in
3987 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3990 * Similarly, the code that generates tries doesn't currently handle
3991 * not-already-folded multi-char folds, and it looks like a pain to change
3992 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
3993 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
3994 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
3995 * using /iaa matching will be doing so almost entirely with ASCII
3996 * strings, so this should rarely be encountered in practice */
3998 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3999 if (PL_regkind[OP(scan)] == EXACT && OP(scan) != LEXACT \
4000 && OP(scan) != LEXACT_REQ8) \
4001 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
4004 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4005 UV *min_subtract, bool *unfolded_multi_char,
4006 U32 flags, regnode *val, U32 depth)
4008 /* Merge several consecutive EXACTish nodes into one. */
4010 regnode *n = regnext(scan);
4012 regnode *next = scan + NODE_SZ_STR(scan);
4016 regnode *stop = scan;
4017 GET_RE_DEBUG_FLAGS_DECL;
4019 PERL_UNUSED_ARG(depth);
4022 PERL_ARGS_ASSERT_JOIN_EXACT;
4023 #ifndef EXPERIMENTAL_INPLACESCAN
4024 PERL_UNUSED_ARG(flags);
4025 PERL_UNUSED_ARG(val);
4027 DEBUG_PEEP("join", scan, depth, 0);
4029 assert(PL_regkind[OP(scan)] == EXACT);
4031 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4032 * EXACT ones that are mergeable to the current one. */
4034 && ( PL_regkind[OP(n)] == NOTHING
4035 || (stringok && PL_regkind[OP(n)] == EXACT))
4037 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4040 if (OP(n) == TAIL || n > next)
4042 if (PL_regkind[OP(n)] == NOTHING) {
4043 DEBUG_PEEP("skip:", n, depth, 0);
4044 NEXT_OFF(scan) += NEXT_OFF(n);
4045 next = n + NODE_STEP_REGNODE;
4052 else if (stringok) {
4053 const unsigned int oldl = STR_LEN(scan);
4054 regnode * const nnext = regnext(n);
4056 /* XXX I (khw) kind of doubt that this works on platforms (should
4057 * Perl ever run on one) where U8_MAX is above 255 because of lots
4058 * of other assumptions */
4059 /* Don't join if the sum can't fit into a single node */
4060 if (oldl + STR_LEN(n) > U8_MAX)
4063 /* Joining something that requires UTF-8 with something that
4064 * doesn't, means the result requires UTF-8. */
4065 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4066 OP(scan) = EXACT_REQ8;
4068 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4069 ; /* join is compatible, no need to change OP */
4071 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4072 OP(scan) = EXACTFU_REQ8;
4074 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4075 ; /* join is compatible, no need to change OP */
4077 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4078 ; /* join is compatible, no need to change OP */
4080 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4082 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4083 * which can join with EXACTFU ones. We check for this case
4084 * here. These need to be resolved to either EXACTFU or
4085 * EXACTF at joining time. They have nothing in them that
4086 * would forbid them from being the more desirable EXACTFU
4087 * nodes except that they begin and/or end with a single [Ss].
4088 * The reason this is problematic is because they could be
4089 * joined in this loop with an adjacent node that ends and/or
4090 * begins with [Ss] which would then form the sequence 'ss',
4091 * which matches differently under /di than /ui, in which case
4092 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4093 * formed, the nodes get absorbed into any adjacent EXACTFU
4094 * node. And if the only adjacent node is EXACTF, they get
4095 * absorbed into that, under the theory that a longer node is
4096 * better than two shorter ones, even if one is EXACTFU. Note
4097 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4098 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4100 if (STRING(n)[STR_LEN(n)-1] == 's') {
4102 /* Here the joined node would end with 's'. If the node
4103 * following the combination is an EXACTF one, it's better to
4104 * join this trailing edge 's' node with that one, leaving the
4105 * current one in 'scan' be the more desirable EXACTFU */
4106 if (OP(nnext) == EXACTF) {
4110 OP(scan) = EXACTFU_S_EDGE;
4112 } /* Otherwise, the beginning 's' of the 2nd node just
4113 becomes an interior 's' in 'scan' */
4115 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4116 ; /* join is compatible, no need to change OP */
4118 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4120 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4121 * nodes. But the latter nodes can be also joined with EXACTFU
4122 * ones, and that is a better outcome, so if the node following
4123 * 'n' is EXACTFU, quit now so that those two can be joined
4125 if (OP(nnext) == EXACTFU) {
4129 /* The join is compatible, and the combined node will be
4130 * EXACTF. (These don't care if they begin or end with 's' */
4132 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4133 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4134 && STRING(n)[0] == 's')
4136 /* When combined, we have the sequence 'ss', which means we
4137 * have to remain /di */
4141 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4142 if (STRING(n)[0] == 's') {
4143 ; /* Here the join is compatible and the combined node
4144 starts with 's', no need to change OP */
4146 else { /* Now the trailing 's' is in the interior */
4150 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4152 /* The join is compatible, and the combined node will be
4153 * EXACTF. (These don't care if they begin or end with 's' */
4156 else if (OP(scan) != OP(n)) {
4158 /* The only other compatible joinings are the same node type */
4162 DEBUG_PEEP("merg", n, depth, 0);
4165 NEXT_OFF(scan) += NEXT_OFF(n);
4166 setSTR_LEN(scan, STR_LEN(scan) + STR_LEN(n));
4167 next = n + NODE_SZ_STR(n);
4168 /* Now we can overwrite *n : */
4169 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4177 #ifdef EXPERIMENTAL_INPLACESCAN
4178 if (flags && !NEXT_OFF(n)) {
4179 DEBUG_PEEP("atch", val, depth, 0);
4180 if (reg_off_by_arg[OP(n)]) {
4181 ARG_SET(n, val - n);
4184 NEXT_OFF(n) = val - n;
4191 /* This temporary node can now be turned into EXACTFU, and must, as
4192 * regexec.c doesn't handle it */
4193 if (OP(scan) == EXACTFU_S_EDGE) {
4198 *unfolded_multi_char = FALSE;
4200 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4201 * can now analyze for sequences of problematic code points. (Prior to
4202 * this final joining, sequences could have been split over boundaries, and
4203 * hence missed). The sequences only happen in folding, hence for any
4204 * non-EXACT EXACTish node */
4205 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4206 U8* s0 = (U8*) STRING(scan);
4208 U8* s_end = s0 + STR_LEN(scan);
4210 int total_count_delta = 0; /* Total delta number of characters that
4211 multi-char folds expand to */
4213 /* One pass is made over the node's string looking for all the
4214 * possibilities. To avoid some tests in the loop, there are two main
4215 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4220 if (OP(scan) == EXACTFL) {
4223 /* An EXACTFL node would already have been changed to another
4224 * node type unless there is at least one character in it that
4225 * is problematic; likely a character whose fold definition
4226 * won't be known until runtime, and so has yet to be folded.
4227 * For all but the UTF-8 locale, folds are 1-1 in length, but
4228 * to handle the UTF-8 case, we need to create a temporary
4229 * folded copy using UTF-8 locale rules in order to analyze it.
4230 * This is because our macros that look to see if a sequence is
4231 * a multi-char fold assume everything is folded (otherwise the
4232 * tests in those macros would be too complicated and slow).
4233 * Note that here, the non-problematic folds will have already
4234 * been done, so we can just copy such characters. We actually
4235 * don't completely fold the EXACTFL string. We skip the
4236 * unfolded multi-char folds, as that would just create work
4237 * below to figure out the size they already are */
4239 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4242 STRLEN s_len = UTF8SKIP(s);
4243 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4244 Copy(s, d, s_len, U8);
4247 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4248 *unfolded_multi_char = TRUE;
4249 Copy(s, d, s_len, U8);
4252 else if (isASCII(*s)) {
4253 *(d++) = toFOLD(*s);
4257 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4263 /* Point the remainder of the routine to look at our temporary
4267 } /* End of creating folded copy of EXACTFL string */
4269 /* Examine the string for a multi-character fold sequence. UTF-8
4270 * patterns have all characters pre-folded by the time this code is
4272 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4273 length sequence we are looking for is 2 */
4275 int count = 0; /* How many characters in a multi-char fold */
4276 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4277 if (! len) { /* Not a multi-char fold: get next char */
4282 { /* Here is a generic multi-char fold. */
4283 U8* multi_end = s + len;
4285 /* Count how many characters are in it. In the case of
4286 * /aa, no folds which contain ASCII code points are
4287 * allowed, so check for those, and skip if found. */
4288 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4289 count = utf8_length(s, multi_end);
4293 while (s < multi_end) {
4296 goto next_iteration;
4306 /* The delta is how long the sequence is minus 1 (1 is how long
4307 * the character that folds to the sequence is) */
4308 total_count_delta += count - 1;
4312 /* We created a temporary folded copy of the string in EXACTFL
4313 * nodes. Therefore we need to be sure it doesn't go below zero,
4314 * as the real string could be shorter */
4315 if (OP(scan) == EXACTFL) {
4316 int total_chars = utf8_length((U8*) STRING(scan),
4317 (U8*) STRING(scan) + STR_LEN(scan));
4318 if (total_count_delta > total_chars) {
4319 total_count_delta = total_chars;
4323 *min_subtract += total_count_delta;
4326 else if (OP(scan) == EXACTFAA) {
4328 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4329 * fold to the ASCII range (and there are no existing ones in the
4330 * upper latin1 range). But, as outlined in the comments preceding
4331 * this function, we need to flag any occurrences of the sharp s.
4332 * This character forbids trie formation (because of added
4334 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4335 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4336 || UNICODE_DOT_DOT_VERSION > 0)
4338 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4339 OP(scan) = EXACTFAA_NO_TRIE;
4340 *unfolded_multi_char = TRUE;
4348 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4349 * folds that are all Latin1. As explained in the comments
4350 * preceding this function, we look also for the sharp s in EXACTF
4351 * and EXACTFL nodes; it can be in the final position. Otherwise
4352 * we can stop looking 1 byte earlier because have to find at least
4353 * two characters for a multi-fold */
4354 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4359 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4360 if (! len) { /* Not a multi-char fold. */
4361 if (*s == LATIN_SMALL_LETTER_SHARP_S
4362 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4364 *unfolded_multi_char = TRUE;
4371 && isALPHA_FOLD_EQ(*s, 's')
4372 && isALPHA_FOLD_EQ(*(s+1), 's'))
4375 /* EXACTF nodes need to know that the minimum length
4376 * changed so that a sharp s in the string can match this
4377 * ss in the pattern, but they remain EXACTF nodes, as they
4378 * won't match this unless the target string is is UTF-8,
4379 * which we don't know until runtime. EXACTFL nodes can't
4380 * transform into EXACTFU nodes */
4381 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4382 OP(scan) = EXACTFUP;
4386 *min_subtract += len - 1;
4392 if ( STR_LEN(scan) == 1
4393 && isALPHA_A(* STRING(scan))
4394 && ( OP(scan) == EXACTFAA
4395 || ( OP(scan) == EXACTFU
4396 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4398 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4400 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4401 * with the mask set to the complement of the bit that differs
4402 * between upper and lower case, and the lowest code point of the
4403 * pair (which the '&' forces) */
4405 ARG_SET(scan, *STRING(scan) & mask);
4411 /* Allow dumping but overwriting the collection of skipped
4412 * ops and/or strings with fake optimized ops */
4413 n = scan + NODE_SZ_STR(scan);
4421 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4425 /* REx optimizer. Converts nodes into quicker variants "in place".
4426 Finds fixed substrings. */
4428 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4429 to the position after last scanned or to NULL. */
4431 #define INIT_AND_WITHP \
4432 assert(!and_withp); \
4433 Newx(and_withp, 1, regnode_ssc); \
4434 SAVEFREEPV(and_withp)
4438 S_unwind_scan_frames(pTHX_ const void *p)
4440 scan_frame *f= (scan_frame *)p;
4442 scan_frame *n= f->next_frame;
4448 /* the return from this sub is the minimum length that could possibly match */
4450 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4451 SSize_t *minlenp, SSize_t *deltap,
4456 regnode_ssc *and_withp,
4457 U32 flags, U32 depth)
4458 /* scanp: Start here (read-write). */
4459 /* deltap: Write maxlen-minlen here. */
4460 /* last: Stop before this one. */
4461 /* data: string data about the pattern */
4462 /* stopparen: treat close N as END */
4463 /* recursed: which subroutines have we recursed into */
4464 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4467 /* There must be at least this number of characters to match */
4470 regnode *scan = *scanp, *next;
4472 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4473 int is_inf_internal = 0; /* The studied chunk is infinite */
4474 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4475 scan_data_t data_fake;
4476 SV *re_trie_maxbuff = NULL;
4477 regnode *first_non_open = scan;
4478 SSize_t stopmin = SSize_t_MAX;
4479 scan_frame *frame = NULL;
4480 GET_RE_DEBUG_FLAGS_DECL;
4482 PERL_ARGS_ASSERT_STUDY_CHUNK;
4483 RExC_study_started= 1;
4485 Zero(&data_fake, 1, scan_data_t);
4488 while (first_non_open && OP(first_non_open) == OPEN)
4489 first_non_open=regnext(first_non_open);
4495 RExC_study_chunk_recursed_count++;
4497 DEBUG_OPTIMISE_MORE_r(
4499 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4500 depth, (long)stopparen,
4501 (unsigned long)RExC_study_chunk_recursed_count,
4502 (unsigned long)depth, (unsigned long)recursed_depth,
4505 if (recursed_depth) {
4508 for ( j = 0 ; j < recursed_depth ; j++ ) {
4509 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4511 PAREN_TEST(RExC_study_chunk_recursed +
4512 ( j * RExC_study_chunk_recursed_bytes), i )
4515 !PAREN_TEST(RExC_study_chunk_recursed +
4516 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4519 Perl_re_printf( aTHX_ " %d",(int)i);
4523 if ( j + 1 < recursed_depth ) {
4524 Perl_re_printf( aTHX_ ",");
4528 Perl_re_printf( aTHX_ "\n");
4531 while ( scan && OP(scan) != END && scan < last ){
4532 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4533 node length to get a real minimum (because
4534 the folded version may be shorter) */
4535 bool unfolded_multi_char = FALSE;
4536 /* Peephole optimizer: */
4537 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4538 DEBUG_PEEP("Peep", scan, depth, flags);
4541 /* The reason we do this here is that we need to deal with things like
4542 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4543 * parsing code, as each (?:..) is handled by a different invocation of
4546 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4548 /* Follow the next-chain of the current node and optimize
4549 away all the NOTHINGs from it. */
4550 if (OP(scan) != CURLYX) {
4551 const int max = (reg_off_by_arg[OP(scan)]
4553 /* I32 may be smaller than U16 on CRAYs! */
4554 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4555 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4559 /* Skip NOTHING and LONGJMP. */
4560 while ( (n = regnext(n))
4561 && ( (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4562 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4563 && off + noff < max)
4565 if (reg_off_by_arg[OP(scan)])
4568 NEXT_OFF(scan) = off;
4571 /* The principal pseudo-switch. Cannot be a switch, since we look into
4572 * several different things. */
4573 if ( OP(scan) == DEFINEP ) {
4575 SSize_t deltanext = 0;
4576 SSize_t fake_last_close = 0;
4577 I32 f = SCF_IN_DEFINE;
4579 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4580 scan = regnext(scan);
4581 assert( OP(scan) == IFTHEN );
4582 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4584 data_fake.last_closep= &fake_last_close;
4586 next = regnext(scan);
4587 scan = NEXTOPER(NEXTOPER(scan));
4588 DEBUG_PEEP("scan", scan, depth, flags);
4589 DEBUG_PEEP("next", next, depth, flags);
4591 /* we suppose the run is continuous, last=next...
4592 * NOTE we dont use the return here! */
4593 /* DEFINEP study_chunk() recursion */
4594 (void)study_chunk(pRExC_state, &scan, &minlen,
4595 &deltanext, next, &data_fake, stopparen,
4596 recursed_depth, NULL, f, depth+1);
4601 OP(scan) == BRANCH ||
4602 OP(scan) == BRANCHJ ||
4605 next = regnext(scan);
4608 /* The op(next)==code check below is to see if we
4609 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4610 * IFTHEN is special as it might not appear in pairs.
4611 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4612 * we dont handle it cleanly. */
4613 if (OP(next) == code || code == IFTHEN) {
4614 /* NOTE - There is similar code to this block below for
4615 * handling TRIE nodes on a re-study. If you change stuff here
4616 * check there too. */
4617 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4619 regnode * const startbranch=scan;
4621 if (flags & SCF_DO_SUBSTR) {
4622 /* Cannot merge strings after this. */
4623 scan_commit(pRExC_state, data, minlenp, is_inf);
4626 if (flags & SCF_DO_STCLASS)
4627 ssc_init_zero(pRExC_state, &accum);
4629 while (OP(scan) == code) {
4630 SSize_t deltanext, minnext, fake;
4632 regnode_ssc this_class;
4634 DEBUG_PEEP("Branch", scan, depth, flags);
4637 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4639 data_fake.whilem_c = data->whilem_c;
4640 data_fake.last_closep = data->last_closep;
4643 data_fake.last_closep = &fake;
4645 data_fake.pos_delta = delta;
4646 next = regnext(scan);
4648 scan = NEXTOPER(scan); /* everything */
4649 if (code != BRANCH) /* everything but BRANCH */
4650 scan = NEXTOPER(scan);
4652 if (flags & SCF_DO_STCLASS) {
4653 ssc_init(pRExC_state, &this_class);
4654 data_fake.start_class = &this_class;
4655 f = SCF_DO_STCLASS_AND;
4657 if (flags & SCF_WHILEM_VISITED_POS)
4658 f |= SCF_WHILEM_VISITED_POS;
4660 /* we suppose the run is continuous, last=next...*/
4661 /* recurse study_chunk() for each BRANCH in an alternation */
4662 minnext = study_chunk(pRExC_state, &scan, minlenp,
4663 &deltanext, next, &data_fake, stopparen,
4664 recursed_depth, NULL, f, depth+1);
4668 if (deltanext == SSize_t_MAX) {
4669 is_inf = is_inf_internal = 1;
4671 } else if (max1 < minnext + deltanext)
4672 max1 = minnext + deltanext;
4674 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4676 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4677 if ( stopmin > minnext)
4678 stopmin = min + min1;
4679 flags &= ~SCF_DO_SUBSTR;
4681 data->flags |= SCF_SEEN_ACCEPT;
4684 if (data_fake.flags & SF_HAS_EVAL)
4685 data->flags |= SF_HAS_EVAL;
4686 data->whilem_c = data_fake.whilem_c;
4688 if (flags & SCF_DO_STCLASS)
4689 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4691 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4693 if (flags & SCF_DO_SUBSTR) {
4694 data->pos_min += min1;
4695 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4696 data->pos_delta = SSize_t_MAX;
4698 data->pos_delta += max1 - min1;
4699 if (max1 != min1 || is_inf)
4700 data->cur_is_floating = 1;
4703 if (delta == SSize_t_MAX
4704 || SSize_t_MAX - delta - (max1 - min1) < 0)
4705 delta = SSize_t_MAX;
4707 delta += max1 - min1;
4708 if (flags & SCF_DO_STCLASS_OR) {
4709 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4711 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4712 flags &= ~SCF_DO_STCLASS;
4715 else if (flags & SCF_DO_STCLASS_AND) {
4717 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4718 flags &= ~SCF_DO_STCLASS;
4721 /* Switch to OR mode: cache the old value of
4722 * data->start_class */
4724 StructCopy(data->start_class, and_withp, regnode_ssc);
4725 flags &= ~SCF_DO_STCLASS_AND;
4726 StructCopy(&accum, data->start_class, regnode_ssc);
4727 flags |= SCF_DO_STCLASS_OR;
4731 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4732 OP( startbranch ) == BRANCH )
4736 Assuming this was/is a branch we are dealing with: 'scan'
4737 now points at the item that follows the branch sequence,
4738 whatever it is. We now start at the beginning of the
4739 sequence and look for subsequences of
4745 which would be constructed from a pattern like
4748 If we can find such a subsequence we need to turn the first
4749 element into a trie and then add the subsequent branch exact
4750 strings to the trie.
4754 1. patterns where the whole set of branches can be
4757 2. patterns where only a subset can be converted.
4759 In case 1 we can replace the whole set with a single regop
4760 for the trie. In case 2 we need to keep the start and end
4763 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4764 becomes BRANCH TRIE; BRANCH X;
4766 There is an additional case, that being where there is a
4767 common prefix, which gets split out into an EXACT like node
4768 preceding the TRIE node.
4770 If x(1..n)==tail then we can do a simple trie, if not we make
4771 a "jump" trie, such that when we match the appropriate word
4772 we "jump" to the appropriate tail node. Essentially we turn
4773 a nested if into a case structure of sorts.
4778 if (!re_trie_maxbuff) {
4779 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4780 if (!SvIOK(re_trie_maxbuff))
4781 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4783 if ( SvIV(re_trie_maxbuff)>=0 ) {
4785 regnode *first = (regnode *)NULL;
4786 regnode *last = (regnode *)NULL;
4787 regnode *tail = scan;
4791 /* var tail is used because there may be a TAIL
4792 regop in the way. Ie, the exacts will point to the
4793 thing following the TAIL, but the last branch will
4794 point at the TAIL. So we advance tail. If we
4795 have nested (?:) we may have to move through several
4799 while ( OP( tail ) == TAIL ) {
4800 /* this is the TAIL generated by (?:) */
4801 tail = regnext( tail );
4805 DEBUG_TRIE_COMPILE_r({
4806 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4807 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4809 "Looking for TRIE'able sequences. Tail node is ",
4810 (UV) REGNODE_OFFSET(tail),
4811 SvPV_nolen_const( RExC_mysv )
4817 Step through the branches
4818 cur represents each branch,
4819 noper is the first thing to be matched as part
4821 noper_next is the regnext() of that node.
4823 We normally handle a case like this
4824 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4825 support building with NOJUMPTRIE, which restricts
4826 the trie logic to structures like /FOO|BAR/.
4828 If noper is a trieable nodetype then the branch is
4829 a possible optimization target. If we are building
4830 under NOJUMPTRIE then we require that noper_next is
4831 the same as scan (our current position in the regex
4834 Once we have two or more consecutive such branches
4835 we can create a trie of the EXACT's contents and
4836 stitch it in place into the program.
4838 If the sequence represents all of the branches in
4839 the alternation we replace the entire thing with a
4842 Otherwise when it is a subsequence we need to
4843 stitch it in place and replace only the relevant
4844 branches. This means the first branch has to remain
4845 as it is used by the alternation logic, and its
4846 next pointer, and needs to be repointed at the item
4847 on the branch chain following the last branch we
4848 have optimized away.
4850 This could be either a BRANCH, in which case the
4851 subsequence is internal, or it could be the item
4852 following the branch sequence in which case the
4853 subsequence is at the end (which does not
4854 necessarily mean the first node is the start of the
4857 TRIE_TYPE(X) is a define which maps the optype to a
4861 ----------------+-----------
4866 EXACTFU_REQ8 | EXACTFU
4870 EXACTFLU8 | EXACTFLU8
4874 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4876 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
4878 : ( EXACTFU == (X) \
4879 || EXACTFU_REQ8 == (X) \
4880 || EXACTFUP == (X) ) \
4882 : ( EXACTFAA == (X) ) \
4884 : ( EXACTL == (X) ) \
4886 : ( EXACTFLU8 == (X) ) \
4890 /* dont use tail as the end marker for this traverse */
4891 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4892 regnode * const noper = NEXTOPER( cur );
4893 U8 noper_type = OP( noper );
4894 U8 noper_trietype = TRIE_TYPE( noper_type );
4895 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4896 regnode * const noper_next = regnext( noper );
4897 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4898 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4901 DEBUG_TRIE_COMPILE_r({
4902 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4903 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4905 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4907 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4908 Perl_re_printf( aTHX_ " -> %d:%s",
4909 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4912 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4913 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4914 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4916 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4917 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4918 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4922 /* Is noper a trieable nodetype that can be merged
4923 * with the current trie (if there is one)? */
4927 ( noper_trietype == NOTHING )
4928 || ( trietype == NOTHING )
4929 || ( trietype == noper_trietype )
4932 && noper_next >= tail
4936 /* Handle mergable triable node Either we are
4937 * the first node in a new trieable sequence,
4938 * in which case we do some bookkeeping,
4939 * otherwise we update the end pointer. */
4942 if ( noper_trietype == NOTHING ) {
4943 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4944 regnode * const noper_next = regnext( noper );
4945 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4946 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4949 if ( noper_next_trietype ) {
4950 trietype = noper_next_trietype;
4951 } else if (noper_next_type) {
4952 /* a NOTHING regop is 1 regop wide.
4953 * We need at least two for a trie
4954 * so we can't merge this in */
4958 trietype = noper_trietype;
4961 if ( trietype == NOTHING )
4962 trietype = noper_trietype;
4967 } /* end handle mergable triable node */
4969 /* handle unmergable node -
4970 * noper may either be a triable node which can
4971 * not be tried together with the current trie,
4972 * or a non triable node */
4974 /* If last is set and trietype is not
4975 * NOTHING then we have found at least two
4976 * triable branch sequences in a row of a
4977 * similar trietype so we can turn them
4978 * into a trie. If/when we allow NOTHING to
4979 * start a trie sequence this condition
4980 * will be required, and it isn't expensive
4981 * so we leave it in for now. */
4982 if ( trietype && trietype != NOTHING )
4983 make_trie( pRExC_state,
4984 startbranch, first, cur, tail,
4985 count, trietype, depth+1 );
4986 last = NULL; /* note: we clear/update
4987 first, trietype etc below,
4988 so we dont do it here */
4992 && noper_next >= tail
4995 /* noper is triable, so we can start a new
4999 trietype = noper_trietype;
5001 /* if we already saw a first but the
5002 * current node is not triable then we have
5003 * to reset the first information. */
5008 } /* end handle unmergable node */
5009 } /* loop over branches */
5010 DEBUG_TRIE_COMPILE_r({
5011 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5012 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5013 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5014 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5015 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
5016 PL_reg_name[trietype]
5020 if ( last && trietype ) {
5021 if ( trietype != NOTHING ) {
5022 /* the last branch of the sequence was part of
5023 * a trie, so we have to construct it here
5024 * outside of the loop */
5025 made= make_trie( pRExC_state, startbranch,
5026 first, scan, tail, count,
5027 trietype, depth+1 );
5028 #ifdef TRIE_STUDY_OPT
5029 if ( ((made == MADE_EXACT_TRIE &&
5030 startbranch == first)
5031 || ( first_non_open == first )) &&
5033 flags |= SCF_TRIE_RESTUDY;
5034 if ( startbranch == first
5037 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5042 /* at this point we know whatever we have is a
5043 * NOTHING sequence/branch AND if 'startbranch'
5044 * is 'first' then we can turn the whole thing
5047 if ( startbranch == first ) {
5049 /* the entire thing is a NOTHING sequence,
5050 * something like this: (?:|) So we can
5051 * turn it into a plain NOTHING op. */
5052 DEBUG_TRIE_COMPILE_r({
5053 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5054 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5056 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5059 OP(startbranch)= NOTHING;
5060 NEXT_OFF(startbranch)= tail - startbranch;
5061 for ( opt= startbranch + 1; opt < tail ; opt++ )
5065 } /* end if ( last) */
5066 } /* TRIE_MAXBUF is non zero */
5071 else if ( code == BRANCHJ ) { /* single branch is optimized. */
5072 scan = NEXTOPER(NEXTOPER(scan));
5073 } else /* single branch is optimized. */
5074 scan = NEXTOPER(scan);
5076 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5078 regnode *start = NULL;
5079 regnode *end = NULL;
5080 U32 my_recursed_depth= recursed_depth;
5082 if (OP(scan) != SUSPEND) { /* GOSUB */
5083 /* Do setup, note this code has side effects beyond
5084 * the rest of this block. Specifically setting
5085 * RExC_recurse[] must happen at least once during
5088 RExC_recurse[ARG2L(scan)] = scan;
5089 start = REGNODE_p(RExC_open_parens[paren]);
5090 end = REGNODE_p(RExC_close_parens[paren]);
5092 /* NOTE we MUST always execute the above code, even
5093 * if we do nothing with a GOSUB */
5095 ( flags & SCF_IN_DEFINE )
5098 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5100 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5103 /* no need to do anything here if we are in a define. */
5104 /* or we are after some kind of infinite construct
5105 * so we can skip recursing into this item.
5106 * Since it is infinite we will not change the maxlen
5107 * or delta, and if we miss something that might raise
5108 * the minlen it will merely pessimise a little.
5110 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5111 * might result in a minlen of 1 and not of 4,
5112 * but this doesn't make us mismatch, just try a bit
5113 * harder than we should.
5115 scan= regnext(scan);
5122 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5124 /* it is quite possible that there are more efficient ways
5125 * to do this. We maintain a bitmap per level of recursion
5126 * of which patterns we have entered so we can detect if a
5127 * pattern creates a possible infinite loop. When we
5128 * recurse down a level we copy the previous levels bitmap
5129 * down. When we are at recursion level 0 we zero the top
5130 * level bitmap. It would be nice to implement a different
5131 * more efficient way of doing this. In particular the top
5132 * level bitmap may be unnecessary.
5134 if (!recursed_depth) {
5135 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5137 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5138 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5139 RExC_study_chunk_recursed_bytes, U8);
5141 /* we havent recursed into this paren yet, so recurse into it */
5142 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5143 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5144 my_recursed_depth= recursed_depth + 1;
5146 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5147 /* some form of infinite recursion, assume infinite length
5149 if (flags & SCF_DO_SUBSTR) {
5150 scan_commit(pRExC_state, data, minlenp, is_inf);
5151 data->cur_is_floating = 1;
5153 is_inf = is_inf_internal = 1;
5154 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5155 ssc_anything(data->start_class);
5156 flags &= ~SCF_DO_STCLASS;
5158 start= NULL; /* reset start so we dont recurse later on. */
5163 end = regnext(scan);
5166 scan_frame *newframe;
5168 if (!RExC_frame_last) {
5169 Newxz(newframe, 1, scan_frame);
5170 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5171 RExC_frame_head= newframe;
5173 } else if (!RExC_frame_last->next_frame) {
5174 Newxz(newframe, 1, scan_frame);
5175 RExC_frame_last->next_frame= newframe;
5176 newframe->prev_frame= RExC_frame_last;
5179 newframe= RExC_frame_last->next_frame;
5181 RExC_frame_last= newframe;
5183 newframe->next_regnode = regnext(scan);
5184 newframe->last_regnode = last;
5185 newframe->stopparen = stopparen;
5186 newframe->prev_recursed_depth = recursed_depth;
5187 newframe->this_prev_frame= frame;
5189 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5190 DEBUG_PEEP("fnew", scan, depth, flags);
5197 recursed_depth= my_recursed_depth;
5202 else if ( OP(scan) == EXACT
5203 || OP(scan) == LEXACT
5204 || OP(scan) == EXACT_REQ8
5205 || OP(scan) == LEXACT_REQ8
5206 || OP(scan) == EXACTL)
5208 SSize_t l = STR_LEN(scan);
5212 const U8 * const s = (U8*)STRING(scan);
5213 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5214 l = utf8_length(s, s + l);
5216 uc = *((U8*)STRING(scan));
5219 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5220 /* The code below prefers earlier match for fixed
5221 offset, later match for variable offset. */
5222 if (data->last_end == -1) { /* Update the start info. */
5223 data->last_start_min = data->pos_min;
5224 data->last_start_max = is_inf
5225 ? SSize_t_MAX : data->pos_min + data->pos_delta;
5227 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5229 SvUTF8_on(data->last_found);
5231 SV * const sv = data->last_found;
5232 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5233 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5234 if (mg && mg->mg_len >= 0)
5235 mg->mg_len += utf8_length((U8*)STRING(scan),
5236 (U8*)STRING(scan)+STR_LEN(scan));
5238 data->last_end = data->pos_min + l;
5239 data->pos_min += l; /* As in the first entry. */
5240 data->flags &= ~SF_BEFORE_EOL;
5243 /* ANDing the code point leaves at most it, and not in locale, and
5244 * can't match null string */
5245 if (flags & SCF_DO_STCLASS_AND) {
5246 ssc_cp_and(data->start_class, uc);
5247 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5248 ssc_clear_locale(data->start_class);
5250 else if (flags & SCF_DO_STCLASS_OR) {
5251 ssc_add_cp(data->start_class, uc);
5252 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5254 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5255 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5257 flags &= ~SCF_DO_STCLASS;
5259 else if (PL_regkind[OP(scan)] == EXACT) {
5260 /* But OP != EXACT!, so is EXACTFish */
5261 SSize_t l = STR_LEN(scan);
5262 const U8 * s = (U8*)STRING(scan);
5264 /* Search for fixed substrings supports EXACT only. */
5265 if (flags & SCF_DO_SUBSTR) {
5267 scan_commit(pRExC_state, data, minlenp, is_inf);
5270 l = utf8_length(s, s + l);
5272 if (unfolded_multi_char) {
5273 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5275 min += l - min_subtract;
5277 delta += min_subtract;
5278 if (flags & SCF_DO_SUBSTR) {
5279 data->pos_min += l - min_subtract;
5280 if (data->pos_min < 0) {
5283 data->pos_delta += min_subtract;
5285 data->cur_is_floating = 1; /* float */
5289 if (flags & SCF_DO_STCLASS) {
5290 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5292 assert(EXACTF_invlist);
5293 if (flags & SCF_DO_STCLASS_AND) {
5294 if (OP(scan) != EXACTFL)
5295 ssc_clear_locale(data->start_class);
5296 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5297 ANYOF_POSIXL_ZERO(data->start_class);
5298 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5300 else { /* SCF_DO_STCLASS_OR */
5301 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5302 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5304 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5305 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5307 flags &= ~SCF_DO_STCLASS;
5308 SvREFCNT_dec(EXACTF_invlist);
5311 else if (REGNODE_VARIES(OP(scan))) {
5312 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5313 I32 fl = 0, f = flags;
5314 regnode * const oscan = scan;
5315 regnode_ssc this_class;
5316 regnode_ssc *oclass = NULL;
5317 I32 next_is_eval = 0;
5319 switch (PL_regkind[OP(scan)]) {
5320 case WHILEM: /* End of (?:...)* . */
5321 scan = NEXTOPER(scan);
5324 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5325 next = NEXTOPER(scan);
5326 if ( OP(next) == EXACT
5327 || OP(next) == LEXACT
5328 || OP(next) == EXACT_REQ8
5329 || OP(next) == LEXACT_REQ8
5330 || OP(next) == EXACTL
5331 || (flags & SCF_DO_STCLASS))
5334 maxcount = REG_INFTY;
5335 next = regnext(scan);
5336 scan = NEXTOPER(scan);
5340 if (flags & SCF_DO_SUBSTR)
5345 next = NEXTOPER(scan);
5347 /* This temporary node can now be turned into EXACTFU, and
5348 * must, as regexec.c doesn't handle it */
5349 if (OP(next) == EXACTFU_S_EDGE) {
5353 if ( STR_LEN(next) == 1
5354 && isALPHA_A(* STRING(next))
5355 && ( OP(next) == EXACTFAA
5356 || ( OP(next) == EXACTFU
5357 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
5359 /* These differ in just one bit */
5360 U8 mask = ~ ('A' ^ 'a');
5362 assert(isALPHA_A(* STRING(next)));
5364 /* Then replace it by an ANYOFM node, with
5365 * the mask set to the complement of the
5366 * bit that differs between upper and lower
5367 * case, and the lowest code point of the
5368 * pair (which the '&' forces) */
5370 ARG_SET(next, *STRING(next) & mask);
5374 if (flags & SCF_DO_STCLASS) {
5376 maxcount = REG_INFTY;
5377 next = regnext(scan);
5378 scan = NEXTOPER(scan);
5381 if (flags & SCF_DO_SUBSTR) {
5382 scan_commit(pRExC_state, data, minlenp, is_inf);
5383 /* Cannot extend fixed substrings */
5384 data->cur_is_floating = 1; /* float */
5386 is_inf = is_inf_internal = 1;
5387 scan = regnext(scan);
5388 goto optimize_curly_tail;
5390 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5391 && (scan->flags == stopparen))
5396 mincount = ARG1(scan);
5397 maxcount = ARG2(scan);
5399 next = regnext(scan);
5400 if (OP(scan) == CURLYX) {
5401 I32 lp = (data ? *(data->last_closep) : 0);
5402 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5404 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5405 next_is_eval = (OP(scan) == EVAL);
5407 if (flags & SCF_DO_SUBSTR) {
5409 scan_commit(pRExC_state, data, minlenp, is_inf);
5410 /* Cannot extend fixed substrings */
5411 pos_before = data->pos_min;
5415 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5417 data->flags |= SF_IS_INF;
5419 if (flags & SCF_DO_STCLASS) {
5420 ssc_init(pRExC_state, &this_class);
5421 oclass = data->start_class;
5422 data->start_class = &this_class;
5423 f |= SCF_DO_STCLASS_AND;
5424 f &= ~SCF_DO_STCLASS_OR;
5426 /* Exclude from super-linear cache processing any {n,m}
5427 regops for which the combination of input pos and regex
5428 pos is not enough information to determine if a match
5431 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5432 regex pos at the \s*, the prospects for a match depend not
5433 only on the input position but also on how many (bar\s*)
5434 repeats into the {4,8} we are. */
5435 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5436 f &= ~SCF_WHILEM_VISITED_POS;
5438 /* This will finish on WHILEM, setting scan, or on NULL: */
5439 /* recurse study_chunk() on loop bodies */
5440 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5441 last, data, stopparen, recursed_depth, NULL,
5443 ? (f & ~SCF_DO_SUBSTR)
5447 if (flags & SCF_DO_STCLASS)
5448 data->start_class = oclass;
5449 if (mincount == 0 || minnext == 0) {
5450 if (flags & SCF_DO_STCLASS_OR) {
5451 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5453 else if (flags & SCF_DO_STCLASS_AND) {
5454 /* Switch to OR mode: cache the old value of
5455 * data->start_class */
5457 StructCopy(data->start_class, and_withp, regnode_ssc);
5458 flags &= ~SCF_DO_STCLASS_AND;
5459 StructCopy(&this_class, data->start_class, regnode_ssc);
5460 flags |= SCF_DO_STCLASS_OR;
5461 ANYOF_FLAGS(data->start_class)
5462 |= SSC_MATCHES_EMPTY_STRING;
5464 } else { /* Non-zero len */
5465 if (flags & SCF_DO_STCLASS_OR) {
5466 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5467 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5469 else if (flags & SCF_DO_STCLASS_AND)
5470 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5471 flags &= ~SCF_DO_STCLASS;
5473 if (!scan) /* It was not CURLYX, but CURLY. */
5475 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5476 /* ? quantifier ok, except for (?{ ... }) */
5477 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5478 && (minnext == 0) && (deltanext == 0)
5479 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5480 && maxcount <= REG_INFTY/3) /* Complement check for big
5483 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5484 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5485 "Quantifier unexpected on zero-length expression "
5486 "in regex m/%" UTF8f "/",
5487 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5491 min += minnext * mincount;
5492 is_inf_internal |= deltanext == SSize_t_MAX
5493 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5494 is_inf |= is_inf_internal;
5496 delta = SSize_t_MAX;
5498 delta += (minnext + deltanext) * maxcount
5499 - minnext * mincount;
5501 /* Try powerful optimization CURLYX => CURLYN. */
5502 if ( OP(oscan) == CURLYX && data
5503 && data->flags & SF_IN_PAR
5504 && !(data->flags & SF_HAS_EVAL)
5505 && !deltanext && minnext == 1 ) {
5506 /* Try to optimize to CURLYN. */
5507 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5508 regnode * const nxt1 = nxt;
5515 if (!REGNODE_SIMPLE(OP(nxt))
5516 && !(PL_regkind[OP(nxt)] == EXACT
5517 && STR_LEN(nxt) == 1))
5523 if (OP(nxt) != CLOSE)
5525 if (RExC_open_parens) {
5528 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5531 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5533 /* Now we know that nxt2 is the only contents: */
5534 oscan->flags = (U8)ARG(nxt);
5536 OP(nxt1) = NOTHING; /* was OPEN. */
5539 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5540 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5541 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5542 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5543 OP(nxt + 1) = OPTIMIZED; /* was count. */
5544 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5549 /* Try optimization CURLYX => CURLYM. */
5550 if ( OP(oscan) == CURLYX && data
5551 && !(data->flags & SF_HAS_PAR)
5552 && !(data->flags & SF_HAS_EVAL)
5553 && !deltanext /* atom is fixed width */
5554 && minnext != 0 /* CURLYM can't handle zero width */
5556 /* Nor characters whose fold at run-time may be
5557 * multi-character */
5558 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5560 /* XXXX How to optimize if data == 0? */
5561 /* Optimize to a simpler form. */
5562 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5566 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5567 && (OP(nxt2) != WHILEM))
5569 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5570 /* Need to optimize away parenths. */
5571 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5572 /* Set the parenth number. */
5573 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5575 oscan->flags = (U8)ARG(nxt);
5576 if (RExC_open_parens) {
5578 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5581 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5584 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5585 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5588 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5589 OP(nxt + 1) = OPTIMIZED; /* was count. */
5590 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5591 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5594 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5595 regnode *nnxt = regnext(nxt1);
5597 if (reg_off_by_arg[OP(nxt1)])
5598 ARG_SET(nxt1, nxt2 - nxt1);
5599 else if (nxt2 - nxt1 < U16_MAX)
5600 NEXT_OFF(nxt1) = nxt2 - nxt1;
5602 OP(nxt) = NOTHING; /* Cannot beautify */
5607 /* Optimize again: */
5608 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5609 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5610 NULL, stopparen, recursed_depth, NULL, 0,
5616 else if ((OP(oscan) == CURLYX)
5617 && (flags & SCF_WHILEM_VISITED_POS)
5618 /* See the comment on a similar expression above.
5619 However, this time it's not a subexpression
5620 we care about, but the expression itself. */
5621 && (maxcount == REG_INFTY)
5623 /* This stays as CURLYX, we can put the count/of pair. */
5624 /* Find WHILEM (as in regexec.c) */
5625 regnode *nxt = oscan + NEXT_OFF(oscan);
5627 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5629 nxt = PREVOPER(nxt);
5630 if (nxt->flags & 0xf) {
5631 /* we've already set whilem count on this node */
5632 } else if (++data->whilem_c < 16) {
5633 assert(data->whilem_c <= RExC_whilem_seen);
5634 nxt->flags = (U8)(data->whilem_c
5635 | (RExC_whilem_seen << 4)); /* On WHILEM */
5638 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5640 if (flags & SCF_DO_SUBSTR) {
5641 SV *last_str = NULL;
5642 STRLEN last_chrs = 0;
5643 int counted = mincount != 0;
5645 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5647 SSize_t b = pos_before >= data->last_start_min
5648 ? pos_before : data->last_start_min;
5650 const char * const s = SvPV_const(data->last_found, l);
5651 SSize_t old = b - data->last_start_min;
5655 old = utf8_hop_forward((U8*)s, old,
5656 (U8 *) SvEND(data->last_found))
5659 /* Get the added string: */
5660 last_str = newSVpvn_utf8(s + old, l, UTF);
5661 last_chrs = UTF ? utf8_length((U8*)(s + old),
5662 (U8*)(s + old + l)) : l;
5663 if (deltanext == 0 && pos_before == b) {
5664 /* What was added is a constant string */
5667 SvGROW(last_str, (mincount * l) + 1);
5668 repeatcpy(SvPVX(last_str) + l,
5669 SvPVX_const(last_str), l,
5671 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5672 /* Add additional parts. */
5673 SvCUR_set(data->last_found,
5674 SvCUR(data->last_found) - l);
5675 sv_catsv(data->last_found, last_str);
5677 SV * sv = data->last_found;
5679 SvUTF8(sv) && SvMAGICAL(sv) ?
5680 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5681 if (mg && mg->mg_len >= 0)
5682 mg->mg_len += last_chrs * (mincount-1);
5684 last_chrs *= mincount;
5685 data->last_end += l * (mincount - 1);
5688 /* start offset must point into the last copy */
5689 data->last_start_min += minnext * (mincount - 1);
5690 data->last_start_max =
5693 : data->last_start_max +
5694 (maxcount - 1) * (minnext + data->pos_delta);
5697 /* It is counted once already... */
5698 data->pos_min += minnext * (mincount - counted);
5700 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5701 " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5702 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5703 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5705 if (deltanext != SSize_t_MAX)
5706 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5707 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5708 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5710 if (deltanext == SSize_t_MAX
5711 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5712 data->pos_delta = SSize_t_MAX;
5714 data->pos_delta += - counted * deltanext +
5715 (minnext + deltanext) * maxcount - minnext * mincount;
5716 if (mincount != maxcount) {
5717 /* Cannot extend fixed substrings found inside
5719 scan_commit(pRExC_state, data, minlenp, is_inf);
5720 if (mincount && last_str) {
5721 SV * const sv = data->last_found;
5722 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5723 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5727 sv_setsv(sv, last_str);
5728 data->last_end = data->pos_min;
5729 data->last_start_min = data->pos_min - last_chrs;
5730 data->last_start_max = is_inf
5732 : data->pos_min + data->pos_delta - last_chrs;
5734 data->cur_is_floating = 1; /* float */
5736 SvREFCNT_dec(last_str);
5738 if (data && (fl & SF_HAS_EVAL))
5739 data->flags |= SF_HAS_EVAL;
5740 optimize_curly_tail:
5741 if (OP(oscan) != CURLYX) {
5742 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5744 NEXT_OFF(oscan) += NEXT_OFF(next);
5750 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5755 if (flags & SCF_DO_SUBSTR) {
5756 /* Cannot expect anything... */
5757 scan_commit(pRExC_state, data, minlenp, is_inf);
5758 data->cur_is_floating = 1; /* float */
5760 is_inf = is_inf_internal = 1;
5761 if (flags & SCF_DO_STCLASS_OR) {
5762 if (OP(scan) == CLUMP) {
5763 /* Actually is any start char, but very few code points
5764 * aren't start characters */
5765 ssc_match_all_cp(data->start_class);
5768 ssc_anything(data->start_class);
5771 flags &= ~SCF_DO_STCLASS;
5775 else if (OP(scan) == LNBREAK) {
5776 if (flags & SCF_DO_STCLASS) {
5777 if (flags & SCF_DO_STCLASS_AND) {
5778 ssc_intersection(data->start_class,
5779 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5780 ssc_clear_locale(data->start_class);
5781 ANYOF_FLAGS(data->start_class)
5782 &= ~SSC_MATCHES_EMPTY_STRING;
5784 else if (flags & SCF_DO_STCLASS_OR) {
5785 ssc_union(data->start_class,
5786 PL_XPosix_ptrs[_CC_VERTSPACE],
5788 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5790 /* See commit msg for
5791 * 749e076fceedeb708a624933726e7989f2302f6a */
5792 ANYOF_FLAGS(data->start_class)
5793 &= ~SSC_MATCHES_EMPTY_STRING;
5795 flags &= ~SCF_DO_STCLASS;
5798 if (delta != SSize_t_MAX)
5799 delta++; /* Because of the 2 char string cr-lf */
5800 if (flags & SCF_DO_SUBSTR) {
5801 /* Cannot expect anything... */
5802 scan_commit(pRExC_state, data, minlenp, is_inf);
5804 if (data->pos_delta != SSize_t_MAX) {
5805 data->pos_delta += 1;
5807 data->cur_is_floating = 1; /* float */
5810 else if (REGNODE_SIMPLE(OP(scan))) {
5812 if (flags & SCF_DO_SUBSTR) {
5813 scan_commit(pRExC_state, data, minlenp, is_inf);
5817 if (flags & SCF_DO_STCLASS) {
5819 SV* my_invlist = NULL;
5822 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5823 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5825 /* Some of the logic below assumes that switching
5826 locale on will only add false positives. */
5831 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5835 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5836 ssc_match_all_cp(data->start_class);
5841 SV* REG_ANY_invlist = _new_invlist(2);
5842 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5844 if (flags & SCF_DO_STCLASS_OR) {
5845 ssc_union(data->start_class,
5847 TRUE /* TRUE => invert, hence all but \n
5851 else if (flags & SCF_DO_STCLASS_AND) {
5852 ssc_intersection(data->start_class,
5854 TRUE /* TRUE => invert */
5856 ssc_clear_locale(data->start_class);
5858 SvREFCNT_dec_NN(REG_ANY_invlist);
5870 if (flags & SCF_DO_STCLASS_AND)
5871 ssc_and(pRExC_state, data->start_class,
5872 (regnode_charclass *) scan);
5874 ssc_or(pRExC_state, data->start_class,
5875 (regnode_charclass *) scan);
5881 SV* cp_list = get_ANYOFM_contents(scan);
5883 if (flags & SCF_DO_STCLASS_OR) {
5884 ssc_union(data->start_class, cp_list, invert);
5886 else if (flags & SCF_DO_STCLASS_AND) {
5887 ssc_intersection(data->start_class, cp_list, invert);
5890 SvREFCNT_dec_NN(cp_list);
5899 cp_list = _add_range_to_invlist(cp_list,
5901 ANYOFRbase(scan) + ANYOFRdelta(scan));
5903 if (flags & SCF_DO_STCLASS_OR) {
5904 ssc_union(data->start_class, cp_list, invert);
5906 else if (flags & SCF_DO_STCLASS_AND) {
5907 ssc_intersection(data->start_class, cp_list, invert);
5910 SvREFCNT_dec_NN(cp_list);
5919 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5920 if (flags & SCF_DO_STCLASS_AND) {
5921 bool was_there = cBOOL(
5922 ANYOF_POSIXL_TEST(data->start_class,
5924 ANYOF_POSIXL_ZERO(data->start_class);
5925 if (was_there) { /* Do an AND */
5926 ANYOF_POSIXL_SET(data->start_class, namedclass);
5928 /* No individual code points can now match */
5929 data->start_class->invlist
5930 = sv_2mortal(_new_invlist(0));
5933 int complement = namedclass + ((invert) ? -1 : 1);
5935 assert(flags & SCF_DO_STCLASS_OR);
5937 /* If the complement of this class was already there,
5938 * the result is that they match all code points,
5939 * (\d + \D == everything). Remove the classes from
5940 * future consideration. Locale is not relevant in
5942 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5943 ssc_match_all_cp(data->start_class);
5944 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5945 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5947 else { /* The usual case; just add this class to the
5949 ANYOF_POSIXL_SET(data->start_class, namedclass);
5954 case NPOSIXA: /* For these, we always know the exact set of
5959 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5960 goto join_posix_and_ascii;
5968 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5970 /* NPOSIXD matches all upper Latin1 code points unless the
5971 * target string being matched is UTF-8, which is
5972 * unknowable until match time. Since we are going to
5973 * invert, we want to get rid of all of them so that the
5974 * inversion will match all */
5975 if (OP(scan) == NPOSIXD) {
5976 _invlist_subtract(my_invlist, PL_UpperLatin1,
5980 join_posix_and_ascii:
5982 if (flags & SCF_DO_STCLASS_AND) {
5983 ssc_intersection(data->start_class, my_invlist, invert);
5984 ssc_clear_locale(data->start_class);
5987 assert(flags & SCF_DO_STCLASS_OR);
5988 ssc_union(data->start_class, my_invlist, invert);
5990 SvREFCNT_dec(my_invlist);
5992 if (flags & SCF_DO_STCLASS_OR)
5993 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5994 flags &= ~SCF_DO_STCLASS;
5997 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5998 data->flags |= (OP(scan) == MEOL
6001 scan_commit(pRExC_state, data, minlenp, is_inf);
6004 else if ( PL_regkind[OP(scan)] == BRANCHJ
6005 /* Lookbehind, or need to calculate parens/evals/stclass: */
6006 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6007 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6009 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6010 || OP(scan) == UNLESSM )
6012 /* Negative Lookahead/lookbehind
6013 In this case we can't do fixed string optimisation.
6016 SSize_t deltanext, minnext, fake = 0;
6021 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6023 data_fake.whilem_c = data->whilem_c;
6024 data_fake.last_closep = data->last_closep;
6027 data_fake.last_closep = &fake;
6028 data_fake.pos_delta = delta;
6029 if ( flags & SCF_DO_STCLASS && !scan->flags
6030 && OP(scan) == IFMATCH ) { /* Lookahead */
6031 ssc_init(pRExC_state, &intrnl);
6032 data_fake.start_class = &intrnl;
6033 f |= SCF_DO_STCLASS_AND;
6035 if (flags & SCF_WHILEM_VISITED_POS)
6036 f |= SCF_WHILEM_VISITED_POS;
6037 next = regnext(scan);
6038 nscan = NEXTOPER(NEXTOPER(scan));
6040 /* recurse study_chunk() for lookahead body */
6041 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6042 last, &data_fake, stopparen,
6043 recursed_depth, NULL, f, depth+1);
6046 || deltanext > (I32) U8_MAX
6047 || minnext > (I32)U8_MAX
6048 || minnext + deltanext > (I32)U8_MAX)
6050 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6054 /* The 'next_off' field has been repurposed to count the
6055 * additional starting positions to try beyond the initial
6056 * one. (This leaves it at 0 for non-variable length
6057 * matches to avoid breakage for those not using this
6060 scan->next_off = deltanext;
6061 ckWARNexperimental(RExC_parse,
6062 WARN_EXPERIMENTAL__VLB,
6063 "Variable length lookbehind is experimental");
6065 scan->flags = (U8)minnext + deltanext;
6068 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6070 if (data_fake.flags & SF_HAS_EVAL)
6071 data->flags |= SF_HAS_EVAL;
6072 data->whilem_c = data_fake.whilem_c;
6074 if (f & SCF_DO_STCLASS_AND) {
6075 if (flags & SCF_DO_STCLASS_OR) {
6076 /* OR before, AND after: ideally we would recurse with
6077 * data_fake to get the AND applied by study of the
6078 * remainder of the pattern, and then derecurse;
6079 * *** HACK *** for now just treat as "no information".
6080 * See [perl #56690].
6082 ssc_init(pRExC_state, data->start_class);
6084 /* AND before and after: combine and continue. These
6085 * assertions are zero-length, so can match an EMPTY
6087 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6088 ANYOF_FLAGS(data->start_class)
6089 |= SSC_MATCHES_EMPTY_STRING;
6093 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6095 /* Positive Lookahead/lookbehind
6096 In this case we can do fixed string optimisation,
6097 but we must be careful about it. Note in the case of
6098 lookbehind the positions will be offset by the minimum
6099 length of the pattern, something we won't know about
6100 until after the recurse.
6102 SSize_t deltanext, fake = 0;
6106 /* We use SAVEFREEPV so that when the full compile
6107 is finished perl will clean up the allocated
6108 minlens when it's all done. This way we don't
6109 have to worry about freeing them when we know
6110 they wont be used, which would be a pain.
6113 Newx( minnextp, 1, SSize_t );
6114 SAVEFREEPV(minnextp);
6117 StructCopy(data, &data_fake, scan_data_t);
6118 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6121 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6122 data_fake.last_found=newSVsv(data->last_found);
6126 data_fake.last_closep = &fake;
6127 data_fake.flags = 0;
6128 data_fake.substrs[0].flags = 0;
6129 data_fake.substrs[1].flags = 0;
6130 data_fake.pos_delta = delta;
6132 data_fake.flags |= SF_IS_INF;
6133 if ( flags & SCF_DO_STCLASS && !scan->flags
6134 && OP(scan) == IFMATCH ) { /* Lookahead */
6135 ssc_init(pRExC_state, &intrnl);
6136 data_fake.start_class = &intrnl;
6137 f |= SCF_DO_STCLASS_AND;
6139 if (flags & SCF_WHILEM_VISITED_POS)
6140 f |= SCF_WHILEM_VISITED_POS;
6141 next = regnext(scan);
6142 nscan = NEXTOPER(NEXTOPER(scan));
6144 /* positive lookahead study_chunk() recursion */
6145 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6146 &deltanext, last, &data_fake,
6147 stopparen, recursed_depth, NULL,
6150 assert(0); /* This code has never been tested since this
6151 is normally not compiled */
6153 || deltanext > (I32) U8_MAX
6154 || *minnextp > (I32)U8_MAX
6155 || *minnextp + deltanext > (I32)U8_MAX)
6157 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6162 scan->next_off = deltanext;
6164 scan->flags = (U8)*minnextp + deltanext;
6169 if (f & SCF_DO_STCLASS_AND) {
6170 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6171 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6174 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6176 if (data_fake.flags & SF_HAS_EVAL)
6177 data->flags |= SF_HAS_EVAL;
6178 data->whilem_c = data_fake.whilem_c;
6179 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6181 if (RExC_rx->minlen<*minnextp)
6182 RExC_rx->minlen=*minnextp;
6183 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6184 SvREFCNT_dec_NN(data_fake.last_found);
6186 for (i = 0; i < 2; i++) {
6187 if (data_fake.substrs[i].minlenp != minlenp) {
6188 data->substrs[i].min_offset =
6189 data_fake.substrs[i].min_offset;
6190 data->substrs[i].max_offset =
6191 data_fake.substrs[i].max_offset;
6192 data->substrs[i].minlenp =
6193 data_fake.substrs[i].minlenp;
6194 data->substrs[i].lookbehind += scan->flags;
6202 else if (OP(scan) == OPEN) {
6203 if (stopparen != (I32)ARG(scan))
6206 else if (OP(scan) == CLOSE) {
6207 if (stopparen == (I32)ARG(scan)) {
6210 if ((I32)ARG(scan) == is_par) {
6211 next = regnext(scan);
6213 if ( next && (OP(next) != WHILEM) && next < last)
6214 is_par = 0; /* Disable optimization */
6217 *(data->last_closep) = ARG(scan);
6219 else if (OP(scan) == EVAL) {
6221 data->flags |= SF_HAS_EVAL;
6223 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6224 if (flags & SCF_DO_SUBSTR) {
6225 scan_commit(pRExC_state, data, minlenp, is_inf);
6226 flags &= ~SCF_DO_SUBSTR;
6228 if (data && OP(scan)==ACCEPT) {
6229 data->flags |= SCF_SEEN_ACCEPT;
6234 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6236 if (flags & SCF_DO_SUBSTR) {
6237 scan_commit(pRExC_state, data, minlenp, is_inf);
6238 data->cur_is_floating = 1; /* float */
6240 is_inf = is_inf_internal = 1;
6241 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6242 ssc_anything(data->start_class);
6243 flags &= ~SCF_DO_STCLASS;
6245 else if (OP(scan) == GPOS) {
6246 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6247 !(delta || is_inf || (data && data->pos_delta)))
6249 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6250 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6251 if (RExC_rx->gofs < (STRLEN)min)
6252 RExC_rx->gofs = min;
6254 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6258 #ifdef TRIE_STUDY_OPT
6259 #ifdef FULL_TRIE_STUDY
6260 else if (PL_regkind[OP(scan)] == TRIE) {
6261 /* NOTE - There is similar code to this block above for handling
6262 BRANCH nodes on the initial study. If you change stuff here
6264 regnode *trie_node= scan;
6265 regnode *tail= regnext(scan);
6266 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6267 SSize_t max1 = 0, min1 = SSize_t_MAX;
6270 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6271 /* Cannot merge strings after this. */
6272 scan_commit(pRExC_state, data, minlenp, is_inf);
6274 if (flags & SCF_DO_STCLASS)
6275 ssc_init_zero(pRExC_state, &accum);
6281 const regnode *nextbranch= NULL;
6284 for ( word=1 ; word <= trie->wordcount ; word++)
6286 SSize_t deltanext=0, minnext=0, f = 0, fake;
6287 regnode_ssc this_class;
6289 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6291 data_fake.whilem_c = data->whilem_c;
6292 data_fake.last_closep = data->last_closep;
6295 data_fake.last_closep = &fake;
6296 data_fake.pos_delta = delta;
6297 if (flags & SCF_DO_STCLASS) {
6298 ssc_init(pRExC_state, &this_class);
6299 data_fake.start_class = &this_class;
6300 f = SCF_DO_STCLASS_AND;
6302 if (flags & SCF_WHILEM_VISITED_POS)
6303 f |= SCF_WHILEM_VISITED_POS;
6305 if (trie->jump[word]) {
6307 nextbranch = trie_node + trie->jump[0];
6308 scan= trie_node + trie->jump[word];
6309 /* We go from the jump point to the branch that follows
6310 it. Note this means we need the vestigal unused
6311 branches even though they arent otherwise used. */
6312 /* optimise study_chunk() for TRIE */
6313 minnext = study_chunk(pRExC_state, &scan, minlenp,
6314 &deltanext, (regnode *)nextbranch, &data_fake,
6315 stopparen, recursed_depth, NULL, f, depth+1);
6317 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6318 nextbranch= regnext((regnode*)nextbranch);
6320 if (min1 > (SSize_t)(minnext + trie->minlen))
6321 min1 = minnext + trie->minlen;
6322 if (deltanext == SSize_t_MAX) {
6323 is_inf = is_inf_internal = 1;
6325 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6326 max1 = minnext + deltanext + trie->maxlen;
6328 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6330 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6331 if ( stopmin > min + min1)
6332 stopmin = min + min1;
6333 flags &= ~SCF_DO_SUBSTR;
6335 data->flags |= SCF_SEEN_ACCEPT;
6338 if (data_fake.flags & SF_HAS_EVAL)
6339 data->flags |= SF_HAS_EVAL;
6340 data->whilem_c = data_fake.whilem_c;
6342 if (flags & SCF_DO_STCLASS)
6343 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6346 if (flags & SCF_DO_SUBSTR) {
6347 data->pos_min += min1;
6348 data->pos_delta += max1 - min1;
6349 if (max1 != min1 || is_inf)
6350 data->cur_is_floating = 1; /* float */
6353 if (delta != SSize_t_MAX) {
6354 if (SSize_t_MAX - (max1 - min1) >= delta)
6355 delta += max1 - min1;
6357 delta = SSize_t_MAX;
6359 if (flags & SCF_DO_STCLASS_OR) {
6360 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6362 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6363 flags &= ~SCF_DO_STCLASS;
6366 else if (flags & SCF_DO_STCLASS_AND) {
6368 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6369 flags &= ~SCF_DO_STCLASS;
6372 /* Switch to OR mode: cache the old value of
6373 * data->start_class */
6375 StructCopy(data->start_class, and_withp, regnode_ssc);
6376 flags &= ~SCF_DO_STCLASS_AND;
6377 StructCopy(&accum, data->start_class, regnode_ssc);
6378 flags |= SCF_DO_STCLASS_OR;
6385 else if (PL_regkind[OP(scan)] == TRIE) {
6386 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6389 min += trie->minlen;
6390 delta += (trie->maxlen - trie->minlen);
6391 flags &= ~SCF_DO_STCLASS; /* xxx */
6392 if (flags & SCF_DO_SUBSTR) {
6393 /* Cannot expect anything... */
6394 scan_commit(pRExC_state, data, minlenp, is_inf);
6395 data->pos_min += trie->minlen;
6396 data->pos_delta += (trie->maxlen - trie->minlen);
6397 if (trie->maxlen != trie->minlen)
6398 data->cur_is_floating = 1; /* float */
6400 if (trie->jump) /* no more substrings -- for now /grr*/
6401 flags &= ~SCF_DO_SUBSTR;
6403 #endif /* old or new */
6404 #endif /* TRIE_STUDY_OPT */
6406 /* Else: zero-length, ignore. */
6407 scan = regnext(scan);
6412 /* we need to unwind recursion. */
6415 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6416 DEBUG_PEEP("fend", scan, depth, flags);
6418 /* restore previous context */
6419 last = frame->last_regnode;
6420 scan = frame->next_regnode;
6421 stopparen = frame->stopparen;
6422 recursed_depth = frame->prev_recursed_depth;
6424 RExC_frame_last = frame->prev_frame;
6425 frame = frame->this_prev_frame;
6426 goto fake_study_recurse;
6430 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6433 *deltap = is_inf_internal ? SSize_t_MAX : delta;
6435 if (flags & SCF_DO_SUBSTR && is_inf)
6436 data->pos_delta = SSize_t_MAX - data->pos_min;
6437 if (is_par > (I32)U8_MAX)
6439 if (is_par && pars==1 && data) {
6440 data->flags |= SF_IN_PAR;
6441 data->flags &= ~SF_HAS_PAR;
6443 else if (pars && data) {
6444 data->flags |= SF_HAS_PAR;
6445 data->flags &= ~SF_IN_PAR;
6447 if (flags & SCF_DO_STCLASS_OR)
6448 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6449 if (flags & SCF_TRIE_RESTUDY)
6450 data->flags |= SCF_TRIE_RESTUDY;
6452 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6455 SSize_t final_minlen= min < stopmin ? min : stopmin;
6457 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6458 if (final_minlen > SSize_t_MAX - delta)
6459 RExC_maxlen = SSize_t_MAX;
6460 else if (RExC_maxlen < final_minlen + delta)
6461 RExC_maxlen = final_minlen + delta;
6463 return final_minlen;
6465 NOT_REACHED; /* NOTREACHED */
6469 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6471 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6473 PERL_ARGS_ASSERT_ADD_DATA;
6475 Renewc(RExC_rxi->data,
6476 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6477 char, struct reg_data);
6479 Renew(RExC_rxi->data->what, count + n, U8);
6481 Newx(RExC_rxi->data->what, n, U8);
6482 RExC_rxi->data->count = count + n;
6483 Copy(s, RExC_rxi->data->what + count, n, U8);
6487 /*XXX: todo make this not included in a non debugging perl, but appears to be
6488 * used anyway there, in 'use re' */
6489 #ifndef PERL_IN_XSUB_RE
6491 Perl_reginitcolors(pTHX)
6493 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6495 char *t = savepv(s);
6499 t = strchr(t, '\t');
6505 PL_colors[i] = t = (char *)"";
6510 PL_colors[i++] = (char *)"";
6517 #ifdef TRIE_STUDY_OPT
6518 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6521 (data.flags & SCF_TRIE_RESTUDY) \
6529 #define CHECK_RESTUDY_GOTO_butfirst
6533 * pregcomp - compile a regular expression into internal code
6535 * Decides which engine's compiler to call based on the hint currently in
6539 #ifndef PERL_IN_XSUB_RE
6541 /* return the currently in-scope regex engine (or the default if none) */
6543 regexp_engine const *
6544 Perl_current_re_engine(pTHX)
6546 if (IN_PERL_COMPILETIME) {
6547 HV * const table = GvHV(PL_hintgv);
6550 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6551 return &PL_core_reg_engine;
6552 ptr = hv_fetchs(table, "regcomp", FALSE);
6553 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6554 return &PL_core_reg_engine;
6555 return INT2PTR(regexp_engine*, SvIV(*ptr));
6559 if (!PL_curcop->cop_hints_hash)
6560 return &PL_core_reg_engine;
6561 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6562 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6563 return &PL_core_reg_engine;
6564 return INT2PTR(regexp_engine*, SvIV(ptr));
6570 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6572 regexp_engine const *eng = current_re_engine();
6573 GET_RE_DEBUG_FLAGS_DECL;
6575 PERL_ARGS_ASSERT_PREGCOMP;
6577 /* Dispatch a request to compile a regexp to correct regexp engine. */
6579 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6582 return CALLREGCOMP_ENG(eng, pattern, flags);
6586 /* public(ish) entry point for the perl core's own regex compiling code.
6587 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6588 * pattern rather than a list of OPs, and uses the internal engine rather
6589 * than the current one */
6592 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6594 SV *pat = pattern; /* defeat constness! */
6595 PERL_ARGS_ASSERT_RE_COMPILE;
6596 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6597 #ifdef PERL_IN_XSUB_RE
6600 &PL_core_reg_engine,
6602 NULL, NULL, rx_flags, 0);
6607 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6611 if (--cbs->refcnt > 0)
6613 for (n = 0; n < cbs->count; n++) {
6614 REGEXP *rx = cbs->cb[n].src_regex;
6616 cbs->cb[n].src_regex = NULL;
6617 SvREFCNT_dec_NN(rx);
6625 static struct reg_code_blocks *
6626 S_alloc_code_blocks(pTHX_ int ncode)
6628 struct reg_code_blocks *cbs;
6629 Newx(cbs, 1, struct reg_code_blocks);
6632 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6634 Newx(cbs->cb, ncode, struct reg_code_block);
6641 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6642 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6643 * point to the realloced string and length.
6645 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6649 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6650 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6652 U8 *const src = (U8*)*pat_p;
6657 GET_RE_DEBUG_FLAGS_DECL;
6659 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6660 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6662 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6663 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6666 while (s < *plen_p) {
6667 append_utf8_from_native_byte(src[s], &d);
6669 if (n < num_code_blocks) {
6670 assert(pRExC_state->code_blocks);
6671 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6672 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6673 assert(*(d - 1) == '(');
6676 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6677 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6678 assert(*(d - 1) == ')');
6687 *pat_p = (char*) dst;
6689 RExC_orig_utf8 = RExC_utf8 = 1;
6694 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6695 * while recording any code block indices, and handling overloading,
6696 * nested qr// objects etc. If pat is null, it will allocate a new
6697 * string, or just return the first arg, if there's only one.
6699 * Returns the malloced/updated pat.
6700 * patternp and pat_count is the array of SVs to be concatted;
6701 * oplist is the optional list of ops that generated the SVs;
6702 * recompile_p is a pointer to a boolean that will be set if
6703 * the regex will need to be recompiled.
6704 * delim, if non-null is an SV that will be inserted between each element
6708 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6709 SV *pat, SV ** const patternp, int pat_count,
6710 OP *oplist, bool *recompile_p, SV *delim)
6714 bool use_delim = FALSE;
6715 bool alloced = FALSE;
6717 /* if we know we have at least two args, create an empty string,
6718 * then concatenate args to that. For no args, return an empty string */
6719 if (!pat && pat_count != 1) {
6725 for (svp = patternp; svp < patternp + pat_count; svp++) {
6728 STRLEN orig_patlen = 0;
6730 SV *msv = use_delim ? delim : *svp;
6731 if (!msv) msv = &PL_sv_undef;
6733 /* if we've got a delimiter, we go round the loop twice for each
6734 * svp slot (except the last), using the delimiter the second
6743 if (SvTYPE(msv) == SVt_PVAV) {
6744 /* we've encountered an interpolated array within
6745 * the pattern, e.g. /...@a..../. Expand the list of elements,
6746 * then recursively append elements.
6747 * The code in this block is based on S_pushav() */
6749 AV *const av = (AV*)msv;
6750 const SSize_t maxarg = AvFILL(av) + 1;
6754 assert(oplist->op_type == OP_PADAV
6755 || oplist->op_type == OP_RV2AV);
6756 oplist = OpSIBLING(oplist);
6759 if (SvRMAGICAL(av)) {
6762 Newx(array, maxarg, SV*);
6764 for (i=0; i < maxarg; i++) {
6765 SV ** const svp = av_fetch(av, i, FALSE);
6766 array[i] = svp ? *svp : &PL_sv_undef;
6770 array = AvARRAY(av);
6772 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6773 array, maxarg, NULL, recompile_p,
6775 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6781 /* we make the assumption here that each op in the list of
6782 * op_siblings maps to one SV pushed onto the stack,
6783 * except for code blocks, with have both an OP_NULL and
6785 * This allows us to match up the list of SVs against the
6786 * list of OPs to find the next code block.
6788 * Note that PUSHMARK PADSV PADSV ..
6790 * PADRANGE PADSV PADSV ..
6791 * so the alignment still works. */
6794 if (oplist->op_type == OP_NULL
6795 && (oplist->op_flags & OPf_SPECIAL))
6797 assert(n < pRExC_state->code_blocks->count);
6798 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6799 pRExC_state->code_blocks->cb[n].block = oplist;
6800 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6803 oplist = OpSIBLING(oplist); /* skip CONST */
6806 oplist = OpSIBLING(oplist);;
6809 /* apply magic and QR overloading to arg */
6812 if (SvROK(msv) && SvAMAGIC(msv)) {
6813 SV *sv = AMG_CALLunary(msv, regexp_amg);
6817 if (SvTYPE(sv) != SVt_REGEXP)
6818 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6823 /* try concatenation overload ... */
6824 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6825 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6828 /* overloading involved: all bets are off over literal
6829 * code. Pretend we haven't seen it */
6831 pRExC_state->code_blocks->count -= n;
6835 /* ... or failing that, try "" overload */
6836 while (SvAMAGIC(msv)
6837 && (sv = AMG_CALLunary(msv, string_amg))
6841 && SvRV(msv) == SvRV(sv))
6846 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6850 /* this is a partially unrolled
6851 * sv_catsv_nomg(pat, msv);
6852 * that allows us to adjust code block indices if
6855 char *dst = SvPV_force_nomg(pat, dlen);
6857 if (SvUTF8(msv) && !SvUTF8(pat)) {
6858 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6859 sv_setpvn(pat, dst, dlen);
6862 sv_catsv_nomg(pat, msv);
6866 /* We have only one SV to process, but we need to verify
6867 * it is properly null terminated or we will fail asserts
6868 * later. In theory we probably shouldn't get such SV's,
6869 * but if we do we should handle it gracefully. */
6870 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6871 /* not a string, or a string with a trailing null */
6874 /* a string with no trailing null, we need to copy it
6875 * so it has a trailing null */
6876 pat = sv_2mortal(newSVsv(msv));
6881 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6884 /* extract any code blocks within any embedded qr//'s */
6885 if (rx && SvTYPE(rx) == SVt_REGEXP
6886 && RX_ENGINE((REGEXP*)rx)->op_comp)
6889 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6890 if (ri->code_blocks && ri->code_blocks->count) {
6892 /* the presence of an embedded qr// with code means
6893 * we should always recompile: the text of the
6894 * qr// may not have changed, but it may be a
6895 * different closure than last time */
6897 if (pRExC_state->code_blocks) {
6898 int new_count = pRExC_state->code_blocks->count
6899 + ri->code_blocks->count;
6900 Renew(pRExC_state->code_blocks->cb,
6901 new_count, struct reg_code_block);
6902 pRExC_state->code_blocks->count = new_count;
6905 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6906 ri->code_blocks->count);
6908 for (i=0; i < ri->code_blocks->count; i++) {
6909 struct reg_code_block *src, *dst;
6910 STRLEN offset = orig_patlen
6911 + ReANY((REGEXP *)rx)->pre_prefix;
6912 assert(n < pRExC_state->code_blocks->count);
6913 src = &ri->code_blocks->cb[i];
6914 dst = &pRExC_state->code_blocks->cb[n];
6915 dst->start = src->start + offset;
6916 dst->end = src->end + offset;
6917 dst->block = src->block;
6918 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6927 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6936 /* see if there are any run-time code blocks in the pattern.
6937 * False positives are allowed */
6940 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6941 char *pat, STRLEN plen)
6946 PERL_UNUSED_CONTEXT;
6948 for (s = 0; s < plen; s++) {
6949 if ( pRExC_state->code_blocks
6950 && n < pRExC_state->code_blocks->count
6951 && s == pRExC_state->code_blocks->cb[n].start)
6953 s = pRExC_state->code_blocks->cb[n].end;
6957 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6959 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6961 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6968 /* Handle run-time code blocks. We will already have compiled any direct
6969 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6970 * copy of it, but with any literal code blocks blanked out and
6971 * appropriate chars escaped; then feed it into
6973 * eval "qr'modified_pattern'"
6977 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6981 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6983 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6984 * and merge them with any code blocks of the original regexp.
6986 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6987 * instead, just save the qr and return FALSE; this tells our caller that
6988 * the original pattern needs upgrading to utf8.
6992 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6993 char *pat, STRLEN plen)
6997 GET_RE_DEBUG_FLAGS_DECL;
6999 if (pRExC_state->runtime_code_qr) {
7000 /* this is the second time we've been called; this should
7001 * only happen if the main pattern got upgraded to utf8
7002 * during compilation; re-use the qr we compiled first time
7003 * round (which should be utf8 too)
7005 qr = pRExC_state->runtime_code_qr;
7006 pRExC_state->runtime_code_qr = NULL;
7007 assert(RExC_utf8 && SvUTF8(qr));
7013 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7017 /* determine how many extra chars we need for ' and \ escaping */
7018 for (s = 0; s < plen; s++) {
7019 if (pat[s] == '\'' || pat[s] == '\\')
7023 Newx(newpat, newlen, char);
7025 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7027 for (s = 0; s < plen; s++) {
7028 if ( pRExC_state->code_blocks
7029 && n < pRExC_state->code_blocks->count
7030 && s == pRExC_state->code_blocks->cb[n].start)
7032 /* blank out literal code block so that they aren't
7033 * recompiled: eg change from/to:
7043 assert(pat[s] == '(');
7044 assert(pat[s+1] == '?');
7048 while (s < pRExC_state->code_blocks->cb[n].end) {
7056 if (pat[s] == '\'' || pat[s] == '\\')
7061 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7063 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7069 Perl_re_printf( aTHX_
7070 "%sre-parsing pattern for runtime code:%s %s\n",
7071 PL_colors[4], PL_colors[5], newpat);
7074 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7080 PUSHSTACKi(PERLSI_REQUIRE);
7081 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7082 * parsing qr''; normally only q'' does this. It also alters
7084 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7085 SvREFCNT_dec_NN(sv);
7090 SV * const errsv = ERRSV;
7091 if (SvTRUE_NN(errsv))
7092 /* use croak_sv ? */
7093 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7095 assert(SvROK(qr_ref));
7097 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7098 /* the leaving below frees the tmp qr_ref.
7099 * Give qr a life of its own */
7107 if (!RExC_utf8 && SvUTF8(qr)) {
7108 /* first time through; the pattern got upgraded; save the
7109 * qr for the next time through */
7110 assert(!pRExC_state->runtime_code_qr);
7111 pRExC_state->runtime_code_qr = qr;
7116 /* extract any code blocks within the returned qr// */
7119 /* merge the main (r1) and run-time (r2) code blocks into one */
7121 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7122 struct reg_code_block *new_block, *dst;
7123 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7127 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7129 SvREFCNT_dec_NN(qr);
7133 if (!r1->code_blocks)
7134 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7136 r1c = r1->code_blocks->count;
7137 r2c = r2->code_blocks->count;
7139 Newx(new_block, r1c + r2c, struct reg_code_block);
7143 while (i1 < r1c || i2 < r2c) {
7144 struct reg_code_block *src;
7148 src = &r2->code_blocks->cb[i2++];
7152 src = &r1->code_blocks->cb[i1++];
7153 else if ( r1->code_blocks->cb[i1].start
7154 < r2->code_blocks->cb[i2].start)
7156 src = &r1->code_blocks->cb[i1++];
7157 assert(src->end < r2->code_blocks->cb[i2].start);
7160 assert( r1->code_blocks->cb[i1].start
7161 > r2->code_blocks->cb[i2].start);
7162 src = &r2->code_blocks->cb[i2++];
7164 assert(src->end < r1->code_blocks->cb[i1].start);
7167 assert(pat[src->start] == '(');
7168 assert(pat[src->end] == ')');
7169 dst->start = src->start;
7170 dst->end = src->end;
7171 dst->block = src->block;
7172 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7176 r1->code_blocks->count += r2c;
7177 Safefree(r1->code_blocks->cb);
7178 r1->code_blocks->cb = new_block;
7181 SvREFCNT_dec_NN(qr);
7187 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7188 struct reg_substr_datum *rsd,
7189 struct scan_data_substrs *sub,
7190 STRLEN longest_length)
7192 /* This is the common code for setting up the floating and fixed length
7193 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7194 * as to whether succeeded or not */
7198 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7199 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7201 if (! (longest_length
7202 || (eol /* Can't have SEOL and MULTI */
7203 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7205 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7206 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7211 /* copy the information about the longest from the reg_scan_data
7212 over to the program. */
7213 if (SvUTF8(sub->str)) {
7215 rsd->utf8_substr = sub->str;
7217 rsd->substr = sub->str;
7218 rsd->utf8_substr = NULL;
7220 /* end_shift is how many chars that must be matched that
7221 follow this item. We calculate it ahead of time as once the
7222 lookbehind offset is added in we lose the ability to correctly
7224 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7225 rsd->end_shift = ml - sub->min_offset
7227 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7229 + (SvTAIL(sub->str) != 0)
7233 t = (eol/* Can't have SEOL and MULTI */
7234 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7235 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7241 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7243 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7244 * properly wrapped with the right modifiers */
7246 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7247 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7248 != REGEX_DEPENDS_CHARSET);
7250 /* The caret is output if there are any defaults: if not all the STD
7251 * flags are set, or if no character set specifier is needed */
7253 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7255 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7256 == REG_RUN_ON_COMMENT_SEEN);
7257 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7258 >> RXf_PMf_STD_PMMOD_SHIFT);
7259 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7261 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7263 /* We output all the necessary flags; we never output a minus, as all
7264 * those are defaults, so are
7265 * covered by the caret */
7266 const STRLEN wraplen = pat_len + has_p + has_runon
7267 + has_default /* If needs a caret */
7268 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7270 /* If needs a character set specifier */
7271 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7272 + (sizeof("(?:)") - 1);
7274 PERL_ARGS_ASSERT_SET_REGEX_PV;
7276 /* make sure PL_bitcount bounds not exceeded */
7277 assert(sizeof(STD_PAT_MODS) <= 8);
7279 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7282 SvFLAGS(Rx) |= SVf_UTF8;
7285 /* If a default, cover it using the caret */
7287 *p++= DEFAULT_PAT_MOD;
7293 name = get_regex_charset_name(RExC_rx->extflags, &len);
7294 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7296 name = UNICODE_PAT_MODS;
7297 len = sizeof(UNICODE_PAT_MODS) - 1;
7299 Copy(name, p, len, char);
7303 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7306 while((ch = *fptr++)) {
7314 Copy(RExC_precomp, p, pat_len, char);
7315 assert ((RX_WRAPPED(Rx) - p) < 16);
7316 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7319 /* Adding a trailing \n causes this to compile properly:
7320 my $R = qr / A B C # D E/x; /($R)/
7321 Otherwise the parens are considered part of the comment */
7326 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7330 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7331 * regular expression into internal code.
7332 * The pattern may be passed either as:
7333 * a list of SVs (patternp plus pat_count)
7334 * a list of OPs (expr)
7335 * If both are passed, the SV list is used, but the OP list indicates
7336 * which SVs are actually pre-compiled code blocks
7338 * The SVs in the list have magic and qr overloading applied to them (and
7339 * the list may be modified in-place with replacement SVs in the latter
7342 * If the pattern hasn't changed from old_re, then old_re will be
7345 * eng is the current engine. If that engine has an op_comp method, then
7346 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7347 * do the initial concatenation of arguments and pass on to the external
7350 * If is_bare_re is not null, set it to a boolean indicating whether the
7351 * arg list reduced (after overloading) to a single bare regex which has
7352 * been returned (i.e. /$qr/).
7354 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7356 * pm_flags contains the PMf_* flags, typically based on those from the
7357 * pm_flags field of the related PMOP. Currently we're only interested in
7358 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7360 * For many years this code had an initial sizing pass that calculated
7361 * (sometimes incorrectly, leading to security holes) the size needed for the
7362 * compiled pattern. That was changed by commit
7363 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7364 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7365 * references to this sizing pass.
7367 * Now, an initial crude guess as to the size needed is made, based on the
7368 * length of the pattern. Patches welcome to improve that guess. That amount
7369 * of space is malloc'd and then immediately freed, and then clawed back node
7370 * by node. This design is to minimze, to the extent possible, memory churn
7371 * when doing the the reallocs.
7373 * A separate parentheses counting pass may be needed in some cases.
7374 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7377 * The existence of a sizing pass necessitated design decisions that are no
7378 * longer needed. There are potential areas of simplification.
7380 * Beware that the optimization-preparation code in here knows about some
7381 * of the structure of the compiled regexp. [I'll say.]
7385 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7386 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7387 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7390 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7398 SV** new_patternp = patternp;
7400 /* these are all flags - maybe they should be turned
7401 * into a single int with different bit masks */
7402 I32 sawlookahead = 0;
7407 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7409 bool runtime_code = 0;
7411 RExC_state_t RExC_state;
7412 RExC_state_t * const pRExC_state = &RExC_state;
7413 #ifdef TRIE_STUDY_OPT
7415 RExC_state_t copyRExC_state;
7417 GET_RE_DEBUG_FLAGS_DECL;
7419 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7421 DEBUG_r(if (!PL_colorset) reginitcolors());
7423 /* Initialize these here instead of as-needed, as is quick and avoids
7424 * having to test them each time otherwise */
7425 if (! PL_InBitmap) {
7427 char * dump_len_string;
7430 /* This is calculated here, because the Perl program that generates the
7431 * static global ones doesn't currently have access to
7432 * NUM_ANYOF_CODE_POINTS */
7433 PL_InBitmap = _new_invlist(2);
7434 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7435 NUM_ANYOF_CODE_POINTS - 1);
7437 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7438 if ( ! dump_len_string
7439 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7441 PL_dump_re_max_len = 60; /* A reasonable default */
7446 pRExC_state->warn_text = NULL;
7447 pRExC_state->unlexed_names = NULL;
7448 pRExC_state->code_blocks = NULL;
7451 *is_bare_re = FALSE;
7453 if (expr && (expr->op_type == OP_LIST ||
7454 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7455 /* allocate code_blocks if needed */
7459 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7460 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7461 ncode++; /* count of DO blocks */
7464 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7468 /* compile-time pattern with just OP_CONSTs and DO blocks */
7473 /* find how many CONSTs there are */
7476 if (expr->op_type == OP_CONST)
7479 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7480 if (o->op_type == OP_CONST)
7484 /* fake up an SV array */
7486 assert(!new_patternp);
7487 Newx(new_patternp, n, SV*);
7488 SAVEFREEPV(new_patternp);
7492 if (expr->op_type == OP_CONST)
7493 new_patternp[n] = cSVOPx_sv(expr);
7495 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7496 if (o->op_type == OP_CONST)
7497 new_patternp[n++] = cSVOPo_sv;
7502 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7503 "Assembling pattern from %d elements%s\n", pat_count,
7504 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7506 /* set expr to the first arg op */
7508 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7509 && expr->op_type != OP_CONST)
7511 expr = cLISTOPx(expr)->op_first;
7512 assert( expr->op_type == OP_PUSHMARK
7513 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7514 || expr->op_type == OP_PADRANGE);
7515 expr = OpSIBLING(expr);
7518 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7519 expr, &recompile, NULL);
7521 /* handle bare (possibly after overloading) regex: foo =~ $re */
7526 if (SvTYPE(re) == SVt_REGEXP) {
7530 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7531 "Precompiled pattern%s\n",
7532 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7538 exp = SvPV_nomg(pat, plen);
7540 if (!eng->op_comp) {
7541 if ((SvUTF8(pat) && IN_BYTES)
7542 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7544 /* make a temporary copy; either to convert to bytes,
7545 * or to avoid repeating get-magic / overloaded stringify */
7546 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7547 (IN_BYTES ? 0 : SvUTF8(pat)));
7549 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7552 /* ignore the utf8ness if the pattern is 0 length */
7553 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7554 RExC_uni_semantics = 0;
7555 RExC_contains_locale = 0;
7556 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7557 RExC_in_script_run = 0;
7558 RExC_study_started = 0;
7559 pRExC_state->runtime_code_qr = NULL;
7560 RExC_frame_head= NULL;
7561 RExC_frame_last= NULL;
7562 RExC_frame_count= 0;
7563 RExC_latest_warn_offset = 0;
7564 RExC_use_BRANCHJ = 0;
7565 RExC_total_parens = 0;
7566 RExC_open_parens = NULL;
7567 RExC_close_parens = NULL;
7568 RExC_paren_names = NULL;
7570 RExC_seen_d_op = FALSE;
7572 RExC_paren_name_list = NULL;
7576 RExC_mysv1= sv_newmortal();
7577 RExC_mysv2= sv_newmortal();
7581 SV *dsv= sv_newmortal();
7582 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7583 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7584 PL_colors[4], PL_colors[5], s);
7587 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7590 if ((pm_flags & PMf_USE_RE_EVAL)
7591 /* this second condition covers the non-regex literal case,
7592 * i.e. $foo =~ '(?{})'. */
7593 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7595 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7598 /* return old regex if pattern hasn't changed */
7599 /* XXX: note in the below we have to check the flags as well as the
7602 * Things get a touch tricky as we have to compare the utf8 flag
7603 * independently from the compile flags. */
7607 && !!RX_UTF8(old_re) == !!RExC_utf8
7608 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7609 && RX_PRECOMP(old_re)
7610 && RX_PRELEN(old_re) == plen
7611 && memEQ(RX_PRECOMP(old_re), exp, plen)
7612 && !runtime_code /* with runtime code, always recompile */ )
7615 SV *dsv= sv_newmortal();
7616 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7617 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
7618 PL_colors[4], PL_colors[5], s);
7623 /* Allocate the pattern's SV */
7624 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7625 RExC_rx = ReANY(Rx);
7626 if ( RExC_rx == NULL )
7627 FAIL("Regexp out of space");
7629 rx_flags = orig_rx_flags;
7631 if ( (UTF || RExC_uni_semantics)
7632 && initial_charset == REGEX_DEPENDS_CHARSET)
7635 /* Set to use unicode semantics if the pattern is in utf8 and has the
7636 * 'depends' charset specified, as it means unicode when utf8 */
7637 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7638 RExC_uni_semantics = 1;
7641 RExC_pm_flags = pm_flags;
7644 assert(TAINTING_get || !TAINT_get);
7646 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7648 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7649 /* whoops, we have a non-utf8 pattern, whilst run-time code
7650 * got compiled as utf8. Try again with a utf8 pattern */
7651 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7652 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7656 assert(!pRExC_state->runtime_code_qr);
7662 RExC_in_lookbehind = 0;
7663 RExC_in_lookahead = 0;
7664 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7665 RExC_recode_x_to_native = 0;
7666 RExC_in_multi_char_class = 0;
7668 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7669 RExC_precomp_end = RExC_end = exp + plen;
7671 RExC_whilem_seen = 0;
7673 RExC_recurse = NULL;
7674 RExC_study_chunk_recursed = NULL;
7675 RExC_study_chunk_recursed_bytes= 0;
7676 RExC_recurse_count = 0;
7677 pRExC_state->code_index = 0;
7679 /* Initialize the string in the compiled pattern. This is so that there is
7680 * something to output if necessary */
7681 set_regex_pv(pRExC_state, Rx);
7684 Perl_re_printf( aTHX_
7685 "Starting parse and generation\n");
7687 RExC_lastparse=NULL;
7690 /* Allocate space and zero-initialize. Note, the two step process
7691 of zeroing when in debug mode, thus anything assigned has to
7692 happen after that */
7695 /* On the first pass of the parse, we guess how big this will be. Then
7696 * we grow in one operation to that amount and then give it back. As
7697 * we go along, we re-allocate what we need.
7699 * XXX Currently the guess is essentially that the pattern will be an
7700 * EXACT node with one byte input, one byte output. This is crude, and
7701 * better heuristics are welcome.
7703 * On any subsequent passes, we guess what we actually computed in the
7704 * latest earlier pass. Such a pass probably didn't complete so is
7705 * missing stuff. We could improve those guesses by knowing where the
7706 * parse stopped, and use the length so far plus apply the above
7707 * assumption to what's left. */
7708 RExC_size = STR_SZ(RExC_end - RExC_start);
7711 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7712 if ( RExC_rxi == NULL )
7713 FAIL("Regexp out of space");
7715 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7716 RXi_SET( RExC_rx, RExC_rxi );
7718 /* We start from 0 (over from 0 in the case this is a reparse. The first
7719 * node parsed will give back any excess memory we have allocated so far).
7723 /* non-zero initialization begins here */
7724 RExC_rx->engine= eng;
7725 RExC_rx->extflags = rx_flags;
7726 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7728 if (pm_flags & PMf_IS_QR) {
7729 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7730 if (RExC_rxi->code_blocks) {
7731 RExC_rxi->code_blocks->refcnt++;
7735 RExC_rx->intflags = 0;
7737 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7740 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7741 * code makes sure the final byte is an uncounted NUL. But should this
7742 * ever not be the case, lots of things could read beyond the end of the
7743 * buffer: loops like
7744 * while(isFOO(*RExC_parse)) RExC_parse++;
7745 * strchr(RExC_parse, "foo");
7746 * etc. So it is worth noting. */
7747 assert(*RExC_end == '\0');
7751 RExC_parens_buf_size = 0;
7752 RExC_emit_start = RExC_rxi->program;
7753 pRExC_state->code_index = 0;
7755 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7759 if (reg(pRExC_state, 0, &flags, 1)) {
7761 /* Success!, But we may need to redo the parse knowing how many parens
7762 * there actually are */
7763 if (IN_PARENS_PASS) {
7764 flags |= RESTART_PARSE;
7767 /* We have that number in RExC_npar */
7768 RExC_total_parens = RExC_npar;
7770 else if (! MUST_RESTART(flags)) {
7772 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7775 /* Here, we either have success, or we have to redo the parse for some reason */
7776 if (MUST_RESTART(flags)) {
7778 /* It's possible to write a regexp in ascii that represents Unicode
7779 codepoints outside of the byte range, such as via \x{100}. If we
7780 detect such a sequence we have to convert the entire pattern to utf8
7781 and then recompile, as our sizing calculation will have been based
7782 on 1 byte == 1 character, but we will need to use utf8 to encode
7783 at least some part of the pattern, and therefore must convert the whole
7786 if (flags & NEED_UTF8) {
7788 /* We have stored the offset of the final warning output so far.
7789 * That must be adjusted. Any variant characters between the start
7790 * of the pattern and this warning count for 2 bytes in the final,
7791 * so just add them again */
7792 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7793 RExC_latest_warn_offset +=
7794 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7795 + RExC_latest_warn_offset);
7797 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7798 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7799 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7802 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7805 if (ALL_PARENS_COUNTED) {
7806 /* Make enough room for all the known parens, and zero it */
7807 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7808 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7809 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7811 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7812 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7814 else { /* Parse did not complete. Reinitialize the parentheses
7816 RExC_total_parens = 0;
7817 if (RExC_open_parens) {
7818 Safefree(RExC_open_parens);
7819 RExC_open_parens = NULL;
7821 if (RExC_close_parens) {
7822 Safefree(RExC_close_parens);
7823 RExC_close_parens = NULL;
7827 /* Clean up what we did in this parse */
7828 SvREFCNT_dec_NN(RExC_rx_sv);
7833 /* Here, we have successfully parsed and generated the pattern's program
7834 * for the regex engine. We are ready to finish things up and look for
7837 /* Update the string to compile, with correct modifiers, etc */
7838 set_regex_pv(pRExC_state, Rx);
7840 RExC_rx->nparens = RExC_total_parens - 1;
7842 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7843 if (RExC_whilem_seen > 15)
7844 RExC_whilem_seen = 15;
7847 Perl_re_printf( aTHX_
7848 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7850 RExC_lastparse=NULL;
7853 #ifdef RE_TRACK_PATTERN_OFFSETS
7854 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7855 "%s %" UVuf " bytes for offset annotations.\n",
7856 RExC_offsets ? "Got" : "Couldn't get",
7857 (UV)((RExC_offsets[0] * 2 + 1))));
7858 DEBUG_OFFSETS_r(if (RExC_offsets) {
7859 const STRLEN len = RExC_offsets[0];
7861 GET_RE_DEBUG_FLAGS_DECL;
7862 Perl_re_printf( aTHX_
7863 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7864 for (i = 1; i <= len; i++) {
7865 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7866 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7867 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7869 Perl_re_printf( aTHX_ "\n");
7873 SetProgLen(RExC_rxi,RExC_size);
7876 DEBUG_DUMP_PRE_OPTIMIZE_r({
7877 SV * const sv = sv_newmortal();
7878 RXi_GET_DECL(RExC_rx, ri);
7880 Perl_re_printf( aTHX_ "Program before optimization:\n");
7882 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
7887 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7890 /* XXXX To minimize changes to RE engine we always allocate
7891 3-units-long substrs field. */
7892 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7893 if (RExC_recurse_count) {
7894 Newx(RExC_recurse, RExC_recurse_count, regnode *);
7895 SAVEFREEPV(RExC_recurse);
7898 if (RExC_seen & REG_RECURSE_SEEN) {
7899 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7900 * So its 1 if there are no parens. */
7901 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7902 ((RExC_total_parens & 0x07) != 0);
7903 Newx(RExC_study_chunk_recursed,
7904 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7905 SAVEFREEPV(RExC_study_chunk_recursed);
7909 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7911 RExC_study_chunk_recursed_count= 0;
7913 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7914 if (RExC_study_chunk_recursed) {
7915 Zero(RExC_study_chunk_recursed,
7916 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7920 #ifdef TRIE_STUDY_OPT
7922 StructCopy(&zero_scan_data, &data, scan_data_t);
7923 copyRExC_state = RExC_state;
7926 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7928 RExC_state = copyRExC_state;
7929 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7930 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7932 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7933 StructCopy(&zero_scan_data, &data, scan_data_t);
7936 StructCopy(&zero_scan_data, &data, scan_data_t);
7939 /* Dig out information for optimizations. */
7940 RExC_rx->extflags = RExC_flags; /* was pm_op */
7941 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7944 SvUTF8_on(Rx); /* Unicode in it? */
7945 RExC_rxi->regstclass = NULL;
7946 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7947 RExC_rx->intflags |= PREGf_NAUGHTY;
7948 scan = RExC_rxi->program + 1; /* First BRANCH. */
7950 /* testing for BRANCH here tells us whether there is "must appear"
7951 data in the pattern. If there is then we can use it for optimisations */
7952 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7955 STRLEN longest_length[2];
7956 regnode_ssc ch_class; /* pointed to by data */
7958 SSize_t last_close = 0; /* pointed to by data */
7959 regnode *first= scan;
7960 regnode *first_next= regnext(first);
7964 * Skip introductions and multiplicators >= 1
7965 * so that we can extract the 'meat' of the pattern that must
7966 * match in the large if() sequence following.
7967 * NOTE that EXACT is NOT covered here, as it is normally
7968 * picked up by the optimiser separately.
7970 * This is unfortunate as the optimiser isnt handling lookahead
7971 * properly currently.
7974 while ((OP(first) == OPEN && (sawopen = 1)) ||
7975 /* An OR of *one* alternative - should not happen now. */
7976 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7977 /* for now we can't handle lookbehind IFMATCH*/
7978 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7979 (OP(first) == PLUS) ||
7980 (OP(first) == MINMOD) ||
7981 /* An {n,m} with n>0 */
7982 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7983 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7986 * the only op that could be a regnode is PLUS, all the rest
7987 * will be regnode_1 or regnode_2.
7989 * (yves doesn't think this is true)
7991 if (OP(first) == PLUS)
7994 if (OP(first) == MINMOD)
7996 first += regarglen[OP(first)];
7998 first = NEXTOPER(first);
7999 first_next= regnext(first);
8002 /* Starting-point info. */
8004 DEBUG_PEEP("first:", first, 0, 0);
8005 /* Ignore EXACT as we deal with it later. */
8006 if (PL_regkind[OP(first)] == EXACT) {
8007 if ( OP(first) == EXACT
8008 || OP(first) == LEXACT
8009 || OP(first) == EXACT_REQ8
8010 || OP(first) == LEXACT_REQ8
8011 || OP(first) == EXACTL)
8013 NOOP; /* Empty, get anchored substr later. */
8016 RExC_rxi->regstclass = first;
8019 else if (PL_regkind[OP(first)] == TRIE &&
8020 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8022 /* this can happen only on restudy */
8023 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8026 else if (REGNODE_SIMPLE(OP(first)))
8027 RExC_rxi->regstclass = first;
8028 else if (PL_regkind[OP(first)] == BOUND ||
8029 PL_regkind[OP(first)] == NBOUND)
8030 RExC_rxi->regstclass = first;
8031 else if (PL_regkind[OP(first)] == BOL) {
8032 RExC_rx->intflags |= (OP(first) == MBOL
8035 first = NEXTOPER(first);
8038 else if (OP(first) == GPOS) {
8039 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8040 first = NEXTOPER(first);
8043 else if ((!sawopen || !RExC_sawback) &&
8045 (OP(first) == STAR &&
8046 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8047 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8049 /* turn .* into ^.* with an implied $*=1 */
8051 (OP(NEXTOPER(first)) == REG_ANY)
8054 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8055 first = NEXTOPER(first);
8058 if (sawplus && !sawminmod && !sawlookahead
8059 && (!sawopen || !RExC_sawback)
8060 && !pRExC_state->code_blocks) /* May examine pos and $& */
8061 /* x+ must match at the 1st pos of run of x's */
8062 RExC_rx->intflags |= PREGf_SKIP;
8064 /* Scan is after the zeroth branch, first is atomic matcher. */
8065 #ifdef TRIE_STUDY_OPT
8068 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8069 (IV)(first - scan + 1))
8073 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8074 (IV)(first - scan + 1))
8080 * If there's something expensive in the r.e., find the
8081 * longest literal string that must appear and make it the
8082 * regmust. Resolve ties in favor of later strings, since
8083 * the regstart check works with the beginning of the r.e.
8084 * and avoiding duplication strengthens checking. Not a
8085 * strong reason, but sufficient in the absence of others.
8086 * [Now we resolve ties in favor of the earlier string if
8087 * it happens that c_offset_min has been invalidated, since the
8088 * earlier string may buy us something the later one won't.]
8091 data.substrs[0].str = newSVpvs("");
8092 data.substrs[1].str = newSVpvs("");
8093 data.last_found = newSVpvs("");
8094 data.cur_is_floating = 0; /* initially any found substring is fixed */
8095 ENTER_with_name("study_chunk");
8096 SAVEFREESV(data.substrs[0].str);
8097 SAVEFREESV(data.substrs[1].str);
8098 SAVEFREESV(data.last_found);
8100 if (!RExC_rxi->regstclass) {
8101 ssc_init(pRExC_state, &ch_class);
8102 data.start_class = &ch_class;
8103 stclass_flag = SCF_DO_STCLASS_AND;
8104 } else /* XXXX Check for BOUND? */
8106 data.last_closep = &last_close;
8110 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8111 * (NO top level branches)
8113 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8114 scan + RExC_size, /* Up to end */
8116 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8117 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8121 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8124 if ( RExC_total_parens == 1 && !data.cur_is_floating
8125 && data.last_start_min == 0 && data.last_end > 0
8126 && !RExC_seen_zerolen
8127 && !(RExC_seen & REG_VERBARG_SEEN)
8128 && !(RExC_seen & REG_GPOS_SEEN)
8130 RExC_rx->extflags |= RXf_CHECK_ALL;
8132 scan_commit(pRExC_state, &data,&minlen, 0);
8135 /* XXX this is done in reverse order because that's the way the
8136 * code was before it was parameterised. Don't know whether it
8137 * actually needs doing in reverse order. DAPM */
8138 for (i = 1; i >= 0; i--) {
8139 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8142 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8143 && data.substrs[0].min_offset
8144 == data.substrs[1].min_offset
8145 && SvCUR(data.substrs[0].str)
8146 == SvCUR(data.substrs[1].str)
8148 && S_setup_longest (aTHX_ pRExC_state,
8149 &(RExC_rx->substrs->data[i]),
8153 RExC_rx->substrs->data[i].min_offset =
8154 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8156 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8157 /* Don't offset infinity */
8158 if (data.substrs[i].max_offset < SSize_t_MAX)
8159 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8160 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8163 RExC_rx->substrs->data[i].substr = NULL;
8164 RExC_rx->substrs->data[i].utf8_substr = NULL;
8165 longest_length[i] = 0;
8169 LEAVE_with_name("study_chunk");
8171 if (RExC_rxi->regstclass
8172 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8173 RExC_rxi->regstclass = NULL;
8175 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8176 || RExC_rx->substrs->data[0].min_offset)
8178 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8179 && is_ssc_worth_it(pRExC_state, data.start_class))
8181 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8183 ssc_finalize(pRExC_state, data.start_class);
8185 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8186 StructCopy(data.start_class,
8187 (regnode_ssc*)RExC_rxi->data->data[n],
8189 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8190 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8191 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8192 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8193 Perl_re_printf( aTHX_
8194 "synthetic stclass \"%s\".\n",
8195 SvPVX_const(sv));});
8196 data.start_class = NULL;
8199 /* A temporary algorithm prefers floated substr to fixed one of
8200 * same length to dig more info. */
8201 i = (longest_length[0] <= longest_length[1]);
8202 RExC_rx->substrs->check_ix = i;
8203 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8204 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8205 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8206 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8207 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8208 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8209 RExC_rx->intflags |= PREGf_NOSCAN;
8211 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8212 RExC_rx->extflags |= RXf_USE_INTUIT;
8213 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8214 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8217 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8218 if ( (STRLEN)minlen < longest_length[1] )
8219 minlen= longest_length[1];
8220 if ( (STRLEN)minlen < longest_length[0] )
8221 minlen= longest_length[0];
8225 /* Several toplevels. Best we can is to set minlen. */
8227 regnode_ssc ch_class;
8228 SSize_t last_close = 0;
8230 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8232 scan = RExC_rxi->program + 1;
8233 ssc_init(pRExC_state, &ch_class);
8234 data.start_class = &ch_class;
8235 data.last_closep = &last_close;
8239 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8240 * (patterns WITH top level branches)
8242 minlen = study_chunk(pRExC_state,
8243 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8244 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8245 ? SCF_TRIE_DOING_RESTUDY
8249 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8251 RExC_rx->check_substr = NULL;
8252 RExC_rx->check_utf8 = NULL;
8253 RExC_rx->substrs->data[0].substr = NULL;
8254 RExC_rx->substrs->data[0].utf8_substr = NULL;
8255 RExC_rx->substrs->data[1].substr = NULL;
8256 RExC_rx->substrs->data[1].utf8_substr = NULL;
8258 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8259 && is_ssc_worth_it(pRExC_state, data.start_class))
8261 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8263 ssc_finalize(pRExC_state, data.start_class);
8265 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8266 StructCopy(data.start_class,
8267 (regnode_ssc*)RExC_rxi->data->data[n],
8269 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8270 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8271 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8272 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8273 Perl_re_printf( aTHX_
8274 "synthetic stclass \"%s\".\n",
8275 SvPVX_const(sv));});
8276 data.start_class = NULL;
8280 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8281 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8282 RExC_rx->maxlen = REG_INFTY;
8285 RExC_rx->maxlen = RExC_maxlen;
8288 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8289 the "real" pattern. */
8291 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8292 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8294 RExC_rx->minlenret = minlen;
8295 if (RExC_rx->minlen < minlen)
8296 RExC_rx->minlen = minlen;
8298 if (RExC_seen & REG_RECURSE_SEEN ) {
8299 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8300 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8302 if (RExC_seen & REG_GPOS_SEEN)
8303 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8304 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8305 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8307 if (pRExC_state->code_blocks)
8308 RExC_rx->extflags |= RXf_EVAL_SEEN;
8309 if (RExC_seen & REG_VERBARG_SEEN)
8311 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8312 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8314 if (RExC_seen & REG_CUTGROUP_SEEN)
8315 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8316 if (pm_flags & PMf_USE_RE_EVAL)
8317 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8318 if (RExC_paren_names)
8319 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8321 RXp_PAREN_NAMES(RExC_rx) = NULL;
8323 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8324 * so it can be used in pp.c */
8325 if (RExC_rx->intflags & PREGf_ANCH)
8326 RExC_rx->extflags |= RXf_IS_ANCHORED;
8330 /* this is used to identify "special" patterns that might result
8331 * in Perl NOT calling the regex engine and instead doing the match "itself",
8332 * particularly special cases in split//. By having the regex compiler
8333 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8334 * we avoid weird issues with equivalent patterns resulting in different behavior,
8335 * AND we allow non Perl engines to get the same optimizations by the setting the
8336 * flags appropriately - Yves */
8337 regnode *first = RExC_rxi->program + 1;
8339 regnode *next = regnext(first);
8342 if (PL_regkind[fop] == NOTHING && nop == END)
8343 RExC_rx->extflags |= RXf_NULL;
8344 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8345 /* when fop is SBOL first->flags will be true only when it was
8346 * produced by parsing /\A/, and not when parsing /^/. This is
8347 * very important for the split code as there we want to
8348 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8349 * See rt #122761 for more details. -- Yves */
8350 RExC_rx->extflags |= RXf_START_ONLY;
8351 else if (fop == PLUS
8352 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8354 RExC_rx->extflags |= RXf_WHITE;
8355 else if ( RExC_rx->extflags & RXf_SPLIT
8356 && ( fop == EXACT || fop == LEXACT
8357 || fop == EXACT_REQ8 || fop == LEXACT_REQ8
8359 && STR_LEN(first) == 1
8360 && *(STRING(first)) == ' '
8362 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8366 if (RExC_contains_locale) {
8367 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8371 if (RExC_paren_names) {
8372 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8373 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8374 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8377 RExC_rxi->name_list_idx = 0;
8379 while ( RExC_recurse_count > 0 ) {
8380 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8382 * This data structure is set up in study_chunk() and is used
8383 * to calculate the distance between a GOSUB regopcode and
8384 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8387 * If for some reason someone writes code that optimises
8388 * away a GOSUB opcode then the assert should be changed to
8389 * an if(scan) to guard the ARG2L_SET() - Yves
8392 assert(scan && OP(scan) == GOSUB);
8393 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8396 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8397 /* assume we don't need to swap parens around before we match */
8399 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8400 (unsigned long)RExC_study_chunk_recursed_count);
8404 Perl_re_printf( aTHX_ "Final program:\n");
8408 if (RExC_open_parens) {
8409 Safefree(RExC_open_parens);
8410 RExC_open_parens = NULL;
8412 if (RExC_close_parens) {
8413 Safefree(RExC_close_parens);
8414 RExC_close_parens = NULL;
8418 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8419 * by setting the regexp SV to readonly-only instead. If the
8420 * pattern's been recompiled, the USEDness should remain. */
8421 if (old_re && SvREADONLY(old_re))
8429 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8432 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8434 PERL_UNUSED_ARG(value);
8436 if (flags & RXapif_FETCH) {
8437 return reg_named_buff_fetch(rx, key, flags);
8438 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8439 Perl_croak_no_modify();
8441 } else if (flags & RXapif_EXISTS) {
8442 return reg_named_buff_exists(rx, key, flags)
8445 } else if (flags & RXapif_REGNAMES) {
8446 return reg_named_buff_all(rx, flags);
8447 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8448 return reg_named_buff_scalar(rx, flags);
8450 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8456 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8459 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8460 PERL_UNUSED_ARG(lastkey);
8462 if (flags & RXapif_FIRSTKEY)
8463 return reg_named_buff_firstkey(rx, flags);
8464 else if (flags & RXapif_NEXTKEY)
8465 return reg_named_buff_nextkey(rx, flags);
8467 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8474 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8478 struct regexp *const rx = ReANY(r);
8480 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8482 if (rx && RXp_PAREN_NAMES(rx)) {
8483 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8486 SV* sv_dat=HeVAL(he_str);
8487 I32 *nums=(I32*)SvPVX(sv_dat);
8488 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8489 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8490 if ((I32)(rx->nparens) >= nums[i]
8491 && rx->offs[nums[i]].start != -1
8492 && rx->offs[nums[i]].end != -1)
8495 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8500 ret = newSVsv(&PL_sv_undef);
8503 av_push(retarray, ret);
8506 return newRV_noinc(MUTABLE_SV(retarray));
8513 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8516 struct regexp *const rx = ReANY(r);
8518 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8520 if (rx && RXp_PAREN_NAMES(rx)) {
8521 if (flags & RXapif_ALL) {
8522 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8524 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8526 SvREFCNT_dec_NN(sv);
8538 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8540 struct regexp *const rx = ReANY(r);
8542 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8544 if ( rx && RXp_PAREN_NAMES(rx) ) {
8545 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8547 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8554 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8556 struct regexp *const rx = ReANY(r);
8557 GET_RE_DEBUG_FLAGS_DECL;
8559 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8561 if (rx && RXp_PAREN_NAMES(rx)) {
8562 HV *hv = RXp_PAREN_NAMES(rx);
8564 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8567 SV* sv_dat = HeVAL(temphe);
8568 I32 *nums = (I32*)SvPVX(sv_dat);
8569 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8570 if ((I32)(rx->lastparen) >= nums[i] &&
8571 rx->offs[nums[i]].start != -1 &&
8572 rx->offs[nums[i]].end != -1)
8578 if (parno || flags & RXapif_ALL) {
8579 return newSVhek(HeKEY_hek(temphe));
8587 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8592 struct regexp *const rx = ReANY(r);
8594 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8596 if (rx && RXp_PAREN_NAMES(rx)) {
8597 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8598 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8599 } else if (flags & RXapif_ONE) {
8600 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8601 av = MUTABLE_AV(SvRV(ret));
8602 length = av_tindex(av);
8603 SvREFCNT_dec_NN(ret);
8604 return newSViv(length + 1);
8606 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8611 return &PL_sv_undef;
8615 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8617 struct regexp *const rx = ReANY(r);
8620 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8622 if (rx && RXp_PAREN_NAMES(rx)) {
8623 HV *hv= RXp_PAREN_NAMES(rx);
8625 (void)hv_iterinit(hv);
8626 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8629 SV* sv_dat = HeVAL(temphe);
8630 I32 *nums = (I32*)SvPVX(sv_dat);
8631 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8632 if ((I32)(rx->lastparen) >= nums[i] &&
8633 rx->offs[nums[i]].start != -1 &&
8634 rx->offs[nums[i]].end != -1)
8640 if (parno || flags & RXapif_ALL) {
8641 av_push(av, newSVhek(HeKEY_hek(temphe)));
8646 return newRV_noinc(MUTABLE_SV(av));
8650 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8653 struct regexp *const rx = ReANY(r);
8659 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8661 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8662 || n == RX_BUFF_IDX_CARET_FULLMATCH
8663 || n == RX_BUFF_IDX_CARET_POSTMATCH
8666 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8668 /* on something like
8671 * the KEEPCOPY is set on the PMOP rather than the regex */
8672 if (PL_curpm && r == PM_GETRE(PL_curpm))
8673 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8682 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8683 /* no need to distinguish between them any more */
8684 n = RX_BUFF_IDX_FULLMATCH;
8686 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8687 && rx->offs[0].start != -1)
8689 /* $`, ${^PREMATCH} */
8690 i = rx->offs[0].start;
8694 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8695 && rx->offs[0].end != -1)
8697 /* $', ${^POSTMATCH} */
8698 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8699 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8702 if ( 0 <= n && n <= (I32)rx->nparens &&
8703 (s1 = rx->offs[n].start) != -1 &&
8704 (t1 = rx->offs[n].end) != -1)
8706 /* $&, ${^MATCH}, $1 ... */
8708 s = rx->subbeg + s1 - rx->suboffset;
8713 assert(s >= rx->subbeg);
8714 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8716 #ifdef NO_TAINT_SUPPORT
8717 sv_setpvn(sv, s, i);
8719 const int oldtainted = TAINT_get;
8721 sv_setpvn(sv, s, i);
8722 TAINT_set(oldtainted);
8724 if (RXp_MATCH_UTF8(rx))
8729 if (RXp_MATCH_TAINTED(rx)) {
8730 if (SvTYPE(sv) >= SVt_PVMG) {
8731 MAGIC* const mg = SvMAGIC(sv);
8734 SvMAGIC_set(sv, mg->mg_moremagic);
8736 if ((mgt = SvMAGIC(sv))) {
8737 mg->mg_moremagic = mgt;
8738 SvMAGIC_set(sv, mg);
8755 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8756 SV const * const value)
8758 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8760 PERL_UNUSED_ARG(rx);
8761 PERL_UNUSED_ARG(paren);
8762 PERL_UNUSED_ARG(value);
8765 Perl_croak_no_modify();
8769 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8772 struct regexp *const rx = ReANY(r);
8776 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8778 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8779 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8780 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8783 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8785 /* on something like
8788 * the KEEPCOPY is set on the PMOP rather than the regex */
8789 if (PL_curpm && r == PM_GETRE(PL_curpm))
8790 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8796 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8798 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8799 case RX_BUFF_IDX_PREMATCH: /* $` */
8800 if (rx->offs[0].start != -1) {
8801 i = rx->offs[0].start;
8810 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8811 case RX_BUFF_IDX_POSTMATCH: /* $' */
8812 if (rx->offs[0].end != -1) {
8813 i = rx->sublen - rx->offs[0].end;
8815 s1 = rx->offs[0].end;
8822 default: /* $& / ${^MATCH}, $1, $2, ... */
8823 if (paren <= (I32)rx->nparens &&
8824 (s1 = rx->offs[paren].start) != -1 &&
8825 (t1 = rx->offs[paren].end) != -1)
8831 if (ckWARN(WARN_UNINITIALIZED))
8832 report_uninit((const SV *)sv);
8837 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8838 const char * const s = rx->subbeg - rx->suboffset + s1;
8843 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8850 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8852 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8853 PERL_UNUSED_ARG(rx);
8857 return newSVpvs("Regexp");
8860 /* Scans the name of a named buffer from the pattern.
8861 * If flags is REG_RSN_RETURN_NULL returns null.
8862 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8863 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8864 * to the parsed name as looked up in the RExC_paren_names hash.
8865 * If there is an error throws a vFAIL().. type exception.
8868 #define REG_RSN_RETURN_NULL 0
8869 #define REG_RSN_RETURN_NAME 1
8870 #define REG_RSN_RETURN_DATA 2
8873 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8875 char *name_start = RExC_parse;
8878 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8880 assert (RExC_parse <= RExC_end);
8881 if (RExC_parse == RExC_end) NOOP;
8882 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8883 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8884 * using do...while */
8887 RExC_parse += UTF8SKIP(RExC_parse);
8888 } while ( RExC_parse < RExC_end
8889 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8893 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8895 RExC_parse++; /* so the <- from the vFAIL is after the offending
8897 vFAIL("Group name must start with a non-digit word character");
8899 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8900 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8901 if ( flags == REG_RSN_RETURN_NAME)
8903 else if (flags==REG_RSN_RETURN_DATA) {
8906 if ( ! sv_name ) /* should not happen*/
8907 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8908 if (RExC_paren_names)
8909 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8911 sv_dat = HeVAL(he_str);
8912 if ( ! sv_dat ) { /* Didn't find group */
8914 /* It might be a forward reference; we can't fail until we
8915 * know, by completing the parse to get all the groups, and
8917 if (ALL_PARENS_COUNTED) {
8918 vFAIL("Reference to nonexistent named group");
8921 REQUIRE_PARENS_PASS;
8927 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8928 (unsigned long) flags);
8931 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8932 if (RExC_lastparse!=RExC_parse) { \
8933 Perl_re_printf( aTHX_ "%s", \
8934 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8935 RExC_end - RExC_parse, 16, \
8937 PERL_PV_ESCAPE_UNI_DETECT | \
8938 PERL_PV_PRETTY_ELLIPSES | \
8939 PERL_PV_PRETTY_LTGT | \
8940 PERL_PV_ESCAPE_RE | \
8941 PERL_PV_PRETTY_EXACTSIZE \
8945 Perl_re_printf( aTHX_ "%16s",""); \
8947 if (RExC_lastnum!=RExC_emit) \
8948 Perl_re_printf( aTHX_ "|%4d", RExC_emit); \
8950 Perl_re_printf( aTHX_ "|%4s",""); \
8951 Perl_re_printf( aTHX_ "|%*s%-4s", \
8952 (int)((depth*2)), "", \
8955 RExC_lastnum=RExC_emit; \
8956 RExC_lastparse=RExC_parse; \
8961 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8962 DEBUG_PARSE_MSG((funcname)); \
8963 Perl_re_printf( aTHX_ "%4s","\n"); \
8965 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
8966 DEBUG_PARSE_MSG((funcname)); \
8967 Perl_re_printf( aTHX_ fmt "\n",args); \
8970 /* This section of code defines the inversion list object and its methods. The
8971 * interfaces are highly subject to change, so as much as possible is static to
8972 * this file. An inversion list is here implemented as a malloc'd C UV array
8973 * as an SVt_INVLIST scalar.
8975 * An inversion list for Unicode is an array of code points, sorted by ordinal
8976 * number. Each element gives the code point that begins a range that extends
8977 * up-to but not including the code point given by the next element. The final
8978 * element gives the first code point of a range that extends to the platform's
8979 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
8980 * ...) give ranges whose code points are all in the inversion list. We say
8981 * that those ranges are in the set. The odd-numbered elements give ranges
8982 * whose code points are not in the inversion list, and hence not in the set.
8983 * Thus, element [0] is the first code point in the list. Element [1]
8984 * is the first code point beyond that not in the list; and element [2] is the
8985 * first code point beyond that that is in the list. In other words, the first
8986 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8987 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
8988 * all code points in that range are not in the inversion list. The third
8989 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8990 * list, and so forth. Thus every element whose index is divisible by two
8991 * gives the beginning of a range that is in the list, and every element whose
8992 * index is not divisible by two gives the beginning of a range not in the
8993 * list. If the final element's index is divisible by two, the inversion list
8994 * extends to the platform's infinity; otherwise the highest code point in the
8995 * inversion list is the contents of that element minus 1.
8997 * A range that contains just a single code point N will look like
8999 * invlist[i+1] == N+1
9001 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9002 * impossible to represent, so element [i+1] is omitted. The single element
9004 * invlist[0] == UV_MAX
9005 * contains just UV_MAX, but is interpreted as matching to infinity.
9007 * Taking the complement (inverting) an inversion list is quite simple, if the
9008 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9009 * This implementation reserves an element at the beginning of each inversion
9010 * list to always contain 0; there is an additional flag in the header which
9011 * indicates if the list begins at the 0, or is offset to begin at the next
9012 * element. This means that the inversion list can be inverted without any
9013 * copying; just flip the flag.
9015 * More about inversion lists can be found in "Unicode Demystified"
9016 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9018 * The inversion list data structure is currently implemented as an SV pointing
9019 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9020 * array of UV whose memory management is automatically handled by the existing
9021 * facilities for SV's.
9023 * Some of the methods should always be private to the implementation, and some
9024 * should eventually be made public */
9026 /* The header definitions are in F<invlist_inline.h> */
9028 #ifndef PERL_IN_XSUB_RE
9030 PERL_STATIC_INLINE UV*
9031 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9033 /* Returns a pointer to the first element in the inversion list's array.
9034 * This is called upon initialization of an inversion list. Where the
9035 * array begins depends on whether the list has the code point U+0000 in it
9036 * or not. The other parameter tells it whether the code that follows this
9037 * call is about to put a 0 in the inversion list or not. The first
9038 * element is either the element reserved for 0, if TRUE, or the element
9039 * after it, if FALSE */
9041 bool* offset = get_invlist_offset_addr(invlist);
9042 UV* zero_addr = (UV *) SvPVX(invlist);
9044 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9047 assert(! _invlist_len(invlist));
9051 /* 1^1 = 0; 1^0 = 1 */
9052 *offset = 1 ^ will_have_0;
9053 return zero_addr + *offset;
9057 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9059 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9060 * steals the list from 'src', so 'src' is made to have a NULL list. This
9061 * is similar to what SvSetMagicSV() would do, if it were implemented on
9062 * inversion lists, though this routine avoids a copy */
9064 const UV src_len = _invlist_len(src);
9065 const bool src_offset = *get_invlist_offset_addr(src);
9066 const STRLEN src_byte_len = SvLEN(src);
9067 char * array = SvPVX(src);
9069 const int oldtainted = TAINT_get;
9071 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9073 assert(is_invlist(src));
9074 assert(is_invlist(dest));
9075 assert(! invlist_is_iterating(src));
9076 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9078 /* Make sure it ends in the right place with a NUL, as our inversion list
9079 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9081 array[src_byte_len - 1] = '\0';
9083 TAINT_NOT; /* Otherwise it breaks */
9084 sv_usepvn_flags(dest,
9088 /* This flag is documented to cause a copy to be avoided */
9089 SV_HAS_TRAILING_NUL);
9090 TAINT_set(oldtainted);
9095 /* Finish up copying over the other fields in an inversion list */
9096 *get_invlist_offset_addr(dest) = src_offset;
9097 invlist_set_len(dest, src_len, src_offset);
9098 *get_invlist_previous_index_addr(dest) = 0;
9099 invlist_iterfinish(dest);
9102 PERL_STATIC_INLINE IV*
9103 S_get_invlist_previous_index_addr(SV* invlist)
9105 /* Return the address of the IV that is reserved to hold the cached index
9107 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9109 assert(is_invlist(invlist));
9111 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9114 PERL_STATIC_INLINE IV
9115 S_invlist_previous_index(SV* const invlist)
9117 /* Returns cached index of previous search */
9119 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9121 return *get_invlist_previous_index_addr(invlist);
9124 PERL_STATIC_INLINE void
9125 S_invlist_set_previous_index(SV* const invlist, const IV index)
9127 /* Caches <index> for later retrieval */
9129 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9131 assert(index == 0 || index < (int) _invlist_len(invlist));
9133 *get_invlist_previous_index_addr(invlist) = index;
9136 PERL_STATIC_INLINE void
9137 S_invlist_trim(SV* invlist)
9139 /* Free the not currently-being-used space in an inversion list */
9141 /* But don't free up the space needed for the 0 UV that is always at the
9142 * beginning of the list, nor the trailing NUL */
9143 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9145 PERL_ARGS_ASSERT_INVLIST_TRIM;
9147 assert(is_invlist(invlist));
9149 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9152 PERL_STATIC_INLINE void
9153 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9155 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9157 assert(is_invlist(invlist));
9159 invlist_set_len(invlist, 0, 0);
9160 invlist_trim(invlist);
9163 #endif /* ifndef PERL_IN_XSUB_RE */
9165 PERL_STATIC_INLINE bool
9166 S_invlist_is_iterating(SV* const invlist)
9168 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9170 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9173 #ifndef PERL_IN_XSUB_RE
9175 PERL_STATIC_INLINE UV
9176 S_invlist_max(SV* const invlist)
9178 /* Returns the maximum number of elements storable in the inversion list's
9179 * array, without having to realloc() */
9181 PERL_ARGS_ASSERT_INVLIST_MAX;
9183 assert(is_invlist(invlist));
9185 /* Assumes worst case, in which the 0 element is not counted in the
9186 * inversion list, so subtracts 1 for that */
9187 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9188 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9189 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9193 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9195 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9197 /* First 1 is in case the zero element isn't in the list; second 1 is for
9199 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9200 invlist_set_len(invlist, 0, 0);
9202 /* Force iterinit() to be used to get iteration to work */
9203 invlist_iterfinish(invlist);
9205 *get_invlist_previous_index_addr(invlist) = 0;
9206 SvPOK_on(invlist); /* This allows B to extract the PV */
9210 Perl__new_invlist(pTHX_ IV initial_size)
9213 /* Return a pointer to a newly constructed inversion list, with enough
9214 * space to store 'initial_size' elements. If that number is negative, a
9215 * system default is used instead */
9219 if (initial_size < 0) {
9223 new_list = newSV_type(SVt_INVLIST);
9224 initialize_invlist_guts(new_list, initial_size);
9230 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9232 /* Return a pointer to a newly constructed inversion list, initialized to
9233 * point to <list>, which has to be in the exact correct inversion list
9234 * form, including internal fields. Thus this is a dangerous routine that
9235 * should not be used in the wrong hands. The passed in 'list' contains
9236 * several header fields at the beginning that are not part of the
9237 * inversion list body proper */
9239 const STRLEN length = (STRLEN) list[0];
9240 const UV version_id = list[1];
9241 const bool offset = cBOOL(list[2]);
9242 #define HEADER_LENGTH 3
9243 /* If any of the above changes in any way, you must change HEADER_LENGTH
9244 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9245 * perl -E 'say int(rand 2**31-1)'
9247 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9248 data structure type, so that one being
9249 passed in can be validated to be an
9250 inversion list of the correct vintage.
9253 SV* invlist = newSV_type(SVt_INVLIST);
9255 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9257 if (version_id != INVLIST_VERSION_ID) {
9258 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9261 /* The generated array passed in includes header elements that aren't part
9262 * of the list proper, so start it just after them */
9263 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9265 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9266 shouldn't touch it */
9268 *(get_invlist_offset_addr(invlist)) = offset;
9270 /* The 'length' passed to us is the physical number of elements in the
9271 * inversion list. But if there is an offset the logical number is one
9273 invlist_set_len(invlist, length - offset, offset);
9275 invlist_set_previous_index(invlist, 0);
9277 /* Initialize the iteration pointer. */
9278 invlist_iterfinish(invlist);
9280 SvREADONLY_on(invlist);
9287 S__append_range_to_invlist(pTHX_ SV* const invlist,
9288 const UV start, const UV end)
9290 /* Subject to change or removal. Append the range from 'start' to 'end' at
9291 * the end of the inversion list. The range must be above any existing
9295 UV max = invlist_max(invlist);
9296 UV len = _invlist_len(invlist);
9299 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9301 if (len == 0) { /* Empty lists must be initialized */
9302 offset = start != 0;
9303 array = _invlist_array_init(invlist, ! offset);
9306 /* Here, the existing list is non-empty. The current max entry in the
9307 * list is generally the first value not in the set, except when the
9308 * set extends to the end of permissible values, in which case it is
9309 * the first entry in that final set, and so this call is an attempt to
9310 * append out-of-order */
9312 UV final_element = len - 1;
9313 array = invlist_array(invlist);
9314 if ( array[final_element] > start
9315 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9317 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9318 array[final_element], start,
9319 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9322 /* Here, it is a legal append. If the new range begins 1 above the end
9323 * of the range below it, it is extending the range below it, so the
9324 * new first value not in the set is one greater than the newly
9325 * extended range. */
9326 offset = *get_invlist_offset_addr(invlist);
9327 if (array[final_element] == start) {
9328 if (end != UV_MAX) {
9329 array[final_element] = end + 1;
9332 /* But if the end is the maximum representable on the machine,
9333 * assume that infinity was actually what was meant. Just let
9334 * the range that this would extend to have no end */
9335 invlist_set_len(invlist, len - 1, offset);
9341 /* Here the new range doesn't extend any existing set. Add it */
9343 len += 2; /* Includes an element each for the start and end of range */
9345 /* If wll overflow the existing space, extend, which may cause the array to
9348 invlist_extend(invlist, len);
9350 /* Have to set len here to avoid assert failure in invlist_array() */
9351 invlist_set_len(invlist, len, offset);
9353 array = invlist_array(invlist);
9356 invlist_set_len(invlist, len, offset);
9359 /* The next item on the list starts the range, the one after that is
9360 * one past the new range. */
9361 array[len - 2] = start;
9362 if (end != UV_MAX) {
9363 array[len - 1] = end + 1;
9366 /* But if the end is the maximum representable on the machine, just let
9367 * the range have no end */
9368 invlist_set_len(invlist, len - 1, offset);
9373 Perl__invlist_search(SV* const invlist, const UV cp)
9375 /* Searches the inversion list for the entry that contains the input code
9376 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9377 * return value is the index into the list's array of the range that
9378 * contains <cp>, that is, 'i' such that
9379 * array[i] <= cp < array[i+1]
9384 IV high = _invlist_len(invlist);
9385 const IV highest_element = high - 1;
9388 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9390 /* If list is empty, return failure. */
9395 /* (We can't get the array unless we know the list is non-empty) */
9396 array = invlist_array(invlist);
9398 mid = invlist_previous_index(invlist);
9400 if (mid > highest_element) {
9401 mid = highest_element;
9404 /* <mid> contains the cache of the result of the previous call to this
9405 * function (0 the first time). See if this call is for the same result,
9406 * or if it is for mid-1. This is under the theory that calls to this
9407 * function will often be for related code points that are near each other.
9408 * And benchmarks show that caching gives better results. We also test
9409 * here if the code point is within the bounds of the list. These tests
9410 * replace others that would have had to be made anyway to make sure that
9411 * the array bounds were not exceeded, and these give us extra information
9412 * at the same time */
9413 if (cp >= array[mid]) {
9414 if (cp >= array[highest_element]) {
9415 return highest_element;
9418 /* Here, array[mid] <= cp < array[highest_element]. This means that
9419 * the final element is not the answer, so can exclude it; it also
9420 * means that <mid> is not the final element, so can refer to 'mid + 1'
9422 if (cp < array[mid + 1]) {
9428 else { /* cp < aray[mid] */
9429 if (cp < array[0]) { /* Fail if outside the array */
9433 if (cp >= array[mid - 1]) {
9438 /* Binary search. What we are looking for is <i> such that
9439 * array[i] <= cp < array[i+1]
9440 * The loop below converges on the i+1. Note that there may not be an
9441 * (i+1)th element in the array, and things work nonetheless */
9442 while (low < high) {
9443 mid = (low + high) / 2;
9444 assert(mid <= highest_element);
9445 if (array[mid] <= cp) { /* cp >= array[mid] */
9448 /* We could do this extra test to exit the loop early.
9449 if (cp < array[low]) {
9454 else { /* cp < array[mid] */
9461 invlist_set_previous_index(invlist, high);
9466 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9467 const bool complement_b, SV** output)
9469 /* Take the union of two inversion lists and point '*output' to it. On
9470 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9471 * even 'a' or 'b'). If to an inversion list, the contents of the original
9472 * list will be replaced by the union. The first list, 'a', may be
9473 * NULL, in which case a copy of the second list is placed in '*output'.
9474 * If 'complement_b' is TRUE, the union is taken of the complement
9475 * (inversion) of 'b' instead of b itself.
9477 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9478 * Richard Gillam, published by Addison-Wesley, and explained at some
9479 * length there. The preface says to incorporate its examples into your
9480 * code at your own risk.
9482 * The algorithm is like a merge sort. */
9484 const UV* array_a; /* a's array */
9486 UV len_a; /* length of a's array */
9489 SV* u; /* the resulting union */
9493 UV i_a = 0; /* current index into a's array */
9497 /* running count, as explained in the algorithm source book; items are
9498 * stopped accumulating and are output when the count changes to/from 0.
9499 * The count is incremented when we start a range that's in an input's set,
9500 * and decremented when we start a range that's not in a set. So this
9501 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9502 * and hence nothing goes into the union; 1, just one of the inputs is in
9503 * its set (and its current range gets added to the union); and 2 when both
9504 * inputs are in their sets. */
9507 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9509 assert(*output == NULL || is_invlist(*output));
9511 len_b = _invlist_len(b);
9514 /* Here, 'b' is empty, hence it's complement is all possible code
9515 * points. So if the union includes the complement of 'b', it includes
9516 * everything, and we need not even look at 'a'. It's easiest to
9517 * create a new inversion list that matches everything. */
9519 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9521 if (*output == NULL) { /* If the output didn't exist, just point it
9523 *output = everything;
9525 else { /* Otherwise, replace its contents with the new list */
9526 invlist_replace_list_destroys_src(*output, everything);
9527 SvREFCNT_dec_NN(everything);
9533 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9534 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9535 * output will be empty */
9537 if (a == NULL || _invlist_len(a) == 0) {
9538 if (*output == NULL) {
9539 *output = _new_invlist(0);
9542 invlist_clear(*output);
9547 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9548 * union. We can just return a copy of 'a' if '*output' doesn't point
9549 * to an existing list */
9550 if (*output == NULL) {
9551 *output = invlist_clone(a, NULL);
9555 /* If the output is to overwrite 'a', we have a no-op, as it's
9561 /* Here, '*output' is to be overwritten by 'a' */
9562 u = invlist_clone(a, NULL);
9563 invlist_replace_list_destroys_src(*output, u);
9569 /* Here 'b' is not empty. See about 'a' */
9571 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9573 /* Here, 'a' is empty (and b is not). That means the union will come
9574 * entirely from 'b'. If '*output' is NULL, we can directly return a
9575 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9578 SV ** dest = (*output == NULL) ? output : &u;
9579 *dest = invlist_clone(b, NULL);
9581 _invlist_invert(*dest);
9585 invlist_replace_list_destroys_src(*output, u);
9592 /* Here both lists exist and are non-empty */
9593 array_a = invlist_array(a);
9594 array_b = invlist_array(b);
9596 /* If are to take the union of 'a' with the complement of b, set it
9597 * up so are looking at b's complement. */
9600 /* To complement, we invert: if the first element is 0, remove it. To
9601 * do this, we just pretend the array starts one later */
9602 if (array_b[0] == 0) {
9608 /* But if the first element is not zero, we pretend the list starts
9609 * at the 0 that is always stored immediately before the array. */
9615 /* Size the union for the worst case: that the sets are completely
9617 u = _new_invlist(len_a + len_b);
9619 /* Will contain U+0000 if either component does */
9620 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9621 || (len_b > 0 && array_b[0] == 0));
9623 /* Go through each input list item by item, stopping when have exhausted
9625 while (i_a < len_a && i_b < len_b) {
9626 UV cp; /* The element to potentially add to the union's array */
9627 bool cp_in_set; /* is it in the the input list's set or not */
9629 /* We need to take one or the other of the two inputs for the union.
9630 * Since we are merging two sorted lists, we take the smaller of the
9631 * next items. In case of a tie, we take first the one that is in its
9632 * set. If we first took the one not in its set, it would decrement
9633 * the count, possibly to 0 which would cause it to be output as ending
9634 * the range, and the next time through we would take the same number,
9635 * and output it again as beginning the next range. By doing it the
9636 * opposite way, there is no possibility that the count will be
9637 * momentarily decremented to 0, and thus the two adjoining ranges will
9638 * be seamlessly merged. (In a tie and both are in the set or both not
9639 * in the set, it doesn't matter which we take first.) */
9640 if ( array_a[i_a] < array_b[i_b]
9641 || ( array_a[i_a] == array_b[i_b]
9642 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9644 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9645 cp = array_a[i_a++];
9648 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9649 cp = array_b[i_b++];
9652 /* Here, have chosen which of the two inputs to look at. Only output
9653 * if the running count changes to/from 0, which marks the
9654 * beginning/end of a range that's in the set */
9657 array_u[i_u++] = cp;
9664 array_u[i_u++] = cp;
9670 /* The loop above increments the index into exactly one of the input lists
9671 * each iteration, and ends when either index gets to its list end. That
9672 * means the other index is lower than its end, and so something is
9673 * remaining in that one. We decrement 'count', as explained below, if
9674 * that list is in its set. (i_a and i_b each currently index the element
9675 * beyond the one we care about.) */
9676 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9677 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9682 /* Above we decremented 'count' if the list that had unexamined elements in
9683 * it was in its set. This has made it so that 'count' being non-zero
9684 * means there isn't anything left to output; and 'count' equal to 0 means
9685 * that what is left to output is precisely that which is left in the
9686 * non-exhausted input list.
9688 * To see why, note first that the exhausted input obviously has nothing
9689 * left to add to the union. If it was in its set at its end, that means
9690 * the set extends from here to the platform's infinity, and hence so does
9691 * the union and the non-exhausted set is irrelevant. The exhausted set
9692 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9693 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9694 * 'count' remains at 1. This is consistent with the decremented 'count'
9695 * != 0 meaning there's nothing left to add to the union.
9697 * But if the exhausted input wasn't in its set, it contributed 0 to
9698 * 'count', and the rest of the union will be whatever the other input is.
9699 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9700 * otherwise it gets decremented to 0. This is consistent with 'count'
9701 * == 0 meaning the remainder of the union is whatever is left in the
9702 * non-exhausted list. */
9707 IV copy_count = len_a - i_a;
9708 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9709 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9711 else { /* The non-exhausted input is b */
9712 copy_count = len_b - i_b;
9713 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9715 len_u = i_u + copy_count;
9718 /* Set the result to the final length, which can change the pointer to
9719 * array_u, so re-find it. (Note that it is unlikely that this will
9720 * change, as we are shrinking the space, not enlarging it) */
9721 if (len_u != _invlist_len(u)) {
9722 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9724 array_u = invlist_array(u);
9727 if (*output == NULL) { /* Simply return the new inversion list */
9731 /* Otherwise, overwrite the inversion list that was in '*output'. We
9732 * could instead free '*output', and then set it to 'u', but experience
9733 * has shown [perl #127392] that if the input is a mortal, we can get a
9734 * huge build-up of these during regex compilation before they get
9736 invlist_replace_list_destroys_src(*output, u);
9744 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9745 const bool complement_b, SV** i)
9747 /* Take the intersection of two inversion lists and point '*i' to it. On
9748 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9749 * even 'a' or 'b'). If to an inversion list, the contents of the original
9750 * list will be replaced by the intersection. The first list, 'a', may be
9751 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9752 * TRUE, the result will be the intersection of 'a' and the complement (or
9753 * inversion) of 'b' instead of 'b' directly.
9755 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9756 * Richard Gillam, published by Addison-Wesley, and explained at some
9757 * length there. The preface says to incorporate its examples into your
9758 * code at your own risk. In fact, it had bugs
9760 * The algorithm is like a merge sort, and is essentially the same as the
9764 const UV* array_a; /* a's array */
9766 UV len_a; /* length of a's array */
9769 SV* r; /* the resulting intersection */
9773 UV i_a = 0; /* current index into a's array */
9777 /* running count of how many of the two inputs are postitioned at ranges
9778 * that are in their sets. As explained in the algorithm source book,
9779 * items are stopped accumulating and are output when the count changes
9780 * to/from 2. The count is incremented when we start a range that's in an
9781 * input's set, and decremented when we start a range that's not in a set.
9782 * Only when it is 2 are we in the intersection. */
9785 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9787 assert(*i == NULL || is_invlist(*i));
9789 /* Special case if either one is empty */
9790 len_a = (a == NULL) ? 0 : _invlist_len(a);
9791 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9792 if (len_a != 0 && complement_b) {
9794 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9795 * must be empty. Here, also we are using 'b's complement, which
9796 * hence must be every possible code point. Thus the intersection
9799 if (*i == a) { /* No-op */
9804 *i = invlist_clone(a, NULL);
9808 r = invlist_clone(a, NULL);
9809 invlist_replace_list_destroys_src(*i, r);
9814 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9815 * intersection must be empty */
9817 *i = _new_invlist(0);
9825 /* Here both lists exist and are non-empty */
9826 array_a = invlist_array(a);
9827 array_b = invlist_array(b);
9829 /* If are to take the intersection of 'a' with the complement of b, set it
9830 * up so are looking at b's complement. */
9833 /* To complement, we invert: if the first element is 0, remove it. To
9834 * do this, we just pretend the array starts one later */
9835 if (array_b[0] == 0) {
9841 /* But if the first element is not zero, we pretend the list starts
9842 * at the 0 that is always stored immediately before the array. */
9848 /* Size the intersection for the worst case: that the intersection ends up
9849 * fragmenting everything to be completely disjoint */
9850 r= _new_invlist(len_a + len_b);
9852 /* Will contain U+0000 iff both components do */
9853 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9854 && len_b > 0 && array_b[0] == 0);
9856 /* Go through each list item by item, stopping when have exhausted one of
9858 while (i_a < len_a && i_b < len_b) {
9859 UV cp; /* The element to potentially add to the intersection's
9861 bool cp_in_set; /* Is it in the input list's set or not */
9863 /* We need to take one or the other of the two inputs for the
9864 * intersection. Since we are merging two sorted lists, we take the
9865 * smaller of the next items. In case of a tie, we take first the one
9866 * that is not in its set (a difference from the union algorithm). If
9867 * we first took the one in its set, it would increment the count,
9868 * possibly to 2 which would cause it to be output as starting a range
9869 * in the intersection, and the next time through we would take that
9870 * same number, and output it again as ending the set. By doing the
9871 * opposite of this, there is no possibility that the count will be
9872 * momentarily incremented to 2. (In a tie and both are in the set or
9873 * both not in the set, it doesn't matter which we take first.) */
9874 if ( array_a[i_a] < array_b[i_b]
9875 || ( array_a[i_a] == array_b[i_b]
9876 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9878 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9879 cp = array_a[i_a++];
9882 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9886 /* Here, have chosen which of the two inputs to look at. Only output
9887 * if the running count changes to/from 2, which marks the
9888 * beginning/end of a range that's in the intersection */
9892 array_r[i_r++] = cp;
9897 array_r[i_r++] = cp;
9904 /* The loop above increments the index into exactly one of the input lists
9905 * each iteration, and ends when either index gets to its list end. That
9906 * means the other index is lower than its end, and so something is
9907 * remaining in that one. We increment 'count', as explained below, if the
9908 * exhausted list was in its set. (i_a and i_b each currently index the
9909 * element beyond the one we care about.) */
9910 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9911 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9916 /* Above we incremented 'count' if the exhausted list was in its set. This
9917 * has made it so that 'count' being below 2 means there is nothing left to
9918 * output; otheriwse what's left to add to the intersection is precisely
9919 * that which is left in the non-exhausted input list.
9921 * To see why, note first that the exhausted input obviously has nothing
9922 * left to affect the intersection. If it was in its set at its end, that
9923 * means the set extends from here to the platform's infinity, and hence
9924 * anything in the non-exhausted's list will be in the intersection, and
9925 * anything not in it won't be. Hence, the rest of the intersection is
9926 * precisely what's in the non-exhausted list The exhausted set also
9927 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
9928 * it means 'count' is now at least 2. This is consistent with the
9929 * incremented 'count' being >= 2 means to add the non-exhausted list to
9932 * But if the exhausted input wasn't in its set, it contributed 0 to
9933 * 'count', and the intersection can't include anything further; the
9934 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
9935 * incremented. This is consistent with 'count' being < 2 meaning nothing
9936 * further to add to the intersection. */
9937 if (count < 2) { /* Nothing left to put in the intersection. */
9940 else { /* copy the non-exhausted list, unchanged. */
9941 IV copy_count = len_a - i_a;
9942 if (copy_count > 0) { /* a is the one with stuff left */
9943 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9945 else { /* b is the one with stuff left */
9946 copy_count = len_b - i_b;
9947 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9949 len_r = i_r + copy_count;
9952 /* Set the result to the final length, which can change the pointer to
9953 * array_r, so re-find it. (Note that it is unlikely that this will
9954 * change, as we are shrinking the space, not enlarging it) */
9955 if (len_r != _invlist_len(r)) {
9956 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9958 array_r = invlist_array(r);
9961 if (*i == NULL) { /* Simply return the calculated intersection */
9964 else { /* Otherwise, replace the existing inversion list in '*i'. We could
9965 instead free '*i', and then set it to 'r', but experience has
9966 shown [perl #127392] that if the input is a mortal, we can get a
9967 huge build-up of these during regex compilation before they get
9970 invlist_replace_list_destroys_src(*i, r);
9982 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9984 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9985 * set. A pointer to the inversion list is returned. This may actually be
9986 * a new list, in which case the passed in one has been destroyed. The
9987 * passed-in inversion list can be NULL, in which case a new one is created
9988 * with just the one range in it. The new list is not necessarily
9989 * NUL-terminated. Space is not freed if the inversion list shrinks as a
9990 * result of this function. The gain would not be large, and in many
9991 * cases, this is called multiple times on a single inversion list, so
9992 * anything freed may almost immediately be needed again.
9994 * This used to mostly call the 'union' routine, but that is much more
9995 * heavyweight than really needed for a single range addition */
9997 UV* array; /* The array implementing the inversion list */
9998 UV len; /* How many elements in 'array' */
9999 SSize_t i_s; /* index into the invlist array where 'start'
10001 SSize_t i_e = 0; /* And the index where 'end' should go */
10002 UV cur_highest; /* The highest code point in the inversion list
10003 upon entry to this function */
10005 /* This range becomes the whole inversion list if none already existed */
10006 if (invlist == NULL) {
10007 invlist = _new_invlist(2);
10008 _append_range_to_invlist(invlist, start, end);
10012 /* Likewise, if the inversion list is currently empty */
10013 len = _invlist_len(invlist);
10015 _append_range_to_invlist(invlist, start, end);
10019 /* Starting here, we have to know the internals of the list */
10020 array = invlist_array(invlist);
10022 /* If the new range ends higher than the current highest ... */
10023 cur_highest = invlist_highest(invlist);
10024 if (end > cur_highest) {
10026 /* If the whole range is higher, we can just append it */
10027 if (start > cur_highest) {
10028 _append_range_to_invlist(invlist, start, end);
10032 /* Otherwise, add the portion that is higher ... */
10033 _append_range_to_invlist(invlist, cur_highest + 1, end);
10035 /* ... and continue on below to handle the rest. As a result of the
10036 * above append, we know that the index of the end of the range is the
10037 * final even numbered one of the array. Recall that the final element
10038 * always starts a range that extends to infinity. If that range is in
10039 * the set (meaning the set goes from here to infinity), it will be an
10040 * even index, but if it isn't in the set, it's odd, and the final
10041 * range in the set is one less, which is even. */
10042 if (end == UV_MAX) {
10050 /* We have dealt with appending, now see about prepending. If the new
10051 * range starts lower than the current lowest ... */
10052 if (start < array[0]) {
10054 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10055 * Let the union code handle it, rather than having to know the
10056 * trickiness in two code places. */
10057 if (UNLIKELY(start == 0)) {
10060 range_invlist = _new_invlist(2);
10061 _append_range_to_invlist(range_invlist, start, end);
10063 _invlist_union(invlist, range_invlist, &invlist);
10065 SvREFCNT_dec_NN(range_invlist);
10070 /* If the whole new range comes before the first entry, and doesn't
10071 * extend it, we have to insert it as an additional range */
10072 if (end < array[0] - 1) {
10074 goto splice_in_new_range;
10077 /* Here the new range adjoins the existing first range, extending it
10081 /* And continue on below to handle the rest. We know that the index of
10082 * the beginning of the range is the first one of the array */
10085 else { /* Not prepending any part of the new range to the existing list.
10086 * Find where in the list it should go. This finds i_s, such that:
10087 * invlist[i_s] <= start < array[i_s+1]
10089 i_s = _invlist_search(invlist, start);
10092 /* At this point, any extending before the beginning of the inversion list
10093 * and/or after the end has been done. This has made it so that, in the
10094 * code below, each endpoint of the new range is either in a range that is
10095 * in the set, or is in a gap between two ranges that are. This means we
10096 * don't have to worry about exceeding the array bounds.
10098 * Find where in the list the new range ends (but we can skip this if we
10099 * have already determined what it is, or if it will be the same as i_s,
10100 * which we already have computed) */
10102 i_e = (start == end)
10104 : _invlist_search(invlist, end);
10107 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10108 * is a range that goes to infinity there is no element at invlist[i_e+1],
10109 * so only the first relation holds. */
10111 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10113 /* Here, the ranges on either side of the beginning of the new range
10114 * are in the set, and this range starts in the gap between them.
10116 * The new range extends the range above it downwards if the new range
10117 * ends at or above that range's start */
10118 const bool extends_the_range_above = ( end == UV_MAX
10119 || end + 1 >= array[i_s+1]);
10121 /* The new range extends the range below it upwards if it begins just
10122 * after where that range ends */
10123 if (start == array[i_s]) {
10125 /* If the new range fills the entire gap between the other ranges,
10126 * they will get merged together. Other ranges may also get
10127 * merged, depending on how many of them the new range spans. In
10128 * the general case, we do the merge later, just once, after we
10129 * figure out how many to merge. But in the case where the new
10130 * range exactly spans just this one gap (possibly extending into
10131 * the one above), we do the merge here, and an early exit. This
10132 * is done here to avoid having to special case later. */
10133 if (i_e - i_s <= 1) {
10135 /* If i_e - i_s == 1, it means that the new range terminates
10136 * within the range above, and hence 'extends_the_range_above'
10137 * must be true. (If the range above it extends to infinity,
10138 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10139 * will be 0, so no harm done.) */
10140 if (extends_the_range_above) {
10141 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10142 invlist_set_len(invlist,
10144 *(get_invlist_offset_addr(invlist)));
10148 /* Here, i_e must == i_s. We keep them in sync, as they apply
10149 * to the same range, and below we are about to decrement i_s
10154 /* Here, the new range is adjacent to the one below. (It may also
10155 * span beyond the range above, but that will get resolved later.)
10156 * Extend the range below to include this one. */
10157 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10159 start = array[i_s];
10161 else if (extends_the_range_above) {
10163 /* Here the new range only extends the range above it, but not the
10164 * one below. It merges with the one above. Again, we keep i_e
10165 * and i_s in sync if they point to the same range */
10170 array[i_s] = start;
10174 /* Here, we've dealt with the new range start extending any adjoining
10177 * If the new range extends to infinity, it is now the final one,
10178 * regardless of what was there before */
10179 if (UNLIKELY(end == UV_MAX)) {
10180 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10184 /* If i_e started as == i_s, it has also been dealt with,
10185 * and been updated to the new i_s, which will fail the following if */
10186 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10188 /* Here, the ranges on either side of the end of the new range are in
10189 * the set, and this range ends in the gap between them.
10191 * If this range is adjacent to (hence extends) the range above it, it
10192 * becomes part of that range; likewise if it extends the range below,
10193 * it becomes part of that range */
10194 if (end + 1 == array[i_e+1]) {
10196 array[i_e] = start;
10198 else if (start <= array[i_e]) {
10199 array[i_e] = end + 1;
10206 /* If the range fits entirely in an existing range (as possibly already
10207 * extended above), it doesn't add anything new */
10208 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10212 /* Here, no part of the range is in the list. Must add it. It will
10213 * occupy 2 more slots */
10214 splice_in_new_range:
10216 invlist_extend(invlist, len + 2);
10217 array = invlist_array(invlist);
10218 /* Move the rest of the array down two slots. Don't include any
10220 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10222 /* Do the actual splice */
10223 array[i_e+1] = start;
10224 array[i_e+2] = end + 1;
10225 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10229 /* Here the new range crossed the boundaries of a pre-existing range. The
10230 * code above has adjusted things so that both ends are in ranges that are
10231 * in the set. This means everything in between must also be in the set.
10232 * Just squash things together */
10233 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10234 invlist_set_len(invlist,
10236 *(get_invlist_offset_addr(invlist)));
10242 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10243 UV** other_elements_ptr)
10245 /* Create and return an inversion list whose contents are to be populated
10246 * by the caller. The caller gives the number of elements (in 'size') and
10247 * the very first element ('element0'). This function will set
10248 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10249 * are to be placed.
10251 * Obviously there is some trust involved that the caller will properly
10252 * fill in the other elements of the array.
10254 * (The first element needs to be passed in, as the underlying code does
10255 * things differently depending on whether it is zero or non-zero) */
10257 SV* invlist = _new_invlist(size);
10260 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10262 invlist = add_cp_to_invlist(invlist, element0);
10263 offset = *get_invlist_offset_addr(invlist);
10265 invlist_set_len(invlist, size, offset);
10266 *other_elements_ptr = invlist_array(invlist) + 1;
10272 #ifndef PERL_IN_XSUB_RE
10274 Perl__invlist_invert(pTHX_ SV* const invlist)
10276 /* Complement the input inversion list. This adds a 0 if the list didn't
10277 * have a zero; removes it otherwise. As described above, the data
10278 * structure is set up so that this is very efficient */
10280 PERL_ARGS_ASSERT__INVLIST_INVERT;
10282 assert(! invlist_is_iterating(invlist));
10284 /* The inverse of matching nothing is matching everything */
10285 if (_invlist_len(invlist) == 0) {
10286 _append_range_to_invlist(invlist, 0, UV_MAX);
10290 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10294 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10296 /* Return a new inversion list that is a copy of the input one, which is
10297 * unchanged. The new list will not be mortal even if the old one was. */
10299 const STRLEN nominal_length = _invlist_len(invlist);
10300 const STRLEN physical_length = SvCUR(invlist);
10301 const bool offset = *(get_invlist_offset_addr(invlist));
10303 PERL_ARGS_ASSERT_INVLIST_CLONE;
10305 if (new_invlist == NULL) {
10306 new_invlist = _new_invlist(nominal_length);
10309 sv_upgrade(new_invlist, SVt_INVLIST);
10310 initialize_invlist_guts(new_invlist, nominal_length);
10313 *(get_invlist_offset_addr(new_invlist)) = offset;
10314 invlist_set_len(new_invlist, nominal_length, offset);
10315 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10317 return new_invlist;
10322 PERL_STATIC_INLINE UV
10323 S_invlist_lowest(SV* const invlist)
10325 /* Returns the lowest code point that matches an inversion list. This API
10326 * has an ambiguity, as it returns 0 under either the lowest is actually
10327 * 0, or if the list is empty. If this distinction matters to you, check
10328 * for emptiness before calling this function */
10330 UV len = _invlist_len(invlist);
10333 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10339 array = invlist_array(invlist);
10345 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10347 /* Get the contents of an inversion list into a string SV so that they can
10348 * be printed out. If 'traditional_style' is TRUE, it uses the format
10349 * traditionally done for debug tracing; otherwise it uses a format
10350 * suitable for just copying to the output, with blanks between ranges and
10351 * a dash between range components */
10355 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10356 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10358 if (traditional_style) {
10359 output = newSVpvs("\n");
10362 output = newSVpvs("");
10365 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10367 assert(! invlist_is_iterating(invlist));
10369 invlist_iterinit(invlist);
10370 while (invlist_iternext(invlist, &start, &end)) {
10371 if (end == UV_MAX) {
10372 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10373 start, intra_range_delimiter,
10374 inter_range_delimiter);
10376 else if (end != start) {
10377 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10379 intra_range_delimiter,
10380 end, inter_range_delimiter);
10383 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10384 start, inter_range_delimiter);
10388 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10389 SvCUR_set(output, SvCUR(output) - 1);
10395 #ifndef PERL_IN_XSUB_RE
10397 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10398 const char * const indent, SV* const invlist)
10400 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10401 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10402 * the string 'indent'. The output looks like this:
10403 [0] 0x000A .. 0x000D
10405 [4] 0x2028 .. 0x2029
10406 [6] 0x3104 .. INFTY
10407 * This means that the first range of code points matched by the list are
10408 * 0xA through 0xD; the second range contains only the single code point
10409 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10410 * are used to define each range (except if the final range extends to
10411 * infinity, only a single element is needed). The array index of the
10412 * first element for the corresponding range is given in brackets. */
10417 PERL_ARGS_ASSERT__INVLIST_DUMP;
10419 if (invlist_is_iterating(invlist)) {
10420 Perl_dump_indent(aTHX_ level, file,
10421 "%sCan't dump inversion list because is in middle of iterating\n",
10426 invlist_iterinit(invlist);
10427 while (invlist_iternext(invlist, &start, &end)) {
10428 if (end == UV_MAX) {
10429 Perl_dump_indent(aTHX_ level, file,
10430 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10431 indent, (UV)count, start);
10433 else if (end != start) {
10434 Perl_dump_indent(aTHX_ level, file,
10435 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10436 indent, (UV)count, start, end);
10439 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10440 indent, (UV)count, start);
10448 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10450 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10452 /* Return a boolean as to if the two passed in inversion lists are
10453 * identical. The final argument, if TRUE, says to take the complement of
10454 * the second inversion list before doing the comparison */
10456 const UV len_a = _invlist_len(a);
10457 UV len_b = _invlist_len(b);
10459 const UV* array_a = NULL;
10460 const UV* array_b = NULL;
10462 PERL_ARGS_ASSERT__INVLISTEQ;
10464 /* This code avoids accessing the arrays unless it knows the length is
10469 return ! complement_b;
10473 array_a = invlist_array(a);
10477 array_b = invlist_array(b);
10480 /* If are to compare 'a' with the complement of b, set it
10481 * up so are looking at b's complement. */
10482 if (complement_b) {
10484 /* The complement of nothing is everything, so <a> would have to have
10485 * just one element, starting at zero (ending at infinity) */
10487 return (len_a == 1 && array_a[0] == 0);
10489 if (array_b[0] == 0) {
10491 /* Otherwise, to complement, we invert. Here, the first element is
10492 * 0, just remove it. To do this, we just pretend the array starts
10500 /* But if the first element is not zero, we pretend the list starts
10501 * at the 0 that is always stored immediately before the array. */
10507 return len_a == len_b
10508 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10514 * As best we can, determine the characters that can match the start of
10515 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
10516 * can be false positive matches
10518 * Returns the invlist as a new SV*; it is the caller's responsibility to
10519 * call SvREFCNT_dec() when done with it.
10522 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10525 const U8 * s = (U8*)STRING(node);
10526 SSize_t bytelen = STR_LEN(node);
10528 /* Start out big enough for 2 separate code points */
10529 SV* invlist = _new_invlist(4);
10531 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10536 /* We punt and assume can match anything if the node begins
10537 * with a multi-character fold. Things are complicated. For
10538 * example, /ffi/i could match any of:
10539 * "\N{LATIN SMALL LIGATURE FFI}"
10540 * "\N{LATIN SMALL LIGATURE FF}I"
10541 * "F\N{LATIN SMALL LIGATURE FI}"
10542 * plus several other things; and making sure we have all the
10543 * possibilities is hard. */
10544 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10545 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10548 /* Any Latin1 range character can potentially match any
10549 * other depending on the locale, and in Turkic locales, U+130 and
10551 if (OP(node) == EXACTFL) {
10552 _invlist_union(invlist, PL_Latin1, &invlist);
10553 invlist = add_cp_to_invlist(invlist,
10554 LATIN_SMALL_LETTER_DOTLESS_I);
10555 invlist = add_cp_to_invlist(invlist,
10556 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10559 /* But otherwise, it matches at least itself. We can
10560 * quickly tell if it has a distinct fold, and if so,
10561 * it matches that as well */
10562 invlist = add_cp_to_invlist(invlist, uc);
10563 if (IS_IN_SOME_FOLD_L1(uc))
10564 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10567 /* Some characters match above-Latin1 ones under /i. This
10568 * is true of EXACTFL ones when the locale is UTF-8 */
10569 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10570 && (! isASCII(uc) || (OP(node) != EXACTFAA
10571 && OP(node) != EXACTFAA_NO_TRIE)))
10573 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10577 else { /* Pattern is UTF-8 */
10578 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10579 const U8* e = s + bytelen;
10582 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10584 /* The only code points that aren't folded in a UTF EXACTFish
10585 * node are are the problematic ones in EXACTFL nodes */
10586 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10587 /* We need to check for the possibility that this EXACTFL
10588 * node begins with a multi-char fold. Therefore we fold
10589 * the first few characters of it so that we can make that
10595 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10597 *(d++) = (U8) toFOLD(*s);
10598 if (fc < 0) { /* Save the first fold */
10605 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10606 if (fc < 0) { /* Save the first fold */
10614 /* And set up so the code below that looks in this folded
10615 * buffer instead of the node's string */
10620 /* When we reach here 's' points to the fold of the first
10621 * character(s) of the node; and 'e' points to far enough along
10622 * the folded string to be just past any possible multi-char
10625 * Unlike the non-UTF-8 case, the macro for determining if a
10626 * string is a multi-char fold requires all the characters to
10627 * already be folded. This is because of all the complications
10628 * if not. Note that they are folded anyway, except in EXACTFL
10629 * nodes. Like the non-UTF case above, we punt if the node
10630 * begins with a multi-char fold */
10632 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10633 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10635 else { /* Single char fold */
10637 unsigned int first_fold;
10638 const unsigned int * remaining_folds;
10639 Size_t folds_count;
10641 /* It matches itself */
10642 invlist = add_cp_to_invlist(invlist, fc);
10644 /* ... plus all the things that fold to it, which are found in
10645 * PL_utf8_foldclosures */
10646 folds_count = _inverse_folds(fc, &first_fold,
10648 for (k = 0; k < folds_count; k++) {
10649 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10651 /* /aa doesn't allow folds between ASCII and non- */
10652 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10653 && isASCII(c) != isASCII(fc))
10658 invlist = add_cp_to_invlist(invlist, c);
10661 if (OP(node) == EXACTFL) {
10663 /* If either [iI] are present in an EXACTFL node the above code
10664 * should have added its normal case pair, but under a Turkish
10665 * locale they could match instead the case pairs from it. Add
10666 * those as potential matches as well */
10667 if (isALPHA_FOLD_EQ(fc, 'I')) {
10668 invlist = add_cp_to_invlist(invlist,
10669 LATIN_SMALL_LETTER_DOTLESS_I);
10670 invlist = add_cp_to_invlist(invlist,
10671 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10673 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10674 invlist = add_cp_to_invlist(invlist, 'I');
10676 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10677 invlist = add_cp_to_invlist(invlist, 'i');
10686 #undef HEADER_LENGTH
10687 #undef TO_INTERNAL_SIZE
10688 #undef FROM_INTERNAL_SIZE
10689 #undef INVLIST_VERSION_ID
10691 /* End of inversion list object */
10694 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10696 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10697 * constructs, and updates RExC_flags with them. On input, RExC_parse
10698 * should point to the first flag; it is updated on output to point to the
10699 * final ')' or ':'. There needs to be at least one flag, or this will
10702 /* for (?g), (?gc), and (?o) warnings; warning
10703 about (?c) will warn about (?g) -- japhy */
10705 #define WASTED_O 0x01
10706 #define WASTED_G 0x02
10707 #define WASTED_C 0x04
10708 #define WASTED_GC (WASTED_G|WASTED_C)
10709 I32 wastedflags = 0x00;
10710 U32 posflags = 0, negflags = 0;
10711 U32 *flagsp = &posflags;
10712 char has_charset_modifier = '\0';
10714 bool has_use_defaults = FALSE;
10715 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10716 int x_mod_count = 0;
10718 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10720 /* '^' as an initial flag sets certain defaults */
10721 if (UCHARAT(RExC_parse) == '^') {
10723 has_use_defaults = TRUE;
10724 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10725 cs = (RExC_uni_semantics)
10726 ? REGEX_UNICODE_CHARSET
10727 : REGEX_DEPENDS_CHARSET;
10728 set_regex_charset(&RExC_flags, cs);
10731 cs = get_regex_charset(RExC_flags);
10732 if ( cs == REGEX_DEPENDS_CHARSET
10733 && RExC_uni_semantics)
10735 cs = REGEX_UNICODE_CHARSET;
10739 while (RExC_parse < RExC_end) {
10740 /* && strchr("iogcmsx", *RExC_parse) */
10741 /* (?g), (?gc) and (?o) are useless here
10742 and must be globally applied -- japhy */
10743 switch (*RExC_parse) {
10745 /* Code for the imsxn flags */
10746 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10748 case LOCALE_PAT_MOD:
10749 if (has_charset_modifier) {
10750 goto excess_modifier;
10752 else if (flagsp == &negflags) {
10755 cs = REGEX_LOCALE_CHARSET;
10756 has_charset_modifier = LOCALE_PAT_MOD;
10758 case UNICODE_PAT_MOD:
10759 if (has_charset_modifier) {
10760 goto excess_modifier;
10762 else if (flagsp == &negflags) {
10765 cs = REGEX_UNICODE_CHARSET;
10766 has_charset_modifier = UNICODE_PAT_MOD;
10768 case ASCII_RESTRICT_PAT_MOD:
10769 if (flagsp == &negflags) {
10772 if (has_charset_modifier) {
10773 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10774 goto excess_modifier;
10776 /* Doubled modifier implies more restricted */
10777 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10780 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10782 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10784 case DEPENDS_PAT_MOD:
10785 if (has_use_defaults) {
10786 goto fail_modifiers;
10788 else if (flagsp == &negflags) {
10791 else if (has_charset_modifier) {
10792 goto excess_modifier;
10795 /* The dual charset means unicode semantics if the
10796 * pattern (or target, not known until runtime) are
10797 * utf8, or something in the pattern indicates unicode
10799 cs = (RExC_uni_semantics)
10800 ? REGEX_UNICODE_CHARSET
10801 : REGEX_DEPENDS_CHARSET;
10802 has_charset_modifier = DEPENDS_PAT_MOD;
10806 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10807 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10809 else if (has_charset_modifier == *(RExC_parse - 1)) {
10810 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10811 *(RExC_parse - 1));
10814 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10816 NOT_REACHED; /*NOTREACHED*/
10819 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10820 *(RExC_parse - 1));
10821 NOT_REACHED; /*NOTREACHED*/
10822 case ONCE_PAT_MOD: /* 'o' */
10823 case GLOBAL_PAT_MOD: /* 'g' */
10824 if (ckWARN(WARN_REGEXP)) {
10825 const I32 wflagbit = *RExC_parse == 'o'
10828 if (! (wastedflags & wflagbit) ) {
10829 wastedflags |= wflagbit;
10830 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10833 "Useless (%s%c) - %suse /%c modifier",
10834 flagsp == &negflags ? "?-" : "?",
10836 flagsp == &negflags ? "don't " : "",
10843 case CONTINUE_PAT_MOD: /* 'c' */
10844 if (ckWARN(WARN_REGEXP)) {
10845 if (! (wastedflags & WASTED_C) ) {
10846 wastedflags |= WASTED_GC;
10847 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10850 "Useless (%sc) - %suse /gc modifier",
10851 flagsp == &negflags ? "?-" : "?",
10852 flagsp == &negflags ? "don't " : ""
10857 case KEEPCOPY_PAT_MOD: /* 'p' */
10858 if (flagsp == &negflags) {
10859 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10861 *flagsp |= RXf_PMf_KEEPCOPY;
10865 /* A flag is a default iff it is following a minus, so
10866 * if there is a minus, it means will be trying to
10867 * re-specify a default which is an error */
10868 if (has_use_defaults || flagsp == &negflags) {
10869 goto fail_modifiers;
10871 flagsp = &negflags;
10872 wastedflags = 0; /* reset so (?g-c) warns twice */
10878 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10879 negflags |= RXf_PMf_EXTENDED_MORE;
10881 RExC_flags |= posflags;
10883 if (negflags & RXf_PMf_EXTENDED) {
10884 negflags |= RXf_PMf_EXTENDED_MORE;
10886 RExC_flags &= ~negflags;
10887 set_regex_charset(&RExC_flags, cs);
10892 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
10893 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10894 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10895 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10896 NOT_REACHED; /*NOTREACHED*/
10899 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10902 vFAIL("Sequence (?... not terminated");
10906 - reg - regular expression, i.e. main body or parenthesized thing
10908 * Caller must absorb opening parenthesis.
10910 * Combining parenthesis handling with the base level of regular expression
10911 * is a trifle forced, but the need to tie the tails of the branches to what
10912 * follows makes it hard to avoid.
10914 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10916 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10918 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10921 PERL_STATIC_INLINE regnode_offset
10922 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10924 char * parse_start,
10928 regnode_offset ret;
10929 char* name_start = RExC_parse;
10931 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
10932 GET_RE_DEBUG_FLAGS_DECL;
10934 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10936 if (RExC_parse == name_start || *RExC_parse != ch) {
10937 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
10938 vFAIL2("Sequence %.3s... not terminated", parse_start);
10942 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10943 RExC_rxi->data->data[num]=(void*)sv_dat;
10944 SvREFCNT_inc_simple_void_NN(sv_dat);
10947 ret = reganode(pRExC_state,
10950 : (ASCII_FOLD_RESTRICTED)
10952 : (AT_LEAST_UNI_SEMANTICS)
10958 *flagp |= HASWIDTH;
10960 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
10961 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
10963 nextchar(pRExC_state);
10967 /* On success, returns the offset at which any next node should be placed into
10968 * the regex engine program being compiled.
10970 * Returns 0 otherwise, with *flagp set to indicate why:
10971 * TRYAGAIN at the end of (?) that only sets flags.
10972 * RESTART_PARSE if the parse needs to be restarted, or'd with
10973 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
10974 * Otherwise would only return 0 if regbranch() returns 0, which cannot
10976 STATIC regnode_offset
10977 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
10978 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
10979 * 2 is like 1, but indicates that nextchar() has been called to advance
10980 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
10981 * this flag alerts us to the need to check for that */
10983 regnode_offset ret = 0; /* Will be the head of the group. */
10985 regnode_offset lastbr;
10986 regnode_offset ender = 0;
10989 U32 oregflags = RExC_flags;
10990 bool have_branch = 0;
10992 I32 freeze_paren = 0;
10993 I32 after_freeze = 0;
10994 I32 num; /* numeric backreferences */
10995 SV * max_open; /* Max number of unclosed parens */
10997 char * parse_start = RExC_parse; /* MJD */
10998 char * const oregcomp_parse = RExC_parse;
11000 GET_RE_DEBUG_FLAGS_DECL;
11002 PERL_ARGS_ASSERT_REG;
11003 DEBUG_PARSE("reg ");
11005 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11007 if (!SvIOK(max_open)) {
11008 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11010 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11012 vFAIL("Too many nested open parens");
11015 *flagp = 0; /* Tentatively. */
11017 if (RExC_in_lookbehind) {
11018 RExC_in_lookbehind++;
11020 if (RExC_in_lookahead) {
11021 RExC_in_lookahead++;
11024 /* Having this true makes it feasible to have a lot fewer tests for the
11025 * parse pointer being in scope. For example, we can write
11026 * while(isFOO(*RExC_parse)) RExC_parse++;
11028 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11030 assert(*RExC_end == '\0');
11032 /* Make an OPEN node, if parenthesized. */
11035 /* Under /x, space and comments can be gobbled up between the '(' and
11036 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11037 * intervening space, as the sequence is a token, and a token should be
11039 bool has_intervening_patws = (paren == 2)
11040 && *(RExC_parse - 1) != '(';
11042 if (RExC_parse >= RExC_end) {
11043 vFAIL("Unmatched (");
11046 if (paren == 'r') { /* Atomic script run */
11050 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11051 char *start_verb = RExC_parse + 1;
11053 char *start_arg = NULL;
11054 unsigned char op = 0;
11055 int arg_required = 0;
11056 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11057 bool has_upper = FALSE;
11059 if (has_intervening_patws) {
11060 RExC_parse++; /* past the '*' */
11062 /* For strict backwards compatibility, don't change the message
11063 * now that we also have lowercase operands */
11064 if (isUPPER(*RExC_parse)) {
11065 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11068 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11071 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11072 if ( *RExC_parse == ':' ) {
11073 start_arg = RExC_parse + 1;
11077 if (isUPPER(*RExC_parse)) {
11083 RExC_parse += UTF8SKIP(RExC_parse);
11086 verb_len = RExC_parse - start_verb;
11088 if (RExC_parse >= RExC_end) {
11089 goto unterminated_verb_pattern;
11092 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11093 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11094 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11096 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11097 unterminated_verb_pattern:
11099 vFAIL("Unterminated verb pattern argument");
11102 vFAIL("Unterminated '(*...' argument");
11106 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11108 vFAIL("Unterminated verb pattern");
11111 vFAIL("Unterminated '(*...' construct");
11116 /* Here, we know that RExC_parse < RExC_end */
11118 switch ( *start_verb ) {
11119 case 'A': /* (*ACCEPT) */
11120 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11122 internal_argval = RExC_nestroot;
11125 case 'C': /* (*COMMIT) */
11126 if ( memEQs(start_verb, verb_len,"COMMIT") )
11129 case 'F': /* (*FAIL) */
11130 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11134 case ':': /* (*:NAME) */
11135 case 'M': /* (*MARK:NAME) */
11136 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11141 case 'P': /* (*PRUNE) */
11142 if ( memEQs(start_verb, verb_len,"PRUNE") )
11145 case 'S': /* (*SKIP) */
11146 if ( memEQs(start_verb, verb_len,"SKIP") )
11149 case 'T': /* (*THEN) */
11150 /* [19:06] <TimToady> :: is then */
11151 if ( memEQs(start_verb, verb_len,"THEN") ) {
11153 RExC_seen |= REG_CUTGROUP_SEEN;
11157 if ( memEQs(start_verb, verb_len, "asr")
11158 || memEQs(start_verb, verb_len, "atomic_script_run"))
11160 paren = 'r'; /* Mnemonic: recursed run */
11163 else if (memEQs(start_verb, verb_len, "atomic")) {
11164 paren = 't'; /* AtOMIC */
11165 goto alpha_assertions;
11169 if ( memEQs(start_verb, verb_len, "plb")
11170 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11173 goto lookbehind_alpha_assertions;
11175 else if ( memEQs(start_verb, verb_len, "pla")
11176 || memEQs(start_verb, verb_len, "positive_lookahead"))
11179 goto alpha_assertions;
11183 if ( memEQs(start_verb, verb_len, "nlb")
11184 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11187 goto lookbehind_alpha_assertions;
11189 else if ( memEQs(start_verb, verb_len, "nla")
11190 || memEQs(start_verb, verb_len, "negative_lookahead"))
11193 goto alpha_assertions;
11197 if ( memEQs(start_verb, verb_len, "sr")
11198 || memEQs(start_verb, verb_len, "script_run"))
11200 regnode_offset atomic;
11206 /* This indicates Unicode rules. */
11207 REQUIRE_UNI_RULES(flagp, 0);
11213 RExC_parse = start_arg;
11215 if (RExC_in_script_run) {
11217 /* Nested script runs are treated as no-ops, because
11218 * if the nested one fails, the outer one must as
11219 * well. It could fail sooner, and avoid (??{} with
11220 * side effects, but that is explicitly documented as
11221 * undefined behavior. */
11225 if (paren == 's') {
11230 /* But, the atomic part of a nested atomic script run
11231 * isn't a no-op, but can be treated just like a '(?>'
11237 if (paren == 's') {
11238 /* Here, we're starting a new regular script run */
11239 ret = reg_node(pRExC_state, SROPEN);
11240 RExC_in_script_run = 1;
11245 /* Here, we are starting an atomic script run. This is
11246 * handled by recursing to deal with the atomic portion
11247 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11249 ret = reg_node(pRExC_state, SROPEN);
11251 RExC_in_script_run = 1;
11253 atomic = reg(pRExC_state, 'r', &flags, depth);
11254 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11255 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11259 if (! REGTAIL(pRExC_state, ret, atomic)) {
11260 REQUIRE_BRANCHJ(flagp, 0);
11263 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11266 REQUIRE_BRANCHJ(flagp, 0);
11269 RExC_in_script_run = 0;
11275 lookbehind_alpha_assertions:
11276 RExC_seen |= REG_LOOKBEHIND_SEEN;
11277 RExC_in_lookbehind++;
11282 RExC_seen_zerolen++;
11288 /* An empty negative lookahead assertion simply is failure */
11289 if (paren == 'A' && RExC_parse == start_arg) {
11290 ret=reganode(pRExC_state, OPFAIL, 0);
11291 nextchar(pRExC_state);
11295 RExC_parse = start_arg;
11300 "'(*%" UTF8f "' requires a terminating ':'",
11301 UTF8fARG(UTF, verb_len, start_verb));
11302 NOT_REACHED; /*NOTREACHED*/
11304 } /* End of switch */
11307 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11309 if (has_upper || verb_len == 0) {
11311 "Unknown verb pattern '%" UTF8f "'",
11312 UTF8fARG(UTF, verb_len, start_verb));
11316 "Unknown '(*...)' construct '%" UTF8f "'",
11317 UTF8fARG(UTF, verb_len, start_verb));
11320 if ( RExC_parse == start_arg ) {
11323 if ( arg_required && !start_arg ) {
11324 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11325 verb_len, start_verb);
11327 if (internal_argval == -1) {
11328 ret = reganode(pRExC_state, op, 0);
11330 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11332 RExC_seen |= REG_VERBARG_SEEN;
11334 SV *sv = newSVpvn( start_arg,
11335 RExC_parse - start_arg);
11336 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11337 STR_WITH_LEN("S"));
11338 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11339 FLAGS(REGNODE_p(ret)) = 1;
11341 FLAGS(REGNODE_p(ret)) = 0;
11343 if ( internal_argval != -1 )
11344 ARG2L_SET(REGNODE_p(ret), internal_argval);
11345 nextchar(pRExC_state);
11348 else if (*RExC_parse == '?') { /* (?...) */
11349 bool is_logical = 0;
11350 const char * const seqstart = RExC_parse;
11351 const char * endptr;
11352 if (has_intervening_patws) {
11354 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11357 RExC_parse++; /* past the '?' */
11358 paren = *RExC_parse; /* might be a trailing NUL, if not
11360 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11361 if (RExC_parse > RExC_end) {
11364 ret = 0; /* For look-ahead/behind. */
11367 case 'P': /* (?P...) variants for those used to PCRE/Python */
11368 paren = *RExC_parse;
11369 if ( paren == '<') { /* (?P<...>) named capture */
11371 if (RExC_parse >= RExC_end) {
11372 vFAIL("Sequence (?P<... not terminated");
11374 goto named_capture;
11376 else if (paren == '>') { /* (?P>name) named recursion */
11378 if (RExC_parse >= RExC_end) {
11379 vFAIL("Sequence (?P>... not terminated");
11381 goto named_recursion;
11383 else if (paren == '=') { /* (?P=...) named backref */
11385 return handle_named_backref(pRExC_state, flagp,
11388 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11389 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11390 vFAIL3("Sequence (%.*s...) not recognized",
11391 RExC_parse-seqstart, seqstart);
11392 NOT_REACHED; /*NOTREACHED*/
11393 case '<': /* (?<...) */
11394 if (*RExC_parse == '!')
11396 else if (*RExC_parse != '=')
11403 case '\'': /* (?'...') */
11404 name_start = RExC_parse;
11405 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11406 if ( RExC_parse == name_start
11407 || RExC_parse >= RExC_end
11408 || *RExC_parse != paren)
11410 vFAIL2("Sequence (?%c... not terminated",
11411 paren=='>' ? '<' : paren);
11416 if (!svname) /* shouldn't happen */
11418 "panic: reg_scan_name returned NULL");
11419 if (!RExC_paren_names) {
11420 RExC_paren_names= newHV();
11421 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11423 RExC_paren_name_list= newAV();
11424 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11427 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11429 sv_dat = HeVAL(he_str);
11431 /* croak baby croak */
11433 "panic: paren_name hash element allocation failed");
11434 } else if ( SvPOK(sv_dat) ) {
11435 /* (?|...) can mean we have dupes so scan to check
11436 its already been stored. Maybe a flag indicating
11437 we are inside such a construct would be useful,
11438 but the arrays are likely to be quite small, so
11439 for now we punt -- dmq */
11440 IV count = SvIV(sv_dat);
11441 I32 *pv = (I32*)SvPVX(sv_dat);
11443 for ( i = 0 ; i < count ; i++ ) {
11444 if ( pv[i] == RExC_npar ) {
11450 pv = (I32*)SvGROW(sv_dat,
11451 SvCUR(sv_dat) + sizeof(I32)+1);
11452 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11453 pv[count] = RExC_npar;
11454 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11457 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11458 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11461 SvIV_set(sv_dat, 1);
11464 /* Yes this does cause a memory leak in debugging Perls
11466 if (!av_store(RExC_paren_name_list,
11467 RExC_npar, SvREFCNT_inc_NN(svname)))
11468 SvREFCNT_dec_NN(svname);
11471 /*sv_dump(sv_dat);*/
11473 nextchar(pRExC_state);
11475 goto capturing_parens;
11478 RExC_seen |= REG_LOOKBEHIND_SEEN;
11479 RExC_in_lookbehind++;
11481 if (RExC_parse >= RExC_end) {
11482 vFAIL("Sequence (?... not terminated");
11484 RExC_seen_zerolen++;
11486 case '=': /* (?=...) */
11487 RExC_seen_zerolen++;
11488 RExC_in_lookahead++;
11490 case '!': /* (?!...) */
11491 RExC_seen_zerolen++;
11492 /* check if we're really just a "FAIL" assertion */
11493 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11494 FALSE /* Don't force to /x */ );
11495 if (*RExC_parse == ')') {
11496 ret=reganode(pRExC_state, OPFAIL, 0);
11497 nextchar(pRExC_state);
11501 case '|': /* (?|...) */
11502 /* branch reset, behave like a (?:...) except that
11503 buffers in alternations share the same numbers */
11505 after_freeze = freeze_paren = RExC_npar;
11507 /* XXX This construct currently requires an extra pass.
11508 * Investigation would be required to see if that could be
11510 REQUIRE_PARENS_PASS;
11512 case ':': /* (?:...) */
11513 case '>': /* (?>...) */
11515 case '$': /* (?$...) */
11516 case '@': /* (?@...) */
11517 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11519 case '0' : /* (?0) */
11520 case 'R' : /* (?R) */
11521 if (RExC_parse == RExC_end || *RExC_parse != ')')
11522 FAIL("Sequence (?R) not terminated");
11524 RExC_seen |= REG_RECURSE_SEEN;
11526 /* XXX These constructs currently require an extra pass.
11527 * It probably could be changed */
11528 REQUIRE_PARENS_PASS;
11530 *flagp |= POSTPONED;
11531 goto gen_recurse_regop;
11533 /* named and numeric backreferences */
11534 case '&': /* (?&NAME) */
11535 parse_start = RExC_parse - 1;
11538 SV *sv_dat = reg_scan_name(pRExC_state,
11539 REG_RSN_RETURN_DATA);
11540 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11542 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11543 vFAIL("Sequence (?&... not terminated");
11544 goto gen_recurse_regop;
11547 if (! inRANGE(RExC_parse[0], '1', '9')) {
11549 vFAIL("Illegal pattern");
11551 goto parse_recursion;
11553 case '-': /* (?-1) */
11554 if (! inRANGE(RExC_parse[0], '1', '9')) {
11555 RExC_parse--; /* rewind to let it be handled later */
11559 case '1': case '2': case '3': case '4': /* (?1) */
11560 case '5': case '6': case '7': case '8': case '9':
11561 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11564 bool is_neg = FALSE;
11566 parse_start = RExC_parse - 1; /* MJD */
11567 if (*RExC_parse == '-') {
11572 if (grok_atoUV(RExC_parse, &unum, &endptr)
11576 RExC_parse = (char*)endptr;
11580 /* Some limit for num? */
11584 if (*RExC_parse!=')')
11585 vFAIL("Expecting close bracket");
11588 if ( paren == '-' ) {
11590 Diagram of capture buffer numbering.
11591 Top line is the normal capture buffer numbers
11592 Bottom line is the negative indexing as from
11596 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11600 num = RExC_npar + num;
11603 /* It might be a forward reference; we can't fail until
11604 * we know, by completing the parse to get all the
11605 * groups, and then reparsing */
11606 if (ALL_PARENS_COUNTED) {
11608 vFAIL("Reference to nonexistent group");
11611 REQUIRE_PARENS_PASS;
11614 } else if ( paren == '+' ) {
11615 num = RExC_npar + num - 1;
11617 /* We keep track how many GOSUB items we have produced.
11618 To start off the ARG2L() of the GOSUB holds its "id",
11619 which is used later in conjunction with RExC_recurse
11620 to calculate the offset we need to jump for the GOSUB,
11621 which it will store in the final representation.
11622 We have to defer the actual calculation until much later
11623 as the regop may move.
11626 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11627 if (num >= RExC_npar) {
11629 /* It might be a forward reference; we can't fail until we
11630 * know, by completing the parse to get all the groups, and
11631 * then reparsing */
11632 if (ALL_PARENS_COUNTED) {
11633 if (num >= RExC_total_parens) {
11635 vFAIL("Reference to nonexistent group");
11639 REQUIRE_PARENS_PASS;
11642 RExC_recurse_count++;
11643 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11644 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11645 22, "| |", (int)(depth * 2 + 1), "",
11646 (UV)ARG(REGNODE_p(ret)),
11647 (IV)ARG2L(REGNODE_p(ret))));
11648 RExC_seen |= REG_RECURSE_SEEN;
11650 Set_Node_Length(REGNODE_p(ret),
11651 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11652 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11654 *flagp |= POSTPONED;
11655 assert(*RExC_parse == ')');
11656 nextchar(pRExC_state);
11661 case '?': /* (??...) */
11663 if (*RExC_parse != '{') {
11664 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11665 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11667 "Sequence (%" UTF8f "...) not recognized",
11668 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11669 NOT_REACHED; /*NOTREACHED*/
11671 *flagp |= POSTPONED;
11675 case '{': /* (?{...}) */
11678 struct reg_code_block *cb;
11681 RExC_seen_zerolen++;
11683 if ( !pRExC_state->code_blocks
11684 || pRExC_state->code_index
11685 >= pRExC_state->code_blocks->count
11686 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11687 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11690 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11691 FAIL("panic: Sequence (?{...}): no code block found\n");
11692 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11694 /* this is a pre-compiled code block (?{...}) */
11695 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11696 RExC_parse = RExC_start + cb->end;
11698 if (cb->src_regex) {
11699 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11700 RExC_rxi->data->data[n] =
11701 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11702 RExC_rxi->data->data[n+1] = (void*)o;
11705 n = add_data(pRExC_state,
11706 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11707 RExC_rxi->data->data[n] = (void*)o;
11709 pRExC_state->code_index++;
11710 nextchar(pRExC_state);
11713 regnode_offset eval;
11714 ret = reg_node(pRExC_state, LOGICAL);
11716 eval = reg2Lanode(pRExC_state, EVAL,
11719 /* for later propagation into (??{})
11721 RExC_flags & RXf_PMf_COMPILETIME
11723 FLAGS(REGNODE_p(ret)) = 2;
11724 if (! REGTAIL(pRExC_state, ret, eval)) {
11725 REQUIRE_BRANCHJ(flagp, 0);
11727 /* deal with the length of this later - MJD */
11730 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11731 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11732 Set_Node_Offset(REGNODE_p(ret), parse_start);
11735 case '(': /* (?(?{...})...) and (?(?=...)...) */
11738 const int DEFINE_len = sizeof("DEFINE") - 1;
11739 if ( RExC_parse < RExC_end - 1
11740 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11741 && ( RExC_parse[1] == '='
11742 || RExC_parse[1] == '!'
11743 || RExC_parse[1] == '<'
11744 || RExC_parse[1] == '{'))
11745 || ( RExC_parse[0] == '*' /* (?(*...)) */
11746 && ( memBEGINs(RExC_parse + 1,
11747 (Size_t) (RExC_end - (RExC_parse + 1)),
11749 || memBEGINs(RExC_parse + 1,
11750 (Size_t) (RExC_end - (RExC_parse + 1)),
11752 || memBEGINs(RExC_parse + 1,
11753 (Size_t) (RExC_end - (RExC_parse + 1)),
11755 || memBEGINs(RExC_parse + 1,
11756 (Size_t) (RExC_end - (RExC_parse + 1)),
11758 || memBEGINs(RExC_parse + 1,
11759 (Size_t) (RExC_end - (RExC_parse + 1)),
11760 "positive_lookahead:")
11761 || memBEGINs(RExC_parse + 1,
11762 (Size_t) (RExC_end - (RExC_parse + 1)),
11763 "positive_lookbehind:")
11764 || memBEGINs(RExC_parse + 1,
11765 (Size_t) (RExC_end - (RExC_parse + 1)),
11766 "negative_lookahead:")
11767 || memBEGINs(RExC_parse + 1,
11768 (Size_t) (RExC_end - (RExC_parse + 1)),
11769 "negative_lookbehind:"))))
11770 ) { /* Lookahead or eval. */
11772 regnode_offset tail;
11774 ret = reg_node(pRExC_state, LOGICAL);
11775 FLAGS(REGNODE_p(ret)) = 1;
11777 tail = reg(pRExC_state, 1, &flag, depth+1);
11778 RETURN_FAIL_ON_RESTART(flag, flagp);
11779 if (! REGTAIL(pRExC_state, ret, tail)) {
11780 REQUIRE_BRANCHJ(flagp, 0);
11784 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11785 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11787 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11788 char *name_start= RExC_parse++;
11790 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11791 if ( RExC_parse == name_start
11792 || RExC_parse >= RExC_end
11793 || *RExC_parse != ch)
11795 vFAIL2("Sequence (?(%c... not terminated",
11796 (ch == '>' ? '<' : ch));
11800 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11801 RExC_rxi->data->data[num]=(void*)sv_dat;
11802 SvREFCNT_inc_simple_void_NN(sv_dat);
11804 ret = reganode(pRExC_state, GROUPPN, num);
11805 goto insert_if_check_paren;
11807 else if (memBEGINs(RExC_parse,
11808 (STRLEN) (RExC_end - RExC_parse),
11811 ret = reganode(pRExC_state, DEFINEP, 0);
11812 RExC_parse += DEFINE_len;
11814 goto insert_if_check_paren;
11816 else if (RExC_parse[0] == 'R') {
11818 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11819 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11820 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11823 if (RExC_parse[0] == '0') {
11827 else if (inRANGE(RExC_parse[0], '1', '9')) {
11830 if (grok_atoUV(RExC_parse, &uv, &endptr)
11833 parno = (I32)uv + 1;
11834 RExC_parse = (char*)endptr;
11836 /* else "Switch condition not recognized" below */
11837 } else if (RExC_parse[0] == '&') {
11840 sv_dat = reg_scan_name(pRExC_state,
11841 REG_RSN_RETURN_DATA);
11843 parno = 1 + *((I32 *)SvPVX(sv_dat));
11845 ret = reganode(pRExC_state, INSUBP, parno);
11846 goto insert_if_check_paren;
11848 else if (inRANGE(RExC_parse[0], '1', '9')) {
11853 if (grok_atoUV(RExC_parse, &uv, &endptr)
11857 RExC_parse = (char*)endptr;
11860 vFAIL("panic: grok_atoUV returned FALSE");
11862 ret = reganode(pRExC_state, GROUPP, parno);
11864 insert_if_check_paren:
11865 if (UCHARAT(RExC_parse) != ')') {
11867 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11869 vFAIL("Switch condition not recognized");
11871 nextchar(pRExC_state);
11873 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
11876 REQUIRE_BRANCHJ(flagp, 0);
11878 br = regbranch(pRExC_state, &flags, 1, depth+1);
11880 RETURN_FAIL_ON_RESTART(flags,flagp);
11881 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11884 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
11887 REQUIRE_BRANCHJ(flagp, 0);
11889 c = UCHARAT(RExC_parse);
11890 nextchar(pRExC_state);
11891 if (flags&HASWIDTH)
11892 *flagp |= HASWIDTH;
11895 vFAIL("(?(DEFINE)....) does not allow branches");
11897 /* Fake one for optimizer. */
11898 lastbr = reganode(pRExC_state, IFTHEN, 0);
11900 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11901 RETURN_FAIL_ON_RESTART(flags, flagp);
11902 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11905 if (! REGTAIL(pRExC_state, ret, lastbr)) {
11906 REQUIRE_BRANCHJ(flagp, 0);
11908 if (flags&HASWIDTH)
11909 *flagp |= HASWIDTH;
11910 c = UCHARAT(RExC_parse);
11911 nextchar(pRExC_state);
11916 if (RExC_parse >= RExC_end)
11917 vFAIL("Switch (?(condition)... not terminated");
11919 vFAIL("Switch (?(condition)... contains too many branches");
11921 ender = reg_node(pRExC_state, TAIL);
11922 if (! REGTAIL(pRExC_state, br, ender)) {
11923 REQUIRE_BRANCHJ(flagp, 0);
11926 if (! REGTAIL(pRExC_state, lastbr, ender)) {
11927 REQUIRE_BRANCHJ(flagp, 0);
11929 if (! REGTAIL(pRExC_state,
11932 NEXTOPER(REGNODE_p(lastbr)))),
11935 REQUIRE_BRANCHJ(flagp, 0);
11939 if (! REGTAIL(pRExC_state, ret, ender)) {
11940 REQUIRE_BRANCHJ(flagp, 0);
11942 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
11943 RExC_size++; /* XXX WHY do we need this?!!
11944 For large programs it seems to be required
11945 but I can't figure out why. -- dmq*/
11950 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11952 vFAIL("Unknown switch condition (?(...))");
11954 case '[': /* (?[ ... ]) */
11955 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11957 case 0: /* A NUL */
11958 RExC_parse--; /* for vFAIL to print correctly */
11959 vFAIL("Sequence (? incomplete");
11963 if (RExC_strict) { /* [perl #132851] */
11964 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
11967 default: /* e.g., (?i) */
11968 RExC_parse = (char *) seqstart + 1;
11970 parse_lparen_question_flags(pRExC_state);
11971 if (UCHARAT(RExC_parse) != ':') {
11972 if (RExC_parse < RExC_end)
11973 nextchar(pRExC_state);
11978 nextchar(pRExC_state);
11983 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
11987 if (! ALL_PARENS_COUNTED) {
11988 /* If we are in our first pass through (and maybe only pass),
11989 * we need to allocate memory for the capturing parentheses
11993 if (!RExC_parens_buf_size) {
11994 /* first guess at number of parens we might encounter */
11995 RExC_parens_buf_size = 10;
11997 /* setup RExC_open_parens, which holds the address of each
11998 * OPEN tag, and to make things simpler for the 0 index the
11999 * start of the program - this is used later for offsets */
12000 Newxz(RExC_open_parens, RExC_parens_buf_size,
12002 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12004 /* setup RExC_close_parens, which holds the address of each
12005 * CLOSE tag, and to make things simpler for the 0 index
12006 * the end of the program - this is used later for offsets
12008 Newxz(RExC_close_parens, RExC_parens_buf_size,
12010 /* we dont know where end op starts yet, so we dont need to
12011 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12014 else if (RExC_npar > RExC_parens_buf_size) {
12015 I32 old_size = RExC_parens_buf_size;
12017 RExC_parens_buf_size *= 2;
12019 Renew(RExC_open_parens, RExC_parens_buf_size,
12021 Zero(RExC_open_parens + old_size,
12022 RExC_parens_buf_size - old_size, regnode_offset);
12024 Renew(RExC_close_parens, RExC_parens_buf_size,
12026 Zero(RExC_close_parens + old_size,
12027 RExC_parens_buf_size - old_size, regnode_offset);
12031 ret = reganode(pRExC_state, OPEN, parno);
12032 if (!RExC_nestroot)
12033 RExC_nestroot = parno;
12034 if (RExC_open_parens && !RExC_open_parens[parno])
12036 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12037 "%*s%*s Setting open paren #%" IVdf " to %d\n",
12038 22, "| |", (int)(depth * 2 + 1), "",
12040 RExC_open_parens[parno]= ret;
12043 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12044 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12047 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12056 /* Pick up the branches, linking them together. */
12057 parse_start = RExC_parse; /* MJD */
12058 br = regbranch(pRExC_state, &flags, 1, depth+1);
12060 /* branch_len = (paren != 0); */
12063 RETURN_FAIL_ON_RESTART(flags, flagp);
12064 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12066 if (*RExC_parse == '|') {
12067 if (RExC_use_BRANCHJ) {
12068 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12071 reginsert(pRExC_state, BRANCH, br, depth+1);
12072 Set_Node_Length(REGNODE_p(br), paren != 0);
12073 Set_Node_Offset_To_R(br, parse_start-RExC_start);
12077 else if (paren == ':') {
12078 *flagp |= flags&SIMPLE;
12080 if (is_open) { /* Starts with OPEN. */
12081 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12082 REQUIRE_BRANCHJ(flagp, 0);
12085 else if (paren != '?') /* Not Conditional */
12087 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12089 while (*RExC_parse == '|') {
12090 if (RExC_use_BRANCHJ) {
12093 ender = reganode(pRExC_state, LONGJMP, 0);
12095 /* Append to the previous. */
12096 shut_gcc_up = REGTAIL(pRExC_state,
12097 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12099 PERL_UNUSED_VAR(shut_gcc_up);
12101 nextchar(pRExC_state);
12102 if (freeze_paren) {
12103 if (RExC_npar > after_freeze)
12104 after_freeze = RExC_npar;
12105 RExC_npar = freeze_paren;
12107 br = regbranch(pRExC_state, &flags, 0, depth+1);
12110 RETURN_FAIL_ON_RESTART(flags, flagp);
12111 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12113 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12114 REQUIRE_BRANCHJ(flagp, 0);
12117 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12120 if (have_branch || paren != ':') {
12123 /* Make a closing node, and hook it on the end. */
12126 ender = reg_node(pRExC_state, TAIL);
12129 ender = reganode(pRExC_state, CLOSE, parno);
12130 if ( RExC_close_parens ) {
12131 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12132 "%*s%*s Setting close paren #%" IVdf " to %d\n",
12133 22, "| |", (int)(depth * 2 + 1), "",
12134 (IV)parno, ender));
12135 RExC_close_parens[parno]= ender;
12136 if (RExC_nestroot == parno)
12139 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12140 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12143 ender = reg_node(pRExC_state, SRCLOSE);
12144 RExC_in_script_run = 0;
12154 *flagp &= ~HASWIDTH;
12156 case 't': /* aTomic */
12158 ender = reg_node(pRExC_state, SUCCEED);
12161 ender = reg_node(pRExC_state, END);
12162 assert(!RExC_end_op); /* there can only be one! */
12163 RExC_end_op = REGNODE_p(ender);
12164 if (RExC_close_parens) {
12165 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12166 "%*s%*s Setting close paren #0 (END) to %d\n",
12167 22, "| |", (int)(depth * 2 + 1), "",
12170 RExC_close_parens[0]= ender;
12175 DEBUG_PARSE_MSG("lsbr");
12176 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12177 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12178 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12179 SvPV_nolen_const(RExC_mysv1),
12181 SvPV_nolen_const(RExC_mysv2),
12183 (IV)(ender - lastbr)
12186 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12187 REQUIRE_BRANCHJ(flagp, 0);
12191 char is_nothing= 1;
12193 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12195 /* Hook the tails of the branches to the closing node. */
12196 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12197 const U8 op = PL_regkind[OP(br)];
12198 if (op == BRANCH) {
12199 if (! REGTAIL_STUDY(pRExC_state,
12200 REGNODE_OFFSET(NEXTOPER(br)),
12203 REQUIRE_BRANCHJ(flagp, 0);
12205 if ( OP(NEXTOPER(br)) != NOTHING
12206 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12209 else if (op == BRANCHJ) {
12210 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12211 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12213 PERL_UNUSED_VAR(shut_gcc_up);
12214 /* for now we always disable this optimisation * /
12215 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12216 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12222 regnode * ret_as_regnode = REGNODE_p(ret);
12223 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12224 ? regnext(ret_as_regnode)
12227 DEBUG_PARSE_MSG("NADA");
12228 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12229 NULL, pRExC_state);
12230 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12231 NULL, pRExC_state);
12232 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12233 SvPV_nolen_const(RExC_mysv1),
12234 (IV)REG_NODE_NUM(ret_as_regnode),
12235 SvPV_nolen_const(RExC_mysv2),
12241 if (OP(REGNODE_p(ender)) == TAIL) {
12243 RExC_emit= REGNODE_OFFSET(br) + 1;
12246 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12247 OP(opt)= OPTIMIZED;
12248 NEXT_OFF(br)= REGNODE_p(ender) - br;
12256 /* Even/odd or x=don't care: 010101x10x */
12257 static const char parens[] = "=!aA<,>Bbt";
12258 /* flag below is set to 0 up through 'A'; 1 for larger */
12260 if (paren && (p = strchr(parens, paren))) {
12261 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12262 int flag = (p - parens) > 3;
12264 if (paren == '>' || paren == 't') {
12265 node = SUSPEND, flag = 0;
12268 reginsert(pRExC_state, node, ret, depth+1);
12269 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12270 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12271 FLAGS(REGNODE_p(ret)) = flag;
12272 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12274 REQUIRE_BRANCHJ(flagp, 0);
12279 /* Check for proper termination. */
12281 /* restore original flags, but keep (?p) and, if we've encountered
12282 * something in the parse that changes /d rules into /u, keep the /u */
12283 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12284 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12285 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12287 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12288 RExC_parse = oregcomp_parse;
12289 vFAIL("Unmatched (");
12291 nextchar(pRExC_state);
12293 else if (!paren && RExC_parse < RExC_end) {
12294 if (*RExC_parse == ')') {
12296 vFAIL("Unmatched )");
12299 FAIL("Junk on end of regexp"); /* "Can't happen". */
12300 NOT_REACHED; /* NOTREACHED */
12303 if (RExC_in_lookbehind) {
12304 RExC_in_lookbehind--;
12306 if (RExC_in_lookahead) {
12307 RExC_in_lookahead--;
12309 if (after_freeze > RExC_npar)
12310 RExC_npar = after_freeze;
12315 - regbranch - one alternative of an | operator
12317 * Implements the concatenation operator.
12319 * On success, returns the offset at which any next node should be placed into
12320 * the regex engine program being compiled.
12322 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12323 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12326 STATIC regnode_offset
12327 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12329 regnode_offset ret;
12330 regnode_offset chain = 0;
12331 regnode_offset latest;
12332 I32 flags = 0, c = 0;
12333 GET_RE_DEBUG_FLAGS_DECL;
12335 PERL_ARGS_ASSERT_REGBRANCH;
12337 DEBUG_PARSE("brnc");
12342 if (RExC_use_BRANCHJ)
12343 ret = reganode(pRExC_state, BRANCHJ, 0);
12345 ret = reg_node(pRExC_state, BRANCH);
12346 Set_Node_Length(REGNODE_p(ret), 1);
12350 *flagp = WORST; /* Tentatively. */
12352 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12353 FALSE /* Don't force to /x */ );
12354 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12355 flags &= ~TRYAGAIN;
12356 latest = regpiece(pRExC_state, &flags, depth+1);
12358 if (flags & TRYAGAIN)
12360 RETURN_FAIL_ON_RESTART(flags, flagp);
12361 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12365 *flagp |= flags&(HASWIDTH|POSTPONED);
12366 if (chain == 0) /* First piece. */
12367 *flagp |= flags&SPSTART;
12369 /* FIXME adding one for every branch after the first is probably
12370 * excessive now we have TRIE support. (hv) */
12372 if (! REGTAIL(pRExC_state, chain, latest)) {
12373 /* XXX We could just redo this branch, but figuring out what
12374 * bookkeeping needs to be reset is a pain, and it's likely
12375 * that other branches that goto END will also be too large */
12376 REQUIRE_BRANCHJ(flagp, 0);
12382 if (chain == 0) { /* Loop ran zero times. */
12383 chain = reg_node(pRExC_state, NOTHING);
12388 *flagp |= flags&SIMPLE;
12395 - regpiece - something followed by possible quantifier * + ? {n,m}
12397 * Note that the branching code sequences used for ? and the general cases
12398 * of * and + are somewhat optimized: they use the same NOTHING node as
12399 * both the endmarker for their branch list and the body of the last branch.
12400 * It might seem that this node could be dispensed with entirely, but the
12401 * endmarker role is not redundant.
12403 * On success, returns the offset at which any next node should be placed into
12404 * the regex engine program being compiled.
12406 * Returns 0 otherwise, with *flagp set to indicate why:
12407 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12408 * RESTART_PARSE if the parse needs to be restarted, or'd with
12409 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12411 STATIC regnode_offset
12412 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12414 regnode_offset ret;
12418 const char * const origparse = RExC_parse;
12420 I32 max = REG_INFTY;
12421 #ifdef RE_TRACK_PATTERN_OFFSETS
12424 const char *maxpos = NULL;
12427 /* Save the original in case we change the emitted regop to a FAIL. */
12428 const regnode_offset orig_emit = RExC_emit;
12430 GET_RE_DEBUG_FLAGS_DECL;
12432 PERL_ARGS_ASSERT_REGPIECE;
12434 DEBUG_PARSE("piec");
12436 ret = regatom(pRExC_state, &flags, depth+1);
12438 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12439 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12444 if (op == '{' && regcurly(RExC_parse)) {
12446 #ifdef RE_TRACK_PATTERN_OFFSETS
12447 parse_start = RExC_parse; /* MJD */
12449 next = RExC_parse + 1;
12450 while (isDIGIT(*next) || *next == ',') {
12451 if (*next == ',') {
12459 if (*next == '}') { /* got one */
12460 const char* endptr;
12464 if (isDIGIT(*RExC_parse)) {
12466 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12467 vFAIL("Invalid quantifier in {,}");
12468 if (uv >= REG_INFTY)
12469 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12474 if (*maxpos == ',')
12477 maxpos = RExC_parse;
12478 if (isDIGIT(*maxpos)) {
12480 if (!grok_atoUV(maxpos, &uv, &endptr))
12481 vFAIL("Invalid quantifier in {,}");
12482 if (uv >= REG_INFTY)
12483 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12486 max = REG_INFTY; /* meaning "infinity" */
12489 nextchar(pRExC_state);
12490 if (max < min) { /* If can't match, warn and optimize to fail
12492 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12493 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12494 NEXT_OFF(REGNODE_p(orig_emit)) =
12495 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12498 else if (min == max && *RExC_parse == '?')
12500 ckWARN2reg(RExC_parse + 1,
12501 "Useless use of greediness modifier '%c'",
12506 if ((flags&SIMPLE)) {
12507 if (min == 0 && max == REG_INFTY) {
12508 reginsert(pRExC_state, STAR, ret, depth+1);
12510 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12513 if (min == 1 && max == REG_INFTY) {
12514 reginsert(pRExC_state, PLUS, ret, depth+1);
12516 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12519 MARK_NAUGHTY_EXP(2, 2);
12520 reginsert(pRExC_state, CURLY, ret, depth+1);
12521 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12522 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12525 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12527 FLAGS(REGNODE_p(w)) = 0;
12528 if (! REGTAIL(pRExC_state, ret, w)) {
12529 REQUIRE_BRANCHJ(flagp, 0);
12531 if (RExC_use_BRANCHJ) {
12532 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12533 reginsert(pRExC_state, NOTHING, ret, depth+1);
12534 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12536 reginsert(pRExC_state, CURLYX, ret, depth+1);
12538 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12539 Set_Node_Length(REGNODE_p(ret),
12540 op == '{' ? (RExC_parse - parse_start) : 1);
12542 if (RExC_use_BRANCHJ)
12543 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12545 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12548 REQUIRE_BRANCHJ(flagp, 0);
12550 RExC_whilem_seen++;
12551 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12553 FLAGS(REGNODE_p(ret)) = 0;
12558 *flagp |= HASWIDTH;
12559 ARG1_SET(REGNODE_p(ret), (U16)min);
12560 ARG2_SET(REGNODE_p(ret), (U16)max);
12561 if (max == REG_INFTY)
12562 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12568 if (!ISMULT1(op)) {
12573 #if 0 /* Now runtime fix should be reliable. */
12575 /* if this is reinstated, don't forget to put this back into perldiag:
12577 =item Regexp *+ operand could be empty at {#} in regex m/%s/
12579 (F) The part of the regexp subject to either the * or + quantifier
12580 could match an empty string. The {#} shows in the regular
12581 expression about where the problem was discovered.
12585 if (!(flags&HASWIDTH) && op != '?')
12586 vFAIL("Regexp *+ operand could be empty");
12589 #ifdef RE_TRACK_PATTERN_OFFSETS
12590 parse_start = RExC_parse;
12592 nextchar(pRExC_state);
12594 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12600 else if (op == '+') {
12604 else if (op == '?') {
12609 if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12610 ckWARN2reg(RExC_parse,
12611 "%" UTF8f " matches null string many times",
12612 UTF8fARG(UTF, (RExC_parse >= origparse
12613 ? RExC_parse - origparse
12618 if (*RExC_parse == '?') {
12619 nextchar(pRExC_state);
12620 reginsert(pRExC_state, MINMOD, ret, depth+1);
12621 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12622 REQUIRE_BRANCHJ(flagp, 0);
12625 else if (*RExC_parse == '+') {
12626 regnode_offset ender;
12627 nextchar(pRExC_state);
12628 ender = reg_node(pRExC_state, SUCCEED);
12629 if (! REGTAIL(pRExC_state, ret, ender)) {
12630 REQUIRE_BRANCHJ(flagp, 0);
12632 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12633 ender = reg_node(pRExC_state, TAIL);
12634 if (! REGTAIL(pRExC_state, ret, ender)) {
12635 REQUIRE_BRANCHJ(flagp, 0);
12639 if (ISMULT2(RExC_parse)) {
12641 vFAIL("Nested quantifiers");
12648 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12649 regnode_offset * node_p,
12657 /* This routine teases apart the various meanings of \N and returns
12658 * accordingly. The input parameters constrain which meaning(s) is/are valid
12659 * in the current context.
12661 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12663 * If <code_point_p> is not NULL, the context is expecting the result to be a
12664 * single code point. If this \N instance turns out to a single code point,
12665 * the function returns TRUE and sets *code_point_p to that code point.
12667 * If <node_p> is not NULL, the context is expecting the result to be one of
12668 * the things representable by a regnode. If this \N instance turns out to be
12669 * one such, the function generates the regnode, returns TRUE and sets *node_p
12670 * to point to the offset of that regnode into the regex engine program being
12673 * If this instance of \N isn't legal in any context, this function will
12674 * generate a fatal error and not return.
12676 * On input, RExC_parse should point to the first char following the \N at the
12677 * time of the call. On successful return, RExC_parse will have been updated
12678 * to point to just after the sequence identified by this routine. Also
12679 * *flagp has been updated as needed.
12681 * When there is some problem with the current context and this \N instance,
12682 * the function returns FALSE, without advancing RExC_parse, nor setting
12683 * *node_p, nor *code_point_p, nor *flagp.
12685 * If <cp_count> is not NULL, the caller wants to know the length (in code
12686 * points) that this \N sequence matches. This is set, and the input is
12687 * parsed for errors, even if the function returns FALSE, as detailed below.
12689 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12691 * Probably the most common case is for the \N to specify a single code point.
12692 * *cp_count will be set to 1, and *code_point_p will be set to that code
12695 * Another possibility is for the input to be an empty \N{}. This is no
12696 * longer accepted, and will generate a fatal error.
12698 * Another possibility is for a custom charnames handler to be in effect which
12699 * translates the input name to an empty string. *cp_count will be set to 0.
12700 * *node_p will be set to a generated NOTHING node.
12702 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12703 * set to 0. *node_p will be set to a generated REG_ANY node.
12705 * The fifth possibility is that \N resolves to a sequence of more than one
12706 * code points. *cp_count will be set to the number of code points in the
12707 * sequence. *node_p will be set to a generated node returned by this
12708 * function calling S_reg().
12710 * The final possibility is that it is premature to be calling this function;
12711 * the parse needs to be restarted. This can happen when this changes from
12712 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12713 * latter occurs only when the fifth possibility would otherwise be in
12714 * effect, and is because one of those code points requires the pattern to be
12715 * recompiled as UTF-8. The function returns FALSE, and sets the
12716 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12717 * happens, the caller needs to desist from continuing parsing, and return
12718 * this information to its caller. This is not set for when there is only one
12719 * code point, as this can be called as part of an ANYOF node, and they can
12720 * store above-Latin1 code points without the pattern having to be in UTF-8.
12722 * For non-single-quoted regexes, the tokenizer has resolved character and
12723 * sequence names inside \N{...} into their Unicode values, normalizing the
12724 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12725 * hex-represented code points in the sequence. This is done there because
12726 * the names can vary based on what charnames pragma is in scope at the time,
12727 * so we need a way to take a snapshot of what they resolve to at the time of
12728 * the original parse. [perl #56444].
12730 * That parsing is skipped for single-quoted regexes, so here we may get
12731 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
12732 * like '\N{U+41}', that code point is Unicode, and has to be translated into
12733 * the native character set for non-ASCII platforms. The other possibilities
12734 * are already native, so no translation is done. */
12736 char * endbrace; /* points to '}' following the name */
12737 char* p = RExC_parse; /* Temporary */
12739 SV * substitute_parse = NULL;
12744 GET_RE_DEBUG_FLAGS_DECL;
12746 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12748 GET_RE_DEBUG_FLAGS;
12750 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12751 assert(! (node_p && cp_count)); /* At most 1 should be set */
12753 if (cp_count) { /* Initialize return for the most common case */
12757 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12758 * modifier. The other meanings do not, so use a temporary until we find
12759 * out which we are being called with */
12760 skip_to_be_ignored_text(pRExC_state, &p,
12761 FALSE /* Don't force to /x */ );
12763 /* Disambiguate between \N meaning a named character versus \N meaning
12764 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12765 * quantifier, or if there is no '{' at all */
12766 if (*p != '{' || regcurly(p)) {
12776 *node_p = reg_node(pRExC_state, REG_ANY);
12777 *flagp |= HASWIDTH|SIMPLE;
12779 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12783 /* The test above made sure that the next real character is a '{', but
12784 * under the /x modifier, it could be separated by space (or a comment and
12785 * \n) and this is not allowed (for consistency with \x{...} and the
12786 * tokenizer handling of \N{NAME}). */
12787 if (*RExC_parse != '{') {
12788 vFAIL("Missing braces on \\N{}");
12791 RExC_parse++; /* Skip past the '{' */
12793 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12794 if (! endbrace) { /* no trailing brace */
12795 vFAIL2("Missing right brace on \\%c{}", 'N');
12798 /* Here, we have decided it should be a named character or sequence. These
12799 * imply Unicode semantics */
12800 REQUIRE_UNI_RULES(flagp, FALSE);
12802 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12803 * nothing at all (not allowed under strict) */
12804 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12805 RExC_parse = endbrace;
12807 RExC_parse++; /* Position after the "}" */
12808 vFAIL("Zero length \\N{}");
12814 nextchar(pRExC_state);
12819 *node_p = reg_node(pRExC_state, NOTHING);
12823 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
12825 /* Here, the name isn't of the form U+.... This can happen if the
12826 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
12827 * is the time to find out what the name means */
12829 const STRLEN name_len = endbrace - RExC_parse;
12830 SV * value_sv; /* What does this name evaluate to */
12832 const U8 * value; /* string of name's value */
12833 STRLEN value_len; /* and its length */
12835 /* RExC_unlexed_names is a hash of names that weren't evaluated by
12836 * toke.c, and their values. Make sure is initialized */
12837 if (! RExC_unlexed_names) {
12838 RExC_unlexed_names = newHV();
12841 /* If we have already seen this name in this pattern, use that. This
12842 * allows us to only call the charnames handler once per name per
12843 * pattern. A broken or malicious handler could return something
12844 * different each time, which could cause the results to vary depending
12845 * on if something gets added or subtracted from the pattern that
12846 * causes the number of passes to change, for example */
12847 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
12850 value_sv = *value_svp;
12852 else { /* Otherwise we have to go out and get the name */
12853 const char * error_msg = NULL;
12854 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
12858 RExC_parse = endbrace;
12862 /* If no error message, should have gotten a valid return */
12865 /* Save the name's meaning for later use */
12866 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
12869 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
12873 /* Here, we have the value the name evaluates to in 'value_sv' */
12874 value = (U8 *) SvPV(value_sv, value_len);
12876 /* See if the result is one code point vs 0 or multiple */
12877 if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
12881 /* Here, exactly one code point. If that isn't what is wanted,
12883 if (! code_point_p) {
12888 /* Convert from string to numeric code point */
12889 *code_point_p = (SvUTF8(value_sv))
12890 ? valid_utf8_to_uvchr(value, NULL)
12893 /* Have parsed this entire single code point \N{...}. *cp_count
12894 * has already been set to 1, so don't do it again. */
12895 RExC_parse = endbrace;
12896 nextchar(pRExC_state);
12898 } /* End of is a single code point */
12900 /* Count the code points, if caller desires. The API says to do this
12901 * even if we will later return FALSE */
12905 *cp_count = (SvUTF8(value_sv))
12906 ? utf8_length(value, value + value_len)
12910 /* Fail if caller doesn't want to handle a multi-code-point sequence.
12911 * But don't back the pointer up if the caller wants to know how many
12912 * code points there are (they need to handle it themselves in this
12921 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
12922 * reg recursively to parse it. That way, it retains its atomicness,
12923 * while not having to worry about any special handling that some code
12924 * points may have. */
12926 substitute_parse = newSVpvs("?:");
12927 sv_catsv(substitute_parse, value_sv);
12928 sv_catpv(substitute_parse, ")");
12930 /* The value should already be native, so no need to convert on EBCDIC
12932 assert(! RExC_recode_x_to_native);
12935 else { /* \N{U+...} */
12936 Size_t count = 0; /* code point count kept internally */
12938 /* We can get to here when the input is \N{U+...} or when toke.c has
12939 * converted a name to the \N{U+...} form. This include changing a
12940 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12942 RExC_parse += 2; /* Skip past the 'U+' */
12944 /* Code points are separated by dots. The '}' terminates the whole
12947 do { /* Loop until the ending brace */
12949 char * start_digit; /* The first of the current code point */
12950 if (! isXDIGIT(*RExC_parse)) {
12952 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12955 start_digit = RExC_parse;
12958 /* Loop through the hex digits of the current code point */
12960 /* Adding this digit will shift the result 4 bits. If that
12961 * result would be above the legal max, it's overflow */
12962 if (cp > MAX_LEGAL_CP >> 4) {
12964 /* Find the end of the code point */
12967 } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12969 /* Be sure to synchronize this message with the similar one
12971 vFAIL4("Use of code point 0x%.*s is not allowed; the"
12972 " permissible max is 0x%" UVxf,
12973 (int) (RExC_parse - start_digit), start_digit,
12977 /* Accumulate this (valid) digit into the running total */
12978 cp = (cp << 4) + READ_XDIGIT(RExC_parse);
12980 /* READ_XDIGIT advanced the input pointer. Ignore a single
12981 * underscore separator */
12982 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
12985 } while (isXDIGIT(*RExC_parse));
12987 /* Here, have accumulated the next code point */
12988 if (RExC_parse >= endbrace) { /* If done ... */
12993 /* Here, is a single code point; fail if doesn't want that */
12994 if (! code_point_p) {
12999 /* A single code point is easy to handle; just return it */
13000 *code_point_p = UNI_TO_NATIVE(cp);
13001 RExC_parse = endbrace;
13002 nextchar(pRExC_state);
13006 /* Here, the only legal thing would be a multiple character
13007 * sequence (of the form "\N{U+c1.c2. ... }". So the next
13008 * character must be a dot (and the one after that can't be the
13009 * endbrace, or we'd have something like \N{U+100.} ) */
13010 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13011 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
13012 ? UTF8SKIP(RExC_parse)
13014 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
13015 RExC_parse = endbrace;
13017 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13020 /* Here, looks like its really a multiple character sequence. Fail
13021 * if that's not what the caller wants. But continue with counting
13022 * and error checking if they still want a count */
13023 if (! node_p && ! cp_count) {
13027 /* What is done here is to convert this to a sub-pattern of the
13028 * form \x{char1}\x{char2}... and then call reg recursively to
13029 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13030 * atomicness, while not having to worry about special handling
13031 * that some code points may have. We don't create a subpattern,
13032 * but go through the motions of code point counting and error
13033 * checking, if the caller doesn't want a node returned. */
13035 if (node_p && count == 1) {
13036 substitute_parse = newSVpvs("?:");
13042 /* Convert to notation the rest of the code understands */
13043 sv_catpvs(substitute_parse, "\\x{");
13044 sv_catpvn(substitute_parse, start_digit,
13045 RExC_parse - start_digit);
13046 sv_catpvs(substitute_parse, "}");
13049 /* Move to after the dot (or ending brace the final time through.)
13054 } while (RExC_parse < endbrace);
13056 if (! node_p) { /* Doesn't want the node */
13063 sv_catpvs(substitute_parse, ")");
13065 /* The values are Unicode, and therefore have to be converted to native
13066 * on a non-Unicode (meaning non-ASCII) platform. */
13067 SET_recode_x_to_native(1);
13070 /* Here, we have the string the name evaluates to, ready to be parsed,
13071 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13072 * constructs. This can be called from within a substitute parse already.
13073 * The error reporting mechanism doesn't work for 2 levels of this, but the
13074 * code above has validated this new construct, so there should be no
13075 * errors generated by the below. And this isn' an exact copy, so the
13076 * mechanism to seamlessly deal with this won't work, so turn off warnings
13078 save_start = RExC_start;
13079 orig_end = RExC_end;
13081 RExC_parse = RExC_start = SvPVX(substitute_parse);
13082 RExC_end = RExC_parse + SvCUR(substitute_parse);
13083 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13085 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13087 /* Restore the saved values */
13089 RExC_start = save_start;
13090 RExC_parse = endbrace;
13091 RExC_end = orig_end;
13092 SET_recode_x_to_native(0);
13094 SvREFCNT_dec_NN(substitute_parse);
13097 RETURN_FAIL_ON_RESTART(flags, flagp);
13098 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13101 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13103 nextchar(pRExC_state);
13109 PERL_STATIC_INLINE U8
13110 S_compute_EXACTish(RExC_state_t *pRExC_state)
13114 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13122 op = get_regex_charset(RExC_flags);
13123 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13124 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13125 been, so there is no hole */
13128 return op + EXACTF;
13132 S_new_regcurly(const char *s, const char *e)
13134 /* This is a temporary function designed to match the most lenient form of
13135 * a {m,n} quantifier we ever envision, with either number omitted, and
13136 * spaces anywhere between/before/after them.
13138 * If this function fails, then the string it matches is very unlikely to
13139 * ever be considered a valid quantifier, so we can allow the '{' that
13140 * begins it to be considered as a literal */
13142 bool has_min = FALSE;
13143 bool has_max = FALSE;
13145 PERL_ARGS_ASSERT_NEW_REGCURLY;
13147 if (s >= e || *s++ != '{')
13150 while (s < e && isSPACE(*s)) {
13153 while (s < e && isDIGIT(*s)) {
13157 while (s < e && isSPACE(*s)) {
13163 while (s < e && isSPACE(*s)) {
13166 while (s < e && isDIGIT(*s)) {
13170 while (s < e && isSPACE(*s)) {
13175 return s < e && *s == '}' && (has_min || has_max);
13178 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13179 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13182 S_backref_value(char *p, char *e)
13184 const char* endptr = e;
13186 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13193 - regatom - the lowest level
13195 Try to identify anything special at the start of the current parse position.
13196 If there is, then handle it as required. This may involve generating a
13197 single regop, such as for an assertion; or it may involve recursing, such as
13198 to handle a () structure.
13200 If the string doesn't start with something special then we gobble up
13201 as much literal text as we can. If we encounter a quantifier, we have to
13202 back off the final literal character, as that quantifier applies to just it
13203 and not to the whole string of literals.
13205 Once we have been able to handle whatever type of thing started the
13206 sequence, we return the offset into the regex engine program being compiled
13207 at which any next regnode should be placed.
13209 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13210 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13211 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13212 Otherwise does not return 0.
13214 Note: we have to be careful with escapes, as they can be both literal
13215 and special, and in the case of \10 and friends, context determines which.
13217 A summary of the code structure is:
13219 switch (first_byte) {
13220 cases for each special:
13221 handle this special;
13224 switch (2nd byte) {
13225 cases for each unambiguous special:
13226 handle this special;
13228 cases for each ambigous special/literal:
13230 if (special) handle here
13232 default: // unambiguously literal:
13235 default: // is a literal char
13238 create EXACTish node for literal;
13239 while (more input and node isn't full) {
13240 switch (input_byte) {
13241 cases for each special;
13242 make sure parse pointer is set so that the next call to
13243 regatom will see this special first
13244 goto loopdone; // EXACTish node terminated by prev. char
13246 append char to EXACTISH node;
13248 get next input byte;
13252 return the generated node;
13254 Specifically there are two separate switches for handling
13255 escape sequences, with the one for handling literal escapes requiring
13256 a dummy entry for all of the special escapes that are actually handled
13261 STATIC regnode_offset
13262 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13265 regnode_offset ret = 0;
13271 GET_RE_DEBUG_FLAGS_DECL;
13273 *flagp = WORST; /* Tentatively. */
13275 DEBUG_PARSE("atom");
13277 PERL_ARGS_ASSERT_REGATOM;
13280 parse_start = RExC_parse;
13281 assert(RExC_parse < RExC_end);
13282 switch ((U8)*RExC_parse) {
13284 RExC_seen_zerolen++;
13285 nextchar(pRExC_state);
13286 if (RExC_flags & RXf_PMf_MULTILINE)
13287 ret = reg_node(pRExC_state, MBOL);
13289 ret = reg_node(pRExC_state, SBOL);
13290 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13293 nextchar(pRExC_state);
13295 RExC_seen_zerolen++;
13296 if (RExC_flags & RXf_PMf_MULTILINE)
13297 ret = reg_node(pRExC_state, MEOL);
13299 ret = reg_node(pRExC_state, SEOL);
13300 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13303 nextchar(pRExC_state);
13304 if (RExC_flags & RXf_PMf_SINGLELINE)
13305 ret = reg_node(pRExC_state, SANY);
13307 ret = reg_node(pRExC_state, REG_ANY);
13308 *flagp |= HASWIDTH|SIMPLE;
13310 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13314 char * const oregcomp_parse = ++RExC_parse;
13315 ret = regclass(pRExC_state, flagp, depth+1,
13316 FALSE, /* means parse the whole char class */
13317 TRUE, /* allow multi-char folds */
13318 FALSE, /* don't silence non-portable warnings. */
13319 (bool) RExC_strict,
13320 TRUE, /* Allow an optimized regnode result */
13323 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13324 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13327 if (*RExC_parse != ']') {
13328 RExC_parse = oregcomp_parse;
13329 vFAIL("Unmatched [");
13331 nextchar(pRExC_state);
13332 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13336 nextchar(pRExC_state);
13337 ret = reg(pRExC_state, 2, &flags, depth+1);
13339 if (flags & TRYAGAIN) {
13340 if (RExC_parse >= RExC_end) {
13341 /* Make parent create an empty node if needed. */
13342 *flagp |= TRYAGAIN;
13347 RETURN_FAIL_ON_RESTART(flags, flagp);
13348 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13351 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13355 if (flags & TRYAGAIN) {
13356 *flagp |= TRYAGAIN;
13359 vFAIL("Internal urp");
13360 /* Supposed to be caught earlier. */
13366 vFAIL("Quantifier follows nothing");
13371 This switch handles escape sequences that resolve to some kind
13372 of special regop and not to literal text. Escape sequences that
13373 resolve to literal text are handled below in the switch marked
13376 Every entry in this switch *must* have a corresponding entry
13377 in the literal escape switch. However, the opposite is not
13378 required, as the default for this switch is to jump to the
13379 literal text handling code.
13382 switch ((U8)*RExC_parse) {
13383 /* Special Escapes */
13385 RExC_seen_zerolen++;
13386 ret = reg_node(pRExC_state, SBOL);
13387 /* SBOL is shared with /^/ so we set the flags so we can tell
13388 * /\A/ from /^/ in split. */
13389 FLAGS(REGNODE_p(ret)) = 1;
13391 goto finish_meta_pat;
13393 ret = reg_node(pRExC_state, GPOS);
13394 RExC_seen |= REG_GPOS_SEEN;
13396 goto finish_meta_pat;
13398 if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13399 RExC_seen_zerolen++;
13400 ret = reg_node(pRExC_state, KEEPS);
13402 /* XXX:dmq : disabling in-place substitution seems to
13403 * be necessary here to avoid cases of memory corruption, as
13404 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13406 RExC_seen |= REG_LOOKBEHIND_SEEN;
13407 goto finish_meta_pat;
13410 ++RExC_parse; /* advance past the 'K' */
13411 vFAIL("\\K not permitted in lookahead/lookbehind");
13414 ret = reg_node(pRExC_state, SEOL);
13416 RExC_seen_zerolen++; /* Do not optimize RE away */
13417 goto finish_meta_pat;
13419 ret = reg_node(pRExC_state, EOS);
13421 RExC_seen_zerolen++; /* Do not optimize RE away */
13422 goto finish_meta_pat;
13424 vFAIL("\\C no longer supported");
13426 ret = reg_node(pRExC_state, CLUMP);
13427 *flagp |= HASWIDTH;
13428 goto finish_meta_pat;
13436 regex_charset charset = get_regex_charset(RExC_flags);
13438 RExC_seen_zerolen++;
13439 RExC_seen |= REG_LOOKBEHIND_SEEN;
13440 op = BOUND + charset;
13442 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13443 flags = TRADITIONAL_BOUND;
13444 if (op > BOUNDA) { /* /aa is same as /a */
13450 char name = *RExC_parse;
13451 char * endbrace = NULL;
13453 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13456 vFAIL2("Missing right brace on \\%c{}", name);
13458 /* XXX Need to decide whether to take spaces or not. Should be
13459 * consistent with \p{}, but that currently is SPACE, which
13460 * means vertical too, which seems wrong
13461 * while (isBLANK(*RExC_parse)) {
13464 if (endbrace == RExC_parse) {
13465 RExC_parse++; /* After the '}' */
13466 vFAIL2("Empty \\%c{}", name);
13468 length = endbrace - RExC_parse;
13469 /*while (isBLANK(*(RExC_parse + length - 1))) {
13472 switch (*RExC_parse) {
13475 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13477 goto bad_bound_type;
13482 if (length != 2 || *(RExC_parse + 1) != 'b') {
13483 goto bad_bound_type;
13488 if (length != 2 || *(RExC_parse + 1) != 'b') {
13489 goto bad_bound_type;
13494 if (length != 2 || *(RExC_parse + 1) != 'b') {
13495 goto bad_bound_type;
13501 RExC_parse = endbrace;
13503 "'%" UTF8f "' is an unknown bound type",
13504 UTF8fARG(UTF, length, endbrace - length));
13505 NOT_REACHED; /*NOTREACHED*/
13507 RExC_parse = endbrace;
13508 REQUIRE_UNI_RULES(flagp, 0);
13513 else if (op >= BOUNDA) { /* /aa is same as /a */
13517 /* Don't have to worry about UTF-8, in this message because
13518 * to get here the contents of the \b must be ASCII */
13519 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13520 "Using /u for '%.*s' instead of /%s",
13522 endbrace - length + 1,
13523 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13524 ? ASCII_RESTRICT_PAT_MODS
13525 : ASCII_MORE_RESTRICT_PAT_MODS);
13530 RExC_seen_d_op = TRUE;
13532 else if (op == BOUNDL) {
13533 RExC_contains_locale = 1;
13537 op += NBOUND - BOUND;
13540 ret = reg_node(pRExC_state, op);
13541 FLAGS(REGNODE_p(ret)) = flags;
13545 goto finish_meta_pat;
13549 ret = reg_node(pRExC_state, LNBREAK);
13550 *flagp |= HASWIDTH|SIMPLE;
13551 goto finish_meta_pat;
13565 /* These all have the same meaning inside [brackets], and it knows
13566 * how to do the best optimizations for them. So, pretend we found
13567 * these within brackets, and let it do the work */
13570 ret = regclass(pRExC_state, flagp, depth+1,
13571 TRUE, /* means just parse this element */
13572 FALSE, /* don't allow multi-char folds */
13573 FALSE, /* don't silence non-portable warnings. It
13574 would be a bug if these returned
13576 (bool) RExC_strict,
13577 TRUE, /* Allow an optimized regnode result */
13579 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13580 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13581 * multi-char folds are allowed. */
13583 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13586 RExC_parse--; /* regclass() leaves this one too far ahead */
13589 /* The escapes above that don't take a parameter can't be
13590 * followed by a '{'. But 'pX', 'p{foo}' and
13591 * correspondingly 'P' can be */
13592 if ( RExC_parse - parse_start == 1
13593 && UCHARAT(RExC_parse + 1) == '{'
13594 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13597 vFAIL("Unescaped left brace in regex is illegal here");
13599 Set_Node_Offset(REGNODE_p(ret), parse_start);
13600 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13601 nextchar(pRExC_state);
13604 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13605 * \N{...} evaluates to a sequence of more than one code points).
13606 * The function call below returns a regnode, which is our result.
13607 * The parameters cause it to fail if the \N{} evaluates to a
13608 * single code point; we handle those like any other literal. The
13609 * reason that the multicharacter case is handled here and not as
13610 * part of the EXACtish code is because of quantifiers. In
13611 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13612 * this way makes that Just Happen. dmq.
13613 * join_exact() will join this up with adjacent EXACTish nodes
13614 * later on, if appropriate. */
13616 if (grok_bslash_N(pRExC_state,
13617 &ret, /* Want a regnode returned */
13618 NULL, /* Fail if evaluates to a single code
13620 NULL, /* Don't need a count of how many code
13629 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13631 /* Here, evaluates to a single code point. Go get that */
13632 RExC_parse = parse_start;
13635 case 'k': /* Handle \k<NAME> and \k'NAME' */
13639 if ( RExC_parse >= RExC_end - 1
13640 || (( ch = RExC_parse[1]) != '<'
13645 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13646 vFAIL2("Sequence %.2s... not terminated", parse_start);
13649 ret = handle_named_backref(pRExC_state,
13661 case '1': case '2': case '3': case '4':
13662 case '5': case '6': case '7': case '8': case '9':
13667 if (*RExC_parse == 'g') {
13671 if (*RExC_parse == '{') {
13675 if (*RExC_parse == '-') {
13679 if (hasbrace && !isDIGIT(*RExC_parse)) {
13680 if (isrel) RExC_parse--;
13682 goto parse_named_seq;
13685 if (RExC_parse >= RExC_end) {
13686 goto unterminated_g;
13688 num = S_backref_value(RExC_parse, RExC_end);
13690 vFAIL("Reference to invalid group 0");
13691 else if (num == I32_MAX) {
13692 if (isDIGIT(*RExC_parse))
13693 vFAIL("Reference to nonexistent group");
13696 vFAIL("Unterminated \\g... pattern");
13700 num = RExC_npar - num;
13702 vFAIL("Reference to nonexistent or unclosed group");
13706 num = S_backref_value(RExC_parse, RExC_end);
13707 /* bare \NNN might be backref or octal - if it is larger
13708 * than or equal RExC_npar then it is assumed to be an
13709 * octal escape. Note RExC_npar is +1 from the actual
13710 * number of parens. */
13711 /* Note we do NOT check if num == I32_MAX here, as that is
13712 * handled by the RExC_npar check */
13715 /* any numeric escape < 10 is always a backref */
13717 /* any numeric escape < RExC_npar is a backref */
13718 && num >= RExC_npar
13719 /* cannot be an octal escape if it starts with 8 */
13720 && *RExC_parse != '8'
13721 /* cannot be an octal escape if it starts with 9 */
13722 && *RExC_parse != '9'
13724 /* Probably not meant to be a backref, instead likely
13725 * to be an octal character escape, e.g. \35 or \777.
13726 * The above logic should make it obvious why using
13727 * octal escapes in patterns is problematic. - Yves */
13728 RExC_parse = parse_start;
13733 /* At this point RExC_parse points at a numeric escape like
13734 * \12 or \88 or something similar, which we should NOT treat
13735 * as an octal escape. It may or may not be a valid backref
13736 * escape. For instance \88888888 is unlikely to be a valid
13738 while (isDIGIT(*RExC_parse))
13741 if (*RExC_parse != '}')
13742 vFAIL("Unterminated \\g{...} pattern");
13745 if (num >= (I32)RExC_npar) {
13747 /* It might be a forward reference; we can't fail until we
13748 * know, by completing the parse to get all the groups, and
13749 * then reparsing */
13750 if (ALL_PARENS_COUNTED) {
13751 if (num >= RExC_total_parens) {
13752 vFAIL("Reference to nonexistent group");
13756 REQUIRE_PARENS_PASS;
13760 ret = reganode(pRExC_state,
13763 : (ASCII_FOLD_RESTRICTED)
13765 : (AT_LEAST_UNI_SEMANTICS)
13771 if (OP(REGNODE_p(ret)) == REFF) {
13772 RExC_seen_d_op = TRUE;
13774 *flagp |= HASWIDTH;
13776 /* override incorrect value set in reganode MJD */
13777 Set_Node_Offset(REGNODE_p(ret), parse_start);
13778 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13779 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13780 FALSE /* Don't force to /x */ );
13784 if (RExC_parse >= RExC_end)
13785 FAIL("Trailing \\");
13788 /* Do not generate "unrecognized" warnings here, we fall
13789 back into the quick-grab loop below */
13790 RExC_parse = parse_start;
13792 } /* end of switch on a \foo sequence */
13797 /* '#' comments should have been spaced over before this function was
13799 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13801 if (RExC_flags & RXf_PMf_EXTENDED) {
13802 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13803 if (RExC_parse < RExC_end)
13813 /* Here, we have determined that the next thing is probably a
13814 * literal character. RExC_parse points to the first byte of its
13815 * definition. (It still may be an escape sequence that evaluates
13816 * to a single character) */
13821 char *s, *old_s = NULL, *old_old_s = NULL;
13823 U32 max_string_len = 255;
13825 /* We may have to reparse the node, artificially stopping filling
13826 * it early, based on info gleaned in the first parse. This
13827 * variable gives where we stop. Make it above the normal stopping
13828 * place first time through; otherwise it would stop too early */
13829 U32 upper_fill = max_string_len + 1;
13831 /* We start out as an EXACT node, even if under /i, until we find a
13832 * character which is in a fold. The algorithm now segregates into
13833 * separate nodes, characters that fold from those that don't under
13834 * /i. (This hopefully will create nodes that are fixed strings
13835 * even under /i, giving the optimizer something to grab on to.)
13836 * So, if a node has something in it and the next character is in
13837 * the opposite category, that node is closed up, and the function
13838 * returns. Then regatom is called again, and a new node is
13839 * created for the new category. */
13840 U8 node_type = EXACT;
13842 /* Assume the node will be fully used; the excess is given back at
13843 * the end. Under /i, we may need to temporarily add the fold of
13844 * an extra character or two at the end to check for splitting
13845 * multi-char folds, so allocate extra space for that. We can't
13846 * make any other length assumptions, as a byte input sequence
13847 * could shrink down. */
13848 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
13852 ? UTF8_MAXBYTES_CASE
13853 /* Max non-UTF-8 expansion is 2 */ : 2)));
13855 bool next_is_quantifier;
13856 char * oldp = NULL;
13858 /* We can convert EXACTF nodes to EXACTFU if they contain only
13859 * characters that match identically regardless of the target
13860 * string's UTF8ness. The reason to do this is that EXACTF is not
13861 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
13864 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13865 * contain only above-Latin1 characters (hence must be in UTF8),
13866 * which don't participate in folds with Latin1-range characters,
13867 * as the latter's folds aren't known until runtime. */
13868 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13870 /* Single-character EXACTish nodes are almost always SIMPLE. This
13871 * allows us to override this as encountered */
13872 U8 maybe_SIMPLE = SIMPLE;
13874 /* Does this node contain something that can't match unless the
13875 * target string is (also) in UTF-8 */
13876 bool requires_utf8_target = FALSE;
13878 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
13879 bool has_ss = FALSE;
13881 /* So is the MICRO SIGN */
13882 bool has_micro_sign = FALSE;
13884 /* Set when we fill up the current node and there is still more
13885 * text to process */
13888 /* Allocate an EXACT node. The node_type may change below to
13889 * another EXACTish node, but since the size of the node doesn't
13890 * change, it works */
13891 ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
13893 FILL_NODE(ret, node_type);
13896 s = STRING(REGNODE_p(ret));
13907 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13908 maybe_SIMPLE = SIMPLE;
13909 requires_utf8_target = FALSE;
13911 has_micro_sign = FALSE;
13915 /* This breaks under rare circumstances. If folding, we do not
13916 * want to split a node at a character that is a non-final in a
13917 * multi-char fold, as an input string could just happen to want to
13918 * match across the node boundary. The code at the end of the loop
13919 * looks for this, and backs off until it finds not such a
13920 * character, but it is possible (though extremely, extremely
13921 * unlikely) for all characters in the node to be non-final fold
13922 * ones, in which case we just leave the node fully filled, and
13923 * hope that it doesn't match the string in just the wrong place */
13925 assert( ! UTF /* Is at the beginning of a character */
13926 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13927 || UTF8_IS_START(UCHARAT(RExC_parse)));
13929 overflowed = FALSE;
13931 /* Here, we have a literal character. Find the maximal string of
13932 * them in the input that we can fit into a single EXACTish node.
13933 * We quit at the first non-literal or when the node gets full, or
13934 * under /i the categorization of folding/non-folding character
13936 while (p < RExC_end && len < upper_fill) {
13938 /* In most cases each iteration adds one byte to the output.
13939 * The exceptions override this */
13940 Size_t added_len = 1;
13946 /* White space has already been ignored */
13947 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
13948 || ! is_PATWS_safe((p), RExC_end, UTF));
13960 /* Literal Escapes Switch
13962 This switch is meant to handle escape sequences that
13963 resolve to a literal character.
13965 Every escape sequence that represents something
13966 else, like an assertion or a char class, is handled
13967 in the switch marked 'Special Escapes' above in this
13968 routine, but also has an entry here as anything that
13969 isn't explicitly mentioned here will be treated as
13970 an unescaped equivalent literal.
13973 switch ((U8)*++p) {
13975 /* These are all the special escapes. */
13976 case 'A': /* Start assertion */
13977 case 'b': case 'B': /* Word-boundary assertion*/
13978 case 'C': /* Single char !DANGEROUS! */
13979 case 'd': case 'D': /* digit class */
13980 case 'g': case 'G': /* generic-backref, pos assertion */
13981 case 'h': case 'H': /* HORIZWS */
13982 case 'k': case 'K': /* named backref, keep marker */
13983 case 'p': case 'P': /* Unicode property */
13984 case 'R': /* LNBREAK */
13985 case 's': case 'S': /* space class */
13986 case 'v': case 'V': /* VERTWS */
13987 case 'w': case 'W': /* word class */
13988 case 'X': /* eXtended Unicode "combining
13989 character sequence" */
13990 case 'z': case 'Z': /* End of line/string assertion */
13994 /* Anything after here is an escape that resolves to a
13995 literal. (Except digits, which may or may not)
14001 case 'N': /* Handle a single-code point named character. */
14002 RExC_parse = p + 1;
14003 if (! grok_bslash_N(pRExC_state,
14004 NULL, /* Fail if evaluates to
14005 anything other than a
14006 single code point */
14007 &ender, /* The returned single code
14009 NULL, /* Don't need a count of
14010 how many code points */
14015 if (*flagp & NEED_UTF8)
14016 FAIL("panic: grok_bslash_N set NEED_UTF8");
14017 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14019 /* Here, it wasn't a single code point. Go close
14020 * up this EXACTish node. The switch() prior to
14021 * this switch handles the other cases */
14022 RExC_parse = p = oldp;
14026 RExC_parse = parse_start;
14028 /* The \N{} means the pattern, if previously /d,
14029 * becomes /u. That means it can't be an EXACTF node,
14030 * but an EXACTFU */
14031 if (node_type == EXACTF) {
14032 node_type = EXACTFU;
14034 /* If the node already contains something that
14035 * differs between EXACTF and EXACTFU, reparse it
14037 if (! maybe_exactfu) {
14058 ender = ESC_NATIVE;
14068 const char* error_msg;
14070 bool valid = grok_bslash_o(&p,
14074 TO_OUTPUT_WARNINGS(p),
14075 (bool) RExC_strict,
14076 TRUE, /* Output warnings
14081 RExC_parse = p; /* going to die anyway; point
14082 to exact spot of failure */
14085 UPDATE_WARNINGS_LOC(p - 1);
14091 UV result = UV_MAX; /* initialize to erroneous
14093 const char* error_msg;
14095 bool valid = grok_bslash_x(&p,
14099 TO_OUTPUT_WARNINGS(p),
14100 (bool) RExC_strict,
14101 TRUE, /* Silence warnings
14106 RExC_parse = p; /* going to die anyway; point
14107 to exact spot of failure */
14110 UPDATE_WARNINGS_LOC(p - 1);
14114 if (ender < 0x100) {
14115 if (RExC_recode_x_to_native) {
14116 ender = LATIN1_TO_NATIVE(ender);
14124 ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14125 UPDATE_WARNINGS_LOC(p);
14128 case '8': case '9': /* must be a backreference */
14130 /* we have an escape like \8 which cannot be an octal escape
14131 * so we exit the loop, and let the outer loop handle this
14132 * escape which may or may not be a legitimate backref. */
14134 case '1': case '2': case '3':case '4':
14135 case '5': case '6': case '7':
14136 /* When we parse backslash escapes there is ambiguity
14137 * between backreferences and octal escapes. Any escape
14138 * from \1 - \9 is a backreference, any multi-digit
14139 * escape which does not start with 0 and which when
14140 * evaluated as decimal could refer to an already
14141 * parsed capture buffer is a back reference. Anything
14144 * Note this implies that \118 could be interpreted as
14145 * 118 OR as "\11" . "8" depending on whether there
14146 * were 118 capture buffers defined already in the
14149 /* NOTE, RExC_npar is 1 more than the actual number of
14150 * parens we have seen so far, hence the "<" as opposed
14152 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14153 { /* Not to be treated as an octal constant, go
14161 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14163 ender = grok_oct(p, &numlen, &flags, NULL);
14165 if ( isDIGIT(*p) /* like \08, \178 */
14166 && ckWARN(WARN_REGEXP)
14169 reg_warn_non_literal_string(
14171 form_short_octal_warning(p, numlen));
14177 FAIL("Trailing \\");
14180 if (isALPHANUMERIC(*p)) {
14181 /* An alpha followed by '{' is going to fail next
14182 * iteration, so don't output this warning in that
14184 if (! isALPHA(*p) || *(p + 1) != '{') {
14185 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14186 " passed through", p);
14189 goto normal_default;
14190 } /* End of switch on '\' */
14193 /* Trying to gain new uses for '{' without breaking too
14194 * much existing code is hard. The solution currently
14196 * 1) If there is no ambiguity that a '{' should always
14197 * be taken literally, at the start of a construct, we
14199 * 2) If the literal '{' conflicts with our desired use
14200 * of it as a metacharacter, we die. The deprecation
14201 * cycles for this have come and gone.
14202 * 3) If there is ambiguity, we raise a simple warning.
14203 * This could happen, for example, if the user
14204 * intended it to introduce a quantifier, but slightly
14205 * misspelled the quantifier. Without this warning,
14206 * the quantifier would silently be taken as a literal
14207 * string of characters instead of a meta construct */
14208 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14210 || ( p > parse_start + 1
14211 && isALPHA_A(*(p - 1))
14212 && *(p - 2) == '\\')
14213 || new_regcurly(p, RExC_end))
14215 RExC_parse = p + 1;
14216 vFAIL("Unescaped left brace in regex is "
14219 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14220 " passed through");
14222 goto normal_default;
14225 if (p > RExC_parse && RExC_strict) {
14226 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14229 default: /* A literal character */
14231 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14233 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14234 &numlen, UTF8_ALLOW_DEFAULT);
14240 } /* End of switch on the literal */
14242 /* Here, have looked at the literal character, and <ender>
14243 * contains its ordinal; <p> points to the character after it.
14247 REQUIRE_UTF8(flagp);
14250 /* We need to check if the next non-ignored thing is a
14251 * quantifier. Move <p> to after anything that should be
14252 * ignored, which, as a side effect, positions <p> for the next
14253 * loop iteration */
14254 skip_to_be_ignored_text(pRExC_state, &p,
14255 FALSE /* Don't force to /x */ );
14257 /* If the next thing is a quantifier, it applies to this
14258 * character only, which means that this character has to be in
14259 * its own node and can't just be appended to the string in an
14260 * existing node, so if there are already other characters in
14261 * the node, close the node with just them, and set up to do
14262 * this character again next time through, when it will be the
14263 * only thing in its new node */
14265 next_is_quantifier = LIKELY(p < RExC_end)
14266 && UNLIKELY(ISMULT2(p));
14268 if (next_is_quantifier && LIKELY(len)) {
14273 /* Ready to add 'ender' to the node */
14275 if (! FOLD) { /* The simple case, just append the literal */
14278 /* Don't output if it would overflow */
14279 if (UNLIKELY(len > max_string_len - ((UTF)
14280 ? UVCHR_SKIP(ender)
14287 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14288 *(s++) = (char) ender;
14291 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14292 added_len = (char *) new_s - s;
14293 s = (char *) new_s;
14296 requires_utf8_target = TRUE;
14300 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14302 /* Here are folding under /l, and the code point is
14303 * problematic. If this is the first character in the
14304 * node, change the node type to folding. Otherwise, if
14305 * this is the first problematic character, close up the
14306 * existing node, so can start a new node with this one */
14308 node_type = EXACTFL;
14309 RExC_contains_locale = 1;
14311 else if (node_type == EXACT) {
14316 /* This problematic code point means we can't simplify
14318 maybe_exactfu = FALSE;
14320 /* Here, we are adding a problematic fold character.
14321 * "Problematic" in this context means that its fold isn't
14322 * known until runtime. (The non-problematic code points
14323 * are the above-Latin1 ones that fold to also all
14324 * above-Latin1. Their folds don't vary no matter what the
14325 * locale is.) But here we have characters whose fold
14326 * depends on the locale. We just add in the unfolded
14327 * character, and wait until runtime to fold it */
14328 goto not_fold_common;
14330 else /* regular fold; see if actually is in a fold */
14331 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14333 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14335 /* Here, folding, but the character isn't in a fold.
14337 * Start a new node if previous characters in the node were
14339 if (len && node_type != EXACT) {
14344 /* Here, continuing a node with non-folded characters. Add
14346 goto not_fold_common;
14348 else { /* Here, does participate in some fold */
14350 /* If this is the first character in the node, change its
14351 * type to folding. Otherwise, if this is the first
14352 * folding character in the node, close up the existing
14353 * node, so can start a new node with this one. */
14355 node_type = compute_EXACTish(pRExC_state);
14357 else if (node_type == EXACT) {
14362 if (UTF) { /* Alway use the folded value for UTF-8
14364 if (UVCHR_IS_INVARIANT(ender)) {
14365 if (UNLIKELY(len + 1 > max_string_len)) {
14370 *(s)++ = (U8) toFOLD(ender);
14373 UV folded = _to_uni_fold_flags(
14375 (U8 *) s, /* We have allocated extra space
14376 in 's' so can't run off the
14379 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14380 ? FOLD_FLAGS_NOMIX_ASCII
14382 if (UNLIKELY(len + added_len > max_string_len)) {
14390 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14392 /* U+B5 folds to the MU, so its possible for a
14393 * non-UTF-8 target to match it */
14394 requires_utf8_target = TRUE;
14398 else { /* Here is non-UTF8. */
14400 /* The fold will be one or (rarely) two characters.
14401 * Check that there's room for at least a single one
14402 * before setting any flags, etc. Because otherwise an
14403 * overflowing character could cause a flag to be set
14404 * even though it doesn't end up in this node. (For
14405 * the two character fold, we check again, before
14406 * setting any flags) */
14407 if (UNLIKELY(len + 1 > max_string_len)) {
14412 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14413 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14414 || UNICODE_DOT_DOT_VERSION > 0)
14416 /* On non-ancient Unicodes, check for the only possible
14417 * multi-char fold */
14418 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14420 /* This potential multi-char fold means the node
14421 * can't be simple (because it could match more
14422 * than a single char). And in some cases it will
14423 * match 'ss', so set that flag */
14427 /* It can't change to be an EXACTFU (unless already
14428 * is one). We fold it iff under /u rules. */
14429 if (node_type != EXACTFU) {
14430 maybe_exactfu = FALSE;
14433 if (UNLIKELY(len + 2 > max_string_len)) {
14442 goto done_with_this_char;
14445 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14447 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14449 /* Also, the sequence 'ss' is special when not
14450 * under /u. If the target string is UTF-8, it
14451 * should match SHARP S; otherwise it won't. So,
14452 * here we have to exclude the possibility of this
14453 * node moving to /u.*/
14455 maybe_exactfu = FALSE;
14458 /* Here, the fold will be a single character */
14460 if (UNLIKELY(ender == MICRO_SIGN)) {
14461 has_micro_sign = TRUE;
14463 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14465 /* If the character's fold differs between /d and
14466 * /u, this can't change to be an EXACTFU node */
14467 maybe_exactfu = FALSE;
14470 *(s++) = (DEPENDS_SEMANTICS)
14471 ? (char) toFOLD(ender)
14473 /* Under /u, the fold of any character in
14474 * the 0-255 range happens to be its
14475 * lowercase equivalent, except for LATIN
14476 * SMALL LETTER SHARP S, which was handled
14477 * above, and the MICRO SIGN, whose fold
14478 * requires UTF-8 to represent. */
14479 : (char) toLOWER_L1(ender);
14481 } /* End of adding current character to the node */
14483 done_with_this_char:
14487 if (next_is_quantifier) {
14489 /* Here, the next input is a quantifier, and to get here,
14490 * the current character is the only one in the node. */
14494 } /* End of loop through literal characters */
14496 /* Here we have either exhausted the input or run out of room in
14497 * the node. If the former, we are done. (If we encountered a
14498 * character that can't be in the node, transfer is made directly
14499 * to <loopdone>, and so we wouldn't have fallen off the end of the
14501 if (LIKELY(! overflowed)) {
14505 /* Here we have run out of room. We can grow plain EXACT and
14506 * LEXACT nodes. If the pattern is gigantic enough, though,
14507 * eventually we'll have to artificially chunk the pattern into
14508 * multiple nodes. */
14509 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14510 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14511 Size_t overhead_expansion = 0;
14513 Size_t max_nodes_for_string;
14517 /* Here we couldn't fit the final character in the current
14518 * node, so it will have to be reparsed, no matter what else we
14522 /* If would have overflowed a regular EXACT node, switch
14523 * instead to an LEXACT. The code below is structured so that
14524 * the actual growing code is common to changing from an EXACT
14525 * or just increasing the LEXACT size. This means that we have
14526 * to save the string in the EXACT case before growing, and
14527 * then copy it afterwards to its new location */
14528 if (node_type == EXACT) {
14529 overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14530 RExC_emit += overhead_expansion;
14531 Copy(s0, temp, len, char);
14534 /* Ready to grow. If it was a plain EXACT, the string was
14535 * saved, and the first few bytes of it overwritten by adding
14536 * an argument field. We assume, as we do elsewhere in this
14537 * file, that one byte of remaining input will translate into
14538 * one byte of output, and if that's too small, we grow again,
14539 * if too large the excess memory is freed at the end */
14541 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14542 achievable = MIN(max_nodes_for_string,
14543 current_string_nodes + STR_SZ(RExC_end - p));
14544 delta = achievable - current_string_nodes;
14546 /* If there is just no more room, go finish up this chunk of
14552 change_engine_size(pRExC_state, delta + overhead_expansion);
14553 current_string_nodes += delta;
14555 = sizeof(struct regnode) * current_string_nodes;
14556 upper_fill = max_string_len + 1;
14558 /* If the length was small, we know this was originally an
14559 * EXACT node now converted to LEXACT, and the string has to be
14560 * restored. Otherwise the string was untouched. 260 is just
14561 * a number safely above 255 so don't have to worry about
14562 * getting it precise */
14564 node_type = LEXACT;
14565 FILL_NODE(ret, node_type);
14566 s0 = STRING(REGNODE_p(ret));
14567 Copy(temp, s0, len, char);
14571 goto continue_parse;
14574 bool splittable = FALSE;
14575 bool backed_up = FALSE;
14579 /* Here is /i. Running out of room creates a problem if we are
14580 * folding, and the split happens in the middle of a
14581 * multi-character fold, as a match that should have occurred,
14582 * won't, due to the way nodes are matched, and our artificial
14583 * boundary. So back off until we aren't splitting such a
14584 * fold. If there is no such place to back off to, we end up
14585 * taking the entire node as-is. This can happen if the node
14586 * consists entirely of 'f' or entirely of 's' characters (or
14587 * things that fold to them) as 'ff' and 'ss' are
14588 * multi-character folds.
14590 * The Unicode standard says that multi character folds consist
14591 * of either two or three characters. That means we would be
14592 * splitting one if the final character in the node is at the
14593 * beginning of either type, or is the second of a three
14597 * ender is the code point of the character that won't fit
14599 * s points to just beyond the final byte in the node.
14600 * It's where we would place ender if there were
14601 * room, and where in fact we do place ender's fold
14602 * in the code below, as we've over-allocated space
14603 * for s0 (hence s) to allow for this
14604 * e starts at 's' and advances as we append things.
14605 * old_s is the same as 's'. (If ender had fit, 's' would
14606 * have been advanced to beyond it).
14607 * old_old_s points to the beginning byte of the final
14608 * character in the node
14609 * p points to the beginning byte in the input of the
14610 * character beyond 'ender'.
14611 * oldp points to the beginning byte in the input of
14614 * In the case of /il, we haven't folded anything that could be
14615 * affected by the locale. That means only above-Latin1
14616 * characters that fold to other above-latin1 characters get
14617 * folded at compile time. To check where a good place to
14618 * split nodes is, everything in it will have to be folded.
14619 * The boolean 'maybe_exactfu' keeps track in /il if there are
14620 * any unfolded characters in the node. */
14621 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14623 /* If we do need to fold the node, we need a place to store the
14624 * folded copy, and a way to map back to the unfolded original
14626 char * locfold_buf;
14627 Size_t * loc_correspondence;
14629 if (! need_to_fold_loc) { /* The normal case. Just
14630 initialize to the actual node */
14633 s = old_old_s; /* Point to the beginning of the final char
14634 that fits in the node */
14638 /* Here, we have filled a /il node, and there are unfolded
14639 * characters in it. If the runtime locale turns out to be
14640 * UTF-8, there are possible multi-character folds, just
14641 * like when not under /l. The node hence can't terminate
14642 * in the middle of such a fold. To determine this, we
14643 * have to create a folded copy of this node. That means
14644 * reparsing the node, folding everything assuming a UTF-8
14645 * locale. (If at runtime it isn't such a locale, the
14646 * actions here wouldn't have been necessary, but we have
14647 * to assume the worst case.) If we find we need to back
14648 * off the folded string, we do so, and then map that
14649 * position back to the original unfolded node, which then
14650 * gets output, truncated at that spot */
14652 char * redo_p = RExC_parse;
14656 /* Allow enough space assuming a single byte input folds to
14657 * a single byte output, plus assume that the two unparsed
14658 * characters (that we may need) fold to the largest number
14659 * of bytes possible, plus extra for one more worst case
14660 * scenario. In the loop below, if we start eating into
14661 * that final spare space, we enlarge this initial space */
14662 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14664 Newxz(locfold_buf, size, char);
14665 Newxz(loc_correspondence, size, Size_t);
14667 /* Redo this node's parse, folding into 'locfold_buf' */
14668 redo_p = RExC_parse;
14669 redo_e = locfold_buf;
14670 while (redo_p <= oldp) {
14672 old_redo_e = redo_e;
14673 loc_correspondence[redo_e - locfold_buf]
14674 = redo_p - RExC_parse;
14679 (void) _to_utf8_fold_flags((U8 *) redo_p,
14684 redo_e += added_len;
14685 redo_p += UTF8SKIP(redo_p);
14689 /* Note that if this code is run on some ancient
14690 * Unicode versions, SHARP S doesn't fold to 'ss',
14691 * but rather than clutter the code with #ifdef's,
14692 * as is done above, we ignore that possibility.
14693 * This is ok because this code doesn't affect what
14694 * gets matched, but merely where the node gets
14696 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14697 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14707 /* If we're getting so close to the end that a
14708 * worst-case fold in the next character would cause us
14709 * to overflow, increase, assuming one byte output byte
14710 * per one byte input one, plus room for another worst
14712 if ( redo_p <= oldp
14713 && redo_e > locfold_buf + size
14714 - (UTF8_MAXBYTES_CASE + 1))
14716 Size_t new_size = size
14718 + UTF8_MAXBYTES_CASE + 1;
14719 Ptrdiff_t e_offset = redo_e - locfold_buf;
14721 Renew(locfold_buf, new_size, char);
14722 Renew(loc_correspondence, new_size, Size_t);
14725 redo_e = locfold_buf + e_offset;
14729 /* Set so that things are in terms of the folded, temporary
14732 s_start = locfold_buf;
14737 /* Here, we have 's', 's_start' and 'e' set up to point to the
14738 * input that goes into the node, folded.
14740 * If the final character of the node and the fold of ender
14741 * form the first two characters of a three character fold, we
14742 * need to peek ahead at the next (unparsed) character in the
14743 * input to determine if the three actually do form such a
14744 * fold. Just looking at that character is not generally
14745 * sufficient, as it could be, for example, an escape sequence
14746 * that evaluates to something else, and it needs to be folded.
14748 * khw originally thought to just go through the parse loop one
14749 * extra time, but that doesn't work easily as that iteration
14750 * could cause things to think that the parse is over and to
14751 * goto loopdone. The character could be a '$' for example, or
14752 * the character beyond could be a quantifier, and other
14753 * glitches as well.
14755 * The solution used here for peeking ahead is to look at that
14756 * next character. If it isn't ASCII punctuation, then it will
14757 * be something that continues in an EXACTish node if there
14758 * were space. We append the fold of it to s, having reserved
14759 * enough room in s0 for the purpose. If we can't reasonably
14760 * peek ahead, we instead assume the worst case: that it is
14761 * something that would form the completion of a multi-char
14764 * If we can't split between s and ender, we work backwards
14765 * character-by-character down to s0. At each current point
14766 * see if we are at the beginning of a multi-char fold. If so,
14767 * that means we would be splitting the fold across nodes, and
14768 * so we back up one and try again.
14770 * If we're not at the beginning, we still could be at the
14771 * final two characters of a (rare) three character fold. We
14772 * check if the sequence starting at the character before the
14773 * current position (and including the current and next
14774 * characters) is a three character fold. If not, the node can
14775 * be split here. If it is, we have to backup two characters
14778 * Otherwise, the node can be split at the current position.
14780 * The same logic is used for UTF-8 patterns and not */
14784 /* Append the fold of ender */
14785 (void) _to_uni_fold_flags(
14789 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14790 ? FOLD_FLAGS_NOMIX_ASCII
14794 /* 's' and the character folded to by ender may be the
14795 * first two of a three-character fold, in which case the
14796 * node should not be split here. That may mean examining
14797 * the so-far unparsed character starting at 'p'. But if
14798 * ender folded to more than one character, we already have
14799 * three characters to look at. Also, we first check if
14800 * the sequence consisting of s and the next character form
14801 * the first two of some three character fold. If not,
14802 * there's no need to peek ahead. */
14803 if ( added_len <= UTF8SKIP(e - added_len)
14804 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
14806 /* Here, the two do form the beginning of a potential
14807 * three character fold. The unexamined character may
14808 * or may not complete it. Peek at it. It might be
14809 * something that ends the node or an escape sequence,
14810 * in which case we don't know without a lot of work
14811 * what it evaluates to, so we have to assume the worst
14812 * case: that it does complete the fold, and so we
14813 * can't split here. All such instances will have
14814 * that character be an ASCII punctuation character,
14815 * like a backslash. So, for that case, backup one and
14816 * drop down to try at that position */
14818 s = (char *) utf8_hop_back((U8 *) s, -1,
14823 /* Here, since it's not punctuation, it must be a
14824 * real character, and we can append its fold to
14825 * 'e' (having deliberately reserved enough space
14826 * for this eventuality) and drop down to check if
14827 * the three actually do form a folded sequence */
14828 (void) _to_utf8_fold_flags(
14829 (U8 *) p, (U8 *) RExC_end,
14832 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14833 ? FOLD_FLAGS_NOMIX_ASCII
14839 /* Here, we either have three characters available in
14840 * sequence starting at 's', or we have two characters and
14841 * know that the following one can't possibly be part of a
14842 * three character fold. We go through the node backwards
14843 * until we find a place where we can split it without
14844 * breaking apart a multi-character fold. At any given
14845 * point we have to worry about if such a fold begins at
14846 * the current 's', and also if a three-character fold
14847 * begins at s-1, (containing s and s+1). Splitting in
14848 * either case would break apart a fold */
14850 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
14853 /* If is a multi-char fold, can't split here. Backup
14854 * one char and try again */
14855 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
14861 /* If the two characters beginning at 's' are part of a
14862 * three character fold starting at the character
14863 * before s, we can't split either before or after s.
14864 * Backup two chars and try again */
14865 if ( LIKELY(s > s_start)
14866 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
14869 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
14874 /* Here there's no multi-char fold between s and the
14875 * next character following it. We can split */
14879 } while (s > s_start); /* End of loops backing up through the node */
14881 /* Here we either couldn't find a place to split the node,
14882 * or else we broke out of the loop setting 'splittable' to
14883 * true. In the latter case, the place to split is between
14884 * the first and second characters in the sequence starting
14890 else { /* Pattern not UTF-8 */
14891 if ( ender != LATIN_SMALL_LETTER_SHARP_S
14892 || ASCII_FOLD_RESTRICTED)
14894 *e++ = toLOWER_L1(ender);
14902 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
14909 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
14910 || ASCII_FOLD_RESTRICTED)
14912 *e++ = toLOWER_L1(ender);
14922 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
14928 if ( LIKELY(s > s_start)
14929 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
14939 } while (s > s_start);
14946 /* Here, we are done backing up. If we didn't backup at all
14947 * (the likely case), just proceed */
14950 /* If we did find a place to split, reparse the entire node
14951 * stopping where we have calculated. */
14954 /* If we created a temporary folded string under /l, we
14955 * have to map that back to the original */
14956 if (need_to_fold_loc) {
14957 upper_fill = loc_correspondence[s - s_start];
14958 Safefree(locfold_buf);
14959 Safefree(loc_correspondence);
14961 if (upper_fill == 0) {
14962 FAIL2("panic: loc_correspondence[%d] is 0",
14963 (int) (s - s_start));
14967 upper_fill = s - s0;
14971 else if (need_to_fold_loc) {
14972 Safefree(locfold_buf);
14973 Safefree(loc_correspondence);
14976 /* Here the node consists entirely of non-final multi-char
14977 * folds. (Likely it is all 'f's or all 's's.) There's no
14978 * decent place to split it, so give up and just take the
14982 } /* End of verifying node ends with an appropriate char */
14984 /* We need to start the next node at the character that didn't fit
14988 loopdone: /* Jumped to when encounters something that shouldn't be
14991 /* Free up any over-allocated space; cast is to silence bogus
14992 * warning in MS VC */
14993 change_engine_size(pRExC_state,
14994 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
14996 /* I (khw) don't know if you can get here with zero length, but the
14997 * old code handled this situation by creating a zero-length EXACT
14998 * node. Might as well be NOTHING instead */
15000 OP(REGNODE_p(ret)) = NOTHING;
15004 /* If the node type is EXACT here, check to see if it
15005 * should be EXACTL, or EXACT_REQ8. */
15006 if (node_type == EXACT) {
15008 node_type = EXACTL;
15010 else if (requires_utf8_target) {
15011 node_type = EXACT_REQ8;
15014 else if (node_type == LEXACT) {
15015 if (requires_utf8_target) {
15016 node_type = LEXACT_REQ8;
15020 if ( UNLIKELY(has_micro_sign || has_ss)
15021 && (node_type == EXACTFU || ( node_type == EXACTF
15022 && maybe_exactfu)))
15023 { /* These two conditions are problematic in non-UTF-8
15026 node_type = EXACTFUP;
15028 else if (node_type == EXACTFL) {
15030 /* 'maybe_exactfu' is deliberately set above to
15031 * indicate this node type, where all code points in it
15033 if (maybe_exactfu) {
15034 node_type = EXACTFLU8;
15037 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15039 /* A character that folds to more than one will
15040 * match multiple characters, so can't be SIMPLE.
15041 * We don't have to worry about this with EXACTFLU8
15042 * nodes just above, as they have already been
15043 * folded (since the fold doesn't vary at run
15044 * time). Here, if the final character in the node
15045 * folds to multiple, it can't be simple. (This
15046 * only has an effect if the node has only a single
15047 * character, hence the final one, as elsewhere we
15048 * turn off simple for nodes whose length > 1 */
15052 else if (node_type == EXACTF) { /* Means is /di */
15054 /* This intermediate variable is needed solely because
15055 * the asserts in the macro where used exceed Win32's
15056 * literal string capacity */
15057 char first_char = * STRING(REGNODE_p(ret));
15059 /* If 'maybe_exactfu' is clear, then we need to stay
15060 * /di. If it is set, it means there are no code
15061 * points that match differently depending on UTF8ness
15062 * of the target string, so it can become an EXACTFU
15064 if (! maybe_exactfu) {
15065 RExC_seen_d_op = TRUE;
15067 else if ( isALPHA_FOLD_EQ(first_char, 's')
15068 || isALPHA_FOLD_EQ(ender, 's'))
15070 /* But, if the node begins or ends in an 's' we
15071 * have to defer changing it into an EXACTFU, as
15072 * the node could later get joined with another one
15073 * that ends or begins with 's' creating an 'ss'
15074 * sequence which would then wrongly match the
15075 * sharp s without the target being UTF-8. We
15076 * create a special node that we resolve later when
15077 * we join nodes together */
15079 node_type = EXACTFU_S_EDGE;
15082 node_type = EXACTFU;
15086 if (requires_utf8_target && node_type == EXACTFU) {
15087 node_type = EXACTFU_REQ8;
15091 OP(REGNODE_p(ret)) = node_type;
15092 setSTR_LEN(REGNODE_p(ret), len);
15093 RExC_emit += STR_SZ(len);
15095 /* If the node isn't a single character, it can't be SIMPLE */
15096 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15100 *flagp |= HASWIDTH | maybe_SIMPLE;
15103 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15107 /* len is STRLEN which is unsigned, need to copy to signed */
15110 vFAIL("Internal disaster");
15113 } /* End of label 'defchar:' */
15115 } /* End of giant switch on input character */
15117 /* Position parse to next real character */
15118 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15119 FALSE /* Don't force to /x */ );
15120 if ( *RExC_parse == '{'
15121 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15123 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15125 vFAIL("Unescaped left brace in regex is illegal here");
15127 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15128 " passed through");
15136 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15138 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
15139 * sets up the bitmap and any flags, removing those code points from the
15140 * inversion list, setting it to NULL should it become completely empty */
15144 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15145 assert(PL_regkind[OP(node)] == ANYOF);
15147 /* There is no bitmap for this node type */
15148 if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15152 ANYOF_BITMAP_ZERO(node);
15153 if (*invlist_ptr) {
15155 /* This gets set if we actually need to modify things */
15156 bool change_invlist = FALSE;
15160 /* Start looking through *invlist_ptr */
15161 invlist_iterinit(*invlist_ptr);
15162 while (invlist_iternext(*invlist_ptr, &start, &end)) {
15166 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15167 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15170 /* Quit if are above what we should change */
15171 if (start >= NUM_ANYOF_CODE_POINTS) {
15175 change_invlist = TRUE;
15177 /* Set all the bits in the range, up to the max that we are doing */
15178 high = (end < NUM_ANYOF_CODE_POINTS - 1)
15180 : NUM_ANYOF_CODE_POINTS - 1;
15181 for (i = start; i <= (int) high; i++) {
15182 if (! ANYOF_BITMAP_TEST(node, i)) {
15183 ANYOF_BITMAP_SET(node, i);
15187 invlist_iterfinish(*invlist_ptr);
15189 /* Done with loop; remove any code points that are in the bitmap from
15190 * *invlist_ptr; similarly for code points above the bitmap if we have
15191 * a flag to match all of them anyways */
15192 if (change_invlist) {
15193 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15195 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15196 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15199 /* If have completely emptied it, remove it completely */
15200 if (_invlist_len(*invlist_ptr) == 0) {
15201 SvREFCNT_dec_NN(*invlist_ptr);
15202 *invlist_ptr = NULL;
15207 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15208 Character classes ([:foo:]) can also be negated ([:^foo:]).
15209 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15210 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15211 but trigger failures because they are currently unimplemented. */
15213 #define POSIXCC_DONE(c) ((c) == ':')
15214 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15215 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15216 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15218 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
15219 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
15220 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
15222 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15224 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15226 #define ADD_POSIX_WARNING(p, text) STMT_START { \
15227 if (posix_warnings) { \
15228 if (! RExC_warn_text ) RExC_warn_text = \
15229 (AV *) sv_2mortal((SV *) newAV()); \
15230 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
15234 REPORT_LOCATION_ARGS(p))); \
15237 #define CLEAR_POSIX_WARNINGS() \
15239 if (posix_warnings && RExC_warn_text) \
15240 av_clear(RExC_warn_text); \
15243 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
15245 CLEAR_POSIX_WARNINGS(); \
15250 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15252 const char * const s, /* Where the putative posix class begins.
15253 Normally, this is one past the '['. This
15254 parameter exists so it can be somewhere
15255 besides RExC_parse. */
15256 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15258 AV ** posix_warnings, /* Where to place any generated warnings, or
15260 const bool check_only /* Don't die if error */
15263 /* This parses what the caller thinks may be one of the three POSIX
15265 * 1) a character class, like [:blank:]
15266 * 2) a collating symbol, like [. .]
15267 * 3) an equivalence class, like [= =]
15268 * In the latter two cases, it croaks if it finds a syntactically legal
15269 * one, as these are not handled by Perl.
15271 * The main purpose is to look for a POSIX character class. It returns:
15272 * a) the class number
15273 * if it is a completely syntactically and semantically legal class.
15274 * 'updated_parse_ptr', if not NULL, is set to point to just after the
15275 * closing ']' of the class
15276 * b) OOB_NAMEDCLASS
15277 * if it appears that one of the three POSIX constructs was meant, but
15278 * its specification was somehow defective. 'updated_parse_ptr', if
15279 * not NULL, is set to point to the character just after the end
15280 * character of the class. See below for handling of warnings.
15281 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15282 * if it doesn't appear that a POSIX construct was intended.
15283 * 'updated_parse_ptr' is not changed. No warnings nor errors are
15286 * In b) there may be errors or warnings generated. If 'check_only' is
15287 * TRUE, then any errors are discarded. Warnings are returned to the
15288 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
15289 * instead it is NULL, warnings are suppressed.
15291 * The reason for this function, and its complexity is that a bracketed
15292 * character class can contain just about anything. But it's easy to
15293 * mistype the very specific posix class syntax but yielding a valid
15294 * regular bracketed class, so it silently gets compiled into something
15295 * quite unintended.
15297 * The solution adopted here maintains backward compatibility except that
15298 * it adds a warning if it looks like a posix class was intended but
15299 * improperly specified. The warning is not raised unless what is input
15300 * very closely resembles one of the 14 legal posix classes. To do this,
15301 * it uses fuzzy parsing. It calculates how many single-character edits it
15302 * would take to transform what was input into a legal posix class. Only
15303 * if that number is quite small does it think that the intention was a
15304 * posix class. Obviously these are heuristics, and there will be cases
15305 * where it errs on one side or another, and they can be tweaked as
15306 * experience informs.
15308 * The syntax for a legal posix class is:
15310 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15312 * What this routine considers syntactically to be an intended posix class
15313 * is this (the comments indicate some restrictions that the pattern
15316 * qr/(?x: \[? # The left bracket, possibly
15318 * \h* # possibly followed by blanks
15319 * (?: \^ \h* )? # possibly a misplaced caret
15320 * [:;]? # The opening class character,
15321 * # possibly omitted. A typo
15322 * # semi-colon can also be used.
15324 * \^? # possibly a correctly placed
15325 * # caret, but not if there was also
15326 * # a misplaced one
15328 * .{3,15} # The class name. If there are
15329 * # deviations from the legal syntax,
15330 * # its edit distance must be close
15331 * # to a real class name in order
15332 * # for it to be considered to be
15333 * # an intended posix class.
15335 * [[:punct:]]? # The closing class character,
15336 * # possibly omitted. If not a colon
15337 * # nor semi colon, the class name
15338 * # must be even closer to a valid
15341 * \]? # The right bracket, possibly
15345 * In the above, \h must be ASCII-only.
15347 * These are heuristics, and can be tweaked as field experience dictates.
15348 * There will be cases when someone didn't intend to specify a posix class
15349 * that this warns as being so. The goal is to minimize these, while
15350 * maximizing the catching of things intended to be a posix class that
15351 * aren't parsed as such.
15355 const char * const e = RExC_end;
15356 unsigned complement = 0; /* If to complement the class */
15357 bool found_problem = FALSE; /* Assume OK until proven otherwise */
15358 bool has_opening_bracket = FALSE;
15359 bool has_opening_colon = FALSE;
15360 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
15362 const char * possible_end = NULL; /* used for a 2nd parse pass */
15363 const char* name_start; /* ptr to class name first char */
15365 /* If the number of single-character typos the input name is away from a
15366 * legal name is no more than this number, it is considered to have meant
15367 * the legal name */
15368 int max_distance = 2;
15370 /* to store the name. The size determines the maximum length before we
15371 * decide that no posix class was intended. Should be at least
15372 * sizeof("alphanumeric") */
15374 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15376 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15378 CLEAR_POSIX_WARNINGS();
15381 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15384 if (*(p - 1) != '[') {
15385 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15386 found_problem = TRUE;
15389 has_opening_bracket = TRUE;
15392 /* They could be confused and think you can put spaces between the
15395 found_problem = TRUE;
15399 } while (p < e && isBLANK(*p));
15401 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15404 /* For [. .] and [= =]. These are quite different internally from [: :],
15405 * so they are handled separately. */
15406 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15407 and 1 for at least one char in it
15410 const char open_char = *p;
15411 const char * temp_ptr = p + 1;
15413 /* These two constructs are not handled by perl, and if we find a
15414 * syntactically valid one, we croak. khw, who wrote this code, finds
15415 * this explanation of them very unclear:
15416 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15417 * And searching the rest of the internet wasn't very helpful either.
15418 * It looks like just about any byte can be in these constructs,
15419 * depending on the locale. But unless the pattern is being compiled
15420 * under /l, which is very rare, Perl runs under the C or POSIX locale.
15421 * In that case, it looks like [= =] isn't allowed at all, and that
15422 * [. .] could be any single code point, but for longer strings the
15423 * constituent characters would have to be the ASCII alphabetics plus
15424 * the minus-hyphen. Any sensible locale definition would limit itself
15425 * to these. And any portable one definitely should. Trying to parse
15426 * the general case is a nightmare (see [perl #127604]). So, this code
15427 * looks only for interiors of these constructs that match:
15429 * Using \w relaxes the apparent rules a little, without adding much
15430 * danger of mistaking something else for one of these constructs.
15432 * [. .] in some implementations described on the internet is usable to
15433 * escape a character that otherwise is special in bracketed character
15434 * classes. For example [.].] means a literal right bracket instead of
15435 * the ending of the class
15437 * [= =] can legitimately contain a [. .] construct, but we don't
15438 * handle this case, as that [. .] construct will later get parsed
15439 * itself and croak then. And [= =] is checked for even when not under
15440 * /l, as Perl has long done so.
15442 * The code below relies on there being a trailing NUL, so it doesn't
15443 * have to keep checking if the parse ptr < e.
15445 if (temp_ptr[1] == open_char) {
15448 else while ( temp_ptr < e
15449 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15454 if (*temp_ptr == open_char) {
15456 if (*temp_ptr == ']') {
15458 if (! found_problem && ! check_only) {
15459 RExC_parse = (char *) temp_ptr;
15460 vFAIL3("POSIX syntax [%c %c] is reserved for future "
15461 "extensions", open_char, open_char);
15464 /* Here, the syntax wasn't completely valid, or else the call
15465 * is to check-only */
15466 if (updated_parse_ptr) {
15467 *updated_parse_ptr = (char *) temp_ptr;
15470 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15474 /* If we find something that started out to look like one of these
15475 * constructs, but isn't, we continue below so that it can be checked
15476 * for being a class name with a typo of '.' or '=' instead of a colon.
15480 /* Here, we think there is a possibility that a [: :] class was meant, and
15481 * we have the first real character. It could be they think the '^' comes
15484 found_problem = TRUE;
15485 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15490 found_problem = TRUE;
15494 } while (p < e && isBLANK(*p));
15496 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15500 /* But the first character should be a colon, which they could have easily
15501 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15502 * distinguish from a colon, so treat that as a colon). */
15505 has_opening_colon = TRUE;
15507 else if (*p == ';') {
15508 found_problem = TRUE;
15510 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15511 has_opening_colon = TRUE;
15514 found_problem = TRUE;
15515 ADD_POSIX_WARNING(p, "there must be a starting ':'");
15517 /* Consider an initial punctuation (not one of the recognized ones) to
15518 * be a left terminator */
15519 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15524 /* They may think that you can put spaces between the components */
15526 found_problem = TRUE;
15530 } while (p < e && isBLANK(*p));
15532 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15537 /* We consider something like [^:^alnum:]] to not have been intended to
15538 * be a posix class, but XXX maybe we should */
15540 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15547 /* Again, they may think that you can put spaces between the components */
15549 found_problem = TRUE;
15553 } while (p < e && isBLANK(*p));
15555 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15560 /* XXX This ']' may be a typo, and something else was meant. But
15561 * treating it as such creates enough complications, that that
15562 * possibility isn't currently considered here. So we assume that the
15563 * ']' is what is intended, and if we've already found an initial '[',
15564 * this leaves this construct looking like [:] or [:^], which almost
15565 * certainly weren't intended to be posix classes */
15566 if (has_opening_bracket) {
15567 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15570 /* But this function can be called when we parse the colon for
15571 * something like qr/[alpha:]]/, so we back up to look for the
15576 found_problem = TRUE;
15577 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15579 else if (*p != ':') {
15581 /* XXX We are currently very restrictive here, so this code doesn't
15582 * consider the possibility that, say, /[alpha.]]/ was intended to
15583 * be a posix class. */
15584 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15587 /* Here we have something like 'foo:]'. There was no initial colon,
15588 * and we back up over 'foo. XXX Unlike the going forward case, we
15589 * don't handle typos of non-word chars in the middle */
15590 has_opening_colon = FALSE;
15593 while (p > RExC_start && isWORDCHAR(*p)) {
15598 /* Here, we have positioned ourselves to where we think the first
15599 * character in the potential class is */
15602 /* Now the interior really starts. There are certain key characters that
15603 * can end the interior, or these could just be typos. To catch both
15604 * cases, we may have to do two passes. In the first pass, we keep on
15605 * going unless we come to a sequence that matches
15606 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15607 * This means it takes a sequence to end the pass, so two typos in a row if
15608 * that wasn't what was intended. If the class is perfectly formed, just
15609 * this one pass is needed. We also stop if there are too many characters
15610 * being accumulated, but this number is deliberately set higher than any
15611 * real class. It is set high enough so that someone who thinks that
15612 * 'alphanumeric' is a correct name would get warned that it wasn't.
15613 * While doing the pass, we keep track of where the key characters were in
15614 * it. If we don't find an end to the class, and one of the key characters
15615 * was found, we redo the pass, but stop when we get to that character.
15616 * Thus the key character was considered a typo in the first pass, but a
15617 * terminator in the second. If two key characters are found, we stop at
15618 * the second one in the first pass. Again this can miss two typos, but
15619 * catches a single one
15621 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15622 * point to the first key character. For the second pass, it starts as -1.
15628 bool has_blank = FALSE;
15629 bool has_upper = FALSE;
15630 bool has_terminating_colon = FALSE;
15631 bool has_terminating_bracket = FALSE;
15632 bool has_semi_colon = FALSE;
15633 unsigned int name_len = 0;
15634 int punct_count = 0;
15638 /* Squeeze out blanks when looking up the class name below */
15639 if (isBLANK(*p) ) {
15641 found_problem = TRUE;
15646 /* The name will end with a punctuation */
15648 const char * peek = p + 1;
15650 /* Treat any non-']' punctuation followed by a ']' (possibly
15651 * with intervening blanks) as trying to terminate the class.
15652 * ']]' is very likely to mean a class was intended (but
15653 * missing the colon), but the warning message that gets
15654 * generated shows the error position better if we exit the
15655 * loop at the bottom (eventually), so skip it here. */
15657 if (peek < e && isBLANK(*peek)) {
15659 found_problem = TRUE;
15662 } while (peek < e && isBLANK(*peek));
15665 if (peek < e && *peek == ']') {
15666 has_terminating_bracket = TRUE;
15668 has_terminating_colon = TRUE;
15670 else if (*p == ';') {
15671 has_semi_colon = TRUE;
15672 has_terminating_colon = TRUE;
15675 found_problem = TRUE;
15682 /* Here we have punctuation we thought didn't end the class.
15683 * Keep track of the position of the key characters that are
15684 * more likely to have been class-enders */
15685 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15687 /* Allow just one such possible class-ender not actually
15688 * ending the class. */
15689 if (possible_end) {
15695 /* If we have too many punctuation characters, no use in
15697 if (++punct_count > max_distance) {
15701 /* Treat the punctuation as a typo. */
15702 input_text[name_len++] = *p;
15705 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15706 input_text[name_len++] = toLOWER(*p);
15708 found_problem = TRUE;
15710 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15711 input_text[name_len++] = *p;
15715 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15719 /* The declaration of 'input_text' is how long we allow a potential
15720 * class name to be, before saying they didn't mean a class name at
15722 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15727 /* We get to here when the possible class name hasn't been properly
15728 * terminated before:
15729 * 1) we ran off the end of the pattern; or
15730 * 2) found two characters, each of which might have been intended to
15731 * be the name's terminator
15732 * 3) found so many punctuation characters in the purported name,
15733 * that the edit distance to a valid one is exceeded
15734 * 4) we decided it was more characters than anyone could have
15735 * intended to be one. */
15737 found_problem = TRUE;
15739 /* In the final two cases, we know that looking up what we've
15740 * accumulated won't lead to a match, even a fuzzy one. */
15741 if ( name_len >= C_ARRAY_LENGTH(input_text)
15742 || punct_count > max_distance)
15744 /* If there was an intermediate key character that could have been
15745 * an intended end, redo the parse, but stop there */
15746 if (possible_end && possible_end != (char *) -1) {
15747 possible_end = (char *) -1; /* Special signal value to say
15748 we've done a first pass */
15753 /* Otherwise, it can't have meant to have been a class */
15754 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15757 /* If we ran off the end, and the final character was a punctuation
15758 * one, back up one, to look at that final one just below. Later, we
15759 * will restore the parse pointer if appropriate */
15760 if (name_len && p == e && isPUNCT(*(p-1))) {
15765 if (p < e && isPUNCT(*p)) {
15767 has_terminating_bracket = TRUE;
15769 /* If this is a 2nd ']', and the first one is just below this
15770 * one, consider that to be the real terminator. This gives a
15771 * uniform and better positioning for the warning message */
15773 && possible_end != (char *) -1
15774 && *possible_end == ']'
15775 && name_len && input_text[name_len - 1] == ']')
15780 /* And this is actually equivalent to having done the 2nd
15781 * pass now, so set it to not try again */
15782 possible_end = (char *) -1;
15787 has_terminating_colon = TRUE;
15789 else if (*p == ';') {
15790 has_semi_colon = TRUE;
15791 has_terminating_colon = TRUE;
15799 /* Here, we have a class name to look up. We can short circuit the
15800 * stuff below for short names that can't possibly be meant to be a
15801 * class name. (We can do this on the first pass, as any second pass
15802 * will yield an even shorter name) */
15803 if (name_len < 3) {
15804 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15807 /* Find which class it is. Initially switch on the length of the name.
15809 switch (name_len) {
15811 if (memEQs(name_start, 4, "word")) {
15812 /* this is not POSIX, this is the Perl \w */
15813 class_number = ANYOF_WORDCHAR;
15817 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15818 * graph lower print punct space upper
15819 * Offset 4 gives the best switch position. */
15820 switch (name_start[4]) {
15822 if (memBEGINs(name_start, 5, "alph")) /* alpha */
15823 class_number = ANYOF_ALPHA;
15826 if (memBEGINs(name_start, 5, "spac")) /* space */
15827 class_number = ANYOF_SPACE;
15830 if (memBEGINs(name_start, 5, "grap")) /* graph */
15831 class_number = ANYOF_GRAPH;
15834 if (memBEGINs(name_start, 5, "asci")) /* ascii */
15835 class_number = ANYOF_ASCII;
15838 if (memBEGINs(name_start, 5, "blan")) /* blank */
15839 class_number = ANYOF_BLANK;
15842 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15843 class_number = ANYOF_CNTRL;
15846 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15847 class_number = ANYOF_ALPHANUMERIC;
15850 if (memBEGINs(name_start, 5, "lowe")) /* lower */
15851 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15852 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15853 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15856 if (memBEGINs(name_start, 5, "digi")) /* digit */
15857 class_number = ANYOF_DIGIT;
15858 else if (memBEGINs(name_start, 5, "prin")) /* print */
15859 class_number = ANYOF_PRINT;
15860 else if (memBEGINs(name_start, 5, "punc")) /* punct */
15861 class_number = ANYOF_PUNCT;
15866 if (memEQs(name_start, 6, "xdigit"))
15867 class_number = ANYOF_XDIGIT;
15871 /* If the name exactly matches a posix class name the class number will
15872 * here be set to it, and the input almost certainly was meant to be a
15873 * posix class, so we can skip further checking. If instead the syntax
15874 * is exactly correct, but the name isn't one of the legal ones, we
15875 * will return that as an error below. But if neither of these apply,
15876 * it could be that no posix class was intended at all, or that one
15877 * was, but there was a typo. We tease these apart by doing fuzzy
15878 * matching on the name */
15879 if (class_number == OOB_NAMEDCLASS && found_problem) {
15880 const UV posix_names[][6] = {
15881 { 'a', 'l', 'n', 'u', 'm' },
15882 { 'a', 'l', 'p', 'h', 'a' },
15883 { 'a', 's', 'c', 'i', 'i' },
15884 { 'b', 'l', 'a', 'n', 'k' },
15885 { 'c', 'n', 't', 'r', 'l' },
15886 { 'd', 'i', 'g', 'i', 't' },
15887 { 'g', 'r', 'a', 'p', 'h' },
15888 { 'l', 'o', 'w', 'e', 'r' },
15889 { 'p', 'r', 'i', 'n', 't' },
15890 { 'p', 'u', 'n', 'c', 't' },
15891 { 's', 'p', 'a', 'c', 'e' },
15892 { 'u', 'p', 'p', 'e', 'r' },
15893 { 'w', 'o', 'r', 'd' },
15894 { 'x', 'd', 'i', 'g', 'i', 't' }
15896 /* The names of the above all have added NULs to make them the same
15897 * size, so we need to also have the real lengths */
15898 const UV posix_name_lengths[] = {
15899 sizeof("alnum") - 1,
15900 sizeof("alpha") - 1,
15901 sizeof("ascii") - 1,
15902 sizeof("blank") - 1,
15903 sizeof("cntrl") - 1,
15904 sizeof("digit") - 1,
15905 sizeof("graph") - 1,
15906 sizeof("lower") - 1,
15907 sizeof("print") - 1,
15908 sizeof("punct") - 1,
15909 sizeof("space") - 1,
15910 sizeof("upper") - 1,
15911 sizeof("word") - 1,
15912 sizeof("xdigit")- 1
15915 int temp_max = max_distance; /* Use a temporary, so if we
15916 reparse, we haven't changed the
15919 /* Use a smaller max edit distance if we are missing one of the
15921 if ( has_opening_bracket + has_opening_colon < 2
15922 || has_terminating_bracket + has_terminating_colon < 2)
15927 /* See if the input name is close to a legal one */
15928 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15930 /* Short circuit call if the lengths are too far apart to be
15932 if (abs( (int) (name_len - posix_name_lengths[i]))
15938 if (edit_distance(input_text,
15941 posix_name_lengths[i],
15945 { /* If it is close, it probably was intended to be a class */
15946 goto probably_meant_to_be;
15950 /* Here the input name is not close enough to a valid class name
15951 * for us to consider it to be intended to be a posix class. If
15952 * we haven't already done so, and the parse found a character that
15953 * could have been terminators for the name, but which we absorbed
15954 * as typos during the first pass, repeat the parse, signalling it
15955 * to stop at that character */
15956 if (possible_end && possible_end != (char *) -1) {
15957 possible_end = (char *) -1;
15962 /* Here neither pass found a close-enough class name */
15963 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15966 probably_meant_to_be:
15968 /* Here we think that a posix specification was intended. Update any
15970 if (updated_parse_ptr) {
15971 *updated_parse_ptr = (char *) p;
15974 /* If a posix class name was intended but incorrectly specified, we
15975 * output or return the warnings */
15976 if (found_problem) {
15978 /* We set flags for these issues in the parse loop above instead of
15979 * adding them to the list of warnings, because we can parse it
15980 * twice, and we only want one warning instance */
15982 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15985 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15987 if (has_semi_colon) {
15988 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15990 else if (! has_terminating_colon) {
15991 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15993 if (! has_terminating_bracket) {
15994 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15997 if ( posix_warnings
15999 && av_top_index(RExC_warn_text) > -1)
16001 *posix_warnings = RExC_warn_text;
16004 else if (class_number != OOB_NAMEDCLASS) {
16005 /* If it is a known class, return the class. The class number
16006 * #defines are structured so each complement is +1 to the normal
16008 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16010 else if (! check_only) {
16012 /* Here, it is an unrecognized class. This is an error (unless the
16013 * call is to check only, which we've already handled above) */
16014 const char * const complement_string = (complement)
16017 RExC_parse = (char *) p;
16018 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16020 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16024 return OOB_NAMEDCLASS;
16026 #undef ADD_POSIX_WARNING
16028 STATIC unsigned int
16029 S_regex_set_precedence(const U8 my_operator) {
16031 /* Returns the precedence in the (?[...]) construct of the input operator,
16032 * specified by its character representation. The precedence follows
16033 * general Perl rules, but it extends this so that ')' and ']' have (low)
16034 * precedence even though they aren't really operators */
16036 switch (my_operator) {
16052 NOT_REACHED; /* NOTREACHED */
16053 return 0; /* Silence compiler warning */
16056 STATIC regnode_offset
16057 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16058 I32 *flagp, U32 depth,
16059 char * const oregcomp_parse)
16061 /* Handle the (?[...]) construct to do set operations */
16063 U8 curchar; /* Current character being parsed */
16064 UV start, end; /* End points of code point ranges */
16065 SV* final = NULL; /* The end result inversion list */
16066 SV* result_string; /* 'final' stringified */
16067 AV* stack; /* stack of operators and operands not yet
16069 AV* fence_stack = NULL; /* A stack containing the positions in
16070 'stack' of where the undealt-with left
16071 parens would be if they were actually
16073 /* The 'volatile' is a workaround for an optimiser bug
16074 * in Solaris Studio 12.3. See RT #127455 */
16075 volatile IV fence = 0; /* Position of where most recent undealt-
16076 with left paren in stack is; -1 if none.
16078 STRLEN len; /* Temporary */
16079 regnode_offset node; /* Temporary, and final regnode returned by
16081 const bool save_fold = FOLD; /* Temporary */
16082 char *save_end, *save_parse; /* Temporaries */
16083 const bool in_locale = LOC; /* we turn off /l during processing */
16085 GET_RE_DEBUG_FLAGS_DECL;
16087 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16089 DEBUG_PARSE("xcls");
16092 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16095 /* The use of this operator implies /u. This is required so that the
16096 * compile time values are valid in all runtime cases */
16097 REQUIRE_UNI_RULES(flagp, 0);
16099 ckWARNexperimental(RExC_parse,
16100 WARN_EXPERIMENTAL__REGEX_SETS,
16101 "The regex_sets feature is experimental");
16103 /* Everything in this construct is a metacharacter. Operands begin with
16104 * either a '\' (for an escape sequence), or a '[' for a bracketed
16105 * character class. Any other character should be an operator, or
16106 * parenthesis for grouping. Both types of operands are handled by calling
16107 * regclass() to parse them. It is called with a parameter to indicate to
16108 * return the computed inversion list. The parsing here is implemented via
16109 * a stack. Each entry on the stack is a single character representing one
16110 * of the operators; or else a pointer to an operand inversion list. */
16112 #define IS_OPERATOR(a) SvIOK(a)
16113 #define IS_OPERAND(a) (! IS_OPERATOR(a))
16115 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
16116 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16117 * with pronouncing it called it Reverse Polish instead, but now that YOU
16118 * know how to pronounce it you can use the correct term, thus giving due
16119 * credit to the person who invented it, and impressing your geek friends.
16120 * Wikipedia says that the pronounciation of "Ł" has been changing so that
16121 * it is now more like an English initial W (as in wonk) than an L.)
16123 * This means that, for example, 'a | b & c' is stored on the stack as
16131 * where the numbers in brackets give the stack [array] element number.
16132 * In this implementation, parentheses are not stored on the stack.
16133 * Instead a '(' creates a "fence" so that the part of the stack below the
16134 * fence is invisible except to the corresponding ')' (this allows us to
16135 * replace testing for parens, by using instead subtraction of the fence
16136 * position). As new operands are processed they are pushed onto the stack
16137 * (except as noted in the next paragraph). New operators of higher
16138 * precedence than the current final one are inserted on the stack before
16139 * the lhs operand (so that when the rhs is pushed next, everything will be
16140 * in the correct positions shown above. When an operator of equal or
16141 * lower precedence is encountered in parsing, all the stacked operations
16142 * of equal or higher precedence are evaluated, leaving the result as the
16143 * top entry on the stack. This makes higher precedence operations
16144 * evaluate before lower precedence ones, and causes operations of equal
16145 * precedence to left associate.
16147 * The only unary operator '!' is immediately pushed onto the stack when
16148 * encountered. When an operand is encountered, if the top of the stack is
16149 * a '!", the complement is immediately performed, and the '!' popped. The
16150 * resulting value is treated as a new operand, and the logic in the
16151 * previous paragraph is executed. Thus in the expression
16153 * the stack looks like
16159 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16166 * A ')' is treated as an operator with lower precedence than all the
16167 * aforementioned ones, which causes all operations on the stack above the
16168 * corresponding '(' to be evaluated down to a single resultant operand.
16169 * Then the fence for the '(' is removed, and the operand goes through the
16170 * algorithm above, without the fence.
16172 * A separate stack is kept of the fence positions, so that the position of
16173 * the latest so-far unbalanced '(' is at the top of it.
16175 * The ']' ending the construct is treated as the lowest operator of all,
16176 * so that everything gets evaluated down to a single operand, which is the
16179 sv_2mortal((SV *)(stack = newAV()));
16180 sv_2mortal((SV *)(fence_stack = newAV()));
16182 while (RExC_parse < RExC_end) {
16183 I32 top_index; /* Index of top-most element in 'stack' */
16184 SV** top_ptr; /* Pointer to top 'stack' element */
16185 SV* current = NULL; /* To contain the current inversion list
16187 SV* only_to_avoid_leaks;
16189 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16190 TRUE /* Force /x */ );
16191 if (RExC_parse >= RExC_end) { /* Fail */
16195 curchar = UCHARAT(RExC_parse);
16199 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16200 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16201 DEBUG_U(dump_regex_sets_structures(pRExC_state,
16202 stack, fence, fence_stack));
16205 top_index = av_tindex_skip_len_mg(stack);
16208 SV** stacked_ptr; /* Ptr to something already on 'stack' */
16209 char stacked_operator; /* The topmost operator on the 'stack'. */
16210 SV* lhs; /* Operand to the left of the operator */
16211 SV* rhs; /* Operand to the right of the operator */
16212 SV* fence_ptr; /* Pointer to top element of the fence
16217 if ( RExC_parse < RExC_end - 2
16218 && UCHARAT(RExC_parse + 1) == '?'
16219 && UCHARAT(RExC_parse + 2) == '^')
16221 /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
16222 * This happens when we have some thing like
16224 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16226 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
16228 * Here we would be handling the interpolated
16229 * '$thai_or_lao'. We handle this by a recursive call to
16230 * ourselves which returns the inversion list the
16231 * interpolated expression evaluates to. We use the flags
16232 * from the interpolated pattern. */
16233 U32 save_flags = RExC_flags;
16234 const char * save_parse;
16236 RExC_parse += 2; /* Skip past the '(?' */
16237 save_parse = RExC_parse;
16239 /* Parse the flags for the '(?'. We already know the first
16240 * flag to parse is a '^' */
16241 parse_lparen_question_flags(pRExC_state);
16243 if ( RExC_parse >= RExC_end - 4
16244 || UCHARAT(RExC_parse) != ':'
16245 || UCHARAT(++RExC_parse) != '('
16246 || UCHARAT(++RExC_parse) != '?'
16247 || UCHARAT(++RExC_parse) != '[')
16250 /* In combination with the above, this moves the
16251 * pointer to the point just after the first erroneous
16253 if (RExC_parse >= RExC_end - 4) {
16254 RExC_parse = RExC_end;
16256 else if (RExC_parse != save_parse) {
16257 RExC_parse += (UTF)
16258 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
16261 vFAIL("Expecting '(?flags:(?[...'");
16264 /* Recurse, with the meat of the embedded expression */
16266 if (! handle_regex_sets(pRExC_state, ¤t, flagp,
16267 depth+1, oregcomp_parse))
16269 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16272 /* Here, 'current' contains the embedded expression's
16273 * inversion list, and RExC_parse points to the trailing
16274 * ']'; the next character should be the ')' */
16276 if (UCHARAT(RExC_parse) != ')')
16277 vFAIL("Expecting close paren for nested extended charclass");
16279 /* Then the ')' matching the original '(' handled by this
16280 * case: statement */
16282 if (UCHARAT(RExC_parse) != ')')
16283 vFAIL("Expecting close paren for wrapper for nested extended charclass");
16285 RExC_flags = save_flags;
16286 goto handle_operand;
16289 /* A regular '('. Look behind for illegal syntax */
16290 if (top_index - fence >= 0) {
16291 /* If the top entry on the stack is an operator, it had
16292 * better be a '!', otherwise the entry below the top
16293 * operand should be an operator */
16294 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
16295 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16296 || ( IS_OPERAND(*top_ptr)
16297 && ( top_index - fence < 1
16298 || ! (stacked_ptr = av_fetch(stack,
16301 || ! IS_OPERATOR(*stacked_ptr))))
16304 vFAIL("Unexpected '(' with no preceding operator");
16308 /* Stack the position of this undealt-with left paren */
16309 av_push(fence_stack, newSViv(fence));
16310 fence = top_index + 1;
16314 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16315 * multi-char folds are allowed. */
16316 if (!regclass(pRExC_state, flagp, depth+1,
16317 TRUE, /* means parse just the next thing */
16318 FALSE, /* don't allow multi-char folds */
16319 FALSE, /* don't silence non-portable warnings. */
16321 FALSE, /* Require return to be an ANYOF */
16324 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16325 goto regclass_failed;
16328 /* regclass() will return with parsing just the \ sequence,
16329 * leaving the parse pointer at the next thing to parse */
16331 goto handle_operand;
16333 case '[': /* Is a bracketed character class */
16335 /* See if this is a [:posix:] class. */
16336 bool is_posix_class = (OOB_NAMEDCLASS
16337 < handle_possible_posix(pRExC_state,
16341 TRUE /* checking only */));
16342 /* If it is a posix class, leave the parse pointer at the '['
16343 * to fool regclass() into thinking it is part of a
16344 * '[[:posix:]]'. */
16345 if (! is_posix_class) {
16349 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16350 * multi-char folds are allowed. */
16351 if (!regclass(pRExC_state, flagp, depth+1,
16352 is_posix_class, /* parse the whole char
16353 class only if not a
16355 FALSE, /* don't allow multi-char folds */
16356 TRUE, /* silence non-portable warnings. */
16358 FALSE, /* Require return to be an ANYOF */
16361 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16362 goto regclass_failed;
16369 /* function call leaves parse pointing to the ']', except if we
16371 if (is_posix_class) {
16375 goto handle_operand;
16379 if (top_index >= 1) {
16380 goto join_operators;
16383 /* Only a single operand on the stack: are done */
16387 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16388 if (UCHARAT(RExC_parse - 1) == ']') {
16392 vFAIL("Unexpected ')'");
16395 /* If nothing after the fence, is missing an operand */
16396 if (top_index - fence < 0) {
16400 /* If at least two things on the stack, treat this as an
16402 if (top_index - fence >= 1) {
16403 goto join_operators;
16406 /* Here only a single thing on the fenced stack, and there is a
16407 * fence. Get rid of it */
16408 fence_ptr = av_pop(fence_stack);
16410 fence = SvIV(fence_ptr);
16411 SvREFCNT_dec_NN(fence_ptr);
16418 /* Having gotten rid of the fence, we pop the operand at the
16419 * stack top and process it as a newly encountered operand */
16420 current = av_pop(stack);
16421 if (IS_OPERAND(current)) {
16422 goto handle_operand;
16434 /* These binary operators should have a left operand already
16436 if ( top_index - fence < 0
16437 || top_index - fence == 1
16438 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16439 || ! IS_OPERAND(*top_ptr))
16441 goto unexpected_binary;
16444 /* If only the one operand is on the part of the stack visible
16445 * to us, we just place this operator in the proper position */
16446 if (top_index - fence < 2) {
16448 /* Place the operator before the operand */
16450 SV* lhs = av_pop(stack);
16451 av_push(stack, newSVuv(curchar));
16452 av_push(stack, lhs);
16456 /* But if there is something else on the stack, we need to
16457 * process it before this new operator if and only if the
16458 * stacked operation has equal or higher precedence than the
16463 /* The operator on the stack is supposed to be below both its
16465 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16466 || IS_OPERAND(*stacked_ptr))
16468 /* But if not, it's legal and indicates we are completely
16469 * done if and only if we're currently processing a ']',
16470 * which should be the final thing in the expression */
16471 if (curchar == ']') {
16477 vFAIL2("Unexpected binary operator '%c' with no "
16478 "preceding operand", curchar);
16480 stacked_operator = (char) SvUV(*stacked_ptr);
16482 if (regex_set_precedence(curchar)
16483 > regex_set_precedence(stacked_operator))
16485 /* Here, the new operator has higher precedence than the
16486 * stacked one. This means we need to add the new one to
16487 * the stack to await its rhs operand (and maybe more
16488 * stuff). We put it before the lhs operand, leaving
16489 * untouched the stacked operator and everything below it
16491 lhs = av_pop(stack);
16492 assert(IS_OPERAND(lhs));
16494 av_push(stack, newSVuv(curchar));
16495 av_push(stack, lhs);
16499 /* Here, the new operator has equal or lower precedence than
16500 * what's already there. This means the operation already
16501 * there should be performed now, before the new one. */
16503 rhs = av_pop(stack);
16504 if (! IS_OPERAND(rhs)) {
16506 /* This can happen when a ! is not followed by an operand,
16507 * like in /(?[\t &!])/ */
16511 lhs = av_pop(stack);
16513 if (! IS_OPERAND(lhs)) {
16515 /* This can happen when there is an empty (), like in
16516 * /(?[[0]+()+])/ */
16520 switch (stacked_operator) {
16522 _invlist_intersection(lhs, rhs, &rhs);
16527 _invlist_union(lhs, rhs, &rhs);
16531 _invlist_subtract(lhs, rhs, &rhs);
16534 case '^': /* The union minus the intersection */
16539 _invlist_union(lhs, rhs, &u);
16540 _invlist_intersection(lhs, rhs, &i);
16541 _invlist_subtract(u, i, &rhs);
16542 SvREFCNT_dec_NN(i);
16543 SvREFCNT_dec_NN(u);
16549 /* Here, the higher precedence operation has been done, and the
16550 * result is in 'rhs'. We overwrite the stacked operator with
16551 * the result. Then we redo this code to either push the new
16552 * operator onto the stack or perform any higher precedence
16553 * stacked operation */
16554 only_to_avoid_leaks = av_pop(stack);
16555 SvREFCNT_dec(only_to_avoid_leaks);
16556 av_push(stack, rhs);
16559 case '!': /* Highest priority, right associative */
16561 /* If what's already at the top of the stack is another '!",
16562 * they just cancel each other out */
16563 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16564 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16566 only_to_avoid_leaks = av_pop(stack);
16567 SvREFCNT_dec(only_to_avoid_leaks);
16569 else { /* Otherwise, since it's right associative, just push
16571 av_push(stack, newSVuv(curchar));
16576 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16577 if (RExC_parse >= RExC_end) {
16580 vFAIL("Unexpected character");
16584 /* Here 'current' is the operand. If something is already on the
16585 * stack, we have to check if it is a !. But first, the code above
16586 * may have altered the stack in the time since we earlier set
16589 top_index = av_tindex_skip_len_mg(stack);
16590 if (top_index - fence >= 0) {
16591 /* If the top entry on the stack is an operator, it had better
16592 * be a '!', otherwise the entry below the top operand should
16593 * be an operator */
16594 top_ptr = av_fetch(stack, top_index, FALSE);
16596 if (IS_OPERATOR(*top_ptr)) {
16598 /* The only permissible operator at the top of the stack is
16599 * '!', which is applied immediately to this operand. */
16600 curchar = (char) SvUV(*top_ptr);
16601 if (curchar != '!') {
16602 SvREFCNT_dec(current);
16603 vFAIL2("Unexpected binary operator '%c' with no "
16604 "preceding operand", curchar);
16607 _invlist_invert(current);
16609 only_to_avoid_leaks = av_pop(stack);
16610 SvREFCNT_dec(only_to_avoid_leaks);
16612 /* And we redo with the inverted operand. This allows
16613 * handling multiple ! in a row */
16614 goto handle_operand;
16616 /* Single operand is ok only for the non-binary ')'
16618 else if ((top_index - fence == 0 && curchar != ')')
16619 || (top_index - fence > 0
16620 && (! (stacked_ptr = av_fetch(stack,
16623 || IS_OPERAND(*stacked_ptr))))
16625 SvREFCNT_dec(current);
16626 vFAIL("Operand with no preceding operator");
16630 /* Here there was nothing on the stack or the top element was
16631 * another operand. Just add this new one */
16632 av_push(stack, current);
16634 } /* End of switch on next parse token */
16636 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16637 } /* End of loop parsing through the construct */
16639 vFAIL("Syntax error in (?[...])");
16643 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16644 if (RExC_parse < RExC_end) {
16648 vFAIL("Unexpected ']' with no following ')' in (?[...");
16651 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16652 vFAIL("Unmatched (");
16655 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16656 || ((final = av_pop(stack)) == NULL)
16657 || ! IS_OPERAND(final)
16658 || ! is_invlist(final)
16659 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16662 SvREFCNT_dec(final);
16663 vFAIL("Incomplete expression within '(?[ ])'");
16666 /* Here, 'final' is the resultant inversion list from evaluating the
16667 * expression. Return it if so requested */
16668 if (return_invlist) {
16669 *return_invlist = final;
16673 /* Otherwise generate a resultant node, based on 'final'. regclass() is
16674 * expecting a string of ranges and individual code points */
16675 invlist_iterinit(final);
16676 result_string = newSVpvs("");
16677 while (invlist_iternext(final, &start, &end)) {
16678 if (start == end) {
16679 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16682 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16687 /* About to generate an ANYOF (or similar) node from the inversion list we
16688 * have calculated */
16689 save_parse = RExC_parse;
16690 RExC_parse = SvPV(result_string, len);
16691 save_end = RExC_end;
16692 RExC_end = RExC_parse + len;
16693 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16695 /* We turn off folding around the call, as the class we have constructed
16696 * already has all folding taken into consideration, and we don't want
16697 * regclass() to add to that */
16698 RExC_flags &= ~RXf_PMf_FOLD;
16699 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16700 * folds are allowed. */
16701 node = regclass(pRExC_state, flagp, depth+1,
16702 FALSE, /* means parse the whole char class */
16703 FALSE, /* don't allow multi-char folds */
16704 TRUE, /* silence non-portable warnings. The above may very
16705 well have generated non-portable code points, but
16706 they're valid on this machine */
16707 FALSE, /* similarly, no need for strict */
16709 /* We can optimize into something besides an ANYOF, except
16710 * under /l, which needs to be ANYOF because of runtime
16711 * checks for locale sanity, etc */
16717 RExC_parse = save_parse + 1;
16718 RExC_end = save_end;
16719 SvREFCNT_dec_NN(final);
16720 SvREFCNT_dec_NN(result_string);
16723 RExC_flags |= RXf_PMf_FOLD;
16727 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16728 goto regclass_failed;
16731 /* Fix up the node type if we are in locale. (We have pretended we are
16732 * under /u for the purposes of regclass(), as this construct will only
16733 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
16734 * as to cause any warnings about bad locales to be output in regexec.c),
16735 * and add the flag that indicates to check if not in a UTF-8 locale. The
16736 * reason we above forbid optimization into something other than an ANYOF
16737 * node is simply to minimize the number of code changes in regexec.c.
16738 * Otherwise we would have to create new EXACTish node types and deal with
16739 * them. This decision could be revisited should this construct become
16742 * (One might think we could look at the resulting ANYOF node and suppress
16743 * the flag if everything is above 255, as those would be UTF-8 only,
16744 * but this isn't true, as the components that led to that result could
16745 * have been locale-affected, and just happen to cancel each other out
16746 * under UTF-8 locales.) */
16748 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16750 assert(OP(REGNODE_p(node)) == ANYOF);
16752 OP(REGNODE_p(node)) = ANYOFL;
16753 ANYOF_FLAGS(REGNODE_p(node))
16754 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16757 nextchar(pRExC_state);
16758 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16762 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16766 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16769 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16770 AV * stack, const IV fence, AV * fence_stack)
16771 { /* Dumps the stacks in handle_regex_sets() */
16773 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16774 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16777 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16779 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16781 if (stack_top < 0) {
16782 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16785 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16786 for (i = stack_top; i >= 0; i--) {
16787 SV ** element_ptr = av_fetch(stack, i, FALSE);
16788 if (! element_ptr) {
16791 if (IS_OPERATOR(*element_ptr)) {
16792 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16793 (int) i, (int) SvIV(*element_ptr));
16796 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16797 sv_dump(*element_ptr);
16802 if (fence_stack_top < 0) {
16803 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16806 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16807 for (i = fence_stack_top; i >= 0; i--) {
16808 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16809 if (! element_ptr) {
16812 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16813 (int) i, (int) SvIV(*element_ptr));
16824 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16826 /* This adds the Latin1/above-Latin1 folding rules.
16828 * This should be called only for a Latin1-range code points, cp, which is
16829 * known to be involved in a simple fold with other code points above
16830 * Latin1. It would give false results if /aa has been specified.
16831 * Multi-char folds are outside the scope of this, and must be handled
16834 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16836 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16838 /* The rules that are valid for all Unicode versions are hard-coded in */
16843 add_cp_to_invlist(*invlist, KELVIN_SIGN);
16847 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16850 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16851 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16853 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16854 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16855 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16857 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16858 *invlist = add_cp_to_invlist(*invlist,
16859 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16862 default: /* Other code points are checked against the data for the
16863 current Unicode version */
16865 Size_t folds_count;
16866 unsigned int first_fold;
16867 const unsigned int * remaining_folds;
16871 folded_cp = toFOLD(cp);
16874 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16876 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16879 if (folded_cp > 255) {
16880 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16883 folds_count = _inverse_folds(folded_cp, &first_fold,
16885 if (folds_count == 0) {
16887 /* Use deprecated warning to increase the chances of this being
16889 ckWARN2reg_d(RExC_parse,
16890 "Perl folding rules are not up-to-date for 0x%02X;"
16891 " please use the perlbug utility to report;", cp);
16896 if (first_fold > 255) {
16897 *invlist = add_cp_to_invlist(*invlist, first_fold);
16899 for (i = 0; i < folds_count - 1; i++) {
16900 if (remaining_folds[i] > 255) {
16901 *invlist = add_cp_to_invlist(*invlist,
16902 remaining_folds[i]);
16912 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16914 /* Output the elements of the array given by '*posix_warnings' as REGEXP
16918 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16920 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16922 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16926 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16927 if (first_is_fatal) { /* Avoid leaking this */
16928 av_undef(posix_warnings); /* This isn't necessary if the
16929 array is mortal, but is a
16931 (void) sv_2mortal(msg);
16934 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16935 SvREFCNT_dec_NN(msg);
16938 UPDATE_WARNINGS_LOC(RExC_parse);
16941 PERL_STATIC_INLINE Size_t
16942 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
16944 const U8 * const start = s1;
16945 const U8 * const send = start + max;
16947 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
16949 while (s1 < send && *s1 == *s2) {
16958 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16960 /* This adds the string scalar <multi_string> to the array
16961 * <multi_char_matches>. <multi_string> is known to have exactly
16962 * <cp_count> code points in it. This is used when constructing a
16963 * bracketed character class and we find something that needs to match more
16964 * than a single character.
16966 * <multi_char_matches> is actually an array of arrays. Each top-level
16967 * element is an array that contains all the strings known so far that are
16968 * the same length. And that length (in number of code points) is the same
16969 * as the index of the top-level array. Hence, the [2] element is an
16970 * array, each element thereof is a string containing TWO code points;
16971 * while element [3] is for strings of THREE characters, and so on. Since
16972 * this is for multi-char strings there can never be a [0] nor [1] element.
16974 * When we rewrite the character class below, we will do so such that the
16975 * longest strings are written first, so that it prefers the longest
16976 * matching strings first. This is done even if it turns out that any
16977 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
16978 * Christiansen has agreed that this is ok. This makes the test for the
16979 * ligature 'ffi' come before the test for 'ff', for example */
16982 AV** this_array_ptr;
16984 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16986 if (! multi_char_matches) {
16987 multi_char_matches = newAV();
16990 if (av_exists(multi_char_matches, cp_count)) {
16991 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16992 this_array = *this_array_ptr;
16995 this_array = newAV();
16996 av_store(multi_char_matches, cp_count,
16999 av_push(this_array, multi_string);
17001 return multi_char_matches;
17004 /* The names of properties whose definitions are not known at compile time are
17005 * stored in this SV, after a constant heading. So if the length has been
17006 * changed since initialization, then there is a run-time definition. */
17007 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17008 (SvCUR(listsv) != initial_listsv_len)
17010 /* There is a restricted set of white space characters that are legal when
17011 * ignoring white space in a bracketed character class. This generates the
17012 * code to skip them.
17014 * There is a line below that uses the same white space criteria but is outside
17015 * this macro. Both here and there must use the same definition */
17016 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
17019 while (isBLANK_A(UCHARAT(p))) \
17026 STATIC regnode_offset
17027 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17028 const bool stop_at_1, /* Just parse the next thing, don't
17029 look for a full character class */
17030 bool allow_mutiple_chars,
17031 const bool silence_non_portable, /* Don't output warnings
17035 bool optimizable, /* ? Allow a non-ANYOF return
17037 SV** ret_invlist /* Return an inversion list, not a node */
17040 /* parse a bracketed class specification. Most of these will produce an
17041 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17042 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17043 * under /i with multi-character folds: it will be rewritten following the
17044 * paradigm of this example, where the <multi-fold>s are characters which
17045 * fold to multiple character sequences:
17046 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17047 * gets effectively rewritten as:
17048 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17049 * reg() gets called (recursively) on the rewritten version, and this
17050 * function will return what it constructs. (Actually the <multi-fold>s
17051 * aren't physically removed from the [abcdefghi], it's just that they are
17052 * ignored in the recursion by means of a flag:
17053 * <RExC_in_multi_char_class>.)
17055 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17056 * characters, with the corresponding bit set if that character is in the
17057 * list. For characters above this, an inversion list is used. There
17058 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17059 * determinable at compile time
17061 * On success, returns the offset at which any next node should be placed
17062 * into the regex engine program being compiled.
17064 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17065 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17070 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17072 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17073 regnode_offset ret = -1; /* Initialized to an illegal value */
17075 int namedclass = OOB_NAMEDCLASS;
17076 char *rangebegin = NULL;
17077 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17078 aren't available at the time this was called */
17079 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17080 than just initialized. */
17081 SV* properties = NULL; /* Code points that match \p{} \P{} */
17082 SV* posixes = NULL; /* Code points that match classes like [:word:],
17083 extended beyond the Latin1 range. These have to
17084 be kept separate from other code points for much
17085 of this function because their handling is
17086 different under /i, and for most classes under
17088 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17089 separate for a while from the non-complemented
17090 versions because of complications with /d
17092 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17093 treated more simply than the general case,
17094 leading to less compilation and execution
17096 UV element_count = 0; /* Number of distinct elements in the class.
17097 Optimizations may be possible if this is tiny */
17098 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17099 character; used under /i */
17101 char * stop_ptr = RExC_end; /* where to stop parsing */
17103 /* ignore unescaped whitespace? */
17104 const bool skip_white = cBOOL( ret_invlist
17105 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17107 /* inversion list of code points this node matches only when the target
17108 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17110 SV* upper_latin1_only_utf8_matches = NULL;
17112 /* Inversion list of code points this node matches regardless of things
17113 * like locale, folding, utf8ness of the target string */
17114 SV* cp_list = NULL;
17116 /* Like cp_list, but code points on this list need to be checked for things
17117 * that fold to/from them under /i */
17118 SV* cp_foldable_list = NULL;
17120 /* Like cp_list, but code points on this list are valid only when the
17121 * runtime locale is UTF-8 */
17122 SV* only_utf8_locale_list = NULL;
17124 /* In a range, if one of the endpoints is non-character-set portable,
17125 * meaning that it hard-codes a code point that may mean a different
17126 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17127 * mnemonic '\t' which each mean the same character no matter which
17128 * character set the platform is on. */
17129 unsigned int non_portable_endpoint = 0;
17131 /* Is the range unicode? which means on a platform that isn't 1-1 native
17132 * to Unicode (i.e. non-ASCII), each code point in it should be considered
17133 * to be a Unicode value. */
17134 bool unicode_range = FALSE;
17135 bool invert = FALSE; /* Is this class to be complemented */
17137 bool warn_super = ALWAYS_WARN_SUPER;
17139 const char * orig_parse = RExC_parse;
17141 /* This variable is used to mark where the end in the input is of something
17142 * that looks like a POSIX construct but isn't. During the parse, when
17143 * something looks like it could be such a construct is encountered, it is
17144 * checked for being one, but not if we've already checked this area of the
17145 * input. Only after this position is reached do we check again */
17146 char *not_posix_region_end = RExC_parse - 1;
17148 AV* posix_warnings = NULL;
17149 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17150 U8 op = END; /* The returned node-type, initialized to an impossible
17152 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
17153 U32 posixl = 0; /* bit field of posix classes matched under /l */
17156 /* Flags as to what things aren't knowable until runtime. (Note that these are
17157 * mutually exclusive.) */
17158 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
17159 haven't been defined as of yet */
17160 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
17162 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
17163 what gets folded */
17164 U32 has_runtime_dependency = 0; /* OR of the above flags */
17166 GET_RE_DEBUG_FLAGS_DECL;
17168 PERL_ARGS_ASSERT_REGCLASS;
17170 PERL_UNUSED_ARG(depth);
17174 /* If wants an inversion list returned, we can't optimize to something
17177 optimizable = FALSE;
17180 DEBUG_PARSE("clas");
17182 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
17183 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
17184 && UNICODE_DOT_DOT_VERSION == 0)
17185 allow_mutiple_chars = FALSE;
17188 /* We include the /i status at the beginning of this so that we can
17189 * know it at runtime */
17190 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17191 initial_listsv_len = SvCUR(listsv);
17192 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
17194 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17196 assert(RExC_parse <= RExC_end);
17198 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
17201 allow_mutiple_chars = FALSE;
17203 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17206 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17207 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17208 int maybe_class = handle_possible_posix(pRExC_state,
17210 ¬_posix_region_end,
17212 TRUE /* checking only */);
17213 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17214 ckWARN4reg(not_posix_region_end,
17215 "POSIX syntax [%c %c] belongs inside character classes%s",
17216 *RExC_parse, *RExC_parse,
17217 (maybe_class == OOB_NAMEDCLASS)
17218 ? ((POSIXCC_NOTYET(*RExC_parse))
17219 ? " (but this one isn't implemented)"
17220 : " (but this one isn't fully valid)")
17226 /* If the caller wants us to just parse a single element, accomplish this
17227 * by faking the loop ending condition */
17228 if (stop_at_1 && RExC_end > RExC_parse) {
17229 stop_ptr = RExC_parse + 1;
17232 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17233 if (UCHARAT(RExC_parse) == ']')
17234 goto charclassloop;
17238 if ( posix_warnings
17239 && av_tindex_skip_len_mg(posix_warnings) >= 0
17240 && RExC_parse > not_posix_region_end)
17242 /* Warnings about posix class issues are considered tentative until
17243 * we are far enough along in the parse that we can no longer
17244 * change our mind, at which point we output them. This is done
17245 * each time through the loop so that a later class won't zap them
17246 * before they have been dealt with. */
17247 output_posix_warnings(pRExC_state, posix_warnings);
17250 if (RExC_parse >= stop_ptr) {
17254 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17256 if (UCHARAT(RExC_parse) == ']') {
17262 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17263 save_value = value;
17264 save_prevvalue = prevvalue;
17267 rangebegin = RExC_parse;
17269 non_portable_endpoint = 0;
17271 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17272 value = utf8n_to_uvchr((U8*)RExC_parse,
17273 RExC_end - RExC_parse,
17274 &numlen, UTF8_ALLOW_DEFAULT);
17275 RExC_parse += numlen;
17278 value = UCHARAT(RExC_parse++);
17280 if (value == '[') {
17281 char * posix_class_end;
17282 namedclass = handle_possible_posix(pRExC_state,
17285 do_posix_warnings ? &posix_warnings : NULL,
17286 FALSE /* die if error */);
17287 if (namedclass > OOB_NAMEDCLASS) {
17289 /* If there was an earlier attempt to parse this particular
17290 * posix class, and it failed, it was a false alarm, as this
17291 * successful one proves */
17292 if ( posix_warnings
17293 && av_tindex_skip_len_mg(posix_warnings) >= 0
17294 && not_posix_region_end >= RExC_parse
17295 && not_posix_region_end <= posix_class_end)
17297 av_undef(posix_warnings);
17300 RExC_parse = posix_class_end;
17302 else if (namedclass == OOB_NAMEDCLASS) {
17303 not_posix_region_end = posix_class_end;
17306 namedclass = OOB_NAMEDCLASS;
17309 else if ( RExC_parse - 1 > not_posix_region_end
17310 && MAYBE_POSIXCC(value))
17312 (void) handle_possible_posix(
17314 RExC_parse - 1, /* -1 because parse has already been
17316 ¬_posix_region_end,
17317 do_posix_warnings ? &posix_warnings : NULL,
17318 TRUE /* checking only */);
17320 else if ( strict && ! skip_white
17321 && ( _generic_isCC(value, _CC_VERTSPACE)
17322 || is_VERTWS_cp_high(value)))
17324 vFAIL("Literal vertical space in [] is illegal except under /x");
17326 else if (value == '\\') {
17327 /* Is a backslash; get the code point of the char after it */
17329 if (RExC_parse >= RExC_end) {
17330 vFAIL("Unmatched [");
17333 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17334 value = utf8n_to_uvchr((U8*)RExC_parse,
17335 RExC_end - RExC_parse,
17336 &numlen, UTF8_ALLOW_DEFAULT);
17337 RExC_parse += numlen;
17340 value = UCHARAT(RExC_parse++);
17342 /* Some compilers cannot handle switching on 64-bit integer
17343 * values, therefore value cannot be an UV. Yes, this will
17344 * be a problem later if we want switch on Unicode.
17345 * A similar issue a little bit later when switching on
17346 * namedclass. --jhi */
17348 /* If the \ is escaping white space when white space is being
17349 * skipped, it means that that white space is wanted literally, and
17350 * is already in 'value'. Otherwise, need to translate the escape
17351 * into what it signifies. */
17352 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17354 case 'w': namedclass = ANYOF_WORDCHAR; break;
17355 case 'W': namedclass = ANYOF_NWORDCHAR; break;
17356 case 's': namedclass = ANYOF_SPACE; break;
17357 case 'S': namedclass = ANYOF_NSPACE; break;
17358 case 'd': namedclass = ANYOF_DIGIT; break;
17359 case 'D': namedclass = ANYOF_NDIGIT; break;
17360 case 'v': namedclass = ANYOF_VERTWS; break;
17361 case 'V': namedclass = ANYOF_NVERTWS; break;
17362 case 'h': namedclass = ANYOF_HORIZWS; break;
17363 case 'H': namedclass = ANYOF_NHORIZWS; break;
17364 case 'N': /* Handle \N{NAME} in class */
17366 const char * const backslash_N_beg = RExC_parse - 2;
17369 if (! grok_bslash_N(pRExC_state,
17370 NULL, /* No regnode */
17371 &value, /* Yes single value */
17372 &cp_count, /* Multiple code pt count */
17378 if (*flagp & NEED_UTF8)
17379 FAIL("panic: grok_bslash_N set NEED_UTF8");
17381 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17383 if (cp_count < 0) {
17384 vFAIL("\\N in a character class must be a named character: \\N{...}");
17386 else if (cp_count == 0) {
17387 ckWARNreg(RExC_parse,
17388 "Ignoring zero length \\N{} in character class");
17390 else { /* cp_count > 1 */
17391 assert(cp_count > 1);
17392 if (! RExC_in_multi_char_class) {
17393 if ( ! allow_mutiple_chars
17396 || *RExC_parse == '-')
17400 vFAIL("\\N{} here is restricted to one character");
17402 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17403 break; /* <value> contains the first code
17404 point. Drop out of the switch to
17408 SV * multi_char_N = newSVpvn(backslash_N_beg,
17409 RExC_parse - backslash_N_beg);
17411 = add_multi_match(multi_char_matches,
17416 } /* End of cp_count != 1 */
17418 /* This element should not be processed further in this
17421 value = save_value;
17422 prevvalue = save_prevvalue;
17423 continue; /* Back to top of loop to get next char */
17426 /* Here, is a single code point, and <value> contains it */
17427 unicode_range = TRUE; /* \N{} are Unicode */
17435 /* \p means they want Unicode semantics */
17436 REQUIRE_UNI_RULES(flagp, 0);
17438 if (RExC_parse >= RExC_end)
17439 vFAIL2("Empty \\%c", (U8)value);
17440 if (*RExC_parse == '{') {
17441 const U8 c = (U8)value;
17442 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17445 vFAIL2("Missing right brace on \\%c{}", c);
17450 /* White space is allowed adjacent to the braces and after
17451 * any '^', even when not under /x */
17452 while (isSPACE(*RExC_parse)) {
17456 if (UCHARAT(RExC_parse) == '^') {
17458 /* toggle. (The rhs xor gets the single bit that
17459 * differs between P and p; the other xor inverts just
17461 value ^= 'P' ^ 'p';
17464 while (isSPACE(*RExC_parse)) {
17469 if (e == RExC_parse)
17470 vFAIL2("Empty \\%c{}", c);
17472 n = e - RExC_parse;
17473 while (isSPACE(*(RExC_parse + n - 1)))
17476 } /* The \p isn't immediately followed by a '{' */
17477 else if (! isALPHA(*RExC_parse)) {
17478 RExC_parse += (UTF)
17479 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17481 vFAIL2("Character following \\%c must be '{' or a "
17482 "single-character Unicode property name",
17490 char* name = RExC_parse;
17492 /* Any message returned about expanding the definition */
17493 SV* msg = newSVpvs_flags("", SVs_TEMP);
17495 /* If set TRUE, the property is user-defined as opposed to
17496 * official Unicode */
17497 bool user_defined = FALSE;
17499 SV * prop_definition = parse_uniprop_string(
17500 name, n, UTF, FOLD,
17501 FALSE, /* This is compile-time */
17503 /* We can't defer this defn when
17504 * the full result is required in
17506 ! cBOOL(ret_invlist),
17512 if (SvCUR(msg)) { /* Assumes any error causes a msg */
17513 assert(prop_definition == NULL);
17514 RExC_parse = e + 1;
17515 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
17516 thing so, or else the display is
17520 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17521 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17522 SvCUR(msg), SvPVX(msg)));
17525 if (! is_invlist(prop_definition)) {
17527 /* Here, the definition isn't known, so we have gotten
17528 * returned a string that will be evaluated if and when
17529 * encountered at runtime. We add it to the list of
17530 * such properties, along with whether it should be
17531 * complemented or not */
17532 if (value == 'P') {
17533 sv_catpvs(listsv, "!");
17536 sv_catpvs(listsv, "+");
17538 sv_catsv(listsv, prop_definition);
17540 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17542 /* We don't know yet what this matches, so have to flag
17544 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17547 assert (prop_definition && is_invlist(prop_definition));
17549 /* Here we do have the complete property definition
17551 * Temporary workaround for [perl #133136]. For this
17552 * precise input that is in the .t that is failing,
17553 * load utf8.pm, which is what the test wants, so that
17554 * that .t passes */
17555 if ( memEQs(RExC_start, e + 1 - RExC_start,
17557 && ! hv_common(GvHVn(PL_incgv),
17559 "utf8.pm", sizeof("utf8.pm") - 1,
17560 0, HV_FETCH_ISEXISTS, NULL, 0))
17562 require_pv("utf8.pm");
17565 if (! user_defined &&
17566 /* We warn on matching an above-Unicode code point
17567 * if the match would return true, except don't
17568 * warn for \p{All}, which has exactly one element
17570 (_invlist_contains_cp(prop_definition, 0x110000)
17571 && (! (_invlist_len(prop_definition) == 1
17572 && *invlist_array(prop_definition) == 0))))
17577 /* Invert if asking for the complement */
17578 if (value == 'P') {
17579 _invlist_union_complement_2nd(properties,
17584 _invlist_union(properties, prop_definition, &properties);
17589 RExC_parse = e + 1;
17590 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17594 case 'n': value = '\n'; break;
17595 case 'r': value = '\r'; break;
17596 case 't': value = '\t'; break;
17597 case 'f': value = '\f'; break;
17598 case 'b': value = '\b'; break;
17599 case 'e': value = ESC_NATIVE; break;
17600 case 'a': value = '\a'; break;
17602 RExC_parse--; /* function expects to be pointed at the 'o' */
17604 const char* error_msg;
17605 bool valid = grok_bslash_o(&RExC_parse,
17609 TO_OUTPUT_WARNINGS(RExC_parse),
17611 silence_non_portable,
17616 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17618 non_portable_endpoint++;
17621 RExC_parse--; /* function expects to be pointed at the 'x' */
17623 const char* error_msg;
17624 bool valid = grok_bslash_x(&RExC_parse,
17628 TO_OUTPUT_WARNINGS(RExC_parse),
17630 silence_non_portable,
17635 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17637 non_portable_endpoint++;
17640 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17641 UPDATE_WARNINGS_LOC(RExC_parse);
17643 non_portable_endpoint++;
17645 case '0': case '1': case '2': case '3': case '4':
17646 case '5': case '6': case '7':
17648 /* Take 1-3 octal digits */
17649 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17650 numlen = (strict) ? 4 : 3;
17651 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17652 RExC_parse += numlen;
17655 RExC_parse += (UTF)
17656 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17658 vFAIL("Need exactly 3 octal digits");
17660 else if ( numlen < 3 /* like \08, \178 */
17661 && RExC_parse < RExC_end
17662 && isDIGIT(*RExC_parse)
17663 && ckWARN(WARN_REGEXP))
17665 reg_warn_non_literal_string(
17667 form_short_octal_warning(RExC_parse, numlen));
17670 non_portable_endpoint++;
17674 /* Allow \_ to not give an error */
17675 if (isWORDCHAR(value) && value != '_') {
17677 vFAIL2("Unrecognized escape \\%c in character class",
17681 ckWARN2reg(RExC_parse,
17682 "Unrecognized escape \\%c in character class passed through",
17687 } /* End of switch on char following backslash */
17688 } /* end of handling backslash escape sequences */
17690 /* Here, we have the current token in 'value' */
17692 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17695 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
17696 * literal, as is the character that began the false range, i.e.
17697 * the 'a' in the examples */
17699 const int w = (RExC_parse >= rangebegin)
17700 ? RExC_parse - rangebegin
17704 "False [] range \"%" UTF8f "\"",
17705 UTF8fARG(UTF, w, rangebegin));
17708 ckWARN2reg(RExC_parse,
17709 "False [] range \"%" UTF8f "\"",
17710 UTF8fARG(UTF, w, rangebegin));
17711 cp_list = add_cp_to_invlist(cp_list, '-');
17712 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17716 range = 0; /* this was not a true range */
17717 element_count += 2; /* So counts for three values */
17720 classnum = namedclass_to_classnum(namedclass);
17722 if (LOC && namedclass < ANYOF_POSIXL_MAX
17723 #ifndef HAS_ISASCII
17724 && classnum != _CC_ASCII
17727 SV* scratch_list = NULL;
17729 /* What the Posix classes (like \w, [:space:]) match isn't
17730 * generally knowable under locale until actual match time. A
17731 * special node is used for these which has extra space for a
17732 * bitmap, with a bit reserved for each named class that is to
17733 * be matched against. (This isn't needed for \p{} and
17734 * pseudo-classes, as they are not affected by locale, and
17735 * hence are dealt with separately.) However, if a named class
17736 * and its complement are both present, then it matches
17737 * everything, and there is no runtime dependency. Odd numbers
17738 * are the complements of the next lower number, so xor works.
17739 * (Note that something like [\w\D] should match everything,
17740 * because \d should be a proper subset of \w. But rather than
17741 * trust that the locale is well behaved, we leave this to
17742 * runtime to sort out) */
17743 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
17744 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
17745 POSIXL_ZERO(posixl);
17746 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
17747 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
17748 continue; /* We could ignore the rest of the class, but
17749 best to parse it for any errors */
17751 else { /* Here, isn't the complement of any already parsed
17753 POSIXL_SET(posixl, namedclass);
17754 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17755 anyof_flags |= ANYOF_MATCHES_POSIXL;
17757 /* The above-Latin1 characters are not subject to locale
17758 * rules. Just add them to the unconditionally-matched
17761 /* Get the list of the above-Latin1 code points this
17763 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17764 PL_XPosix_ptrs[classnum],
17766 /* Odd numbers are complements,
17767 * like NDIGIT, NASCII, ... */
17768 namedclass % 2 != 0,
17770 /* Checking if 'cp_list' is NULL first saves an extra
17771 * clone. Its reference count will be decremented at the
17772 * next union, etc, or if this is the only instance, at the
17773 * end of the routine */
17775 cp_list = scratch_list;
17778 _invlist_union(cp_list, scratch_list, &cp_list);
17779 SvREFCNT_dec_NN(scratch_list);
17781 continue; /* Go get next character */
17786 /* Here, is not /l, or is a POSIX class for which /l doesn't
17787 * matter (or is a Unicode property, which is skipped here). */
17788 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
17789 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17791 /* Here, should be \h, \H, \v, or \V. None of /d, /i
17792 * nor /l make a difference in what these match,
17793 * therefore we just add what they match to cp_list. */
17794 if (classnum != _CC_VERTSPACE) {
17795 assert( namedclass == ANYOF_HORIZWS
17796 || namedclass == ANYOF_NHORIZWS);
17798 /* It turns out that \h is just a synonym for
17800 classnum = _CC_BLANK;
17803 _invlist_union_maybe_complement_2nd(
17805 PL_XPosix_ptrs[classnum],
17806 namedclass % 2 != 0, /* Complement if odd
17807 (NHORIZWS, NVERTWS)
17812 else if ( AT_LEAST_UNI_SEMANTICS
17813 || classnum == _CC_ASCII
17814 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
17815 || classnum == _CC_XDIGIT)))
17817 /* We usually have to worry about /d affecting what POSIX
17818 * classes match, with special code needed because we won't
17819 * know until runtime what all matches. But there is no
17820 * extra work needed under /u and /a; and [:ascii:] is
17821 * unaffected by /d; and :digit: and :xdigit: don't have
17822 * runtime differences under /d. So we can special case
17823 * these, and avoid some extra work below, and at runtime.
17825 _invlist_union_maybe_complement_2nd(
17827 ((AT_LEAST_ASCII_RESTRICTED)
17828 ? PL_Posix_ptrs[classnum]
17829 : PL_XPosix_ptrs[classnum]),
17830 namedclass % 2 != 0,
17833 else { /* Garden variety class. If is NUPPER, NALPHA, ...
17834 complement and use nposixes */
17835 SV** posixes_ptr = namedclass % 2 == 0
17838 _invlist_union_maybe_complement_2nd(
17840 PL_XPosix_ptrs[classnum],
17841 namedclass % 2 != 0,
17845 } /* end of namedclass \blah */
17847 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17849 /* If 'range' is set, 'value' is the ending of a range--check its
17850 * validity. (If value isn't a single code point in the case of a
17851 * range, we should have figured that out above in the code that
17852 * catches false ranges). Later, we will handle each individual code
17853 * point in the range. If 'range' isn't set, this could be the
17854 * beginning of a range, so check for that by looking ahead to see if
17855 * the next real character to be processed is the range indicator--the
17860 /* For unicode ranges, we have to test that the Unicode as opposed
17861 * to the native values are not decreasing. (Above 255, there is
17862 * no difference between native and Unicode) */
17863 if (unicode_range && prevvalue < 255 && value < 255) {
17864 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17865 goto backwards_range;
17870 if (prevvalue > value) /* b-a */ {
17875 w = RExC_parse - rangebegin;
17877 "Invalid [] range \"%" UTF8f "\"",
17878 UTF8fARG(UTF, w, rangebegin));
17879 NOT_REACHED; /* NOTREACHED */
17883 prevvalue = value; /* save the beginning of the potential range */
17884 if (! stop_at_1 /* Can't be a range if parsing just one thing */
17885 && *RExC_parse == '-')
17887 char* next_char_ptr = RExC_parse + 1;
17889 /* Get the next real char after the '-' */
17890 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17892 /* If the '-' is at the end of the class (just before the ']',
17893 * it is a literal minus; otherwise it is a range */
17894 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17895 RExC_parse = next_char_ptr;
17897 /* a bad range like \w-, [:word:]- ? */
17898 if (namedclass > OOB_NAMEDCLASS) {
17899 if (strict || ckWARN(WARN_REGEXP)) {
17900 const int w = RExC_parse >= rangebegin
17901 ? RExC_parse - rangebegin
17904 vFAIL4("False [] range \"%*.*s\"",
17909 "False [] range \"%*.*s\"",
17913 cp_list = add_cp_to_invlist(cp_list, '-');
17916 range = 1; /* yeah, it's a range! */
17917 continue; /* but do it the next time */
17922 if (namedclass > OOB_NAMEDCLASS) {
17926 /* Here, we have a single value this time through the loop, and
17927 * <prevvalue> is the beginning of the range, if any; or <value> if
17930 /* non-Latin1 code point implies unicode semantics. */
17932 REQUIRE_UNI_RULES(flagp, 0);
17935 /* Ready to process either the single value, or the completed range.
17936 * For single-valued non-inverted ranges, we consider the possibility
17937 * of multi-char folds. (We made a conscious decision to not do this
17938 * for the other cases because it can often lead to non-intuitive
17939 * results. For example, you have the peculiar case that:
17940 * "s s" =~ /^[^\xDF]+$/i => Y
17941 * "ss" =~ /^[^\xDF]+$/i => N
17943 * See [perl #89750] */
17944 if (FOLD && allow_mutiple_chars && value == prevvalue) {
17945 if ( value == LATIN_SMALL_LETTER_SHARP_S
17946 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17949 /* Here <value> is indeed a multi-char fold. Get what it is */
17951 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17954 UV folded = _to_uni_fold_flags(
17958 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17959 ? FOLD_FLAGS_NOMIX_ASCII
17963 /* Here, <folded> should be the first character of the
17964 * multi-char fold of <value>, with <foldbuf> containing the
17965 * whole thing. But, if this fold is not allowed (because of
17966 * the flags), <fold> will be the same as <value>, and should
17967 * be processed like any other character, so skip the special
17969 if (folded != value) {
17971 /* Skip if we are recursed, currently parsing the class
17972 * again. Otherwise add this character to the list of
17973 * multi-char folds. */
17974 if (! RExC_in_multi_char_class) {
17975 STRLEN cp_count = utf8_length(foldbuf,
17976 foldbuf + foldlen);
17977 SV* multi_fold = sv_2mortal(newSVpvs(""));
17979 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17982 = add_multi_match(multi_char_matches,
17988 /* This element should not be processed further in this
17991 value = save_value;
17992 prevvalue = save_prevvalue;
17998 if (strict && ckWARN(WARN_REGEXP)) {
18001 /* If the range starts above 255, everything is portable and
18002 * likely to be so for any forseeable character set, so don't
18004 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18005 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18007 else if (prevvalue != value) {
18009 /* Under strict, ranges that stop and/or end in an ASCII
18010 * printable should have each end point be a portable value
18011 * for it (preferably like 'A', but we don't warn if it is
18012 * a (portable) Unicode name or code point), and the range
18013 * must be be all digits or all letters of the same case.
18014 * Otherwise, the range is non-portable and unclear as to
18015 * what it contains */
18016 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
18017 && ( non_portable_endpoint
18018 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18019 || (isLOWER_A(prevvalue) && isLOWER_A(value))
18020 || (isUPPER_A(prevvalue) && isUPPER_A(value))
18022 vWARN(RExC_parse, "Ranges of ASCII printables should"
18023 " be some subset of \"0-9\","
18024 " \"A-Z\", or \"a-z\"");
18026 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18027 SSize_t index_start;
18028 SSize_t index_final;
18030 /* But the nature of Unicode and languages mean we
18031 * can't do the same checks for above-ASCII ranges,
18032 * except in the case of digit ones. These should
18033 * contain only digits from the same group of 10. The
18034 * ASCII case is handled just above. Hence here, the
18035 * range could be a range of digits. First some
18036 * unlikely special cases. Grandfather in that a range
18037 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18038 * if its starting value is one of the 10 digits prior
18039 * to it. This is because it is an alternate way of
18040 * writing 19D1, and some people may expect it to be in
18041 * that group. But it is bad, because it won't give
18042 * the expected results. In Unicode 5.2 it was
18043 * considered to be in that group (of 11, hence), but
18044 * this was fixed in the next version */
18046 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18047 goto warn_bad_digit_range;
18049 else if (UNLIKELY( prevvalue >= 0x1D7CE
18050 && value <= 0x1D7FF))
18052 /* This is the only other case currently in Unicode
18053 * where the algorithm below fails. The code
18054 * points just above are the end points of a single
18055 * range containing only decimal digits. It is 5
18056 * different series of 0-9. All other ranges of
18057 * digits currently in Unicode are just a single
18058 * series. (And mktables will notify us if a later
18059 * Unicode version breaks this.)
18061 * If the range being checked is at most 9 long,
18062 * and the digit values represented are in
18063 * numerical order, they are from the same series.
18065 if ( value - prevvalue > 9
18066 || ((( value - 0x1D7CE) % 10)
18067 <= (prevvalue - 0x1D7CE) % 10))
18069 goto warn_bad_digit_range;
18074 /* For all other ranges of digits in Unicode, the
18075 * algorithm is just to check if both end points
18076 * are in the same series, which is the same range.
18078 index_start = _invlist_search(
18079 PL_XPosix_ptrs[_CC_DIGIT],
18082 /* Warn if the range starts and ends with a digit,
18083 * and they are not in the same group of 10. */
18084 if ( index_start >= 0
18085 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18087 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18088 value)) != index_start
18089 && index_final >= 0
18090 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18092 warn_bad_digit_range:
18093 vWARN(RExC_parse, "Ranges of digits should be"
18094 " from the same group of"
18101 if ((! range || prevvalue == value) && non_portable_endpoint) {
18102 if (isPRINT_A(value)) {
18105 if (isBACKSLASHED_PUNCT(value)) {
18106 literal[d++] = '\\';
18108 literal[d++] = (char) value;
18109 literal[d++] = '\0';
18112 "\"%.*s\" is more clearly written simply as \"%s\"",
18113 (int) (RExC_parse - rangebegin),
18118 else if (isMNEMONIC_CNTRL(value)) {
18120 "\"%.*s\" is more clearly written simply as \"%s\"",
18121 (int) (RExC_parse - rangebegin),
18123 cntrl_to_mnemonic((U8) value)
18129 /* Deal with this element of the class */
18132 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18135 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18136 * that don't require special handling, we can just add the range like
18137 * we do for ASCII platforms */
18138 if ((UNLIKELY(prevvalue == 0) && value >= 255)
18139 || ! (prevvalue < 256
18141 || (! non_portable_endpoint
18142 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18143 || (isUPPER_A(prevvalue)
18144 && isUPPER_A(value)))))))
18146 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18150 /* Here, requires special handling. This can be because it is a
18151 * range whose code points are considered to be Unicode, and so
18152 * must be individually translated into native, or because its a
18153 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18154 * EBCDIC, but we have defined them to include only the "expected"
18155 * upper or lower case ASCII alphabetics. Subranges above 255 are
18156 * the same in native and Unicode, so can be added as a range */
18157 U8 start = NATIVE_TO_LATIN1(prevvalue);
18159 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18160 for (j = start; j <= end; j++) {
18161 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18164 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18170 range = 0; /* this range (if it was one) is done now */
18171 } /* End of loop through all the text within the brackets */
18173 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18174 output_posix_warnings(pRExC_state, posix_warnings);
18177 /* If anything in the class expands to more than one character, we have to
18178 * deal with them by building up a substitute parse string, and recursively
18179 * calling reg() on it, instead of proceeding */
18180 if (multi_char_matches) {
18181 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18184 char *save_end = RExC_end;
18185 char *save_parse = RExC_parse;
18186 char *save_start = RExC_start;
18187 Size_t constructed_prefix_len = 0; /* This gives the length of the
18188 constructed portion of the
18189 substitute parse. */
18190 bool first_time = TRUE; /* First multi-char occurrence doesn't get
18195 /* Only one level of recursion allowed */
18196 assert(RExC_copy_start_in_constructed == RExC_precomp);
18198 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
18199 because too confusing */
18201 sv_catpvs(substitute_parse, "(?:");
18205 /* Look at the longest folds first */
18206 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18211 if (av_exists(multi_char_matches, cp_count)) {
18212 AV** this_array_ptr;
18215 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18217 while ((this_sequence = av_pop(*this_array_ptr)) !=
18220 if (! first_time) {
18221 sv_catpvs(substitute_parse, "|");
18223 first_time = FALSE;
18225 sv_catpv(substitute_parse, SvPVX(this_sequence));
18230 /* If the character class contains anything else besides these
18231 * multi-character folds, have to include it in recursive parsing */
18232 if (element_count) {
18233 sv_catpvs(substitute_parse, "|[");
18234 constructed_prefix_len = SvCUR(substitute_parse);
18235 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18237 /* Put in a closing ']' only if not going off the end, as otherwise
18238 * we are adding something that really isn't there */
18239 if (RExC_parse < RExC_end) {
18240 sv_catpvs(substitute_parse, "]");
18244 sv_catpvs(substitute_parse, ")");
18247 /* This is a way to get the parse to skip forward a whole named
18248 * sequence instead of matching the 2nd character when it fails the
18250 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18254 /* Set up the data structure so that any errors will be properly
18255 * reported. See the comments at the definition of
18256 * REPORT_LOCATION_ARGS for details */
18257 RExC_copy_start_in_input = (char *) orig_parse;
18258 RExC_start = RExC_parse = SvPV(substitute_parse, len);
18259 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18260 RExC_end = RExC_parse + len;
18261 RExC_in_multi_char_class = 1;
18263 ret = reg(pRExC_state, 1, ®_flags, depth+1);
18265 *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
18267 /* And restore so can parse the rest of the pattern */
18268 RExC_parse = save_parse;
18269 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18270 RExC_end = save_end;
18271 RExC_in_multi_char_class = 0;
18272 SvREFCNT_dec_NN(multi_char_matches);
18276 /* If folding, we calculate all characters that could fold to or from the
18277 * ones already on the list */
18278 if (cp_foldable_list) {
18280 UV start, end; /* End points of code point ranges */
18282 SV* fold_intersection = NULL;
18285 /* Our calculated list will be for Unicode rules. For locale
18286 * matching, we have to keep a separate list that is consulted at
18287 * runtime only when the locale indicates Unicode rules (and we
18288 * don't include potential matches in the ASCII/Latin1 range, as
18289 * any code point could fold to any other, based on the run-time
18290 * locale). For non-locale, we just use the general list */
18292 use_list = &only_utf8_locale_list;
18295 use_list = &cp_list;
18298 /* Only the characters in this class that participate in folds need
18299 * be checked. Get the intersection of this class and all the
18300 * possible characters that are foldable. This can quickly narrow
18301 * down a large class */
18302 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18303 &fold_intersection);
18305 /* Now look at the foldable characters in this class individually */
18306 invlist_iterinit(fold_intersection);
18307 while (invlist_iternext(fold_intersection, &start, &end)) {
18311 /* Look at every character in the range */
18312 for (j = start; j <= end; j++) {
18313 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18316 Size_t folds_count;
18317 unsigned int first_fold;
18318 const unsigned int * remaining_folds;
18322 /* Under /l, we don't know what code points below 256
18323 * fold to, except we do know the MICRO SIGN folds to
18324 * an above-255 character if the locale is UTF-8, so we
18325 * add it to the special list (in *use_list) Otherwise
18326 * we know now what things can match, though some folds
18327 * are valid under /d only if the target is UTF-8.
18328 * Those go in a separate list */
18329 if ( IS_IN_SOME_FOLD_L1(j)
18330 && ! (LOC && j != MICRO_SIGN))
18333 /* ASCII is always matched; non-ASCII is matched
18334 * only under Unicode rules (which could happen
18335 * under /l if the locale is a UTF-8 one */
18336 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18337 *use_list = add_cp_to_invlist(*use_list,
18338 PL_fold_latin1[j]);
18340 else if (j != PL_fold_latin1[j]) {
18341 upper_latin1_only_utf8_matches
18342 = add_cp_to_invlist(
18343 upper_latin1_only_utf8_matches,
18344 PL_fold_latin1[j]);
18348 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18349 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18351 add_above_Latin1_folds(pRExC_state,
18358 /* Here is an above Latin1 character. We don't have the
18359 * rules hard-coded for it. First, get its fold. This is
18360 * the simple fold, as the multi-character folds have been
18361 * handled earlier and separated out */
18362 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18363 (ASCII_FOLD_RESTRICTED)
18364 ? FOLD_FLAGS_NOMIX_ASCII
18367 /* Single character fold of above Latin1. Add everything
18368 * in its fold closure to the list that this node should
18370 folds_count = _inverse_folds(folded, &first_fold,
18372 for (k = 0; k <= folds_count; k++) {
18373 UV c = (k == 0) /* First time through use itself */
18375 : (k == 1) /* 2nd time use, the first fold */
18378 /* Then the remaining ones */
18379 : remaining_folds[k-2];
18381 /* /aa doesn't allow folds between ASCII and non- */
18382 if (( ASCII_FOLD_RESTRICTED
18383 && (isASCII(c) != isASCII(j))))
18388 /* Folds under /l which cross the 255/256 boundary are
18389 * added to a separate list. (These are valid only
18390 * when the locale is UTF-8.) */
18391 if (c < 256 && LOC) {
18392 *use_list = add_cp_to_invlist(*use_list, c);
18396 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18398 cp_list = add_cp_to_invlist(cp_list, c);
18401 /* Similarly folds involving non-ascii Latin1
18402 * characters under /d are added to their list */
18403 upper_latin1_only_utf8_matches
18404 = add_cp_to_invlist(
18405 upper_latin1_only_utf8_matches,
18411 SvREFCNT_dec_NN(fold_intersection);
18414 /* Now that we have finished adding all the folds, there is no reason
18415 * to keep the foldable list separate */
18416 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18417 SvREFCNT_dec_NN(cp_foldable_list);
18420 /* And combine the result (if any) with any inversion lists from posix
18421 * classes. The lists are kept separate up to now because we don't want to
18422 * fold the classes */
18423 if (simple_posixes) { /* These are the classes known to be unaffected by
18426 _invlist_union(cp_list, simple_posixes, &cp_list);
18427 SvREFCNT_dec_NN(simple_posixes);
18430 cp_list = simple_posixes;
18433 if (posixes || nposixes) {
18434 if (! DEPENDS_SEMANTICS) {
18436 /* For everything but /d, we can just add the current 'posixes' and
18437 * 'nposixes' to the main list */
18440 _invlist_union(cp_list, posixes, &cp_list);
18441 SvREFCNT_dec_NN(posixes);
18449 _invlist_union(cp_list, nposixes, &cp_list);
18450 SvREFCNT_dec_NN(nposixes);
18453 cp_list = nposixes;
18458 /* Under /d, things like \w match upper Latin1 characters only if
18459 * the target string is in UTF-8. But things like \W match all the
18460 * upper Latin1 characters if the target string is not in UTF-8.
18462 * Handle the case with something like \W separately */
18464 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18466 /* A complemented posix class matches all upper Latin1
18467 * characters if not in UTF-8. And it matches just certain
18468 * ones when in UTF-8. That means those certain ones are
18469 * matched regardless, so can just be added to the
18470 * unconditional list */
18472 _invlist_union(cp_list, nposixes, &cp_list);
18473 SvREFCNT_dec_NN(nposixes);
18477 cp_list = nposixes;
18480 /* Likewise for 'posixes' */
18481 _invlist_union(posixes, cp_list, &cp_list);
18482 SvREFCNT_dec(posixes);
18484 /* Likewise for anything else in the range that matched only
18486 if (upper_latin1_only_utf8_matches) {
18487 _invlist_union(cp_list,
18488 upper_latin1_only_utf8_matches,
18490 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18491 upper_latin1_only_utf8_matches = NULL;
18494 /* If we don't match all the upper Latin1 characters regardless
18495 * of UTF-8ness, we have to set a flag to match the rest when
18497 _invlist_subtract(only_non_utf8_list, cp_list,
18498 &only_non_utf8_list);
18499 if (_invlist_len(only_non_utf8_list) != 0) {
18500 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18502 SvREFCNT_dec_NN(only_non_utf8_list);
18505 /* Here there were no complemented posix classes. That means
18506 * the upper Latin1 characters in 'posixes' match only when the
18507 * target string is in UTF-8. So we have to add them to the
18508 * list of those types of code points, while adding the
18509 * remainder to the unconditional list.
18511 * First calculate what they are */
18512 SV* nonascii_but_latin1_properties = NULL;
18513 _invlist_intersection(posixes, PL_UpperLatin1,
18514 &nonascii_but_latin1_properties);
18516 /* And add them to the final list of such characters. */
18517 _invlist_union(upper_latin1_only_utf8_matches,
18518 nonascii_but_latin1_properties,
18519 &upper_latin1_only_utf8_matches);
18521 /* Remove them from what now becomes the unconditional list */
18522 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18525 /* And add those unconditional ones to the final list */
18527 _invlist_union(cp_list, posixes, &cp_list);
18528 SvREFCNT_dec_NN(posixes);
18535 SvREFCNT_dec(nonascii_but_latin1_properties);
18537 /* Get rid of any characters from the conditional list that we
18538 * now know are matched unconditionally, which may make that
18540 _invlist_subtract(upper_latin1_only_utf8_matches,
18542 &upper_latin1_only_utf8_matches);
18543 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18544 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18545 upper_latin1_only_utf8_matches = NULL;
18551 /* And combine the result (if any) with any inversion list from properties.
18552 * The lists are kept separate up to now so that we can distinguish the two
18553 * in regards to matching above-Unicode. A run-time warning is generated
18554 * if a Unicode property is matched against a non-Unicode code point. But,
18555 * we allow user-defined properties to match anything, without any warning,
18556 * and we also suppress the warning if there is a portion of the character
18557 * class that isn't a Unicode property, and which matches above Unicode, \W
18558 * or [\x{110000}] for example.
18559 * (Note that in this case, unlike the Posix one above, there is no
18560 * <upper_latin1_only_utf8_matches>, because having a Unicode property
18561 * forces Unicode semantics */
18565 /* If it matters to the final outcome, see if a non-property
18566 * component of the class matches above Unicode. If so, the
18567 * warning gets suppressed. This is true even if just a single
18568 * such code point is specified, as, though not strictly correct if
18569 * another such code point is matched against, the fact that they
18570 * are using above-Unicode code points indicates they should know
18571 * the issues involved */
18573 warn_super = ! (invert
18574 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18577 _invlist_union(properties, cp_list, &cp_list);
18578 SvREFCNT_dec_NN(properties);
18581 cp_list = properties;
18586 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18588 /* Because an ANYOF node is the only one that warns, this node
18589 * can't be optimized into something else */
18590 optimizable = FALSE;
18594 /* Here, we have calculated what code points should be in the character
18597 * Now we can see about various optimizations. Fold calculation (which we
18598 * did above) needs to take place before inversion. Otherwise /[^k]/i
18599 * would invert to include K, which under /i would match k, which it
18600 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18601 * folded until runtime */
18603 /* If we didn't do folding, it's because some information isn't available
18604 * until runtime; set the run-time fold flag for these We know to set the
18605 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18606 * at least one 0-255 range code point */
18609 /* Some things on the list might be unconditionally included because of
18610 * other components. Remove them, and clean up the list if it goes to
18612 if (only_utf8_locale_list && cp_list) {
18613 _invlist_subtract(only_utf8_locale_list, cp_list,
18614 &only_utf8_locale_list);
18616 if (_invlist_len(only_utf8_locale_list) == 0) {
18617 SvREFCNT_dec_NN(only_utf8_locale_list);
18618 only_utf8_locale_list = NULL;
18621 if ( only_utf8_locale_list
18622 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18623 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18625 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18628 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18630 else if (cp_list && invlist_lowest(cp_list) < 256) {
18631 /* If nothing is below 256, has no locale dependency; otherwise it
18633 anyof_flags |= ANYOFL_FOLD;
18634 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18637 else if ( DEPENDS_SEMANTICS
18638 && ( upper_latin1_only_utf8_matches
18639 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18641 RExC_seen_d_op = TRUE;
18642 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18645 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18649 && ! has_runtime_dependency)
18651 _invlist_invert(cp_list);
18653 /* Clear the invert flag since have just done it here */
18658 *ret_invlist = cp_list;
18663 /* All possible optimizations below still have these characteristics.
18664 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18666 *flagp |= HASWIDTH|SIMPLE;
18668 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18669 RExC_contains_locale = 1;
18672 /* Some character classes are equivalent to other nodes. Such nodes take
18673 * up less room, and some nodes require fewer operations to execute, than
18674 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
18675 * improve efficiency. */
18678 PERL_UINT_FAST8_T i;
18679 UV partial_cp_count = 0;
18680 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18681 UV end[MAX_FOLD_FROMS+1] = { 0 };
18682 bool single_range = FALSE;
18684 if (cp_list) { /* Count the code points in enough ranges that we would
18685 see all the ones possible in any fold in this version
18688 invlist_iterinit(cp_list);
18689 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18690 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
18693 partial_cp_count += end[i] - start[i] + 1;
18697 single_range = TRUE;
18699 invlist_iterfinish(cp_list);
18702 /* If we know at compile time that this matches every possible code
18703 * point, any run-time dependencies don't matter */
18704 if (start[0] == 0 && end[0] == UV_MAX) {
18706 ret = reganode(pRExC_state, OPFAIL, 0);
18709 ret = reg_node(pRExC_state, SANY);
18715 /* Similarly, for /l posix classes, if both a class and its
18716 * complement match, any run-time dependencies don't matter */
18718 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18721 if ( POSIXL_TEST(posixl, namedclass) /* class */
18722 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18725 ret = reganode(pRExC_state, OPFAIL, 0);
18728 ret = reg_node(pRExC_state, SANY);
18735 /* For well-behaved locales, some classes are subsets of others,
18736 * so complementing the subset and including the non-complemented
18737 * superset should match everything, like [\D[:alnum:]], and
18738 * [[:^alpha:][:alnum:]], but some implementations of locales are
18739 * buggy, and khw thinks its a bad idea to have optimization change
18740 * behavior, even if it avoids an OS bug in a given case */
18742 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18744 /* If is a single posix /l class, can optimize to just that op.
18745 * Such a node will not match anything in the Latin1 range, as that
18746 * is not determinable until runtime, but will match whatever the
18747 * class does outside that range. (Note that some classes won't
18748 * match anything outside the range, like [:ascii:]) */
18749 if ( isSINGLE_BIT_SET(posixl)
18750 && (partial_cp_count == 0 || start[0] > 255))
18753 SV * class_above_latin1 = NULL;
18754 bool already_inverted;
18755 bool are_equivalent;
18757 /* Compute which bit is set, which is the same thing as, e.g.,
18758 * ANYOF_CNTRL. From
18759 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18761 static const int MultiplyDeBruijnBitPosition2[32] =
18763 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18764 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18767 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18768 * 0x077CB531U) >> 27];
18769 classnum = namedclass_to_classnum(namedclass);
18771 /* The named classes are such that the inverted number is one
18772 * larger than the non-inverted one */
18773 already_inverted = namedclass
18774 - classnum_to_namedclass(classnum);
18776 /* Create an inversion list of the official property, inverted
18777 * if the constructed node list is inverted, and restricted to
18778 * only the above latin1 code points, which are the only ones
18779 * known at compile time */
18780 _invlist_intersection_maybe_complement_2nd(
18782 PL_XPosix_ptrs[classnum],
18784 &class_above_latin1);
18785 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18787 SvREFCNT_dec_NN(class_above_latin1);
18789 if (are_equivalent) {
18791 /* Resolve the run-time inversion flag with this possibly
18792 * inverted class */
18793 invert = invert ^ already_inverted;
18795 ret = reg_node(pRExC_state,
18796 POSIXL + invert * (NPOSIXL - POSIXL));
18797 FLAGS(REGNODE_p(ret)) = classnum;
18803 /* khw can't think of any other possible transformation involving
18805 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18809 if (! has_runtime_dependency) {
18811 /* If the list is empty, nothing matches. This happens, for
18812 * example, when a Unicode property that doesn't match anything is
18813 * the only element in the character class (perluniprops.pod notes
18814 * such properties). */
18815 if (partial_cp_count == 0) {
18817 ret = reg_node(pRExC_state, SANY);
18820 ret = reganode(pRExC_state, OPFAIL, 0);
18826 /* If matches everything but \n */
18827 if ( start[0] == 0 && end[0] == '\n' - 1
18828 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18831 ret = reg_node(pRExC_state, REG_ANY);
18837 /* Next see if can optimize classes that contain just a few code points
18838 * into an EXACTish node. The reason to do this is to let the
18839 * optimizer join this node with adjacent EXACTish ones, and ANYOF
18840 * nodes require conversion to code point from UTF-8.
18842 * An EXACTFish node can be generated even if not under /i, and vice
18843 * versa. But care must be taken. An EXACTFish node has to be such
18844 * that it only matches precisely the code points in the class, but we
18845 * want to generate the least restrictive one that does that, to
18846 * increase the odds of being able to join with an adjacent node. For
18847 * example, if the class contains [kK], we have to make it an EXACTFAA
18848 * node to prevent the KELVIN SIGN from matching. Whether we are under
18849 * /i or not is irrelevant in this case. Less obvious is the pattern
18850 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
18851 * supposed to match the single character U+0149 LATIN SMALL LETTER N
18852 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
18853 * that includes \X{02BC}, there is a multi-char fold that does, and so
18854 * the node generated for it must be an EXACTFish one. On the other
18855 * hand qr/:/i should generate a plain EXACT node since the colon
18856 * participates in no fold whatsoever, and having it EXACT tells the
18857 * optimizer the target string cannot match unless it has a colon in
18863 /* Only try if there are no more code points in the class than
18864 * in the max possible fold */
18865 && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1)
18867 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18869 /* We can always make a single code point class into an
18870 * EXACTish node. */
18874 /* Here is /l: Use EXACTL, except if there is a fold not
18875 * known until runtime so shows as only a single code point
18876 * here. For code points above 255, we know which can
18877 * cause problems by having a potential fold to the Latin1
18880 || ( start[0] > 255
18881 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
18889 else if (! FOLD) { /* Not /l and not /i */
18890 op = (start[0] < 256) ? EXACT : EXACT_REQ8;
18892 else if (start[0] < 256) { /* /i, not /l, and the code point is
18895 /* Under /i, it gets a little tricky. A code point that
18896 * doesn't participate in a fold should be an EXACT node.
18897 * We know this one isn't the result of a simple fold, or
18898 * there'd be more than one code point in the list, but it
18899 * could be part of a multi- character fold. In that case
18900 * we better not create an EXACT node, as we would wrongly
18901 * be telling the optimizer that this code point must be in
18902 * the target string, and that is wrong. This is because
18903 * if the sequence around this code point forms a
18904 * multi-char fold, what needs to be in the string could be
18905 * the code point that folds to the sequence.
18907 * This handles the case of below-255 code points, as we
18908 * have an easy look up for those. The next clause handles
18909 * the above-256 one */
18910 op = IS_IN_SOME_FOLD_L1(start[0])
18914 else { /* /i, larger code point. Since we are under /i, and
18915 have just this code point, we know that it can't
18916 fold to something else, so PL_InMultiCharFold
18918 op = _invlist_contains_cp(PL_InMultiCharFold,
18926 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18927 && _invlist_contains_cp(PL_in_some_fold, start[0]))
18929 /* Here, the only runtime dependency, if any, is from /d, and
18930 * the class matches more than one code point, and the lowest
18931 * code point participates in some fold. It might be that the
18932 * other code points are /i equivalent to this one, and hence
18933 * they would representable by an EXACTFish node. Above, we
18934 * eliminated classes that contain too many code points to be
18935 * EXACTFish, with the test for MAX_FOLD_FROMS
18937 * First, special case the ASCII fold pairs, like 'B' and 'b'.
18938 * We do this because we have EXACTFAA at our disposal for the
18940 if (partial_cp_count == 2 && isASCII(start[0])) {
18942 /* The only ASCII characters that participate in folds are
18944 assert(isALPHA(start[0]));
18945 if ( end[0] == start[0] /* First range is a single
18946 character, so 2nd exists */
18947 && isALPHA_FOLD_EQ(start[0], start[1]))
18950 /* Here, is part of an ASCII fold pair */
18952 if ( ASCII_FOLD_RESTRICTED
18953 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18955 /* If the second clause just above was true, it
18956 * means we can't be under /i, or else the list
18957 * would have included more than this fold pair.
18958 * Therefore we have to exclude the possibility of
18959 * whatever else it is that folds to these, by
18960 * using EXACTFAA */
18963 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18965 /* Here, there's no simple fold that start[0] is part
18966 * of, but there is a multi-character one. If we
18967 * are not under /i, we want to exclude that
18968 * possibility; if under /i, we want to include it
18970 op = (FOLD) ? EXACTFU : EXACTFAA;
18974 /* Here, the only possible fold start[0] particpates in
18975 * is with start[1]. /i or not isn't relevant */
18979 value = toFOLD(start[0]);
18982 else if ( ! upper_latin1_only_utf8_matches
18983 || ( _invlist_len(upper_latin1_only_utf8_matches)
18986 invlist_highest(upper_latin1_only_utf8_matches)]
18989 /* Here, the smallest character is non-ascii or there are
18990 * more than 2 code points matched by this node. Also, we
18991 * either don't have /d UTF-8 dependent matches, or if we
18992 * do, they look like they could be a single character that
18993 * is the fold of the lowest one in the always-match list.
18994 * This test quickly excludes most of the false positives
18995 * when there are /d UTF-8 depdendent matches. These are
18996 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18997 * SMALL LETTER A WITH GRAVE iff the target string is
18998 * UTF-8. (We don't have to worry above about exceeding
18999 * the array bounds of PL_fold_latin1[] because any code
19000 * point in 'upper_latin1_only_utf8_matches' is below 256.)
19002 * EXACTFAA would apply only to pairs (hence exactly 2 code
19003 * points) in the ASCII range, so we can't use it here to
19004 * artificially restrict the fold domain, so we check if
19005 * the class does or does not match some EXACTFish node.
19006 * Further, if we aren't under /i, and and the folded-to
19007 * character is part of a multi-character fold, we can't do
19008 * this optimization, as the sequence around it could be
19009 * that multi-character fold, and we don't here know the
19010 * context, so we have to assume it is that multi-char
19011 * fold, to prevent potential bugs.
19013 * To do the general case, we first find the fold of the
19014 * lowest code point (which may be higher than the lowest
19015 * one), then find everything that folds to it. (The data
19016 * structure we have only maps from the folded code points,
19017 * so we have to do the earlier step.) */
19020 U8 foldbuf[UTF8_MAXBYTES_CASE];
19021 UV folded = _to_uni_fold_flags(start[0],
19022 foldbuf, &foldlen, 0);
19023 unsigned int first_fold;
19024 const unsigned int * remaining_folds;
19025 Size_t folds_to_this_cp_count = _inverse_folds(
19029 Size_t folds_count = folds_to_this_cp_count + 1;
19030 SV * fold_list = _new_invlist(folds_count);
19033 /* If there are UTF-8 dependent matches, create a temporary
19034 * list of what this node matches, including them. */
19035 SV * all_cp_list = NULL;
19036 SV ** use_this_list = &cp_list;
19038 if (upper_latin1_only_utf8_matches) {
19039 all_cp_list = _new_invlist(0);
19040 use_this_list = &all_cp_list;
19041 _invlist_union(cp_list,
19042 upper_latin1_only_utf8_matches,
19046 /* Having gotten everything that participates in the fold
19047 * containing the lowest code point, we turn that into an
19048 * inversion list, making sure everything is included. */
19049 fold_list = add_cp_to_invlist(fold_list, start[0]);
19050 fold_list = add_cp_to_invlist(fold_list, folded);
19051 if (folds_to_this_cp_count > 0) {
19052 fold_list = add_cp_to_invlist(fold_list, first_fold);
19053 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19054 fold_list = add_cp_to_invlist(fold_list,
19055 remaining_folds[i]);
19059 /* If the fold list is identical to what's in this ANYOF
19060 * node, the node can be represented by an EXACTFish one
19062 if (_invlistEQ(*use_this_list, fold_list,
19063 0 /* Don't complement */ )
19066 /* But, we have to be careful, as mentioned above.
19067 * Just the right sequence of characters could match
19068 * this if it is part of a multi-character fold. That
19069 * IS what we want if we are under /i. But it ISN'T
19070 * what we want if not under /i, as it could match when
19071 * it shouldn't. So, when we aren't under /i and this
19072 * character participates in a multi-char fold, we
19073 * don't optimize into an EXACTFish node. So, for each
19074 * case below we have to check if we are folding
19075 * and if not, if it is not part of a multi-char fold.
19077 if (start[0] > 255) { /* Highish code point */
19078 if (FOLD || ! _invlist_contains_cp(
19079 PL_InMultiCharFold, folded))
19083 : (ASCII_FOLD_RESTRICTED)
19088 } /* Below, the lowest code point < 256 */
19091 && DEPENDS_SEMANTICS)
19092 { /* An EXACTF node containing a single character
19093 's', can be an EXACTFU if it doesn't get
19094 joined with an adjacent 's' */
19095 op = EXACTFU_S_EDGE;
19099 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19101 if (upper_latin1_only_utf8_matches) {
19104 /* We can't use the fold, as that only matches
19108 else if ( UNLIKELY(start[0] == MICRO_SIGN)
19110 { /* EXACTFUP is a special node for this
19112 op = (ASCII_FOLD_RESTRICTED)
19115 value = MICRO_SIGN;
19117 else if ( ASCII_FOLD_RESTRICTED
19118 && ! isASCII(start[0]))
19119 { /* For ASCII under /iaa, we can use EXACTFU
19131 SvREFCNT_dec_NN(fold_list);
19132 SvREFCNT_dec(all_cp_list);
19139 /* Here, we have calculated what EXACTish node to use. Have to
19140 * convert to UTF-8 if not already there */
19143 SvREFCNT_dec(cp_list);;
19144 REQUIRE_UTF8(flagp);
19147 /* This is a kludge to the special casing issues with this
19148 * ligature under /aa. FB05 should fold to FB06, but the
19149 * call above to _to_uni_fold_flags() didn't find this, as
19150 * it didn't use the /aa restriction in order to not miss
19151 * other folds that would be affected. This is the only
19152 * instance likely to ever be a problem in all of Unicode.
19153 * So special case it. */
19154 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
19155 && ASCII_FOLD_RESTRICTED)
19157 value = LATIN_SMALL_LIGATURE_ST;
19161 len = (UTF) ? UVCHR_SKIP(value) : 1;
19163 ret = regnode_guts(pRExC_state, op, len, "exact");
19164 FILL_NODE(ret, op);
19165 RExC_emit += 1 + STR_SZ(len);
19166 setSTR_LEN(REGNODE_p(ret), len);
19168 *STRINGs(REGNODE_p(ret)) = (U8) value;
19171 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19177 if (! has_runtime_dependency) {
19179 /* See if this can be turned into an ANYOFM node. Think about the
19180 * bit patterns in two different bytes. In some positions, the
19181 * bits in each will be 1; and in other positions both will be 0;
19182 * and in some positions the bit will be 1 in one byte, and 0 in
19183 * the other. Let 'n' be the number of positions where the bits
19184 * differ. We create a mask which has exactly 'n' 0 bits, each in
19185 * a position where the two bytes differ. Now take the set of all
19186 * bytes that when ANDed with the mask yield the same result. That
19187 * set has 2**n elements, and is representable by just two 8 bit
19188 * numbers: the result and the mask. Importantly, matching the set
19189 * can be vectorized by creating a word full of the result bytes,
19190 * and a word full of the mask bytes, yielding a significant speed
19191 * up. Here, see if this node matches such a set. As a concrete
19192 * example consider [01], and the byte representing '0' which is
19193 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
19194 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
19195 * 0x30. Any other bytes ANDed yield something else. So [01],
19196 * which is a common usage, is optimizable into ANYOFM, and can
19197 * benefit from the speed up. We can only do this on UTF-8
19198 * invariant bytes, because they have the same bit patterns under
19200 PERL_UINT_FAST8_T inverted = 0;
19202 const PERL_UINT_FAST8_T max_permissible = 0xFF;
19204 const PERL_UINT_FAST8_T max_permissible = 0x7F;
19206 /* If doesn't fit the criteria for ANYOFM, invert and try again.
19207 * If that works we will instead later generate an NANYOFM, and
19208 * invert back when through */
19209 if (invlist_highest(cp_list) > max_permissible) {
19210 _invlist_invert(cp_list);
19214 if (invlist_highest(cp_list) <= max_permissible) {
19215 UV this_start, this_end;
19216 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
19217 U8 bits_differing = 0;
19218 Size_t full_cp_count = 0;
19219 bool first_time = TRUE;
19221 /* Go through the bytes and find the bit positions that differ
19223 invlist_iterinit(cp_list);
19224 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19225 unsigned int i = this_start;
19228 if (! UVCHR_IS_INVARIANT(i)) {
19232 first_time = FALSE;
19233 lowest_cp = this_start;
19235 /* We have set up the code point to compare with.
19236 * Don't compare it with itself */
19240 /* Find the bit positions that differ from the lowest code
19241 * point in the node. Keep track of all such positions by
19243 for (; i <= this_end; i++) {
19244 if (! UVCHR_IS_INVARIANT(i)) {
19248 bits_differing |= i ^ lowest_cp;
19251 full_cp_count += this_end - this_start + 1;
19254 /* At the end of the loop, we count how many bits differ from
19255 * the bits in lowest code point, call the count 'd'. If the
19256 * set we found contains 2**d elements, it is the closure of
19257 * all code points that differ only in those bit positions. To
19258 * convince yourself of that, first note that the number in the
19259 * closure must be a power of 2, which we test for. The only
19260 * way we could have that count and it be some differing set,
19261 * is if we got some code points that don't differ from the
19262 * lowest code point in any position, but do differ from each
19263 * other in some other position. That means one code point has
19264 * a 1 in that position, and another has a 0. But that would
19265 * mean that one of them differs from the lowest code point in
19266 * that position, which possibility we've already excluded. */
19267 if ( (inverted || full_cp_count > 1)
19268 && full_cp_count == 1U << PL_bitcount[bits_differing])
19272 op = ANYOFM + inverted;;
19274 /* We need to make the bits that differ be 0's */
19275 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19277 /* The argument is the lowest code point */
19278 ret = reganode(pRExC_state, op, lowest_cp);
19279 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19283 invlist_iterfinish(cp_list);
19287 _invlist_invert(cp_list);
19294 /* XXX We could create an ANYOFR_LOW node here if we saved above if
19295 * all were invariants, it wasn't inverted, and there is a single
19296 * range. This would be faster than some of the posix nodes we
19297 * create below like /\d/a, but would be twice the size. Without
19298 * having actually measured the gain, khw doesn't think the
19299 * tradeoff is really worth it */
19302 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19303 PERL_UINT_FAST8_T type;
19304 SV * intersection = NULL;
19305 SV* d_invlist = NULL;
19307 /* See if this matches any of the POSIX classes. The POSIXA and
19308 * POSIXD ones are about the same speed as ANYOF ops, but take less
19309 * room; the ones that have above-Latin1 code point matches are
19310 * somewhat faster than ANYOF. */
19312 for (type = POSIXA; type >= POSIXD; type--) {
19315 if (type == POSIXL) { /* But not /l posix classes */
19319 for (posix_class = 0;
19320 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19323 SV** our_code_points = &cp_list;
19324 SV** official_code_points;
19327 if (type == POSIXA) {
19328 official_code_points = &PL_Posix_ptrs[posix_class];
19331 official_code_points = &PL_XPosix_ptrs[posix_class];
19334 /* Skip non-existent classes of this type. e.g. \v only
19335 * has an entry in PL_XPosix_ptrs */
19336 if (! *official_code_points) {
19340 /* Try both the regular class, and its inversion */
19341 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19342 bool this_inverted = invert ^ try_inverted;
19344 if (type != POSIXD) {
19346 /* This class that isn't /d can't match if we have
19347 * /d dependencies */
19348 if (has_runtime_dependency
19349 & HAS_D_RUNTIME_DEPENDENCY)
19354 else /* is /d */ if (! this_inverted) {
19356 /* /d classes don't match anything non-ASCII below
19357 * 256 unconditionally (which cp_list contains) */
19358 _invlist_intersection(cp_list, PL_UpperLatin1,
19360 if (_invlist_len(intersection) != 0) {
19364 SvREFCNT_dec(d_invlist);
19365 d_invlist = invlist_clone(cp_list, NULL);
19367 /* But under UTF-8 it turns into using /u rules.
19368 * Add the things it matches under these conditions
19369 * so that we check below that these are identical
19370 * to what the tested class should match */
19371 if (upper_latin1_only_utf8_matches) {
19374 upper_latin1_only_utf8_matches,
19377 our_code_points = &d_invlist;
19379 else { /* POSIXD, inverted. If this doesn't have this
19380 flag set, it isn't /d. */
19381 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19385 our_code_points = &cp_list;
19388 /* Here, have weeded out some things. We want to see
19389 * if the list of characters this node contains
19390 * ('*our_code_points') precisely matches those of the
19391 * class we are currently checking against
19392 * ('*official_code_points'). */
19393 if (_invlistEQ(*our_code_points,
19394 *official_code_points,
19397 /* Here, they precisely match. Optimize this ANYOF
19398 * node into its equivalent POSIX one of the
19399 * correct type, possibly inverted */
19400 ret = reg_node(pRExC_state, (try_inverted)
19404 FLAGS(REGNODE_p(ret)) = posix_class;
19405 SvREFCNT_dec(d_invlist);
19406 SvREFCNT_dec(intersection);
19412 SvREFCNT_dec(d_invlist);
19413 SvREFCNT_dec(intersection);
19416 /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19417 * both in size and speed. Currently, a 20 bit range base (smallest
19418 * code point in the range), and a 12 bit maximum delta are packed into
19419 * a 32 bit word. This allows for using it on all of the Unicode code
19420 * points except for the highest plane, which is only for private use
19421 * code points. khw doubts that a bigger delta is likely in real world
19424 && ! has_runtime_dependency
19425 && anyof_flags == 0
19426 && start[0] < (1 << ANYOFR_BASE_BITS)
19427 && end[0] - start[0]
19428 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19429 * CHARBITS - ANYOFR_BASE_BITS))))
19432 U8 low_utf8[UTF8_MAXBYTES+1];
19433 U8 high_utf8[UTF8_MAXBYTES+1];
19435 ret = reganode(pRExC_state, ANYOFR,
19436 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19438 /* Place the lowest UTF-8 start byte in the flags field, so as to
19439 * allow efficient ruling out at run time of many possible inputs.
19441 (void) uvchr_to_utf8(low_utf8, start[0]);
19442 (void) uvchr_to_utf8(high_utf8, end[0]);
19444 /* If all code points share the same first byte, this can be an
19445 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
19446 * quickly rule out many inputs at run-time without having to
19447 * compute the code point from UTF-8. For EBCDIC, we use I8, as
19448 * not doing that transformation would not rule out nearly so many
19450 if (low_utf8[0] == high_utf8[0]) {
19451 OP(REGNODE_p(ret)) = ANYOFRb;
19452 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19455 ANYOF_FLAGS(REGNODE_p(ret))
19456 = NATIVE_UTF8_TO_I8(low_utf8[0]);
19462 /* If didn't find an optimization and there is no need for a bitmap,
19463 * optimize to indicate that */
19464 if ( start[0] >= NUM_ANYOF_CODE_POINTS
19466 && ! upper_latin1_only_utf8_matches
19467 && anyof_flags == 0)
19469 U8 low_utf8[UTF8_MAXBYTES+1];
19470 UV highest_cp = invlist_highest(cp_list);
19472 /* Currently the maximum allowed code point by the system is
19473 * IV_MAX. Higher ones are reserved for future internal use. This
19474 * particular regnode can be used for higher ones, but we can't
19475 * calculate the code point of those. IV_MAX suffices though, as
19476 * it will be a large first byte */
19477 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19480 /* We store the lowest possible first byte of the UTF-8
19481 * representation, using the flags field. This allows for quick
19482 * ruling out of some inputs without having to convert from UTF-8
19483 * to code point. For EBCDIC, we use I8, as not doing that
19484 * transformation would not rule out nearly so many things */
19485 anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19489 /* If the first UTF-8 start byte for the highest code point in the
19490 * range is suitably small, we may be able to get an upper bound as
19492 if (highest_cp <= IV_MAX) {
19493 U8 high_utf8[UTF8_MAXBYTES+1];
19494 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19497 /* If the lowest and highest are the same, we can get an exact
19498 * first byte instead of a just minimum or even a sequence of
19499 * exact leading bytes. We signal these with different
19501 if (low_utf8[0] == high_utf8[0]) {
19502 Size_t len = find_first_differing_byte_pos(low_utf8,
19504 MIN(low_len, high_len));
19508 /* No need to convert to I8 for EBCDIC as this is an
19510 anyof_flags = low_utf8[0];
19515 ret = regnode_guts(pRExC_state, op,
19516 regarglen[op] + STR_SZ(len),
19518 FILL_NODE(ret, op);
19519 RExC_emit += 1 + regarglen[op]
19520 - 1 + STR_SZ(len); /* Replace the [1]
19521 element of the struct
19522 by the real value */
19523 REGNODE_p(ret)->flags = len;
19524 Copy(low_utf8, /* Add the common bytes */
19525 ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19527 NEXT_OFF(REGNODE_p(ret)) = regarglen[op] + STR_SZ(len);
19528 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19529 NULL, only_utf8_locale_list);
19533 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19536 /* Here, the high byte is not the same as the low, but is
19537 * small enough that its reasonable to have a loose upper
19538 * bound, which is packed in with the strict lower bound.
19539 * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19540 * On EBCDIC platforms, I8 is used. On ASCII platforms I8
19541 * is the same thing as UTF-8 */
19544 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19545 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19548 if (range_diff <= max_range_diff / 8) {
19551 else if (range_diff <= max_range_diff / 4) {
19554 else if (range_diff <= max_range_diff / 2) {
19557 anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19562 goto done_finding_op;
19564 } /* End of seeing if can optimize it into a different node */
19566 is_anyof: /* It's going to be an ANYOF node. */
19567 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19577 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19578 FILL_NODE(ret, op); /* We set the argument later */
19579 RExC_emit += 1 + regarglen[op];
19580 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19582 /* Here, <cp_list> contains all the code points we can determine at
19583 * compile time that match under all conditions. Go through it, and
19584 * for things that belong in the bitmap, put them there, and delete from
19585 * <cp_list>. While we are at it, see if everything above 255 is in the
19586 * list, and if so, set a flag to speed up execution */
19588 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19591 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19595 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19598 /* Here, the bitmap has been populated with all the Latin1 code points that
19599 * always match. Can now add to the overall list those that match only
19600 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19602 if (upper_latin1_only_utf8_matches) {
19604 _invlist_union(cp_list,
19605 upper_latin1_only_utf8_matches,
19607 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19610 cp_list = upper_latin1_only_utf8_matches;
19612 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19615 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19616 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19619 only_utf8_locale_list);
19620 SvREFCNT_dec(cp_list);;
19621 SvREFCNT_dec(only_utf8_locale_list);
19626 /* Here, the node is getting optimized into something that's not an ANYOF
19627 * one. Finish up. */
19629 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19630 RExC_parse - orig_parse);;
19631 SvREFCNT_dec(cp_list);;
19632 SvREFCNT_dec(only_utf8_locale_list);
19636 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19639 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19640 regnode* const node,
19642 SV* const runtime_defns,
19643 SV* const only_utf8_locale_list)
19645 /* Sets the arg field of an ANYOF-type node 'node', using information about
19646 * the node passed-in. If there is nothing outside the node's bitmap, the
19647 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
19648 * the count returned by add_data(), having allocated and stored an array,
19651 * av[0] stores the inversion list defining this class as far as known at
19652 * this time, or PL_sv_undef if nothing definite is now known.
19653 * av[1] stores the inversion list of code points that match only if the
19654 * current locale is UTF-8, or if none, PL_sv_undef if there is an
19655 * av[2], or no entry otherwise.
19656 * av[2] stores the list of user-defined properties whose subroutine
19657 * definitions aren't known at this time, or no entry if none. */
19661 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19663 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19664 assert(! (ANYOF_FLAGS(node)
19665 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19666 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19669 AV * const av = newAV();
19673 av_store(av, INVLIST_INDEX, SvREFCNT_inc(cp_list));
19676 if (only_utf8_locale_list) {
19677 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
19678 SvREFCNT_inc(only_utf8_locale_list));
19681 if (runtime_defns) {
19682 av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
19685 rv = newRV_noinc(MUTABLE_SV(av));
19686 n = add_data(pRExC_state, STR_WITH_LEN("s"));
19687 RExC_rxi->data->data[n] = (void*)rv;
19692 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19694 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19695 const regnode* node,
19698 SV** only_utf8_locale_ptr,
19699 SV** output_invlist)
19702 /* For internal core use only.
19703 * Returns the inversion list for the input 'node' in the regex 'prog'.
19704 * If <doinit> is 'true', will attempt to create the inversion list if not
19706 * If <listsvp> is non-null, will return the printable contents of the
19707 * property definition. This can be used to get debugging information
19708 * even before the inversion list exists, by calling this function with
19709 * 'doinit' set to false, in which case the components that will be used
19710 * to eventually create the inversion list are returned (in a printable
19712 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19713 * store an inversion list of code points that should match only if the
19714 * execution-time locale is a UTF-8 one.
19715 * If <output_invlist> is not NULL, it is where this routine is to store an
19716 * inversion list of the code points that would be instead returned in
19717 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
19718 * when this parameter is used, is just the non-code point data that
19719 * will go into creating the inversion list. This currently should be just
19720 * user-defined properties whose definitions were not known at compile
19721 * time. Using this parameter allows for easier manipulation of the
19722 * inversion list's data by the caller. It is illegal to call this
19723 * function with this parameter set, but not <listsvp>
19725 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
19726 * that, in spite of this function's name, the inversion list it returns
19727 * may include the bitmap data as well */
19729 SV *si = NULL; /* Input initialization string */
19730 SV* invlist = NULL;
19732 RXi_GET_DECL(prog, progi);
19733 const struct reg_data * const data = prog ? progi->data : NULL;
19735 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19736 assert(! output_invlist || listsvp);
19738 if (data && data->count) {
19739 const U32 n = ARG(node);
19741 if (data->what[n] == 's') {
19742 SV * const rv = MUTABLE_SV(data->data[n]);
19743 AV * const av = MUTABLE_AV(SvRV(rv));
19744 SV **const ary = AvARRAY(av);
19746 invlist = ary[INVLIST_INDEX];
19748 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
19749 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
19752 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
19753 si = ary[DEFERRED_USER_DEFINED_INDEX];
19756 if (doinit && (si || invlist)) {
19759 SV * msg = newSVpvs_flags("", SVs_TEMP);
19761 SV * prop_definition = handle_user_defined_property(
19762 "", 0, FALSE, /* There is no \p{}, \P{} */
19763 SvPVX_const(si)[1] - '0', /* /i or not has been
19764 stored here for just
19766 TRUE, /* run time */
19767 FALSE, /* This call must find the defn */
19768 si, /* The property definition */
19771 0 /* base level call */
19775 assert(prop_definition == NULL);
19777 Perl_croak(aTHX_ "%" UTF8f,
19778 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
19782 _invlist_union(invlist, prop_definition, &invlist);
19783 SvREFCNT_dec_NN(prop_definition);
19786 invlist = prop_definition;
19789 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
19790 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
19792 av_store(av, INVLIST_INDEX, invlist);
19793 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
19794 ? ONLY_LOCALE_MATCHES_INDEX:
19802 /* If requested, return a printable version of what this ANYOF node matches
19805 SV* matches_string = NULL;
19807 /* This function can be called at compile-time, before everything gets
19808 * resolved, in which case we return the currently best available
19809 * information, which is the string that will eventually be used to do
19810 * that resolving, 'si' */
19812 /* Here, we only have 'si' (and possibly some passed-in data in
19813 * 'invlist', which is handled below) If the caller only wants
19814 * 'si', use that. */
19815 if (! output_invlist) {
19816 matches_string = newSVsv(si);
19819 /* But if the caller wants an inversion list of the node, we
19820 * need to parse 'si' and place as much as possible in the
19821 * desired output inversion list, making 'matches_string' only
19822 * contain the currently unresolvable things */
19823 const char *si_string = SvPVX(si);
19824 STRLEN remaining = SvCUR(si);
19828 /* Ignore everything before the first new-line */
19829 while (*si_string != '\n' && remaining > 0) {
19833 assert(remaining > 0);
19838 while (remaining > 0) {
19840 /* The data consists of just strings defining user-defined
19841 * property names, but in prior incarnations, and perhaps
19842 * somehow from pluggable regex engines, it could still
19843 * hold hex code point definitions. Each component of a
19844 * range would be separated by a tab, and each range by a
19845 * new-line. If these are found, instead add them to the
19846 * inversion list */
19847 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
19848 |PERL_SCAN_SILENT_NON_PORTABLE;
19849 STRLEN len = remaining;
19850 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19852 /* If the hex decode routine found something, it should go
19853 * up to the next \n */
19854 if ( *(si_string + len) == '\n') {
19855 if (count) { /* 2nd code point on line */
19856 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19859 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19862 goto prepare_for_next_iteration;
19865 /* If the hex decode was instead for the lower range limit,
19866 * save it, and go parse the upper range limit */
19867 if (*(si_string + len) == '\t') {
19868 assert(count == 0);
19872 prepare_for_next_iteration:
19873 si_string += len + 1;
19874 remaining -= len + 1;
19878 /* Here, didn't find a legal hex number. Just add it from
19879 * here to the next \n */
19882 while (*(si_string + len) != '\n' && remaining > 0) {
19886 if (*(si_string + len) == '\n') {
19890 if (matches_string) {
19891 sv_catpvn(matches_string, si_string, len - 1);
19894 matches_string = newSVpvn(si_string, len - 1);
19897 sv_catpvs(matches_string, " ");
19898 } /* end of loop through the text */
19900 assert(matches_string);
19901 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
19902 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19904 } /* end of has an 'si' */
19907 /* Add the stuff that's already known */
19910 /* Again, if the caller doesn't want the output inversion list, put
19911 * everything in 'matches-string' */
19912 if (! output_invlist) {
19913 if ( ! matches_string) {
19914 matches_string = newSVpvs("\n");
19916 sv_catsv(matches_string, invlist_contents(invlist,
19917 TRUE /* traditional style */
19920 else if (! *output_invlist) {
19921 *output_invlist = invlist_clone(invlist, NULL);
19924 _invlist_union(*output_invlist, invlist, output_invlist);
19928 *listsvp = matches_string;
19933 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19935 /* reg_skipcomment()
19937 Absorbs an /x style # comment from the input stream,
19938 returning a pointer to the first character beyond the comment, or if the
19939 comment terminates the pattern without anything following it, this returns
19940 one past the final character of the pattern (in other words, RExC_end) and
19941 sets the REG_RUN_ON_COMMENT_SEEN flag.
19943 Note it's the callers responsibility to ensure that we are
19944 actually in /x mode
19948 PERL_STATIC_INLINE char*
19949 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19951 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19955 while (p < RExC_end) {
19956 if (*(++p) == '\n') {
19961 /* we ran off the end of the pattern without ending the comment, so we have
19962 * to add an \n when wrapping */
19963 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19968 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19970 const bool force_to_xmod
19973 /* If the text at the current parse position '*p' is a '(?#...)' comment,
19974 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19975 * is /x whitespace, advance '*p' so that on exit it points to the first
19976 * byte past all such white space and comments */
19978 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19980 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19982 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19985 if (RExC_end - (*p) >= 3
19987 && *(*p + 1) == '?'
19988 && *(*p + 2) == '#')
19990 while (*(*p) != ')') {
19991 if ((*p) == RExC_end)
19992 FAIL("Sequence (?#... not terminated");
20000 const char * save_p = *p;
20001 while ((*p) < RExC_end) {
20003 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20006 else if (*(*p) == '#') {
20007 (*p) = reg_skipcomment(pRExC_state, (*p));
20013 if (*p != save_p) {
20026 Advances the parse position by one byte, unless that byte is the beginning
20027 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
20028 those two cases, the parse position is advanced beyond all such comments and
20031 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20035 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20037 PERL_ARGS_ASSERT_NEXTCHAR;
20039 if (RExC_parse < RExC_end) {
20041 || UTF8_IS_INVARIANT(*RExC_parse)
20042 || UTF8_IS_START(*RExC_parse));
20044 RExC_parse += (UTF)
20045 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20048 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20049 FALSE /* Don't force /x */ );
20054 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20056 /* 'size' is the delta number of smallest regnode equivalents to add or
20057 * subtract from the current memory allocated to the regex engine being
20060 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20065 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20066 /* +1 for REG_MAGIC */
20069 if ( RExC_rxi == NULL )
20070 FAIL("Regexp out of space");
20071 RXi_SET(RExC_rx, RExC_rxi);
20073 RExC_emit_start = RExC_rxi->program;
20075 Zero(REGNODE_p(RExC_emit), size, regnode);
20078 #ifdef RE_TRACK_PATTERN_OFFSETS
20079 Renew(RExC_offsets, 2*RExC_size+1, U32);
20081 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20083 RExC_offsets[0] = RExC_size;
20087 STATIC regnode_offset
20088 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20090 /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20091 * equivalents space. It aligns and increments RExC_size
20093 * It returns the regnode's offset into the regex engine program */
20095 const regnode_offset ret = RExC_emit;
20097 GET_RE_DEBUG_FLAGS_DECL;
20099 PERL_ARGS_ASSERT_REGNODE_GUTS;
20101 SIZE_ALIGN(RExC_size);
20102 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20103 NODE_ALIGN_FILL(REGNODE_p(ret));
20104 #ifndef RE_TRACK_PATTERN_OFFSETS
20105 PERL_UNUSED_ARG(name);
20106 PERL_UNUSED_ARG(op);
20108 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20110 if (RExC_offsets) { /* MJD */
20112 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20115 (UV)(RExC_emit) > RExC_offsets[0]
20116 ? "Overwriting end of array!\n" : "OK",
20118 (UV)(RExC_parse - RExC_start),
20119 (UV)RExC_offsets[0]));
20120 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20127 - reg_node - emit a node
20129 STATIC regnode_offset /* Location. */
20130 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20132 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20133 regnode_offset ptr = ret;
20135 PERL_ARGS_ASSERT_REG_NODE;
20137 assert(regarglen[op] == 0);
20139 FILL_ADVANCE_NODE(ptr, op);
20145 - reganode - emit a node with an argument
20147 STATIC regnode_offset /* Location. */
20148 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20150 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20151 regnode_offset ptr = ret;
20153 PERL_ARGS_ASSERT_REGANODE;
20155 /* ANYOF are special cased to allow non-length 1 args */
20156 assert(regarglen[op] == 1);
20158 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20163 STATIC regnode_offset
20164 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20166 /* emit a node with U32 and I32 arguments */
20168 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20169 regnode_offset ptr = ret;
20171 PERL_ARGS_ASSERT_REG2LANODE;
20173 assert(regarglen[op] == 2);
20175 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20181 - reginsert - insert an operator in front of already-emitted operand
20183 * That means that on exit 'operand' is the offset of the newly inserted
20184 * operator, and the original operand has been relocated.
20186 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20187 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20189 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20190 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20192 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20195 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20196 const regnode_offset operand, const U32 depth)
20201 const int offset = regarglen[(U8)op];
20202 const int size = NODE_STEP_REGNODE + offset;
20203 GET_RE_DEBUG_FLAGS_DECL;
20205 PERL_ARGS_ASSERT_REGINSERT;
20206 PERL_UNUSED_CONTEXT;
20207 PERL_UNUSED_ARG(depth);
20208 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20209 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20210 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20211 studying. If this is wrong then we need to adjust RExC_recurse
20212 below like we do with RExC_open_parens/RExC_close_parens. */
20213 change_engine_size(pRExC_state, (Ptrdiff_t) size);
20214 src = REGNODE_p(RExC_emit);
20216 dst = REGNODE_p(RExC_emit);
20218 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20219 * and [perl #133871] shows this can lead to problems, so skip this
20220 * realignment of parens until a later pass when they are reliable */
20221 if (! IN_PARENS_PASS && RExC_open_parens) {
20223 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20224 /* remember that RExC_npar is rex->nparens + 1,
20225 * iow it is 1 more than the number of parens seen in
20226 * the pattern so far. */
20227 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20228 /* note, RExC_open_parens[0] is the start of the
20229 * regex, it can't move. RExC_close_parens[0] is the end
20230 * of the regex, it *can* move. */
20231 if ( paren && RExC_open_parens[paren] >= operand ) {
20232 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20233 RExC_open_parens[paren] += size;
20235 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20237 if ( RExC_close_parens[paren] >= operand ) {
20238 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20239 RExC_close_parens[paren] += size;
20241 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20246 RExC_end_op += size;
20248 while (src > REGNODE_p(operand)) {
20249 StructCopy(--src, --dst, regnode);
20250 #ifdef RE_TRACK_PATTERN_OFFSETS
20251 if (RExC_offsets) { /* MJD 20010112 */
20253 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20257 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20258 ? "Overwriting end of array!\n" : "OK",
20259 (UV)REGNODE_OFFSET(src),
20260 (UV)REGNODE_OFFSET(dst),
20261 (UV)RExC_offsets[0]));
20262 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20263 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20268 place = REGNODE_p(operand); /* Op node, where operand used to be. */
20269 #ifdef RE_TRACK_PATTERN_OFFSETS
20270 if (RExC_offsets) { /* MJD */
20272 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20276 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20277 ? "Overwriting end of array!\n" : "OK",
20278 (UV)REGNODE_OFFSET(place),
20279 (UV)(RExC_parse - RExC_start),
20280 (UV)RExC_offsets[0]));
20281 Set_Node_Offset(place, RExC_parse);
20282 Set_Node_Length(place, 1);
20285 src = NEXTOPER(place);
20287 FILL_NODE(operand, op);
20289 /* Zero out any arguments in the new node */
20290 Zero(src, offset, regnode);
20294 - regtail - set the next-pointer at the end of a node chain of p to val. If
20295 that value won't fit in the space available, instead returns FALSE.
20296 (Except asserts if we can't fit in the largest space the regex
20297 engine is designed for.)
20298 - SEE ALSO: regtail_study
20301 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20302 const regnode_offset p,
20303 const regnode_offset val,
20306 regnode_offset scan;
20307 GET_RE_DEBUG_FLAGS_DECL;
20309 PERL_ARGS_ASSERT_REGTAIL;
20311 PERL_UNUSED_ARG(depth);
20314 /* Find last node. */
20315 scan = (regnode_offset) p;
20317 regnode * const temp = regnext(REGNODE_p(scan));
20319 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20320 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20321 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
20322 SvPV_nolen_const(RExC_mysv), scan,
20323 (temp == NULL ? "->" : ""),
20324 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20329 scan = REGNODE_OFFSET(temp);
20332 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20333 assert((UV) (val - scan) <= U32_MAX);
20334 ARG_SET(REGNODE_p(scan), val - scan);
20337 if (val - scan > U16_MAX) {
20338 /* Populate this with something that won't loop and will likely
20339 * lead to a crash if the caller ignores the failure return, and
20340 * execution continues */
20341 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20344 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20352 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20353 - Look for optimizable sequences at the same time.
20354 - currently only looks for EXACT chains.
20356 This is experimental code. The idea is to use this routine to perform
20357 in place optimizations on branches and groups as they are constructed,
20358 with the long term intention of removing optimization from study_chunk so
20359 that it is purely analytical.
20361 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20362 to control which is which.
20364 This used to return a value that was ignored. It was a problem that it is
20365 #ifdef'd to be another function that didn't return a value. khw has changed it
20366 so both currently return a pass/fail return.
20369 /* TODO: All four parms should be const */
20372 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20373 const regnode_offset val, U32 depth)
20375 regnode_offset scan;
20377 #ifdef EXPERIMENTAL_INPLACESCAN
20380 GET_RE_DEBUG_FLAGS_DECL;
20382 PERL_ARGS_ASSERT_REGTAIL_STUDY;
20385 /* Find last node. */
20389 regnode * const temp = regnext(REGNODE_p(scan));
20390 #ifdef EXPERIMENTAL_INPLACESCAN
20391 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20392 bool unfolded_multi_char; /* Unexamined in this routine */
20393 if (join_exact(pRExC_state, scan, &min,
20394 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20395 return TRUE; /* Was return EXACT */
20399 switch (OP(REGNODE_p(scan))) {
20406 case EXACTFU_S_EDGE:
20407 case EXACTFAA_NO_TRIE:
20414 if( exact == PSEUDO )
20415 exact= OP(REGNODE_p(scan));
20416 else if ( exact != OP(REGNODE_p(scan)) )
20425 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20426 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20427 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
20428 SvPV_nolen_const(RExC_mysv),
20430 PL_reg_name[exact]);
20434 scan = REGNODE_OFFSET(temp);
20437 DEBUG_PARSE_MSG("");
20438 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20439 Perl_re_printf( aTHX_
20440 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20441 SvPV_nolen_const(RExC_mysv),
20446 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20447 assert((UV) (val - scan) <= U32_MAX);
20448 ARG_SET(REGNODE_p(scan), val - scan);
20451 if (val - scan > U16_MAX) {
20452 /* Populate this with something that won't loop and will likely
20453 * lead to a crash if the caller ignores the failure return, and
20454 * execution continues */
20455 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20458 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20461 return TRUE; /* Was 'return exact' */
20466 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20468 /* Returns an inversion list of all the code points matched by the
20469 * ANYOFM/NANYOFM node 'n' */
20471 SV * cp_list = _new_invlist(-1);
20472 const U8 lowest = (U8) ARG(n);
20475 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20477 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20479 /* Starting with the lowest code point, any code point that ANDed with the
20480 * mask yields the lowest code point is in the set */
20481 for (i = lowest; i <= 0xFF; i++) {
20482 if ((i & FLAGS(n)) == ARG(n)) {
20483 cp_list = add_cp_to_invlist(cp_list, i);
20486 /* We know how many code points (a power of two) that are in the
20487 * set. No use looking once we've got that number */
20488 if (count >= needed) break;
20492 if (OP(n) == NANYOFM) {
20493 _invlist_invert(cp_list);
20499 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20504 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20509 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20511 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20512 if (flags & (1<<bit)) {
20513 if (!set++ && lead)
20514 Perl_re_printf( aTHX_ "%s", lead);
20515 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
20520 Perl_re_printf( aTHX_ "\n");
20522 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20527 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20533 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20535 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20536 if (flags & (1<<bit)) {
20537 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
20540 if (!set++ && lead)
20541 Perl_re_printf( aTHX_ "%s", lead);
20542 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
20545 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20546 if (!set++ && lead) {
20547 Perl_re_printf( aTHX_ "%s", lead);
20550 case REGEX_UNICODE_CHARSET:
20551 Perl_re_printf( aTHX_ "UNICODE");
20553 case REGEX_LOCALE_CHARSET:
20554 Perl_re_printf( aTHX_ "LOCALE");
20556 case REGEX_ASCII_RESTRICTED_CHARSET:
20557 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
20559 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20560 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
20563 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
20569 Perl_re_printf( aTHX_ "\n");
20571 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20577 Perl_regdump(pTHX_ const regexp *r)
20581 SV * const sv = sv_newmortal();
20582 SV *dsv= sv_newmortal();
20583 RXi_GET_DECL(r, ri);
20584 GET_RE_DEBUG_FLAGS_DECL;
20586 PERL_ARGS_ASSERT_REGDUMP;
20588 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20590 /* Header fields of interest. */
20591 for (i = 0; i < 2; i++) {
20592 if (r->substrs->data[i].substr) {
20593 RE_PV_QUOTED_DECL(s, 0, dsv,
20594 SvPVX_const(r->substrs->data[i].substr),
20595 RE_SV_DUMPLEN(r->substrs->data[i].substr),
20596 PL_dump_re_max_len);
20597 Perl_re_printf( aTHX_
20598 "%s %s%s at %" IVdf "..%" UVuf " ",
20599 i ? "floating" : "anchored",
20601 RE_SV_TAIL(r->substrs->data[i].substr),
20602 (IV)r->substrs->data[i].min_offset,
20603 (UV)r->substrs->data[i].max_offset);
20605 else if (r->substrs->data[i].utf8_substr) {
20606 RE_PV_QUOTED_DECL(s, 1, dsv,
20607 SvPVX_const(r->substrs->data[i].utf8_substr),
20608 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20610 Perl_re_printf( aTHX_
20611 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20612 i ? "floating" : "anchored",
20614 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20615 (IV)r->substrs->data[i].min_offset,
20616 (UV)r->substrs->data[i].max_offset);
20620 if (r->check_substr || r->check_utf8)
20621 Perl_re_printf( aTHX_
20623 ( r->check_substr == r->substrs->data[1].substr
20624 && r->check_utf8 == r->substrs->data[1].utf8_substr
20625 ? "(checking floating" : "(checking anchored"));
20626 if (r->intflags & PREGf_NOSCAN)
20627 Perl_re_printf( aTHX_ " noscan");
20628 if (r->extflags & RXf_CHECK_ALL)
20629 Perl_re_printf( aTHX_ " isall");
20630 if (r->check_substr || r->check_utf8)
20631 Perl_re_printf( aTHX_ ") ");
20633 if (ri->regstclass) {
20634 regprop(r, sv, ri->regstclass, NULL, NULL);
20635 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
20637 if (r->intflags & PREGf_ANCH) {
20638 Perl_re_printf( aTHX_ "anchored");
20639 if (r->intflags & PREGf_ANCH_MBOL)
20640 Perl_re_printf( aTHX_ "(MBOL)");
20641 if (r->intflags & PREGf_ANCH_SBOL)
20642 Perl_re_printf( aTHX_ "(SBOL)");
20643 if (r->intflags & PREGf_ANCH_GPOS)
20644 Perl_re_printf( aTHX_ "(GPOS)");
20645 Perl_re_printf( aTHX_ " ");
20647 if (r->intflags & PREGf_GPOS_SEEN)
20648 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
20649 if (r->intflags & PREGf_SKIP)
20650 Perl_re_printf( aTHX_ "plus ");
20651 if (r->intflags & PREGf_IMPLICIT)
20652 Perl_re_printf( aTHX_ "implicit ");
20653 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
20654 if (r->extflags & RXf_EVAL_SEEN)
20655 Perl_re_printf( aTHX_ "with eval ");
20656 Perl_re_printf( aTHX_ "\n");
20658 regdump_extflags("r->extflags: ", r->extflags);
20659 regdump_intflags("r->intflags: ", r->intflags);
20662 PERL_ARGS_ASSERT_REGDUMP;
20663 PERL_UNUSED_CONTEXT;
20664 PERL_UNUSED_ARG(r);
20665 #endif /* DEBUGGING */
20668 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20671 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
20672 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
20673 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
20674 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
20675 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
20676 || _CC_VERTSPACE != 15
20677 # error Need to adjust order of anyofs[]
20679 static const char * const anyofs[] = {
20716 - regprop - printable representation of opcode, with run time support
20720 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20725 RXi_GET_DECL(prog, progi);
20726 GET_RE_DEBUG_FLAGS_DECL;
20728 PERL_ARGS_ASSERT_REGPROP;
20732 if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
20733 if (pRExC_state) { /* This gives more info, if we have it */
20734 FAIL3("panic: corrupted regexp opcode %d > %d",
20735 (int)OP(o), (int)REGNODE_MAX);
20738 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
20739 (int)OP(o), (int)REGNODE_MAX);
20742 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20744 k = PL_regkind[OP(o)];
20747 sv_catpvs(sv, " ");
20748 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20749 * is a crude hack but it may be the best for now since
20750 * we have no flag "this EXACTish node was UTF-8"
20752 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20753 PL_colors[0], PL_colors[1],
20754 PERL_PV_ESCAPE_UNI_DETECT |
20755 PERL_PV_ESCAPE_NONASCII |
20756 PERL_PV_PRETTY_ELLIPSES |
20757 PERL_PV_PRETTY_LTGT |
20758 PERL_PV_PRETTY_NOCLEAR
20760 } else if (k == TRIE) {
20761 /* print the details of the trie in dumpuntil instead, as
20762 * progi->data isn't available here */
20763 const char op = OP(o);
20764 const U32 n = ARG(o);
20765 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20766 (reg_ac_data *)progi->data->data[n] :
20768 const reg_trie_data * const trie
20769 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20771 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20772 DEBUG_TRIE_COMPILE_r({
20774 sv_catpvs(sv, "(JUMP)");
20775 Perl_sv_catpvf(aTHX_ sv,
20776 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20777 (UV)trie->startstate,
20778 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20779 (UV)trie->wordcount,
20782 (UV)TRIE_CHARCOUNT(trie),
20783 (UV)trie->uniquecharcount
20786 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20787 sv_catpvs(sv, "[");
20788 (void) put_charclass_bitmap_innards(sv,
20789 ((IS_ANYOF_TRIE(op))
20791 : TRIE_BITMAP(trie)),
20798 sv_catpvs(sv, "]");
20800 } else if (k == CURLY) {
20801 U32 lo = ARG1(o), hi = ARG2(o);
20802 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20803 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20804 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20805 if (hi == REG_INFTY)
20806 sv_catpvs(sv, "INFTY");
20808 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20809 sv_catpvs(sv, "}");
20811 else if (k == WHILEM && o->flags) /* Ordinal/of */
20812 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20813 else if (k == REF || k == OPEN || k == CLOSE
20814 || k == GROUPP || OP(o)==ACCEPT)
20816 AV *name_list= NULL;
20817 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20818 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
20819 if ( RXp_PAREN_NAMES(prog) ) {
20820 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20821 } else if ( pRExC_state ) {
20822 name_list= RExC_paren_name_list;
20825 if ( k != REF || (OP(o) < REFN)) {
20826 SV **name= av_fetch(name_list, parno, 0 );
20828 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20831 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20832 I32 *nums=(I32*)SvPVX(sv_dat);
20833 SV **name= av_fetch(name_list, nums[0], 0 );
20836 for ( n=0; n<SvIVX(sv_dat); n++ ) {
20837 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20838 (n ? "," : ""), (IV)nums[n]);
20840 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20844 if ( k == REF && reginfo) {
20845 U32 n = ARG(o); /* which paren pair */
20846 I32 ln = prog->offs[n].start;
20847 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20848 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20849 else if (ln == prog->offs[n].end)
20850 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20852 const char *s = reginfo->strbeg + ln;
20853 Perl_sv_catpvf(aTHX_ sv, ": ");
20854 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20855 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20858 } else if (k == GOSUB) {
20859 AV *name_list= NULL;
20860 if ( RXp_PAREN_NAMES(prog) ) {
20861 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20862 } else if ( pRExC_state ) {
20863 name_list= RExC_paren_name_list;
20866 /* Paren and offset */
20867 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20868 (int)((o + (int)ARG2L(o)) - progi->program) );
20870 SV **name= av_fetch(name_list, ARG(o), 0 );
20872 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20875 else if (k == LOGICAL)
20876 /* 2: embedded, otherwise 1 */
20877 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20878 else if (k == ANYOF || k == ANYOFR) {
20882 bool do_sep = FALSE; /* Do we need to separate various components of
20884 /* Set if there is still an unresolved user-defined property */
20885 SV *unresolved = NULL;
20887 /* Things that are ignored except when the runtime locale is UTF-8 */
20888 SV *only_utf8_locale_invlist = NULL;
20890 /* Code points that don't fit in the bitmap */
20891 SV *nonbitmap_invlist = NULL;
20893 /* And things that aren't in the bitmap, but are small enough to be */
20894 SV* bitmap_range_not_in_bitmap = NULL;
20898 if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
20904 flags = ANYOF_FLAGS(o);
20905 bitmap = ANYOF_BITMAP(o);
20909 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20910 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20911 sv_catpvs(sv, "{utf8-locale-reqd}");
20913 if (flags & ANYOFL_FOLD) {
20914 sv_catpvs(sv, "{i}");
20918 inverted = flags & ANYOF_INVERT;
20920 /* If there is stuff outside the bitmap, get it */
20921 if (arg != ANYOF_ONLY_HAS_BITMAP) {
20922 if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
20923 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20925 ANYOFRbase(o) + ANYOFRdelta(o));
20928 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20930 &only_utf8_locale_invlist,
20931 &nonbitmap_invlist);
20934 /* The non-bitmap data may contain stuff that could fit in the
20935 * bitmap. This could come from a user-defined property being
20936 * finally resolved when this call was done; or much more likely
20937 * because there are matches that require UTF-8 to be valid, and so
20938 * aren't in the bitmap (or ANYOFR). This is teased apart later */
20939 _invlist_intersection(nonbitmap_invlist,
20941 &bitmap_range_not_in_bitmap);
20942 /* Leave just the things that don't fit into the bitmap */
20943 _invlist_subtract(nonbitmap_invlist,
20945 &nonbitmap_invlist);
20948 /* Obey this flag to add all above-the-bitmap code points */
20949 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20950 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20951 NUM_ANYOF_CODE_POINTS,
20955 /* Ready to start outputting. First, the initial left bracket */
20956 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20958 /* ANYOFH by definition doesn't have anything that will fit inside the
20959 * bitmap; ANYOFR may or may not. */
20960 if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
20961 && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
20962 || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
20964 /* Then all the things that could fit in the bitmap */
20965 do_sep = put_charclass_bitmap_innards(sv,
20967 bitmap_range_not_in_bitmap,
20968 only_utf8_locale_invlist,
20972 /* Can't try inverting for a
20973 * better display if there
20974 * are things that haven't
20977 || inRANGE(OP(o), ANYOFR, ANYOFRb));
20978 SvREFCNT_dec(bitmap_range_not_in_bitmap);
20980 /* If there are user-defined properties which haven't been defined
20981 * yet, output them. If the result is not to be inverted, it is
20982 * clearest to output them in a separate [] from the bitmap range
20983 * stuff. If the result is to be complemented, we have to show
20984 * everything in one [], as the inversion applies to the whole
20985 * thing. Use {braces} to separate them from anything in the
20986 * bitmap and anything above the bitmap. */
20989 if (! do_sep) { /* If didn't output anything in the bitmap
20991 sv_catpvs(sv, "^");
20993 sv_catpvs(sv, "{");
20996 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20999 sv_catsv(sv, unresolved);
21001 sv_catpvs(sv, "}");
21003 do_sep = ! inverted;
21007 /* And, finally, add the above-the-bitmap stuff */
21008 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21011 /* See if truncation size is overridden */
21012 const STRLEN dump_len = (PL_dump_re_max_len > 256)
21013 ? PL_dump_re_max_len
21016 /* This is output in a separate [] */
21018 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21021 /* And, for easy of understanding, it is shown in the
21022 * uncomplemented form if possible. The one exception being if
21023 * there are unresolved items, where the inversion has to be
21024 * delayed until runtime */
21025 if (inverted && ! unresolved) {
21026 _invlist_invert(nonbitmap_invlist);
21027 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21030 contents = invlist_contents(nonbitmap_invlist,
21031 FALSE /* output suitable for catsv */
21034 /* If the output is shorter than the permissible maximum, just do it. */
21035 if (SvCUR(contents) <= dump_len) {
21036 sv_catsv(sv, contents);
21039 const char * contents_string = SvPVX(contents);
21040 STRLEN i = dump_len;
21042 /* Otherwise, start at the permissible max and work back to the
21043 * first break possibility */
21044 while (i > 0 && contents_string[i] != ' ') {
21047 if (i == 0) { /* Fail-safe. Use the max if we couldn't
21048 find a legal break */
21052 sv_catpvn(sv, contents_string, i);
21053 sv_catpvs(sv, "...");
21056 SvREFCNT_dec_NN(contents);
21057 SvREFCNT_dec_NN(nonbitmap_invlist);
21060 /* And finally the matching, closing ']' */
21061 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21063 if (OP(o) == ANYOFHs) {
21064 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21066 else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21067 U8 lowest = (OP(o) != ANYOFHr)
21069 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21070 U8 highest = (OP(o) == ANYOFHr)
21071 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21072 : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21075 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21076 if (lowest != highest) {
21077 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21079 Perl_sv_catpvf(aTHX_ sv, ")");
21082 SvREFCNT_dec(unresolved);
21084 else if (k == ANYOFM) {
21085 SV * cp_list = get_ANYOFM_contents(o);
21087 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21088 if (OP(o) == NANYOFM) {
21089 _invlist_invert(cp_list);
21092 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21093 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21095 SvREFCNT_dec(cp_list);
21097 else if (k == POSIXD || k == NPOSIXD) {
21098 U8 index = FLAGS(o) * 2;
21099 if (index < C_ARRAY_LENGTH(anyofs)) {
21100 if (*anyofs[index] != '[') {
21101 sv_catpvs(sv, "[");
21103 sv_catpv(sv, anyofs[index]);
21104 if (*anyofs[index] != '[') {
21105 sv_catpvs(sv, "]");
21109 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21112 else if (k == BOUND || k == NBOUND) {
21113 /* Must be synced with order of 'bound_type' in regcomp.h */
21114 const char * const bounds[] = {
21115 "", /* Traditional */
21121 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21122 sv_catpv(sv, bounds[FLAGS(o)]);
21124 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21125 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21127 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21129 Perl_sv_catpvf(aTHX_ sv, "]");
21131 else if (OP(o) == SBOL)
21132 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21134 /* add on the verb argument if there is one */
21135 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21137 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21138 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21140 sv_catpvs(sv, ":NULL");
21143 PERL_UNUSED_CONTEXT;
21144 PERL_UNUSED_ARG(sv);
21145 PERL_UNUSED_ARG(o);
21146 PERL_UNUSED_ARG(prog);
21147 PERL_UNUSED_ARG(reginfo);
21148 PERL_UNUSED_ARG(pRExC_state);
21149 #endif /* DEBUGGING */
21155 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21156 { /* Assume that RE_INTUIT is set */
21157 struct regexp *const prog = ReANY(r);
21158 GET_RE_DEBUG_FLAGS_DECL;
21160 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21161 PERL_UNUSED_CONTEXT;
21165 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21166 ? prog->check_utf8 : prog->check_substr);
21168 if (!PL_colorset) reginitcolors();
21169 Perl_re_printf( aTHX_
21170 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21172 RX_UTF8(r) ? "utf8 " : "",
21173 PL_colors[5], PL_colors[0],
21176 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21179 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21180 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21186 handles refcounting and freeing the perl core regexp structure. When
21187 it is necessary to actually free the structure the first thing it
21188 does is call the 'free' method of the regexp_engine associated to
21189 the regexp, allowing the handling of the void *pprivate; member
21190 first. (This routine is not overridable by extensions, which is why
21191 the extensions free is called first.)
21193 See regdupe and regdupe_internal if you change anything here.
21195 #ifndef PERL_IN_XSUB_RE
21197 Perl_pregfree(pTHX_ REGEXP *r)
21203 Perl_pregfree2(pTHX_ REGEXP *rx)
21205 struct regexp *const r = ReANY(rx);
21206 GET_RE_DEBUG_FLAGS_DECL;
21208 PERL_ARGS_ASSERT_PREGFREE2;
21213 if (r->mother_re) {
21214 ReREFCNT_dec(r->mother_re);
21216 CALLREGFREE_PVT(rx); /* free the private data */
21217 SvREFCNT_dec(RXp_PAREN_NAMES(r));
21221 for (i = 0; i < 2; i++) {
21222 SvREFCNT_dec(r->substrs->data[i].substr);
21223 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21225 Safefree(r->substrs);
21227 RX_MATCH_COPY_FREE(rx);
21228 #ifdef PERL_ANY_COW
21229 SvREFCNT_dec(r->saved_copy);
21232 SvREFCNT_dec(r->qr_anoncv);
21233 if (r->recurse_locinput)
21234 Safefree(r->recurse_locinput);
21240 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21241 except that dsv will be created if NULL.
21243 This function is used in two main ways. First to implement
21244 $r = qr/....; $s = $$r;
21246 Secondly, it is used as a hacky workaround to the structural issue of
21248 being stored in the regexp structure which is in turn stored in
21249 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21250 could be PL_curpm in multiple contexts, and could require multiple
21251 result sets being associated with the pattern simultaneously, such
21252 as when doing a recursive match with (??{$qr})
21254 The solution is to make a lightweight copy of the regexp structure
21255 when a qr// is returned from the code executed by (??{$qr}) this
21256 lightweight copy doesn't actually own any of its data except for
21257 the starp/end and the actual regexp structure itself.
21263 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21265 struct regexp *drx;
21266 struct regexp *const srx = ReANY(ssv);
21267 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21269 PERL_ARGS_ASSERT_REG_TEMP_COPY;
21272 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21274 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21276 /* our only valid caller, sv_setsv_flags(), should have done
21277 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21278 assert(!SvOOK(dsv));
21279 assert(!SvIsCOW(dsv));
21280 assert(!SvROK(dsv));
21282 if (SvPVX_const(dsv)) {
21284 Safefree(SvPVX(dsv));
21289 SvOK_off((SV *)dsv);
21292 /* For PVLVs, the head (sv_any) points to an XPVLV, while
21293 * the LV's xpvlenu_rx will point to a regexp body, which
21294 * we allocate here */
21295 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21296 assert(!SvPVX(dsv));
21297 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21298 temp->sv_any = NULL;
21299 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21300 SvREFCNT_dec_NN(temp);
21301 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21302 ing below will not set it. */
21303 SvCUR_set(dsv, SvCUR(ssv));
21306 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21307 sv_force_normal(sv) is called. */
21311 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21312 SvPV_set(dsv, RX_WRAPPED(ssv));
21313 /* We share the same string buffer as the original regexp, on which we
21314 hold a reference count, incremented when mother_re is set below.
21315 The string pointer is copied here, being part of the regexp struct.
21317 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21318 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21322 const I32 npar = srx->nparens+1;
21323 Newx(drx->offs, npar, regexp_paren_pair);
21324 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21326 if (srx->substrs) {
21328 Newx(drx->substrs, 1, struct reg_substr_data);
21329 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21331 for (i = 0; i < 2; i++) {
21332 SvREFCNT_inc_void(drx->substrs->data[i].substr);
21333 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21336 /* check_substr and check_utf8, if non-NULL, point to either their
21337 anchored or float namesakes, and don't hold a second reference. */
21339 RX_MATCH_COPIED_off(dsv);
21340 #ifdef PERL_ANY_COW
21341 drx->saved_copy = NULL;
21343 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21344 SvREFCNT_inc_void(drx->qr_anoncv);
21345 if (srx->recurse_locinput)
21346 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21353 /* regfree_internal()
21355 Free the private data in a regexp. This is overloadable by
21356 extensions. Perl takes care of the regexp structure in pregfree(),
21357 this covers the *pprivate pointer which technically perl doesn't
21358 know about, however of course we have to handle the
21359 regexp_internal structure when no extension is in use.
21361 Note this is called before freeing anything in the regexp
21366 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21368 struct regexp *const r = ReANY(rx);
21369 RXi_GET_DECL(r, ri);
21370 GET_RE_DEBUG_FLAGS_DECL;
21372 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21382 SV *dsv= sv_newmortal();
21383 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21384 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21385 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21386 PL_colors[4], PL_colors[5], s);
21390 #ifdef RE_TRACK_PATTERN_OFFSETS
21392 Safefree(ri->u.offsets); /* 20010421 MJD */
21394 if (ri->code_blocks)
21395 S_free_codeblocks(aTHX_ ri->code_blocks);
21398 int n = ri->data->count;
21401 /* If you add a ->what type here, update the comment in regcomp.h */
21402 switch (ri->data->what[n]) {
21408 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21411 Safefree(ri->data->data[n]);
21417 { /* Aho Corasick add-on structure for a trie node.
21418 Used in stclass optimization only */
21420 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21421 #ifdef USE_ITHREADS
21425 refcount = --aho->refcount;
21428 PerlMemShared_free(aho->states);
21429 PerlMemShared_free(aho->fail);
21430 /* do this last!!!! */
21431 PerlMemShared_free(ri->data->data[n]);
21432 /* we should only ever get called once, so
21433 * assert as much, and also guard the free
21434 * which /might/ happen twice. At the least
21435 * it will make code anlyzers happy and it
21436 * doesn't cost much. - Yves */
21437 assert(ri->regstclass);
21438 if (ri->regstclass) {
21439 PerlMemShared_free(ri->regstclass);
21440 ri->regstclass = 0;
21447 /* trie structure. */
21449 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21450 #ifdef USE_ITHREADS
21454 refcount = --trie->refcount;
21457 PerlMemShared_free(trie->charmap);
21458 PerlMemShared_free(trie->states);
21459 PerlMemShared_free(trie->trans);
21461 PerlMemShared_free(trie->bitmap);
21463 PerlMemShared_free(trie->jump);
21464 PerlMemShared_free(trie->wordinfo);
21465 /* do this last!!!! */
21466 PerlMemShared_free(ri->data->data[n]);
21471 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21472 ri->data->what[n]);
21475 Safefree(ri->data->what);
21476 Safefree(ri->data);
21482 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21483 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21484 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
21487 re_dup_guts - duplicate a regexp.
21489 This routine is expected to clone a given regexp structure. It is only
21490 compiled under USE_ITHREADS.
21492 After all of the core data stored in struct regexp is duplicated
21493 the regexp_engine.dupe method is used to copy any private data
21494 stored in the *pprivate pointer. This allows extensions to handle
21495 any duplication it needs to do.
21497 See pregfree() and regfree_internal() if you change anything here.
21499 #if defined(USE_ITHREADS)
21500 #ifndef PERL_IN_XSUB_RE
21502 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21506 const struct regexp *r = ReANY(sstr);
21507 struct regexp *ret = ReANY(dstr);
21509 PERL_ARGS_ASSERT_RE_DUP_GUTS;
21511 npar = r->nparens+1;
21512 Newx(ret->offs, npar, regexp_paren_pair);
21513 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21515 if (ret->substrs) {
21516 /* Do it this way to avoid reading from *r after the StructCopy().
21517 That way, if any of the sv_dup_inc()s dislodge *r from the L1
21518 cache, it doesn't matter. */
21520 const bool anchored = r->check_substr
21521 ? r->check_substr == r->substrs->data[0].substr
21522 : r->check_utf8 == r->substrs->data[0].utf8_substr;
21523 Newx(ret->substrs, 1, struct reg_substr_data);
21524 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21526 for (i = 0; i < 2; i++) {
21527 ret->substrs->data[i].substr =
21528 sv_dup_inc(ret->substrs->data[i].substr, param);
21529 ret->substrs->data[i].utf8_substr =
21530 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21533 /* check_substr and check_utf8, if non-NULL, point to either their
21534 anchored or float namesakes, and don't hold a second reference. */
21536 if (ret->check_substr) {
21538 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21540 ret->check_substr = ret->substrs->data[0].substr;
21541 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21543 assert(r->check_substr == r->substrs->data[1].substr);
21544 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
21546 ret->check_substr = ret->substrs->data[1].substr;
21547 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21549 } else if (ret->check_utf8) {
21551 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21553 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21558 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21559 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21560 if (r->recurse_locinput)
21561 Newx(ret->recurse_locinput, r->nparens + 1, char *);
21564 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21566 if (RX_MATCH_COPIED(dstr))
21567 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
21569 ret->subbeg = NULL;
21570 #ifdef PERL_ANY_COW
21571 ret->saved_copy = NULL;
21574 /* Whether mother_re be set or no, we need to copy the string. We
21575 cannot refrain from copying it when the storage points directly to
21576 our mother regexp, because that's
21577 1: a buffer in a different thread
21578 2: something we no longer hold a reference on
21579 so we need to copy it locally. */
21580 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21581 /* set malloced length to a non-zero value so it will be freed
21582 * (otherwise in combination with SVf_FAKE it looks like an alien
21583 * buffer). It doesn't have to be the actual malloced size, since it
21584 * should never be grown */
21585 SvLEN_set(dstr, SvCUR(sstr)+1);
21586 ret->mother_re = NULL;
21588 #endif /* PERL_IN_XSUB_RE */
21593 This is the internal complement to regdupe() which is used to copy
21594 the structure pointed to by the *pprivate pointer in the regexp.
21595 This is the core version of the extension overridable cloning hook.
21596 The regexp structure being duplicated will be copied by perl prior
21597 to this and will be provided as the regexp *r argument, however
21598 with the /old/ structures pprivate pointer value. Thus this routine
21599 may override any copying normally done by perl.
21601 It returns a pointer to the new regexp_internal structure.
21605 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21608 struct regexp *const r = ReANY(rx);
21609 regexp_internal *reti;
21611 RXi_GET_DECL(r, ri);
21613 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21617 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21618 char, regexp_internal);
21619 Copy(ri->program, reti->program, len+1, regnode);
21622 if (ri->code_blocks) {
21624 Newx(reti->code_blocks, 1, struct reg_code_blocks);
21625 Newx(reti->code_blocks->cb, ri->code_blocks->count,
21626 struct reg_code_block);
21627 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21628 ri->code_blocks->count, struct reg_code_block);
21629 for (n = 0; n < ri->code_blocks->count; n++)
21630 reti->code_blocks->cb[n].src_regex = (REGEXP*)
21631 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21632 reti->code_blocks->count = ri->code_blocks->count;
21633 reti->code_blocks->refcnt = 1;
21636 reti->code_blocks = NULL;
21638 reti->regstclass = NULL;
21641 struct reg_data *d;
21642 const int count = ri->data->count;
21645 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
21646 char, struct reg_data);
21647 Newx(d->what, count, U8);
21650 for (i = 0; i < count; i++) {
21651 d->what[i] = ri->data->what[i];
21652 switch (d->what[i]) {
21653 /* see also regcomp.h and regfree_internal() */
21654 case 'a': /* actually an AV, but the dup function is identical.
21655 values seem to be "plain sv's" generally. */
21656 case 'r': /* a compiled regex (but still just another SV) */
21657 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
21658 this use case should go away, the code could have used
21659 'a' instead - see S_set_ANYOF_arg() for array contents. */
21660 case 'S': /* actually an SV, but the dup function is identical. */
21661 case 'u': /* actually an HV, but the dup function is identical.
21662 values are "plain sv's" */
21663 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
21666 /* Synthetic Start Class - "Fake" charclass we generate to optimize
21667 * patterns which could start with several different things. Pre-TRIE
21668 * this was more important than it is now, however this still helps
21669 * in some places, for instance /x?a+/ might produce a SSC equivalent
21670 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
21673 /* This is cheating. */
21674 Newx(d->data[i], 1, regnode_ssc);
21675 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
21676 reti->regstclass = (regnode*)d->data[i];
21679 /* AHO-CORASICK fail table */
21680 /* Trie stclasses are readonly and can thus be shared
21681 * without duplication. We free the stclass in pregfree
21682 * when the corresponding reg_ac_data struct is freed.
21684 reti->regstclass= ri->regstclass;
21687 /* TRIE transition table */
21689 ((reg_trie_data*)ri->data->data[i])->refcount++;
21692 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
21693 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
21694 is not from another regexp */
21695 d->data[i] = ri->data->data[i];
21698 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
21699 ri->data->what[i]);
21708 reti->name_list_idx = ri->name_list_idx;
21710 #ifdef RE_TRACK_PATTERN_OFFSETS
21711 if (ri->u.offsets) {
21712 Newx(reti->u.offsets, 2*len+1, U32);
21713 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
21716 SetProgLen(reti, len);
21719 return (void*)reti;
21722 #endif /* USE_ITHREADS */
21724 #ifndef PERL_IN_XSUB_RE
21727 - regnext - dig the "next" pointer out of a node
21730 Perl_regnext(pTHX_ regnode *p)
21737 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
21738 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21739 (int)OP(p), (int)REGNODE_MAX);
21742 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21752 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21755 STRLEN l1 = strlen(pat1);
21756 STRLEN l2 = strlen(pat2);
21759 const char *message;
21761 PERL_ARGS_ASSERT_RE_CROAK2;
21767 Copy(pat1, buf, l1 , char);
21768 Copy(pat2, buf + l1, l2 , char);
21769 buf[l1 + l2] = '\n';
21770 buf[l1 + l2 + 1] = '\0';
21771 va_start(args, pat2);
21772 msv = vmess(buf, &args);
21774 message = SvPV_const(msv, l1);
21777 Copy(message, buf, l1 , char);
21778 /* l1-1 to avoid \n */
21779 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21782 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
21784 #ifndef PERL_IN_XSUB_RE
21786 Perl_save_re_context(pTHX)
21791 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21794 const REGEXP * const rx = PM_GETRE(PL_curpm);
21796 nparens = RX_NPARENS(rx);
21799 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21800 * that PL_curpm will be null, but that utf8.pm and the modules it
21801 * loads will only use $1..$3.
21802 * The t/porting/re_context.t test file checks this assumption.
21807 for (i = 1; i <= nparens; i++) {
21808 char digits[TYPE_CHARS(long)];
21809 const STRLEN len = my_snprintf(digits, sizeof(digits),
21811 GV *const *const gvp
21812 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21815 GV * const gv = *gvp;
21816 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21826 S_put_code_point(pTHX_ SV *sv, UV c)
21828 PERL_ARGS_ASSERT_PUT_CODE_POINT;
21831 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21833 else if (isPRINT(c)) {
21834 const char string = (char) c;
21836 /* We use {phrase} as metanotation in the class, so also escape literal
21838 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21839 sv_catpvs(sv, "\\");
21840 sv_catpvn(sv, &string, 1);
21842 else if (isMNEMONIC_CNTRL(c)) {
21843 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21846 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21850 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21853 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21855 /* Appends to 'sv' a displayable version of the range of code points from
21856 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
21857 * that have them, when they occur at the beginning or end of the range.
21858 * It uses hex to output the remaining code points, unless 'allow_literals'
21859 * is true, in which case the printable ASCII ones are output as-is (though
21860 * some of these will be escaped by put_code_point()).
21862 * NOTE: This is designed only for printing ranges of code points that fit
21863 * inside an ANYOF bitmap. Higher code points are simply suppressed
21866 const unsigned int min_range_count = 3;
21868 assert(start <= end);
21870 PERL_ARGS_ASSERT_PUT_RANGE;
21872 while (start <= end) {
21874 const char * format;
21876 if (end - start < min_range_count) {
21878 /* Output chars individually when they occur in short ranges */
21879 for (; start <= end; start++) {
21880 put_code_point(sv, start);
21885 /* If permitted by the input options, and there is a possibility that
21886 * this range contains a printable literal, look to see if there is
21888 if (allow_literals && start <= MAX_PRINT_A) {
21890 /* If the character at the beginning of the range isn't an ASCII
21891 * printable, effectively split the range into two parts:
21892 * 1) the portion before the first such printable,
21894 * and output them separately. */
21895 if (! isPRINT_A(start)) {
21896 UV temp_end = start + 1;
21898 /* There is no point looking beyond the final possible
21899 * printable, in MAX_PRINT_A */
21900 UV max = MIN(end, MAX_PRINT_A);
21902 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21906 /* Here, temp_end points to one beyond the first printable if
21907 * found, or to one beyond 'max' if not. If none found, make
21908 * sure that we use the entire range */
21909 if (temp_end > MAX_PRINT_A) {
21910 temp_end = end + 1;
21913 /* Output the first part of the split range: the part that
21914 * doesn't have printables, with the parameter set to not look
21915 * for literals (otherwise we would infinitely recurse) */
21916 put_range(sv, start, temp_end - 1, FALSE);
21918 /* The 2nd part of the range (if any) starts here. */
21921 /* We do a continue, instead of dropping down, because even if
21922 * the 2nd part is non-empty, it could be so short that we want
21923 * to output it as individual characters, as tested for at the
21924 * top of this loop. */
21928 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
21929 * output a sub-range of just the digits or letters, then process
21930 * the remaining portion as usual. */
21931 if (isALPHANUMERIC_A(start)) {
21932 UV mask = (isDIGIT_A(start))
21937 UV temp_end = start + 1;
21939 /* Find the end of the sub-range that includes just the
21940 * characters in the same class as the first character in it */
21941 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21946 /* For short ranges, don't duplicate the code above to output
21947 * them; just call recursively */
21948 if (temp_end - start < min_range_count) {
21949 put_range(sv, start, temp_end, FALSE);
21951 else { /* Output as a range */
21952 put_code_point(sv, start);
21953 sv_catpvs(sv, "-");
21954 put_code_point(sv, temp_end);
21956 start = temp_end + 1;
21960 /* We output any other printables as individual characters */
21961 if (isPUNCT_A(start) || isSPACE_A(start)) {
21962 while (start <= end && (isPUNCT_A(start)
21963 || isSPACE_A(start)))
21965 put_code_point(sv, start);
21970 } /* End of looking for literals */
21972 /* Here is not to output as a literal. Some control characters have
21973 * mnemonic names. Split off any of those at the beginning and end of
21974 * the range to print mnemonically. It isn't possible for many of
21975 * these to be in a row, so this won't overwhelm with output */
21977 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21979 while (isMNEMONIC_CNTRL(start) && start <= end) {
21980 put_code_point(sv, start);
21984 /* If this didn't take care of the whole range ... */
21985 if (start <= end) {
21987 /* Look backwards from the end to find the final non-mnemonic
21990 while (isMNEMONIC_CNTRL(temp_end)) {
21994 /* And separately output the interior range that doesn't start
21995 * or end with mnemonics */
21996 put_range(sv, start, temp_end, FALSE);
21998 /* Then output the mnemonic trailing controls */
21999 start = temp_end + 1;
22000 while (start <= end) {
22001 put_code_point(sv, start);
22008 /* As a final resort, output the range or subrange as hex. */
22010 if (start >= NUM_ANYOF_CODE_POINTS) {
22013 else { /* Have to split range at the bitmap boundary */
22014 this_end = (end < NUM_ANYOF_CODE_POINTS)
22016 : NUM_ANYOF_CODE_POINTS - 1;
22018 #if NUM_ANYOF_CODE_POINTS > 256
22019 format = (this_end < 256)
22020 ? "\\x%02" UVXf "-\\x%02" UVXf
22021 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22023 format = "\\x%02" UVXf "-\\x%02" UVXf;
22025 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22026 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22027 GCC_DIAG_RESTORE_STMT;
22033 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22035 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22039 bool allow_literals = TRUE;
22041 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22043 /* Generally, it is more readable if printable characters are output as
22044 * literals, but if a range (nearly) spans all of them, it's best to output
22045 * it as a single range. This code will use a single range if all but 2
22046 * ASCII printables are in it */
22047 invlist_iterinit(invlist);
22048 while (invlist_iternext(invlist, &start, &end)) {
22050 /* If the range starts beyond the final printable, it doesn't have any
22052 if (start > MAX_PRINT_A) {
22056 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
22057 * all but two, the range must start and end no later than 2 from
22059 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22060 if (end > MAX_PRINT_A) {
22066 if (end - start >= MAX_PRINT_A - ' ' - 2) {
22067 allow_literals = FALSE;
22072 invlist_iterfinish(invlist);
22074 /* Here we have figured things out. Output each range */
22075 invlist_iterinit(invlist);
22076 while (invlist_iternext(invlist, &start, &end)) {
22077 if (start >= NUM_ANYOF_CODE_POINTS) {
22080 put_range(sv, start, end, allow_literals);
22082 invlist_iterfinish(invlist);
22088 S_put_charclass_bitmap_innards_common(pTHX_
22089 SV* invlist, /* The bitmap */
22090 SV* posixes, /* Under /l, things like [:word:], \S */
22091 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
22092 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
22093 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
22094 const bool invert /* Is the result to be inverted? */
22097 /* Create and return an SV containing a displayable version of the bitmap
22098 * and associated information determined by the input parameters. If the
22099 * output would have been only the inversion indicator '^', NULL is instead
22105 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22108 output = newSVpvs("^");
22111 output = newSVpvs("");
22114 /* First, the code points in the bitmap that are unconditionally there */
22115 put_charclass_bitmap_innards_invlist(output, invlist);
22117 /* Traditionally, these have been placed after the main code points */
22119 sv_catsv(output, posixes);
22122 if (only_utf8 && _invlist_len(only_utf8)) {
22123 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22124 put_charclass_bitmap_innards_invlist(output, only_utf8);
22127 if (not_utf8 && _invlist_len(not_utf8)) {
22128 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22129 put_charclass_bitmap_innards_invlist(output, not_utf8);
22132 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22133 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22134 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22136 /* This is the only list in this routine that can legally contain code
22137 * points outside the bitmap range. The call just above to
22138 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22139 * output them here. There's about a half-dozen possible, and none in
22140 * contiguous ranges longer than 2 */
22141 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22143 SV* above_bitmap = NULL;
22145 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22147 invlist_iterinit(above_bitmap);
22148 while (invlist_iternext(above_bitmap, &start, &end)) {
22151 for (i = start; i <= end; i++) {
22152 put_code_point(output, i);
22155 invlist_iterfinish(above_bitmap);
22156 SvREFCNT_dec_NN(above_bitmap);
22160 if (invert && SvCUR(output) == 1) {
22168 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22170 SV *nonbitmap_invlist,
22171 SV *only_utf8_locale_invlist,
22172 const regnode * const node,
22174 const bool force_as_is_display)
22176 /* Appends to 'sv' a displayable version of the innards of the bracketed
22177 * character class defined by the other arguments:
22178 * 'bitmap' points to the bitmap, or NULL if to ignore that.
22179 * 'nonbitmap_invlist' is an inversion list of the code points that are in
22180 * the bitmap range, but for some reason aren't in the bitmap; NULL if
22181 * none. The reasons for this could be that they require some
22182 * condition such as the target string being or not being in UTF-8
22183 * (under /d), or because they came from a user-defined property that
22184 * was not resolved at the time of the regex compilation (under /u)
22185 * 'only_utf8_locale_invlist' is an inversion list of the code points that
22186 * are valid only if the runtime locale is a UTF-8 one; NULL if none
22187 * 'node' is the regex pattern ANYOF node. It is needed only when the
22188 * above two parameters are not null, and is passed so that this
22189 * routine can tease apart the various reasons for them.
22190 * 'flags' is the flags field of 'node'
22191 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
22192 * to invert things to see if that leads to a cleaner display. If
22193 * FALSE, this routine is free to use its judgment about doing this.
22195 * It returns TRUE if there was actually something output. (It may be that
22196 * the bitmap, etc is empty.)
22198 * When called for outputting the bitmap of a non-ANYOF node, just pass the
22199 * bitmap, with the succeeding parameters set to NULL, and the final one to
22203 /* In general, it tries to display the 'cleanest' representation of the
22204 * innards, choosing whether to display them inverted or not, regardless of
22205 * whether the class itself is to be inverted. However, there are some
22206 * cases where it can't try inverting, as what actually matches isn't known
22207 * until runtime, and hence the inversion isn't either. */
22210 bool inverting_allowed = ! force_as_is_display;
22213 STRLEN orig_sv_cur = SvCUR(sv);
22215 SV* invlist; /* Inversion list we accumulate of code points that
22216 are unconditionally matched */
22217 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
22219 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
22221 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
22222 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
22225 SV* as_is_display; /* The output string when we take the inputs
22227 SV* inverted_display; /* The output string when we invert the inputs */
22229 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
22231 /* We are biased in favor of displaying things without them being inverted,
22232 * as that is generally easier to understand */
22233 const int bias = 5;
22235 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22237 /* Start off with whatever code points are passed in. (We clone, so we
22238 * don't change the caller's list) */
22239 if (nonbitmap_invlist) {
22240 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22241 invlist = invlist_clone(nonbitmap_invlist, NULL);
22243 else { /* Worst case size is every other code point is matched */
22244 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22248 if (OP(node) == ANYOFD) {
22250 /* This flag indicates that the code points below 0x100 in the
22251 * nonbitmap list are precisely the ones that match only when the
22252 * target is UTF-8 (they should all be non-ASCII). */
22253 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22255 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22256 _invlist_subtract(invlist, only_utf8, &invlist);
22259 /* And this flag for matching all non-ASCII 0xFF and below */
22260 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22262 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22265 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22267 /* If either of these flags are set, what matches isn't
22268 * determinable except during execution, so don't know enough here
22270 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22271 inverting_allowed = FALSE;
22274 /* What the posix classes match also varies at runtime, so these
22275 * will be output symbolically. */
22276 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22279 posixes = newSVpvs("");
22280 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22281 if (ANYOF_POSIXL_TEST(node, i)) {
22282 sv_catpv(posixes, anyofs[i]);
22289 /* Accumulate the bit map into the unconditional match list */
22291 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22292 if (BITMAP_TEST(bitmap, i)) {
22295 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22298 invlist = _add_range_to_invlist(invlist, start, i-1);
22303 /* Make sure that the conditional match lists don't have anything in them
22304 * that match unconditionally; otherwise the output is quite confusing.
22305 * This could happen if the code that populates these misses some
22308 _invlist_subtract(only_utf8, invlist, &only_utf8);
22311 _invlist_subtract(not_utf8, invlist, ¬_utf8);
22314 if (only_utf8_locale_invlist) {
22316 /* Since this list is passed in, we have to make a copy before
22318 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22320 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22322 /* And, it can get really weird for us to try outputting an inverted
22323 * form of this list when it has things above the bitmap, so don't even
22325 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22326 inverting_allowed = FALSE;
22330 /* Calculate what the output would be if we take the input as-is */
22331 as_is_display = put_charclass_bitmap_innards_common(invlist,
22338 /* If have to take the output as-is, just do that */
22339 if (! inverting_allowed) {
22340 if (as_is_display) {
22341 sv_catsv(sv, as_is_display);
22342 SvREFCNT_dec_NN(as_is_display);
22345 else { /* But otherwise, create the output again on the inverted input, and
22346 use whichever version is shorter */
22348 int inverted_bias, as_is_bias;
22350 /* We will apply our bias to whichever of the the results doesn't have
22360 inverted_bias = bias;
22363 /* Now invert each of the lists that contribute to the output,
22364 * excluding from the result things outside the possible range */
22366 /* For the unconditional inversion list, we have to add in all the
22367 * conditional code points, so that when inverted, they will be gone
22369 _invlist_union(only_utf8, invlist, &invlist);
22370 _invlist_union(not_utf8, invlist, &invlist);
22371 _invlist_union(only_utf8_locale, invlist, &invlist);
22372 _invlist_invert(invlist);
22373 _invlist_intersection(invlist, PL_InBitmap, &invlist);
22376 _invlist_invert(only_utf8);
22377 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22379 else if (not_utf8) {
22381 /* If a code point matches iff the target string is not in UTF-8,
22382 * then complementing the result has it not match iff not in UTF-8,
22383 * which is the same thing as matching iff it is UTF-8. */
22384 only_utf8 = not_utf8;
22388 if (only_utf8_locale) {
22389 _invlist_invert(only_utf8_locale);
22390 _invlist_intersection(only_utf8_locale,
22392 &only_utf8_locale);
22395 inverted_display = put_charclass_bitmap_innards_common(
22400 only_utf8_locale, invert);
22402 /* Use the shortest representation, taking into account our bias
22403 * against showing it inverted */
22404 if ( inverted_display
22405 && ( ! as_is_display
22406 || ( SvCUR(inverted_display) + inverted_bias
22407 < SvCUR(as_is_display) + as_is_bias)))
22409 sv_catsv(sv, inverted_display);
22411 else if (as_is_display) {
22412 sv_catsv(sv, as_is_display);
22415 SvREFCNT_dec(as_is_display);
22416 SvREFCNT_dec(inverted_display);
22419 SvREFCNT_dec_NN(invlist);
22420 SvREFCNT_dec(only_utf8);
22421 SvREFCNT_dec(not_utf8);
22422 SvREFCNT_dec(posixes);
22423 SvREFCNT_dec(only_utf8_locale);
22425 return SvCUR(sv) > orig_sv_cur;
22428 #define CLEAR_OPTSTART \
22429 if (optstart) STMT_START { \
22430 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
22431 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22435 #define DUMPUNTIL(b,e) \
22437 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22439 STATIC const regnode *
22440 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22441 const regnode *last, const regnode *plast,
22442 SV* sv, I32 indent, U32 depth)
22444 U8 op = PSEUDO; /* Arbitrary non-END op. */
22445 const regnode *next;
22446 const regnode *optstart= NULL;
22448 RXi_GET_DECL(r, ri);
22449 GET_RE_DEBUG_FLAGS_DECL;
22451 PERL_ARGS_ASSERT_DUMPUNTIL;
22453 #ifdef DEBUG_DUMPUNTIL
22454 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
22455 last ? last-start : 0, plast ? plast-start : 0);
22458 if (plast && plast < last)
22461 while (PL_regkind[op] != END && (!last || node < last)) {
22463 /* While that wasn't END last time... */
22466 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22468 next = regnext((regnode *)node);
22471 if (OP(node) == OPTIMIZED) {
22472 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22479 regprop(r, sv, node, NULL, NULL);
22480 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
22481 (int)(2*indent + 1), "", SvPVX_const(sv));
22483 if (OP(node) != OPTIMIZED) {
22484 if (next == NULL) /* Next ptr. */
22485 Perl_re_printf( aTHX_ " (0)");
22486 else if (PL_regkind[(U8)op] == BRANCH
22487 && PL_regkind[OP(next)] != BRANCH )
22488 Perl_re_printf( aTHX_ " (FAIL)");
22490 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
22491 Perl_re_printf( aTHX_ "\n");
22495 if (PL_regkind[(U8)op] == BRANCHJ) {
22498 const regnode *nnode = (OP(next) == LONGJMP
22499 ? regnext((regnode *)next)
22501 if (last && nnode > last)
22503 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22506 else if (PL_regkind[(U8)op] == BRANCH) {
22508 DUMPUNTIL(NEXTOPER(node), next);
22510 else if ( PL_regkind[(U8)op] == TRIE ) {
22511 const regnode *this_trie = node;
22512 const char op = OP(node);
22513 const U32 n = ARG(node);
22514 const reg_ac_data * const ac = op>=AHOCORASICK ?
22515 (reg_ac_data *)ri->data->data[n] :
22517 const reg_trie_data * const trie =
22518 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22520 AV *const trie_words
22521 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22523 const regnode *nextbranch= NULL;
22526 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22527 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22529 Perl_re_indentf( aTHX_ "%s ",
22532 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22533 SvCUR(*elem_ptr), PL_dump_re_max_len,
22534 PL_colors[0], PL_colors[1],
22536 ? PERL_PV_ESCAPE_UNI
22538 | PERL_PV_PRETTY_ELLIPSES
22539 | PERL_PV_PRETTY_LTGT
22544 U16 dist= trie->jump[word_idx+1];
22545 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
22546 (UV)((dist ? this_trie + dist : next) - start));
22549 nextbranch= this_trie + trie->jump[0];
22550 DUMPUNTIL(this_trie + dist, nextbranch);
22552 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22553 nextbranch= regnext((regnode *)nextbranch);
22555 Perl_re_printf( aTHX_ "\n");
22558 if (last && next > last)
22563 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
22564 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22565 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22567 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22569 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22571 else if ( op == PLUS || op == STAR) {
22572 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22574 else if (PL_regkind[(U8)op] == EXACT) {
22575 /* Literal string, where present. */
22576 node += NODE_SZ_STR(node) - 1;
22577 node = NEXTOPER(node);
22580 node = NEXTOPER(node);
22581 node += regarglen[(U8)op];
22583 if (op == CURLYX || op == OPEN || op == SROPEN)
22587 #ifdef DEBUG_DUMPUNTIL
22588 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
22593 #endif /* DEBUGGING */
22595 #ifndef PERL_IN_XSUB_RE
22597 #include "uni_keywords.h"
22600 Perl_init_uniprops(pTHX)
22604 PL_user_def_props = newHV();
22606 #ifdef USE_ITHREADS
22608 HvSHAREKEYS_off(PL_user_def_props);
22609 PL_user_def_props_aTHX = aTHX;
22613 /* Set up the inversion list global variables */
22615 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22616 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22617 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22618 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22619 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22620 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22621 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22622 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22623 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22624 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22625 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22626 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22627 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22628 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22629 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22630 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22632 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22633 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22634 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22635 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
22636 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22637 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
22638 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
22639 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
22640 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
22641 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
22642 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
22643 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
22644 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
22645 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
22646 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
22647 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
22649 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
22650 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
22651 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
22652 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
22653 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
22655 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
22656 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
22657 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
22659 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
22661 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
22662 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
22664 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
22665 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
22667 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
22668 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22669 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
22670 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22671 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
22672 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
22673 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
22674 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
22675 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
22676 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
22677 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
22678 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
22679 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
22680 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
22683 /* The below are used only by deprecated functions. They could be removed */
22684 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
22685 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
22686 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
22692 This code was mainly added for backcompat to give a warning for non-portable
22693 code points in user-defined properties. But experiments showed that the
22694 warning in earlier perls were only omitted on overflow, which should be an
22695 error, so there really isnt a backcompat issue, and actually adding the
22696 warning when none was present before might cause breakage, for little gain. So
22697 khw left this code in, but not enabled. Tests were never added.
22700 Ei |const char *|get_extended_utf8_msg|const UV cp
22702 PERL_STATIC_INLINE const char *
22703 S_get_extended_utf8_msg(pTHX_ const UV cp)
22705 U8 dummy[UTF8_MAXBYTES + 1];
22709 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
22712 msg = hv_fetchs(msgs, "text", 0);
22715 (void) sv_2mortal((SV *) msgs);
22717 return SvPVX(*msg);
22723 Perl_handle_user_defined_property(pTHX_
22725 /* Parses the contents of a user-defined property definition; returning the
22726 * expanded definition if possible. If so, the return is an inversion
22729 * If there are subroutines that are part of the expansion and which aren't
22730 * known at the time of the call to this function, this returns what
22731 * parse_uniprop_string() returned for the first one encountered.
22733 * If an error was found, NULL is returned, and 'msg' gets a suitable
22734 * message appended to it. (Appending allows the back trace of how we got
22735 * to the faulty definition to be displayed through nested calls of
22736 * user-defined subs.)
22738 * The caller IS responsible for freeing any returned SV.
22740 * The syntax of the contents is pretty much described in perlunicode.pod,
22741 * but we also allow comments on each line */
22743 const char * name, /* Name of property */
22744 const STRLEN name_len, /* The name's length in bytes */
22745 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
22746 const bool to_fold, /* ? Is this under /i */
22747 const bool runtime, /* ? Are we in compile- or run-time */
22748 const bool deferrable, /* Is it ok for this property's full definition
22749 to be deferred until later? */
22750 SV* contents, /* The property's definition */
22751 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
22752 getting called unless this is thought to be
22753 a user-defined property */
22754 SV * msg, /* Any error or warning msg(s) are appended to
22756 const STRLEN level) /* Recursion level of this call */
22759 const char * string = SvPV_const(contents, len);
22760 const char * const e = string + len;
22761 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
22762 const STRLEN msgs_length_on_entry = SvCUR(msg);
22764 const char * s0 = string; /* Points to first byte in the current line
22765 being parsed in 'string' */
22766 const char overflow_msg[] = "Code point too large in \"";
22767 SV* running_definition = NULL;
22769 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
22771 *user_defined_ptr = TRUE;
22773 /* Look at each line */
22775 const char * s; /* Current byte */
22776 char op = '+'; /* Default operation is 'union' */
22777 IV min = 0; /* range begin code point */
22778 IV max = -1; /* and range end */
22779 SV* this_definition;
22781 /* Skip comment lines */
22783 s0 = strchr(s0, '\n');
22791 /* For backcompat, allow an empty first line */
22797 /* First character in the line may optionally be the operation */
22806 /* If the line is one or two hex digits separated by blank space, its
22807 * a range; otherwise it is either another user-defined property or an
22812 if (! isXDIGIT(*s)) {
22813 goto check_if_property;
22816 do { /* Each new hex digit will add 4 bits. */
22817 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
22818 s = strchr(s, '\n');
22822 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22823 sv_catpv(msg, overflow_msg);
22824 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22825 UTF8fARG(is_contents_utf8, s - s0, s0));
22826 sv_catpvs(msg, "\"");
22827 goto return_failure;
22830 /* Accumulate this digit into the value */
22831 min = (min << 4) + READ_XDIGIT(s);
22832 } while (isXDIGIT(*s));
22834 while (isBLANK(*s)) { s++; }
22836 /* We allow comments at the end of the line */
22838 s = strchr(s, '\n');
22844 else if (s < e && *s != '\n') {
22845 if (! isXDIGIT(*s)) {
22846 goto check_if_property;
22849 /* Look for the high point of the range */
22852 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
22853 s = strchr(s, '\n');
22857 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22858 sv_catpv(msg, overflow_msg);
22859 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22860 UTF8fARG(is_contents_utf8, s - s0, s0));
22861 sv_catpvs(msg, "\"");
22862 goto return_failure;
22865 max = (max << 4) + READ_XDIGIT(s);
22866 } while (isXDIGIT(*s));
22868 while (isBLANK(*s)) { s++; }
22871 s = strchr(s, '\n');
22876 else if (s < e && *s != '\n') {
22877 goto check_if_property;
22881 if (max == -1) { /* The line only had one entry */
22884 else if (max < min) {
22885 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22886 sv_catpvs(msg, "Illegal range in \"");
22887 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22888 UTF8fARG(is_contents_utf8, s - s0, s0));
22889 sv_catpvs(msg, "\"");
22890 goto return_failure;
22893 #if 0 /* See explanation at definition above of get_extended_utf8_msg() */
22895 if ( UNICODE_IS_PERL_EXTENDED(min)
22896 || UNICODE_IS_PERL_EXTENDED(max))
22898 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22900 /* If both code points are non-portable, warn only on the lower
22902 sv_catpv(msg, get_extended_utf8_msg(
22903 (UNICODE_IS_PERL_EXTENDED(min))
22905 sv_catpvs(msg, " in \"");
22906 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22907 UTF8fARG(is_contents_utf8, s - s0, s0));
22908 sv_catpvs(msg, "\"");
22913 /* Here, this line contains a legal range */
22914 this_definition = sv_2mortal(_new_invlist(2));
22915 this_definition = _add_range_to_invlist(this_definition, min, max);
22920 /* Here it isn't a legal range line. See if it is a legal property
22921 * line. First find the end of the meat of the line */
22922 s = strpbrk(s, "#\n");
22927 /* Ignore trailing blanks in keeping with the requirements of
22928 * parse_uniprop_string() */
22930 while (s > s0 && isBLANK_A(*s)) {
22935 this_definition = parse_uniprop_string(s0, s - s0,
22936 is_utf8, to_fold, runtime,
22938 user_defined_ptr, msg,
22940 ? level /* Don't increase level
22941 if input is empty */
22944 if (this_definition == NULL) {
22945 goto return_failure; /* 'msg' should have had the reason
22946 appended to it by the above call */
22949 if (! is_invlist(this_definition)) { /* Unknown at this time */
22950 return newSVsv(this_definition);
22954 s = strchr(s, '\n');
22964 _invlist_union(running_definition, this_definition,
22965 &running_definition);
22968 _invlist_subtract(running_definition, this_definition,
22969 &running_definition);
22972 _invlist_intersection(running_definition, this_definition,
22973 &running_definition);
22976 _invlist_union_complement_2nd(running_definition,
22977 this_definition, &running_definition);
22980 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
22981 __FILE__, __LINE__, op);
22985 /* Position past the '\n' */
22987 } /* End of loop through the lines of 'contents' */
22989 /* Here, we processed all the lines in 'contents' without error. If we
22990 * didn't add any warnings, simply return success */
22991 if (msgs_length_on_entry == SvCUR(msg)) {
22993 /* If the expansion was empty, the answer isn't nothing: its an empty
22994 * inversion list */
22995 if (running_definition == NULL) {
22996 running_definition = _new_invlist(1);
22999 return running_definition;
23002 /* Otherwise, add some explanatory text, but we will return success */
23006 running_definition = NULL;
23010 if (name_len > 0) {
23011 sv_catpvs(msg, " in expansion of ");
23012 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23015 return running_definition;
23018 /* As explained below, certain operations need to take place in the first
23019 * thread created. These macros switch contexts */
23020 #ifdef USE_ITHREADS
23021 # define DECLARATION_FOR_GLOBAL_CONTEXT \
23022 PerlInterpreter * save_aTHX = aTHX;
23023 # define SWITCH_TO_GLOBAL_CONTEXT \
23024 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23025 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
23026 # define CUR_CONTEXT aTHX
23027 # define ORIGINAL_CONTEXT save_aTHX
23029 # define DECLARATION_FOR_GLOBAL_CONTEXT
23030 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
23031 # define RESTORE_CONTEXT NOOP
23032 # define CUR_CONTEXT NULL
23033 # define ORIGINAL_CONTEXT NULL
23037 S_delete_recursion_entry(pTHX_ void *key)
23039 /* Deletes the entry used to detect recursion when expanding user-defined
23040 * properties. This is a function so it can be set up to be called even if
23041 * the program unexpectedly quits */
23044 SV ** current_entry;
23045 const STRLEN key_len = strlen((const char *) key);
23046 DECLARATION_FOR_GLOBAL_CONTEXT;
23048 SWITCH_TO_GLOBAL_CONTEXT;
23050 /* If the entry is one of these types, it is a permanent entry, and not the
23051 * one used to detect recursions. This function should delete only the
23052 * recursion entry */
23053 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23055 && ! is_invlist(*current_entry)
23056 && ! SvPOK(*current_entry))
23058 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23066 S_get_fq_name(pTHX_
23067 const char * const name, /* The first non-blank in the \p{}, \P{} */
23068 const Size_t name_len, /* Its length in bytes, not including any trailing space */
23069 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23070 const bool has_colon_colon
23073 /* Returns a mortal SV containing the fully qualified version of the input
23078 fq_name = newSVpvs_flags("", SVs_TEMP);
23080 /* Use the current package if it wasn't included in our input */
23081 if (! has_colon_colon) {
23082 const HV * pkg = (IN_PERL_COMPILETIME)
23084 : CopSTASH(PL_curcop);
23085 const char* pkgname = HvNAME(pkg);
23087 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23088 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23089 sv_catpvs(fq_name, "::");
23092 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23093 UTF8fARG(is_utf8, name_len, name));
23098 Perl_parse_uniprop_string(pTHX_
23100 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
23101 * now. If so, the return is an inversion list.
23103 * If the property is user-defined, it is a subroutine, which in turn
23104 * may call other subroutines. This function will call the whole nest of
23105 * them to get the definition they return; if some aren't known at the time
23106 * of the call to this function, the fully qualified name of the highest
23107 * level sub is returned. It is an error to call this function at runtime
23108 * without every sub defined.
23110 * If an error was found, NULL is returned, and 'msg' gets a suitable
23111 * message appended to it. (Appending allows the back trace of how we got
23112 * to the faulty definition to be displayed through nested calls of
23113 * user-defined subs.)
23115 * The caller should NOT try to free any returned inversion list.
23117 * Other parameters will be set on return as described below */
23119 const char * const name, /* The first non-blank in the \p{}, \P{} */
23120 const Size_t name_len, /* Its length in bytes, not including any
23122 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23123 const bool to_fold, /* ? Is this under /i */
23124 const bool runtime, /* TRUE if this is being called at run time */
23125 const bool deferrable, /* TRUE if it's ok for the definition to not be
23126 known at this call */
23127 bool *user_defined_ptr, /* Upon return from this function it will be
23128 set to TRUE if any component is a
23129 user-defined property */
23130 SV * msg, /* Any error or warning msg(s) are appended to
23132 const STRLEN level) /* Recursion level of this call */
23135 char* lookup_name; /* normalized name for lookup in our tables */
23136 unsigned lookup_len; /* Its length */
23137 bool stricter = FALSE; /* Some properties have stricter name
23138 normalization rules, which we decide upon
23139 based on parsing */
23141 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23142 * (though it requires extra effort to download them from Unicode and
23143 * compile perl to know about them) */
23144 bool is_nv_type = FALSE;
23146 unsigned int i, j = 0;
23147 int equals_pos = -1; /* Where the '=' is found, or negative if none */
23148 int slash_pos = -1; /* Where the '/' is found, or negative if none */
23149 int table_index = 0; /* The entry number for this property in the table
23150 of all Unicode property names */
23151 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
23152 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
23153 the normalized name in certain situations */
23154 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
23155 part of a package name */
23156 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
23157 property rather than a Unicode
23159 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
23160 if an error. If it is an inversion list,
23161 it is the definition. Otherwise it is a
23162 string containing the fully qualified sub
23164 SV * fq_name = NULL; /* For user-defined properties, the fully
23166 bool invert_return = FALSE; /* ? Do we need to complement the result before
23169 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23171 /* The input will be normalized into 'lookup_name' */
23172 Newx(lookup_name, name_len, char);
23173 SAVEFREEPV(lookup_name);
23175 /* Parse the input. */
23176 for (i = 0; i < name_len; i++) {
23177 char cur = name[i];
23179 /* Most of the characters in the input will be of this ilk, being parts
23181 if (isIDCONT_A(cur)) {
23183 /* Case differences are ignored. Our lookup routine assumes
23184 * everything is lowercase, so normalize to that */
23185 if (isUPPER_A(cur)) {
23186 lookup_name[j++] = toLOWER_A(cur);
23190 if (cur == '_') { /* Don't include these in the normalized name */
23194 lookup_name[j++] = cur;
23196 /* The first character in a user-defined name must be of this type.
23198 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23199 could_be_user_defined = FALSE;
23205 /* Here, the character is not something typically in a name, But these
23206 * two types of characters (and the '_' above) can be freely ignored in
23207 * most situations. Later it may turn out we shouldn't have ignored
23208 * them, and we have to reparse, but we don't have enough information
23209 * yet to make that decision */
23210 if (cur == '-' || isSPACE_A(cur)) {
23211 could_be_user_defined = FALSE;
23215 /* An equals sign or single colon mark the end of the first part of
23216 * the property name */
23218 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23220 lookup_name[j++] = '='; /* Treat the colon as an '=' */
23221 equals_pos = j; /* Note where it occurred in the input */
23222 could_be_user_defined = FALSE;
23226 /* Otherwise, this character is part of the name. */
23227 lookup_name[j++] = cur;
23229 /* Here it isn't a single colon, so if it is a colon, it must be a
23233 /* A double colon should be a package qualifier. We note its
23234 * position and continue. Note that one could have
23235 * pkg1::pkg2::...::foo
23236 * so that the position at the end of the loop will be just after
23237 * the final qualifier */
23240 non_pkg_begin = i + 1;
23241 lookup_name[j++] = ':';
23243 else { /* Only word chars (and '::') can be in a user-defined name */
23244 could_be_user_defined = FALSE;
23246 } /* End of parsing through the lhs of the property name (or all of it if
23249 #define STRLENs(s) (sizeof("" s "") - 1)
23251 /* If there is a single package name 'utf8::', it is ambiguous. It could
23252 * be for a user-defined property, or it could be a Unicode property, as
23253 * all of them are considered to be for that package. For the purposes of
23254 * parsing the rest of the property, strip it off */
23255 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23256 lookup_name += STRLENs("utf8::");
23257 j -= STRLENs("utf8::");
23258 equals_pos -= STRLENs("utf8::");
23261 /* Here, we are either done with the whole property name, if it was simple;
23262 * or are positioned just after the '=' if it is compound. */
23264 if (equals_pos >= 0) {
23265 assert(! stricter); /* We shouldn't have set this yet */
23267 /* Space immediately after the '=' is ignored */
23269 for (; i < name_len; i++) {
23270 if (! isSPACE_A(name[i])) {
23275 /* Most punctuation after the equals indicates a subpattern, like
23277 if ( isPUNCT_A(name[i])
23283 /* Find the property. The table includes the equals sign, so we
23285 table_index = match_uniprop((U8 *) lookup_name, j);
23287 const char * const * prop_values
23288 = UNI_prop_value_ptrs[table_index];
23290 Size_t subpattern_len;
23291 REGEXP * subpattern_re;
23292 char open = name[i++];
23294 const char * pos_in_brackets;
23297 /* A backslash means the real delimitter is the next character.
23299 if (open == '\\') {
23304 /* This data structure is constructed so that the matching
23305 * closing bracket is 3 past its matching opening. The second
23306 * set of closing is so that if the opening is something like
23307 * ']', the closing will be that as well. Something similar is
23308 * done in toke.c */
23309 pos_in_brackets = strchr("([<)]>)]>", open);
23310 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23313 || name[name_len-1] != close
23314 || (escaped && name[name_len-2] != '\\'))
23316 sv_catpvs(msg, "Unicode property wildcard not terminated");
23317 goto append_name_to_msg;
23320 Perl_ck_warner_d(aTHX_
23321 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23322 "The Unicode property wildcards feature is experimental");
23324 /* Now create and compile the wildcard subpattern. Use /iaa
23325 * because nothing outside of ASCII will match, and it the
23326 * property values should all match /i. Note that when the
23327 * pattern fails to compile, our added text to the user's
23328 * pattern will be displayed to the user, which is not so
23330 subpattern_len = name_len - i - 1 - escaped;
23331 subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
23332 (unsigned) subpattern_len,
23334 subpattern = sv_2mortal(subpattern);
23335 subpattern_re = re_compile(subpattern, 0);
23336 assert(subpattern_re); /* Should have died if didn't compile
23339 /* For each legal property value, see if the supplied pattern
23341 while (*prop_values) {
23342 const char * const entry = *prop_values;
23343 const Size_t len = strlen(entry);
23344 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23346 if (pregexec(subpattern_re,
23348 (char *) entry + len,
23352 { /* Here, matched. Add to the returned list */
23353 Size_t total_len = j + len;
23354 SV * sub_invlist = NULL;
23355 char * this_string;
23357 /* We know this is a legal \p{property=value}. Call
23358 * the function to return the list of code points that
23360 Newxz(this_string, total_len + 1, char);
23361 Copy(lookup_name, this_string, j, char);
23362 my_strlcat(this_string, entry, total_len + 1);
23363 SAVEFREEPV(this_string);
23364 sub_invlist = parse_uniprop_string(this_string,
23373 _invlist_union(prop_definition, sub_invlist,
23377 prop_values++; /* Next iteration, look at next propvalue */
23378 } /* End of looking through property values; (the data
23379 structure is terminated by a NULL ptr) */
23381 SvREFCNT_dec_NN(subpattern_re);
23383 if (prop_definition) {
23384 return prop_definition;
23387 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23388 goto append_name_to_msg;
23391 /* Here's how khw thinks we should proceed to handle the properties
23392 * not yet done: Bidi Mirroring Glyph
23393 Bidi Paired Bracket
23394 Case Folding (both full and simple)
23395 Decomposition Mapping
23396 Equivalent Unified Ideograph
23399 Lowercase Mapping (both full and simple)
23401 Titlecase Mapping (both full and simple)
23402 Uppercase Mapping (both full and simple)
23403 * Move the part that looks at the property values into a perl
23404 * script, like utf8_heavy.pl was done. This makes things somewhat
23405 * easier, but most importantly, it avoids always adding all these
23406 * strings to the memory usage when the feature is little-used.
23408 * The property values would all be concatenated into a single
23409 * string per property with each value on a separate line, and the
23410 * code point it's for on alternating lines. Then we match the
23411 * user's input pattern m//mg, without having to worry about their
23412 * uses of '^' and '$'. Only the values that aren't the default
23413 * would be in the strings. Code points would be in UTF-8. The
23414 * search pattern that we would construct would look like
23415 * (?: \n (code-point_re) \n (?aam: user-re ) \n )
23416 * And so $1 would contain the code point that matched the user-re.
23417 * For properties where the default is the code point itself, such
23418 * as any of the case changing mappings, the string would otherwise
23419 * consist of all Unicode code points in UTF-8 strung together.
23420 * This would be impractical. So instead, examine their compiled
23421 * pattern, looking at the ssc. If none, reject the pattern as an
23422 * error. Otherwise run the pattern against every code point in
23423 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
23424 * And it might be good to create an API to return the ssc.
23426 * For the name properties, a new function could be created in
23427 * charnames which essentially does the same thing as above,
23428 * sharing Name.pl with the other charname functions. Don't know
23429 * about loose name matching, or algorithmically determined names.
23430 * Decomposition.pl similarly.
23432 * It might be that a new pattern modifier would have to be
23433 * created, like /t for resTricTed, which changed the behavior of
23434 * some constructs in their subpattern, like \A. */
23435 } /* End of is a wildcard subppattern */
23438 /* Certain properties whose values are numeric need special handling.
23439 * They may optionally be prefixed by 'is'. Ignore that prefix for the
23440 * purposes of checking if this is one of those properties */
23441 if (memBEGINPs(lookup_name, j, "is")) {
23445 /* Then check if it is one of these specially-handled properties. The
23446 * possibilities are hard-coded because easier this way, and the list
23447 * is unlikely to change.
23449 * All numeric value type properties are of this ilk, and are also
23450 * special in a different way later on. So find those first. There
23451 * are several numeric value type properties in the Unihan DB (which is
23452 * unlikely to be compiled with perl, but we handle it here in case it
23453 * does get compiled). They all end with 'numeric'. The interiors
23454 * aren't checked for the precise property. This would stop working if
23455 * a cjk property were to be created that ended with 'numeric' and
23456 * wasn't a numeric type */
23457 is_nv_type = memEQs(lookup_name + lookup_offset,
23458 j - 1 - lookup_offset, "numericvalue")
23459 || memEQs(lookup_name + lookup_offset,
23460 j - 1 - lookup_offset, "nv")
23461 || ( memENDPs(lookup_name + lookup_offset,
23462 j - 1 - lookup_offset, "numeric")
23463 && ( memBEGINPs(lookup_name + lookup_offset,
23464 j - 1 - lookup_offset, "cjk")
23465 || memBEGINPs(lookup_name + lookup_offset,
23466 j - 1 - lookup_offset, "k")));
23468 || memEQs(lookup_name + lookup_offset,
23469 j - 1 - lookup_offset, "canonicalcombiningclass")
23470 || memEQs(lookup_name + lookup_offset,
23471 j - 1 - lookup_offset, "ccc")
23472 || memEQs(lookup_name + lookup_offset,
23473 j - 1 - lookup_offset, "age")
23474 || memEQs(lookup_name + lookup_offset,
23475 j - 1 - lookup_offset, "in")
23476 || memEQs(lookup_name + lookup_offset,
23477 j - 1 - lookup_offset, "presentin"))
23481 /* Since the stuff after the '=' is a number, we can't throw away
23482 * '-' willy-nilly, as those could be a minus sign. Other stricter
23483 * rules also apply. However, these properties all can have the
23484 * rhs not be a number, in which case they contain at least one
23485 * alphabetic. In those cases, the stricter rules don't apply.
23486 * But the numeric type properties can have the alphas [Ee] to
23487 * signify an exponent, and it is still a number with stricter
23488 * rules. So look for an alpha that signifies not-strict */
23490 for (k = i; k < name_len; k++) {
23491 if ( isALPHA_A(name[k])
23492 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
23502 /* A number may have a leading '+' or '-'. The latter is retained
23504 if (name[i] == '+') {
23507 else if (name[i] == '-') {
23508 lookup_name[j++] = '-';
23512 /* Skip leading zeros including single underscores separating the
23513 * zeros, or between the final leading zero and the first other
23515 for (; i < name_len - 1; i++) {
23516 if ( name[i] != '0'
23517 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23524 else { /* No '=' */
23526 /* Only a few properties without an '=' should be parsed with stricter
23527 * rules. The list is unlikely to change. */
23528 if ( memBEGINPs(lookup_name, j, "perl")
23529 && memNEs(lookup_name + 4, j - 4, "space")
23530 && memNEs(lookup_name + 4, j - 4, "word"))
23534 /* We set the inputs back to 0 and the code below will reparse,
23540 /* Here, we have either finished the property, or are positioned to parse
23541 * the remainder, and we know if stricter rules apply. Finish out, if not
23543 for (; i < name_len; i++) {
23544 char cur = name[i];
23546 /* In all instances, case differences are ignored, and we normalize to
23548 if (isUPPER_A(cur)) {
23549 lookup_name[j++] = toLOWER(cur);
23553 /* An underscore is skipped, but not under strict rules unless it
23554 * separates two digits */
23557 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
23558 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
23560 lookup_name[j++] = '_';
23565 /* Hyphens are skipped except under strict */
23566 if (cur == '-' && ! stricter) {
23570 /* XXX Bug in documentation. It says white space skipped adjacent to
23571 * non-word char. Maybe we should, but shouldn't skip it next to a dot
23573 if (isSPACE_A(cur) && ! stricter) {
23577 lookup_name[j++] = cur;
23579 /* Unless this is a non-trailing slash, we are done with it */
23580 if (i >= name_len - 1 || cur != '/') {
23586 /* A slash in the 'numeric value' property indicates that what follows
23587 * is a denominator. It can have a leading '+' and '0's that should be
23588 * skipped. But we have never allowed a negative denominator, so treat
23589 * a minus like every other character. (No need to rule out a second
23590 * '/', as that won't match anything anyway */
23593 if (i < name_len && name[i] == '+') {
23597 /* Skip leading zeros including underscores separating digits */
23598 for (; i < name_len - 1; i++) {
23599 if ( name[i] != '0'
23600 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23606 /* Store the first real character in the denominator */
23607 if (i < name_len) {
23608 lookup_name[j++] = name[i];
23613 /* Here are completely done parsing the input 'name', and 'lookup_name'
23614 * contains a copy, normalized.
23616 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
23617 * different from without the underscores. */
23618 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
23619 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
23620 && UNLIKELY(name[name_len-1] == '_'))
23622 lookup_name[j++] = '&';
23625 /* If the original input began with 'In' or 'Is', it could be a subroutine
23626 * call to a user-defined property instead of a Unicode property name. */
23627 if ( name_len - non_pkg_begin > 2
23628 && name[non_pkg_begin+0] == 'I'
23629 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
23631 /* Names that start with In have different characterstics than those
23632 * that start with Is */
23633 if (name[non_pkg_begin+1] == 's') {
23634 starts_with_Is = TRUE;
23638 could_be_user_defined = FALSE;
23641 if (could_be_user_defined) {
23644 /* If the user defined property returns the empty string, it could
23645 * easily be because the pattern is being compiled before the data it
23646 * actually needs to compile is available. This could be argued to be
23647 * a bug in the perl code, but this is a change of behavior for Perl,
23648 * so we handle it. This means that intentionally returning nothing
23649 * will not be resolved until runtime */
23650 bool empty_return = FALSE;
23652 /* Here, the name could be for a user defined property, which are
23653 * implemented as subs. */
23654 user_sub = get_cvn_flags(name, name_len, 0);
23656 const char insecure[] = "Insecure user-defined property";
23658 /* Here, there is a sub by the correct name. Normally we call it
23659 * to get the property definition */
23661 SV * user_sub_sv = MUTABLE_SV(user_sub);
23662 SV * error; /* Any error returned by calling 'user_sub' */
23663 SV * key; /* The key into the hash of user defined sub names
23666 SV ** saved_user_prop_ptr; /* Hash entry for this property */
23668 /* How many times to retry when another thread is in the middle of
23669 * expanding the same definition we want */
23670 PERL_INT_FAST8_T retry_countdown = 10;
23672 DECLARATION_FOR_GLOBAL_CONTEXT;
23674 /* If we get here, we know this property is user-defined */
23675 *user_defined_ptr = TRUE;
23677 /* We refuse to call a potentially tainted subroutine; returning an
23680 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23681 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23682 goto append_name_to_msg;
23685 /* In principal, we only call each subroutine property definition
23686 * once during the life of the program. This guarantees that the
23687 * property definition never changes. The results of the single
23688 * sub call are stored in a hash, which is used instead for future
23689 * references to this property. The property definition is thus
23690 * immutable. But, to allow the user to have a /i-dependent
23691 * definition, we call the sub once for non-/i, and once for /i,
23692 * should the need arise, passing the /i status as a parameter.
23694 * We start by constructing the hash key name, consisting of the
23695 * fully qualified subroutine name, preceded by the /i status, so
23696 * that there is a key for /i and a different key for non-/i */
23697 key = newSVpvn(((to_fold) ? "1" : "0"), 1);
23698 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23699 non_pkg_begin != 0);
23700 sv_catsv(key, fq_name);
23703 /* We only call the sub once throughout the life of the program
23704 * (with the /i, non-/i exception noted above). That means the
23705 * hash must be global and accessible to all threads. It is
23706 * created at program start-up, before any threads are created, so
23707 * is accessible to all children. But this creates some
23710 * 1) The keys can't be shared, or else problems arise; sharing is
23711 * turned off at hash creation time
23712 * 2) All SVs in it are there for the remainder of the life of the
23713 * program, and must be created in the same interpreter context
23714 * as the hash, or else they will be freed from the wrong pool
23715 * at global destruction time. This is handled by switching to
23716 * the hash's context to create each SV going into it, and then
23717 * immediately switching back
23718 * 3) All accesses to the hash must be controlled by a mutex, to
23719 * prevent two threads from getting an unstable state should
23720 * they simultaneously be accessing it. The code below is
23721 * crafted so that the mutex is locked whenever there is an
23722 * access and unlocked only when the next stable state is
23725 * The hash stores either the definition of the property if it was
23726 * valid, or, if invalid, the error message that was raised. We
23727 * use the type of SV to distinguish.
23729 * There's also the need to guard against the definition expansion
23730 * from infinitely recursing. This is handled by storing the aTHX
23731 * of the expanding thread during the expansion. Again the SV type
23732 * is used to distinguish this from the other two cases. If we
23733 * come to here and the hash entry for this property is our aTHX,
23734 * it means we have recursed, and the code assumes that we would
23735 * infinitely recurse, so instead stops and raises an error.
23736 * (Any recursion has always been treated as infinite recursion in
23739 * If instead, the entry is for a different aTHX, it means that
23740 * that thread has gotten here first, and hasn't finished expanding
23741 * the definition yet. We just have to wait until it is done. We
23742 * sleep and retry a few times, returning an error if the other
23743 * thread doesn't complete. */
23746 USER_PROP_MUTEX_LOCK;
23748 /* If we have an entry for this key, the subroutine has already
23749 * been called once with this /i status. */
23750 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
23751 SvPVX(key), SvCUR(key), 0);
23752 if (saved_user_prop_ptr) {
23754 /* If the saved result is an inversion list, it is the valid
23755 * definition of this property */
23756 if (is_invlist(*saved_user_prop_ptr)) {
23757 prop_definition = *saved_user_prop_ptr;
23759 /* The SV in the hash won't be removed until global
23760 * destruction, so it is stable and we can unlock */
23761 USER_PROP_MUTEX_UNLOCK;
23763 /* The caller shouldn't try to free this SV */
23764 return prop_definition;
23767 /* Otherwise, if it is a string, it is the error message
23768 * that was returned when we first tried to evaluate this
23769 * property. Fail, and append the message */
23770 if (SvPOK(*saved_user_prop_ptr)) {
23771 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23772 sv_catsv(msg, *saved_user_prop_ptr);
23774 /* The SV in the hash won't be removed until global
23775 * destruction, so it is stable and we can unlock */
23776 USER_PROP_MUTEX_UNLOCK;
23781 assert(SvIOK(*saved_user_prop_ptr));
23783 /* Here, we have an unstable entry in the hash. Either another
23784 * thread is in the middle of expanding the property's
23785 * definition, or we are ourselves recursing. We use the aTHX
23786 * in it to distinguish */
23787 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
23789 /* Here, it's another thread doing the expanding. We've
23790 * looked as much as we are going to at the contents of the
23791 * hash entry. It's safe to unlock. */
23792 USER_PROP_MUTEX_UNLOCK;
23794 /* Retry a few times */
23795 if (retry_countdown-- > 0) {
23800 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23801 sv_catpvs(msg, "Timeout waiting for another thread to "
23803 goto append_name_to_msg;
23806 /* Here, we are recursing; don't dig any deeper */
23807 USER_PROP_MUTEX_UNLOCK;
23809 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23811 "Infinite recursion in user-defined property");
23812 goto append_name_to_msg;
23815 /* Here, this thread has exclusive control, and there is no entry
23816 * for this property in the hash. So we have the go ahead to
23817 * expand the definition ourselves. */
23819 PUSHSTACKi(PERLSI_MAGIC);
23822 /* Create a temporary placeholder in the hash to detect recursion
23824 SWITCH_TO_GLOBAL_CONTEXT;
23825 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
23826 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
23829 /* Now that we have a placeholder, we can let other threads
23831 USER_PROP_MUTEX_UNLOCK;
23833 /* Make sure the placeholder always gets destroyed */
23834 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
23839 /* Call the user's function, with the /i status as a parameter.
23840 * Note that we have gone to a lot of trouble to keep this call
23841 * from being within the locked mutex region. */
23842 XPUSHs(boolSV(to_fold));
23845 /* The following block was taken from swash_init(). Presumably
23846 * they apply to here as well, though we no longer use a swash --
23850 /* We might get here via a subroutine signature which uses a utf8
23851 * parameter name, at which point PL_subname will have been set
23852 * but not yet used. */
23853 save_item(PL_subname);
23855 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
23860 if (TAINT_get || SvTRUE(error)) {
23861 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23862 if (SvTRUE(error)) {
23863 sv_catpvs(msg, "Error \"");
23864 sv_catsv(msg, error);
23865 sv_catpvs(msg, "\"");
23868 if (SvTRUE(error)) sv_catpvs(msg, "; ");
23869 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23872 if (name_len > 0) {
23873 sv_catpvs(msg, " in expansion of ");
23874 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
23880 prop_definition = NULL;
23882 else { /* G_SCALAR guarantees a single return value */
23883 SV * contents = POPs;
23885 /* The contents is supposed to be the expansion of the property
23886 * definition. If the definition is deferrable, and we got an
23887 * empty string back, set a flag to later defer it (after clean
23890 && (! SvPOK(contents) || SvCUR(contents) == 0))
23892 empty_return = TRUE;
23894 else { /* Otherwise, call a function to check for valid syntax,
23897 prop_definition = handle_user_defined_property(
23899 is_utf8, to_fold, runtime,
23901 contents, user_defined_ptr,
23907 /* Here, we have the results of the expansion. Delete the
23908 * placeholder, and if the definition is now known, replace it with
23909 * that definition. We need exclusive access to the hash, and we
23910 * can't let anyone else in, between when we delete the placeholder
23911 * and add the permanent entry */
23912 USER_PROP_MUTEX_LOCK;
23914 S_delete_recursion_entry(aTHX_ SvPVX(key));
23916 if ( ! empty_return
23917 && (! prop_definition || is_invlist(prop_definition)))
23919 /* If we got success we use the inversion list defining the
23920 * property; otherwise use the error message */
23921 SWITCH_TO_GLOBAL_CONTEXT;
23922 (void) hv_store_ent(PL_user_def_props,
23925 ? newSVsv(prop_definition)
23931 /* All done, and the hash now has a permanent entry for this
23932 * property. Give up exclusive control */
23933 USER_PROP_MUTEX_UNLOCK;
23939 if (empty_return) {
23940 goto definition_deferred;
23943 if (prop_definition) {
23945 /* If the definition is for something not known at this time,
23946 * we toss it, and go return the main property name, as that's
23947 * the one the user will be aware of */
23948 if (! is_invlist(prop_definition)) {
23949 SvREFCNT_dec_NN(prop_definition);
23950 goto definition_deferred;
23953 sv_2mortal(prop_definition);
23957 return prop_definition;
23959 } /* End of calling the subroutine for the user-defined property */
23960 } /* End of it could be a user-defined property */
23962 /* Here it wasn't a user-defined property that is known at this time. See
23963 * if it is a Unicode property */
23965 lookup_len = j; /* This is a more mnemonic name than 'j' */
23967 /* Get the index into our pointer table of the inversion list corresponding
23968 * to the property */
23969 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23971 /* If it didn't find the property ... */
23972 if (table_index == 0) {
23974 /* Try again stripping off any initial 'Is'. This is because we
23975 * promise that an initial Is is optional. The same isn't true of
23976 * names that start with 'In'. Those can match only blocks, and the
23977 * lookup table already has those accounted for. */
23978 if (starts_with_Is) {
23984 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23987 if (table_index == 0) {
23990 /* Here, we didn't find it. If not a numeric type property, and
23991 * can't be a user-defined one, it isn't a legal property */
23992 if (! is_nv_type) {
23993 if (! could_be_user_defined) {
23997 /* Here, the property name is legal as a user-defined one. At
23998 * compile time, it might just be that the subroutine for that
23999 * property hasn't been encountered yet, but at runtime, it's
24000 * an error to try to use an undefined one */
24001 if (! deferrable) {
24002 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24003 sv_catpvs(msg, "Unknown user-defined property name");
24004 goto append_name_to_msg;
24007 goto definition_deferred;
24008 } /* End of isn't a numeric type property */
24010 /* The numeric type properties need more work to decide. What we
24011 * do is make sure we have the number in canonical form and look
24014 if (slash_pos < 0) { /* No slash */
24016 /* When it isn't a rational, take the input, convert it to a
24017 * NV, then create a canonical string representation of that
24021 SSize_t value_len = lookup_len - equals_pos;
24023 /* Get the value */
24024 if ( value_len <= 0
24025 || my_atof3(lookup_name + equals_pos, &value,
24027 != lookup_name + lookup_len)
24032 /* If the value is an integer, the canonical value is integral
24034 if (Perl_ceil(value) == value) {
24035 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24036 equals_pos, lookup_name, value);
24038 else { /* Otherwise, it is %e with a known precision */
24041 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24042 equals_pos, lookup_name,
24043 PL_E_FORMAT_PRECISION, value);
24045 /* The exponent generated is expecting two digits, whereas
24046 * %e on some systems will generate three. Remove leading
24047 * zeros in excess of 2 from the exponent. We start
24048 * looking for them after the '=' */
24049 exp_ptr = strchr(canonical + equals_pos, 'e');
24051 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24052 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24054 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24056 if (excess_exponent_len > 0) {
24057 SSize_t leading_zeros = strspn(cur_ptr, "0");
24058 SSize_t excess_leading_zeros
24059 = MIN(leading_zeros, excess_exponent_len);
24060 if (excess_leading_zeros > 0) {
24061 Move(cur_ptr + excess_leading_zeros,
24063 strlen(cur_ptr) - excess_leading_zeros
24064 + 1, /* Copy the NUL as well */
24071 else { /* Has a slash. Create a rational in canonical form */
24072 UV numerator, denominator, gcd, trial;
24073 const char * end_ptr;
24074 const char * sign = "";
24076 /* We can't just find the numerator, denominator, and do the
24077 * division, then use the method above, because that is
24078 * inexact. And the input could be a rational that is within
24079 * epsilon (given our precision) of a valid rational, and would
24080 * then incorrectly compare valid.
24082 * We're only interested in the part after the '=' */
24083 const char * this_lookup_name = lookup_name + equals_pos;
24084 lookup_len -= equals_pos;
24085 slash_pos -= equals_pos;
24087 /* Handle any leading minus */
24088 if (this_lookup_name[0] == '-') {
24090 this_lookup_name++;
24095 /* Convert the numerator to numeric */
24096 end_ptr = this_lookup_name + slash_pos;
24097 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24101 /* It better have included all characters before the slash */
24102 if (*end_ptr != '/') {
24106 /* Set to look at just the denominator */
24107 this_lookup_name += slash_pos;
24108 lookup_len -= slash_pos;
24109 end_ptr = this_lookup_name + lookup_len;
24111 /* Convert the denominator to numeric */
24112 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24116 /* It better be the rest of the characters, and don't divide by
24118 if ( end_ptr != this_lookup_name + lookup_len
24119 || denominator == 0)
24124 /* Get the greatest common denominator using
24125 http://en.wikipedia.org/wiki/Euclidean_algorithm */
24127 trial = denominator;
24128 while (trial != 0) {
24130 trial = gcd % trial;
24134 /* If already in lowest possible terms, we have already tried
24135 * looking this up */
24140 /* Reduce the rational, which should put it in canonical form
24143 denominator /= gcd;
24145 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24146 equals_pos, lookup_name, sign, numerator, denominator);
24149 /* Here, we have the number in canonical form. Try that */
24150 table_index = match_uniprop((U8 *) canonical, strlen(canonical));
24151 if (table_index == 0) {
24154 } /* End of still didn't find the property in our table */
24155 } /* End of didn't find the property in our table */
24157 /* Here, we have a non-zero return, which is an index into a table of ptrs.
24158 * A negative return signifies that the real index is the absolute value,
24159 * but the result needs to be inverted */
24160 if (table_index < 0) {
24161 invert_return = TRUE;
24162 table_index = -table_index;
24165 /* Out-of band indices indicate a deprecated property. The proper index is
24166 * modulo it with the table size. And dividing by the table size yields
24167 * an offset into a table constructed by regen/mk_invlists.pl to contain
24168 * the corresponding warning message */
24169 if (table_index > MAX_UNI_KEYWORD_INDEX) {
24170 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24171 table_index %= MAX_UNI_KEYWORD_INDEX;
24172 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24173 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24174 (int) name_len, name, deprecated_property_msgs[warning_offset]);
24177 /* In a few properties, a different property is used under /i. These are
24178 * unlikely to change, so are hard-coded here. */
24180 if ( table_index == UNI_XPOSIXUPPER
24181 || table_index == UNI_XPOSIXLOWER
24182 || table_index == UNI_TITLE)
24184 table_index = UNI_CASED;
24186 else if ( table_index == UNI_UPPERCASELETTER
24187 || table_index == UNI_LOWERCASELETTER
24188 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
24189 || table_index == UNI_TITLECASELETTER
24192 table_index = UNI_CASEDLETTER;
24194 else if ( table_index == UNI_POSIXUPPER
24195 || table_index == UNI_POSIXLOWER)
24197 table_index = UNI_POSIXALPHA;
24201 /* Create and return the inversion list */
24202 prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
24203 sv_2mortal(prop_definition);
24206 /* See if there is a private use override to add to this definition */
24208 COPHH * hinthash = (IN_PERL_COMPILETIME)
24209 ? CopHINTHASH_get(&PL_compiling)
24210 : CopHINTHASH_get(PL_curcop);
24211 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24213 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24215 /* See if there is an element in the hints hash for this table */
24216 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24217 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24221 SV * pu_definition;
24223 SV * expanded_prop_definition =
24224 sv_2mortal(invlist_clone(prop_definition, NULL));
24226 /* If so, it's definition is the string from here to the next
24227 * \a character. And its format is the same as a user-defined
24229 pos += SvCUR(pu_lookup);
24230 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24231 pu_invlist = handle_user_defined_property(lookup_name,
24234 0, /* Not folded */
24242 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24243 sv_catpvs(msg, "Insecure private-use override");
24244 goto append_name_to_msg;
24247 /* For now, as a safety measure, make sure that it doesn't
24248 * override non-private use code points */
24249 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24251 /* Add it to the list to be returned */
24252 _invlist_union(prop_definition, pu_invlist,
24253 &expanded_prop_definition);
24254 prop_definition = expanded_prop_definition;
24255 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24260 if (invert_return) {
24261 _invlist_invert(prop_definition);
24263 return prop_definition;
24267 if (non_pkg_begin != 0) {
24268 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24269 sv_catpvs(msg, "Illegal user-defined property name");
24272 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24273 sv_catpvs(msg, "Can't find Unicode property definition");
24277 append_name_to_msg:
24279 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
24280 const char * suffix = (runtime && level == 0) ? "}" : "\"";
24282 sv_catpv(msg, prefix);
24283 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24284 sv_catpv(msg, suffix);
24289 definition_deferred:
24291 /* Here it could yet to be defined, so defer evaluation of this
24292 * until its needed at runtime. We need the fully qualified property name
24293 * to avoid ambiguity, and a trailing newline */
24295 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24296 non_pkg_begin != 0 /* If has "::" */
24299 sv_catpvs(fq_name, "\n");
24301 *user_defined_ptr = TRUE;
24308 * ex: set ts=8 sts=4 sw=4 et: