5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
78 #ifdef PERL_IN_XSUB_RE
80 EXTERN_C const struct regexp_engine my_reg_engine;
85 #include "dquote_inline.h"
86 #include "invlist_inline.h"
87 #include "unicode_constants.h"
89 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
91 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
93 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
100 /* this is a chain of data about sub patterns we are processing that
101 need to be handled separately/specially in study_chunk. Its so
102 we can simulate recursion without losing state. */
104 typedef struct scan_frame {
105 regnode *last_regnode; /* last node to process in this frame */
106 regnode *next_regnode; /* next node to process when last is reached */
107 U32 prev_recursed_depth;
108 I32 stopparen; /* what stopparen do we use */
110 struct scan_frame *this_prev_frame; /* this previous frame */
111 struct scan_frame *prev_frame; /* previous frame */
112 struct scan_frame *next_frame; /* next frame */
115 /* Certain characters are output as a sequence with the first being a
117 #define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
120 struct RExC_state_t {
121 U32 flags; /* RXf_* are we folding, multilining? */
122 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
123 char *precomp; /* uncompiled string. */
124 char *precomp_end; /* pointer to end of uncompiled string. */
125 REGEXP *rx_sv; /* The SV that is the regexp. */
126 regexp *rx; /* perl core regexp structure */
127 regexp_internal *rxi; /* internal data for regexp object
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 char *copy_start; /* start of copy of input within
133 constructed parse string */
134 char *save_copy_start; /* Provides one level of saving
135 and restoring 'copy_start' */
136 char *copy_start_in_input; /* Position in input string
137 corresponding to copy_start */
138 SSize_t whilem_seen; /* number of WHILEM in this expr */
139 regnode *emit_start; /* Start of emitted-code area */
140 regnode_offset emit; /* Code-emit pointer */
141 I32 naughty; /* How bad is this pattern? */
142 I32 sawback; /* Did we see \1, ...? */
144 SSize_t size; /* Number of regnode equivalents in
147 /* position beyond 'precomp' of the warning message furthest away from
148 * 'precomp'. During the parse, no warnings are raised for any problems
149 * earlier in the parse than this position. This works if warnings are
150 * raised the first time a given spot is parsed, and if only one
151 * independent warning is raised for any given spot */
152 Size_t latest_warn_offset;
154 I32 npar; /* Capture buffer count so far in the
155 parse, (OPEN) plus one. ("par" 0 is
157 I32 total_par; /* During initial parse, is either 0,
158 or -1; the latter indicating a
159 reparse is needed. After that pass,
160 it is what 'npar' became after the
161 pass. Hence, it being > 0 indicates
162 we are in a reparse situation */
163 I32 nestroot; /* root parens we are in - used by
166 regnode_offset *open_parens; /* offsets to open parens */
167 regnode_offset *close_parens; /* offsets to close parens */
168 I32 parens_buf_size; /* #slots malloced open/close_parens */
169 regnode *end_op; /* END node in program */
170 I32 utf8; /* whether the pattern is utf8 or not */
171 I32 orig_utf8; /* whether the pattern was originally in utf8 */
172 /* XXX use this for future optimisation of case
173 * where pattern must be upgraded to utf8. */
174 I32 uni_semantics; /* If a d charset modifier should use unicode
175 rules, even if the pattern is not in
177 HV *paren_names; /* Paren names */
179 regnode **recurse; /* Recurse regops */
180 I32 recurse_count; /* Number of recurse regops we have generated */
181 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
183 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
186 I32 override_recoding;
188 I32 recode_x_to_native;
190 I32 in_multi_char_class;
191 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
193 int code_index; /* next code_blocks[] slot */
194 SSize_t maxlen; /* mininum possible number of chars in string to match */
195 scan_frame *frame_head;
196 scan_frame *frame_last;
200 #ifdef ADD_TO_REGEXEC
201 char *starttry; /* -Dr: where regtry was called. */
202 #define RExC_starttry (pRExC_state->starttry)
204 SV *runtime_code_qr; /* qr with the runtime code blocks */
206 const char *lastparse;
208 AV *paren_name_list; /* idx -> name */
209 U32 study_chunk_recursed_count;
213 #define RExC_lastparse (pRExC_state->lastparse)
214 #define RExC_lastnum (pRExC_state->lastnum)
215 #define RExC_paren_name_list (pRExC_state->paren_name_list)
216 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
217 #define RExC_mysv (pRExC_state->mysv1)
218 #define RExC_mysv1 (pRExC_state->mysv1)
219 #define RExC_mysv2 (pRExC_state->mysv2)
229 #define RExC_flags (pRExC_state->flags)
230 #define RExC_pm_flags (pRExC_state->pm_flags)
231 #define RExC_precomp (pRExC_state->precomp)
232 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
233 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
234 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
235 #define RExC_precomp_end (pRExC_state->precomp_end)
236 #define RExC_rx_sv (pRExC_state->rx_sv)
237 #define RExC_rx (pRExC_state->rx)
238 #define RExC_rxi (pRExC_state->rxi)
239 #define RExC_start (pRExC_state->start)
240 #define RExC_end (pRExC_state->end)
241 #define RExC_parse (pRExC_state->parse)
242 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
243 #define RExC_whilem_seen (pRExC_state->whilem_seen)
244 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
245 under /d from /u ? */
248 #ifdef RE_TRACK_PATTERN_OFFSETS
249 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
252 #define RExC_emit (pRExC_state->emit)
253 #define RExC_emit_start (pRExC_state->emit_start)
254 #define RExC_sawback (pRExC_state->sawback)
255 #define RExC_seen (pRExC_state->seen)
256 #define RExC_size (pRExC_state->size)
257 #define RExC_maxlen (pRExC_state->maxlen)
258 #define RExC_npar (pRExC_state->npar)
259 #define RExC_total_parens (pRExC_state->total_par)
260 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
261 #define RExC_nestroot (pRExC_state->nestroot)
262 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
263 #define RExC_utf8 (pRExC_state->utf8)
264 #define RExC_uni_semantics (pRExC_state->uni_semantics)
265 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
266 #define RExC_open_parens (pRExC_state->open_parens)
267 #define RExC_close_parens (pRExC_state->close_parens)
268 #define RExC_end_op (pRExC_state->end_op)
269 #define RExC_paren_names (pRExC_state->paren_names)
270 #define RExC_recurse (pRExC_state->recurse)
271 #define RExC_recurse_count (pRExC_state->recurse_count)
272 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
273 #define RExC_study_chunk_recursed_bytes \
274 (pRExC_state->study_chunk_recursed_bytes)
275 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
276 #define RExC_contains_locale (pRExC_state->contains_locale)
278 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
280 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
281 #define RExC_frame_head (pRExC_state->frame_head)
282 #define RExC_frame_last (pRExC_state->frame_last)
283 #define RExC_frame_count (pRExC_state->frame_count)
284 #define RExC_strict (pRExC_state->strict)
285 #define RExC_study_started (pRExC_state->study_started)
286 #define RExC_warn_text (pRExC_state->warn_text)
287 #define RExC_in_script_run (pRExC_state->in_script_run)
288 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
289 #define RExC_unlexed_names (pRExC_state->unlexed_names)
291 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
292 * a flag to disable back-off on the fixed/floating substrings - if it's
293 * a high complexity pattern we assume the benefit of avoiding a full match
294 * is worth the cost of checking for the substrings even if they rarely help.
296 #define RExC_naughty (pRExC_state->naughty)
297 #define TOO_NAUGHTY (10)
298 #define MARK_NAUGHTY(add) \
299 if (RExC_naughty < TOO_NAUGHTY) \
300 RExC_naughty += (add)
301 #define MARK_NAUGHTY_EXP(exp, add) \
302 if (RExC_naughty < TOO_NAUGHTY) \
303 RExC_naughty += RExC_naughty / (exp) + (add)
305 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
306 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
307 ((*s) == '{' && regcurly(s)))
310 * Flags to be passed up and down.
312 #define WORST 0 /* Worst case. */
313 #define HASWIDTH 0x01 /* Known to not match null strings, could match
316 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
317 * character. (There needs to be a case: in the switch statement in regexec.c
318 * for any node marked SIMPLE.) Note that this is not the same thing as
321 #define SPSTART 0x04 /* Starts with * or + */
322 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
323 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
324 #define RESTART_PARSE 0x20 /* Need to redo the parse */
325 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
326 calcuate sizes as UTF-8 */
328 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
330 /* whether trie related optimizations are enabled */
331 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
332 #define TRIE_STUDY_OPT
333 #define FULL_TRIE_STUDY
339 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
340 #define PBITVAL(paren) (1 << ((paren) & 7))
341 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
342 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
343 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
345 #define REQUIRE_UTF8(flagp) STMT_START { \
347 *flagp = RESTART_PARSE|NEED_UTF8; \
352 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
353 * a flag that indicates we need to override /d with /u as a result of
354 * something in the pattern. It should only be used in regards to calling
355 * set_regex_charset() or get_regex_charse() */
356 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
358 if (DEPENDS_SEMANTICS) { \
359 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
360 RExC_uni_semantics = 1; \
361 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
362 /* No need to restart the parse if we haven't seen \
363 * anything that differs between /u and /d, and no need \
364 * to restart immediately if we're going to reparse \
365 * anyway to count parens */ \
366 *flagp |= RESTART_PARSE; \
367 return restart_retval; \
372 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
374 RExC_use_BRANCHJ = 1; \
375 if (LIKELY(! IN_PARENS_PASS)) { \
376 /* No need to restart the parse immediately if we're \
377 * going to reparse anyway to count parens */ \
378 *flagp |= RESTART_PARSE; \
379 return restart_retval; \
383 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
384 * less. After that, it must always be positive, because the whole re is
385 * considered to be surrounded by virtual parens. Setting it to negative
386 * indicates there is some construct that needs to know the actual number of
387 * parens to be properly handled. And that means an extra pass will be
388 * required after we've counted them all */
389 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
390 #define REQUIRE_PARENS_PASS \
391 STMT_START { /* No-op if have completed a pass */ \
392 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
394 #define IN_PARENS_PASS (RExC_total_parens < 0)
397 /* This is used to return failure (zero) early from the calling function if
398 * various flags in 'flags' are set. Two flags always cause a return:
399 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
400 * additional flags that should cause a return; 0 if none. If the return will
401 * be done, '*flagp' is first set to be all of the flags that caused the
403 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
405 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
406 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
411 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
413 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
414 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
415 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
416 if (MUST_RESTART(*(flagp))) return 0
418 /* This converts the named class defined in regcomp.h to its equivalent class
419 * number defined in handy.h. */
420 #define namedclass_to_classnum(class) ((int) ((class) / 2))
421 #define classnum_to_namedclass(classnum) ((classnum) * 2)
423 #define _invlist_union_complement_2nd(a, b, output) \
424 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
425 #define _invlist_intersection_complement_2nd(a, b, output) \
426 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
428 /* About scan_data_t.
430 During optimisation we recurse through the regexp program performing
431 various inplace (keyhole style) optimisations. In addition study_chunk
432 and scan_commit populate this data structure with information about
433 what strings MUST appear in the pattern. We look for the longest
434 string that must appear at a fixed location, and we look for the
435 longest string that may appear at a floating location. So for instance
440 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
441 strings (because they follow a .* construct). study_chunk will identify
442 both FOO and BAR as being the longest fixed and floating strings respectively.
444 The strings can be composites, for instance
448 will result in a composite fixed substring 'foo'.
450 For each string some basic information is maintained:
453 This is the position the string must appear at, or not before.
454 It also implicitly (when combined with minlenp) tells us how many
455 characters must match before the string we are searching for.
456 Likewise when combined with minlenp and the length of the string it
457 tells us how many characters must appear after the string we have
461 Only used for floating strings. This is the rightmost point that
462 the string can appear at. If set to SSize_t_MAX it indicates that the
463 string can occur infinitely far to the right.
464 For fixed strings, it is equal to min_offset.
467 A pointer to the minimum number of characters of the pattern that the
468 string was found inside. This is important as in the case of positive
469 lookahead or positive lookbehind we can have multiple patterns
474 The minimum length of the pattern overall is 3, the minimum length
475 of the lookahead part is 3, but the minimum length of the part that
476 will actually match is 1. So 'FOO's minimum length is 3, but the
477 minimum length for the F is 1. This is important as the minimum length
478 is used to determine offsets in front of and behind the string being
479 looked for. Since strings can be composites this is the length of the
480 pattern at the time it was committed with a scan_commit. Note that
481 the length is calculated by study_chunk, so that the minimum lengths
482 are not known until the full pattern has been compiled, thus the
483 pointer to the value.
487 In the case of lookbehind the string being searched for can be
488 offset past the start point of the final matching string.
489 If this value was just blithely removed from the min_offset it would
490 invalidate some of the calculations for how many chars must match
491 before or after (as they are derived from min_offset and minlen and
492 the length of the string being searched for).
493 When the final pattern is compiled and the data is moved from the
494 scan_data_t structure into the regexp structure the information
495 about lookbehind is factored in, with the information that would
496 have been lost precalculated in the end_shift field for the
499 The fields pos_min and pos_delta are used to store the minimum offset
500 and the delta to the maximum offset at the current point in the pattern.
504 struct scan_data_substrs {
505 SV *str; /* longest substring found in pattern */
506 SSize_t min_offset; /* earliest point in string it can appear */
507 SSize_t max_offset; /* latest point in string it can appear */
508 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
509 SSize_t lookbehind; /* is the pos of the string modified by LB */
510 I32 flags; /* per substring SF_* and SCF_* flags */
513 typedef struct scan_data_t {
514 /*I32 len_min; unused */
515 /*I32 len_delta; unused */
519 SSize_t last_end; /* min value, <0 unless valid. */
520 SSize_t last_start_min;
521 SSize_t last_start_max;
522 U8 cur_is_floating; /* whether the last_* values should be set as
523 * the next fixed (0) or floating (1)
526 /* [0] is longest fixed substring so far, [1] is longest float so far */
527 struct scan_data_substrs substrs[2];
529 I32 flags; /* common SF_* and SCF_* flags */
531 SSize_t *last_closep;
532 regnode_ssc *start_class;
536 * Forward declarations for pregcomp()'s friends.
539 static const scan_data_t zero_scan_data = {
540 0, 0, NULL, 0, 0, 0, 0,
542 { NULL, 0, 0, 0, 0, 0 },
543 { NULL, 0, 0, 0, 0, 0 },
550 #define SF_BEFORE_SEOL 0x0001
551 #define SF_BEFORE_MEOL 0x0002
552 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
554 #define SF_IS_INF 0x0040
555 #define SF_HAS_PAR 0x0080
556 #define SF_IN_PAR 0x0100
557 #define SF_HAS_EVAL 0x0200
560 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
561 * longest substring in the pattern. When it is not set the optimiser keeps
562 * track of position, but does not keep track of the actual strings seen,
564 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
567 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
568 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
569 * turned off because of the alternation (BRANCH). */
570 #define SCF_DO_SUBSTR 0x0400
572 #define SCF_DO_STCLASS_AND 0x0800
573 #define SCF_DO_STCLASS_OR 0x1000
574 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
575 #define SCF_WHILEM_VISITED_POS 0x2000
577 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
578 #define SCF_SEEN_ACCEPT 0x8000
579 #define SCF_TRIE_DOING_RESTUDY 0x10000
580 #define SCF_IN_DEFINE 0x20000
585 #define UTF cBOOL(RExC_utf8)
587 /* The enums for all these are ordered so things work out correctly */
588 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
589 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
590 == REGEX_DEPENDS_CHARSET)
591 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
592 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
593 >= REGEX_UNICODE_CHARSET)
594 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
595 == REGEX_ASCII_RESTRICTED_CHARSET)
596 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
597 >= REGEX_ASCII_RESTRICTED_CHARSET)
598 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
599 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
601 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
603 /* For programs that want to be strictly Unicode compatible by dying if any
604 * attempt is made to match a non-Unicode code point against a Unicode
606 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
608 #define OOB_NAMEDCLASS -1
610 /* There is no code point that is out-of-bounds, so this is problematic. But
611 * its only current use is to initialize a variable that is always set before
613 #define OOB_UNICODE 0xDEADBEEF
615 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
618 /* length of regex to show in messages that don't mark a position within */
619 #define RegexLengthToShowInErrorMessages 127
622 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
623 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
624 * op/pragma/warn/regcomp.
626 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
627 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
629 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
630 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
632 /* The code in this file in places uses one level of recursion with parsing
633 * rebased to an alternate string constructed by us in memory. This can take
634 * the form of something that is completely different from the input, or
635 * something that uses the input as part of the alternate. In the first case,
636 * there should be no possibility of an error, as we are in complete control of
637 * the alternate string. But in the second case we don't completely control
638 * the input portion, so there may be errors in that. Here's an example:
640 * is handled specially because \x{df} folds to a sequence of more than one
641 * character: 'ss'. What is done is to create and parse an alternate string,
642 * which looks like this:
643 * /(?:\x{DF}|[abc\x{DF}def])/ui
644 * where it uses the input unchanged in the middle of something it constructs,
645 * which is a branch for the DF outside the character class, and clustering
646 * parens around the whole thing. (It knows enough to skip the DF inside the
647 * class while in this substitute parse.) 'abc' and 'def' may have errors that
648 * need to be reported. The general situation looks like this:
650 * |<------- identical ------>|
652 * Input: ---------------------------------------------------------------
653 * Constructed: ---------------------------------------------------
655 * |<------- identical ------>|
657 * sI..eI is the portion of the input pattern we are concerned with here.
658 * sC..EC is the constructed substitute parse string.
659 * sC..tC is constructed by us
660 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
661 * In the diagram, these are vertically aligned.
662 * eC..EC is also constructed by us.
663 * xC is the position in the substitute parse string where we found a
665 * xI is the position in the original pattern corresponding to xC.
667 * We want to display a message showing the real input string. Thus we need to
668 * translate from xC to xI. We know that xC >= tC, since the portion of the
669 * string sC..tC has been constructed by us, and so shouldn't have errors. We
671 * xI = tI + (xC - tC)
673 * When the substitute parse is constructed, the code needs to set:
676 * RExC_copy_start_in_input (tI)
677 * RExC_copy_start_in_constructed (tC)
678 * and restore them when done.
680 * During normal processing of the input pattern, both
681 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
682 * sI, so that xC equals xI.
685 #define sI RExC_precomp
686 #define eI RExC_precomp_end
687 #define sC RExC_start
689 #define tI RExC_copy_start_in_input
690 #define tC RExC_copy_start_in_constructed
691 #define xI(xC) (tI + (xC - tC))
692 #define xI_offset(xC) (xI(xC) - sI)
694 #define REPORT_LOCATION_ARGS(xC) \
696 (xI(xC) > eI) /* Don't run off end */ \
697 ? eI - sI /* Length before the <--HERE */ \
698 : ((xI_offset(xC) >= 0) \
700 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
701 IVdf " trying to output message for " \
703 __FILE__, __LINE__, (IV) xI_offset(xC), \
704 ((int) (eC - sC)), sC), 0)), \
705 sI), /* The input pattern printed up to the <--HERE */ \
707 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
708 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
710 /* Used to point after bad bytes for an error message, but avoid skipping
711 * past a nul byte. */
712 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
714 /* Set up to clean up after our imminent demise */
715 #define PREPARE_TO_DIE \
718 SAVEFREESV(RExC_rx_sv); \
719 if (RExC_open_parens) \
720 SAVEFREEPV(RExC_open_parens); \
721 if (RExC_close_parens) \
722 SAVEFREEPV(RExC_close_parens); \
726 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
727 * arg. Show regex, up to a maximum length. If it's too long, chop and add
730 #define _FAIL(code) STMT_START { \
731 const char *ellipses = ""; \
732 IV len = RExC_precomp_end - RExC_precomp; \
735 if (len > RegexLengthToShowInErrorMessages) { \
736 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
737 len = RegexLengthToShowInErrorMessages - 10; \
743 #define FAIL(msg) _FAIL( \
744 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
745 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
747 #define FAIL2(msg,arg) _FAIL( \
748 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
749 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
752 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
754 #define Simple_vFAIL(m) STMT_START { \
755 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
756 m, REPORT_LOCATION_ARGS(RExC_parse)); \
760 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
762 #define vFAIL(m) STMT_START { \
768 * Like Simple_vFAIL(), but accepts two arguments.
770 #define Simple_vFAIL2(m,a1) STMT_START { \
771 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
772 REPORT_LOCATION_ARGS(RExC_parse)); \
776 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
778 #define vFAIL2(m,a1) STMT_START { \
780 Simple_vFAIL2(m, a1); \
785 * Like Simple_vFAIL(), but accepts three arguments.
787 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
788 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
789 REPORT_LOCATION_ARGS(RExC_parse)); \
793 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
795 #define vFAIL3(m,a1,a2) STMT_START { \
797 Simple_vFAIL3(m, a1, a2); \
801 * Like Simple_vFAIL(), but accepts four arguments.
803 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
804 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
805 REPORT_LOCATION_ARGS(RExC_parse)); \
808 #define vFAIL4(m,a1,a2,a3) STMT_START { \
810 Simple_vFAIL4(m, a1, a2, a3); \
813 /* A specialized version of vFAIL2 that works with UTF8f */
814 #define vFAIL2utf8f(m, a1) STMT_START { \
816 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
817 REPORT_LOCATION_ARGS(RExC_parse)); \
820 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
822 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
823 REPORT_LOCATION_ARGS(RExC_parse)); \
826 /* Setting this to NULL is a signal to not output warnings */
827 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
829 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
830 RExC_copy_start_in_constructed = NULL; \
832 #define RESTORE_WARNINGS \
833 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
835 /* Since a warning can be generated multiple times as the input is reparsed, we
836 * output it the first time we come to that point in the parse, but suppress it
837 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
838 * generate any warnings */
839 #define TO_OUTPUT_WARNINGS(loc) \
840 ( RExC_copy_start_in_constructed \
841 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
843 /* After we've emitted a warning, we save the position in the input so we don't
845 #define UPDATE_WARNINGS_LOC(loc) \
847 if (TO_OUTPUT_WARNINGS(loc)) { \
848 RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
852 /* 'warns' is the output of the packWARNx macro used in 'code' */
853 #define _WARN_HELPER(loc, warns, code) \
855 if (! RExC_copy_start_in_constructed) { \
856 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
857 " expected at '%s'", \
858 __FILE__, __LINE__, loc); \
860 if (TO_OUTPUT_WARNINGS(loc)) { \
864 UPDATE_WARNINGS_LOC(loc); \
868 /* m is not necessarily a "literal string", in this macro */
869 #define reg_warn_non_literal_string(loc, m) \
870 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
871 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
872 "%s" REPORT_LOCATION, \
873 m, REPORT_LOCATION_ARGS(loc)))
875 #define ckWARNreg(loc,m) \
876 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
877 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
879 REPORT_LOCATION_ARGS(loc)))
881 #define vWARN(loc, m) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
885 REPORT_LOCATION_ARGS(loc))) \
887 #define vWARN_dep(loc, m) \
888 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
889 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
891 REPORT_LOCATION_ARGS(loc)))
893 #define ckWARNdep(loc,m) \
894 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
895 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
897 REPORT_LOCATION_ARGS(loc)))
899 #define ckWARNregdep(loc,m) \
900 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
901 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
904 REPORT_LOCATION_ARGS(loc)))
906 #define ckWARN2reg_d(loc,m, a1) \
907 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
908 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
910 a1, REPORT_LOCATION_ARGS(loc)))
912 #define ckWARN2reg(loc, m, a1) \
913 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
914 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
916 a1, REPORT_LOCATION_ARGS(loc)))
918 #define vWARN3(loc, m, a1, a2) \
919 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
920 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
922 a1, a2, REPORT_LOCATION_ARGS(loc)))
924 #define ckWARN3reg(loc, m, a1, a2) \
925 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
926 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
929 REPORT_LOCATION_ARGS(loc)))
931 #define vWARN4(loc, m, a1, a2, a3) \
932 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
933 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
936 REPORT_LOCATION_ARGS(loc)))
938 #define ckWARN4reg(loc, m, a1, a2, a3) \
939 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
940 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
943 REPORT_LOCATION_ARGS(loc)))
945 #define vWARN5(loc, m, a1, a2, a3, a4) \
946 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
947 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
950 REPORT_LOCATION_ARGS(loc)))
952 #define ckWARNexperimental(loc, class, m) \
953 _WARN_HELPER(loc, packWARN(class), \
954 Perl_ck_warner_d(aTHX_ packWARN(class), \
956 REPORT_LOCATION_ARGS(loc)))
958 /* Convert between a pointer to a node and its offset from the beginning of the
960 #define REGNODE_p(offset) (RExC_emit_start + (offset))
961 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
963 /* Macros for recording node offsets. 20001227 mjd@plover.com
964 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
965 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
966 * Element 0 holds the number n.
967 * Position is 1 indexed.
969 #ifndef RE_TRACK_PATTERN_OFFSETS
970 #define Set_Node_Offset_To_R(offset,byte)
971 #define Set_Node_Offset(node,byte)
972 #define Set_Cur_Node_Offset
973 #define Set_Node_Length_To_R(node,len)
974 #define Set_Node_Length(node,len)
975 #define Set_Node_Cur_Length(node,start)
976 #define Node_Offset(n)
977 #define Node_Length(n)
978 #define Set_Node_Offset_Length(node,offset,len)
979 #define ProgLen(ri) ri->u.proglen
980 #define SetProgLen(ri,x) ri->u.proglen = x
981 #define Track_Code(code)
983 #define ProgLen(ri) ri->u.offsets[0]
984 #define SetProgLen(ri,x) ri->u.offsets[0] = x
985 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
986 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
987 __LINE__, (int)(offset), (int)(byte))); \
989 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
992 RExC_offsets[2*(offset)-1] = (byte); \
996 #define Set_Node_Offset(node,byte) \
997 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
998 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1000 #define Set_Node_Length_To_R(node,len) STMT_START { \
1001 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
1002 __LINE__, (int)(node), (int)(len))); \
1004 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1007 RExC_offsets[2*(node)] = (len); \
1011 #define Set_Node_Length(node,len) \
1012 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1013 #define Set_Node_Cur_Length(node, start) \
1014 Set_Node_Length(node, RExC_parse - start)
1016 /* Get offsets and lengths */
1017 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1018 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1020 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1021 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1022 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1025 #define Track_Code(code) STMT_START { code } STMT_END
1028 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1029 #define EXPERIMENTAL_INPLACESCAN
1030 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1034 Perl_re_printf(pTHX_ const char *fmt, ...)
1038 PerlIO *f= Perl_debug_log;
1039 PERL_ARGS_ASSERT_RE_PRINTF;
1041 result = PerlIO_vprintf(f, fmt, ap);
1047 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1051 PerlIO *f= Perl_debug_log;
1052 PERL_ARGS_ASSERT_RE_INDENTF;
1053 va_start(ap, depth);
1054 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1055 result = PerlIO_vprintf(f, fmt, ap);
1059 #endif /* DEBUGGING */
1061 #define DEBUG_RExC_seen() \
1062 DEBUG_OPTIMISE_MORE_r({ \
1063 Perl_re_printf( aTHX_ "RExC_seen: "); \
1065 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1066 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1068 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1069 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1071 if (RExC_seen & REG_GPOS_SEEN) \
1072 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1074 if (RExC_seen & REG_RECURSE_SEEN) \
1075 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1077 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1078 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1080 if (RExC_seen & REG_VERBARG_SEEN) \
1081 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1083 if (RExC_seen & REG_CUTGROUP_SEEN) \
1084 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1086 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1087 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1089 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1090 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1092 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1093 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1095 Perl_re_printf( aTHX_ "\n"); \
1098 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1099 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1104 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1105 const char *close_str)
1110 Perl_re_printf( aTHX_ "%s", open_str);
1111 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1112 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1113 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1114 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1115 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1116 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1117 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1118 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1119 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1120 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1121 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1122 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1123 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1124 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1125 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1126 Perl_re_printf( aTHX_ "%s", close_str);
1131 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1132 U32 depth, int is_inf)
1134 GET_RE_DEBUG_FLAGS_DECL;
1136 DEBUG_OPTIMISE_MORE_r({
1139 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1143 (IV)data->pos_delta,
1147 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1149 Perl_re_printf( aTHX_
1150 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1152 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1153 is_inf ? "INF " : ""
1156 if (data->last_found) {
1158 Perl_re_printf(aTHX_
1159 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1160 SvPVX_const(data->last_found),
1162 (IV)data->last_start_min,
1163 (IV)data->last_start_max
1166 for (i = 0; i < 2; i++) {
1167 Perl_re_printf(aTHX_
1168 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1169 data->cur_is_floating == i ? "*" : "",
1170 i ? "Float" : "Fixed",
1171 SvPVX_const(data->substrs[i].str),
1172 (IV)data->substrs[i].min_offset,
1173 (IV)data->substrs[i].max_offset
1175 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1179 Perl_re_printf( aTHX_ "\n");
1185 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1186 regnode *scan, U32 depth, U32 flags)
1188 GET_RE_DEBUG_FLAGS_DECL;
1195 Next = regnext(scan);
1196 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1197 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1200 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1201 Next ? (REG_NODE_NUM(Next)) : 0 );
1202 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1203 Perl_re_printf( aTHX_ "\n");
1208 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1209 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1211 # define DEBUG_PEEP(str, scan, depth, flags) \
1212 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1215 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1216 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1220 /* =========================================================
1221 * BEGIN edit_distance stuff.
1223 * This calculates how many single character changes of any type are needed to
1224 * transform a string into another one. It is taken from version 3.1 of
1226 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1229 /* Our unsorted dictionary linked list. */
1230 /* Note we use UVs, not chars. */
1235 struct dictionary* next;
1237 typedef struct dictionary item;
1240 PERL_STATIC_INLINE item*
1241 push(UV key, item* curr)
1244 Newx(head, 1, item);
1252 PERL_STATIC_INLINE item*
1253 find(item* head, UV key)
1255 item* iterator = head;
1257 if (iterator->key == key){
1260 iterator = iterator->next;
1266 PERL_STATIC_INLINE item*
1267 uniquePush(item* head, UV key)
1269 item* iterator = head;
1272 if (iterator->key == key) {
1275 iterator = iterator->next;
1278 return push(key, head);
1281 PERL_STATIC_INLINE void
1282 dict_free(item* head)
1284 item* iterator = head;
1287 item* temp = iterator;
1288 iterator = iterator->next;
1295 /* End of Dictionary Stuff */
1297 /* All calculations/work are done here */
1299 S_edit_distance(const UV* src,
1301 const STRLEN x, /* length of src[] */
1302 const STRLEN y, /* length of tgt[] */
1303 const SSize_t maxDistance
1307 UV swapCount, swapScore, targetCharCount, i, j;
1309 UV score_ceil = x + y;
1311 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1313 /* intialize matrix start values */
1314 Newx(scores, ( (x + 2) * (y + 2)), UV);
1315 scores[0] = score_ceil;
1316 scores[1 * (y + 2) + 0] = score_ceil;
1317 scores[0 * (y + 2) + 1] = score_ceil;
1318 scores[1 * (y + 2) + 1] = 0;
1319 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1324 for (i=1;i<=x;i++) {
1326 head = uniquePush(head, src[i]);
1327 scores[(i+1) * (y + 2) + 1] = i;
1328 scores[(i+1) * (y + 2) + 0] = score_ceil;
1331 for (j=1;j<=y;j++) {
1334 head = uniquePush(head, tgt[j]);
1335 scores[1 * (y + 2) + (j + 1)] = j;
1336 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1339 targetCharCount = find(head, tgt[j-1])->value;
1340 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1342 if (src[i-1] != tgt[j-1]){
1343 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1347 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1351 find(head, src[i-1])->value = i;
1355 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1358 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1362 /* END of edit_distance() stuff
1363 * ========================================================= */
1365 /* is c a control character for which we have a mnemonic? */
1366 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1369 S_cntrl_to_mnemonic(const U8 c)
1371 /* Returns the mnemonic string that represents character 'c', if one
1372 * exists; NULL otherwise. The only ones that exist for the purposes of
1373 * this routine are a few control characters */
1376 case '\a': return "\\a";
1377 case '\b': return "\\b";
1378 case ESC_NATIVE: return "\\e";
1379 case '\f': return "\\f";
1380 case '\n': return "\\n";
1381 case '\r': return "\\r";
1382 case '\t': return "\\t";
1388 /* Mark that we cannot extend a found fixed substring at this point.
1389 Update the longest found anchored substring or the longest found
1390 floating substrings if needed. */
1393 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1394 SSize_t *minlenp, int is_inf)
1396 const STRLEN l = CHR_SVLEN(data->last_found);
1397 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1398 const STRLEN old_l = CHR_SVLEN(longest_sv);
1399 GET_RE_DEBUG_FLAGS_DECL;
1401 PERL_ARGS_ASSERT_SCAN_COMMIT;
1403 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1404 const U8 i = data->cur_is_floating;
1405 SvSetMagicSV(longest_sv, data->last_found);
1406 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1409 data->substrs[0].max_offset = data->substrs[0].min_offset;
1411 data->substrs[1].max_offset = (l
1412 ? data->last_start_max
1413 : (data->pos_delta > SSize_t_MAX - data->pos_min
1415 : data->pos_min + data->pos_delta));
1417 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1418 data->substrs[1].max_offset = SSize_t_MAX;
1421 if (data->flags & SF_BEFORE_EOL)
1422 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1424 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1425 data->substrs[i].minlenp = minlenp;
1426 data->substrs[i].lookbehind = 0;
1429 SvCUR_set(data->last_found, 0);
1431 SV * const sv = data->last_found;
1432 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1433 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1438 data->last_end = -1;
1439 data->flags &= ~SF_BEFORE_EOL;
1440 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1443 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1444 * list that describes which code points it matches */
1447 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1449 /* Set the SSC 'ssc' to match an empty string or any code point */
1451 PERL_ARGS_ASSERT_SSC_ANYTHING;
1453 assert(is_ANYOF_SYNTHETIC(ssc));
1455 /* mortalize so won't leak */
1456 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1457 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1461 S_ssc_is_anything(const regnode_ssc *ssc)
1463 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1464 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1465 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1466 * in any way, so there's no point in using it */
1471 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1473 assert(is_ANYOF_SYNTHETIC(ssc));
1475 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1479 /* See if the list consists solely of the range 0 - Infinity */
1480 invlist_iterinit(ssc->invlist);
1481 ret = invlist_iternext(ssc->invlist, &start, &end)
1485 invlist_iterfinish(ssc->invlist);
1491 /* If e.g., both \w and \W are set, matches everything */
1492 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1494 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1495 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1505 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1507 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1508 * string, any code point, or any posix class under locale */
1510 PERL_ARGS_ASSERT_SSC_INIT;
1512 Zero(ssc, 1, regnode_ssc);
1513 set_ANYOF_SYNTHETIC(ssc);
1514 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1517 /* If any portion of the regex is to operate under locale rules that aren't
1518 * fully known at compile time, initialization includes it. The reason
1519 * this isn't done for all regexes is that the optimizer was written under
1520 * the assumption that locale was all-or-nothing. Given the complexity and
1521 * lack of documentation in the optimizer, and that there are inadequate
1522 * test cases for locale, many parts of it may not work properly, it is
1523 * safest to avoid locale unless necessary. */
1524 if (RExC_contains_locale) {
1525 ANYOF_POSIXL_SETALL(ssc);
1528 ANYOF_POSIXL_ZERO(ssc);
1533 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1534 const regnode_ssc *ssc)
1536 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1537 * to the list of code points matched, and locale posix classes; hence does
1538 * not check its flags) */
1543 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1545 assert(is_ANYOF_SYNTHETIC(ssc));
1547 invlist_iterinit(ssc->invlist);
1548 ret = invlist_iternext(ssc->invlist, &start, &end)
1552 invlist_iterfinish(ssc->invlist);
1558 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1565 #define INVLIST_INDEX 0
1566 #define ONLY_LOCALE_MATCHES_INDEX 1
1567 #define DEFERRED_USER_DEFINED_INDEX 2
1570 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1571 const regnode_charclass* const node)
1573 /* Returns a mortal inversion list defining which code points are matched
1574 * by 'node', which is of type ANYOF. Handles complementing the result if
1575 * appropriate. If some code points aren't knowable at this time, the
1576 * returned list must, and will, contain every code point that is a
1581 SV* only_utf8_locale_invlist = NULL;
1583 const U32 n = ARG(node);
1584 bool new_node_has_latin1 = FALSE;
1585 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
1587 : ANYOF_FLAGS(node);
1589 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1591 /* Look at the data structure created by S_set_ANYOF_arg() */
1592 if (n != ANYOF_ONLY_HAS_BITMAP) {
1593 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1594 AV * const av = MUTABLE_AV(SvRV(rv));
1595 SV **const ary = AvARRAY(av);
1596 assert(RExC_rxi->data->what[n] == 's');
1598 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1600 /* Here there are things that won't be known until runtime -- we
1601 * have to assume it could be anything */
1602 invlist = sv_2mortal(_new_invlist(1));
1603 return _add_range_to_invlist(invlist, 0, UV_MAX);
1605 else if (ary[INVLIST_INDEX]) {
1607 /* Use the node's inversion list */
1608 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1611 /* Get the code points valid only under UTF-8 locales */
1612 if ( (flags & ANYOFL_FOLD)
1613 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1615 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1620 invlist = sv_2mortal(_new_invlist(0));
1623 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1624 * code points, and an inversion list for the others, but if there are code
1625 * points that should match only conditionally on the target string being
1626 * UTF-8, those are placed in the inversion list, and not the bitmap.
1627 * Since there are circumstances under which they could match, they are
1628 * included in the SSC. But if the ANYOF node is to be inverted, we have
1629 * to exclude them here, so that when we invert below, the end result
1630 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1631 * have to do this here before we add the unconditionally matched code
1633 if (flags & ANYOF_INVERT) {
1634 _invlist_intersection_complement_2nd(invlist,
1639 /* Add in the points from the bit map */
1640 if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
1641 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1642 if (ANYOF_BITMAP_TEST(node, i)) {
1643 unsigned int start = i++;
1645 for (; i < NUM_ANYOF_CODE_POINTS
1646 && ANYOF_BITMAP_TEST(node, i); ++i)
1650 invlist = _add_range_to_invlist(invlist, start, i-1);
1651 new_node_has_latin1 = TRUE;
1656 /* If this can match all upper Latin1 code points, have to add them
1657 * as well. But don't add them if inverting, as when that gets done below,
1658 * it would exclude all these characters, including the ones it shouldn't
1659 * that were added just above */
1660 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1661 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1663 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1666 /* Similarly for these */
1667 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1668 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1671 if (flags & ANYOF_INVERT) {
1672 _invlist_invert(invlist);
1674 else if (flags & ANYOFL_FOLD) {
1675 if (new_node_has_latin1) {
1677 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1678 * the locale. We can skip this if there are no 0-255 at all. */
1679 _invlist_union(invlist, PL_Latin1, &invlist);
1681 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1682 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1685 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1686 invlist = add_cp_to_invlist(invlist, 'I');
1688 if (_invlist_contains_cp(invlist,
1689 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1691 invlist = add_cp_to_invlist(invlist, 'i');
1696 /* Similarly add the UTF-8 locale possible matches. These have to be
1697 * deferred until after the non-UTF-8 locale ones are taken care of just
1698 * above, or it leads to wrong results under ANYOF_INVERT */
1699 if (only_utf8_locale_invlist) {
1700 _invlist_union_maybe_complement_2nd(invlist,
1701 only_utf8_locale_invlist,
1702 flags & ANYOF_INVERT,
1709 /* These two functions currently do the exact same thing */
1710 #define ssc_init_zero ssc_init
1712 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1713 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1715 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1716 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1717 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1720 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1721 const regnode_charclass *and_with)
1723 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1724 * another SSC or a regular ANYOF class. Can create false positives. */
1727 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
1729 : ANYOF_FLAGS(and_with);
1732 PERL_ARGS_ASSERT_SSC_AND;
1734 assert(is_ANYOF_SYNTHETIC(ssc));
1736 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1737 * the code point inversion list and just the relevant flags */
1738 if (is_ANYOF_SYNTHETIC(and_with)) {
1739 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1740 anded_flags = and_with_flags;
1742 /* XXX This is a kludge around what appears to be deficiencies in the
1743 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1744 * there are paths through the optimizer where it doesn't get weeded
1745 * out when it should. And if we don't make some extra provision for
1746 * it like the code just below, it doesn't get added when it should.
1747 * This solution is to add it only when AND'ing, which is here, and
1748 * only when what is being AND'ed is the pristine, original node
1749 * matching anything. Thus it is like adding it to ssc_anything() but
1750 * only when the result is to be AND'ed. Probably the same solution
1751 * could be adopted for the same problem we have with /l matching,
1752 * which is solved differently in S_ssc_init(), and that would lead to
1753 * fewer false positives than that solution has. But if this solution
1754 * creates bugs, the consequences are only that a warning isn't raised
1755 * that should be; while the consequences for having /l bugs is
1756 * incorrect matches */
1757 if (ssc_is_anything((regnode_ssc *)and_with)) {
1758 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1762 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1763 if (OP(and_with) == ANYOFD) {
1764 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1767 anded_flags = and_with_flags
1768 &( ANYOF_COMMON_FLAGS
1769 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1770 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1771 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1773 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1778 ANYOF_FLAGS(ssc) &= anded_flags;
1780 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1781 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1782 * 'and_with' may be inverted. When not inverted, we have the situation of
1784 * (C1 | P1) & (C2 | P2)
1785 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1786 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1787 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1788 * <= ((C1 & C2) | P1 | P2)
1789 * Alternatively, the last few steps could be:
1790 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1791 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1792 * <= (C1 | C2 | (P1 & P2))
1793 * We favor the second approach if either P1 or P2 is non-empty. This is
1794 * because these components are a barrier to doing optimizations, as what
1795 * they match cannot be known until the moment of matching as they are
1796 * dependent on the current locale, 'AND"ing them likely will reduce or
1798 * But we can do better if we know that C1,P1 are in their initial state (a
1799 * frequent occurrence), each matching everything:
1800 * (<everything>) & (C2 | P2) = C2 | P2
1801 * Similarly, if C2,P2 are in their initial state (again a frequent
1802 * occurrence), the result is a no-op
1803 * (C1 | P1) & (<everything>) = C1 | P1
1806 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1807 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1808 * <= (C1 & ~C2) | (P1 & ~P2)
1811 if ((and_with_flags & ANYOF_INVERT)
1812 && ! is_ANYOF_SYNTHETIC(and_with))
1816 ssc_intersection(ssc,
1818 FALSE /* Has already been inverted */
1821 /* If either P1 or P2 is empty, the intersection will be also; can skip
1823 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1824 ANYOF_POSIXL_ZERO(ssc);
1826 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1828 /* Note that the Posix class component P from 'and_with' actually
1830 * P = Pa | Pb | ... | Pn
1831 * where each component is one posix class, such as in [\w\s].
1833 * ~P = ~(Pa | Pb | ... | Pn)
1834 * = ~Pa & ~Pb & ... & ~Pn
1835 * <= ~Pa | ~Pb | ... | ~Pn
1836 * The last is something we can easily calculate, but unfortunately
1837 * is likely to have many false positives. We could do better
1838 * in some (but certainly not all) instances if two classes in
1839 * P have known relationships. For example
1840 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1842 * :lower: & :print: = :lower:
1843 * And similarly for classes that must be disjoint. For example,
1844 * since \s and \w can have no elements in common based on rules in
1845 * the POSIX standard,
1846 * \w & ^\S = nothing
1847 * Unfortunately, some vendor locales do not meet the Posix
1848 * standard, in particular almost everything by Microsoft.
1849 * The loop below just changes e.g., \w into \W and vice versa */
1851 regnode_charclass_posixl temp;
1852 int add = 1; /* To calculate the index of the complement */
1854 Zero(&temp, 1, regnode_charclass_posixl);
1855 ANYOF_POSIXL_ZERO(&temp);
1856 for (i = 0; i < ANYOF_MAX; i++) {
1858 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1859 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1861 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1862 ANYOF_POSIXL_SET(&temp, i + add);
1864 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1866 ANYOF_POSIXL_AND(&temp, ssc);
1868 } /* else ssc already has no posixes */
1869 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1870 in its initial state */
1871 else if (! is_ANYOF_SYNTHETIC(and_with)
1872 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1874 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1875 * copy it over 'ssc' */
1876 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1877 if (is_ANYOF_SYNTHETIC(and_with)) {
1878 StructCopy(and_with, ssc, regnode_ssc);
1881 ssc->invlist = anded_cp_list;
1882 ANYOF_POSIXL_ZERO(ssc);
1883 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1884 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1888 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1889 || (and_with_flags & ANYOF_MATCHES_POSIXL))
1891 /* One or the other of P1, P2 is non-empty. */
1892 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1893 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1895 ssc_union(ssc, anded_cp_list, FALSE);
1897 else { /* P1 = P2 = empty */
1898 ssc_intersection(ssc, anded_cp_list, FALSE);
1904 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1905 const regnode_charclass *or_with)
1907 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1908 * another SSC or a regular ANYOF class. Can create false positives if
1909 * 'or_with' is to be inverted. */
1913 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
1915 : ANYOF_FLAGS(or_with);
1917 PERL_ARGS_ASSERT_SSC_OR;
1919 assert(is_ANYOF_SYNTHETIC(ssc));
1921 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1922 * the code point inversion list and just the relevant flags */
1923 if (is_ANYOF_SYNTHETIC(or_with)) {
1924 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1925 ored_flags = or_with_flags;
1928 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1929 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
1930 if (OP(or_with) != ANYOFD) {
1933 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1934 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1935 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
1937 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1942 ANYOF_FLAGS(ssc) |= ored_flags;
1944 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1945 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1946 * 'or_with' may be inverted. When not inverted, we have the simple
1947 * situation of computing:
1948 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1949 * If P1|P2 yields a situation with both a class and its complement are
1950 * set, like having both \w and \W, this matches all code points, and we
1951 * can delete these from the P component of the ssc going forward. XXX We
1952 * might be able to delete all the P components, but I (khw) am not certain
1953 * about this, and it is better to be safe.
1956 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1957 * <= (C1 | P1) | ~C2
1958 * <= (C1 | ~C2) | P1
1959 * (which results in actually simpler code than the non-inverted case)
1962 if ((or_with_flags & ANYOF_INVERT)
1963 && ! is_ANYOF_SYNTHETIC(or_with))
1965 /* We ignore P2, leaving P1 going forward */
1966 } /* else Not inverted */
1967 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
1968 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1969 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1971 for (i = 0; i < ANYOF_MAX; i += 2) {
1972 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1974 ssc_match_all_cp(ssc);
1975 ANYOF_POSIXL_CLEAR(ssc, i);
1976 ANYOF_POSIXL_CLEAR(ssc, i+1);
1984 FALSE /* Already has been inverted */
1988 PERL_STATIC_INLINE void
1989 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1991 PERL_ARGS_ASSERT_SSC_UNION;
1993 assert(is_ANYOF_SYNTHETIC(ssc));
1995 _invlist_union_maybe_complement_2nd(ssc->invlist,
2001 PERL_STATIC_INLINE void
2002 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2004 const bool invert2nd)
2006 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2008 assert(is_ANYOF_SYNTHETIC(ssc));
2010 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2016 PERL_STATIC_INLINE void
2017 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2019 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2021 assert(is_ANYOF_SYNTHETIC(ssc));
2023 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2026 PERL_STATIC_INLINE void
2027 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2029 /* AND just the single code point 'cp' into the SSC 'ssc' */
2031 SV* cp_list = _new_invlist(2);
2033 PERL_ARGS_ASSERT_SSC_CP_AND;
2035 assert(is_ANYOF_SYNTHETIC(ssc));
2037 cp_list = add_cp_to_invlist(cp_list, cp);
2038 ssc_intersection(ssc, cp_list,
2039 FALSE /* Not inverted */
2041 SvREFCNT_dec_NN(cp_list);
2044 PERL_STATIC_INLINE void
2045 S_ssc_clear_locale(regnode_ssc *ssc)
2047 /* Set the SSC 'ssc' to not match any locale things */
2048 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2050 assert(is_ANYOF_SYNTHETIC(ssc));
2052 ANYOF_POSIXL_ZERO(ssc);
2053 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2056 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2059 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2061 /* The synthetic start class is used to hopefully quickly winnow down
2062 * places where a pattern could start a match in the target string. If it
2063 * doesn't really narrow things down that much, there isn't much point to
2064 * having the overhead of using it. This function uses some very crude
2065 * heuristics to decide if to use the ssc or not.
2067 * It returns TRUE if 'ssc' rules out more than half what it considers to
2068 * be the "likely" possible matches, but of course it doesn't know what the
2069 * actual things being matched are going to be; these are only guesses
2071 * For /l matches, it assumes that the only likely matches are going to be
2072 * in the 0-255 range, uniformly distributed, so half of that is 127
2073 * For /a and /d matches, it assumes that the likely matches will be just
2074 * the ASCII range, so half of that is 63
2075 * For /u and there isn't anything matching above the Latin1 range, it
2076 * assumes that that is the only range likely to be matched, and uses
2077 * half that as the cut-off: 127. If anything matches above Latin1,
2078 * it assumes that all of Unicode could match (uniformly), except for
2079 * non-Unicode code points and things in the General Category "Other"
2080 * (unassigned, private use, surrogates, controls and formats). This
2081 * is a much large number. */
2083 U32 count = 0; /* Running total of number of code points matched by
2085 UV start, end; /* Start and end points of current range in inversion
2086 XXX outdated. UTF-8 locales are common, what about invert? list */
2087 const U32 max_code_points = (LOC)
2089 : (( ! UNI_SEMANTICS
2090 || invlist_highest(ssc->invlist) < 256)
2093 const U32 max_match = max_code_points / 2;
2095 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2097 invlist_iterinit(ssc->invlist);
2098 while (invlist_iternext(ssc->invlist, &start, &end)) {
2099 if (start >= max_code_points) {
2102 end = MIN(end, max_code_points - 1);
2103 count += end - start + 1;
2104 if (count >= max_match) {
2105 invlist_iterfinish(ssc->invlist);
2115 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2117 /* The inversion list in the SSC is marked mortal; now we need a more
2118 * permanent copy, which is stored the same way that is done in a regular
2119 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2122 SV* invlist = invlist_clone(ssc->invlist, NULL);
2124 PERL_ARGS_ASSERT_SSC_FINALIZE;
2126 assert(is_ANYOF_SYNTHETIC(ssc));
2128 /* The code in this file assumes that all but these flags aren't relevant
2129 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2130 * by the time we reach here */
2131 assert(! (ANYOF_FLAGS(ssc)
2132 & ~( ANYOF_COMMON_FLAGS
2133 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2134 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2136 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2138 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2140 /* Make sure is clone-safe */
2141 ssc->invlist = NULL;
2143 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2144 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2145 OP(ssc) = ANYOFPOSIXL;
2147 else if (RExC_contains_locale) {
2151 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2154 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2155 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2156 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2157 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2158 ? (TRIE_LIST_CUR( idx ) - 1) \
2164 dump_trie(trie,widecharmap,revcharmap)
2165 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2166 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2168 These routines dump out a trie in a somewhat readable format.
2169 The _interim_ variants are used for debugging the interim
2170 tables that are used to generate the final compressed
2171 representation which is what dump_trie expects.
2173 Part of the reason for their existence is to provide a form
2174 of documentation as to how the different representations function.
2179 Dumps the final compressed table form of the trie to Perl_debug_log.
2180 Used for debugging make_trie().
2184 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2185 AV *revcharmap, U32 depth)
2188 SV *sv=sv_newmortal();
2189 int colwidth= widecharmap ? 6 : 4;
2191 GET_RE_DEBUG_FLAGS_DECL;
2193 PERL_ARGS_ASSERT_DUMP_TRIE;
2195 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2196 depth+1, "Match","Base","Ofs" );
2198 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2199 SV ** const tmp = av_fetch( revcharmap, state, 0);
2201 Perl_re_printf( aTHX_ "%*s",
2203 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2204 PL_colors[0], PL_colors[1],
2205 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2206 PERL_PV_ESCAPE_FIRSTCHAR
2211 Perl_re_printf( aTHX_ "\n");
2212 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2214 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2215 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2216 Perl_re_printf( aTHX_ "\n");
2218 for( state = 1 ; state < trie->statecount ; state++ ) {
2219 const U32 base = trie->states[ state ].trans.base;
2221 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2223 if ( trie->states[ state ].wordnum ) {
2224 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2226 Perl_re_printf( aTHX_ "%6s", "" );
2229 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2234 while( ( base + ofs < trie->uniquecharcount ) ||
2235 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2236 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2240 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2242 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2243 if ( ( base + ofs >= trie->uniquecharcount )
2244 && ( base + ofs - trie->uniquecharcount
2246 && trie->trans[ base + ofs
2247 - trie->uniquecharcount ].check == state )
2249 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2250 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2253 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2257 Perl_re_printf( aTHX_ "]");
2260 Perl_re_printf( aTHX_ "\n" );
2262 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2264 for (word=1; word <= trie->wordcount; word++) {
2265 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2266 (int)word, (int)(trie->wordinfo[word].prev),
2267 (int)(trie->wordinfo[word].len));
2269 Perl_re_printf( aTHX_ "\n" );
2272 Dumps a fully constructed but uncompressed trie in list form.
2273 List tries normally only are used for construction when the number of
2274 possible chars (trie->uniquecharcount) is very high.
2275 Used for debugging make_trie().
2278 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2279 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2283 SV *sv=sv_newmortal();
2284 int colwidth= widecharmap ? 6 : 4;
2285 GET_RE_DEBUG_FLAGS_DECL;
2287 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2289 /* print out the table precompression. */
2290 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2292 Perl_re_indentf( aTHX_ "%s",
2293 depth+1, "------:-----+-----------------\n" );
2295 for( state=1 ; state < next_alloc ; state ++ ) {
2298 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2299 depth+1, (UV)state );
2300 if ( ! trie->states[ state ].wordnum ) {
2301 Perl_re_printf( aTHX_ "%5s| ","");
2303 Perl_re_printf( aTHX_ "W%4x| ",
2304 trie->states[ state ].wordnum
2307 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2308 SV ** const tmp = av_fetch( revcharmap,
2309 TRIE_LIST_ITEM(state, charid).forid, 0);
2311 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2313 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2315 PL_colors[0], PL_colors[1],
2316 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2317 | PERL_PV_ESCAPE_FIRSTCHAR
2319 TRIE_LIST_ITEM(state, charid).forid,
2320 (UV)TRIE_LIST_ITEM(state, charid).newstate
2323 Perl_re_printf( aTHX_ "\n%*s| ",
2324 (int)((depth * 2) + 14), "");
2327 Perl_re_printf( aTHX_ "\n");
2332 Dumps a fully constructed but uncompressed trie in table form.
2333 This is the normal DFA style state transition table, with a few
2334 twists to facilitate compression later.
2335 Used for debugging make_trie().
2338 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2339 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2344 SV *sv=sv_newmortal();
2345 int colwidth= widecharmap ? 6 : 4;
2346 GET_RE_DEBUG_FLAGS_DECL;
2348 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2351 print out the table precompression so that we can do a visual check
2352 that they are identical.
2355 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2357 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2358 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2360 Perl_re_printf( aTHX_ "%*s",
2362 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2363 PL_colors[0], PL_colors[1],
2364 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2365 PERL_PV_ESCAPE_FIRSTCHAR
2371 Perl_re_printf( aTHX_ "\n");
2372 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2374 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2375 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2378 Perl_re_printf( aTHX_ "\n" );
2380 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2382 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2384 (UV)TRIE_NODENUM( state ) );
2386 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2387 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2389 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2391 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2393 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2394 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2395 (UV)trie->trans[ state ].check );
2397 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2398 (UV)trie->trans[ state ].check,
2399 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2407 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2408 startbranch: the first branch in the whole branch sequence
2409 first : start branch of sequence of branch-exact nodes.
2410 May be the same as startbranch
2411 last : Thing following the last branch.
2412 May be the same as tail.
2413 tail : item following the branch sequence
2414 count : words in the sequence
2415 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2416 depth : indent depth
2418 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2420 A trie is an N'ary tree where the branches are determined by digital
2421 decomposition of the key. IE, at the root node you look up the 1st character and
2422 follow that branch repeat until you find the end of the branches. Nodes can be
2423 marked as "accepting" meaning they represent a complete word. Eg:
2427 would convert into the following structure. Numbers represent states, letters
2428 following numbers represent valid transitions on the letter from that state, if
2429 the number is in square brackets it represents an accepting state, otherwise it
2430 will be in parenthesis.
2432 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2436 (1) +-i->(6)-+-s->[7]
2438 +-s->(3)-+-h->(4)-+-e->[5]
2440 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2442 This shows that when matching against the string 'hers' we will begin at state 1
2443 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2444 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2445 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2446 single traverse. We store a mapping from accepting to state to which word was
2447 matched, and then when we have multiple possibilities we try to complete the
2448 rest of the regex in the order in which they occurred in the alternation.
2450 The only prior NFA like behaviour that would be changed by the TRIE support is
2451 the silent ignoring of duplicate alternations which are of the form:
2453 / (DUPE|DUPE) X? (?{ ... }) Y /x
2455 Thus EVAL blocks following a trie may be called a different number of times with
2456 and without the optimisation. With the optimisations dupes will be silently
2457 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2458 the following demonstrates:
2460 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2462 which prints out 'word' three times, but
2464 'words'=~/(word|word|word)(?{ print $1 })S/
2466 which doesnt print it out at all. This is due to other optimisations kicking in.
2468 Example of what happens on a structural level:
2470 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2472 1: CURLYM[1] {1,32767}(18)
2483 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2484 and should turn into:
2486 1: CURLYM[1] {1,32767}(18)
2488 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2496 Cases where tail != last would be like /(?foo|bar)baz/:
2506 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2507 and would end up looking like:
2510 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2517 d = uvchr_to_utf8_flags(d, uv, 0);
2519 is the recommended Unicode-aware way of saying
2524 #define TRIE_STORE_REVCHAR(val) \
2527 SV *zlopp = newSV(UTF8_MAXBYTES); \
2528 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2529 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2530 SvCUR_set(zlopp, kapow - flrbbbbb); \
2533 av_push(revcharmap, zlopp); \
2535 char ooooff = (char)val; \
2536 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2540 /* This gets the next character from the input, folding it if not already
2542 #define TRIE_READ_CHAR STMT_START { \
2545 /* if it is UTF then it is either already folded, or does not need \
2547 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2549 else if (folder == PL_fold_latin1) { \
2550 /* This folder implies Unicode rules, which in the range expressible \
2551 * by not UTF is the lower case, with the two exceptions, one of \
2552 * which should have been taken care of before calling this */ \
2553 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2554 uvc = toLOWER_L1(*uc); \
2555 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2558 /* raw data, will be folded later if needed */ \
2566 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2567 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2568 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2569 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2570 TRIE_LIST_LEN( state ) = ging; \
2572 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2573 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2574 TRIE_LIST_CUR( state )++; \
2577 #define TRIE_LIST_NEW(state) STMT_START { \
2578 Newx( trie->states[ state ].trans.list, \
2579 4, reg_trie_trans_le ); \
2580 TRIE_LIST_CUR( state ) = 1; \
2581 TRIE_LIST_LEN( state ) = 4; \
2584 #define TRIE_HANDLE_WORD(state) STMT_START { \
2585 U16 dupe= trie->states[ state ].wordnum; \
2586 regnode * const noper_next = regnext( noper ); \
2589 /* store the word for dumping */ \
2591 if (OP(noper) != NOTHING) \
2592 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2594 tmp = newSVpvn_utf8( "", 0, UTF ); \
2595 av_push( trie_words, tmp ); \
2599 trie->wordinfo[curword].prev = 0; \
2600 trie->wordinfo[curword].len = wordlen; \
2601 trie->wordinfo[curword].accept = state; \
2603 if ( noper_next < tail ) { \
2605 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2607 trie->jump[curword] = (U16)(noper_next - convert); \
2609 jumper = noper_next; \
2611 nextbranch= regnext(cur); \
2615 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2616 /* chain, so that when the bits of chain are later */\
2617 /* linked together, the dups appear in the chain */\
2618 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2619 trie->wordinfo[dupe].prev = curword; \
2621 /* we haven't inserted this word yet. */ \
2622 trie->states[ state ].wordnum = curword; \
2627 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2628 ( ( base + charid >= ucharcount \
2629 && base + charid < ubound \
2630 && state == trie->trans[ base - ucharcount + charid ].check \
2631 && trie->trans[ base - ucharcount + charid ].next ) \
2632 ? trie->trans[ base - ucharcount + charid ].next \
2633 : ( state==1 ? special : 0 ) \
2636 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2638 TRIE_BITMAP_SET(trie, uvc); \
2639 /* store the folded codepoint */ \
2641 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2644 /* store first byte of utf8 representation of */ \
2645 /* variant codepoints */ \
2646 if (! UVCHR_IS_INVARIANT(uvc)) { \
2647 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2652 #define MADE_JUMP_TRIE 2
2653 #define MADE_EXACT_TRIE 4
2656 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2657 regnode *first, regnode *last, regnode *tail,
2658 U32 word_count, U32 flags, U32 depth)
2660 /* first pass, loop through and scan words */
2661 reg_trie_data *trie;
2662 HV *widecharmap = NULL;
2663 AV *revcharmap = newAV();
2669 regnode *jumper = NULL;
2670 regnode *nextbranch = NULL;
2671 regnode *convert = NULL;
2672 U32 *prev_states; /* temp array mapping each state to previous one */
2673 /* we just use folder as a flag in utf8 */
2674 const U8 * folder = NULL;
2676 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2677 * which stands for one trie structure, one hash, optionally followed
2680 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2681 AV *trie_words = NULL;
2682 /* along with revcharmap, this only used during construction but both are
2683 * useful during debugging so we store them in the struct when debugging.
2686 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2687 STRLEN trie_charcount=0;
2689 SV *re_trie_maxbuff;
2690 GET_RE_DEBUG_FLAGS_DECL;
2692 PERL_ARGS_ASSERT_MAKE_TRIE;
2694 PERL_UNUSED_ARG(depth);
2698 case EXACT: case EXACT_ONLY8: case EXACTL: break;
2702 case EXACTFLU8: folder = PL_fold_latin1; break;
2703 case EXACTF: folder = PL_fold; break;
2704 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2707 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2709 trie->startstate = 1;
2710 trie->wordcount = word_count;
2711 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2712 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2713 if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
2714 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2715 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2716 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2719 trie_words = newAV();
2722 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2723 assert(re_trie_maxbuff);
2724 if (!SvIOK(re_trie_maxbuff)) {
2725 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2727 DEBUG_TRIE_COMPILE_r({
2728 Perl_re_indentf( aTHX_
2729 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2731 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2732 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2735 /* Find the node we are going to overwrite */
2736 if ( first == startbranch && OP( last ) != BRANCH ) {
2737 /* whole branch chain */
2740 /* branch sub-chain */
2741 convert = NEXTOPER( first );
2744 /* -- First loop and Setup --
2746 We first traverse the branches and scan each word to determine if it
2747 contains widechars, and how many unique chars there are, this is
2748 important as we have to build a table with at least as many columns as we
2751 We use an array of integers to represent the character codes 0..255
2752 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2753 the native representation of the character value as the key and IV's for
2756 *TODO* If we keep track of how many times each character is used we can
2757 remap the columns so that the table compression later on is more
2758 efficient in terms of memory by ensuring the most common value is in the
2759 middle and the least common are on the outside. IMO this would be better
2760 than a most to least common mapping as theres a decent chance the most
2761 common letter will share a node with the least common, meaning the node
2762 will not be compressible. With a middle is most common approach the worst
2763 case is when we have the least common nodes twice.
2767 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2768 regnode *noper = NEXTOPER( cur );
2772 U32 wordlen = 0; /* required init */
2773 STRLEN minchars = 0;
2774 STRLEN maxchars = 0;
2775 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2778 if (OP(noper) == NOTHING) {
2779 /* skip past a NOTHING at the start of an alternation
2780 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2782 regnode *noper_next= regnext(noper);
2783 if (noper_next < tail)
2788 && ( OP(noper) == flags
2789 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
2790 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
2791 || OP(noper) == EXACTFUP))))
2793 uc= (U8*)STRING(noper);
2794 e= uc + STR_LEN(noper);
2801 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2802 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2803 regardless of encoding */
2804 if (OP( noper ) == EXACTFUP) {
2805 /* false positives are ok, so just set this */
2806 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2810 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2812 TRIE_CHARCOUNT(trie)++;
2815 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2816 * is in effect. Under /i, this character can match itself, or
2817 * anything that folds to it. If not under /i, it can match just
2818 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2819 * all fold to k, and all are single characters. But some folds
2820 * expand to more than one character, so for example LATIN SMALL
2821 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2822 * the string beginning at 'uc' is 'ffi', it could be matched by
2823 * three characters, or just by the one ligature character. (It
2824 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2825 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2826 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2827 * match.) The trie needs to know the minimum and maximum number
2828 * of characters that could match so that it can use size alone to
2829 * quickly reject many match attempts. The max is simple: it is
2830 * the number of folded characters in this branch (since a fold is
2831 * never shorter than what folds to it. */
2835 /* And the min is equal to the max if not under /i (indicated by
2836 * 'folder' being NULL), or there are no multi-character folds. If
2837 * there is a multi-character fold, the min is incremented just
2838 * once, for the character that folds to the sequence. Each
2839 * character in the sequence needs to be added to the list below of
2840 * characters in the trie, but we count only the first towards the
2841 * min number of characters needed. This is done through the
2842 * variable 'foldlen', which is returned by the macros that look
2843 * for these sequences as the number of bytes the sequence
2844 * occupies. Each time through the loop, we decrement 'foldlen' by
2845 * how many bytes the current char occupies. Only when it reaches
2846 * 0 do we increment 'minchars' or look for another multi-character
2848 if (folder == NULL) {
2851 else if (foldlen > 0) {
2852 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2857 /* See if *uc is the beginning of a multi-character fold. If
2858 * so, we decrement the length remaining to look at, to account
2859 * for the current character this iteration. (We can use 'uc'
2860 * instead of the fold returned by TRIE_READ_CHAR because for
2861 * non-UTF, the latin1_safe macro is smart enough to account
2862 * for all the unfolded characters, and because for UTF, the
2863 * string will already have been folded earlier in the
2864 * compilation process */
2866 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2867 foldlen -= UTF8SKIP(uc);
2870 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2875 /* The current character (and any potential folds) should be added
2876 * to the possible matching characters for this position in this
2880 U8 folded= folder[ (U8) uvc ];
2881 if ( !trie->charmap[ folded ] ) {
2882 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2883 TRIE_STORE_REVCHAR( folded );
2886 if ( !trie->charmap[ uvc ] ) {
2887 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2888 TRIE_STORE_REVCHAR( uvc );
2891 /* store the codepoint in the bitmap, and its folded
2893 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2894 set_bit = 0; /* We've done our bit :-) */
2898 /* XXX We could come up with the list of code points that fold
2899 * to this using PL_utf8_foldclosures, except not for
2900 * multi-char folds, as there may be multiple combinations
2901 * there that could work, which needs to wait until runtime to
2902 * resolve (The comment about LIGATURE FFI above is such an
2907 widecharmap = newHV();
2909 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2912 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
2914 if ( !SvTRUE( *svpp ) ) {
2915 sv_setiv( *svpp, ++trie->uniquecharcount );
2916 TRIE_STORE_REVCHAR(uvc);
2919 } /* end loop through characters in this branch of the trie */
2921 /* We take the min and max for this branch and combine to find the min
2922 * and max for all branches processed so far */
2923 if( cur == first ) {
2924 trie->minlen = minchars;
2925 trie->maxlen = maxchars;
2926 } else if (minchars < trie->minlen) {
2927 trie->minlen = minchars;
2928 } else if (maxchars > trie->maxlen) {
2929 trie->maxlen = maxchars;
2931 } /* end first pass */
2932 DEBUG_TRIE_COMPILE_r(
2933 Perl_re_indentf( aTHX_
2934 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2936 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2937 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2938 (int)trie->minlen, (int)trie->maxlen )
2942 We now know what we are dealing with in terms of unique chars and
2943 string sizes so we can calculate how much memory a naive
2944 representation using a flat table will take. If it's over a reasonable
2945 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2946 conservative but potentially much slower representation using an array
2949 At the end we convert both representations into the same compressed
2950 form that will be used in regexec.c for matching with. The latter
2951 is a form that cannot be used to construct with but has memory
2952 properties similar to the list form and access properties similar
2953 to the table form making it both suitable for fast searches and
2954 small enough that its feasable to store for the duration of a program.
2956 See the comment in the code where the compressed table is produced
2957 inplace from the flat tabe representation for an explanation of how
2958 the compression works.
2963 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2966 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2967 > SvIV(re_trie_maxbuff) )
2970 Second Pass -- Array Of Lists Representation
2972 Each state will be represented by a list of charid:state records
2973 (reg_trie_trans_le) the first such element holds the CUR and LEN
2974 points of the allocated array. (See defines above).
2976 We build the initial structure using the lists, and then convert
2977 it into the compressed table form which allows faster lookups
2978 (but cant be modified once converted).
2981 STRLEN transcount = 1;
2983 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
2986 trie->states = (reg_trie_state *)
2987 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2988 sizeof(reg_trie_state) );
2992 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2994 regnode *noper = NEXTOPER( cur );
2995 U32 state = 1; /* required init */
2996 U16 charid = 0; /* sanity init */
2997 U32 wordlen = 0; /* required init */
2999 if (OP(noper) == NOTHING) {
3000 regnode *noper_next= regnext(noper);
3001 if (noper_next < tail)
3006 && ( OP(noper) == flags
3007 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3008 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
3009 || OP(noper) == EXACTFUP))))
3011 const U8 *uc= (U8*)STRING(noper);
3012 const U8 *e= uc + STR_LEN(noper);
3014 for ( ; uc < e ; uc += len ) {
3019 charid = trie->charmap[ uvc ];
3021 SV** const svpp = hv_fetch( widecharmap,
3028 charid=(U16)SvIV( *svpp );
3031 /* charid is now 0 if we dont know the char read, or
3032 * nonzero if we do */
3039 if ( !trie->states[ state ].trans.list ) {
3040 TRIE_LIST_NEW( state );
3043 check <= TRIE_LIST_USED( state );
3046 if ( TRIE_LIST_ITEM( state, check ).forid
3049 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3054 newstate = next_alloc++;
3055 prev_states[newstate] = state;
3056 TRIE_LIST_PUSH( state, charid, newstate );
3061 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3065 TRIE_HANDLE_WORD(state);
3067 } /* end second pass */
3069 /* next alloc is the NEXT state to be allocated */
3070 trie->statecount = next_alloc;
3071 trie->states = (reg_trie_state *)
3072 PerlMemShared_realloc( trie->states,
3074 * sizeof(reg_trie_state) );
3076 /* and now dump it out before we compress it */
3077 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3078 revcharmap, next_alloc,
3082 trie->trans = (reg_trie_trans *)
3083 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3090 for( state=1 ; state < next_alloc ; state ++ ) {
3094 DEBUG_TRIE_COMPILE_MORE_r(
3095 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3099 if (trie->states[state].trans.list) {
3100 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3104 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3105 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3106 if ( forid < minid ) {
3108 } else if ( forid > maxid ) {
3112 if ( transcount < tp + maxid - minid + 1) {
3114 trie->trans = (reg_trie_trans *)
3115 PerlMemShared_realloc( trie->trans,
3117 * sizeof(reg_trie_trans) );
3118 Zero( trie->trans + (transcount / 2),
3122 base = trie->uniquecharcount + tp - minid;
3123 if ( maxid == minid ) {
3125 for ( ; zp < tp ; zp++ ) {
3126 if ( ! trie->trans[ zp ].next ) {
3127 base = trie->uniquecharcount + zp - minid;
3128 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3130 trie->trans[ zp ].check = state;
3136 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3138 trie->trans[ tp ].check = state;
3143 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3144 const U32 tid = base
3145 - trie->uniquecharcount
3146 + TRIE_LIST_ITEM( state, idx ).forid;
3147 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3149 trie->trans[ tid ].check = state;
3151 tp += ( maxid - minid + 1 );
3153 Safefree(trie->states[ state ].trans.list);
3156 DEBUG_TRIE_COMPILE_MORE_r(
3157 Perl_re_printf( aTHX_ " base: %d\n",base);
3160 trie->states[ state ].trans.base=base;
3162 trie->lasttrans = tp + 1;
3166 Second Pass -- Flat Table Representation.
3168 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3169 each. We know that we will need Charcount+1 trans at most to store
3170 the data (one row per char at worst case) So we preallocate both
3171 structures assuming worst case.
3173 We then construct the trie using only the .next slots of the entry
3176 We use the .check field of the first entry of the node temporarily
3177 to make compression both faster and easier by keeping track of how
3178 many non zero fields are in the node.
3180 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3183 There are two terms at use here: state as a TRIE_NODEIDX() which is
3184 a number representing the first entry of the node, and state as a
3185 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3186 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3187 if there are 2 entrys per node. eg:
3195 The table is internally in the right hand, idx form. However as we
3196 also have to deal with the states array which is indexed by nodenum
3197 we have to use TRIE_NODENUM() to convert.
3200 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3203 trie->trans = (reg_trie_trans *)
3204 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3205 * trie->uniquecharcount + 1,
3206 sizeof(reg_trie_trans) );
3207 trie->states = (reg_trie_state *)
3208 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3209 sizeof(reg_trie_state) );
3210 next_alloc = trie->uniquecharcount + 1;
3213 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3215 regnode *noper = NEXTOPER( cur );
3217 U32 state = 1; /* required init */
3219 U16 charid = 0; /* sanity init */
3220 U32 accept_state = 0; /* sanity init */
3222 U32 wordlen = 0; /* required init */
3224 if (OP(noper) == NOTHING) {
3225 regnode *noper_next= regnext(noper);
3226 if (noper_next < tail)
3231 && ( OP(noper) == flags
3232 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
3233 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
3234 || OP(noper) == EXACTFUP))))
3236 const U8 *uc= (U8*)STRING(noper);
3237 const U8 *e= uc + STR_LEN(noper);
3239 for ( ; uc < e ; uc += len ) {
3244 charid = trie->charmap[ uvc ];
3246 SV* const * const svpp = hv_fetch( widecharmap,
3250 charid = svpp ? (U16)SvIV(*svpp) : 0;
3254 if ( !trie->trans[ state + charid ].next ) {
3255 trie->trans[ state + charid ].next = next_alloc;
3256 trie->trans[ state ].check++;
3257 prev_states[TRIE_NODENUM(next_alloc)]
3258 = TRIE_NODENUM(state);
3259 next_alloc += trie->uniquecharcount;
3261 state = trie->trans[ state + charid ].next;
3263 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3265 /* charid is now 0 if we dont know the char read, or
3266 * nonzero if we do */
3269 accept_state = TRIE_NODENUM( state );
3270 TRIE_HANDLE_WORD(accept_state);
3272 } /* end second pass */
3274 /* and now dump it out before we compress it */
3275 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3277 next_alloc, depth+1));
3281 * Inplace compress the table.*
3283 For sparse data sets the table constructed by the trie algorithm will
3284 be mostly 0/FAIL transitions or to put it another way mostly empty.
3285 (Note that leaf nodes will not contain any transitions.)
3287 This algorithm compresses the tables by eliminating most such
3288 transitions, at the cost of a modest bit of extra work during lookup:
3290 - Each states[] entry contains a .base field which indicates the
3291 index in the state[] array wheres its transition data is stored.
3293 - If .base is 0 there are no valid transitions from that node.
3295 - If .base is nonzero then charid is added to it to find an entry in
3298 -If trans[states[state].base+charid].check!=state then the
3299 transition is taken to be a 0/Fail transition. Thus if there are fail
3300 transitions at the front of the node then the .base offset will point
3301 somewhere inside the previous nodes data (or maybe even into a node
3302 even earlier), but the .check field determines if the transition is
3306 The following process inplace converts the table to the compressed
3307 table: We first do not compress the root node 1,and mark all its
3308 .check pointers as 1 and set its .base pointer as 1 as well. This
3309 allows us to do a DFA construction from the compressed table later,
3310 and ensures that any .base pointers we calculate later are greater
3313 - We set 'pos' to indicate the first entry of the second node.
3315 - We then iterate over the columns of the node, finding the first and
3316 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3317 and set the .check pointers accordingly, and advance pos
3318 appropriately and repreat for the next node. Note that when we copy
3319 the next pointers we have to convert them from the original
3320 NODEIDX form to NODENUM form as the former is not valid post
3323 - If a node has no transitions used we mark its base as 0 and do not
3324 advance the pos pointer.
3326 - If a node only has one transition we use a second pointer into the
3327 structure to fill in allocated fail transitions from other states.
3328 This pointer is independent of the main pointer and scans forward
3329 looking for null transitions that are allocated to a state. When it
3330 finds one it writes the single transition into the "hole". If the
3331 pointer doesnt find one the single transition is appended as normal.
3333 - Once compressed we can Renew/realloc the structures to release the
3336 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3337 specifically Fig 3.47 and the associated pseudocode.
3341 const U32 laststate = TRIE_NODENUM( next_alloc );
3344 trie->statecount = laststate;
3346 for ( state = 1 ; state < laststate ; state++ ) {
3348 const U32 stateidx = TRIE_NODEIDX( state );
3349 const U32 o_used = trie->trans[ stateidx ].check;
3350 U32 used = trie->trans[ stateidx ].check;
3351 trie->trans[ stateidx ].check = 0;
3354 used && charid < trie->uniquecharcount;
3357 if ( flag || trie->trans[ stateidx + charid ].next ) {
3358 if ( trie->trans[ stateidx + charid ].next ) {
3360 for ( ; zp < pos ; zp++ ) {
3361 if ( ! trie->trans[ zp ].next ) {
3365 trie->states[ state ].trans.base
3367 + trie->uniquecharcount
3369 trie->trans[ zp ].next
3370 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3372 trie->trans[ zp ].check = state;
3373 if ( ++zp > pos ) pos = zp;
3380 trie->states[ state ].trans.base
3381 = pos + trie->uniquecharcount - charid ;
3383 trie->trans[ pos ].next
3384 = SAFE_TRIE_NODENUM(
3385 trie->trans[ stateidx + charid ].next );
3386 trie->trans[ pos ].check = state;
3391 trie->lasttrans = pos + 1;
3392 trie->states = (reg_trie_state *)
3393 PerlMemShared_realloc( trie->states, laststate
3394 * sizeof(reg_trie_state) );
3395 DEBUG_TRIE_COMPILE_MORE_r(
3396 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3398 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3402 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3405 } /* end table compress */
3407 DEBUG_TRIE_COMPILE_MORE_r(
3408 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3410 (UV)trie->statecount,
3411 (UV)trie->lasttrans)
3413 /* resize the trans array to remove unused space */
3414 trie->trans = (reg_trie_trans *)
3415 PerlMemShared_realloc( trie->trans, trie->lasttrans
3416 * sizeof(reg_trie_trans) );
3418 { /* Modify the program and insert the new TRIE node */
3419 U8 nodetype =(U8)(flags & 0xFF);
3423 regnode *optimize = NULL;
3424 #ifdef RE_TRACK_PATTERN_OFFSETS
3427 U32 mjd_nodelen = 0;
3428 #endif /* RE_TRACK_PATTERN_OFFSETS */
3429 #endif /* DEBUGGING */
3431 This means we convert either the first branch or the first Exact,
3432 depending on whether the thing following (in 'last') is a branch
3433 or not and whther first is the startbranch (ie is it a sub part of
3434 the alternation or is it the whole thing.)
3435 Assuming its a sub part we convert the EXACT otherwise we convert
3436 the whole branch sequence, including the first.
3438 /* Find the node we are going to overwrite */
3439 if ( first != startbranch || OP( last ) == BRANCH ) {
3440 /* branch sub-chain */
3441 NEXT_OFF( first ) = (U16)(last - first);
3442 #ifdef RE_TRACK_PATTERN_OFFSETS
3444 mjd_offset= Node_Offset((convert));
3445 mjd_nodelen= Node_Length((convert));
3448 /* whole branch chain */
3450 #ifdef RE_TRACK_PATTERN_OFFSETS
3453 const regnode *nop = NEXTOPER( convert );
3454 mjd_offset= Node_Offset((nop));
3455 mjd_nodelen= Node_Length((nop));
3459 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3461 (UV)mjd_offset, (UV)mjd_nodelen)
3464 /* But first we check to see if there is a common prefix we can
3465 split out as an EXACT and put in front of the TRIE node. */
3466 trie->startstate= 1;
3467 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3468 /* we want to find the first state that has more than
3469 * one transition, if that state is not the first state
3470 * then we have a common prefix which we can remove.
3473 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3475 I32 first_ofs = -1; /* keeps track of the ofs of the first
3476 transition, -1 means none */
3478 const U32 base = trie->states[ state ].trans.base;
3480 /* does this state terminate an alternation? */
3481 if ( trie->states[state].wordnum )
3484 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3485 if ( ( base + ofs >= trie->uniquecharcount ) &&
3486 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3487 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3489 if ( ++count > 1 ) {
3490 /* we have more than one transition */
3493 /* if this is the first state there is no common prefix
3494 * to extract, so we can exit */
3495 if ( state == 1 ) break;
3496 tmp = av_fetch( revcharmap, ofs, 0);
3497 ch = (U8*)SvPV_nolen_const( *tmp );
3499 /* if we are on count 2 then we need to initialize the
3500 * bitmap, and store the previous char if there was one
3503 /* clear the bitmap */
3504 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3506 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3509 if (first_ofs >= 0) {
3510 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3511 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3513 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3515 Perl_re_printf( aTHX_ "%s", (char*)ch)
3519 /* store the current firstchar in the bitmap */
3520 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3521 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3527 /* This state has only one transition, its transition is part
3528 * of a common prefix - we need to concatenate the char it
3529 * represents to what we have so far. */
3530 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3532 char *ch = SvPV( *tmp, len );
3534 SV *sv=sv_newmortal();
3535 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3537 (UV)state, (UV)first_ofs,
3538 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3539 PL_colors[0], PL_colors[1],
3540 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3541 PERL_PV_ESCAPE_FIRSTCHAR
3546 OP( convert ) = nodetype;
3547 str=STRING(convert);
3550 STR_LEN(convert) += len;
3556 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3561 trie->prefixlen = (state-1);
3563 regnode *n = convert+NODE_SZ_STR(convert);
3564 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3565 trie->startstate = state;
3566 trie->minlen -= (state - 1);
3567 trie->maxlen -= (state - 1);
3569 /* At least the UNICOS C compiler choked on this
3570 * being argument to DEBUG_r(), so let's just have
3573 #ifdef PERL_EXT_RE_BUILD
3579 regnode *fix = convert;
3580 U32 word = trie->wordcount;
3581 #ifdef RE_TRACK_PATTERN_OFFSETS
3584 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3585 while( ++fix < n ) {
3586 Set_Node_Offset_Length(fix, 0, 0);
3589 SV ** const tmp = av_fetch( trie_words, word, 0 );
3591 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3592 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3594 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3602 NEXT_OFF(convert) = (U16)(tail - convert);
3603 DEBUG_r(optimize= n);
3609 if ( trie->maxlen ) {
3610 NEXT_OFF( convert ) = (U16)(tail - convert);
3611 ARG_SET( convert, data_slot );
3612 /* Store the offset to the first unabsorbed branch in
3613 jump[0], which is otherwise unused by the jump logic.
3614 We use this when dumping a trie and during optimisation. */
3616 trie->jump[0] = (U16)(nextbranch - convert);
3618 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3619 * and there is a bitmap
3620 * and the first "jump target" node we found leaves enough room
3621 * then convert the TRIE node into a TRIEC node, with the bitmap
3622 * embedded inline in the opcode - this is hypothetically faster.
3624 if ( !trie->states[trie->startstate].wordnum
3626 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3628 OP( convert ) = TRIEC;
3629 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3630 PerlMemShared_free(trie->bitmap);
3633 OP( convert ) = TRIE;
3635 /* store the type in the flags */
3636 convert->flags = nodetype;
3640 + regarglen[ OP( convert ) ];
3642 /* XXX We really should free up the resource in trie now,
3643 as we won't use them - (which resources?) dmq */
3645 /* needed for dumping*/
3646 DEBUG_r(if (optimize) {
3647 regnode *opt = convert;
3649 while ( ++opt < optimize) {
3650 Set_Node_Offset_Length(opt, 0, 0);
3653 Try to clean up some of the debris left after the
3656 while( optimize < jumper ) {
3657 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3658 OP( optimize ) = OPTIMIZED;
3659 Set_Node_Offset_Length(optimize, 0, 0);
3662 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3664 } /* end node insert */
3666 /* Finish populating the prev field of the wordinfo array. Walk back
3667 * from each accept state until we find another accept state, and if
3668 * so, point the first word's .prev field at the second word. If the
3669 * second already has a .prev field set, stop now. This will be the
3670 * case either if we've already processed that word's accept state,
3671 * or that state had multiple words, and the overspill words were
3672 * already linked up earlier.
3679 for (word=1; word <= trie->wordcount; word++) {
3681 if (trie->wordinfo[word].prev)
3683 state = trie->wordinfo[word].accept;
3685 state = prev_states[state];
3688 prev = trie->states[state].wordnum;
3692 trie->wordinfo[word].prev = prev;
3694 Safefree(prev_states);
3698 /* and now dump out the compressed format */
3699 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3701 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3703 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3704 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3706 SvREFCNT_dec_NN(revcharmap);
3710 : trie->startstate>1
3716 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3718 /* The Trie is constructed and compressed now so we can build a fail array if
3721 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3723 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3727 We find the fail state for each state in the trie, this state is the longest
3728 proper suffix of the current state's 'word' that is also a proper prefix of
3729 another word in our trie. State 1 represents the word '' and is thus the
3730 default fail state. This allows the DFA not to have to restart after its
3731 tried and failed a word at a given point, it simply continues as though it
3732 had been matching the other word in the first place.
3734 'abcdgu'=~/abcdefg|cdgu/
3735 When we get to 'd' we are still matching the first word, we would encounter
3736 'g' which would fail, which would bring us to the state representing 'd' in
3737 the second word where we would try 'g' and succeed, proceeding to match
3740 /* add a fail transition */
3741 const U32 trie_offset = ARG(source);
3742 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3744 const U32 ucharcount = trie->uniquecharcount;
3745 const U32 numstates = trie->statecount;
3746 const U32 ubound = trie->lasttrans + ucharcount;
3750 U32 base = trie->states[ 1 ].trans.base;
3753 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3755 GET_RE_DEBUG_FLAGS_DECL;
3757 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3758 PERL_UNUSED_CONTEXT;
3760 PERL_UNUSED_ARG(depth);
3763 if ( OP(source) == TRIE ) {
3764 struct regnode_1 *op = (struct regnode_1 *)
3765 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3766 StructCopy(source, op, struct regnode_1);
3767 stclass = (regnode *)op;
3769 struct regnode_charclass *op = (struct regnode_charclass *)
3770 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3771 StructCopy(source, op, struct regnode_charclass);
3772 stclass = (regnode *)op;
3774 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3776 ARG_SET( stclass, data_slot );
3777 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3778 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3779 aho->trie=trie_offset;
3780 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3781 Copy( trie->states, aho->states, numstates, reg_trie_state );
3782 Newx( q, numstates, U32);
3783 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3786 /* initialize fail[0..1] to be 1 so that we always have
3787 a valid final fail state */
3788 fail[ 0 ] = fail[ 1 ] = 1;
3790 for ( charid = 0; charid < ucharcount ; charid++ ) {
3791 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3793 q[ q_write ] = newstate;
3794 /* set to point at the root */
3795 fail[ q[ q_write++ ] ]=1;
3798 while ( q_read < q_write) {
3799 const U32 cur = q[ q_read++ % numstates ];
3800 base = trie->states[ cur ].trans.base;
3802 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3803 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3805 U32 fail_state = cur;
3808 fail_state = fail[ fail_state ];
3809 fail_base = aho->states[ fail_state ].trans.base;
3810 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3812 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3813 fail[ ch_state ] = fail_state;
3814 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3816 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3818 q[ q_write++ % numstates] = ch_state;
3822 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3823 when we fail in state 1, this allows us to use the
3824 charclass scan to find a valid start char. This is based on the principle
3825 that theres a good chance the string being searched contains lots of stuff
3826 that cant be a start char.
3828 fail[ 0 ] = fail[ 1 ] = 0;
3829 DEBUG_TRIE_COMPILE_r({
3830 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3831 depth, (UV)numstates
3833 for( q_read=1; q_read<numstates; q_read++ ) {
3834 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3836 Perl_re_printf( aTHX_ "\n");
3839 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3844 /* The below joins as many adjacent EXACTish nodes as possible into a single
3845 * one. The regop may be changed if the node(s) contain certain sequences that
3846 * require special handling. The joining is only done if:
3847 * 1) there is room in the current conglomerated node to entirely contain the
3849 * 2) they are compatible node types
3851 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3852 * these get optimized out
3854 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3855 * as possible, even if that means splitting an existing node so that its first
3856 * part is moved to the preceeding node. This would maximise the efficiency of
3857 * memEQ during matching.
3859 * If a node is to match under /i (folded), the number of characters it matches
3860 * can be different than its character length if it contains a multi-character
3861 * fold. *min_subtract is set to the total delta number of characters of the
3864 * And *unfolded_multi_char is set to indicate whether or not the node contains
3865 * an unfolded multi-char fold. This happens when it won't be known until
3866 * runtime whether the fold is valid or not; namely
3867 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3868 * target string being matched against turns out to be UTF-8 is that fold
3870 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3872 * (Multi-char folds whose components are all above the Latin1 range are not
3873 * run-time locale dependent, and have already been folded by the time this
3874 * function is called.)
3876 * This is as good a place as any to discuss the design of handling these
3877 * multi-character fold sequences. It's been wrong in Perl for a very long
3878 * time. There are three code points in Unicode whose multi-character folds
3879 * were long ago discovered to mess things up. The previous designs for
3880 * dealing with these involved assigning a special node for them. This
3881 * approach doesn't always work, as evidenced by this example:
3882 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3883 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3884 * would match just the \xDF, it won't be able to handle the case where a
3885 * successful match would have to cross the node's boundary. The new approach
3886 * that hopefully generally solves the problem generates an EXACTFUP node
3887 * that is "sss" in this case.
3889 * It turns out that there are problems with all multi-character folds, and not
3890 * just these three. Now the code is general, for all such cases. The
3891 * approach taken is:
3892 * 1) This routine examines each EXACTFish node that could contain multi-
3893 * character folded sequences. Since a single character can fold into
3894 * such a sequence, the minimum match length for this node is less than
3895 * the number of characters in the node. This routine returns in
3896 * *min_subtract how many characters to subtract from the the actual
3897 * length of the string to get a real minimum match length; it is 0 if
3898 * there are no multi-char foldeds. This delta is used by the caller to
3899 * adjust the min length of the match, and the delta between min and max,
3900 * so that the optimizer doesn't reject these possibilities based on size
3903 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
3904 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
3905 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
3906 * EXACTFU nodes. The node type of such nodes is then changed to
3907 * EXACTFUP, indicating it is problematic, and needs careful handling.
3908 * (The procedures in step 1) above are sufficient to handle this case in
3909 * UTF-8 encoded nodes.) The reason this is problematic is that this is
3910 * the only case where there is a possible fold length change in non-UTF-8
3911 * patterns. By reserving a special node type for problematic cases, the
3912 * far more common regular EXACTFU nodes can be processed faster.
3913 * regexec.c takes advantage of this.
3915 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
3916 * problematic cases. These all only occur when the pattern is not
3917 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
3918 * length change, it handles the situation where the string cannot be
3919 * entirely folded. The strings in an EXACTFish node are folded as much
3920 * as possible during compilation in regcomp.c. This saves effort in
3921 * regex matching. By using an EXACTFUP node when it is not possible to
3922 * fully fold at compile time, regexec.c can know that everything in an
3923 * EXACTFU node is folded, so folding can be skipped at runtime. The only
3924 * case where folding in EXACTFU nodes can't be done at compile time is
3925 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
3926 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
3927 * handle two very different cases. Alternatively, there could have been
3928 * a node type where there are length changes, one for unfolded, and one
3929 * for both. If yet another special case needed to be created, the number
3930 * of required node types would have to go to 7. khw figures that even
3931 * though there are plenty of node types to spare, that the maintenance
3932 * cost wasn't worth the small speedup of doing it that way, especially
3933 * since he thinks the MICRO SIGN is rarely encountered in practice.
3935 * There are other cases where folding isn't done at compile time, but
3936 * none of them are under /u, and hence not for EXACTFU nodes. The folds
3937 * in EXACTFL nodes aren't known until runtime, and vary as the locale
3938 * changes. Some folds in EXACTF depend on if the runtime target string
3939 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
3940 * when no fold in it depends on the UTF-8ness of the target string.)
3942 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3943 * validity of the fold won't be known until runtime, and so must remain
3944 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
3945 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3946 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3947 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3948 * The reason this is a problem is that the optimizer part of regexec.c
3949 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3950 * that a character in the pattern corresponds to at most a single
3951 * character in the target string. (And I do mean character, and not byte
3952 * here, unlike other parts of the documentation that have never been
3953 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
3954 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
3955 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
3956 * EXACTFL nodes, violate the assumption, and they are the only instances
3957 * where it is violated. I'm reluctant to try to change the assumption,
3958 * as the code involved is impenetrable to me (khw), so instead the code
3959 * here punts. This routine examines EXACTFL nodes, and (when the pattern
3960 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
3961 * boolean indicating whether or not the node contains such a fold. When
3962 * it is true, the caller sets a flag that later causes the optimizer in
3963 * this file to not set values for the floating and fixed string lengths,
3964 * and thus avoids the optimizer code in regexec.c that makes the invalid
3965 * assumption. Thus, there is no optimization based on string lengths for
3966 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3967 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
3968 * assumption is wrong only in these cases is that all other non-UTF-8
3969 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3970 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3971 * EXACTF nodes because we don't know at compile time if it actually
3972 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3973 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3974 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
3975 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3976 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3977 * string would require the pattern to be forced into UTF-8, the overhead
3978 * of which we want to avoid. Similarly the unfolded multi-char folds in
3979 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3982 * Similarly, the code that generates tries doesn't currently handle
3983 * not-already-folded multi-char folds, and it looks like a pain to change
3984 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
3985 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
3986 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
3987 * using /iaa matching will be doing so almost entirely with ASCII
3988 * strings, so this should rarely be encountered in practice */
3990 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3991 if (PL_regkind[OP(scan)] == EXACT) \
3992 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags), NULL, depth+1)
3995 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3996 UV *min_subtract, bool *unfolded_multi_char,
3997 U32 flags, regnode *val, U32 depth)
3999 /* Merge several consecutive EXACTish nodes into one. */
4001 regnode *n = regnext(scan);
4003 regnode *next = scan + NODE_SZ_STR(scan);
4007 regnode *stop = scan;
4008 GET_RE_DEBUG_FLAGS_DECL;
4010 PERL_UNUSED_ARG(depth);
4013 PERL_ARGS_ASSERT_JOIN_EXACT;
4014 #ifndef EXPERIMENTAL_INPLACESCAN
4015 PERL_UNUSED_ARG(flags);
4016 PERL_UNUSED_ARG(val);
4018 DEBUG_PEEP("join", scan, depth, 0);
4020 assert(PL_regkind[OP(scan)] == EXACT);
4022 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4023 * EXACT ones that are mergeable to the current one. */
4025 && ( PL_regkind[OP(n)] == NOTHING
4026 || (stringok && PL_regkind[OP(n)] == EXACT))
4028 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4031 if (OP(n) == TAIL || n > next)
4033 if (PL_regkind[OP(n)] == NOTHING) {
4034 DEBUG_PEEP("skip:", n, depth, 0);
4035 NEXT_OFF(scan) += NEXT_OFF(n);
4036 next = n + NODE_STEP_REGNODE;
4043 else if (stringok) {
4044 const unsigned int oldl = STR_LEN(scan);
4045 regnode * const nnext = regnext(n);
4047 /* XXX I (khw) kind of doubt that this works on platforms (should
4048 * Perl ever run on one) where U8_MAX is above 255 because of lots
4049 * of other assumptions */
4050 /* Don't join if the sum can't fit into a single node */
4051 if (oldl + STR_LEN(n) > U8_MAX)
4054 /* Joining something that requires UTF-8 with something that
4055 * doesn't, means the result requires UTF-8. */
4056 if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
4057 OP(scan) = EXACT_ONLY8;
4059 else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
4060 ; /* join is compatible, no need to change OP */
4062 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
4063 OP(scan) = EXACTFU_ONLY8;
4065 else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
4066 ; /* join is compatible, no need to change OP */
4068 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4069 ; /* join is compatible, no need to change OP */
4071 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4073 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4074 * which can join with EXACTFU ones. We check for this case
4075 * here. These need to be resolved to either EXACTFU or
4076 * EXACTF at joining time. They have nothing in them that
4077 * would forbid them from being the more desirable EXACTFU
4078 * nodes except that they begin and/or end with a single [Ss].
4079 * The reason this is problematic is because they could be
4080 * joined in this loop with an adjacent node that ends and/or
4081 * begins with [Ss] which would then form the sequence 'ss',
4082 * which matches differently under /di than /ui, in which case
4083 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4084 * formed, the nodes get absorbed into any adjacent EXACTFU
4085 * node. And if the only adjacent node is EXACTF, they get
4086 * absorbed into that, under the theory that a longer node is
4087 * better than two shorter ones, even if one is EXACTFU. Note
4088 * that EXACTFU_ONLY8 is generated only for UTF-8 patterns,
4089 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4091 if (STRING(n)[STR_LEN(n)-1] == 's') {
4093 /* Here the joined node would end with 's'. If the node
4094 * following the combination is an EXACTF one, it's better to
4095 * join this trailing edge 's' node with that one, leaving the
4096 * current one in 'scan' be the more desirable EXACTFU */
4097 if (OP(nnext) == EXACTF) {
4101 OP(scan) = EXACTFU_S_EDGE;
4103 } /* Otherwise, the beginning 's' of the 2nd node just
4104 becomes an interior 's' in 'scan' */
4106 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4107 ; /* join is compatible, no need to change OP */
4109 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4111 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4112 * nodes. But the latter nodes can be also joined with EXACTFU
4113 * ones, and that is a better outcome, so if the node following
4114 * 'n' is EXACTFU, quit now so that those two can be joined
4116 if (OP(nnext) == EXACTFU) {
4120 /* The join is compatible, and the combined node will be
4121 * EXACTF. (These don't care if they begin or end with 's' */
4123 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4124 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4125 && STRING(n)[0] == 's')
4127 /* When combined, we have the sequence 'ss', which means we
4128 * have to remain /di */
4132 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4133 if (STRING(n)[0] == 's') {
4134 ; /* Here the join is compatible and the combined node
4135 starts with 's', no need to change OP */
4137 else { /* Now the trailing 's' is in the interior */
4141 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4143 /* The join is compatible, and the combined node will be
4144 * EXACTF. (These don't care if they begin or end with 's' */
4147 else if (OP(scan) != OP(n)) {
4149 /* The only other compatible joinings are the same node type */
4153 DEBUG_PEEP("merg", n, depth, 0);
4156 NEXT_OFF(scan) += NEXT_OFF(n);
4157 STR_LEN(scan) += STR_LEN(n);
4158 next = n + NODE_SZ_STR(n);
4159 /* Now we can overwrite *n : */
4160 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4168 #ifdef EXPERIMENTAL_INPLACESCAN
4169 if (flags && !NEXT_OFF(n)) {
4170 DEBUG_PEEP("atch", val, depth, 0);
4171 if (reg_off_by_arg[OP(n)]) {
4172 ARG_SET(n, val - n);
4175 NEXT_OFF(n) = val - n;
4182 /* This temporary node can now be turned into EXACTFU, and must, as
4183 * regexec.c doesn't handle it */
4184 if (OP(scan) == EXACTFU_S_EDGE) {
4189 *unfolded_multi_char = FALSE;
4191 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4192 * can now analyze for sequences of problematic code points. (Prior to
4193 * this final joining, sequences could have been split over boundaries, and
4194 * hence missed). The sequences only happen in folding, hence for any
4195 * non-EXACT EXACTish node */
4196 if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
4197 U8* s0 = (U8*) STRING(scan);
4199 U8* s_end = s0 + STR_LEN(scan);
4201 int total_count_delta = 0; /* Total delta number of characters that
4202 multi-char folds expand to */
4204 /* One pass is made over the node's string looking for all the
4205 * possibilities. To avoid some tests in the loop, there are two main
4206 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4211 if (OP(scan) == EXACTFL) {
4214 /* An EXACTFL node would already have been changed to another
4215 * node type unless there is at least one character in it that
4216 * is problematic; likely a character whose fold definition
4217 * won't be known until runtime, and so has yet to be folded.
4218 * For all but the UTF-8 locale, folds are 1-1 in length, but
4219 * to handle the UTF-8 case, we need to create a temporary
4220 * folded copy using UTF-8 locale rules in order to analyze it.
4221 * This is because our macros that look to see if a sequence is
4222 * a multi-char fold assume everything is folded (otherwise the
4223 * tests in those macros would be too complicated and slow).
4224 * Note that here, the non-problematic folds will have already
4225 * been done, so we can just copy such characters. We actually
4226 * don't completely fold the EXACTFL string. We skip the
4227 * unfolded multi-char folds, as that would just create work
4228 * below to figure out the size they already are */
4230 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4233 STRLEN s_len = UTF8SKIP(s);
4234 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4235 Copy(s, d, s_len, U8);
4238 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4239 *unfolded_multi_char = TRUE;
4240 Copy(s, d, s_len, U8);
4243 else if (isASCII(*s)) {
4244 *(d++) = toFOLD(*s);
4248 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4254 /* Point the remainder of the routine to look at our temporary
4258 } /* End of creating folded copy of EXACTFL string */
4260 /* Examine the string for a multi-character fold sequence. UTF-8
4261 * patterns have all characters pre-folded by the time this code is
4263 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4264 length sequence we are looking for is 2 */
4266 int count = 0; /* How many characters in a multi-char fold */
4267 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4268 if (! len) { /* Not a multi-char fold: get next char */
4273 { /* Here is a generic multi-char fold. */
4274 U8* multi_end = s + len;
4276 /* Count how many characters are in it. In the case of
4277 * /aa, no folds which contain ASCII code points are
4278 * allowed, so check for those, and skip if found. */
4279 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4280 count = utf8_length(s, multi_end);
4284 while (s < multi_end) {
4287 goto next_iteration;
4297 /* The delta is how long the sequence is minus 1 (1 is how long
4298 * the character that folds to the sequence is) */
4299 total_count_delta += count - 1;
4303 /* We created a temporary folded copy of the string in EXACTFL
4304 * nodes. Therefore we need to be sure it doesn't go below zero,
4305 * as the real string could be shorter */
4306 if (OP(scan) == EXACTFL) {
4307 int total_chars = utf8_length((U8*) STRING(scan),
4308 (U8*) STRING(scan) + STR_LEN(scan));
4309 if (total_count_delta > total_chars) {
4310 total_count_delta = total_chars;
4314 *min_subtract += total_count_delta;
4317 else if (OP(scan) == EXACTFAA) {
4319 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4320 * fold to the ASCII range (and there are no existing ones in the
4321 * upper latin1 range). But, as outlined in the comments preceding
4322 * this function, we need to flag any occurrences of the sharp s.
4323 * This character forbids trie formation (because of added
4325 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4326 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4327 || UNICODE_DOT_DOT_VERSION > 0)
4329 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4330 OP(scan) = EXACTFAA_NO_TRIE;
4331 *unfolded_multi_char = TRUE;
4339 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4340 * folds that are all Latin1. As explained in the comments
4341 * preceding this function, we look also for the sharp s in EXACTF
4342 * and EXACTFL nodes; it can be in the final position. Otherwise
4343 * we can stop looking 1 byte earlier because have to find at least
4344 * two characters for a multi-fold */
4345 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4350 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4351 if (! len) { /* Not a multi-char fold. */
4352 if (*s == LATIN_SMALL_LETTER_SHARP_S
4353 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4355 *unfolded_multi_char = TRUE;
4362 && isALPHA_FOLD_EQ(*s, 's')
4363 && isALPHA_FOLD_EQ(*(s+1), 's'))
4366 /* EXACTF nodes need to know that the minimum length
4367 * changed so that a sharp s in the string can match this
4368 * ss in the pattern, but they remain EXACTF nodes, as they
4369 * won't match this unless the target string is is UTF-8,
4370 * which we don't know until runtime. EXACTFL nodes can't
4371 * transform into EXACTFU nodes */
4372 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4373 OP(scan) = EXACTFUP;
4377 *min_subtract += len - 1;
4383 if ( STR_LEN(scan) == 1
4384 && isALPHA_A(* STRING(scan))
4385 && ( OP(scan) == EXACTFAA
4386 || ( OP(scan) == EXACTFU
4387 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
4389 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
4391 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
4392 * with the mask set to the complement of the bit that differs
4393 * between upper and lower case, and the lowest code point of the
4394 * pair (which the '&' forces) */
4396 ARG_SET(scan, *STRING(scan) & mask);
4402 /* Allow dumping but overwriting the collection of skipped
4403 * ops and/or strings with fake optimized ops */
4404 n = scan + NODE_SZ_STR(scan);
4412 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4416 /* REx optimizer. Converts nodes into quicker variants "in place".
4417 Finds fixed substrings. */
4419 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4420 to the position after last scanned or to NULL. */
4422 #define INIT_AND_WITHP \
4423 assert(!and_withp); \
4424 Newx(and_withp, 1, regnode_ssc); \
4425 SAVEFREEPV(and_withp)
4429 S_unwind_scan_frames(pTHX_ const void *p)
4431 scan_frame *f= (scan_frame *)p;
4433 scan_frame *n= f->next_frame;
4439 /* the return from this sub is the minimum length that could possibly match */
4441 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4442 SSize_t *minlenp, SSize_t *deltap,
4447 regnode_ssc *and_withp,
4448 U32 flags, U32 depth)
4449 /* scanp: Start here (read-write). */
4450 /* deltap: Write maxlen-minlen here. */
4451 /* last: Stop before this one. */
4452 /* data: string data about the pattern */
4453 /* stopparen: treat close N as END */
4454 /* recursed: which subroutines have we recursed into */
4455 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4458 /* There must be at least this number of characters to match */
4461 regnode *scan = *scanp, *next;
4463 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4464 int is_inf_internal = 0; /* The studied chunk is infinite */
4465 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4466 scan_data_t data_fake;
4467 SV *re_trie_maxbuff = NULL;
4468 regnode *first_non_open = scan;
4469 SSize_t stopmin = SSize_t_MAX;
4470 scan_frame *frame = NULL;
4471 GET_RE_DEBUG_FLAGS_DECL;
4473 PERL_ARGS_ASSERT_STUDY_CHUNK;
4474 RExC_study_started= 1;
4476 Zero(&data_fake, 1, scan_data_t);
4479 while (first_non_open && OP(first_non_open) == OPEN)
4480 first_non_open=regnext(first_non_open);
4486 RExC_study_chunk_recursed_count++;
4488 DEBUG_OPTIMISE_MORE_r(
4490 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4491 depth, (long)stopparen,
4492 (unsigned long)RExC_study_chunk_recursed_count,
4493 (unsigned long)depth, (unsigned long)recursed_depth,
4496 if (recursed_depth) {
4499 for ( j = 0 ; j < recursed_depth ; j++ ) {
4500 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4502 PAREN_TEST(RExC_study_chunk_recursed +
4503 ( j * RExC_study_chunk_recursed_bytes), i )
4506 !PAREN_TEST(RExC_study_chunk_recursed +
4507 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
4510 Perl_re_printf( aTHX_ " %d",(int)i);
4514 if ( j + 1 < recursed_depth ) {
4515 Perl_re_printf( aTHX_ ",");
4519 Perl_re_printf( aTHX_ "\n");
4522 while ( scan && OP(scan) != END && scan < last ){
4523 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4524 node length to get a real minimum (because
4525 the folded version may be shorter) */
4526 bool unfolded_multi_char = FALSE;
4527 /* Peephole optimizer: */
4528 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4529 DEBUG_PEEP("Peep", scan, depth, flags);
4532 /* The reason we do this here is that we need to deal with things like
4533 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4534 * parsing code, as each (?:..) is handled by a different invocation of
4537 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
4539 /* Follow the next-chain of the current node and optimize
4540 away all the NOTHINGs from it. */
4541 if (OP(scan) != CURLYX) {
4542 const int max = (reg_off_by_arg[OP(scan)]
4544 /* I32 may be smaller than U16 on CRAYs! */
4545 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4546 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
4550 /* Skip NOTHING and LONGJMP. */
4551 while ((n = regnext(n))
4552 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4553 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
4554 && off + noff < max)
4556 if (reg_off_by_arg[OP(scan)])
4559 NEXT_OFF(scan) = off;
4562 /* The principal pseudo-switch. Cannot be a switch, since we
4563 look into several different things. */
4564 if ( OP(scan) == DEFINEP ) {
4566 SSize_t deltanext = 0;
4567 SSize_t fake_last_close = 0;
4568 I32 f = SCF_IN_DEFINE;
4570 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4571 scan = regnext(scan);
4572 assert( OP(scan) == IFTHEN );
4573 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4575 data_fake.last_closep= &fake_last_close;
4577 next = regnext(scan);
4578 scan = NEXTOPER(NEXTOPER(scan));
4579 DEBUG_PEEP("scan", scan, depth, flags);
4580 DEBUG_PEEP("next", next, depth, flags);
4582 /* we suppose the run is continuous, last=next...
4583 * NOTE we dont use the return here! */
4584 /* DEFINEP study_chunk() recursion */
4585 (void)study_chunk(pRExC_state, &scan, &minlen,
4586 &deltanext, next, &data_fake, stopparen,
4587 recursed_depth, NULL, f, depth+1);
4592 OP(scan) == BRANCH ||
4593 OP(scan) == BRANCHJ ||
4596 next = regnext(scan);
4599 /* The op(next)==code check below is to see if we
4600 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4601 * IFTHEN is special as it might not appear in pairs.
4602 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4603 * we dont handle it cleanly. */
4604 if (OP(next) == code || code == IFTHEN) {
4605 /* NOTE - There is similar code to this block below for
4606 * handling TRIE nodes on a re-study. If you change stuff here
4607 * check there too. */
4608 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
4610 regnode * const startbranch=scan;
4612 if (flags & SCF_DO_SUBSTR) {
4613 /* Cannot merge strings after this. */
4614 scan_commit(pRExC_state, data, minlenp, is_inf);
4617 if (flags & SCF_DO_STCLASS)
4618 ssc_init_zero(pRExC_state, &accum);
4620 while (OP(scan) == code) {
4621 SSize_t deltanext, minnext, fake;
4623 regnode_ssc this_class;
4625 DEBUG_PEEP("Branch", scan, depth, flags);
4628 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4630 data_fake.whilem_c = data->whilem_c;
4631 data_fake.last_closep = data->last_closep;
4634 data_fake.last_closep = &fake;
4636 data_fake.pos_delta = delta;
4637 next = regnext(scan);
4639 scan = NEXTOPER(scan); /* everything */
4640 if (code != BRANCH) /* everything but BRANCH */
4641 scan = NEXTOPER(scan);
4643 if (flags & SCF_DO_STCLASS) {
4644 ssc_init(pRExC_state, &this_class);
4645 data_fake.start_class = &this_class;
4646 f = SCF_DO_STCLASS_AND;
4648 if (flags & SCF_WHILEM_VISITED_POS)
4649 f |= SCF_WHILEM_VISITED_POS;
4651 /* we suppose the run is continuous, last=next...*/
4652 /* recurse study_chunk() for each BRANCH in an alternation */
4653 minnext = study_chunk(pRExC_state, &scan, minlenp,
4654 &deltanext, next, &data_fake, stopparen,
4655 recursed_depth, NULL, f, depth+1);
4659 if (deltanext == SSize_t_MAX) {
4660 is_inf = is_inf_internal = 1;
4662 } else if (max1 < minnext + deltanext)
4663 max1 = minnext + deltanext;
4665 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4667 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4668 if ( stopmin > minnext)
4669 stopmin = min + min1;
4670 flags &= ~SCF_DO_SUBSTR;
4672 data->flags |= SCF_SEEN_ACCEPT;
4675 if (data_fake.flags & SF_HAS_EVAL)
4676 data->flags |= SF_HAS_EVAL;
4677 data->whilem_c = data_fake.whilem_c;
4679 if (flags & SCF_DO_STCLASS)
4680 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4682 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4684 if (flags & SCF_DO_SUBSTR) {
4685 data->pos_min += min1;
4686 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
4687 data->pos_delta = SSize_t_MAX;
4689 data->pos_delta += max1 - min1;
4690 if (max1 != min1 || is_inf)
4691 data->cur_is_floating = 1;
4694 if (delta == SSize_t_MAX
4695 || SSize_t_MAX - delta - (max1 - min1) < 0)
4696 delta = SSize_t_MAX;
4698 delta += max1 - min1;
4699 if (flags & SCF_DO_STCLASS_OR) {
4700 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4702 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4703 flags &= ~SCF_DO_STCLASS;
4706 else if (flags & SCF_DO_STCLASS_AND) {
4708 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4709 flags &= ~SCF_DO_STCLASS;
4712 /* Switch to OR mode: cache the old value of
4713 * data->start_class */
4715 StructCopy(data->start_class, and_withp, regnode_ssc);
4716 flags &= ~SCF_DO_STCLASS_AND;
4717 StructCopy(&accum, data->start_class, regnode_ssc);
4718 flags |= SCF_DO_STCLASS_OR;
4722 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4723 OP( startbranch ) == BRANCH )
4727 Assuming this was/is a branch we are dealing with: 'scan'
4728 now points at the item that follows the branch sequence,
4729 whatever it is. We now start at the beginning of the
4730 sequence and look for subsequences of
4736 which would be constructed from a pattern like
4739 If we can find such a subsequence we need to turn the first
4740 element into a trie and then add the subsequent branch exact
4741 strings to the trie.
4745 1. patterns where the whole set of branches can be
4748 2. patterns where only a subset can be converted.
4750 In case 1 we can replace the whole set with a single regop
4751 for the trie. In case 2 we need to keep the start and end
4754 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4755 becomes BRANCH TRIE; BRANCH X;
4757 There is an additional case, that being where there is a
4758 common prefix, which gets split out into an EXACT like node
4759 preceding the TRIE node.
4761 If x(1..n)==tail then we can do a simple trie, if not we make
4762 a "jump" trie, such that when we match the appropriate word
4763 we "jump" to the appropriate tail node. Essentially we turn
4764 a nested if into a case structure of sorts.
4769 if (!re_trie_maxbuff) {
4770 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4771 if (!SvIOK(re_trie_maxbuff))
4772 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4774 if ( SvIV(re_trie_maxbuff)>=0 ) {
4776 regnode *first = (regnode *)NULL;
4777 regnode *last = (regnode *)NULL;
4778 regnode *tail = scan;
4782 /* var tail is used because there may be a TAIL
4783 regop in the way. Ie, the exacts will point to the
4784 thing following the TAIL, but the last branch will
4785 point at the TAIL. So we advance tail. If we
4786 have nested (?:) we may have to move through several
4790 while ( OP( tail ) == TAIL ) {
4791 /* this is the TAIL generated by (?:) */
4792 tail = regnext( tail );
4796 DEBUG_TRIE_COMPILE_r({
4797 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4798 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4800 "Looking for TRIE'able sequences. Tail node is ",
4801 (UV) REGNODE_OFFSET(tail),
4802 SvPV_nolen_const( RExC_mysv )
4808 Step through the branches
4809 cur represents each branch,
4810 noper is the first thing to be matched as part
4812 noper_next is the regnext() of that node.
4814 We normally handle a case like this
4815 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4816 support building with NOJUMPTRIE, which restricts
4817 the trie logic to structures like /FOO|BAR/.
4819 If noper is a trieable nodetype then the branch is
4820 a possible optimization target. If we are building
4821 under NOJUMPTRIE then we require that noper_next is
4822 the same as scan (our current position in the regex
4825 Once we have two or more consecutive such branches
4826 we can create a trie of the EXACT's contents and
4827 stitch it in place into the program.
4829 If the sequence represents all of the branches in
4830 the alternation we replace the entire thing with a
4833 Otherwise when it is a subsequence we need to
4834 stitch it in place and replace only the relevant
4835 branches. This means the first branch has to remain
4836 as it is used by the alternation logic, and its
4837 next pointer, and needs to be repointed at the item
4838 on the branch chain following the last branch we
4839 have optimized away.
4841 This could be either a BRANCH, in which case the
4842 subsequence is internal, or it could be the item
4843 following the branch sequence in which case the
4844 subsequence is at the end (which does not
4845 necessarily mean the first node is the start of the
4848 TRIE_TYPE(X) is a define which maps the optype to a
4852 ----------------+-----------
4857 EXACTFU_ONLY8 | EXACTFU
4861 EXACTFLU8 | EXACTFLU8
4865 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4867 : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
4869 : ( EXACTFU == (X) \
4870 || EXACTFU_ONLY8 == (X) \
4871 || EXACTFUP == (X) ) \
4873 : ( EXACTFAA == (X) ) \
4875 : ( EXACTL == (X) ) \
4877 : ( EXACTFLU8 == (X) ) \
4881 /* dont use tail as the end marker for this traverse */
4882 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4883 regnode * const noper = NEXTOPER( cur );
4884 U8 noper_type = OP( noper );
4885 U8 noper_trietype = TRIE_TYPE( noper_type );
4886 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4887 regnode * const noper_next = regnext( noper );
4888 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4889 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
4892 DEBUG_TRIE_COMPILE_r({
4893 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4894 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
4896 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4898 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4899 Perl_re_printf( aTHX_ " -> %d:%s",
4900 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
4903 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4904 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
4905 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
4907 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
4908 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4909 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4913 /* Is noper a trieable nodetype that can be merged
4914 * with the current trie (if there is one)? */
4918 ( noper_trietype == NOTHING )
4919 || ( trietype == NOTHING )
4920 || ( trietype == noper_trietype )
4923 && noper_next >= tail
4927 /* Handle mergable triable node Either we are
4928 * the first node in a new trieable sequence,
4929 * in which case we do some bookkeeping,
4930 * otherwise we update the end pointer. */
4933 if ( noper_trietype == NOTHING ) {
4934 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4935 regnode * const noper_next = regnext( noper );
4936 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4937 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4940 if ( noper_next_trietype ) {
4941 trietype = noper_next_trietype;
4942 } else if (noper_next_type) {
4943 /* a NOTHING regop is 1 regop wide.
4944 * We need at least two for a trie
4945 * so we can't merge this in */
4949 trietype = noper_trietype;
4952 if ( trietype == NOTHING )
4953 trietype = noper_trietype;
4958 } /* end handle mergable triable node */
4960 /* handle unmergable node -
4961 * noper may either be a triable node which can
4962 * not be tried together with the current trie,
4963 * or a non triable node */
4965 /* If last is set and trietype is not
4966 * NOTHING then we have found at least two
4967 * triable branch sequences in a row of a
4968 * similar trietype so we can turn them
4969 * into a trie. If/when we allow NOTHING to
4970 * start a trie sequence this condition
4971 * will be required, and it isn't expensive
4972 * so we leave it in for now. */
4973 if ( trietype && trietype != NOTHING )
4974 make_trie( pRExC_state,
4975 startbranch, first, cur, tail,
4976 count, trietype, depth+1 );
4977 last = NULL; /* note: we clear/update
4978 first, trietype etc below,
4979 so we dont do it here */
4983 && noper_next >= tail
4986 /* noper is triable, so we can start a new
4990 trietype = noper_trietype;
4992 /* if we already saw a first but the
4993 * current node is not triable then we have
4994 * to reset the first information. */
4999 } /* end handle unmergable node */
5000 } /* loop over branches */
5001 DEBUG_TRIE_COMPILE_r({
5002 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5003 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5004 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5005 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5006 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
5007 PL_reg_name[trietype]
5011 if ( last && trietype ) {
5012 if ( trietype != NOTHING ) {
5013 /* the last branch of the sequence was part of
5014 * a trie, so we have to construct it here
5015 * outside of the loop */
5016 made= make_trie( pRExC_state, startbranch,
5017 first, scan, tail, count,
5018 trietype, depth+1 );
5019 #ifdef TRIE_STUDY_OPT
5020 if ( ((made == MADE_EXACT_TRIE &&
5021 startbranch == first)
5022 || ( first_non_open == first )) &&
5024 flags |= SCF_TRIE_RESTUDY;
5025 if ( startbranch == first
5028 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5033 /* at this point we know whatever we have is a
5034 * NOTHING sequence/branch AND if 'startbranch'
5035 * is 'first' then we can turn the whole thing
5038 if ( startbranch == first ) {
5040 /* the entire thing is a NOTHING sequence,
5041 * something like this: (?:|) So we can
5042 * turn it into a plain NOTHING op. */
5043 DEBUG_TRIE_COMPILE_r({
5044 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5045 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5047 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5050 OP(startbranch)= NOTHING;
5051 NEXT_OFF(startbranch)= tail - startbranch;
5052 for ( opt= startbranch + 1; opt < tail ; opt++ )
5056 } /* end if ( last) */
5057 } /* TRIE_MAXBUF is non zero */
5062 else if ( code == BRANCHJ ) { /* single branch is optimized. */
5063 scan = NEXTOPER(NEXTOPER(scan));
5064 } else /* single branch is optimized. */
5065 scan = NEXTOPER(scan);
5067 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5069 regnode *start = NULL;
5070 regnode *end = NULL;
5071 U32 my_recursed_depth= recursed_depth;
5073 if (OP(scan) != SUSPEND) { /* GOSUB */
5074 /* Do setup, note this code has side effects beyond
5075 * the rest of this block. Specifically setting
5076 * RExC_recurse[] must happen at least once during
5079 RExC_recurse[ARG2L(scan)] = scan;
5080 start = REGNODE_p(RExC_open_parens[paren]);
5081 end = REGNODE_p(RExC_close_parens[paren]);
5083 /* NOTE we MUST always execute the above code, even
5084 * if we do nothing with a GOSUB */
5086 ( flags & SCF_IN_DEFINE )
5089 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5091 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5094 /* no need to do anything here if we are in a define. */
5095 /* or we are after some kind of infinite construct
5096 * so we can skip recursing into this item.
5097 * Since it is infinite we will not change the maxlen
5098 * or delta, and if we miss something that might raise
5099 * the minlen it will merely pessimise a little.
5101 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5102 * might result in a minlen of 1 and not of 4,
5103 * but this doesn't make us mismatch, just try a bit
5104 * harder than we should.
5106 scan= regnext(scan);
5113 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
5115 /* it is quite possible that there are more efficient ways
5116 * to do this. We maintain a bitmap per level of recursion
5117 * of which patterns we have entered so we can detect if a
5118 * pattern creates a possible infinite loop. When we
5119 * recurse down a level we copy the previous levels bitmap
5120 * down. When we are at recursion level 0 we zero the top
5121 * level bitmap. It would be nice to implement a different
5122 * more efficient way of doing this. In particular the top
5123 * level bitmap may be unnecessary.
5125 if (!recursed_depth) {
5126 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5128 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
5129 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
5130 RExC_study_chunk_recursed_bytes, U8);
5132 /* we havent recursed into this paren yet, so recurse into it */
5133 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5134 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
5135 my_recursed_depth= recursed_depth + 1;
5137 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5138 /* some form of infinite recursion, assume infinite length
5140 if (flags & SCF_DO_SUBSTR) {
5141 scan_commit(pRExC_state, data, minlenp, is_inf);
5142 data->cur_is_floating = 1;
5144 is_inf = is_inf_internal = 1;
5145 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5146 ssc_anything(data->start_class);
5147 flags &= ~SCF_DO_STCLASS;
5149 start= NULL; /* reset start so we dont recurse later on. */
5154 end = regnext(scan);
5157 scan_frame *newframe;
5159 if (!RExC_frame_last) {
5160 Newxz(newframe, 1, scan_frame);
5161 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5162 RExC_frame_head= newframe;
5164 } else if (!RExC_frame_last->next_frame) {
5165 Newxz(newframe, 1, scan_frame);
5166 RExC_frame_last->next_frame= newframe;
5167 newframe->prev_frame= RExC_frame_last;
5170 newframe= RExC_frame_last->next_frame;
5172 RExC_frame_last= newframe;
5174 newframe->next_regnode = regnext(scan);
5175 newframe->last_regnode = last;
5176 newframe->stopparen = stopparen;
5177 newframe->prev_recursed_depth = recursed_depth;
5178 newframe->this_prev_frame= frame;
5180 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5181 DEBUG_PEEP("fnew", scan, depth, flags);
5188 recursed_depth= my_recursed_depth;
5193 else if ( OP(scan) == EXACT
5194 || OP(scan) == EXACT_ONLY8
5195 || OP(scan) == EXACTL)
5197 SSize_t l = STR_LEN(scan);
5201 const U8 * const s = (U8*)STRING(scan);
5202 uc = utf8_to_uvchr_buf(s, s + l, NULL);
5203 l = utf8_length(s, s + l);
5205 uc = *((U8*)STRING(scan));
5208 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5209 /* The code below prefers earlier match for fixed
5210 offset, later match for variable offset. */
5211 if (data->last_end == -1) { /* Update the start info. */
5212 data->last_start_min = data->pos_min;
5213 data->last_start_max = is_inf
5214 ? SSize_t_MAX : data->pos_min + data->pos_delta;
5216 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
5218 SvUTF8_on(data->last_found);
5220 SV * const sv = data->last_found;
5221 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5222 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5223 if (mg && mg->mg_len >= 0)
5224 mg->mg_len += utf8_length((U8*)STRING(scan),
5225 (U8*)STRING(scan)+STR_LEN(scan));
5227 data->last_end = data->pos_min + l;
5228 data->pos_min += l; /* As in the first entry. */
5229 data->flags &= ~SF_BEFORE_EOL;
5232 /* ANDing the code point leaves at most it, and not in locale, and
5233 * can't match null string */
5234 if (flags & SCF_DO_STCLASS_AND) {
5235 ssc_cp_and(data->start_class, uc);
5236 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5237 ssc_clear_locale(data->start_class);
5239 else if (flags & SCF_DO_STCLASS_OR) {
5240 ssc_add_cp(data->start_class, uc);
5241 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5243 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5244 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5246 flags &= ~SCF_DO_STCLASS;
5248 else if (PL_regkind[OP(scan)] == EXACT) {
5249 /* But OP != EXACT!, so is EXACTFish */
5250 SSize_t l = STR_LEN(scan);
5251 const U8 * s = (U8*)STRING(scan);
5253 /* Search for fixed substrings supports EXACT only. */
5254 if (flags & SCF_DO_SUBSTR) {
5256 scan_commit(pRExC_state, data, minlenp, is_inf);
5259 l = utf8_length(s, s + l);
5261 if (unfolded_multi_char) {
5262 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5264 min += l - min_subtract;
5266 delta += min_subtract;
5267 if (flags & SCF_DO_SUBSTR) {
5268 data->pos_min += l - min_subtract;
5269 if (data->pos_min < 0) {
5272 data->pos_delta += min_subtract;
5274 data->cur_is_floating = 1; /* float */
5278 if (flags & SCF_DO_STCLASS) {
5279 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
5281 assert(EXACTF_invlist);
5282 if (flags & SCF_DO_STCLASS_AND) {
5283 if (OP(scan) != EXACTFL)
5284 ssc_clear_locale(data->start_class);
5285 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5286 ANYOF_POSIXL_ZERO(data->start_class);
5287 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5289 else { /* SCF_DO_STCLASS_OR */
5290 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5291 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5293 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5294 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5296 flags &= ~SCF_DO_STCLASS;
5297 SvREFCNT_dec(EXACTF_invlist);
5300 else if (REGNODE_VARIES(OP(scan))) {
5301 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5302 I32 fl = 0, f = flags;
5303 regnode * const oscan = scan;
5304 regnode_ssc this_class;
5305 regnode_ssc *oclass = NULL;
5306 I32 next_is_eval = 0;
5308 switch (PL_regkind[OP(scan)]) {
5309 case WHILEM: /* End of (?:...)* . */
5310 scan = NEXTOPER(scan);
5313 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5314 next = NEXTOPER(scan);
5315 if ( OP(next) == EXACT
5316 || OP(next) == EXACT_ONLY8
5317 || OP(next) == EXACTL
5318 || (flags & SCF_DO_STCLASS))
5321 maxcount = REG_INFTY;
5322 next = regnext(scan);
5323 scan = NEXTOPER(scan);
5327 if (flags & SCF_DO_SUBSTR)
5332 next = NEXTOPER(scan);
5334 /* This temporary node can now be turned into EXACTFU, and
5335 * must, as regexec.c doesn't handle it */
5336 if (OP(next) == EXACTFU_S_EDGE) {
5340 if ( STR_LEN(next) == 1
5341 && isALPHA_A(* STRING(next))
5342 && ( OP(next) == EXACTFAA
5343 || ( OP(next) == EXACTFU
5344 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
5346 /* These differ in just one bit */
5347 U8 mask = ~ ('A' ^ 'a');
5349 assert(isALPHA_A(* STRING(next)));
5351 /* Then replace it by an ANYOFM node, with
5352 * the mask set to the complement of the
5353 * bit that differs between upper and lower
5354 * case, and the lowest code point of the
5355 * pair (which the '&' forces) */
5357 ARG_SET(next, *STRING(next) & mask);
5361 if (flags & SCF_DO_STCLASS) {
5363 maxcount = REG_INFTY;
5364 next = regnext(scan);
5365 scan = NEXTOPER(scan);
5368 if (flags & SCF_DO_SUBSTR) {
5369 scan_commit(pRExC_state, data, minlenp, is_inf);
5370 /* Cannot extend fixed substrings */
5371 data->cur_is_floating = 1; /* float */
5373 is_inf = is_inf_internal = 1;
5374 scan = regnext(scan);
5375 goto optimize_curly_tail;
5377 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5378 && (scan->flags == stopparen))
5383 mincount = ARG1(scan);
5384 maxcount = ARG2(scan);
5386 next = regnext(scan);
5387 if (OP(scan) == CURLYX) {
5388 I32 lp = (data ? *(data->last_closep) : 0);
5389 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5391 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5392 next_is_eval = (OP(scan) == EVAL);
5394 if (flags & SCF_DO_SUBSTR) {
5396 scan_commit(pRExC_state, data, minlenp, is_inf);
5397 /* Cannot extend fixed substrings */
5398 pos_before = data->pos_min;
5402 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5404 data->flags |= SF_IS_INF;
5406 if (flags & SCF_DO_STCLASS) {
5407 ssc_init(pRExC_state, &this_class);
5408 oclass = data->start_class;
5409 data->start_class = &this_class;
5410 f |= SCF_DO_STCLASS_AND;
5411 f &= ~SCF_DO_STCLASS_OR;
5413 /* Exclude from super-linear cache processing any {n,m}
5414 regops for which the combination of input pos and regex
5415 pos is not enough information to determine if a match
5418 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5419 regex pos at the \s*, the prospects for a match depend not
5420 only on the input position but also on how many (bar\s*)
5421 repeats into the {4,8} we are. */
5422 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5423 f &= ~SCF_WHILEM_VISITED_POS;
5425 /* This will finish on WHILEM, setting scan, or on NULL: */
5426 /* recurse study_chunk() on loop bodies */
5427 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5428 last, data, stopparen, recursed_depth, NULL,
5430 ? (f & ~SCF_DO_SUBSTR)
5434 if (flags & SCF_DO_STCLASS)
5435 data->start_class = oclass;
5436 if (mincount == 0 || minnext == 0) {
5437 if (flags & SCF_DO_STCLASS_OR) {
5438 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5440 else if (flags & SCF_DO_STCLASS_AND) {
5441 /* Switch to OR mode: cache the old value of
5442 * data->start_class */
5444 StructCopy(data->start_class, and_withp, regnode_ssc);
5445 flags &= ~SCF_DO_STCLASS_AND;
5446 StructCopy(&this_class, data->start_class, regnode_ssc);
5447 flags |= SCF_DO_STCLASS_OR;
5448 ANYOF_FLAGS(data->start_class)
5449 |= SSC_MATCHES_EMPTY_STRING;
5451 } else { /* Non-zero len */
5452 if (flags & SCF_DO_STCLASS_OR) {
5453 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5454 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5456 else if (flags & SCF_DO_STCLASS_AND)
5457 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5458 flags &= ~SCF_DO_STCLASS;
5460 if (!scan) /* It was not CURLYX, but CURLY. */
5462 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5463 /* ? quantifier ok, except for (?{ ... }) */
5464 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5465 && (minnext == 0) && (deltanext == 0)
5466 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5467 && maxcount <= REG_INFTY/3) /* Complement check for big
5470 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5471 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5472 "Quantifier unexpected on zero-length expression "
5473 "in regex m/%" UTF8f "/",
5474 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5478 min += minnext * mincount;
5479 is_inf_internal |= deltanext == SSize_t_MAX
5480 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5481 is_inf |= is_inf_internal;
5483 delta = SSize_t_MAX;
5485 delta += (minnext + deltanext) * maxcount
5486 - minnext * mincount;
5488 /* Try powerful optimization CURLYX => CURLYN. */
5489 if ( OP(oscan) == CURLYX && data
5490 && data->flags & SF_IN_PAR
5491 && !(data->flags & SF_HAS_EVAL)
5492 && !deltanext && minnext == 1 ) {
5493 /* Try to optimize to CURLYN. */
5494 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5495 regnode * const nxt1 = nxt;
5502 if (!REGNODE_SIMPLE(OP(nxt))
5503 && !(PL_regkind[OP(nxt)] == EXACT
5504 && STR_LEN(nxt) == 1))
5510 if (OP(nxt) != CLOSE)
5512 if (RExC_open_parens) {
5515 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5518 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5520 /* Now we know that nxt2 is the only contents: */
5521 oscan->flags = (U8)ARG(nxt);
5523 OP(nxt1) = NOTHING; /* was OPEN. */
5526 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5527 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5528 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5529 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5530 OP(nxt + 1) = OPTIMIZED; /* was count. */
5531 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5536 /* Try optimization CURLYX => CURLYM. */
5537 if ( OP(oscan) == CURLYX && data
5538 && !(data->flags & SF_HAS_PAR)
5539 && !(data->flags & SF_HAS_EVAL)
5540 && !deltanext /* atom is fixed width */
5541 && minnext != 0 /* CURLYM can't handle zero width */
5543 /* Nor characters whose fold at run-time may be
5544 * multi-character */
5545 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5547 /* XXXX How to optimize if data == 0? */
5548 /* Optimize to a simpler form. */
5549 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5553 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5554 && (OP(nxt2) != WHILEM))
5556 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5557 /* Need to optimize away parenths. */
5558 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5559 /* Set the parenth number. */
5560 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5562 oscan->flags = (U8)ARG(nxt);
5563 if (RExC_open_parens) {
5565 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5568 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5571 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5572 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5575 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5576 OP(nxt + 1) = OPTIMIZED; /* was count. */
5577 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5578 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5581 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5582 regnode *nnxt = regnext(nxt1);
5584 if (reg_off_by_arg[OP(nxt1)])
5585 ARG_SET(nxt1, nxt2 - nxt1);
5586 else if (nxt2 - nxt1 < U16_MAX)
5587 NEXT_OFF(nxt1) = nxt2 - nxt1;
5589 OP(nxt) = NOTHING; /* Cannot beautify */
5594 /* Optimize again: */
5595 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5596 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5597 NULL, stopparen, recursed_depth, NULL, 0,
5603 else if ((OP(oscan) == CURLYX)
5604 && (flags & SCF_WHILEM_VISITED_POS)
5605 /* See the comment on a similar expression above.
5606 However, this time it's not a subexpression
5607 we care about, but the expression itself. */
5608 && (maxcount == REG_INFTY)
5610 /* This stays as CURLYX, we can put the count/of pair. */
5611 /* Find WHILEM (as in regexec.c) */
5612 regnode *nxt = oscan + NEXT_OFF(oscan);
5614 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5616 nxt = PREVOPER(nxt);
5617 if (nxt->flags & 0xf) {
5618 /* we've already set whilem count on this node */
5619 } else if (++data->whilem_c < 16) {
5620 assert(data->whilem_c <= RExC_whilem_seen);
5621 nxt->flags = (U8)(data->whilem_c
5622 | (RExC_whilem_seen << 4)); /* On WHILEM */
5625 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5627 if (flags & SCF_DO_SUBSTR) {
5628 SV *last_str = NULL;
5629 STRLEN last_chrs = 0;
5630 int counted = mincount != 0;
5632 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5634 SSize_t b = pos_before >= data->last_start_min
5635 ? pos_before : data->last_start_min;
5637 const char * const s = SvPV_const(data->last_found, l);
5638 SSize_t old = b - data->last_start_min;
5642 old = utf8_hop_forward((U8*)s, old,
5643 (U8 *) SvEND(data->last_found))
5646 /* Get the added string: */
5647 last_str = newSVpvn_utf8(s + old, l, UTF);
5648 last_chrs = UTF ? utf8_length((U8*)(s + old),
5649 (U8*)(s + old + l)) : l;
5650 if (deltanext == 0 && pos_before == b) {
5651 /* What was added is a constant string */
5654 SvGROW(last_str, (mincount * l) + 1);
5655 repeatcpy(SvPVX(last_str) + l,
5656 SvPVX_const(last_str), l,
5658 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5659 /* Add additional parts. */
5660 SvCUR_set(data->last_found,
5661 SvCUR(data->last_found) - l);
5662 sv_catsv(data->last_found, last_str);
5664 SV * sv = data->last_found;
5666 SvUTF8(sv) && SvMAGICAL(sv) ?
5667 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5668 if (mg && mg->mg_len >= 0)
5669 mg->mg_len += last_chrs * (mincount-1);
5671 last_chrs *= mincount;
5672 data->last_end += l * (mincount - 1);
5675 /* start offset must point into the last copy */
5676 data->last_start_min += minnext * (mincount - 1);
5677 data->last_start_max =
5680 : data->last_start_max +
5681 (maxcount - 1) * (minnext + data->pos_delta);
5684 /* It is counted once already... */
5685 data->pos_min += minnext * (mincount - counted);
5687 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5688 " SSize_t_MAX=%" UVuf " minnext=%" UVuf
5689 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5690 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
5692 if (deltanext != SSize_t_MAX)
5693 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5694 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5695 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
5697 if (deltanext == SSize_t_MAX
5698 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
5699 data->pos_delta = SSize_t_MAX;
5701 data->pos_delta += - counted * deltanext +
5702 (minnext + deltanext) * maxcount - minnext * mincount;
5703 if (mincount != maxcount) {
5704 /* Cannot extend fixed substrings found inside
5706 scan_commit(pRExC_state, data, minlenp, is_inf);
5707 if (mincount && last_str) {
5708 SV * const sv = data->last_found;
5709 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5710 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5714 sv_setsv(sv, last_str);
5715 data->last_end = data->pos_min;
5716 data->last_start_min = data->pos_min - last_chrs;
5717 data->last_start_max = is_inf
5719 : data->pos_min + data->pos_delta - last_chrs;
5721 data->cur_is_floating = 1; /* float */
5723 SvREFCNT_dec(last_str);
5725 if (data && (fl & SF_HAS_EVAL))
5726 data->flags |= SF_HAS_EVAL;
5727 optimize_curly_tail:
5728 if (OP(oscan) != CURLYX) {
5729 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
5731 NEXT_OFF(oscan) += NEXT_OFF(next);
5737 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5742 if (flags & SCF_DO_SUBSTR) {
5743 /* Cannot expect anything... */
5744 scan_commit(pRExC_state, data, minlenp, is_inf);
5745 data->cur_is_floating = 1; /* float */
5747 is_inf = is_inf_internal = 1;
5748 if (flags & SCF_DO_STCLASS_OR) {
5749 if (OP(scan) == CLUMP) {
5750 /* Actually is any start char, but very few code points
5751 * aren't start characters */
5752 ssc_match_all_cp(data->start_class);
5755 ssc_anything(data->start_class);
5758 flags &= ~SCF_DO_STCLASS;
5762 else if (OP(scan) == LNBREAK) {
5763 if (flags & SCF_DO_STCLASS) {
5764 if (flags & SCF_DO_STCLASS_AND) {
5765 ssc_intersection(data->start_class,
5766 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5767 ssc_clear_locale(data->start_class);
5768 ANYOF_FLAGS(data->start_class)
5769 &= ~SSC_MATCHES_EMPTY_STRING;
5771 else if (flags & SCF_DO_STCLASS_OR) {
5772 ssc_union(data->start_class,
5773 PL_XPosix_ptrs[_CC_VERTSPACE],
5775 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5777 /* See commit msg for
5778 * 749e076fceedeb708a624933726e7989f2302f6a */
5779 ANYOF_FLAGS(data->start_class)
5780 &= ~SSC_MATCHES_EMPTY_STRING;
5782 flags &= ~SCF_DO_STCLASS;
5785 if (delta != SSize_t_MAX)
5786 delta++; /* Because of the 2 char string cr-lf */
5787 if (flags & SCF_DO_SUBSTR) {
5788 /* Cannot expect anything... */
5789 scan_commit(pRExC_state, data, minlenp, is_inf);
5791 if (data->pos_delta != SSize_t_MAX) {
5792 data->pos_delta += 1;
5794 data->cur_is_floating = 1; /* float */
5797 else if (REGNODE_SIMPLE(OP(scan))) {
5799 if (flags & SCF_DO_SUBSTR) {
5800 scan_commit(pRExC_state, data, minlenp, is_inf);
5804 if (flags & SCF_DO_STCLASS) {
5806 SV* my_invlist = NULL;
5809 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5810 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5812 /* Some of the logic below assumes that switching
5813 locale on will only add false positives. */
5818 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5822 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5823 ssc_match_all_cp(data->start_class);
5828 SV* REG_ANY_invlist = _new_invlist(2);
5829 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5831 if (flags & SCF_DO_STCLASS_OR) {
5832 ssc_union(data->start_class,
5834 TRUE /* TRUE => invert, hence all but \n
5838 else if (flags & SCF_DO_STCLASS_AND) {
5839 ssc_intersection(data->start_class,
5841 TRUE /* TRUE => invert */
5843 ssc_clear_locale(data->start_class);
5845 SvREFCNT_dec_NN(REG_ANY_invlist);
5856 if (flags & SCF_DO_STCLASS_AND)
5857 ssc_and(pRExC_state, data->start_class,
5858 (regnode_charclass *) scan);
5860 ssc_or(pRExC_state, data->start_class,
5861 (regnode_charclass *) scan);
5867 SV* cp_list = get_ANYOFM_contents(scan);
5869 if (flags & SCF_DO_STCLASS_OR) {
5870 ssc_union(data->start_class, cp_list, invert);
5872 else if (flags & SCF_DO_STCLASS_AND) {
5873 ssc_intersection(data->start_class, cp_list, invert);
5876 SvREFCNT_dec_NN(cp_list);
5885 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5886 if (flags & SCF_DO_STCLASS_AND) {
5887 bool was_there = cBOOL(
5888 ANYOF_POSIXL_TEST(data->start_class,
5890 ANYOF_POSIXL_ZERO(data->start_class);
5891 if (was_there) { /* Do an AND */
5892 ANYOF_POSIXL_SET(data->start_class, namedclass);
5894 /* No individual code points can now match */
5895 data->start_class->invlist
5896 = sv_2mortal(_new_invlist(0));
5899 int complement = namedclass + ((invert) ? -1 : 1);
5901 assert(flags & SCF_DO_STCLASS_OR);
5903 /* If the complement of this class was already there,
5904 * the result is that they match all code points,
5905 * (\d + \D == everything). Remove the classes from
5906 * future consideration. Locale is not relevant in
5908 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5909 ssc_match_all_cp(data->start_class);
5910 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5911 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5913 else { /* The usual case; just add this class to the
5915 ANYOF_POSIXL_SET(data->start_class, namedclass);
5920 case NPOSIXA: /* For these, we always know the exact set of
5925 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
5926 goto join_posix_and_ascii;
5934 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
5936 /* NPOSIXD matches all upper Latin1 code points unless the
5937 * target string being matched is UTF-8, which is
5938 * unknowable until match time. Since we are going to
5939 * invert, we want to get rid of all of them so that the
5940 * inversion will match all */
5941 if (OP(scan) == NPOSIXD) {
5942 _invlist_subtract(my_invlist, PL_UpperLatin1,
5946 join_posix_and_ascii:
5948 if (flags & SCF_DO_STCLASS_AND) {
5949 ssc_intersection(data->start_class, my_invlist, invert);
5950 ssc_clear_locale(data->start_class);
5953 assert(flags & SCF_DO_STCLASS_OR);
5954 ssc_union(data->start_class, my_invlist, invert);
5956 SvREFCNT_dec(my_invlist);
5958 if (flags & SCF_DO_STCLASS_OR)
5959 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5960 flags &= ~SCF_DO_STCLASS;
5963 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5964 data->flags |= (OP(scan) == MEOL
5967 scan_commit(pRExC_state, data, minlenp, is_inf);
5970 else if ( PL_regkind[OP(scan)] == BRANCHJ
5971 /* Lookbehind, or need to calculate parens/evals/stclass: */
5972 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5973 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5975 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5976 || OP(scan) == UNLESSM )
5978 /* Negative Lookahead/lookbehind
5979 In this case we can't do fixed string optimisation.
5982 SSize_t deltanext, minnext, fake = 0;
5987 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5989 data_fake.whilem_c = data->whilem_c;
5990 data_fake.last_closep = data->last_closep;
5993 data_fake.last_closep = &fake;
5994 data_fake.pos_delta = delta;
5995 if ( flags & SCF_DO_STCLASS && !scan->flags
5996 && OP(scan) == IFMATCH ) { /* Lookahead */
5997 ssc_init(pRExC_state, &intrnl);
5998 data_fake.start_class = &intrnl;
5999 f |= SCF_DO_STCLASS_AND;
6001 if (flags & SCF_WHILEM_VISITED_POS)
6002 f |= SCF_WHILEM_VISITED_POS;
6003 next = regnext(scan);
6004 nscan = NEXTOPER(NEXTOPER(scan));
6006 /* recurse study_chunk() for lookahead body */
6007 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6008 last, &data_fake, stopparen,
6009 recursed_depth, NULL, f, depth+1);
6012 || deltanext > (I32) U8_MAX
6013 || minnext > (I32)U8_MAX
6014 || minnext + deltanext > (I32)U8_MAX)
6016 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6020 /* The 'next_off' field has been repurposed to count the
6021 * additional starting positions to try beyond the initial
6022 * one. (This leaves it at 0 for non-variable length
6023 * matches to avoid breakage for those not using this
6026 scan->next_off = deltanext;
6027 ckWARNexperimental(RExC_parse,
6028 WARN_EXPERIMENTAL__VLB,
6029 "Variable length lookbehind is experimental");
6031 scan->flags = (U8)minnext + deltanext;
6034 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6036 if (data_fake.flags & SF_HAS_EVAL)
6037 data->flags |= SF_HAS_EVAL;
6038 data->whilem_c = data_fake.whilem_c;
6040 if (f & SCF_DO_STCLASS_AND) {
6041 if (flags & SCF_DO_STCLASS_OR) {
6042 /* OR before, AND after: ideally we would recurse with
6043 * data_fake to get the AND applied by study of the
6044 * remainder of the pattern, and then derecurse;
6045 * *** HACK *** for now just treat as "no information".
6046 * See [perl #56690].
6048 ssc_init(pRExC_state, data->start_class);
6050 /* AND before and after: combine and continue. These
6051 * assertions are zero-length, so can match an EMPTY
6053 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6054 ANYOF_FLAGS(data->start_class)
6055 |= SSC_MATCHES_EMPTY_STRING;
6059 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6061 /* Positive Lookahead/lookbehind
6062 In this case we can do fixed string optimisation,
6063 but we must be careful about it. Note in the case of
6064 lookbehind the positions will be offset by the minimum
6065 length of the pattern, something we won't know about
6066 until after the recurse.
6068 SSize_t deltanext, fake = 0;
6072 /* We use SAVEFREEPV so that when the full compile
6073 is finished perl will clean up the allocated
6074 minlens when it's all done. This way we don't
6075 have to worry about freeing them when we know
6076 they wont be used, which would be a pain.
6079 Newx( minnextp, 1, SSize_t );
6080 SAVEFREEPV(minnextp);
6083 StructCopy(data, &data_fake, scan_data_t);
6084 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6087 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6088 data_fake.last_found=newSVsv(data->last_found);
6092 data_fake.last_closep = &fake;
6093 data_fake.flags = 0;
6094 data_fake.substrs[0].flags = 0;
6095 data_fake.substrs[1].flags = 0;
6096 data_fake.pos_delta = delta;
6098 data_fake.flags |= SF_IS_INF;
6099 if ( flags & SCF_DO_STCLASS && !scan->flags
6100 && OP(scan) == IFMATCH ) { /* Lookahead */
6101 ssc_init(pRExC_state, &intrnl);
6102 data_fake.start_class = &intrnl;
6103 f |= SCF_DO_STCLASS_AND;
6105 if (flags & SCF_WHILEM_VISITED_POS)
6106 f |= SCF_WHILEM_VISITED_POS;
6107 next = regnext(scan);
6108 nscan = NEXTOPER(NEXTOPER(scan));
6110 /* positive lookahead study_chunk() recursion */
6111 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6112 &deltanext, last, &data_fake,
6113 stopparen, recursed_depth, NULL,
6116 assert(0); /* This code has never been tested since this
6117 is normally not compiled */
6119 || deltanext > (I32) U8_MAX
6120 || *minnextp > (I32)U8_MAX
6121 || *minnextp + deltanext > (I32)U8_MAX)
6123 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6128 scan->next_off = deltanext;
6130 scan->flags = (U8)*minnextp + deltanext;
6135 if (f & SCF_DO_STCLASS_AND) {
6136 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6137 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6140 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6142 if (data_fake.flags & SF_HAS_EVAL)
6143 data->flags |= SF_HAS_EVAL;
6144 data->whilem_c = data_fake.whilem_c;
6145 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6147 if (RExC_rx->minlen<*minnextp)
6148 RExC_rx->minlen=*minnextp;
6149 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6150 SvREFCNT_dec_NN(data_fake.last_found);
6152 for (i = 0; i < 2; i++) {
6153 if (data_fake.substrs[i].minlenp != minlenp) {
6154 data->substrs[i].min_offset =
6155 data_fake.substrs[i].min_offset;
6156 data->substrs[i].max_offset =
6157 data_fake.substrs[i].max_offset;
6158 data->substrs[i].minlenp =
6159 data_fake.substrs[i].minlenp;
6160 data->substrs[i].lookbehind += scan->flags;
6169 else if (OP(scan) == OPEN) {
6170 if (stopparen != (I32)ARG(scan))
6173 else if (OP(scan) == CLOSE) {
6174 if (stopparen == (I32)ARG(scan)) {
6177 if ((I32)ARG(scan) == is_par) {
6178 next = regnext(scan);
6180 if ( next && (OP(next) != WHILEM) && next < last)
6181 is_par = 0; /* Disable optimization */
6184 *(data->last_closep) = ARG(scan);
6186 else if (OP(scan) == EVAL) {
6188 data->flags |= SF_HAS_EVAL;
6190 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6191 if (flags & SCF_DO_SUBSTR) {
6192 scan_commit(pRExC_state, data, minlenp, is_inf);
6193 flags &= ~SCF_DO_SUBSTR;
6195 if (data && OP(scan)==ACCEPT) {
6196 data->flags |= SCF_SEEN_ACCEPT;
6201 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6203 if (flags & SCF_DO_SUBSTR) {
6204 scan_commit(pRExC_state, data, minlenp, is_inf);
6205 data->cur_is_floating = 1; /* float */
6207 is_inf = is_inf_internal = 1;
6208 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6209 ssc_anything(data->start_class);
6210 flags &= ~SCF_DO_STCLASS;
6212 else if (OP(scan) == GPOS) {
6213 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6214 !(delta || is_inf || (data && data->pos_delta)))
6216 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6217 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6218 if (RExC_rx->gofs < (STRLEN)min)
6219 RExC_rx->gofs = min;
6221 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6225 #ifdef TRIE_STUDY_OPT
6226 #ifdef FULL_TRIE_STUDY
6227 else if (PL_regkind[OP(scan)] == TRIE) {
6228 /* NOTE - There is similar code to this block above for handling
6229 BRANCH nodes on the initial study. If you change stuff here
6231 regnode *trie_node= scan;
6232 regnode *tail= regnext(scan);
6233 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6234 SSize_t max1 = 0, min1 = SSize_t_MAX;
6237 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6238 /* Cannot merge strings after this. */
6239 scan_commit(pRExC_state, data, minlenp, is_inf);
6241 if (flags & SCF_DO_STCLASS)
6242 ssc_init_zero(pRExC_state, &accum);
6248 const regnode *nextbranch= NULL;
6251 for ( word=1 ; word <= trie->wordcount ; word++)
6253 SSize_t deltanext=0, minnext=0, f = 0, fake;
6254 regnode_ssc this_class;
6256 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6258 data_fake.whilem_c = data->whilem_c;
6259 data_fake.last_closep = data->last_closep;
6262 data_fake.last_closep = &fake;
6263 data_fake.pos_delta = delta;
6264 if (flags & SCF_DO_STCLASS) {
6265 ssc_init(pRExC_state, &this_class);
6266 data_fake.start_class = &this_class;
6267 f = SCF_DO_STCLASS_AND;
6269 if (flags & SCF_WHILEM_VISITED_POS)
6270 f |= SCF_WHILEM_VISITED_POS;
6272 if (trie->jump[word]) {
6274 nextbranch = trie_node + trie->jump[0];
6275 scan= trie_node + trie->jump[word];
6276 /* We go from the jump point to the branch that follows
6277 it. Note this means we need the vestigal unused
6278 branches even though they arent otherwise used. */
6279 /* optimise study_chunk() for TRIE */
6280 minnext = study_chunk(pRExC_state, &scan, minlenp,
6281 &deltanext, (regnode *)nextbranch, &data_fake,
6282 stopparen, recursed_depth, NULL, f, depth+1);
6284 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6285 nextbranch= regnext((regnode*)nextbranch);
6287 if (min1 > (SSize_t)(minnext + trie->minlen))
6288 min1 = minnext + trie->minlen;
6289 if (deltanext == SSize_t_MAX) {
6290 is_inf = is_inf_internal = 1;
6292 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6293 max1 = minnext + deltanext + trie->maxlen;
6295 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6297 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6298 if ( stopmin > min + min1)
6299 stopmin = min + min1;
6300 flags &= ~SCF_DO_SUBSTR;
6302 data->flags |= SCF_SEEN_ACCEPT;
6305 if (data_fake.flags & SF_HAS_EVAL)
6306 data->flags |= SF_HAS_EVAL;
6307 data->whilem_c = data_fake.whilem_c;
6309 if (flags & SCF_DO_STCLASS)
6310 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6313 if (flags & SCF_DO_SUBSTR) {
6314 data->pos_min += min1;
6315 data->pos_delta += max1 - min1;
6316 if (max1 != min1 || is_inf)
6317 data->cur_is_floating = 1; /* float */
6320 if (delta != SSize_t_MAX) {
6321 if (SSize_t_MAX - (max1 - min1) >= delta)
6322 delta += max1 - min1;
6324 delta = SSize_t_MAX;
6326 if (flags & SCF_DO_STCLASS_OR) {
6327 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6329 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6330 flags &= ~SCF_DO_STCLASS;
6333 else if (flags & SCF_DO_STCLASS_AND) {
6335 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6336 flags &= ~SCF_DO_STCLASS;
6339 /* Switch to OR mode: cache the old value of
6340 * data->start_class */
6342 StructCopy(data->start_class, and_withp, regnode_ssc);
6343 flags &= ~SCF_DO_STCLASS_AND;
6344 StructCopy(&accum, data->start_class, regnode_ssc);
6345 flags |= SCF_DO_STCLASS_OR;
6352 else if (PL_regkind[OP(scan)] == TRIE) {
6353 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6356 min += trie->minlen;
6357 delta += (trie->maxlen - trie->minlen);
6358 flags &= ~SCF_DO_STCLASS; /* xxx */
6359 if (flags & SCF_DO_SUBSTR) {
6360 /* Cannot expect anything... */
6361 scan_commit(pRExC_state, data, minlenp, is_inf);
6362 data->pos_min += trie->minlen;
6363 data->pos_delta += (trie->maxlen - trie->minlen);
6364 if (trie->maxlen != trie->minlen)
6365 data->cur_is_floating = 1; /* float */
6367 if (trie->jump) /* no more substrings -- for now /grr*/
6368 flags &= ~SCF_DO_SUBSTR;
6370 #endif /* old or new */
6371 #endif /* TRIE_STUDY_OPT */
6373 /* Else: zero-length, ignore. */
6374 scan = regnext(scan);
6379 /* we need to unwind recursion. */
6382 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6383 DEBUG_PEEP("fend", scan, depth, flags);
6385 /* restore previous context */
6386 last = frame->last_regnode;
6387 scan = frame->next_regnode;
6388 stopparen = frame->stopparen;
6389 recursed_depth = frame->prev_recursed_depth;
6391 RExC_frame_last = frame->prev_frame;
6392 frame = frame->this_prev_frame;
6393 goto fake_study_recurse;
6397 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6400 *deltap = is_inf_internal ? SSize_t_MAX : delta;
6402 if (flags & SCF_DO_SUBSTR && is_inf)
6403 data->pos_delta = SSize_t_MAX - data->pos_min;
6404 if (is_par > (I32)U8_MAX)
6406 if (is_par && pars==1 && data) {
6407 data->flags |= SF_IN_PAR;
6408 data->flags &= ~SF_HAS_PAR;
6410 else if (pars && data) {
6411 data->flags |= SF_HAS_PAR;
6412 data->flags &= ~SF_IN_PAR;
6414 if (flags & SCF_DO_STCLASS_OR)
6415 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6416 if (flags & SCF_TRIE_RESTUDY)
6417 data->flags |= SCF_TRIE_RESTUDY;
6419 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6422 SSize_t final_minlen= min < stopmin ? min : stopmin;
6424 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6425 if (final_minlen > SSize_t_MAX - delta)
6426 RExC_maxlen = SSize_t_MAX;
6427 else if (RExC_maxlen < final_minlen + delta)
6428 RExC_maxlen = final_minlen + delta;
6430 return final_minlen;
6432 NOT_REACHED; /* NOTREACHED */
6436 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6438 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6440 PERL_ARGS_ASSERT_ADD_DATA;
6442 Renewc(RExC_rxi->data,
6443 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6444 char, struct reg_data);
6446 Renew(RExC_rxi->data->what, count + n, U8);
6448 Newx(RExC_rxi->data->what, n, U8);
6449 RExC_rxi->data->count = count + n;
6450 Copy(s, RExC_rxi->data->what + count, n, U8);
6454 /*XXX: todo make this not included in a non debugging perl, but appears to be
6455 * used anyway there, in 'use re' */
6456 #ifndef PERL_IN_XSUB_RE
6458 Perl_reginitcolors(pTHX)
6460 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6462 char *t = savepv(s);
6466 t = strchr(t, '\t');
6472 PL_colors[i] = t = (char *)"";
6477 PL_colors[i++] = (char *)"";
6484 #ifdef TRIE_STUDY_OPT
6485 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6488 (data.flags & SCF_TRIE_RESTUDY) \
6496 #define CHECK_RESTUDY_GOTO_butfirst
6500 * pregcomp - compile a regular expression into internal code
6502 * Decides which engine's compiler to call based on the hint currently in
6506 #ifndef PERL_IN_XSUB_RE
6508 /* return the currently in-scope regex engine (or the default if none) */
6510 regexp_engine const *
6511 Perl_current_re_engine(pTHX)
6513 if (IN_PERL_COMPILETIME) {
6514 HV * const table = GvHV(PL_hintgv);
6517 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6518 return &PL_core_reg_engine;
6519 ptr = hv_fetchs(table, "regcomp", FALSE);
6520 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6521 return &PL_core_reg_engine;
6522 return INT2PTR(regexp_engine*, SvIV(*ptr));
6526 if (!PL_curcop->cop_hints_hash)
6527 return &PL_core_reg_engine;
6528 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6529 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6530 return &PL_core_reg_engine;
6531 return INT2PTR(regexp_engine*, SvIV(ptr));
6537 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6539 regexp_engine const *eng = current_re_engine();
6540 GET_RE_DEBUG_FLAGS_DECL;
6542 PERL_ARGS_ASSERT_PREGCOMP;
6544 /* Dispatch a request to compile a regexp to correct regexp engine. */
6546 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6549 return CALLREGCOMP_ENG(eng, pattern, flags);
6553 /* public(ish) entry point for the perl core's own regex compiling code.
6554 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6555 * pattern rather than a list of OPs, and uses the internal engine rather
6556 * than the current one */
6559 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6561 SV *pat = pattern; /* defeat constness! */
6562 PERL_ARGS_ASSERT_RE_COMPILE;
6563 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6564 #ifdef PERL_IN_XSUB_RE
6567 &PL_core_reg_engine,
6569 NULL, NULL, rx_flags, 0);
6574 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6578 if (--cbs->refcnt > 0)
6580 for (n = 0; n < cbs->count; n++) {
6581 REGEXP *rx = cbs->cb[n].src_regex;
6583 cbs->cb[n].src_regex = NULL;
6584 SvREFCNT_dec_NN(rx);
6592 static struct reg_code_blocks *
6593 S_alloc_code_blocks(pTHX_ int ncode)
6595 struct reg_code_blocks *cbs;
6596 Newx(cbs, 1, struct reg_code_blocks);
6599 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6601 Newx(cbs->cb, ncode, struct reg_code_block);
6608 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6609 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6610 * point to the realloced string and length.
6612 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6616 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6617 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6619 U8 *const src = (U8*)*pat_p;
6624 GET_RE_DEBUG_FLAGS_DECL;
6626 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6627 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6629 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6630 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6633 while (s < *plen_p) {
6634 append_utf8_from_native_byte(src[s], &d);
6636 if (n < num_code_blocks) {
6637 assert(pRExC_state->code_blocks);
6638 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6639 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6640 assert(*(d - 1) == '(');
6643 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6644 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6645 assert(*(d - 1) == ')');
6654 *pat_p = (char*) dst;
6656 RExC_orig_utf8 = RExC_utf8 = 1;
6661 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6662 * while recording any code block indices, and handling overloading,
6663 * nested qr// objects etc. If pat is null, it will allocate a new
6664 * string, or just return the first arg, if there's only one.
6666 * Returns the malloced/updated pat.
6667 * patternp and pat_count is the array of SVs to be concatted;
6668 * oplist is the optional list of ops that generated the SVs;
6669 * recompile_p is a pointer to a boolean that will be set if
6670 * the regex will need to be recompiled.
6671 * delim, if non-null is an SV that will be inserted between each element
6675 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6676 SV *pat, SV ** const patternp, int pat_count,
6677 OP *oplist, bool *recompile_p, SV *delim)
6681 bool use_delim = FALSE;
6682 bool alloced = FALSE;
6684 /* if we know we have at least two args, create an empty string,
6685 * then concatenate args to that. For no args, return an empty string */
6686 if (!pat && pat_count != 1) {
6692 for (svp = patternp; svp < patternp + pat_count; svp++) {
6695 STRLEN orig_patlen = 0;
6697 SV *msv = use_delim ? delim : *svp;
6698 if (!msv) msv = &PL_sv_undef;
6700 /* if we've got a delimiter, we go round the loop twice for each
6701 * svp slot (except the last), using the delimiter the second
6710 if (SvTYPE(msv) == SVt_PVAV) {
6711 /* we've encountered an interpolated array within
6712 * the pattern, e.g. /...@a..../. Expand the list of elements,
6713 * then recursively append elements.
6714 * The code in this block is based on S_pushav() */
6716 AV *const av = (AV*)msv;
6717 const SSize_t maxarg = AvFILL(av) + 1;
6721 assert(oplist->op_type == OP_PADAV
6722 || oplist->op_type == OP_RV2AV);
6723 oplist = OpSIBLING(oplist);
6726 if (SvRMAGICAL(av)) {
6729 Newx(array, maxarg, SV*);
6731 for (i=0; i < maxarg; i++) {
6732 SV ** const svp = av_fetch(av, i, FALSE);
6733 array[i] = svp ? *svp : &PL_sv_undef;
6737 array = AvARRAY(av);
6739 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6740 array, maxarg, NULL, recompile_p,
6742 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6748 /* we make the assumption here that each op in the list of
6749 * op_siblings maps to one SV pushed onto the stack,
6750 * except for code blocks, with have both an OP_NULL and
6752 * This allows us to match up the list of SVs against the
6753 * list of OPs to find the next code block.
6755 * Note that PUSHMARK PADSV PADSV ..
6757 * PADRANGE PADSV PADSV ..
6758 * so the alignment still works. */
6761 if (oplist->op_type == OP_NULL
6762 && (oplist->op_flags & OPf_SPECIAL))
6764 assert(n < pRExC_state->code_blocks->count);
6765 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6766 pRExC_state->code_blocks->cb[n].block = oplist;
6767 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6770 oplist = OpSIBLING(oplist); /* skip CONST */
6773 oplist = OpSIBLING(oplist);;
6776 /* apply magic and QR overloading to arg */
6779 if (SvROK(msv) && SvAMAGIC(msv)) {
6780 SV *sv = AMG_CALLunary(msv, regexp_amg);
6784 if (SvTYPE(sv) != SVt_REGEXP)
6785 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6790 /* try concatenation overload ... */
6791 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6792 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6795 /* overloading involved: all bets are off over literal
6796 * code. Pretend we haven't seen it */
6798 pRExC_state->code_blocks->count -= n;
6802 /* ... or failing that, try "" overload */
6803 while (SvAMAGIC(msv)
6804 && (sv = AMG_CALLunary(msv, string_amg))
6808 && SvRV(msv) == SvRV(sv))
6813 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6817 /* this is a partially unrolled
6818 * sv_catsv_nomg(pat, msv);
6819 * that allows us to adjust code block indices if
6822 char *dst = SvPV_force_nomg(pat, dlen);
6824 if (SvUTF8(msv) && !SvUTF8(pat)) {
6825 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6826 sv_setpvn(pat, dst, dlen);
6829 sv_catsv_nomg(pat, msv);
6833 /* We have only one SV to process, but we need to verify
6834 * it is properly null terminated or we will fail asserts
6835 * later. In theory we probably shouldn't get such SV's,
6836 * but if we do we should handle it gracefully. */
6837 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
6838 /* not a string, or a string with a trailing null */
6841 /* a string with no trailing null, we need to copy it
6842 * so it has a trailing null */
6843 pat = sv_2mortal(newSVsv(msv));
6848 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
6851 /* extract any code blocks within any embedded qr//'s */
6852 if (rx && SvTYPE(rx) == SVt_REGEXP
6853 && RX_ENGINE((REGEXP*)rx)->op_comp)
6856 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6857 if (ri->code_blocks && ri->code_blocks->count) {
6859 /* the presence of an embedded qr// with code means
6860 * we should always recompile: the text of the
6861 * qr// may not have changed, but it may be a
6862 * different closure than last time */
6864 if (pRExC_state->code_blocks) {
6865 int new_count = pRExC_state->code_blocks->count
6866 + ri->code_blocks->count;
6867 Renew(pRExC_state->code_blocks->cb,
6868 new_count, struct reg_code_block);
6869 pRExC_state->code_blocks->count = new_count;
6872 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
6873 ri->code_blocks->count);
6875 for (i=0; i < ri->code_blocks->count; i++) {
6876 struct reg_code_block *src, *dst;
6877 STRLEN offset = orig_patlen
6878 + ReANY((REGEXP *)rx)->pre_prefix;
6879 assert(n < pRExC_state->code_blocks->count);
6880 src = &ri->code_blocks->cb[i];
6881 dst = &pRExC_state->code_blocks->cb[n];
6882 dst->start = src->start + offset;
6883 dst->end = src->end + offset;
6884 dst->block = src->block;
6885 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6894 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6903 /* see if there are any run-time code blocks in the pattern.
6904 * False positives are allowed */
6907 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6908 char *pat, STRLEN plen)
6913 PERL_UNUSED_CONTEXT;
6915 for (s = 0; s < plen; s++) {
6916 if ( pRExC_state->code_blocks
6917 && n < pRExC_state->code_blocks->count
6918 && s == pRExC_state->code_blocks->cb[n].start)
6920 s = pRExC_state->code_blocks->cb[n].end;
6924 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6926 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6928 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6935 /* Handle run-time code blocks. We will already have compiled any direct
6936 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6937 * copy of it, but with any literal code blocks blanked out and
6938 * appropriate chars escaped; then feed it into
6940 * eval "qr'modified_pattern'"
6944 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6948 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6950 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6951 * and merge them with any code blocks of the original regexp.
6953 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6954 * instead, just save the qr and return FALSE; this tells our caller that
6955 * the original pattern needs upgrading to utf8.
6959 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6960 char *pat, STRLEN plen)
6964 GET_RE_DEBUG_FLAGS_DECL;
6966 if (pRExC_state->runtime_code_qr) {
6967 /* this is the second time we've been called; this should
6968 * only happen if the main pattern got upgraded to utf8
6969 * during compilation; re-use the qr we compiled first time
6970 * round (which should be utf8 too)
6972 qr = pRExC_state->runtime_code_qr;
6973 pRExC_state->runtime_code_qr = NULL;
6974 assert(RExC_utf8 && SvUTF8(qr));
6980 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
6984 /* determine how many extra chars we need for ' and \ escaping */
6985 for (s = 0; s < plen; s++) {
6986 if (pat[s] == '\'' || pat[s] == '\\')
6990 Newx(newpat, newlen, char);
6992 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6994 for (s = 0; s < plen; s++) {
6995 if ( pRExC_state->code_blocks
6996 && n < pRExC_state->code_blocks->count
6997 && s == pRExC_state->code_blocks->cb[n].start)
6999 /* blank out literal code block so that they aren't
7000 * recompiled: eg change from/to:
7010 assert(pat[s] == '(');
7011 assert(pat[s+1] == '?');
7015 while (s < pRExC_state->code_blocks->cb[n].end) {
7023 if (pat[s] == '\'' || pat[s] == '\\')
7028 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7030 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7036 Perl_re_printf( aTHX_
7037 "%sre-parsing pattern for runtime code:%s %s\n",
7038 PL_colors[4], PL_colors[5], newpat);
7041 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7047 PUSHSTACKi(PERLSI_REQUIRE);
7048 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7049 * parsing qr''; normally only q'' does this. It also alters
7051 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7052 SvREFCNT_dec_NN(sv);
7057 SV * const errsv = ERRSV;
7058 if (SvTRUE_NN(errsv))
7059 /* use croak_sv ? */
7060 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7062 assert(SvROK(qr_ref));
7064 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7065 /* the leaving below frees the tmp qr_ref.
7066 * Give qr a life of its own */
7074 if (!RExC_utf8 && SvUTF8(qr)) {
7075 /* first time through; the pattern got upgraded; save the
7076 * qr for the next time through */
7077 assert(!pRExC_state->runtime_code_qr);
7078 pRExC_state->runtime_code_qr = qr;
7083 /* extract any code blocks within the returned qr// */
7086 /* merge the main (r1) and run-time (r2) code blocks into one */
7088 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7089 struct reg_code_block *new_block, *dst;
7090 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7094 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7096 SvREFCNT_dec_NN(qr);
7100 if (!r1->code_blocks)
7101 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7103 r1c = r1->code_blocks->count;
7104 r2c = r2->code_blocks->count;
7106 Newx(new_block, r1c + r2c, struct reg_code_block);
7110 while (i1 < r1c || i2 < r2c) {
7111 struct reg_code_block *src;
7115 src = &r2->code_blocks->cb[i2++];
7119 src = &r1->code_blocks->cb[i1++];
7120 else if ( r1->code_blocks->cb[i1].start
7121 < r2->code_blocks->cb[i2].start)
7123 src = &r1->code_blocks->cb[i1++];
7124 assert(src->end < r2->code_blocks->cb[i2].start);
7127 assert( r1->code_blocks->cb[i1].start
7128 > r2->code_blocks->cb[i2].start);
7129 src = &r2->code_blocks->cb[i2++];
7131 assert(src->end < r1->code_blocks->cb[i1].start);
7134 assert(pat[src->start] == '(');
7135 assert(pat[src->end] == ')');
7136 dst->start = src->start;
7137 dst->end = src->end;
7138 dst->block = src->block;
7139 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7143 r1->code_blocks->count += r2c;
7144 Safefree(r1->code_blocks->cb);
7145 r1->code_blocks->cb = new_block;
7148 SvREFCNT_dec_NN(qr);
7154 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7155 struct reg_substr_datum *rsd,
7156 struct scan_data_substrs *sub,
7157 STRLEN longest_length)
7159 /* This is the common code for setting up the floating and fixed length
7160 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7161 * as to whether succeeded or not */
7165 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7166 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7168 if (! (longest_length
7169 || (eol /* Can't have SEOL and MULTI */
7170 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7172 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7173 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7178 /* copy the information about the longest from the reg_scan_data
7179 over to the program. */
7180 if (SvUTF8(sub->str)) {
7182 rsd->utf8_substr = sub->str;
7184 rsd->substr = sub->str;
7185 rsd->utf8_substr = NULL;
7187 /* end_shift is how many chars that must be matched that
7188 follow this item. We calculate it ahead of time as once the
7189 lookbehind offset is added in we lose the ability to correctly
7191 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7192 rsd->end_shift = ml - sub->min_offset
7194 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7196 + (SvTAIL(sub->str) != 0)
7200 t = (eol/* Can't have SEOL and MULTI */
7201 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7202 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7208 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7210 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7211 * properly wrapped with the right modifiers */
7213 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7214 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7215 != REGEX_DEPENDS_CHARSET);
7217 /* The caret is output if there are any defaults: if not all the STD
7218 * flags are set, or if no character set specifier is needed */
7220 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7222 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7223 == REG_RUN_ON_COMMENT_SEEN);
7224 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7225 >> RXf_PMf_STD_PMMOD_SHIFT);
7226 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7228 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7230 /* We output all the necessary flags; we never output a minus, as all
7231 * those are defaults, so are
7232 * covered by the caret */
7233 const STRLEN wraplen = pat_len + has_p + has_runon
7234 + has_default /* If needs a caret */
7235 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7237 /* If needs a character set specifier */
7238 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7239 + (sizeof("(?:)") - 1);
7241 PERL_ARGS_ASSERT_SET_REGEX_PV;
7243 /* make sure PL_bitcount bounds not exceeded */
7244 assert(sizeof(STD_PAT_MODS) <= 8);
7246 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7249 SvFLAGS(Rx) |= SVf_UTF8;
7252 /* If a default, cover it using the caret */
7254 *p++= DEFAULT_PAT_MOD;
7260 name = get_regex_charset_name(RExC_rx->extflags, &len);
7261 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7263 name = UNICODE_PAT_MODS;
7264 len = sizeof(UNICODE_PAT_MODS) - 1;
7266 Copy(name, p, len, char);
7270 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7273 while((ch = *fptr++)) {
7281 Copy(RExC_precomp, p, pat_len, char);
7282 assert ((RX_WRAPPED(Rx) - p) < 16);
7283 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7286 /* Adding a trailing \n causes this to compile properly:
7287 my $R = qr / A B C # D E/x; /($R)/
7288 Otherwise the parens are considered part of the comment */
7293 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7297 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7298 * regular expression into internal code.
7299 * The pattern may be passed either as:
7300 * a list of SVs (patternp plus pat_count)
7301 * a list of OPs (expr)
7302 * If both are passed, the SV list is used, but the OP list indicates
7303 * which SVs are actually pre-compiled code blocks
7305 * The SVs in the list have magic and qr overloading applied to them (and
7306 * the list may be modified in-place with replacement SVs in the latter
7309 * If the pattern hasn't changed from old_re, then old_re will be
7312 * eng is the current engine. If that engine has an op_comp method, then
7313 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7314 * do the initial concatenation of arguments and pass on to the external
7317 * If is_bare_re is not null, set it to a boolean indicating whether the
7318 * arg list reduced (after overloading) to a single bare regex which has
7319 * been returned (i.e. /$qr/).
7321 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7323 * pm_flags contains the PMf_* flags, typically based on those from the
7324 * pm_flags field of the related PMOP. Currently we're only interested in
7325 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
7327 * For many years this code had an initial sizing pass that calculated
7328 * (sometimes incorrectly, leading to security holes) the size needed for the
7329 * compiled pattern. That was changed by commit
7330 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7331 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7332 * references to this sizing pass.
7334 * Now, an initial crude guess as to the size needed is made, based on the
7335 * length of the pattern. Patches welcome to improve that guess. That amount
7336 * of space is malloc'd and then immediately freed, and then clawed back node
7337 * by node. This design is to minimze, to the extent possible, memory churn
7338 * when doing the the reallocs.
7340 * A separate parentheses counting pass may be needed in some cases.
7341 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7344 * The existence of a sizing pass necessitated design decisions that are no
7345 * longer needed. There are potential areas of simplification.
7347 * Beware that the optimization-preparation code in here knows about some
7348 * of the structure of the compiled regexp. [I'll say.]
7352 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7353 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7354 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7357 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7365 SV** new_patternp = patternp;
7367 /* these are all flags - maybe they should be turned
7368 * into a single int with different bit masks */
7369 I32 sawlookahead = 0;
7374 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7376 bool runtime_code = 0;
7378 RExC_state_t RExC_state;
7379 RExC_state_t * const pRExC_state = &RExC_state;
7380 #ifdef TRIE_STUDY_OPT
7382 RExC_state_t copyRExC_state;
7384 GET_RE_DEBUG_FLAGS_DECL;
7386 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7388 DEBUG_r(if (!PL_colorset) reginitcolors());
7390 /* Initialize these here instead of as-needed, as is quick and avoids
7391 * having to test them each time otherwise */
7392 if (! PL_InBitmap) {
7394 char * dump_len_string;
7397 /* This is calculated here, because the Perl program that generates the
7398 * static global ones doesn't currently have access to
7399 * NUM_ANYOF_CODE_POINTS */
7400 PL_InBitmap = _new_invlist(2);
7401 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7402 NUM_ANYOF_CODE_POINTS - 1);
7404 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7405 if ( ! dump_len_string
7406 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7408 PL_dump_re_max_len = 60; /* A reasonable default */
7413 pRExC_state->warn_text = NULL;
7414 pRExC_state->unlexed_names = NULL;
7415 pRExC_state->code_blocks = NULL;
7418 *is_bare_re = FALSE;
7420 if (expr && (expr->op_type == OP_LIST ||
7421 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7422 /* allocate code_blocks if needed */
7426 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7427 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7428 ncode++; /* count of DO blocks */
7431 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7435 /* compile-time pattern with just OP_CONSTs and DO blocks */
7440 /* find how many CONSTs there are */
7443 if (expr->op_type == OP_CONST)
7446 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7447 if (o->op_type == OP_CONST)
7451 /* fake up an SV array */
7453 assert(!new_patternp);
7454 Newx(new_patternp, n, SV*);
7455 SAVEFREEPV(new_patternp);
7459 if (expr->op_type == OP_CONST)
7460 new_patternp[n] = cSVOPx_sv(expr);
7462 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7463 if (o->op_type == OP_CONST)
7464 new_patternp[n++] = cSVOPo_sv;
7469 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7470 "Assembling pattern from %d elements%s\n", pat_count,
7471 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7473 /* set expr to the first arg op */
7475 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7476 && expr->op_type != OP_CONST)
7478 expr = cLISTOPx(expr)->op_first;
7479 assert( expr->op_type == OP_PUSHMARK
7480 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7481 || expr->op_type == OP_PADRANGE);
7482 expr = OpSIBLING(expr);
7485 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7486 expr, &recompile, NULL);
7488 /* handle bare (possibly after overloading) regex: foo =~ $re */
7493 if (SvTYPE(re) == SVt_REGEXP) {
7497 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7498 "Precompiled pattern%s\n",
7499 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7505 exp = SvPV_nomg(pat, plen);
7507 if (!eng->op_comp) {
7508 if ((SvUTF8(pat) && IN_BYTES)
7509 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7511 /* make a temporary copy; either to convert to bytes,
7512 * or to avoid repeating get-magic / overloaded stringify */
7513 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7514 (IN_BYTES ? 0 : SvUTF8(pat)));
7516 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7519 /* ignore the utf8ness if the pattern is 0 length */
7520 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7521 RExC_uni_semantics = 0;
7522 RExC_contains_locale = 0;
7523 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7524 RExC_in_script_run = 0;
7525 RExC_study_started = 0;
7526 pRExC_state->runtime_code_qr = NULL;
7527 RExC_frame_head= NULL;
7528 RExC_frame_last= NULL;
7529 RExC_frame_count= 0;
7530 RExC_latest_warn_offset = 0;
7531 RExC_use_BRANCHJ = 0;
7532 RExC_total_parens = 0;
7533 RExC_open_parens = NULL;
7534 RExC_close_parens = NULL;
7535 RExC_paren_names = NULL;
7537 RExC_seen_d_op = FALSE;
7539 RExC_paren_name_list = NULL;
7543 RExC_mysv1= sv_newmortal();
7544 RExC_mysv2= sv_newmortal();
7548 SV *dsv= sv_newmortal();
7549 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7550 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7551 PL_colors[4], PL_colors[5], s);
7554 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7557 if ((pm_flags & PMf_USE_RE_EVAL)
7558 /* this second condition covers the non-regex literal case,
7559 * i.e. $foo =~ '(?{})'. */
7560 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7562 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7565 /* return old regex if pattern hasn't changed */
7566 /* XXX: note in the below we have to check the flags as well as the
7569 * Things get a touch tricky as we have to compare the utf8 flag
7570 * independently from the compile flags. */
7574 && !!RX_UTF8(old_re) == !!RExC_utf8
7575 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7576 && RX_PRECOMP(old_re)
7577 && RX_PRELEN(old_re) == plen
7578 && memEQ(RX_PRECOMP(old_re), exp, plen)
7579 && !runtime_code /* with runtime code, always recompile */ )
7584 /* Allocate the pattern's SV */
7585 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7586 RExC_rx = ReANY(Rx);
7587 if ( RExC_rx == NULL )
7588 FAIL("Regexp out of space");
7590 rx_flags = orig_rx_flags;
7592 if ( (UTF || RExC_uni_semantics)
7593 && initial_charset == REGEX_DEPENDS_CHARSET)
7596 /* Set to use unicode semantics if the pattern is in utf8 and has the
7597 * 'depends' charset specified, as it means unicode when utf8 */
7598 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7599 RExC_uni_semantics = 1;
7602 RExC_pm_flags = pm_flags;
7605 assert(TAINTING_get || !TAINT_get);
7607 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7609 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7610 /* whoops, we have a non-utf8 pattern, whilst run-time code
7611 * got compiled as utf8. Try again with a utf8 pattern */
7612 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7613 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7617 assert(!pRExC_state->runtime_code_qr);
7623 RExC_in_lookbehind = 0;
7624 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7626 RExC_recode_x_to_native = 0;
7628 RExC_in_multi_char_class = 0;
7630 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7631 RExC_precomp_end = RExC_end = exp + plen;
7633 RExC_whilem_seen = 0;
7635 RExC_recurse = NULL;
7636 RExC_study_chunk_recursed = NULL;
7637 RExC_study_chunk_recursed_bytes= 0;
7638 RExC_recurse_count = 0;
7639 pRExC_state->code_index = 0;
7641 /* Initialize the string in the compiled pattern. This is so that there is
7642 * something to output if necessary */
7643 set_regex_pv(pRExC_state, Rx);
7646 Perl_re_printf( aTHX_
7647 "Starting parse and generation\n");
7649 RExC_lastparse=NULL;
7652 /* Allocate space and zero-initialize. Note, the two step process
7653 of zeroing when in debug mode, thus anything assigned has to
7654 happen after that */
7657 /* On the first pass of the parse, we guess how big this will be. Then
7658 * we grow in one operation to that amount and then give it back. As
7659 * we go along, we re-allocate what we need.
7661 * XXX Currently the guess is essentially that the pattern will be an
7662 * EXACT node with one byte input, one byte output. This is crude, and
7663 * better heuristics are welcome.
7665 * On any subsequent passes, we guess what we actually computed in the
7666 * latest earlier pass. Such a pass probably didn't complete so is
7667 * missing stuff. We could improve those guesses by knowing where the
7668 * parse stopped, and use the length so far plus apply the above
7669 * assumption to what's left. */
7670 RExC_size = STR_SZ(RExC_end - RExC_start);
7673 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7674 if ( RExC_rxi == NULL )
7675 FAIL("Regexp out of space");
7677 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7678 RXi_SET( RExC_rx, RExC_rxi );
7680 /* We start from 0 (over from 0 in the case this is a reparse. The first
7681 * node parsed will give back any excess memory we have allocated so far).
7685 /* non-zero initialization begins here */
7686 RExC_rx->engine= eng;
7687 RExC_rx->extflags = rx_flags;
7688 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7690 if (pm_flags & PMf_IS_QR) {
7691 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7692 if (RExC_rxi->code_blocks) {
7693 RExC_rxi->code_blocks->refcnt++;
7697 RExC_rx->intflags = 0;
7699 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7702 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7703 * code makes sure the final byte is an uncounted NUL. But should this
7704 * ever not be the case, lots of things could read beyond the end of the
7705 * buffer: loops like
7706 * while(isFOO(*RExC_parse)) RExC_parse++;
7707 * strchr(RExC_parse, "foo");
7708 * etc. So it is worth noting. */
7709 assert(*RExC_end == '\0');
7713 RExC_parens_buf_size = 0;
7714 RExC_emit_start = RExC_rxi->program;
7715 pRExC_state->code_index = 0;
7717 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7721 if (reg(pRExC_state, 0, &flags, 1)) {
7723 /* Success!, But we may need to redo the parse knowing how many parens
7724 * there actually are */
7725 if (IN_PARENS_PASS) {
7726 flags |= RESTART_PARSE;
7729 /* We have that number in RExC_npar */
7730 RExC_total_parens = RExC_npar;
7732 else if (! MUST_RESTART(flags)) {
7734 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7737 /* Here, we either have success, or we have to redo the parse for some reason */
7738 if (MUST_RESTART(flags)) {
7740 /* It's possible to write a regexp in ascii that represents Unicode
7741 codepoints outside of the byte range, such as via \x{100}. If we
7742 detect such a sequence we have to convert the entire pattern to utf8
7743 and then recompile, as our sizing calculation will have been based
7744 on 1 byte == 1 character, but we will need to use utf8 to encode
7745 at least some part of the pattern, and therefore must convert the whole
7748 if (flags & NEED_UTF8) {
7750 /* We have stored the offset of the final warning output so far.
7751 * That must be adjusted. Any variant characters between the start
7752 * of the pattern and this warning count for 2 bytes in the final,
7753 * so just add them again */
7754 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7755 RExC_latest_warn_offset +=
7756 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7757 + RExC_latest_warn_offset);
7759 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7760 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7761 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7764 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7767 if (ALL_PARENS_COUNTED) {
7768 /* Make enough room for all the known parens, and zero it */
7769 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7770 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7771 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7773 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7774 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7776 else { /* Parse did not complete. Reinitialize the parentheses
7778 RExC_total_parens = 0;
7779 if (RExC_open_parens) {
7780 Safefree(RExC_open_parens);
7781 RExC_open_parens = NULL;
7783 if (RExC_close_parens) {
7784 Safefree(RExC_close_parens);
7785 RExC_close_parens = NULL;
7789 /* Clean up what we did in this parse */
7790 SvREFCNT_dec_NN(RExC_rx_sv);
7795 /* Here, we have successfully parsed and generated the pattern's program
7796 * for the regex engine. We are ready to finish things up and look for
7799 /* Update the string to compile, with correct modifiers, etc */
7800 set_regex_pv(pRExC_state, Rx);
7802 RExC_rx->nparens = RExC_total_parens - 1;
7804 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7805 if (RExC_whilem_seen > 15)
7806 RExC_whilem_seen = 15;
7809 Perl_re_printf( aTHX_
7810 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7812 RExC_lastparse=NULL;
7815 #ifdef RE_TRACK_PATTERN_OFFSETS
7816 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7817 "%s %" UVuf " bytes for offset annotations.\n",
7818 RExC_offsets ? "Got" : "Couldn't get",
7819 (UV)((RExC_offsets[0] * 2 + 1))));
7820 DEBUG_OFFSETS_r(if (RExC_offsets) {
7821 const STRLEN len = RExC_offsets[0];
7823 GET_RE_DEBUG_FLAGS_DECL;
7824 Perl_re_printf( aTHX_
7825 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7826 for (i = 1; i <= len; i++) {
7827 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7828 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7829 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7831 Perl_re_printf( aTHX_ "\n");
7835 SetProgLen(RExC_rxi,RExC_size);
7839 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
7842 /* XXXX To minimize changes to RE engine we always allocate
7843 3-units-long substrs field. */
7844 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
7845 if (RExC_recurse_count) {
7846 Newx(RExC_recurse, RExC_recurse_count, regnode *);
7847 SAVEFREEPV(RExC_recurse);
7850 if (RExC_seen & REG_RECURSE_SEEN) {
7851 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
7852 * So its 1 if there are no parens. */
7853 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
7854 ((RExC_total_parens & 0x07) != 0);
7855 Newx(RExC_study_chunk_recursed,
7856 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7857 SAVEFREEPV(RExC_study_chunk_recursed);
7861 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
7863 RExC_study_chunk_recursed_count= 0;
7865 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
7866 if (RExC_study_chunk_recursed) {
7867 Zero(RExC_study_chunk_recursed,
7868 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
7872 #ifdef TRIE_STUDY_OPT
7874 StructCopy(&zero_scan_data, &data, scan_data_t);
7875 copyRExC_state = RExC_state;
7878 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
7880 RExC_state = copyRExC_state;
7881 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
7882 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
7884 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
7885 StructCopy(&zero_scan_data, &data, scan_data_t);
7888 StructCopy(&zero_scan_data, &data, scan_data_t);
7891 /* Dig out information for optimizations. */
7892 RExC_rx->extflags = RExC_flags; /* was pm_op */
7893 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
7896 SvUTF8_on(Rx); /* Unicode in it? */
7897 RExC_rxi->regstclass = NULL;
7898 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
7899 RExC_rx->intflags |= PREGf_NAUGHTY;
7900 scan = RExC_rxi->program + 1; /* First BRANCH. */
7902 /* testing for BRANCH here tells us whether there is "must appear"
7903 data in the pattern. If there is then we can use it for optimisations */
7904 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
7907 STRLEN longest_length[2];
7908 regnode_ssc ch_class; /* pointed to by data */
7910 SSize_t last_close = 0; /* pointed to by data */
7911 regnode *first= scan;
7912 regnode *first_next= regnext(first);
7916 * Skip introductions and multiplicators >= 1
7917 * so that we can extract the 'meat' of the pattern that must
7918 * match in the large if() sequence following.
7919 * NOTE that EXACT is NOT covered here, as it is normally
7920 * picked up by the optimiser separately.
7922 * This is unfortunate as the optimiser isnt handling lookahead
7923 * properly currently.
7926 while ((OP(first) == OPEN && (sawopen = 1)) ||
7927 /* An OR of *one* alternative - should not happen now. */
7928 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
7929 /* for now we can't handle lookbehind IFMATCH*/
7930 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
7931 (OP(first) == PLUS) ||
7932 (OP(first) == MINMOD) ||
7933 /* An {n,m} with n>0 */
7934 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
7935 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
7938 * the only op that could be a regnode is PLUS, all the rest
7939 * will be regnode_1 or regnode_2.
7941 * (yves doesn't think this is true)
7943 if (OP(first) == PLUS)
7946 if (OP(first) == MINMOD)
7948 first += regarglen[OP(first)];
7950 first = NEXTOPER(first);
7951 first_next= regnext(first);
7954 /* Starting-point info. */
7956 DEBUG_PEEP("first:", first, 0, 0);
7957 /* Ignore EXACT as we deal with it later. */
7958 if (PL_regkind[OP(first)] == EXACT) {
7959 if ( OP(first) == EXACT
7960 || OP(first) == EXACT_ONLY8
7961 || OP(first) == EXACTL)
7963 NOOP; /* Empty, get anchored substr later. */
7966 RExC_rxi->regstclass = first;
7969 else if (PL_regkind[OP(first)] == TRIE &&
7970 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
7972 /* this can happen only on restudy */
7973 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7976 else if (REGNODE_SIMPLE(OP(first)))
7977 RExC_rxi->regstclass = first;
7978 else if (PL_regkind[OP(first)] == BOUND ||
7979 PL_regkind[OP(first)] == NBOUND)
7980 RExC_rxi->regstclass = first;
7981 else if (PL_regkind[OP(first)] == BOL) {
7982 RExC_rx->intflags |= (OP(first) == MBOL
7985 first = NEXTOPER(first);
7988 else if (OP(first) == GPOS) {
7989 RExC_rx->intflags |= PREGf_ANCH_GPOS;
7990 first = NEXTOPER(first);
7993 else if ((!sawopen || !RExC_sawback) &&
7995 (OP(first) == STAR &&
7996 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7997 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
7999 /* turn .* into ^.* with an implied $*=1 */
8001 (OP(NEXTOPER(first)) == REG_ANY)
8004 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8005 first = NEXTOPER(first);
8008 if (sawplus && !sawminmod && !sawlookahead
8009 && (!sawopen || !RExC_sawback)
8010 && !pRExC_state->code_blocks) /* May examine pos and $& */
8011 /* x+ must match at the 1st pos of run of x's */
8012 RExC_rx->intflags |= PREGf_SKIP;
8014 /* Scan is after the zeroth branch, first is atomic matcher. */
8015 #ifdef TRIE_STUDY_OPT
8018 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8019 (IV)(first - scan + 1))
8023 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8024 (IV)(first - scan + 1))
8030 * If there's something expensive in the r.e., find the
8031 * longest literal string that must appear and make it the
8032 * regmust. Resolve ties in favor of later strings, since
8033 * the regstart check works with the beginning of the r.e.
8034 * and avoiding duplication strengthens checking. Not a
8035 * strong reason, but sufficient in the absence of others.
8036 * [Now we resolve ties in favor of the earlier string if
8037 * it happens that c_offset_min has been invalidated, since the
8038 * earlier string may buy us something the later one won't.]
8041 data.substrs[0].str = newSVpvs("");
8042 data.substrs[1].str = newSVpvs("");
8043 data.last_found = newSVpvs("");
8044 data.cur_is_floating = 0; /* initially any found substring is fixed */
8045 ENTER_with_name("study_chunk");
8046 SAVEFREESV(data.substrs[0].str);
8047 SAVEFREESV(data.substrs[1].str);
8048 SAVEFREESV(data.last_found);
8050 if (!RExC_rxi->regstclass) {
8051 ssc_init(pRExC_state, &ch_class);
8052 data.start_class = &ch_class;
8053 stclass_flag = SCF_DO_STCLASS_AND;
8054 } else /* XXXX Check for BOUND? */
8056 data.last_closep = &last_close;
8060 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8061 * (NO top level branches)
8063 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8064 scan + RExC_size, /* Up to end */
8066 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8067 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8071 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8074 if ( RExC_total_parens == 1 && !data.cur_is_floating
8075 && data.last_start_min == 0 && data.last_end > 0
8076 && !RExC_seen_zerolen
8077 && !(RExC_seen & REG_VERBARG_SEEN)
8078 && !(RExC_seen & REG_GPOS_SEEN)
8080 RExC_rx->extflags |= RXf_CHECK_ALL;
8082 scan_commit(pRExC_state, &data,&minlen, 0);
8085 /* XXX this is done in reverse order because that's the way the
8086 * code was before it was parameterised. Don't know whether it
8087 * actually needs doing in reverse order. DAPM */
8088 for (i = 1; i >= 0; i--) {
8089 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8092 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8093 && data.substrs[0].min_offset
8094 == data.substrs[1].min_offset
8095 && SvCUR(data.substrs[0].str)
8096 == SvCUR(data.substrs[1].str)
8098 && S_setup_longest (aTHX_ pRExC_state,
8099 &(RExC_rx->substrs->data[i]),
8103 RExC_rx->substrs->data[i].min_offset =
8104 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8106 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8107 /* Don't offset infinity */
8108 if (data.substrs[i].max_offset < SSize_t_MAX)
8109 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8110 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8113 RExC_rx->substrs->data[i].substr = NULL;
8114 RExC_rx->substrs->data[i].utf8_substr = NULL;
8115 longest_length[i] = 0;
8119 LEAVE_with_name("study_chunk");
8121 if (RExC_rxi->regstclass
8122 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8123 RExC_rxi->regstclass = NULL;
8125 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8126 || RExC_rx->substrs->data[0].min_offset)
8128 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8129 && is_ssc_worth_it(pRExC_state, data.start_class))
8131 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8133 ssc_finalize(pRExC_state, data.start_class);
8135 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8136 StructCopy(data.start_class,
8137 (regnode_ssc*)RExC_rxi->data->data[n],
8139 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8140 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8141 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8142 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8143 Perl_re_printf( aTHX_
8144 "synthetic stclass \"%s\".\n",
8145 SvPVX_const(sv));});
8146 data.start_class = NULL;
8149 /* A temporary algorithm prefers floated substr to fixed one of
8150 * same length to dig more info. */
8151 i = (longest_length[0] <= longest_length[1]);
8152 RExC_rx->substrs->check_ix = i;
8153 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8154 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8155 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8156 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8157 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8158 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8159 RExC_rx->intflags |= PREGf_NOSCAN;
8161 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8162 RExC_rx->extflags |= RXf_USE_INTUIT;
8163 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8164 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8167 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8168 if ( (STRLEN)minlen < longest_length[1] )
8169 minlen= longest_length[1];
8170 if ( (STRLEN)minlen < longest_length[0] )
8171 minlen= longest_length[0];
8175 /* Several toplevels. Best we can is to set minlen. */
8177 regnode_ssc ch_class;
8178 SSize_t last_close = 0;
8180 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8182 scan = RExC_rxi->program + 1;
8183 ssc_init(pRExC_state, &ch_class);
8184 data.start_class = &ch_class;
8185 data.last_closep = &last_close;
8189 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8190 * (patterns WITH top level branches)
8192 minlen = study_chunk(pRExC_state,
8193 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8194 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8195 ? SCF_TRIE_DOING_RESTUDY
8199 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8201 RExC_rx->check_substr = NULL;
8202 RExC_rx->check_utf8 = NULL;
8203 RExC_rx->substrs->data[0].substr = NULL;
8204 RExC_rx->substrs->data[0].utf8_substr = NULL;
8205 RExC_rx->substrs->data[1].substr = NULL;
8206 RExC_rx->substrs->data[1].utf8_substr = NULL;
8208 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8209 && is_ssc_worth_it(pRExC_state, data.start_class))
8211 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8213 ssc_finalize(pRExC_state, data.start_class);
8215 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8216 StructCopy(data.start_class,
8217 (regnode_ssc*)RExC_rxi->data->data[n],
8219 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8220 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8221 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8222 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8223 Perl_re_printf( aTHX_
8224 "synthetic stclass \"%s\".\n",
8225 SvPVX_const(sv));});
8226 data.start_class = NULL;
8230 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8231 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8232 RExC_rx->maxlen = REG_INFTY;
8235 RExC_rx->maxlen = RExC_maxlen;
8238 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8239 the "real" pattern. */
8241 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8242 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8244 RExC_rx->minlenret = minlen;
8245 if (RExC_rx->minlen < minlen)
8246 RExC_rx->minlen = minlen;
8248 if (RExC_seen & REG_RECURSE_SEEN ) {
8249 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8250 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8252 if (RExC_seen & REG_GPOS_SEEN)
8253 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8254 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8255 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8257 if (pRExC_state->code_blocks)
8258 RExC_rx->extflags |= RXf_EVAL_SEEN;
8259 if (RExC_seen & REG_VERBARG_SEEN)
8261 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8262 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8264 if (RExC_seen & REG_CUTGROUP_SEEN)
8265 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8266 if (pm_flags & PMf_USE_RE_EVAL)
8267 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8268 if (RExC_paren_names)
8269 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8271 RXp_PAREN_NAMES(RExC_rx) = NULL;
8273 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8274 * so it can be used in pp.c */
8275 if (RExC_rx->intflags & PREGf_ANCH)
8276 RExC_rx->extflags |= RXf_IS_ANCHORED;
8280 /* this is used to identify "special" patterns that might result
8281 * in Perl NOT calling the regex engine and instead doing the match "itself",
8282 * particularly special cases in split//. By having the regex compiler
8283 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8284 * we avoid weird issues with equivalent patterns resulting in different behavior,
8285 * AND we allow non Perl engines to get the same optimizations by the setting the
8286 * flags appropriately - Yves */
8287 regnode *first = RExC_rxi->program + 1;
8289 regnode *next = regnext(first);
8292 if (PL_regkind[fop] == NOTHING && nop == END)
8293 RExC_rx->extflags |= RXf_NULL;
8294 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8295 /* when fop is SBOL first->flags will be true only when it was
8296 * produced by parsing /\A/, and not when parsing /^/. This is
8297 * very important for the split code as there we want to
8298 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8299 * See rt #122761 for more details. -- Yves */
8300 RExC_rx->extflags |= RXf_START_ONLY;
8301 else if (fop == PLUS
8302 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8304 RExC_rx->extflags |= RXf_WHITE;
8305 else if ( RExC_rx->extflags & RXf_SPLIT
8306 && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
8307 && STR_LEN(first) == 1
8308 && *(STRING(first)) == ' '
8310 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8314 if (RExC_contains_locale) {
8315 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8319 if (RExC_paren_names) {
8320 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8321 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8322 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8325 RExC_rxi->name_list_idx = 0;
8327 while ( RExC_recurse_count > 0 ) {
8328 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8330 * This data structure is set up in study_chunk() and is used
8331 * to calculate the distance between a GOSUB regopcode and
8332 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8335 * If for some reason someone writes code that optimises
8336 * away a GOSUB opcode then the assert should be changed to
8337 * an if(scan) to guard the ARG2L_SET() - Yves
8340 assert(scan && OP(scan) == GOSUB);
8341 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8344 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8345 /* assume we don't need to swap parens around before we match */
8347 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8348 (unsigned long)RExC_study_chunk_recursed_count);
8352 Perl_re_printf( aTHX_ "Final program:\n");
8356 if (RExC_open_parens) {
8357 Safefree(RExC_open_parens);
8358 RExC_open_parens = NULL;
8360 if (RExC_close_parens) {
8361 Safefree(RExC_close_parens);
8362 RExC_close_parens = NULL;
8366 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8367 * by setting the regexp SV to readonly-only instead. If the
8368 * pattern's been recompiled, the USEDness should remain. */
8369 if (old_re && SvREADONLY(old_re))
8377 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8380 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8382 PERL_UNUSED_ARG(value);
8384 if (flags & RXapif_FETCH) {
8385 return reg_named_buff_fetch(rx, key, flags);
8386 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8387 Perl_croak_no_modify();
8389 } else if (flags & RXapif_EXISTS) {
8390 return reg_named_buff_exists(rx, key, flags)
8393 } else if (flags & RXapif_REGNAMES) {
8394 return reg_named_buff_all(rx, flags);
8395 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8396 return reg_named_buff_scalar(rx, flags);
8398 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8404 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8407 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8408 PERL_UNUSED_ARG(lastkey);
8410 if (flags & RXapif_FIRSTKEY)
8411 return reg_named_buff_firstkey(rx, flags);
8412 else if (flags & RXapif_NEXTKEY)
8413 return reg_named_buff_nextkey(rx, flags);
8415 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8422 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8426 struct regexp *const rx = ReANY(r);
8428 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8430 if (rx && RXp_PAREN_NAMES(rx)) {
8431 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8434 SV* sv_dat=HeVAL(he_str);
8435 I32 *nums=(I32*)SvPVX(sv_dat);
8436 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8437 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8438 if ((I32)(rx->nparens) >= nums[i]
8439 && rx->offs[nums[i]].start != -1
8440 && rx->offs[nums[i]].end != -1)
8443 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8448 ret = newSVsv(&PL_sv_undef);
8451 av_push(retarray, ret);
8454 return newRV_noinc(MUTABLE_SV(retarray));
8461 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8464 struct regexp *const rx = ReANY(r);
8466 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8468 if (rx && RXp_PAREN_NAMES(rx)) {
8469 if (flags & RXapif_ALL) {
8470 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8472 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8474 SvREFCNT_dec_NN(sv);
8486 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8488 struct regexp *const rx = ReANY(r);
8490 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8492 if ( rx && RXp_PAREN_NAMES(rx) ) {
8493 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8495 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8502 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8504 struct regexp *const rx = ReANY(r);
8505 GET_RE_DEBUG_FLAGS_DECL;
8507 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8509 if (rx && RXp_PAREN_NAMES(rx)) {
8510 HV *hv = RXp_PAREN_NAMES(rx);
8512 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8515 SV* sv_dat = HeVAL(temphe);
8516 I32 *nums = (I32*)SvPVX(sv_dat);
8517 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8518 if ((I32)(rx->lastparen) >= nums[i] &&
8519 rx->offs[nums[i]].start != -1 &&
8520 rx->offs[nums[i]].end != -1)
8526 if (parno || flags & RXapif_ALL) {
8527 return newSVhek(HeKEY_hek(temphe));
8535 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8540 struct regexp *const rx = ReANY(r);
8542 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8544 if (rx && RXp_PAREN_NAMES(rx)) {
8545 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8546 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8547 } else if (flags & RXapif_ONE) {
8548 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8549 av = MUTABLE_AV(SvRV(ret));
8550 length = av_tindex(av);
8551 SvREFCNT_dec_NN(ret);
8552 return newSViv(length + 1);
8554 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8559 return &PL_sv_undef;
8563 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8565 struct regexp *const rx = ReANY(r);
8568 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8570 if (rx && RXp_PAREN_NAMES(rx)) {
8571 HV *hv= RXp_PAREN_NAMES(rx);
8573 (void)hv_iterinit(hv);
8574 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8577 SV* sv_dat = HeVAL(temphe);
8578 I32 *nums = (I32*)SvPVX(sv_dat);
8579 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8580 if ((I32)(rx->lastparen) >= nums[i] &&
8581 rx->offs[nums[i]].start != -1 &&
8582 rx->offs[nums[i]].end != -1)
8588 if (parno || flags & RXapif_ALL) {
8589 av_push(av, newSVhek(HeKEY_hek(temphe)));
8594 return newRV_noinc(MUTABLE_SV(av));
8598 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8601 struct regexp *const rx = ReANY(r);
8607 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8609 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8610 || n == RX_BUFF_IDX_CARET_FULLMATCH
8611 || n == RX_BUFF_IDX_CARET_POSTMATCH
8614 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8616 /* on something like
8619 * the KEEPCOPY is set on the PMOP rather than the regex */
8620 if (PL_curpm && r == PM_GETRE(PL_curpm))
8621 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8630 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8631 /* no need to distinguish between them any more */
8632 n = RX_BUFF_IDX_FULLMATCH;
8634 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8635 && rx->offs[0].start != -1)
8637 /* $`, ${^PREMATCH} */
8638 i = rx->offs[0].start;
8642 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8643 && rx->offs[0].end != -1)
8645 /* $', ${^POSTMATCH} */
8646 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8647 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8650 if ( 0 <= n && n <= (I32)rx->nparens &&
8651 (s1 = rx->offs[n].start) != -1 &&
8652 (t1 = rx->offs[n].end) != -1)
8654 /* $&, ${^MATCH}, $1 ... */
8656 s = rx->subbeg + s1 - rx->suboffset;
8661 assert(s >= rx->subbeg);
8662 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8664 #ifdef NO_TAINT_SUPPORT
8665 sv_setpvn(sv, s, i);
8667 const int oldtainted = TAINT_get;
8669 sv_setpvn(sv, s, i);
8670 TAINT_set(oldtainted);
8672 if (RXp_MATCH_UTF8(rx))
8677 if (RXp_MATCH_TAINTED(rx)) {
8678 if (SvTYPE(sv) >= SVt_PVMG) {
8679 MAGIC* const mg = SvMAGIC(sv);
8682 SvMAGIC_set(sv, mg->mg_moremagic);
8684 if ((mgt = SvMAGIC(sv))) {
8685 mg->mg_moremagic = mgt;
8686 SvMAGIC_set(sv, mg);
8703 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8704 SV const * const value)
8706 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8708 PERL_UNUSED_ARG(rx);
8709 PERL_UNUSED_ARG(paren);
8710 PERL_UNUSED_ARG(value);
8713 Perl_croak_no_modify();
8717 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8720 struct regexp *const rx = ReANY(r);
8724 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8726 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8727 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8728 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8731 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8733 /* on something like
8736 * the KEEPCOPY is set on the PMOP rather than the regex */
8737 if (PL_curpm && r == PM_GETRE(PL_curpm))
8738 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8744 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8746 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8747 case RX_BUFF_IDX_PREMATCH: /* $` */
8748 if (rx->offs[0].start != -1) {
8749 i = rx->offs[0].start;
8758 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8759 case RX_BUFF_IDX_POSTMATCH: /* $' */
8760 if (rx->offs[0].end != -1) {
8761 i = rx->sublen - rx->offs[0].end;
8763 s1 = rx->offs[0].end;
8770 default: /* $& / ${^MATCH}, $1, $2, ... */
8771 if (paren <= (I32)rx->nparens &&
8772 (s1 = rx->offs[paren].start) != -1 &&
8773 (t1 = rx->offs[paren].end) != -1)
8779 if (ckWARN(WARN_UNINITIALIZED))
8780 report_uninit((const SV *)sv);
8785 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8786 const char * const s = rx->subbeg - rx->suboffset + s1;
8791 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8798 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8800 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8801 PERL_UNUSED_ARG(rx);
8805 return newSVpvs("Regexp");
8808 /* Scans the name of a named buffer from the pattern.
8809 * If flags is REG_RSN_RETURN_NULL returns null.
8810 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8811 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8812 * to the parsed name as looked up in the RExC_paren_names hash.
8813 * If there is an error throws a vFAIL().. type exception.
8816 #define REG_RSN_RETURN_NULL 0
8817 #define REG_RSN_RETURN_NAME 1
8818 #define REG_RSN_RETURN_DATA 2
8821 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8823 char *name_start = RExC_parse;
8826 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8828 assert (RExC_parse <= RExC_end);
8829 if (RExC_parse == RExC_end) NOOP;
8830 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8831 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8832 * using do...while */
8835 RExC_parse += UTF8SKIP(RExC_parse);
8836 } while ( RExC_parse < RExC_end
8837 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
8841 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
8843 RExC_parse++; /* so the <- from the vFAIL is after the offending
8845 vFAIL("Group name must start with a non-digit word character");
8847 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
8848 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
8849 if ( flags == REG_RSN_RETURN_NAME)
8851 else if (flags==REG_RSN_RETURN_DATA) {
8854 if ( ! sv_name ) /* should not happen*/
8855 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
8856 if (RExC_paren_names)
8857 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
8859 sv_dat = HeVAL(he_str);
8860 if ( ! sv_dat ) { /* Didn't find group */
8862 /* It might be a forward reference; we can't fail until we
8863 * know, by completing the parse to get all the groups, and
8865 if (ALL_PARENS_COUNTED) {
8866 vFAIL("Reference to nonexistent named group");
8869 REQUIRE_PARENS_PASS;
8875 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
8876 (unsigned long) flags);
8879 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
8880 if (RExC_lastparse!=RExC_parse) { \
8881 Perl_re_printf( aTHX_ "%s", \
8882 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
8883 RExC_end - RExC_parse, 16, \
8885 PERL_PV_ESCAPE_UNI_DETECT | \
8886 PERL_PV_PRETTY_ELLIPSES | \
8887 PERL_PV_PRETTY_LTGT | \
8888 PERL_PV_ESCAPE_RE | \
8889 PERL_PV_PRETTY_EXACTSIZE \
8893 Perl_re_printf( aTHX_ "%16s",""); \
8895 if (RExC_lastnum!=RExC_emit) \
8896 Perl_re_printf( aTHX_ "|%4d", RExC_emit); \
8898 Perl_re_printf( aTHX_ "|%4s",""); \
8899 Perl_re_printf( aTHX_ "|%*s%-4s", \
8900 (int)((depth*2)), "", \
8903 RExC_lastnum=RExC_emit; \
8904 RExC_lastparse=RExC_parse; \
8909 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
8910 DEBUG_PARSE_MSG((funcname)); \
8911 Perl_re_printf( aTHX_ "%4s","\n"); \
8913 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
8914 DEBUG_PARSE_MSG((funcname)); \
8915 Perl_re_printf( aTHX_ fmt "\n",args); \
8918 /* This section of code defines the inversion list object and its methods. The
8919 * interfaces are highly subject to change, so as much as possible is static to
8920 * this file. An inversion list is here implemented as a malloc'd C UV array
8921 * as an SVt_INVLIST scalar.
8923 * An inversion list for Unicode is an array of code points, sorted by ordinal
8924 * number. Each element gives the code point that begins a range that extends
8925 * up-to but not including the code point given by the next element. The final
8926 * element gives the first code point of a range that extends to the platform's
8927 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
8928 * ...) give ranges whose code points are all in the inversion list. We say
8929 * that those ranges are in the set. The odd-numbered elements give ranges
8930 * whose code points are not in the inversion list, and hence not in the set.
8931 * Thus, element [0] is the first code point in the list. Element [1]
8932 * is the first code point beyond that not in the list; and element [2] is the
8933 * first code point beyond that that is in the list. In other words, the first
8934 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
8935 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
8936 * all code points in that range are not in the inversion list. The third
8937 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
8938 * list, and so forth. Thus every element whose index is divisible by two
8939 * gives the beginning of a range that is in the list, and every element whose
8940 * index is not divisible by two gives the beginning of a range not in the
8941 * list. If the final element's index is divisible by two, the inversion list
8942 * extends to the platform's infinity; otherwise the highest code point in the
8943 * inversion list is the contents of that element minus 1.
8945 * A range that contains just a single code point N will look like
8947 * invlist[i+1] == N+1
8949 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
8950 * impossible to represent, so element [i+1] is omitted. The single element
8952 * invlist[0] == UV_MAX
8953 * contains just UV_MAX, but is interpreted as matching to infinity.
8955 * Taking the complement (inverting) an inversion list is quite simple, if the
8956 * first element is 0, remove it; otherwise add a 0 element at the beginning.
8957 * This implementation reserves an element at the beginning of each inversion
8958 * list to always contain 0; there is an additional flag in the header which
8959 * indicates if the list begins at the 0, or is offset to begin at the next
8960 * element. This means that the inversion list can be inverted without any
8961 * copying; just flip the flag.
8963 * More about inversion lists can be found in "Unicode Demystified"
8964 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
8966 * The inversion list data structure is currently implemented as an SV pointing
8967 * to an array of UVs that the SV thinks are bytes. This allows us to have an
8968 * array of UV whose memory management is automatically handled by the existing
8969 * facilities for SV's.
8971 * Some of the methods should always be private to the implementation, and some
8972 * should eventually be made public */
8974 /* The header definitions are in F<invlist_inline.h> */
8976 #ifndef PERL_IN_XSUB_RE
8978 PERL_STATIC_INLINE UV*
8979 S__invlist_array_init(SV* const invlist, const bool will_have_0)
8981 /* Returns a pointer to the first element in the inversion list's array.
8982 * This is called upon initialization of an inversion list. Where the
8983 * array begins depends on whether the list has the code point U+0000 in it
8984 * or not. The other parameter tells it whether the code that follows this
8985 * call is about to put a 0 in the inversion list or not. The first
8986 * element is either the element reserved for 0, if TRUE, or the element
8987 * after it, if FALSE */
8989 bool* offset = get_invlist_offset_addr(invlist);
8990 UV* zero_addr = (UV *) SvPVX(invlist);
8992 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8995 assert(! _invlist_len(invlist));
8999 /* 1^1 = 0; 1^0 = 1 */
9000 *offset = 1 ^ will_have_0;
9001 return zero_addr + *offset;
9004 PERL_STATIC_INLINE void
9005 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
9007 /* Sets the current number of elements stored in the inversion list.
9008 * Updates SvCUR correspondingly */
9009 PERL_UNUSED_CONTEXT;
9010 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
9012 assert(is_invlist(invlist));
9017 : TO_INTERNAL_SIZE(len + offset));
9018 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
9022 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9024 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9025 * steals the list from 'src', so 'src' is made to have a NULL list. This
9026 * is similar to what SvSetMagicSV() would do, if it were implemented on
9027 * inversion lists, though this routine avoids a copy */
9029 const UV src_len = _invlist_len(src);
9030 const bool src_offset = *get_invlist_offset_addr(src);
9031 const STRLEN src_byte_len = SvLEN(src);
9032 char * array = SvPVX(src);
9034 const int oldtainted = TAINT_get;
9036 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9038 assert(is_invlist(src));
9039 assert(is_invlist(dest));
9040 assert(! invlist_is_iterating(src));
9041 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9043 /* Make sure it ends in the right place with a NUL, as our inversion list
9044 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9046 array[src_byte_len - 1] = '\0';
9048 TAINT_NOT; /* Otherwise it breaks */
9049 sv_usepvn_flags(dest,
9053 /* This flag is documented to cause a copy to be avoided */
9054 SV_HAS_TRAILING_NUL);
9055 TAINT_set(oldtainted);
9060 /* Finish up copying over the other fields in an inversion list */
9061 *get_invlist_offset_addr(dest) = src_offset;
9062 invlist_set_len(dest, src_len, src_offset);
9063 *get_invlist_previous_index_addr(dest) = 0;
9064 invlist_iterfinish(dest);
9067 PERL_STATIC_INLINE IV*
9068 S_get_invlist_previous_index_addr(SV* invlist)
9070 /* Return the address of the IV that is reserved to hold the cached index
9072 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9074 assert(is_invlist(invlist));
9076 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9079 PERL_STATIC_INLINE IV
9080 S_invlist_previous_index(SV* const invlist)
9082 /* Returns cached index of previous search */
9084 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9086 return *get_invlist_previous_index_addr(invlist);
9089 PERL_STATIC_INLINE void
9090 S_invlist_set_previous_index(SV* const invlist, const IV index)
9092 /* Caches <index> for later retrieval */
9094 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9096 assert(index == 0 || index < (int) _invlist_len(invlist));
9098 *get_invlist_previous_index_addr(invlist) = index;
9101 PERL_STATIC_INLINE void
9102 S_invlist_trim(SV* invlist)
9104 /* Free the not currently-being-used space in an inversion list */
9106 /* But don't free up the space needed for the 0 UV that is always at the
9107 * beginning of the list, nor the trailing NUL */
9108 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9110 PERL_ARGS_ASSERT_INVLIST_TRIM;
9112 assert(is_invlist(invlist));
9114 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9117 PERL_STATIC_INLINE void
9118 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9120 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9122 assert(is_invlist(invlist));
9124 invlist_set_len(invlist, 0, 0);
9125 invlist_trim(invlist);
9128 #endif /* ifndef PERL_IN_XSUB_RE */
9130 PERL_STATIC_INLINE bool
9131 S_invlist_is_iterating(SV* const invlist)
9133 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9135 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9138 #ifndef PERL_IN_XSUB_RE
9140 PERL_STATIC_INLINE UV
9141 S_invlist_max(SV* const invlist)
9143 /* Returns the maximum number of elements storable in the inversion list's
9144 * array, without having to realloc() */
9146 PERL_ARGS_ASSERT_INVLIST_MAX;
9148 assert(is_invlist(invlist));
9150 /* Assumes worst case, in which the 0 element is not counted in the
9151 * inversion list, so subtracts 1 for that */
9152 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9153 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9154 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9158 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9160 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9162 /* First 1 is in case the zero element isn't in the list; second 1 is for
9164 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9165 invlist_set_len(invlist, 0, 0);
9167 /* Force iterinit() to be used to get iteration to work */
9168 invlist_iterfinish(invlist);
9170 *get_invlist_previous_index_addr(invlist) = 0;
9174 Perl__new_invlist(pTHX_ IV initial_size)
9177 /* Return a pointer to a newly constructed inversion list, with enough
9178 * space to store 'initial_size' elements. If that number is negative, a
9179 * system default is used instead */
9183 if (initial_size < 0) {
9187 new_list = newSV_type(SVt_INVLIST);
9188 initialize_invlist_guts(new_list, initial_size);
9194 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9196 /* Return a pointer to a newly constructed inversion list, initialized to
9197 * point to <list>, which has to be in the exact correct inversion list
9198 * form, including internal fields. Thus this is a dangerous routine that
9199 * should not be used in the wrong hands. The passed in 'list' contains
9200 * several header fields at the beginning that are not part of the
9201 * inversion list body proper */
9203 const STRLEN length = (STRLEN) list[0];
9204 const UV version_id = list[1];
9205 const bool offset = cBOOL(list[2]);
9206 #define HEADER_LENGTH 3
9207 /* If any of the above changes in any way, you must change HEADER_LENGTH
9208 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9209 * perl -E 'say int(rand 2**31-1)'
9211 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9212 data structure type, so that one being
9213 passed in can be validated to be an
9214 inversion list of the correct vintage.
9217 SV* invlist = newSV_type(SVt_INVLIST);
9219 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9221 if (version_id != INVLIST_VERSION_ID) {
9222 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9225 /* The generated array passed in includes header elements that aren't part
9226 * of the list proper, so start it just after them */
9227 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9229 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9230 shouldn't touch it */
9232 *(get_invlist_offset_addr(invlist)) = offset;
9234 /* The 'length' passed to us is the physical number of elements in the
9235 * inversion list. But if there is an offset the logical number is one
9237 invlist_set_len(invlist, length - offset, offset);
9239 invlist_set_previous_index(invlist, 0);
9241 /* Initialize the iteration pointer. */
9242 invlist_iterfinish(invlist);
9244 SvREADONLY_on(invlist);
9250 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
9252 /* Grow the maximum size of an inversion list */
9254 PERL_ARGS_ASSERT_INVLIST_EXTEND;
9256 assert(is_invlist(invlist));
9258 /* Add one to account for the zero element at the beginning which may not
9259 * be counted by the calling parameters */
9260 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
9264 S__append_range_to_invlist(pTHX_ SV* const invlist,
9265 const UV start, const UV end)
9267 /* Subject to change or removal. Append the range from 'start' to 'end' at
9268 * the end of the inversion list. The range must be above any existing
9272 UV max = invlist_max(invlist);
9273 UV len = _invlist_len(invlist);
9276 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9278 if (len == 0) { /* Empty lists must be initialized */
9279 offset = start != 0;
9280 array = _invlist_array_init(invlist, ! offset);
9283 /* Here, the existing list is non-empty. The current max entry in the
9284 * list is generally the first value not in the set, except when the
9285 * set extends to the end of permissible values, in which case it is
9286 * the first entry in that final set, and so this call is an attempt to
9287 * append out-of-order */
9289 UV final_element = len - 1;
9290 array = invlist_array(invlist);
9291 if ( array[final_element] > start
9292 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9294 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9295 array[final_element], start,
9296 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9299 /* Here, it is a legal append. If the new range begins 1 above the end
9300 * of the range below it, it is extending the range below it, so the
9301 * new first value not in the set is one greater than the newly
9302 * extended range. */
9303 offset = *get_invlist_offset_addr(invlist);
9304 if (array[final_element] == start) {
9305 if (end != UV_MAX) {
9306 array[final_element] = end + 1;
9309 /* But if the end is the maximum representable on the machine,
9310 * assume that infinity was actually what was meant. Just let
9311 * the range that this would extend to have no end */
9312 invlist_set_len(invlist, len - 1, offset);
9318 /* Here the new range doesn't extend any existing set. Add it */
9320 len += 2; /* Includes an element each for the start and end of range */
9322 /* If wll overflow the existing space, extend, which may cause the array to
9325 invlist_extend(invlist, len);
9327 /* Have to set len here to avoid assert failure in invlist_array() */
9328 invlist_set_len(invlist, len, offset);
9330 array = invlist_array(invlist);
9333 invlist_set_len(invlist, len, offset);
9336 /* The next item on the list starts the range, the one after that is
9337 * one past the new range. */
9338 array[len - 2] = start;
9339 if (end != UV_MAX) {
9340 array[len - 1] = end + 1;
9343 /* But if the end is the maximum representable on the machine, just let
9344 * the range have no end */
9345 invlist_set_len(invlist, len - 1, offset);
9350 Perl__invlist_search(SV* const invlist, const UV cp)
9352 /* Searches the inversion list for the entry that contains the input code
9353 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9354 * return value is the index into the list's array of the range that
9355 * contains <cp>, that is, 'i' such that
9356 * array[i] <= cp < array[i+1]
9361 IV high = _invlist_len(invlist);
9362 const IV highest_element = high - 1;
9365 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9367 /* If list is empty, return failure. */
9372 /* (We can't get the array unless we know the list is non-empty) */
9373 array = invlist_array(invlist);
9375 mid = invlist_previous_index(invlist);
9377 if (mid > highest_element) {
9378 mid = highest_element;
9381 /* <mid> contains the cache of the result of the previous call to this
9382 * function (0 the first time). See if this call is for the same result,
9383 * or if it is for mid-1. This is under the theory that calls to this
9384 * function will often be for related code points that are near each other.
9385 * And benchmarks show that caching gives better results. We also test
9386 * here if the code point is within the bounds of the list. These tests
9387 * replace others that would have had to be made anyway to make sure that
9388 * the array bounds were not exceeded, and these give us extra information
9389 * at the same time */
9390 if (cp >= array[mid]) {
9391 if (cp >= array[highest_element]) {
9392 return highest_element;
9395 /* Here, array[mid] <= cp < array[highest_element]. This means that
9396 * the final element is not the answer, so can exclude it; it also
9397 * means that <mid> is not the final element, so can refer to 'mid + 1'
9399 if (cp < array[mid + 1]) {
9405 else { /* cp < aray[mid] */
9406 if (cp < array[0]) { /* Fail if outside the array */
9410 if (cp >= array[mid - 1]) {
9415 /* Binary search. What we are looking for is <i> such that
9416 * array[i] <= cp < array[i+1]
9417 * The loop below converges on the i+1. Note that there may not be an
9418 * (i+1)th element in the array, and things work nonetheless */
9419 while (low < high) {
9420 mid = (low + high) / 2;
9421 assert(mid <= highest_element);
9422 if (array[mid] <= cp) { /* cp >= array[mid] */
9425 /* We could do this extra test to exit the loop early.
9426 if (cp < array[low]) {
9431 else { /* cp < array[mid] */
9438 invlist_set_previous_index(invlist, high);
9443 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9444 const bool complement_b, SV** output)
9446 /* Take the union of two inversion lists and point '*output' to it. On
9447 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9448 * even 'a' or 'b'). If to an inversion list, the contents of the original
9449 * list will be replaced by the union. The first list, 'a', may be
9450 * NULL, in which case a copy of the second list is placed in '*output'.
9451 * If 'complement_b' is TRUE, the union is taken of the complement
9452 * (inversion) of 'b' instead of b itself.
9454 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9455 * Richard Gillam, published by Addison-Wesley, and explained at some
9456 * length there. The preface says to incorporate its examples into your
9457 * code at your own risk.
9459 * The algorithm is like a merge sort. */
9461 const UV* array_a; /* a's array */
9463 UV len_a; /* length of a's array */
9466 SV* u; /* the resulting union */
9470 UV i_a = 0; /* current index into a's array */
9474 /* running count, as explained in the algorithm source book; items are
9475 * stopped accumulating and are output when the count changes to/from 0.
9476 * The count is incremented when we start a range that's in an input's set,
9477 * and decremented when we start a range that's not in a set. So this
9478 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9479 * and hence nothing goes into the union; 1, just one of the inputs is in
9480 * its set (and its current range gets added to the union); and 2 when both
9481 * inputs are in their sets. */
9484 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9486 assert(*output == NULL || is_invlist(*output));
9488 len_b = _invlist_len(b);
9491 /* Here, 'b' is empty, hence it's complement is all possible code
9492 * points. So if the union includes the complement of 'b', it includes
9493 * everything, and we need not even look at 'a'. It's easiest to
9494 * create a new inversion list that matches everything. */
9496 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9498 if (*output == NULL) { /* If the output didn't exist, just point it
9500 *output = everything;
9502 else { /* Otherwise, replace its contents with the new list */
9503 invlist_replace_list_destroys_src(*output, everything);
9504 SvREFCNT_dec_NN(everything);
9510 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9511 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9512 * output will be empty */
9514 if (a == NULL || _invlist_len(a) == 0) {
9515 if (*output == NULL) {
9516 *output = _new_invlist(0);
9519 invlist_clear(*output);
9524 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9525 * union. We can just return a copy of 'a' if '*output' doesn't point
9526 * to an existing list */
9527 if (*output == NULL) {
9528 *output = invlist_clone(a, NULL);
9532 /* If the output is to overwrite 'a', we have a no-op, as it's
9538 /* Here, '*output' is to be overwritten by 'a' */
9539 u = invlist_clone(a, NULL);
9540 invlist_replace_list_destroys_src(*output, u);
9546 /* Here 'b' is not empty. See about 'a' */
9548 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9550 /* Here, 'a' is empty (and b is not). That means the union will come
9551 * entirely from 'b'. If '*output' is NULL, we can directly return a
9552 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9555 SV ** dest = (*output == NULL) ? output : &u;
9556 *dest = invlist_clone(b, NULL);
9558 _invlist_invert(*dest);
9562 invlist_replace_list_destroys_src(*output, u);
9569 /* Here both lists exist and are non-empty */
9570 array_a = invlist_array(a);
9571 array_b = invlist_array(b);
9573 /* If are to take the union of 'a' with the complement of b, set it
9574 * up so are looking at b's complement. */
9577 /* To complement, we invert: if the first element is 0, remove it. To
9578 * do this, we just pretend the array starts one later */
9579 if (array_b[0] == 0) {
9585 /* But if the first element is not zero, we pretend the list starts
9586 * at the 0 that is always stored immediately before the array. */
9592 /* Size the union for the worst case: that the sets are completely
9594 u = _new_invlist(len_a + len_b);
9596 /* Will contain U+0000 if either component does */
9597 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9598 || (len_b > 0 && array_b[0] == 0));
9600 /* Go through each input list item by item, stopping when have exhausted
9602 while (i_a < len_a && i_b < len_b) {
9603 UV cp; /* The element to potentially add to the union's array */
9604 bool cp_in_set; /* is it in the the input list's set or not */
9606 /* We need to take one or the other of the two inputs for the union.
9607 * Since we are merging two sorted lists, we take the smaller of the
9608 * next items. In case of a tie, we take first the one that is in its
9609 * set. If we first took the one not in its set, it would decrement
9610 * the count, possibly to 0 which would cause it to be output as ending
9611 * the range, and the next time through we would take the same number,
9612 * and output it again as beginning the next range. By doing it the
9613 * opposite way, there is no possibility that the count will be
9614 * momentarily decremented to 0, and thus the two adjoining ranges will
9615 * be seamlessly merged. (In a tie and both are in the set or both not
9616 * in the set, it doesn't matter which we take first.) */
9617 if ( array_a[i_a] < array_b[i_b]
9618 || ( array_a[i_a] == array_b[i_b]
9619 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9621 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9622 cp = array_a[i_a++];
9625 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9626 cp = array_b[i_b++];
9629 /* Here, have chosen which of the two inputs to look at. Only output
9630 * if the running count changes to/from 0, which marks the
9631 * beginning/end of a range that's in the set */
9634 array_u[i_u++] = cp;
9641 array_u[i_u++] = cp;
9647 /* The loop above increments the index into exactly one of the input lists
9648 * each iteration, and ends when either index gets to its list end. That
9649 * means the other index is lower than its end, and so something is
9650 * remaining in that one. We decrement 'count', as explained below, if
9651 * that list is in its set. (i_a and i_b each currently index the element
9652 * beyond the one we care about.) */
9653 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9654 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9659 /* Above we decremented 'count' if the list that had unexamined elements in
9660 * it was in its set. This has made it so that 'count' being non-zero
9661 * means there isn't anything left to output; and 'count' equal to 0 means
9662 * that what is left to output is precisely that which is left in the
9663 * non-exhausted input list.
9665 * To see why, note first that the exhausted input obviously has nothing
9666 * left to add to the union. If it was in its set at its end, that means
9667 * the set extends from here to the platform's infinity, and hence so does
9668 * the union and the non-exhausted set is irrelevant. The exhausted set
9669 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9670 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9671 * 'count' remains at 1. This is consistent with the decremented 'count'
9672 * != 0 meaning there's nothing left to add to the union.
9674 * But if the exhausted input wasn't in its set, it contributed 0 to
9675 * 'count', and the rest of the union will be whatever the other input is.
9676 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9677 * otherwise it gets decremented to 0. This is consistent with 'count'
9678 * == 0 meaning the remainder of the union is whatever is left in the
9679 * non-exhausted list. */
9684 IV copy_count = len_a - i_a;
9685 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9686 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9688 else { /* The non-exhausted input is b */
9689 copy_count = len_b - i_b;
9690 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9692 len_u = i_u + copy_count;
9695 /* Set the result to the final length, which can change the pointer to
9696 * array_u, so re-find it. (Note that it is unlikely that this will
9697 * change, as we are shrinking the space, not enlarging it) */
9698 if (len_u != _invlist_len(u)) {
9699 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9701 array_u = invlist_array(u);
9704 if (*output == NULL) { /* Simply return the new inversion list */
9708 /* Otherwise, overwrite the inversion list that was in '*output'. We
9709 * could instead free '*output', and then set it to 'u', but experience
9710 * has shown [perl #127392] that if the input is a mortal, we can get a
9711 * huge build-up of these during regex compilation before they get
9713 invlist_replace_list_destroys_src(*output, u);
9721 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9722 const bool complement_b, SV** i)
9724 /* Take the intersection of two inversion lists and point '*i' to it. On
9725 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9726 * even 'a' or 'b'). If to an inversion list, the contents of the original
9727 * list will be replaced by the intersection. The first list, 'a', may be
9728 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9729 * TRUE, the result will be the intersection of 'a' and the complement (or
9730 * inversion) of 'b' instead of 'b' directly.
9732 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9733 * Richard Gillam, published by Addison-Wesley, and explained at some
9734 * length there. The preface says to incorporate its examples into your
9735 * code at your own risk. In fact, it had bugs
9737 * The algorithm is like a merge sort, and is essentially the same as the
9741 const UV* array_a; /* a's array */
9743 UV len_a; /* length of a's array */
9746 SV* r; /* the resulting intersection */
9750 UV i_a = 0; /* current index into a's array */
9754 /* running count of how many of the two inputs are postitioned at ranges
9755 * that are in their sets. As explained in the algorithm source book,
9756 * items are stopped accumulating and are output when the count changes
9757 * to/from 2. The count is incremented when we start a range that's in an
9758 * input's set, and decremented when we start a range that's not in a set.
9759 * Only when it is 2 are we in the intersection. */
9762 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9764 assert(*i == NULL || is_invlist(*i));
9766 /* Special case if either one is empty */
9767 len_a = (a == NULL) ? 0 : _invlist_len(a);
9768 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9769 if (len_a != 0 && complement_b) {
9771 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9772 * must be empty. Here, also we are using 'b's complement, which
9773 * hence must be every possible code point. Thus the intersection
9776 if (*i == a) { /* No-op */
9781 *i = invlist_clone(a, NULL);
9785 r = invlist_clone(a, NULL);
9786 invlist_replace_list_destroys_src(*i, r);
9791 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9792 * intersection must be empty */
9794 *i = _new_invlist(0);
9802 /* Here both lists exist and are non-empty */
9803 array_a = invlist_array(a);
9804 array_b = invlist_array(b);
9806 /* If are to take the intersection of 'a' with the complement of b, set it
9807 * up so are looking at b's complement. */
9810 /* To complement, we invert: if the first element is 0, remove it. To
9811 * do this, we just pretend the array starts one later */
9812 if (array_b[0] == 0) {
9818 /* But if the first element is not zero, we pretend the list starts
9819 * at the 0 that is always stored immediately before the array. */
9825 /* Size the intersection for the worst case: that the intersection ends up
9826 * fragmenting everything to be completely disjoint */
9827 r= _new_invlist(len_a + len_b);
9829 /* Will contain U+0000 iff both components do */
9830 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9831 && len_b > 0 && array_b[0] == 0);
9833 /* Go through each list item by item, stopping when have exhausted one of
9835 while (i_a < len_a && i_b < len_b) {
9836 UV cp; /* The element to potentially add to the intersection's
9838 bool cp_in_set; /* Is it in the input list's set or not */
9840 /* We need to take one or the other of the two inputs for the
9841 * intersection. Since we are merging two sorted lists, we take the
9842 * smaller of the next items. In case of a tie, we take first the one
9843 * that is not in its set (a difference from the union algorithm). If
9844 * we first took the one in its set, it would increment the count,
9845 * possibly to 2 which would cause it to be output as starting a range
9846 * in the intersection, and the next time through we would take that
9847 * same number, and output it again as ending the set. By doing the
9848 * opposite of this, there is no possibility that the count will be
9849 * momentarily incremented to 2. (In a tie and both are in the set or
9850 * both not in the set, it doesn't matter which we take first.) */
9851 if ( array_a[i_a] < array_b[i_b]
9852 || ( array_a[i_a] == array_b[i_b]
9853 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9855 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9856 cp = array_a[i_a++];
9859 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9863 /* Here, have chosen which of the two inputs to look at. Only output
9864 * if the running count changes to/from 2, which marks the
9865 * beginning/end of a range that's in the intersection */
9869 array_r[i_r++] = cp;
9874 array_r[i_r++] = cp;
9881 /* The loop above increments the index into exactly one of the input lists
9882 * each iteration, and ends when either index gets to its list end. That
9883 * means the other index is lower than its end, and so something is
9884 * remaining in that one. We increment 'count', as explained below, if the
9885 * exhausted list was in its set. (i_a and i_b each currently index the
9886 * element beyond the one we care about.) */
9887 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9888 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9893 /* Above we incremented 'count' if the exhausted list was in its set. This
9894 * has made it so that 'count' being below 2 means there is nothing left to
9895 * output; otheriwse what's left to add to the intersection is precisely
9896 * that which is left in the non-exhausted input list.
9898 * To see why, note first that the exhausted input obviously has nothing
9899 * left to affect the intersection. If it was in its set at its end, that
9900 * means the set extends from here to the platform's infinity, and hence
9901 * anything in the non-exhausted's list will be in the intersection, and
9902 * anything not in it won't be. Hence, the rest of the intersection is
9903 * precisely what's in the non-exhausted list The exhausted set also
9904 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
9905 * it means 'count' is now at least 2. This is consistent with the
9906 * incremented 'count' being >= 2 means to add the non-exhausted list to
9909 * But if the exhausted input wasn't in its set, it contributed 0 to
9910 * 'count', and the intersection can't include anything further; the
9911 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
9912 * incremented. This is consistent with 'count' being < 2 meaning nothing
9913 * further to add to the intersection. */
9914 if (count < 2) { /* Nothing left to put in the intersection. */
9917 else { /* copy the non-exhausted list, unchanged. */
9918 IV copy_count = len_a - i_a;
9919 if (copy_count > 0) { /* a is the one with stuff left */
9920 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
9922 else { /* b is the one with stuff left */
9923 copy_count = len_b - i_b;
9924 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
9926 len_r = i_r + copy_count;
9929 /* Set the result to the final length, which can change the pointer to
9930 * array_r, so re-find it. (Note that it is unlikely that this will
9931 * change, as we are shrinking the space, not enlarging it) */
9932 if (len_r != _invlist_len(r)) {
9933 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
9935 array_r = invlist_array(r);
9938 if (*i == NULL) { /* Simply return the calculated intersection */
9941 else { /* Otherwise, replace the existing inversion list in '*i'. We could
9942 instead free '*i', and then set it to 'r', but experience has
9943 shown [perl #127392] that if the input is a mortal, we can get a
9944 huge build-up of these during regex compilation before they get
9947 invlist_replace_list_destroys_src(*i, r);
9959 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
9961 /* Add the range from 'start' to 'end' inclusive to the inversion list's
9962 * set. A pointer to the inversion list is returned. This may actually be
9963 * a new list, in which case the passed in one has been destroyed. The
9964 * passed-in inversion list can be NULL, in which case a new one is created
9965 * with just the one range in it. The new list is not necessarily
9966 * NUL-terminated. Space is not freed if the inversion list shrinks as a
9967 * result of this function. The gain would not be large, and in many
9968 * cases, this is called multiple times on a single inversion list, so
9969 * anything freed may almost immediately be needed again.
9971 * This used to mostly call the 'union' routine, but that is much more
9972 * heavyweight than really needed for a single range addition */
9974 UV* array; /* The array implementing the inversion list */
9975 UV len; /* How many elements in 'array' */
9976 SSize_t i_s; /* index into the invlist array where 'start'
9978 SSize_t i_e = 0; /* And the index where 'end' should go */
9979 UV cur_highest; /* The highest code point in the inversion list
9980 upon entry to this function */
9982 /* This range becomes the whole inversion list if none already existed */
9983 if (invlist == NULL) {
9984 invlist = _new_invlist(2);
9985 _append_range_to_invlist(invlist, start, end);
9989 /* Likewise, if the inversion list is currently empty */
9990 len = _invlist_len(invlist);
9992 _append_range_to_invlist(invlist, start, end);
9996 /* Starting here, we have to know the internals of the list */
9997 array = invlist_array(invlist);
9999 /* If the new range ends higher than the current highest ... */
10000 cur_highest = invlist_highest(invlist);
10001 if (end > cur_highest) {
10003 /* If the whole range is higher, we can just append it */
10004 if (start > cur_highest) {
10005 _append_range_to_invlist(invlist, start, end);
10009 /* Otherwise, add the portion that is higher ... */
10010 _append_range_to_invlist(invlist, cur_highest + 1, end);
10012 /* ... and continue on below to handle the rest. As a result of the
10013 * above append, we know that the index of the end of the range is the
10014 * final even numbered one of the array. Recall that the final element
10015 * always starts a range that extends to infinity. If that range is in
10016 * the set (meaning the set goes from here to infinity), it will be an
10017 * even index, but if it isn't in the set, it's odd, and the final
10018 * range in the set is one less, which is even. */
10019 if (end == UV_MAX) {
10027 /* We have dealt with appending, now see about prepending. If the new
10028 * range starts lower than the current lowest ... */
10029 if (start < array[0]) {
10031 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10032 * Let the union code handle it, rather than having to know the
10033 * trickiness in two code places. */
10034 if (UNLIKELY(start == 0)) {
10037 range_invlist = _new_invlist(2);
10038 _append_range_to_invlist(range_invlist, start, end);
10040 _invlist_union(invlist, range_invlist, &invlist);
10042 SvREFCNT_dec_NN(range_invlist);
10047 /* If the whole new range comes before the first entry, and doesn't
10048 * extend it, we have to insert it as an additional range */
10049 if (end < array[0] - 1) {
10051 goto splice_in_new_range;
10054 /* Here the new range adjoins the existing first range, extending it
10058 /* And continue on below to handle the rest. We know that the index of
10059 * the beginning of the range is the first one of the array */
10062 else { /* Not prepending any part of the new range to the existing list.
10063 * Find where in the list it should go. This finds i_s, such that:
10064 * invlist[i_s] <= start < array[i_s+1]
10066 i_s = _invlist_search(invlist, start);
10069 /* At this point, any extending before the beginning of the inversion list
10070 * and/or after the end has been done. This has made it so that, in the
10071 * code below, each endpoint of the new range is either in a range that is
10072 * in the set, or is in a gap between two ranges that are. This means we
10073 * don't have to worry about exceeding the array bounds.
10075 * Find where in the list the new range ends (but we can skip this if we
10076 * have already determined what it is, or if it will be the same as i_s,
10077 * which we already have computed) */
10079 i_e = (start == end)
10081 : _invlist_search(invlist, end);
10084 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10085 * is a range that goes to infinity there is no element at invlist[i_e+1],
10086 * so only the first relation holds. */
10088 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10090 /* Here, the ranges on either side of the beginning of the new range
10091 * are in the set, and this range starts in the gap between them.
10093 * The new range extends the range above it downwards if the new range
10094 * ends at or above that range's start */
10095 const bool extends_the_range_above = ( end == UV_MAX
10096 || end + 1 >= array[i_s+1]);
10098 /* The new range extends the range below it upwards if it begins just
10099 * after where that range ends */
10100 if (start == array[i_s]) {
10102 /* If the new range fills the entire gap between the other ranges,
10103 * they will get merged together. Other ranges may also get
10104 * merged, depending on how many of them the new range spans. In
10105 * the general case, we do the merge later, just once, after we
10106 * figure out how many to merge. But in the case where the new
10107 * range exactly spans just this one gap (possibly extending into
10108 * the one above), we do the merge here, and an early exit. This
10109 * is done here to avoid having to special case later. */
10110 if (i_e - i_s <= 1) {
10112 /* If i_e - i_s == 1, it means that the new range terminates
10113 * within the range above, and hence 'extends_the_range_above'
10114 * must be true. (If the range above it extends to infinity,
10115 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10116 * will be 0, so no harm done.) */
10117 if (extends_the_range_above) {
10118 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10119 invlist_set_len(invlist,
10121 *(get_invlist_offset_addr(invlist)));
10125 /* Here, i_e must == i_s. We keep them in sync, as they apply
10126 * to the same range, and below we are about to decrement i_s
10131 /* Here, the new range is adjacent to the one below. (It may also
10132 * span beyond the range above, but that will get resolved later.)
10133 * Extend the range below to include this one. */
10134 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10136 start = array[i_s];
10138 else if (extends_the_range_above) {
10140 /* Here the new range only extends the range above it, but not the
10141 * one below. It merges with the one above. Again, we keep i_e
10142 * and i_s in sync if they point to the same range */
10147 array[i_s] = start;
10151 /* Here, we've dealt with the new range start extending any adjoining
10154 * If the new range extends to infinity, it is now the final one,
10155 * regardless of what was there before */
10156 if (UNLIKELY(end == UV_MAX)) {
10157 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10161 /* If i_e started as == i_s, it has also been dealt with,
10162 * and been updated to the new i_s, which will fail the following if */
10163 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10165 /* Here, the ranges on either side of the end of the new range are in
10166 * the set, and this range ends in the gap between them.
10168 * If this range is adjacent to (hence extends) the range above it, it
10169 * becomes part of that range; likewise if it extends the range below,
10170 * it becomes part of that range */
10171 if (end + 1 == array[i_e+1]) {
10173 array[i_e] = start;
10175 else if (start <= array[i_e]) {
10176 array[i_e] = end + 1;
10183 /* If the range fits entirely in an existing range (as possibly already
10184 * extended above), it doesn't add anything new */
10185 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10189 /* Here, no part of the range is in the list. Must add it. It will
10190 * occupy 2 more slots */
10191 splice_in_new_range:
10193 invlist_extend(invlist, len + 2);
10194 array = invlist_array(invlist);
10195 /* Move the rest of the array down two slots. Don't include any
10197 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10199 /* Do the actual splice */
10200 array[i_e+1] = start;
10201 array[i_e+2] = end + 1;
10202 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10206 /* Here the new range crossed the boundaries of a pre-existing range. The
10207 * code above has adjusted things so that both ends are in ranges that are
10208 * in the set. This means everything in between must also be in the set.
10209 * Just squash things together */
10210 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10211 invlist_set_len(invlist,
10213 *(get_invlist_offset_addr(invlist)));
10219 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10220 UV** other_elements_ptr)
10222 /* Create and return an inversion list whose contents are to be populated
10223 * by the caller. The caller gives the number of elements (in 'size') and
10224 * the very first element ('element0'). This function will set
10225 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10226 * are to be placed.
10228 * Obviously there is some trust involved that the caller will properly
10229 * fill in the other elements of the array.
10231 * (The first element needs to be passed in, as the underlying code does
10232 * things differently depending on whether it is zero or non-zero) */
10234 SV* invlist = _new_invlist(size);
10237 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10239 invlist = add_cp_to_invlist(invlist, element0);
10240 offset = *get_invlist_offset_addr(invlist);
10242 invlist_set_len(invlist, size, offset);
10243 *other_elements_ptr = invlist_array(invlist) + 1;
10249 PERL_STATIC_INLINE SV*
10250 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
10251 return _add_range_to_invlist(invlist, cp, cp);
10254 #ifndef PERL_IN_XSUB_RE
10256 Perl__invlist_invert(pTHX_ SV* const invlist)
10258 /* Complement the input inversion list. This adds a 0 if the list didn't
10259 * have a zero; removes it otherwise. As described above, the data
10260 * structure is set up so that this is very efficient */
10262 PERL_ARGS_ASSERT__INVLIST_INVERT;
10264 assert(! invlist_is_iterating(invlist));
10266 /* The inverse of matching nothing is matching everything */
10267 if (_invlist_len(invlist) == 0) {
10268 _append_range_to_invlist(invlist, 0, UV_MAX);
10272 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10276 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10278 /* Return a new inversion list that is a copy of the input one, which is
10279 * unchanged. The new list will not be mortal even if the old one was. */
10281 const STRLEN nominal_length = _invlist_len(invlist);
10282 const STRLEN physical_length = SvCUR(invlist);
10283 const bool offset = *(get_invlist_offset_addr(invlist));
10285 PERL_ARGS_ASSERT_INVLIST_CLONE;
10287 if (new_invlist == NULL) {
10288 new_invlist = _new_invlist(nominal_length);
10291 sv_upgrade(new_invlist, SVt_INVLIST);
10292 initialize_invlist_guts(new_invlist, nominal_length);
10295 *(get_invlist_offset_addr(new_invlist)) = offset;
10296 invlist_set_len(new_invlist, nominal_length, offset);
10297 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10299 return new_invlist;
10304 PERL_STATIC_INLINE STRLEN*
10305 S_get_invlist_iter_addr(SV* invlist)
10307 /* Return the address of the UV that contains the current iteration
10310 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
10312 assert(is_invlist(invlist));
10314 return &(((XINVLIST*) SvANY(invlist))->iterator);
10317 PERL_STATIC_INLINE void
10318 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
10320 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
10322 *get_invlist_iter_addr(invlist) = 0;
10325 PERL_STATIC_INLINE void
10326 S_invlist_iterfinish(SV* invlist)
10328 /* Terminate iterator for invlist. This is to catch development errors.
10329 * Any iteration that is interrupted before completed should call this
10330 * function. Functions that add code points anywhere else but to the end
10331 * of an inversion list assert that they are not in the middle of an
10332 * iteration. If they were, the addition would make the iteration
10333 * problematical: if the iteration hadn't reached the place where things
10334 * were being added, it would be ok */
10336 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
10338 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
10342 S_invlist_iternext(SV* invlist, UV* start, UV* end)
10344 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
10345 * This call sets in <*start> and <*end>, the next range in <invlist>.
10346 * Returns <TRUE> if successful and the next call will return the next
10347 * range; <FALSE> if was already at the end of the list. If the latter,
10348 * <*start> and <*end> are unchanged, and the next call to this function
10349 * will start over at the beginning of the list */
10351 STRLEN* pos = get_invlist_iter_addr(invlist);
10352 UV len = _invlist_len(invlist);
10355 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
10358 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
10362 array = invlist_array(invlist);
10364 *start = array[(*pos)++];
10370 *end = array[(*pos)++] - 1;
10376 PERL_STATIC_INLINE UV
10377 S_invlist_highest(SV* const invlist)
10379 /* Returns the highest code point that matches an inversion list. This API
10380 * has an ambiguity, as it returns 0 under either the highest is actually
10381 * 0, or if the list is empty. If this distinction matters to you, check
10382 * for emptiness before calling this function */
10384 UV len = _invlist_len(invlist);
10387 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
10393 array = invlist_array(invlist);
10395 /* The last element in the array in the inversion list always starts a
10396 * range that goes to infinity. That range may be for code points that are
10397 * matched in the inversion list, or it may be for ones that aren't
10398 * matched. In the latter case, the highest code point in the set is one
10399 * less than the beginning of this range; otherwise it is the final element
10400 * of this range: infinity */
10401 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
10403 : array[len - 1] - 1;
10407 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10409 /* Get the contents of an inversion list into a string SV so that they can
10410 * be printed out. If 'traditional_style' is TRUE, it uses the format
10411 * traditionally done for debug tracing; otherwise it uses a format
10412 * suitable for just copying to the output, with blanks between ranges and
10413 * a dash between range components */
10417 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10418 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10420 if (traditional_style) {
10421 output = newSVpvs("\n");
10424 output = newSVpvs("");
10427 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10429 assert(! invlist_is_iterating(invlist));
10431 invlist_iterinit(invlist);
10432 while (invlist_iternext(invlist, &start, &end)) {
10433 if (end == UV_MAX) {
10434 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10435 start, intra_range_delimiter,
10436 inter_range_delimiter);
10438 else if (end != start) {
10439 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10441 intra_range_delimiter,
10442 end, inter_range_delimiter);
10445 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10446 start, inter_range_delimiter);
10450 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10451 SvCUR_set(output, SvCUR(output) - 1);
10457 #ifndef PERL_IN_XSUB_RE
10459 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10460 const char * const indent, SV* const invlist)
10462 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10463 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10464 * the string 'indent'. The output looks like this:
10465 [0] 0x000A .. 0x000D
10467 [4] 0x2028 .. 0x2029
10468 [6] 0x3104 .. INFTY
10469 * This means that the first range of code points matched by the list are
10470 * 0xA through 0xD; the second range contains only the single code point
10471 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10472 * are used to define each range (except if the final range extends to
10473 * infinity, only a single element is needed). The array index of the
10474 * first element for the corresponding range is given in brackets. */
10479 PERL_ARGS_ASSERT__INVLIST_DUMP;
10481 if (invlist_is_iterating(invlist)) {
10482 Perl_dump_indent(aTHX_ level, file,
10483 "%sCan't dump inversion list because is in middle of iterating\n",
10488 invlist_iterinit(invlist);
10489 while (invlist_iternext(invlist, &start, &end)) {
10490 if (end == UV_MAX) {
10491 Perl_dump_indent(aTHX_ level, file,
10492 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10493 indent, (UV)count, start);
10495 else if (end != start) {
10496 Perl_dump_indent(aTHX_ level, file,
10497 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10498 indent, (UV)count, start, end);
10501 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10502 indent, (UV)count, start);
10510 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10512 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10514 /* Return a boolean as to if the two passed in inversion lists are
10515 * identical. The final argument, if TRUE, says to take the complement of
10516 * the second inversion list before doing the comparison */
10518 const UV len_a = _invlist_len(a);
10519 UV len_b = _invlist_len(b);
10521 const UV* array_a = NULL;
10522 const UV* array_b = NULL;
10524 PERL_ARGS_ASSERT__INVLISTEQ;
10526 /* This code avoids accessing the arrays unless it knows the length is
10531 return ! complement_b;
10535 array_a = invlist_array(a);
10539 array_b = invlist_array(b);
10542 /* If are to compare 'a' with the complement of b, set it
10543 * up so are looking at b's complement. */
10544 if (complement_b) {
10546 /* The complement of nothing is everything, so <a> would have to have
10547 * just one element, starting at zero (ending at infinity) */
10549 return (len_a == 1 && array_a[0] == 0);
10551 if (array_b[0] == 0) {
10553 /* Otherwise, to complement, we invert. Here, the first element is
10554 * 0, just remove it. To do this, we just pretend the array starts
10562 /* But if the first element is not zero, we pretend the list starts
10563 * at the 0 that is always stored immediately before the array. */
10569 return len_a == len_b
10570 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10576 * As best we can, determine the characters that can match the start of
10577 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
10578 * can be false positive matches
10580 * Returns the invlist as a new SV*; it is the caller's responsibility to
10581 * call SvREFCNT_dec() when done with it.
10584 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10587 const U8 * s = (U8*)STRING(node);
10588 SSize_t bytelen = STR_LEN(node);
10590 /* Start out big enough for 2 separate code points */
10591 SV* invlist = _new_invlist(4);
10593 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
10598 /* We punt and assume can match anything if the node begins
10599 * with a multi-character fold. Things are complicated. For
10600 * example, /ffi/i could match any of:
10601 * "\N{LATIN SMALL LIGATURE FFI}"
10602 * "\N{LATIN SMALL LIGATURE FF}I"
10603 * "F\N{LATIN SMALL LIGATURE FI}"
10604 * plus several other things; and making sure we have all the
10605 * possibilities is hard. */
10606 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10607 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10610 /* Any Latin1 range character can potentially match any
10611 * other depending on the locale, and in Turkic locales, U+130 and
10613 if (OP(node) == EXACTFL) {
10614 _invlist_union(invlist, PL_Latin1, &invlist);
10615 invlist = add_cp_to_invlist(invlist,
10616 LATIN_SMALL_LETTER_DOTLESS_I);
10617 invlist = add_cp_to_invlist(invlist,
10618 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10621 /* But otherwise, it matches at least itself. We can
10622 * quickly tell if it has a distinct fold, and if so,
10623 * it matches that as well */
10624 invlist = add_cp_to_invlist(invlist, uc);
10625 if (IS_IN_SOME_FOLD_L1(uc))
10626 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10629 /* Some characters match above-Latin1 ones under /i. This
10630 * is true of EXACTFL ones when the locale is UTF-8 */
10631 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10632 && (! isASCII(uc) || (OP(node) != EXACTFAA
10633 && OP(node) != EXACTFAA_NO_TRIE)))
10635 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10639 else { /* Pattern is UTF-8 */
10640 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10641 const U8* e = s + bytelen;
10644 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10646 /* The only code points that aren't folded in a UTF EXACTFish
10647 * node are are the problematic ones in EXACTFL nodes */
10648 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10649 /* We need to check for the possibility that this EXACTFL
10650 * node begins with a multi-char fold. Therefore we fold
10651 * the first few characters of it so that we can make that
10657 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10659 *(d++) = (U8) toFOLD(*s);
10660 if (fc < 0) { /* Save the first fold */
10667 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10668 if (fc < 0) { /* Save the first fold */
10676 /* And set up so the code below that looks in this folded
10677 * buffer instead of the node's string */
10682 /* When we reach here 's' points to the fold of the first
10683 * character(s) of the node; and 'e' points to far enough along
10684 * the folded string to be just past any possible multi-char
10687 * Unlike the non-UTF-8 case, the macro for determining if a
10688 * string is a multi-char fold requires all the characters to
10689 * already be folded. This is because of all the complications
10690 * if not. Note that they are folded anyway, except in EXACTFL
10691 * nodes. Like the non-UTF case above, we punt if the node
10692 * begins with a multi-char fold */
10694 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10695 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10697 else { /* Single char fold */
10699 unsigned int first_fold;
10700 const unsigned int * remaining_folds;
10701 Size_t folds_count;
10703 /* It matches itself */
10704 invlist = add_cp_to_invlist(invlist, fc);
10706 /* ... plus all the things that fold to it, which are found in
10707 * PL_utf8_foldclosures */
10708 folds_count = _inverse_folds(fc, &first_fold,
10710 for (k = 0; k < folds_count; k++) {
10711 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10713 /* /aa doesn't allow folds between ASCII and non- */
10714 if ( (OP(node) == EXACTFAA || OP(node) == EXACTFAA_NO_TRIE)
10715 && isASCII(c) != isASCII(fc))
10720 invlist = add_cp_to_invlist(invlist, c);
10723 if (OP(node) == EXACTFL) {
10725 /* If either [iI] are present in an EXACTFL node the above code
10726 * should have added its normal case pair, but under a Turkish
10727 * locale they could match instead the case pairs from it. Add
10728 * those as potential matches as well */
10729 if (isALPHA_FOLD_EQ(fc, 'I')) {
10730 invlist = add_cp_to_invlist(invlist,
10731 LATIN_SMALL_LETTER_DOTLESS_I);
10732 invlist = add_cp_to_invlist(invlist,
10733 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10735 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10736 invlist = add_cp_to_invlist(invlist, 'I');
10738 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10739 invlist = add_cp_to_invlist(invlist, 'i');
10748 #undef HEADER_LENGTH
10749 #undef TO_INTERNAL_SIZE
10750 #undef FROM_INTERNAL_SIZE
10751 #undef INVLIST_VERSION_ID
10753 /* End of inversion list object */
10756 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10758 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10759 * constructs, and updates RExC_flags with them. On input, RExC_parse
10760 * should point to the first flag; it is updated on output to point to the
10761 * final ')' or ':'. There needs to be at least one flag, or this will
10764 /* for (?g), (?gc), and (?o) warnings; warning
10765 about (?c) will warn about (?g) -- japhy */
10767 #define WASTED_O 0x01
10768 #define WASTED_G 0x02
10769 #define WASTED_C 0x04
10770 #define WASTED_GC (WASTED_G|WASTED_C)
10771 I32 wastedflags = 0x00;
10772 U32 posflags = 0, negflags = 0;
10773 U32 *flagsp = &posflags;
10774 char has_charset_modifier = '\0';
10776 bool has_use_defaults = FALSE;
10777 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10778 int x_mod_count = 0;
10780 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10782 /* '^' as an initial flag sets certain defaults */
10783 if (UCHARAT(RExC_parse) == '^') {
10785 has_use_defaults = TRUE;
10786 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10787 cs = (RExC_uni_semantics)
10788 ? REGEX_UNICODE_CHARSET
10789 : REGEX_DEPENDS_CHARSET;
10790 set_regex_charset(&RExC_flags, cs);
10793 cs = get_regex_charset(RExC_flags);
10794 if ( cs == REGEX_DEPENDS_CHARSET
10795 && RExC_uni_semantics)
10797 cs = REGEX_UNICODE_CHARSET;
10801 while (RExC_parse < RExC_end) {
10802 /* && strchr("iogcmsx", *RExC_parse) */
10803 /* (?g), (?gc) and (?o) are useless here
10804 and must be globally applied -- japhy */
10805 switch (*RExC_parse) {
10807 /* Code for the imsxn flags */
10808 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10810 case LOCALE_PAT_MOD:
10811 if (has_charset_modifier) {
10812 goto excess_modifier;
10814 else if (flagsp == &negflags) {
10817 cs = REGEX_LOCALE_CHARSET;
10818 has_charset_modifier = LOCALE_PAT_MOD;
10820 case UNICODE_PAT_MOD:
10821 if (has_charset_modifier) {
10822 goto excess_modifier;
10824 else if (flagsp == &negflags) {
10827 cs = REGEX_UNICODE_CHARSET;
10828 has_charset_modifier = UNICODE_PAT_MOD;
10830 case ASCII_RESTRICT_PAT_MOD:
10831 if (flagsp == &negflags) {
10834 if (has_charset_modifier) {
10835 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10836 goto excess_modifier;
10838 /* Doubled modifier implies more restricted */
10839 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10842 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10844 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10846 case DEPENDS_PAT_MOD:
10847 if (has_use_defaults) {
10848 goto fail_modifiers;
10850 else if (flagsp == &negflags) {
10853 else if (has_charset_modifier) {
10854 goto excess_modifier;
10857 /* The dual charset means unicode semantics if the
10858 * pattern (or target, not known until runtime) are
10859 * utf8, or something in the pattern indicates unicode
10861 cs = (RExC_uni_semantics)
10862 ? REGEX_UNICODE_CHARSET
10863 : REGEX_DEPENDS_CHARSET;
10864 has_charset_modifier = DEPENDS_PAT_MOD;
10868 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10869 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10871 else if (has_charset_modifier == *(RExC_parse - 1)) {
10872 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10873 *(RExC_parse - 1));
10876 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10878 NOT_REACHED; /*NOTREACHED*/
10881 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10882 *(RExC_parse - 1));
10883 NOT_REACHED; /*NOTREACHED*/
10884 case ONCE_PAT_MOD: /* 'o' */
10885 case GLOBAL_PAT_MOD: /* 'g' */
10886 if (ckWARN(WARN_REGEXP)) {
10887 const I32 wflagbit = *RExC_parse == 'o'
10890 if (! (wastedflags & wflagbit) ) {
10891 wastedflags |= wflagbit;
10892 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10895 "Useless (%s%c) - %suse /%c modifier",
10896 flagsp == &negflags ? "?-" : "?",
10898 flagsp == &negflags ? "don't " : "",
10905 case CONTINUE_PAT_MOD: /* 'c' */
10906 if (ckWARN(WARN_REGEXP)) {
10907 if (! (wastedflags & WASTED_C) ) {
10908 wastedflags |= WASTED_GC;
10909 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10912 "Useless (%sc) - %suse /gc modifier",
10913 flagsp == &negflags ? "?-" : "?",
10914 flagsp == &negflags ? "don't " : ""
10919 case KEEPCOPY_PAT_MOD: /* 'p' */
10920 if (flagsp == &negflags) {
10921 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10923 *flagsp |= RXf_PMf_KEEPCOPY;
10927 /* A flag is a default iff it is following a minus, so
10928 * if there is a minus, it means will be trying to
10929 * re-specify a default which is an error */
10930 if (has_use_defaults || flagsp == &negflags) {
10931 goto fail_modifiers;
10933 flagsp = &negflags;
10934 wastedflags = 0; /* reset so (?g-c) warns twice */
10940 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
10941 negflags |= RXf_PMf_EXTENDED_MORE;
10943 RExC_flags |= posflags;
10945 if (negflags & RXf_PMf_EXTENDED) {
10946 negflags |= RXf_PMf_EXTENDED_MORE;
10948 RExC_flags &= ~negflags;
10949 set_regex_charset(&RExC_flags, cs);
10954 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
10955 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10956 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
10957 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10958 NOT_REACHED; /*NOTREACHED*/
10961 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10964 vFAIL("Sequence (?... not terminated");
10968 - reg - regular expression, i.e. main body or parenthesized thing
10970 * Caller must absorb opening parenthesis.
10972 * Combining parenthesis handling with the base level of regular expression
10973 * is a trifle forced, but the need to tie the tails of the branches to what
10974 * follows makes it hard to avoid.
10976 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
10978 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
10980 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
10983 PERL_STATIC_INLINE regnode_offset
10984 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
10986 char * parse_start,
10990 regnode_offset ret;
10991 char* name_start = RExC_parse;
10993 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
10994 GET_RE_DEBUG_FLAGS_DECL;
10996 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
10998 if (RExC_parse == name_start || *RExC_parse != ch) {
10999 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11000 vFAIL2("Sequence %.3s... not terminated", parse_start);
11004 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11005 RExC_rxi->data->data[num]=(void*)sv_dat;
11006 SvREFCNT_inc_simple_void_NN(sv_dat);
11009 ret = reganode(pRExC_state,
11012 : (ASCII_FOLD_RESTRICTED)
11014 : (AT_LEAST_UNI_SEMANTICS)
11020 *flagp |= HASWIDTH;
11022 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11023 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11025 nextchar(pRExC_state);
11029 /* On success, returns the offset at which any next node should be placed into
11030 * the regex engine program being compiled.
11032 * Returns 0 otherwise, with *flagp set to indicate why:
11033 * TRYAGAIN at the end of (?) that only sets flags.
11034 * RESTART_PARSE if the parse needs to be restarted, or'd with
11035 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11036 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11038 STATIC regnode_offset
11039 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11040 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11041 * 2 is like 1, but indicates that nextchar() has been called to advance
11042 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11043 * this flag alerts us to the need to check for that */
11045 regnode_offset ret = 0; /* Will be the head of the group. */
11047 regnode_offset lastbr;
11048 regnode_offset ender = 0;
11051 U32 oregflags = RExC_flags;
11052 bool have_branch = 0;
11054 I32 freeze_paren = 0;
11055 I32 after_freeze = 0;
11056 I32 num; /* numeric backreferences */
11057 SV * max_open; /* Max number of unclosed parens */
11059 char * parse_start = RExC_parse; /* MJD */
11060 char * const oregcomp_parse = RExC_parse;
11062 GET_RE_DEBUG_FLAGS_DECL;
11064 PERL_ARGS_ASSERT_REG;
11065 DEBUG_PARSE("reg ");
11068 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11070 if (!SvIOK(max_open)) {
11071 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11073 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11075 vFAIL("Too many nested open parens");
11078 *flagp = 0; /* Tentatively. */
11080 /* Having this true makes it feasible to have a lot fewer tests for the
11081 * parse pointer being in scope. For example, we can write
11082 * while(isFOO(*RExC_parse)) RExC_parse++;
11084 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11086 assert(*RExC_end == '\0');
11088 /* Make an OPEN node, if parenthesized. */
11091 /* Under /x, space and comments can be gobbled up between the '(' and
11092 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11093 * intervening space, as the sequence is a token, and a token should be
11095 bool has_intervening_patws = (paren == 2)
11096 && *(RExC_parse - 1) != '(';
11098 if (RExC_parse >= RExC_end) {
11099 vFAIL("Unmatched (");
11102 if (paren == 'r') { /* Atomic script run */
11106 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11107 char *start_verb = RExC_parse + 1;
11109 char *start_arg = NULL;
11110 unsigned char op = 0;
11111 int arg_required = 0;
11112 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11113 bool has_upper = FALSE;
11115 if (has_intervening_patws) {
11116 RExC_parse++; /* past the '*' */
11118 /* For strict backwards compatibility, don't change the message
11119 * now that we also have lowercase operands */
11120 if (isUPPER(*RExC_parse)) {
11121 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11124 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11127 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11128 if ( *RExC_parse == ':' ) {
11129 start_arg = RExC_parse + 1;
11133 if (isUPPER(*RExC_parse)) {
11139 RExC_parse += UTF8SKIP(RExC_parse);
11142 verb_len = RExC_parse - start_verb;
11144 if (RExC_parse >= RExC_end) {
11145 goto unterminated_verb_pattern;
11148 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11149 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11150 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11152 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11153 unterminated_verb_pattern:
11155 vFAIL("Unterminated verb pattern argument");
11158 vFAIL("Unterminated '(*...' argument");
11162 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11164 vFAIL("Unterminated verb pattern");
11167 vFAIL("Unterminated '(*...' construct");
11172 /* Here, we know that RExC_parse < RExC_end */
11174 switch ( *start_verb ) {
11175 case 'A': /* (*ACCEPT) */
11176 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11178 internal_argval = RExC_nestroot;
11181 case 'C': /* (*COMMIT) */
11182 if ( memEQs(start_verb, verb_len,"COMMIT") )
11185 case 'F': /* (*FAIL) */
11186 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11190 case ':': /* (*:NAME) */
11191 case 'M': /* (*MARK:NAME) */
11192 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11197 case 'P': /* (*PRUNE) */
11198 if ( memEQs(start_verb, verb_len,"PRUNE") )
11201 case 'S': /* (*SKIP) */
11202 if ( memEQs(start_verb, verb_len,"SKIP") )
11205 case 'T': /* (*THEN) */
11206 /* [19:06] <TimToady> :: is then */
11207 if ( memEQs(start_verb, verb_len,"THEN") ) {
11209 RExC_seen |= REG_CUTGROUP_SEEN;
11213 if ( memEQs(start_verb, verb_len, "asr")
11214 || memEQs(start_verb, verb_len, "atomic_script_run"))
11216 paren = 'r'; /* Mnemonic: recursed run */
11219 else if (memEQs(start_verb, verb_len, "atomic")) {
11220 paren = 't'; /* AtOMIC */
11221 goto alpha_assertions;
11225 if ( memEQs(start_verb, verb_len, "plb")
11226 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11229 goto lookbehind_alpha_assertions;
11231 else if ( memEQs(start_verb, verb_len, "pla")
11232 || memEQs(start_verb, verb_len, "positive_lookahead"))
11235 goto alpha_assertions;
11239 if ( memEQs(start_verb, verb_len, "nlb")
11240 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11243 goto lookbehind_alpha_assertions;
11245 else if ( memEQs(start_verb, verb_len, "nla")
11246 || memEQs(start_verb, verb_len, "negative_lookahead"))
11249 goto alpha_assertions;
11253 if ( memEQs(start_verb, verb_len, "sr")
11254 || memEQs(start_verb, verb_len, "script_run"))
11256 regnode_offset atomic;
11262 /* This indicates Unicode rules. */
11263 REQUIRE_UNI_RULES(flagp, 0);
11269 RExC_parse = start_arg;
11271 if (RExC_in_script_run) {
11273 /* Nested script runs are treated as no-ops, because
11274 * if the nested one fails, the outer one must as
11275 * well. It could fail sooner, and avoid (??{} with
11276 * side effects, but that is explicitly documented as
11277 * undefined behavior. */
11281 if (paren == 's') {
11286 /* But, the atomic part of a nested atomic script run
11287 * isn't a no-op, but can be treated just like a '(?>'
11293 /* By doing this here, we avoid extra warnings for nested
11295 ckWARNexperimental(RExC_parse,
11296 WARN_EXPERIMENTAL__SCRIPT_RUN,
11297 "The script_run feature is experimental");
11299 if (paren == 's') {
11300 /* Here, we're starting a new regular script run */
11301 ret = reg_node(pRExC_state, SROPEN);
11302 RExC_in_script_run = 1;
11307 /* Here, we are starting an atomic script run. This is
11308 * handled by recursing to deal with the atomic portion
11309 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11311 ret = reg_node(pRExC_state, SROPEN);
11313 RExC_in_script_run = 1;
11315 atomic = reg(pRExC_state, 'r', &flags, depth);
11316 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11317 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11321 REGTAIL(pRExC_state, ret, atomic);
11323 REGTAIL(pRExC_state, atomic,
11324 reg_node(pRExC_state, SRCLOSE));
11326 RExC_in_script_run = 0;
11332 lookbehind_alpha_assertions:
11333 RExC_seen |= REG_LOOKBEHIND_SEEN;
11334 RExC_in_lookbehind++;
11338 ckWARNexperimental(RExC_parse,
11339 WARN_EXPERIMENTAL__ALPHA_ASSERTIONS,
11340 "The alpha_assertions feature is experimental");
11342 RExC_seen_zerolen++;
11348 /* An empty negative lookahead assertion simply is failure */
11349 if (paren == 'A' && RExC_parse == start_arg) {
11350 ret=reganode(pRExC_state, OPFAIL, 0);
11351 nextchar(pRExC_state);
11355 RExC_parse = start_arg;
11360 "'(*%" UTF8f "' requires a terminating ':'",
11361 UTF8fARG(UTF, verb_len, start_verb));
11362 NOT_REACHED; /*NOTREACHED*/
11364 } /* End of switch */
11367 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11369 if (has_upper || verb_len == 0) {
11371 "Unknown verb pattern '%" UTF8f "'",
11372 UTF8fARG(UTF, verb_len, start_verb));
11376 "Unknown '(*...)' construct '%" UTF8f "'",
11377 UTF8fARG(UTF, verb_len, start_verb));
11380 if ( RExC_parse == start_arg ) {
11383 if ( arg_required && !start_arg ) {
11384 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11385 verb_len, start_verb);
11387 if (internal_argval == -1) {
11388 ret = reganode(pRExC_state, op, 0);
11390 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11392 RExC_seen |= REG_VERBARG_SEEN;
11394 SV *sv = newSVpvn( start_arg,
11395 RExC_parse - start_arg);
11396 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11397 STR_WITH_LEN("S"));
11398 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11399 FLAGS(REGNODE_p(ret)) = 1;
11401 FLAGS(REGNODE_p(ret)) = 0;
11403 if ( internal_argval != -1 )
11404 ARG2L_SET(REGNODE_p(ret), internal_argval);
11405 nextchar(pRExC_state);
11408 else if (*RExC_parse == '?') { /* (?...) */
11409 bool is_logical = 0;
11410 const char * const seqstart = RExC_parse;
11411 const char * endptr;
11412 if (has_intervening_patws) {
11414 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11417 RExC_parse++; /* past the '?' */
11418 paren = *RExC_parse; /* might be a trailing NUL, if not
11420 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11421 if (RExC_parse > RExC_end) {
11424 ret = 0; /* For look-ahead/behind. */
11427 case 'P': /* (?P...) variants for those used to PCRE/Python */
11428 paren = *RExC_parse;
11429 if ( paren == '<') { /* (?P<...>) named capture */
11431 if (RExC_parse >= RExC_end) {
11432 vFAIL("Sequence (?P<... not terminated");
11434 goto named_capture;
11436 else if (paren == '>') { /* (?P>name) named recursion */
11438 if (RExC_parse >= RExC_end) {
11439 vFAIL("Sequence (?P>... not terminated");
11441 goto named_recursion;
11443 else if (paren == '=') { /* (?P=...) named backref */
11445 return handle_named_backref(pRExC_state, flagp,
11448 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11449 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11450 vFAIL3("Sequence (%.*s...) not recognized",
11451 RExC_parse-seqstart, seqstart);
11452 NOT_REACHED; /*NOTREACHED*/
11453 case '<': /* (?<...) */
11454 if (*RExC_parse == '!')
11456 else if (*RExC_parse != '=')
11463 case '\'': /* (?'...') */
11464 name_start = RExC_parse;
11465 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11466 if ( RExC_parse == name_start
11467 || RExC_parse >= RExC_end
11468 || *RExC_parse != paren)
11470 vFAIL2("Sequence (?%c... not terminated",
11471 paren=='>' ? '<' : paren);
11476 if (!svname) /* shouldn't happen */
11478 "panic: reg_scan_name returned NULL");
11479 if (!RExC_paren_names) {
11480 RExC_paren_names= newHV();
11481 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11483 RExC_paren_name_list= newAV();
11484 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11487 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11489 sv_dat = HeVAL(he_str);
11491 /* croak baby croak */
11493 "panic: paren_name hash element allocation failed");
11494 } else if ( SvPOK(sv_dat) ) {
11495 /* (?|...) can mean we have dupes so scan to check
11496 its already been stored. Maybe a flag indicating
11497 we are inside such a construct would be useful,
11498 but the arrays are likely to be quite small, so
11499 for now we punt -- dmq */
11500 IV count = SvIV(sv_dat);
11501 I32 *pv = (I32*)SvPVX(sv_dat);
11503 for ( i = 0 ; i < count ; i++ ) {
11504 if ( pv[i] == RExC_npar ) {
11510 pv = (I32*)SvGROW(sv_dat,
11511 SvCUR(sv_dat) + sizeof(I32)+1);
11512 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11513 pv[count] = RExC_npar;
11514 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11517 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11518 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11521 SvIV_set(sv_dat, 1);
11524 /* Yes this does cause a memory leak in debugging Perls
11526 if (!av_store(RExC_paren_name_list,
11527 RExC_npar, SvREFCNT_inc_NN(svname)))
11528 SvREFCNT_dec_NN(svname);
11531 /*sv_dump(sv_dat);*/
11533 nextchar(pRExC_state);
11535 goto capturing_parens;
11538 RExC_seen |= REG_LOOKBEHIND_SEEN;
11539 RExC_in_lookbehind++;
11541 if (RExC_parse >= RExC_end) {
11542 vFAIL("Sequence (?... not terminated");
11546 case '=': /* (?=...) */
11547 RExC_seen_zerolen++;
11549 case '!': /* (?!...) */
11550 RExC_seen_zerolen++;
11551 /* check if we're really just a "FAIL" assertion */
11552 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11553 FALSE /* Don't force to /x */ );
11554 if (*RExC_parse == ')') {
11555 ret=reganode(pRExC_state, OPFAIL, 0);
11556 nextchar(pRExC_state);
11560 case '|': /* (?|...) */
11561 /* branch reset, behave like a (?:...) except that
11562 buffers in alternations share the same numbers */
11564 after_freeze = freeze_paren = RExC_npar;
11566 /* XXX This construct currently requires an extra pass.
11567 * Investigation would be required to see if that could be
11569 REQUIRE_PARENS_PASS;
11571 case ':': /* (?:...) */
11572 case '>': /* (?>...) */
11574 case '$': /* (?$...) */
11575 case '@': /* (?@...) */
11576 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11578 case '0' : /* (?0) */
11579 case 'R' : /* (?R) */
11580 if (RExC_parse == RExC_end || *RExC_parse != ')')
11581 FAIL("Sequence (?R) not terminated");
11583 RExC_seen |= REG_RECURSE_SEEN;
11585 /* XXX These constructs currently require an extra pass.
11586 * It probably could be changed */
11587 REQUIRE_PARENS_PASS;
11589 *flagp |= POSTPONED;
11590 goto gen_recurse_regop;
11592 /* named and numeric backreferences */
11593 case '&': /* (?&NAME) */
11594 parse_start = RExC_parse - 1;
11597 SV *sv_dat = reg_scan_name(pRExC_state,
11598 REG_RSN_RETURN_DATA);
11599 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11601 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11602 vFAIL("Sequence (?&... not terminated");
11603 goto gen_recurse_regop;
11606 if (! inRANGE(RExC_parse[0], '1', '9')) {
11608 vFAIL("Illegal pattern");
11610 goto parse_recursion;
11612 case '-': /* (?-1) */
11613 if (! inRANGE(RExC_parse[0], '1', '9')) {
11614 RExC_parse--; /* rewind to let it be handled later */
11618 case '1': case '2': case '3': case '4': /* (?1) */
11619 case '5': case '6': case '7': case '8': case '9':
11620 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11623 bool is_neg = FALSE;
11625 parse_start = RExC_parse - 1; /* MJD */
11626 if (*RExC_parse == '-') {
11631 if (grok_atoUV(RExC_parse, &unum, &endptr)
11635 RExC_parse = (char*)endptr;
11639 /* Some limit for num? */
11643 if (*RExC_parse!=')')
11644 vFAIL("Expecting close bracket");
11647 if ( paren == '-' ) {
11649 Diagram of capture buffer numbering.
11650 Top line is the normal capture buffer numbers
11651 Bottom line is the negative indexing as from
11655 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11659 num = RExC_npar + num;
11662 /* It might be a forward reference; we can't fail until
11663 * we know, by completing the parse to get all the
11664 * groups, and then reparsing */
11665 if (ALL_PARENS_COUNTED) {
11667 vFAIL("Reference to nonexistent group");
11670 REQUIRE_PARENS_PASS;
11673 } else if ( paren == '+' ) {
11674 num = RExC_npar + num - 1;
11676 /* We keep track how many GOSUB items we have produced.
11677 To start off the ARG2L() of the GOSUB holds its "id",
11678 which is used later in conjunction with RExC_recurse
11679 to calculate the offset we need to jump for the GOSUB,
11680 which it will store in the final representation.
11681 We have to defer the actual calculation until much later
11682 as the regop may move.
11685 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11686 if (num >= RExC_npar) {
11688 /* It might be a forward reference; we can't fail until we
11689 * know, by completing the parse to get all the groups, and
11690 * then reparsing */
11691 if (ALL_PARENS_COUNTED) {
11692 if (num >= RExC_total_parens) {
11694 vFAIL("Reference to nonexistent group");
11698 REQUIRE_PARENS_PASS;
11701 RExC_recurse_count++;
11702 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11703 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11704 22, "| |", (int)(depth * 2 + 1), "",
11705 (UV)ARG(REGNODE_p(ret)),
11706 (IV)ARG2L(REGNODE_p(ret))));
11707 RExC_seen |= REG_RECURSE_SEEN;
11709 Set_Node_Length(REGNODE_p(ret),
11710 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11711 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11713 *flagp |= POSTPONED;
11714 assert(*RExC_parse == ')');
11715 nextchar(pRExC_state);
11720 case '?': /* (??...) */
11722 if (*RExC_parse != '{') {
11723 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11724 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11726 "Sequence (%" UTF8f "...) not recognized",
11727 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11728 NOT_REACHED; /*NOTREACHED*/
11730 *flagp |= POSTPONED;
11734 case '{': /* (?{...}) */
11737 struct reg_code_block *cb;
11740 RExC_seen_zerolen++;
11742 if ( !pRExC_state->code_blocks
11743 || pRExC_state->code_index
11744 >= pRExC_state->code_blocks->count
11745 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11746 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11749 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11750 FAIL("panic: Sequence (?{...}): no code block found\n");
11751 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11753 /* this is a pre-compiled code block (?{...}) */
11754 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11755 RExC_parse = RExC_start + cb->end;
11757 if (cb->src_regex) {
11758 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11759 RExC_rxi->data->data[n] =
11760 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11761 RExC_rxi->data->data[n+1] = (void*)o;
11764 n = add_data(pRExC_state,
11765 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11766 RExC_rxi->data->data[n] = (void*)o;
11768 pRExC_state->code_index++;
11769 nextchar(pRExC_state);
11772 regnode_offset eval;
11773 ret = reg_node(pRExC_state, LOGICAL);
11775 eval = reg2Lanode(pRExC_state, EVAL,
11778 /* for later propagation into (??{})
11780 RExC_flags & RXf_PMf_COMPILETIME
11782 FLAGS(REGNODE_p(ret)) = 2;
11783 REGTAIL(pRExC_state, ret, eval);
11784 /* deal with the length of this later - MJD */
11787 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11788 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11789 Set_Node_Offset(REGNODE_p(ret), parse_start);
11792 case '(': /* (?(?{...})...) and (?(?=...)...) */
11795 const int DEFINE_len = sizeof("DEFINE") - 1;
11796 if ( RExC_parse < RExC_end - 1
11797 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11798 && ( RExC_parse[1] == '='
11799 || RExC_parse[1] == '!'
11800 || RExC_parse[1] == '<'
11801 || RExC_parse[1] == '{'))
11802 || ( RExC_parse[0] == '*' /* (?(*...)) */
11803 && ( memBEGINs(RExC_parse + 1,
11804 (Size_t) (RExC_end - (RExC_parse + 1)),
11806 || memBEGINs(RExC_parse + 1,
11807 (Size_t) (RExC_end - (RExC_parse + 1)),
11809 || memBEGINs(RExC_parse + 1,
11810 (Size_t) (RExC_end - (RExC_parse + 1)),
11812 || memBEGINs(RExC_parse + 1,
11813 (Size_t) (RExC_end - (RExC_parse + 1)),
11815 || memBEGINs(RExC_parse + 1,
11816 (Size_t) (RExC_end - (RExC_parse + 1)),
11817 "positive_lookahead:")
11818 || memBEGINs(RExC_parse + 1,
11819 (Size_t) (RExC_end - (RExC_parse + 1)),
11820 "positive_lookbehind:")
11821 || memBEGINs(RExC_parse + 1,
11822 (Size_t) (RExC_end - (RExC_parse + 1)),
11823 "negative_lookahead:")
11824 || memBEGINs(RExC_parse + 1,
11825 (Size_t) (RExC_end - (RExC_parse + 1)),
11826 "negative_lookbehind:"))))
11827 ) { /* Lookahead or eval. */
11829 regnode_offset tail;
11831 ret = reg_node(pRExC_state, LOGICAL);
11832 FLAGS(REGNODE_p(ret)) = 1;
11834 tail = reg(pRExC_state, 1, &flag, depth+1);
11835 RETURN_FAIL_ON_RESTART(flag, flagp);
11836 REGTAIL(pRExC_state, ret, tail);
11839 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11840 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11842 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11843 char *name_start= RExC_parse++;
11845 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11846 if ( RExC_parse == name_start
11847 || RExC_parse >= RExC_end
11848 || *RExC_parse != ch)
11850 vFAIL2("Sequence (?(%c... not terminated",
11851 (ch == '>' ? '<' : ch));
11855 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11856 RExC_rxi->data->data[num]=(void*)sv_dat;
11857 SvREFCNT_inc_simple_void_NN(sv_dat);
11859 ret = reganode(pRExC_state, GROUPPN, num);
11860 goto insert_if_check_paren;
11862 else if (memBEGINs(RExC_parse,
11863 (STRLEN) (RExC_end - RExC_parse),
11866 ret = reganode(pRExC_state, DEFINEP, 0);
11867 RExC_parse += DEFINE_len;
11869 goto insert_if_check_paren;
11871 else if (RExC_parse[0] == 'R') {
11873 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11874 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11875 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11878 if (RExC_parse[0] == '0') {
11882 else if (inRANGE(RExC_parse[0], '1', '9')) {
11885 if (grok_atoUV(RExC_parse, &uv, &endptr)
11888 parno = (I32)uv + 1;
11889 RExC_parse = (char*)endptr;
11891 /* else "Switch condition not recognized" below */
11892 } else if (RExC_parse[0] == '&') {
11895 sv_dat = reg_scan_name(pRExC_state,
11896 REG_RSN_RETURN_DATA);
11898 parno = 1 + *((I32 *)SvPVX(sv_dat));
11900 ret = reganode(pRExC_state, INSUBP, parno);
11901 goto insert_if_check_paren;
11903 else if (inRANGE(RExC_parse[0], '1', '9')) {
11908 if (grok_atoUV(RExC_parse, &uv, &endptr)
11912 RExC_parse = (char*)endptr;
11915 vFAIL("panic: grok_atoUV returned FALSE");
11917 ret = reganode(pRExC_state, GROUPP, parno);
11919 insert_if_check_paren:
11920 if (UCHARAT(RExC_parse) != ')') {
11922 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11924 vFAIL("Switch condition not recognized");
11926 nextchar(pRExC_state);
11928 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
11929 br = regbranch(pRExC_state, &flags, 1, depth+1);
11931 RETURN_FAIL_ON_RESTART(flags,flagp);
11932 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11935 REGTAIL(pRExC_state, br, reganode(pRExC_state,
11937 c = UCHARAT(RExC_parse);
11938 nextchar(pRExC_state);
11939 if (flags&HASWIDTH)
11940 *flagp |= HASWIDTH;
11943 vFAIL("(?(DEFINE)....) does not allow branches");
11945 /* Fake one for optimizer. */
11946 lastbr = reganode(pRExC_state, IFTHEN, 0);
11948 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
11949 RETURN_FAIL_ON_RESTART(flags, flagp);
11950 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
11953 REGTAIL(pRExC_state, ret, lastbr);
11954 if (flags&HASWIDTH)
11955 *flagp |= HASWIDTH;
11956 c = UCHARAT(RExC_parse);
11957 nextchar(pRExC_state);
11962 if (RExC_parse >= RExC_end)
11963 vFAIL("Switch (?(condition)... not terminated");
11965 vFAIL("Switch (?(condition)... contains too many branches");
11967 ender = reg_node(pRExC_state, TAIL);
11968 REGTAIL(pRExC_state, br, ender);
11970 REGTAIL(pRExC_state, lastbr, ender);
11971 REGTAIL(pRExC_state, REGNODE_OFFSET(
11973 NEXTOPER(REGNODE_p(lastbr)))),
11977 REGTAIL(pRExC_state, ret, ender);
11978 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
11979 RExC_size++; /* XXX WHY do we need this?!!
11980 For large programs it seems to be required
11981 but I can't figure out why. -- dmq*/
11986 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11988 vFAIL("Unknown switch condition (?(...))");
11990 case '[': /* (?[ ... ]) */
11991 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
11993 case 0: /* A NUL */
11994 RExC_parse--; /* for vFAIL to print correctly */
11995 vFAIL("Sequence (? incomplete");
11999 if (RExC_strict) { /* [perl #132851] */
12000 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12003 default: /* e.g., (?i) */
12004 RExC_parse = (char *) seqstart + 1;
12006 parse_lparen_question_flags(pRExC_state);
12007 if (UCHARAT(RExC_parse) != ':') {
12008 if (RExC_parse < RExC_end)
12009 nextchar(pRExC_state);
12014 nextchar(pRExC_state);
12020 if (*RExC_parse == '{') {
12021 ckWARNregdep(RExC_parse + 1,
12022 "Unescaped left brace in regex is "
12023 "deprecated here (and will be fatal "
12024 "in Perl 5.32), passed through");
12026 /* Not bothering to indent here, as the above 'else' is temporary
12028 if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12032 if (! ALL_PARENS_COUNTED) {
12033 /* If we are in our first pass through (and maybe only pass),
12034 * we need to allocate memory for the capturing parentheses
12038 if (!RExC_parens_buf_size) {
12039 /* first guess at number of parens we might encounter */
12040 RExC_parens_buf_size = 10;
12042 /* setup RExC_open_parens, which holds the address of each
12043 * OPEN tag, and to make things simpler for the 0 index the
12044 * start of the program - this is used later for offsets */
12045 Newxz(RExC_open_parens, RExC_parens_buf_size,
12047 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12049 /* setup RExC_close_parens, which holds the address of each
12050 * CLOSE tag, and to make things simpler for the 0 index
12051 * the end of the program - this is used later for offsets
12053 Newxz(RExC_close_parens, RExC_parens_buf_size,
12055 /* we dont know where end op starts yet, so we dont need to
12056 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12059 else if (RExC_npar > RExC_parens_buf_size) {
12060 I32 old_size = RExC_parens_buf_size;
12062 RExC_parens_buf_size *= 2;
12064 Renew(RExC_open_parens, RExC_parens_buf_size,
12066 Zero(RExC_open_parens + old_size,
12067 RExC_parens_buf_size - old_size, regnode_offset);
12069 Renew(RExC_close_parens, RExC_parens_buf_size,
12071 Zero(RExC_close_parens + old_size,
12072 RExC_parens_buf_size - old_size, regnode_offset);
12076 ret = reganode(pRExC_state, OPEN, parno);
12077 if (!RExC_nestroot)
12078 RExC_nestroot = parno;
12079 if (RExC_open_parens && !RExC_open_parens[parno])
12081 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12082 "%*s%*s Setting open paren #%" IVdf " to %d\n",
12083 22, "| |", (int)(depth * 2 + 1), "",
12085 RExC_open_parens[parno]= ret;
12088 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12089 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12092 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12102 /* Pick up the branches, linking them together. */
12103 parse_start = RExC_parse; /* MJD */
12104 br = regbranch(pRExC_state, &flags, 1, depth+1);
12106 /* branch_len = (paren != 0); */
12109 RETURN_FAIL_ON_RESTART(flags, flagp);
12110 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12112 if (*RExC_parse == '|') {
12113 if (RExC_use_BRANCHJ) {
12114 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12117 reginsert(pRExC_state, BRANCH, br, depth+1);
12118 Set_Node_Length(REGNODE_p(br), paren != 0);
12119 Set_Node_Offset_To_R(br, parse_start-RExC_start);
12123 else if (paren == ':') {
12124 *flagp |= flags&SIMPLE;
12126 if (is_open) { /* Starts with OPEN. */
12127 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
12129 else if (paren != '?') /* Not Conditional */
12131 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12133 while (*RExC_parse == '|') {
12134 if (RExC_use_BRANCHJ) {
12135 ender = reganode(pRExC_state, LONGJMP, 0);
12137 /* Append to the previous. */
12138 REGTAIL(pRExC_state,
12139 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12142 nextchar(pRExC_state);
12143 if (freeze_paren) {
12144 if (RExC_npar > after_freeze)
12145 after_freeze = RExC_npar;
12146 RExC_npar = freeze_paren;
12148 br = regbranch(pRExC_state, &flags, 0, depth+1);
12151 RETURN_FAIL_ON_RESTART(flags, flagp);
12152 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12154 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12155 REQUIRE_BRANCHJ(flagp, 0);
12158 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
12161 if (have_branch || paren != ':') {
12164 /* Make a closing node, and hook it on the end. */
12167 ender = reg_node(pRExC_state, TAIL);
12170 ender = reganode(pRExC_state, CLOSE, parno);
12171 if ( RExC_close_parens ) {
12172 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12173 "%*s%*s Setting close paren #%" IVdf " to %d\n",
12174 22, "| |", (int)(depth * 2 + 1), "",
12175 (IV)parno, ender));
12176 RExC_close_parens[parno]= ender;
12177 if (RExC_nestroot == parno)
12180 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12181 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12184 ender = reg_node(pRExC_state, SRCLOSE);
12185 RExC_in_script_run = 0;
12195 *flagp &= ~HASWIDTH;
12197 case 't': /* aTomic */
12199 ender = reg_node(pRExC_state, SUCCEED);
12202 ender = reg_node(pRExC_state, END);
12203 assert(!RExC_end_op); /* there can only be one! */
12204 RExC_end_op = REGNODE_p(ender);
12205 if (RExC_close_parens) {
12206 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12207 "%*s%*s Setting close paren #0 (END) to %d\n",
12208 22, "| |", (int)(depth * 2 + 1), "",
12211 RExC_close_parens[0]= ender;
12216 DEBUG_PARSE_MSG("lsbr");
12217 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12218 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12219 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12220 SvPV_nolen_const(RExC_mysv1),
12222 SvPV_nolen_const(RExC_mysv2),
12224 (IV)(ender - lastbr)
12227 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12228 REQUIRE_BRANCHJ(flagp, 0);
12232 char is_nothing= 1;
12234 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12236 /* Hook the tails of the branches to the closing node. */
12237 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12238 const U8 op = PL_regkind[OP(br)];
12239 if (op == BRANCH) {
12240 if (! REGTAIL_STUDY(pRExC_state,
12241 REGNODE_OFFSET(NEXTOPER(br)),
12244 REQUIRE_BRANCHJ(flagp, 0);
12246 if ( OP(NEXTOPER(br)) != NOTHING
12247 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12250 else if (op == BRANCHJ) {
12251 REGTAIL_STUDY(pRExC_state,
12252 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12254 /* for now we always disable this optimisation * /
12255 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12256 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12262 regnode * ret_as_regnode = REGNODE_p(ret);
12263 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12264 ? regnext(ret_as_regnode)
12267 DEBUG_PARSE_MSG("NADA");
12268 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12269 NULL, pRExC_state);
12270 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12271 NULL, pRExC_state);
12272 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12273 SvPV_nolen_const(RExC_mysv1),
12274 (IV)REG_NODE_NUM(ret_as_regnode),
12275 SvPV_nolen_const(RExC_mysv2),
12281 if (OP(REGNODE_p(ender)) == TAIL) {
12283 RExC_emit= REGNODE_OFFSET(br) + 1;
12286 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12287 OP(opt)= OPTIMIZED;
12288 NEXT_OFF(br)= REGNODE_p(ender) - br;
12296 /* Even/odd or x=don't care: 010101x10x */
12297 static const char parens[] = "=!aA<,>Bbt";
12298 /* flag below is set to 0 up through 'A'; 1 for larger */
12300 if (paren && (p = strchr(parens, paren))) {
12301 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12302 int flag = (p - parens) > 3;
12304 if (paren == '>' || paren == 't') {
12305 node = SUSPEND, flag = 0;
12308 reginsert(pRExC_state, node, ret, depth+1);
12309 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12310 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12311 FLAGS(REGNODE_p(ret)) = flag;
12312 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12314 REQUIRE_BRANCHJ(flagp, 0);
12319 /* Check for proper termination. */
12321 /* restore original flags, but keep (?p) and, if we've encountered
12322 * something in the parse that changes /d rules into /u, keep the /u */
12323 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12324 if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
12325 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12327 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12328 RExC_parse = oregcomp_parse;
12329 vFAIL("Unmatched (");
12331 nextchar(pRExC_state);
12333 else if (!paren && RExC_parse < RExC_end) {
12334 if (*RExC_parse == ')') {
12336 vFAIL("Unmatched )");
12339 FAIL("Junk on end of regexp"); /* "Can't happen". */
12340 NOT_REACHED; /* NOTREACHED */
12343 if (RExC_in_lookbehind) {
12344 RExC_in_lookbehind--;
12346 if (after_freeze > RExC_npar)
12347 RExC_npar = after_freeze;
12352 - regbranch - one alternative of an | operator
12354 * Implements the concatenation operator.
12356 * On success, returns the offset at which any next node should be placed into
12357 * the regex engine program being compiled.
12359 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12360 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12363 STATIC regnode_offset
12364 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12366 regnode_offset ret;
12367 regnode_offset chain = 0;
12368 regnode_offset latest;
12369 I32 flags = 0, c = 0;
12370 GET_RE_DEBUG_FLAGS_DECL;
12372 PERL_ARGS_ASSERT_REGBRANCH;
12374 DEBUG_PARSE("brnc");
12379 if (RExC_use_BRANCHJ)
12380 ret = reganode(pRExC_state, BRANCHJ, 0);
12382 ret = reg_node(pRExC_state, BRANCH);
12383 Set_Node_Length(REGNODE_p(ret), 1);
12387 *flagp = WORST; /* Tentatively. */
12389 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12390 FALSE /* Don't force to /x */ );
12391 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12392 flags &= ~TRYAGAIN;
12393 latest = regpiece(pRExC_state, &flags, depth+1);
12395 if (flags & TRYAGAIN)
12397 RETURN_FAIL_ON_RESTART(flags, flagp);
12398 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12402 *flagp |= flags&(HASWIDTH|POSTPONED);
12403 if (chain == 0) /* First piece. */
12404 *flagp |= flags&SPSTART;
12406 /* FIXME adding one for every branch after the first is probably
12407 * excessive now we have TRIE support. (hv) */
12409 if (! REGTAIL(pRExC_state, chain, latest)) {
12410 /* XXX We could just redo this branch, but figuring out what
12411 * bookkeeping needs to be reset is a pain, and it's likely
12412 * that other branches that goto END will also be too large */
12413 REQUIRE_BRANCHJ(flagp, 0);
12419 if (chain == 0) { /* Loop ran zero times. */
12420 chain = reg_node(pRExC_state, NOTHING);
12425 *flagp |= flags&SIMPLE;
12432 - regpiece - something followed by possible quantifier * + ? {n,m}
12434 * Note that the branching code sequences used for ? and the general cases
12435 * of * and + are somewhat optimized: they use the same NOTHING node as
12436 * both the endmarker for their branch list and the body of the last branch.
12437 * It might seem that this node could be dispensed with entirely, but the
12438 * endmarker role is not redundant.
12440 * On success, returns the offset at which any next node should be placed into
12441 * the regex engine program being compiled.
12443 * Returns 0 otherwise, with *flagp set to indicate why:
12444 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12445 * RESTART_PARSE if the parse needs to be restarted, or'd with
12446 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12448 STATIC regnode_offset
12449 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12451 regnode_offset ret;
12455 const char * const origparse = RExC_parse;
12457 I32 max = REG_INFTY;
12458 #ifdef RE_TRACK_PATTERN_OFFSETS
12461 const char *maxpos = NULL;
12464 /* Save the original in case we change the emitted regop to a FAIL. */
12465 const regnode_offset orig_emit = RExC_emit;
12467 GET_RE_DEBUG_FLAGS_DECL;
12469 PERL_ARGS_ASSERT_REGPIECE;
12471 DEBUG_PARSE("piec");
12473 ret = regatom(pRExC_state, &flags, depth+1);
12475 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12476 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12481 if (op == '{' && regcurly(RExC_parse)) {
12483 #ifdef RE_TRACK_PATTERN_OFFSETS
12484 parse_start = RExC_parse; /* MJD */
12486 next = RExC_parse + 1;
12487 while (isDIGIT(*next) || *next == ',') {
12488 if (*next == ',') {
12496 if (*next == '}') { /* got one */
12497 const char* endptr;
12501 if (isDIGIT(*RExC_parse)) {
12503 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12504 vFAIL("Invalid quantifier in {,}");
12505 if (uv >= REG_INFTY)
12506 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12511 if (*maxpos == ',')
12514 maxpos = RExC_parse;
12515 if (isDIGIT(*maxpos)) {
12517 if (!grok_atoUV(maxpos, &uv, &endptr))
12518 vFAIL("Invalid quantifier in {,}");
12519 if (uv >= REG_INFTY)
12520 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12523 max = REG_INFTY; /* meaning "infinity" */
12526 nextchar(pRExC_state);
12527 if (max < min) { /* If can't match, warn and optimize to fail
12529 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12530 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12531 NEXT_OFF(REGNODE_p(orig_emit)) =
12532 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12535 else if (min == max && *RExC_parse == '?')
12537 ckWARN2reg(RExC_parse + 1,
12538 "Useless use of greediness modifier '%c'",
12543 if ((flags&SIMPLE)) {
12544 if (min == 0 && max == REG_INFTY) {
12545 reginsert(pRExC_state, STAR, ret, depth+1);
12547 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12550 if (min == 1 && max == REG_INFTY) {
12551 reginsert(pRExC_state, PLUS, ret, depth+1);
12553 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12556 MARK_NAUGHTY_EXP(2, 2);
12557 reginsert(pRExC_state, CURLY, ret, depth+1);
12558 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12559 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12562 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12564 FLAGS(REGNODE_p(w)) = 0;
12565 REGTAIL(pRExC_state, ret, w);
12566 if (RExC_use_BRANCHJ) {
12567 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12568 reginsert(pRExC_state, NOTHING, ret, depth+1);
12569 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12571 reginsert(pRExC_state, CURLYX, ret, depth+1);
12573 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12574 Set_Node_Length(REGNODE_p(ret),
12575 op == '{' ? (RExC_parse - parse_start) : 1);
12577 if (RExC_use_BRANCHJ)
12578 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12580 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
12581 RExC_whilem_seen++;
12582 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12584 FLAGS(REGNODE_p(ret)) = 0;
12589 *flagp |= HASWIDTH;
12590 ARG1_SET(REGNODE_p(ret), (U16)min);
12591 ARG2_SET(REGNODE_p(ret), (U16)max);
12592 if (max == REG_INFTY)
12593 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12599 if (!ISMULT1(op)) {
12604 #if 0 /* Now runtime fix should be reliable. */
12606 /* if this is reinstated, don't forget to put this back into perldiag:
12608 =item Regexp *+ operand could be empty at {#} in regex m/%s/
12610 (F) The part of the regexp subject to either the * or + quantifier
12611 could match an empty string. The {#} shows in the regular
12612 expression about where the problem was discovered.
12616 if (!(flags&HASWIDTH) && op != '?')
12617 vFAIL("Regexp *+ operand could be empty");
12620 #ifdef RE_TRACK_PATTERN_OFFSETS
12621 parse_start = RExC_parse;
12623 nextchar(pRExC_state);
12625 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
12631 else if (op == '+') {
12635 else if (op == '?') {
12640 if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
12641 ckWARN2reg(RExC_parse,
12642 "%" UTF8f " matches null string many times",
12643 UTF8fARG(UTF, (RExC_parse >= origparse
12644 ? RExC_parse - origparse
12649 if (*RExC_parse == '?') {
12650 nextchar(pRExC_state);
12651 reginsert(pRExC_state, MINMOD, ret, depth+1);
12652 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
12654 else if (*RExC_parse == '+') {
12655 regnode_offset ender;
12656 nextchar(pRExC_state);
12657 ender = reg_node(pRExC_state, SUCCEED);
12658 REGTAIL(pRExC_state, ret, ender);
12659 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12660 ender = reg_node(pRExC_state, TAIL);
12661 REGTAIL(pRExC_state, ret, ender);
12664 if (ISMULT2(RExC_parse)) {
12666 vFAIL("Nested quantifiers");
12673 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12674 regnode_offset * node_p,
12682 /* This routine teases apart the various meanings of \N and returns
12683 * accordingly. The input parameters constrain which meaning(s) is/are valid
12684 * in the current context.
12686 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12688 * If <code_point_p> is not NULL, the context is expecting the result to be a
12689 * single code point. If this \N instance turns out to a single code point,
12690 * the function returns TRUE and sets *code_point_p to that code point.
12692 * If <node_p> is not NULL, the context is expecting the result to be one of
12693 * the things representable by a regnode. If this \N instance turns out to be
12694 * one such, the function generates the regnode, returns TRUE and sets *node_p
12695 * to point to the offset of that regnode into the regex engine program being
12698 * If this instance of \N isn't legal in any context, this function will
12699 * generate a fatal error and not return.
12701 * On input, RExC_parse should point to the first char following the \N at the
12702 * time of the call. On successful return, RExC_parse will have been updated
12703 * to point to just after the sequence identified by this routine. Also
12704 * *flagp has been updated as needed.
12706 * When there is some problem with the current context and this \N instance,
12707 * the function returns FALSE, without advancing RExC_parse, nor setting
12708 * *node_p, nor *code_point_p, nor *flagp.
12710 * If <cp_count> is not NULL, the caller wants to know the length (in code
12711 * points) that this \N sequence matches. This is set, and the input is
12712 * parsed for errors, even if the function returns FALSE, as detailed below.
12714 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12716 * Probably the most common case is for the \N to specify a single code point.
12717 * *cp_count will be set to 1, and *code_point_p will be set to that code
12720 * Another possibility is for the input to be an empty \N{}. This is no
12721 * longer accepted, and will generate a fatal error.
12723 * Another possibility is for a custom charnames handler to be in effect which
12724 * translates the input name to an empty string. *cp_count will be set to 0.
12725 * *node_p will be set to a generated NOTHING node.
12727 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12728 * set to 0. *node_p will be set to a generated REG_ANY node.
12730 * The fifth possibility is that \N resolves to a sequence of more than one
12731 * code points. *cp_count will be set to the number of code points in the
12732 * sequence. *node_p will be set to a generated node returned by this
12733 * function calling S_reg().
12735 * The final possibility is that it is premature to be calling this function;
12736 * the parse needs to be restarted. This can happen when this changes from
12737 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12738 * latter occurs only when the fifth possibility would otherwise be in
12739 * effect, and is because one of those code points requires the pattern to be
12740 * recompiled as UTF-8. The function returns FALSE, and sets the
12741 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12742 * happens, the caller needs to desist from continuing parsing, and return
12743 * this information to its caller. This is not set for when there is only one
12744 * code point, as this can be called as part of an ANYOF node, and they can
12745 * store above-Latin1 code points without the pattern having to be in UTF-8.
12747 * For non-single-quoted regexes, the tokenizer has resolved character and
12748 * sequence names inside \N{...} into their Unicode values, normalizing the
12749 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12750 * hex-represented code points in the sequence. This is done there because
12751 * the names can vary based on what charnames pragma is in scope at the time,
12752 * so we need a way to take a snapshot of what they resolve to at the time of
12753 * the original parse. [perl #56444].
12755 * That parsing is skipped for single-quoted regexes, so here we may get
12756 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
12757 * like '\N{U+41}', that code point is Unicode, and has to be translated into
12758 * the native character set for non-ASCII platforms. The other possibilities
12759 * are already native, so no translation is done. */
12761 char * endbrace; /* points to '}' following the name */
12762 char* p = RExC_parse; /* Temporary */
12764 SV * substitute_parse = NULL;
12769 GET_RE_DEBUG_FLAGS_DECL;
12771 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12773 GET_RE_DEBUG_FLAGS;
12775 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12776 assert(! (node_p && cp_count)); /* At most 1 should be set */
12778 if (cp_count) { /* Initialize return for the most common case */
12782 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12783 * modifier. The other meanings do not, so use a temporary until we find
12784 * out which we are being called with */
12785 skip_to_be_ignored_text(pRExC_state, &p,
12786 FALSE /* Don't force to /x */ );
12788 /* Disambiguate between \N meaning a named character versus \N meaning
12789 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12790 * quantifier, or if there is no '{' at all */
12791 if (*p != '{' || regcurly(p)) {
12801 *node_p = reg_node(pRExC_state, REG_ANY);
12802 *flagp |= HASWIDTH|SIMPLE;
12804 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
12808 /* The test above made sure that the next real character is a '{', but
12809 * under the /x modifier, it could be separated by space (or a comment and
12810 * \n) and this is not allowed (for consistency with \x{...} and the
12811 * tokenizer handling of \N{NAME}). */
12812 if (*RExC_parse != '{') {
12813 vFAIL("Missing braces on \\N{}");
12816 RExC_parse++; /* Skip past the '{' */
12818 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
12819 if (! endbrace) { /* no trailing brace */
12820 vFAIL2("Missing right brace on \\%c{}", 'N');
12823 /* Here, we have decided it should be a named character or sequence. These
12824 * imply Unicode semantics */
12825 REQUIRE_UNI_RULES(flagp, FALSE);
12827 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
12828 * nothing at all (not allowed under strict) */
12829 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
12830 RExC_parse = endbrace;
12832 RExC_parse++; /* Position after the "}" */
12833 vFAIL("Zero length \\N{}");
12839 nextchar(pRExC_state);
12844 *node_p = reg_node(pRExC_state, NOTHING);
12848 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
12850 /* Here, the name isn't of the form U+.... This can happen if the
12851 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
12852 * is the time to find out what the name means */
12854 const STRLEN name_len = endbrace - RExC_parse;
12855 SV * value_sv; /* What does this name evaluate to */
12857 const U8 * value; /* string of name's value */
12858 STRLEN value_len; /* and its length */
12860 /* RExC_unlexed_names is a hash of names that weren't evaluated by
12861 * toke.c, and their values. Make sure is initialized */
12862 if (! RExC_unlexed_names) {
12863 RExC_unlexed_names = newHV();
12866 /* If we have already seen this name in this pattern, use that. This
12867 * allows us to only call the charnames handler once per name per
12868 * pattern. A broken or malicious handler could return something
12869 * different each time, which could cause the results to vary depending
12870 * on if something gets added or subtracted from the pattern that
12871 * causes the number of passes to change, for example */
12872 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
12875 value_sv = *value_svp;
12877 else { /* Otherwise we have to go out and get the name */
12878 const char * error_msg = NULL;
12879 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
12883 RExC_parse = endbrace;
12887 /* If no error message, should have gotten a valid return */
12890 /* Save the name's meaning for later use */
12891 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
12894 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
12898 /* Here, we have the value the name evaluates to in 'value_sv' */
12899 value = (U8 *) SvPV(value_sv, value_len);
12901 /* See if the result is one code point vs 0 or multiple */
12902 if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
12906 /* Here, exactly one code point. If that isn't what is wanted,
12908 if (! code_point_p) {
12913 /* Convert from string to numeric code point */
12914 *code_point_p = (SvUTF8(value_sv))
12915 ? valid_utf8_to_uvchr(value, NULL)
12918 /* Have parsed this entire single code point \N{...}. *cp_count
12919 * has already been set to 1, so don't do it again. */
12920 RExC_parse = endbrace;
12921 nextchar(pRExC_state);
12923 } /* End of is a single code point */
12925 /* Count the code points, if caller desires. The API says to do this
12926 * even if we will later return FALSE */
12930 *cp_count = (SvUTF8(value_sv))
12931 ? utf8_length(value, value + value_len)
12935 /* Fail if caller doesn't want to handle a multi-code-point sequence.
12936 * But don't back the pointer up if the caller wants to know how many
12937 * code points there are (they need to handle it themselves in this
12946 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
12947 * reg recursively to parse it. That way, it retains its atomicness,
12948 * while not having to worry about any special handling that some code
12949 * points may have. */
12951 substitute_parse = newSVpvs("?:");
12952 sv_catsv(substitute_parse, value_sv);
12953 sv_catpv(substitute_parse, ")");
12956 /* The value should already be native, so no need to convert on EBCDIC
12958 assert(! RExC_recode_x_to_native);
12962 else { /* \N{U+...} */
12963 Size_t count = 0; /* code point count kept internally */
12965 /* We can get to here when the input is \N{U+...} or when toke.c has
12966 * converted a name to the \N{U+...} form. This include changing a
12967 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
12969 RExC_parse += 2; /* Skip past the 'U+' */
12971 /* Code points are separated by dots. The '}' terminates the whole
12974 do { /* Loop until the ending brace */
12976 char * start_digit; /* The first of the current code point */
12977 if (! isXDIGIT(*RExC_parse)) {
12979 vFAIL("Invalid hexadecimal number in \\N{U+...}");
12982 start_digit = RExC_parse;
12985 /* Loop through the hex digits of the current code point */
12987 /* Adding this digit will shift the result 4 bits. If that
12988 * result would be above the legal max, it's overflow */
12989 if (cp > MAX_LEGAL_CP >> 4) {
12991 /* Find the end of the code point */
12994 } while (isXDIGIT(*RExC_parse) || *RExC_parse == '_');
12996 /* Be sure to synchronize this message with the similar one
12998 vFAIL4("Use of code point 0x%.*s is not allowed; the"
12999 " permissible max is 0x%" UVxf,
13000 (int) (RExC_parse - start_digit), start_digit,
13004 /* Accumulate this (valid) digit into the running total */
13005 cp = (cp << 4) + READ_XDIGIT(RExC_parse);
13007 /* READ_XDIGIT advanced the input pointer. Ignore a single
13008 * underscore separator */
13009 if (*RExC_parse == '_' && isXDIGIT(RExC_parse[1])) {
13012 } while (isXDIGIT(*RExC_parse));
13014 /* Here, have accumulated the next code point */
13015 if (RExC_parse >= endbrace) { /* If done ... */
13020 /* Here, is a single code point; fail if doesn't want that */
13021 if (! code_point_p) {
13026 /* A single code point is easy to handle; just return it */
13027 *code_point_p = UNI_TO_NATIVE(cp);
13028 RExC_parse = endbrace;
13029 nextchar(pRExC_state);
13033 /* Here, the only legal thing would be a multiple character
13034 * sequence (of the form "\N{U+c1.c2. ... }". So the next
13035 * character must be a dot (and the one after that can't be the
13036 * endbrace, or we'd have something like \N{U+100.} ) */
13037 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13038 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
13039 ? UTF8SKIP(RExC_parse)
13041 if (RExC_parse >= endbrace) { /* Guard against malformed utf8 */
13042 RExC_parse = endbrace;
13044 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13047 /* Here, looks like its really a multiple character sequence. Fail
13048 * if that's not what the caller wants. But continue with counting
13049 * and error checking if they still want a count */
13050 if (! node_p && ! cp_count) {
13054 /* What is done here is to convert this to a sub-pattern of the
13055 * form \x{char1}\x{char2}... and then call reg recursively to
13056 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13057 * atomicness, while not having to worry about special handling
13058 * that some code points may have. We don't create a subpattern,
13059 * but go through the motions of code point counting and error
13060 * checking, if the caller doesn't want a node returned. */
13062 if (node_p && count == 1) {
13063 substitute_parse = newSVpvs("?:");
13069 /* Convert to notation the rest of the code understands */
13070 sv_catpvs(substitute_parse, "\\x{");
13071 sv_catpvn(substitute_parse, start_digit,
13072 RExC_parse - start_digit);
13073 sv_catpvs(substitute_parse, "}");
13076 /* Move to after the dot (or ending brace the final time through.)
13081 } while (RExC_parse < endbrace);
13083 if (! node_p) { /* Doesn't want the node */
13090 sv_catpvs(substitute_parse, ")");
13093 /* The values are Unicode, and therefore have to be converted to native
13094 * on a non-Unicode (meaning non-ASCII) platform. */
13095 RExC_recode_x_to_native = 1;
13100 /* Here, we have the string the name evaluates to, ready to be parsed,
13101 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13102 * constructs. This can be called from within a substitute parse already.
13103 * The error reporting mechanism doesn't work for 2 levels of this, but the
13104 * code above has validated this new construct, so there should be no
13105 * errors generated by the below. And this isn' an exact copy, so the
13106 * mechanism to seamlessly deal with this won't work, so turn off warnings
13108 save_start = RExC_start;
13109 orig_end = RExC_end;
13111 RExC_parse = RExC_start = SvPVX(substitute_parse);
13112 RExC_end = RExC_parse + SvCUR(substitute_parse);
13113 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13115 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13117 /* Restore the saved values */
13119 RExC_start = save_start;
13120 RExC_parse = endbrace;
13121 RExC_end = orig_end;
13123 RExC_recode_x_to_native = 0;
13126 SvREFCNT_dec_NN(substitute_parse);
13129 RETURN_FAIL_ON_RESTART(flags, flagp);
13130 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13133 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13135 nextchar(pRExC_state);
13141 PERL_STATIC_INLINE U8
13142 S_compute_EXACTish(RExC_state_t *pRExC_state)
13146 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13154 op = get_regex_charset(RExC_flags);
13155 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13156 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13157 been, so there is no hole */
13160 return op + EXACTF;
13164 S_new_regcurly(const char *s, const char *e)
13166 /* This is a temporary function designed to match the most lenient form of
13167 * a {m,n} quantifier we ever envision, with either number omitted, and
13168 * spaces anywhere between/before/after them.
13170 * If this function fails, then the string it matches is very unlikely to
13171 * ever be considered a valid quantifier, so we can allow the '{' that
13172 * begins it to be considered as a literal */
13174 bool has_min = FALSE;
13175 bool has_max = FALSE;
13177 PERL_ARGS_ASSERT_NEW_REGCURLY;
13179 if (s >= e || *s++ != '{')
13182 while (s < e && isSPACE(*s)) {
13185 while (s < e && isDIGIT(*s)) {
13189 while (s < e && isSPACE(*s)) {
13195 while (s < e && isSPACE(*s)) {
13198 while (s < e && isDIGIT(*s)) {
13202 while (s < e && isSPACE(*s)) {
13207 return s < e && *s == '}' && (has_min || has_max);
13210 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13211 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13214 S_backref_value(char *p, char *e)
13216 const char* endptr = e;
13218 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13225 - regatom - the lowest level
13227 Try to identify anything special at the start of the current parse position.
13228 If there is, then handle it as required. This may involve generating a
13229 single regop, such as for an assertion; or it may involve recursing, such as
13230 to handle a () structure.
13232 If the string doesn't start with something special then we gobble up
13233 as much literal text as we can. If we encounter a quantifier, we have to
13234 back off the final literal character, as that quantifier applies to just it
13235 and not to the whole string of literals.
13237 Once we have been able to handle whatever type of thing started the
13238 sequence, we return the offset into the regex engine program being compiled
13239 at which any next regnode should be placed.
13241 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13242 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13243 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13244 Otherwise does not return 0.
13246 Note: we have to be careful with escapes, as they can be both literal
13247 and special, and in the case of \10 and friends, context determines which.
13249 A summary of the code structure is:
13251 switch (first_byte) {
13252 cases for each special:
13253 handle this special;
13256 switch (2nd byte) {
13257 cases for each unambiguous special:
13258 handle this special;
13260 cases for each ambigous special/literal:
13262 if (special) handle here
13264 default: // unambiguously literal:
13267 default: // is a literal char
13270 create EXACTish node for literal;
13271 while (more input and node isn't full) {
13272 switch (input_byte) {
13273 cases for each special;
13274 make sure parse pointer is set so that the next call to
13275 regatom will see this special first
13276 goto loopdone; // EXACTish node terminated by prev. char
13278 append char to EXACTISH node;
13280 get next input byte;
13284 return the generated node;
13286 Specifically there are two separate switches for handling
13287 escape sequences, with the one for handling literal escapes requiring
13288 a dummy entry for all of the special escapes that are actually handled
13293 STATIC regnode_offset
13294 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13297 regnode_offset ret = 0;
13304 GET_RE_DEBUG_FLAGS_DECL;
13306 *flagp = WORST; /* Tentatively. */
13308 DEBUG_PARSE("atom");
13310 PERL_ARGS_ASSERT_REGATOM;
13313 parse_start = RExC_parse;
13314 assert(RExC_parse < RExC_end);
13315 switch ((U8)*RExC_parse) {
13317 RExC_seen_zerolen++;
13318 nextchar(pRExC_state);
13319 if (RExC_flags & RXf_PMf_MULTILINE)
13320 ret = reg_node(pRExC_state, MBOL);
13322 ret = reg_node(pRExC_state, SBOL);
13323 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13326 nextchar(pRExC_state);
13328 RExC_seen_zerolen++;
13329 if (RExC_flags & RXf_PMf_MULTILINE)
13330 ret = reg_node(pRExC_state, MEOL);
13332 ret = reg_node(pRExC_state, SEOL);
13333 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13336 nextchar(pRExC_state);
13337 if (RExC_flags & RXf_PMf_SINGLELINE)
13338 ret = reg_node(pRExC_state, SANY);
13340 ret = reg_node(pRExC_state, REG_ANY);
13341 *flagp |= HASWIDTH|SIMPLE;
13343 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13347 char * const oregcomp_parse = ++RExC_parse;
13348 ret = regclass(pRExC_state, flagp, depth+1,
13349 FALSE, /* means parse the whole char class */
13350 TRUE, /* allow multi-char folds */
13351 FALSE, /* don't silence non-portable warnings. */
13352 (bool) RExC_strict,
13353 TRUE, /* Allow an optimized regnode result */
13356 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13357 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13360 if (*RExC_parse != ']') {
13361 RExC_parse = oregcomp_parse;
13362 vFAIL("Unmatched [");
13364 nextchar(pRExC_state);
13365 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13369 nextchar(pRExC_state);
13370 ret = reg(pRExC_state, 2, &flags, depth+1);
13372 if (flags & TRYAGAIN) {
13373 if (RExC_parse >= RExC_end) {
13374 /* Make parent create an empty node if needed. */
13375 *flagp |= TRYAGAIN;
13380 RETURN_FAIL_ON_RESTART(flags, flagp);
13381 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13384 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
13388 if (flags & TRYAGAIN) {
13389 *flagp |= TRYAGAIN;
13392 vFAIL("Internal urp");
13393 /* Supposed to be caught earlier. */
13399 vFAIL("Quantifier follows nothing");
13404 This switch handles escape sequences that resolve to some kind
13405 of special regop and not to literal text. Escape sequences that
13406 resolve to literal text are handled below in the switch marked
13409 Every entry in this switch *must* have a corresponding entry
13410 in the literal escape switch. However, the opposite is not
13411 required, as the default for this switch is to jump to the
13412 literal text handling code.
13415 switch ((U8)*RExC_parse) {
13416 /* Special Escapes */
13418 RExC_seen_zerolen++;
13419 ret = reg_node(pRExC_state, SBOL);
13420 /* SBOL is shared with /^/ so we set the flags so we can tell
13421 * /\A/ from /^/ in split. */
13422 FLAGS(REGNODE_p(ret)) = 1;
13424 goto finish_meta_pat;
13426 ret = reg_node(pRExC_state, GPOS);
13427 RExC_seen |= REG_GPOS_SEEN;
13429 goto finish_meta_pat;
13431 RExC_seen_zerolen++;
13432 ret = reg_node(pRExC_state, KEEPS);
13434 /* XXX:dmq : disabling in-place substitution seems to
13435 * be necessary here to avoid cases of memory corruption, as
13436 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13438 RExC_seen |= REG_LOOKBEHIND_SEEN;
13439 goto finish_meta_pat;
13441 ret = reg_node(pRExC_state, SEOL);
13443 RExC_seen_zerolen++; /* Do not optimize RE away */
13444 goto finish_meta_pat;
13446 ret = reg_node(pRExC_state, EOS);
13448 RExC_seen_zerolen++; /* Do not optimize RE away */
13449 goto finish_meta_pat;
13451 vFAIL("\\C no longer supported");
13453 ret = reg_node(pRExC_state, CLUMP);
13454 *flagp |= HASWIDTH;
13455 goto finish_meta_pat;
13461 arg = ANYOF_WORDCHAR;
13470 regex_charset charset = get_regex_charset(RExC_flags);
13472 RExC_seen_zerolen++;
13473 RExC_seen |= REG_LOOKBEHIND_SEEN;
13474 op = BOUND + charset;
13476 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13477 flags = TRADITIONAL_BOUND;
13478 if (op > BOUNDA) { /* /aa is same as /a */
13484 char name = *RExC_parse;
13485 char * endbrace = NULL;
13487 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13490 vFAIL2("Missing right brace on \\%c{}", name);
13492 /* XXX Need to decide whether to take spaces or not. Should be
13493 * consistent with \p{}, but that currently is SPACE, which
13494 * means vertical too, which seems wrong
13495 * while (isBLANK(*RExC_parse)) {
13498 if (endbrace == RExC_parse) {
13499 RExC_parse++; /* After the '}' */
13500 vFAIL2("Empty \\%c{}", name);
13502 length = endbrace - RExC_parse;
13503 /*while (isBLANK(*(RExC_parse + length - 1))) {
13506 switch (*RExC_parse) {
13509 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13511 goto bad_bound_type;
13516 if (length != 2 || *(RExC_parse + 1) != 'b') {
13517 goto bad_bound_type;
13522 if (length != 2 || *(RExC_parse + 1) != 'b') {
13523 goto bad_bound_type;
13528 if (length != 2 || *(RExC_parse + 1) != 'b') {
13529 goto bad_bound_type;
13535 RExC_parse = endbrace;
13537 "'%" UTF8f "' is an unknown bound type",
13538 UTF8fARG(UTF, length, endbrace - length));
13539 NOT_REACHED; /*NOTREACHED*/
13541 RExC_parse = endbrace;
13542 REQUIRE_UNI_RULES(flagp, 0);
13547 else if (op >= BOUNDA) { /* /aa is same as /a */
13551 /* Don't have to worry about UTF-8, in this message because
13552 * to get here the contents of the \b must be ASCII */
13553 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13554 "Using /u for '%.*s' instead of /%s",
13556 endbrace - length + 1,
13557 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13558 ? ASCII_RESTRICT_PAT_MODS
13559 : ASCII_MORE_RESTRICT_PAT_MODS);
13564 RExC_seen_d_op = TRUE;
13566 else if (op == BOUNDL) {
13567 RExC_contains_locale = 1;
13571 op += NBOUND - BOUND;
13574 ret = reg_node(pRExC_state, op);
13575 FLAGS(REGNODE_p(ret)) = flags;
13579 goto finish_meta_pat;
13587 if (! DEPENDS_SEMANTICS) {
13591 /* \d doesn't have any matches in the upper Latin1 range, hence /d
13592 * is equivalent to /u. Changing to /u saves some branches at
13595 goto join_posix_op_known;
13598 ret = reg_node(pRExC_state, LNBREAK);
13599 *flagp |= HASWIDTH|SIMPLE;
13600 goto finish_meta_pat;
13608 goto join_posix_op_known;
13614 arg = ANYOF_VERTWS;
13616 goto join_posix_op_known;
13626 op = POSIXD + get_regex_charset(RExC_flags);
13627 if (op > POSIXA) { /* /aa is same as /a */
13630 else if (op == POSIXL) {
13631 RExC_contains_locale = 1;
13633 else if (op == POSIXD) {
13634 RExC_seen_d_op = TRUE;
13637 join_posix_op_known:
13640 op += NPOSIXD - POSIXD;
13643 ret = reg_node(pRExC_state, op);
13644 FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
13646 *flagp |= HASWIDTH|SIMPLE;
13650 if ( UCHARAT(RExC_parse + 1) == '{'
13651 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13654 vFAIL("Unescaped left brace in regex is illegal here");
13656 nextchar(pRExC_state);
13657 Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
13663 ret = regclass(pRExC_state, flagp, depth+1,
13664 TRUE, /* means just parse this element */
13665 FALSE, /* don't allow multi-char folds */
13666 FALSE, /* don't silence non-portable warnings. It
13667 would be a bug if these returned
13669 (bool) RExC_strict,
13670 TRUE, /* Allow an optimized regnode result */
13672 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13673 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13674 * multi-char folds are allowed. */
13676 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13681 Set_Node_Offset(REGNODE_p(ret), parse_start);
13682 Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
13683 nextchar(pRExC_state);
13686 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13687 * \N{...} evaluates to a sequence of more than one code points).
13688 * The function call below returns a regnode, which is our result.
13689 * The parameters cause it to fail if the \N{} evaluates to a
13690 * single code point; we handle those like any other literal. The
13691 * reason that the multicharacter case is handled here and not as
13692 * part of the EXACtish code is because of quantifiers. In
13693 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13694 * this way makes that Just Happen. dmq.
13695 * join_exact() will join this up with adjacent EXACTish nodes
13696 * later on, if appropriate. */
13698 if (grok_bslash_N(pRExC_state,
13699 &ret, /* Want a regnode returned */
13700 NULL, /* Fail if evaluates to a single code
13702 NULL, /* Don't need a count of how many code
13711 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13713 /* Here, evaluates to a single code point. Go get that */
13714 RExC_parse = parse_start;
13717 case 'k': /* Handle \k<NAME> and \k'NAME' */
13721 if ( RExC_parse >= RExC_end - 1
13722 || (( ch = RExC_parse[1]) != '<'
13727 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13728 vFAIL2("Sequence %.2s... not terminated", parse_start);
13731 ret = handle_named_backref(pRExC_state,
13743 case '1': case '2': case '3': case '4':
13744 case '5': case '6': case '7': case '8': case '9':
13749 if (*RExC_parse == 'g') {
13753 if (*RExC_parse == '{') {
13757 if (*RExC_parse == '-') {
13761 if (hasbrace && !isDIGIT(*RExC_parse)) {
13762 if (isrel) RExC_parse--;
13764 goto parse_named_seq;
13767 if (RExC_parse >= RExC_end) {
13768 goto unterminated_g;
13770 num = S_backref_value(RExC_parse, RExC_end);
13772 vFAIL("Reference to invalid group 0");
13773 else if (num == I32_MAX) {
13774 if (isDIGIT(*RExC_parse))
13775 vFAIL("Reference to nonexistent group");
13778 vFAIL("Unterminated \\g... pattern");
13782 num = RExC_npar - num;
13784 vFAIL("Reference to nonexistent or unclosed group");
13788 num = S_backref_value(RExC_parse, RExC_end);
13789 /* bare \NNN might be backref or octal - if it is larger
13790 * than or equal RExC_npar then it is assumed to be an
13791 * octal escape. Note RExC_npar is +1 from the actual
13792 * number of parens. */
13793 /* Note we do NOT check if num == I32_MAX here, as that is
13794 * handled by the RExC_npar check */
13797 /* any numeric escape < 10 is always a backref */
13799 /* any numeric escape < RExC_npar is a backref */
13800 && num >= RExC_npar
13801 /* cannot be an octal escape if it starts with 8 */
13802 && *RExC_parse != '8'
13803 /* cannot be an octal escape if it starts with 9 */
13804 && *RExC_parse != '9'
13806 /* Probably not meant to be a backref, instead likely
13807 * to be an octal character escape, e.g. \35 or \777.
13808 * The above logic should make it obvious why using
13809 * octal escapes in patterns is problematic. - Yves */
13810 RExC_parse = parse_start;
13815 /* At this point RExC_parse points at a numeric escape like
13816 * \12 or \88 or something similar, which we should NOT treat
13817 * as an octal escape. It may or may not be a valid backref
13818 * escape. For instance \88888888 is unlikely to be a valid
13820 while (isDIGIT(*RExC_parse))
13823 if (*RExC_parse != '}')
13824 vFAIL("Unterminated \\g{...} pattern");
13827 if (num >= (I32)RExC_npar) {
13829 /* It might be a forward reference; we can't fail until we
13830 * know, by completing the parse to get all the groups, and
13831 * then reparsing */
13832 if (ALL_PARENS_COUNTED) {
13833 if (num >= RExC_total_parens) {
13834 vFAIL("Reference to nonexistent group");
13838 REQUIRE_PARENS_PASS;
13842 ret = reganode(pRExC_state,
13845 : (ASCII_FOLD_RESTRICTED)
13847 : (AT_LEAST_UNI_SEMANTICS)
13853 if (OP(REGNODE_p(ret)) == REFF) {
13854 RExC_seen_d_op = TRUE;
13856 *flagp |= HASWIDTH;
13858 /* override incorrect value set in reganode MJD */
13859 Set_Node_Offset(REGNODE_p(ret), parse_start);
13860 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13861 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13862 FALSE /* Don't force to /x */ );
13866 if (RExC_parse >= RExC_end)
13867 FAIL("Trailing \\");
13870 /* Do not generate "unrecognized" warnings here, we fall
13871 back into the quick-grab loop below */
13872 RExC_parse = parse_start;
13874 } /* end of switch on a \foo sequence */
13879 /* '#' comments should have been spaced over before this function was
13881 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
13883 if (RExC_flags & RXf_PMf_EXTENDED) {
13884 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
13885 if (RExC_parse < RExC_end)
13895 /* Here, we have determined that the next thing is probably a
13896 * literal character. RExC_parse points to the first byte of its
13897 * definition. (It still may be an escape sequence that evaluates
13898 * to a single character) */
13905 /* This allows us to fill a node with just enough spare so that if the final
13906 * character folds, its expansion is guaranteed to fit */
13907 #define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
13910 U8 upper_parse = MAX_NODE_STRING_SIZE;
13912 /* We start out as an EXACT node, even if under /i, until we find a
13913 * character which is in a fold. The algorithm now segregates into
13914 * separate nodes, characters that fold from those that don't under
13915 * /i. (This hopefully will create nodes that are fixed strings
13916 * even under /i, giving the optimizer something to grab on to.)
13917 * So, if a node has something in it and the next character is in
13918 * the opposite category, that node is closed up, and the function
13919 * returns. Then regatom is called again, and a new node is
13920 * created for the new category. */
13921 U8 node_type = EXACT;
13923 /* Assume the node will be fully used; the excess is given back at
13924 * the end. We can't make any other length assumptions, as a byte
13925 * input sequence could shrink down. */
13926 Ptrdiff_t initial_size = STR_SZ(256);
13928 bool next_is_quantifier;
13929 char * oldp = NULL;
13931 /* We can convert EXACTF nodes to EXACTFU if they contain only
13932 * characters that match identically regardless of the target
13933 * string's UTF8ness. The reason to do this is that EXACTF is not
13934 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
13937 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
13938 * contain only above-Latin1 characters (hence must be in UTF8),
13939 * which don't participate in folds with Latin1-range characters,
13940 * as the latter's folds aren't known until runtime. */
13941 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
13943 /* Single-character EXACTish nodes are almost always SIMPLE. This
13944 * allows us to override this as encountered */
13945 U8 maybe_SIMPLE = SIMPLE;
13947 /* Does this node contain something that can't match unless the
13948 * target string is (also) in UTF-8 */
13949 bool requires_utf8_target = FALSE;
13951 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
13952 bool has_ss = FALSE;
13954 /* So is the MICRO SIGN */
13955 bool has_micro_sign = FALSE;
13957 /* Allocate an EXACT node. The node_type may change below to
13958 * another EXACTish node, but since the size of the node doesn't
13959 * change, it works */
13960 ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
13961 FILL_NODE(ret, node_type);
13964 s = STRING(REGNODE_p(ret));
13970 /* This breaks under rare circumstances. If folding, we do not
13971 * want to split a node at a character that is a non-final in a
13972 * multi-char fold, as an input string could just happen to want to
13973 * match across the node boundary. The code at the end of the loop
13974 * looks for this, and backs off until it finds not such a
13975 * character, but it is possible (though extremely, extremely
13976 * unlikely) for all characters in the node to be non-final fold
13977 * ones, in which case we just leave the node fully filled, and
13978 * hope that it doesn't match the string in just the wrong place */
13980 assert( ! UTF /* Is at the beginning of a character */
13981 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
13982 || UTF8_IS_START(UCHARAT(RExC_parse)));
13984 /* Here, we have a literal character. Find the maximal string of
13985 * them in the input that we can fit into a single EXACTish node.
13986 * We quit at the first non-literal or when the node gets full, or
13987 * under /i the categorization of folding/non-folding character
13989 for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
13991 /* In most cases each iteration adds one byte to the output.
13992 * The exceptions override this */
13993 Size_t added_len = 1;
13997 /* White space has already been ignored */
13998 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
13999 || ! is_PATWS_safe((p), RExC_end, UTF));
14011 /* Literal Escapes Switch
14013 This switch is meant to handle escape sequences that
14014 resolve to a literal character.
14016 Every escape sequence that represents something
14017 else, like an assertion or a char class, is handled
14018 in the switch marked 'Special Escapes' above in this
14019 routine, but also has an entry here as anything that
14020 isn't explicitly mentioned here will be treated as
14021 an unescaped equivalent literal.
14024 switch ((U8)*++p) {
14026 /* These are all the special escapes. */
14027 case 'A': /* Start assertion */
14028 case 'b': case 'B': /* Word-boundary assertion*/
14029 case 'C': /* Single char !DANGEROUS! */
14030 case 'd': case 'D': /* digit class */
14031 case 'g': case 'G': /* generic-backref, pos assertion */
14032 case 'h': case 'H': /* HORIZWS */
14033 case 'k': case 'K': /* named backref, keep marker */
14034 case 'p': case 'P': /* Unicode property */
14035 case 'R': /* LNBREAK */
14036 case 's': case 'S': /* space class */
14037 case 'v': case 'V': /* VERTWS */
14038 case 'w': case 'W': /* word class */
14039 case 'X': /* eXtended Unicode "combining
14040 character sequence" */
14041 case 'z': case 'Z': /* End of line/string assertion */
14045 /* Anything after here is an escape that resolves to a
14046 literal. (Except digits, which may or may not)
14052 case 'N': /* Handle a single-code point named character. */
14053 RExC_parse = p + 1;
14054 if (! grok_bslash_N(pRExC_state,
14055 NULL, /* Fail if evaluates to
14056 anything other than a
14057 single code point */
14058 &ender, /* The returned single code
14060 NULL, /* Don't need a count of
14061 how many code points */
14066 if (*flagp & NEED_UTF8)
14067 FAIL("panic: grok_bslash_N set NEED_UTF8");
14068 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14070 /* Here, it wasn't a single code point. Go close
14071 * up this EXACTish node. The switch() prior to
14072 * this switch handles the other cases */
14073 RExC_parse = p = oldp;
14077 RExC_parse = parse_start;
14079 /* The \N{} means the pattern, if previously /d,
14080 * becomes /u. That means it can't be an EXACTF node,
14081 * but an EXACTFU */
14082 if (node_type == EXACTF) {
14083 node_type = EXACTFU;
14085 /* If the node already contains something that
14086 * differs between EXACTF and EXACTFU, reparse it
14088 if (! maybe_exactfu) {
14109 ender = ESC_NATIVE;
14119 const char* error_msg;
14121 bool valid = grok_bslash_o(&p,
14125 TO_OUTPUT_WARNINGS(p),
14126 (bool) RExC_strict,
14127 TRUE, /* Output warnings
14132 RExC_parse = p; /* going to die anyway; point
14133 to exact spot of failure */
14136 UPDATE_WARNINGS_LOC(p - 1);
14142 UV result = UV_MAX; /* initialize to erroneous
14144 const char* error_msg;
14146 bool valid = grok_bslash_x(&p,
14150 TO_OUTPUT_WARNINGS(p),
14151 (bool) RExC_strict,
14152 TRUE, /* Silence warnings
14157 RExC_parse = p; /* going to die anyway; point
14158 to exact spot of failure */
14161 UPDATE_WARNINGS_LOC(p - 1);
14164 if (ender < 0x100) {
14166 if (RExC_recode_x_to_native) {
14167 ender = LATIN1_TO_NATIVE(ender);
14175 ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
14176 UPDATE_WARNINGS_LOC(p);
14179 case '8': case '9': /* must be a backreference */
14181 /* we have an escape like \8 which cannot be an octal escape
14182 * so we exit the loop, and let the outer loop handle this
14183 * escape which may or may not be a legitimate backref. */
14185 case '1': case '2': case '3':case '4':
14186 case '5': case '6': case '7':
14187 /* When we parse backslash escapes there is ambiguity
14188 * between backreferences and octal escapes. Any escape
14189 * from \1 - \9 is a backreference, any multi-digit
14190 * escape which does not start with 0 and which when
14191 * evaluated as decimal could refer to an already
14192 * parsed capture buffer is a back reference. Anything
14195 * Note this implies that \118 could be interpreted as
14196 * 118 OR as "\11" . "8" depending on whether there
14197 * were 118 capture buffers defined already in the
14200 /* NOTE, RExC_npar is 1 more than the actual number of
14201 * parens we have seen so far, hence the "<" as opposed
14203 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14204 { /* Not to be treated as an octal constant, go
14212 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14214 ender = grok_oct(p, &numlen, &flags, NULL);
14216 if ( isDIGIT(*p) /* like \08, \178 */
14217 && ckWARN(WARN_REGEXP)
14220 reg_warn_non_literal_string(
14222 form_short_octal_warning(p, numlen));
14228 FAIL("Trailing \\");
14231 if (isALPHANUMERIC(*p)) {
14232 /* An alpha followed by '{' is going to fail next
14233 * iteration, so don't output this warning in that
14235 if (! isALPHA(*p) || *(p + 1) != '{') {
14236 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14237 " passed through", p);
14240 goto normal_default;
14241 } /* End of switch on '\' */
14244 /* Trying to gain new uses for '{' without breaking too
14245 * much existing code is hard. The solution currently
14247 * 1) If there is no ambiguity that a '{' should always
14248 * be taken literally, at the start of a construct, we
14250 * 2) If the literal '{' conflicts with our desired use
14251 * of it as a metacharacter, we die. The deprecation
14252 * cycles for this have come and gone.
14253 * 3) If there is ambiguity, we raise a simple warning.
14254 * This could happen, for example, if the user
14255 * intended it to introduce a quantifier, but slightly
14256 * misspelled the quantifier. Without this warning,
14257 * the quantifier would silently be taken as a literal
14258 * string of characters instead of a meta construct */
14259 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14261 || ( p > parse_start + 1
14262 && isALPHA_A(*(p - 1))
14263 && *(p - 2) == '\\')
14264 || new_regcurly(p, RExC_end))
14266 RExC_parse = p + 1;
14267 vFAIL("Unescaped left brace in regex is "
14270 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14271 " passed through");
14273 goto normal_default;
14276 if (p > RExC_parse && RExC_strict) {
14277 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14280 default: /* A literal character */
14282 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14284 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14285 &numlen, UTF8_ALLOW_DEFAULT);
14291 } /* End of switch on the literal */
14293 /* Here, have looked at the literal character, and <ender>
14294 * contains its ordinal; <p> points to the character after it.
14298 REQUIRE_UTF8(flagp);
14301 /* We need to check if the next non-ignored thing is a
14302 * quantifier. Move <p> to after anything that should be
14303 * ignored, which, as a side effect, positions <p> for the next
14304 * loop iteration */
14305 skip_to_be_ignored_text(pRExC_state, &p,
14306 FALSE /* Don't force to /x */ );
14308 /* If the next thing is a quantifier, it applies to this
14309 * character only, which means that this character has to be in
14310 * its own node and can't just be appended to the string in an
14311 * existing node, so if there are already other characters in
14312 * the node, close the node with just them, and set up to do
14313 * this character again next time through, when it will be the
14314 * only thing in its new node */
14316 next_is_quantifier = LIKELY(p < RExC_end)
14317 && UNLIKELY(ISMULT2(p));
14319 if (next_is_quantifier && LIKELY(len)) {
14324 /* Ready to add 'ender' to the node */
14326 if (! FOLD) { /* The simple case, just append the literal */
14329 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14330 *(s++) = (char) ender;
14333 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14334 added_len = (char *) new_s - s;
14335 s = (char *) new_s;
14338 requires_utf8_target = TRUE;
14342 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14344 /* Here are folding under /l, and the code point is
14345 * problematic. If this is the first character in the
14346 * node, change the node type to folding. Otherwise, if
14347 * this is the first problematic character, close up the
14348 * existing node, so can start a new node with this one */
14350 node_type = EXACTFL;
14351 RExC_contains_locale = 1;
14353 else if (node_type == EXACT) {
14358 /* This problematic code point means we can't simplify
14360 maybe_exactfu = FALSE;
14362 /* Here, we are adding a problematic fold character.
14363 * "Problematic" in this context means that its fold isn't
14364 * known until runtime. (The non-problematic code points
14365 * are the above-Latin1 ones that fold to also all
14366 * above-Latin1. Their folds don't vary no matter what the
14367 * locale is.) But here we have characters whose fold
14368 * depends on the locale. We just add in the unfolded
14369 * character, and wait until runtime to fold it */
14370 goto not_fold_common;
14372 else /* regular fold; see if actually is in a fold */
14373 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14375 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14377 /* Here, folding, but the character isn't in a fold.
14379 * Start a new node if previous characters in the node were
14381 if (len && node_type != EXACT) {
14386 /* Here, continuing a node with non-folded characters. Add
14388 goto not_fold_common;
14390 else { /* Here, does participate in some fold */
14392 /* If this is the first character in the node, change its
14393 * type to folding. Otherwise, if this is the first
14394 * folding character in the node, close up the existing
14395 * node, so can start a new node with this one. */
14397 node_type = compute_EXACTish(pRExC_state);
14399 else if (node_type == EXACT) {
14404 if (UTF) { /* Use the folded value */
14405 if (UVCHR_IS_INVARIANT(ender)) {
14406 *(s)++ = (U8) toFOLD(ender);
14409 ender = _to_uni_fold_flags(
14413 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
14414 ? FOLD_FLAGS_NOMIX_ASCII
14419 && LIKELY(ender != GREEK_SMALL_LETTER_MU))
14421 /* U+B5 folds to the MU, so its possible for a
14422 * non-UTF-8 target to match it */
14423 requires_utf8_target = TRUE;
14429 /* Here is non-UTF8. First, see if the character's
14430 * fold differs between /d and /u. */
14431 if (PL_fold[ender] != PL_fold_latin1[ender]) {
14432 maybe_exactfu = FALSE;
14435 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14436 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14437 || UNICODE_DOT_DOT_VERSION > 0)
14439 /* On non-ancient Unicode versions, this includes the
14440 * multi-char fold SHARP S to 'ss' */
14442 if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
14443 || ( isALPHA_FOLD_EQ(ender, 's')
14445 && isALPHA_FOLD_EQ(*(s-1), 's')))
14447 /* Here, we have one of the following:
14448 * a) a SHARP S. This folds to 'ss' only under
14449 * /u rules. If we are in that situation,
14450 * fold the SHARP S to 'ss'. See the comments
14451 * for join_exact() as to why we fold this
14452 * non-UTF at compile time, and no others.
14453 * b) 'ss'. When under /u, there's nothing
14454 * special needed to be done here. The
14455 * previous iteration handled the first 's',
14456 * and this iteration will handle the second.
14457 * If, on the otherhand it's not /u, we have
14458 * to exclude the possibility of moving to /u,
14459 * so that we won't generate an unwanted
14460 * match, unless, at runtime, the target
14461 * string is in UTF-8.
14465 maybe_exactfu = FALSE; /* Can't generate an
14466 EXACTFU node (unless we
14467 already are in one) */
14468 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14470 if (node_type == EXACTFU) {
14473 /* Let the code below add in the extra 's' */
14481 else if (UNLIKELY(ender == MICRO_SIGN)) {
14482 has_micro_sign = TRUE;
14485 *(s++) = (DEPENDS_SEMANTICS)
14486 ? (char) toFOLD(ender)
14488 /* Under /u, the fold of any character in
14489 * the 0-255 range happens to be its
14490 * lowercase equivalent, except for LATIN
14491 * SMALL LETTER SHARP S, which was handled
14492 * above, and the MICRO SIGN, whose fold
14493 * requires UTF-8 to represent. */
14494 : (char) toLOWER_L1(ender);
14496 } /* End of adding current character to the node */
14500 if (next_is_quantifier) {
14502 /* Here, the next input is a quantifier, and to get here,
14503 * the current character is the only one in the node. */
14507 } /* End of loop through literal characters */
14509 /* Here we have either exhausted the input or ran out of room in
14510 * the node. (If we encountered a character that can't be in the
14511 * node, transfer is made directly to <loopdone>, and so we
14512 * wouldn't have fallen off the end of the loop.) In the latter
14513 * case, we artificially have to split the node into two, because
14514 * we just don't have enough space to hold everything. This
14515 * creates a problem if the final character participates in a
14516 * multi-character fold in the non-final position, as a match that
14517 * should have occurred won't, due to the way nodes are matched,
14518 * and our artificial boundary. So back off until we find a non-
14519 * problematic character -- one that isn't at the beginning or
14520 * middle of such a fold. (Either it doesn't participate in any
14521 * folds, or appears only in the final position of all the folds it
14522 * does participate in.) A better solution with far fewer false
14523 * positives, and that would fill the nodes more completely, would
14524 * be to actually have available all the multi-character folds to
14525 * test against, and to back-off only far enough to be sure that
14526 * this node isn't ending with a partial one. <upper_parse> is set
14527 * further below (if we need to reparse the node) to include just
14528 * up through that final non-problematic character that this code
14529 * identifies, so when it is set to less than the full node, we can
14530 * skip the rest of this */
14531 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
14532 PERL_UINT_FAST8_T backup_count = 0;
14534 const STRLEN full_len = len;
14536 assert(len >= MAX_NODE_STRING_SIZE);
14538 /* Here, <s> points to just beyond where we have output the
14539 * final character of the node. Look backwards through the
14540 * string until find a non- problematic character */
14544 /* This has no multi-char folds to non-UTF characters */
14545 if (ASCII_FOLD_RESTRICTED) {
14549 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
14556 /* Point to the first byte of the final character */
14557 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
14559 while (s >= s0) { /* Search backwards until find
14560 a non-problematic char */
14561 if (UTF8_IS_INVARIANT(*s)) {
14563 /* There are no ascii characters that participate
14564 * in multi-char folds under /aa. In EBCDIC, the
14565 * non-ascii invariants are all control characters,
14566 * so don't ever participate in any folds. */
14567 if (ASCII_FOLD_RESTRICTED
14568 || ! IS_NON_FINAL_FOLD(*s))
14573 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
14574 if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
14580 else if (! _invlist_contains_cp(
14582 valid_utf8_to_uvchr((U8 *) s, NULL)))
14587 /* Here, the current character is problematic in that
14588 * it does occur in the non-final position of some
14589 * fold, so try the character before it, but have to
14590 * special case the very first byte in the string, so
14591 * we don't read outside the string */
14592 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
14594 } /* End of loop backwards through the string */
14596 /* If there were only problematic characters in the string,
14597 * <s> will point to before s0, in which case the length
14598 * should be 0, otherwise include the length of the
14599 * non-problematic character just found */
14600 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
14603 /* Here, have found the final character, if any, that is
14604 * non-problematic as far as ending the node without splitting
14605 * it across a potential multi-char fold. <len> contains the
14606 * number of bytes in the node up-to and including that
14607 * character, or is 0 if there is no such character, meaning
14608 * the whole node contains only problematic characters. In
14609 * this case, give up and just take the node as-is. We can't
14616 /* Here, the node does contain some characters that aren't
14617 * problematic. If we didn't have to backup any, then the
14618 * final character in the node is non-problematic, and we
14619 * can take the node as-is */
14620 if (backup_count == 0) {
14623 else if (backup_count == 1) {
14625 /* If the final character is problematic, but the
14626 * penultimate is not, back-off that last character to
14627 * later start a new node with it */
14632 /* Here, the final non-problematic character is earlier
14633 * in the input than the penultimate character. What we do
14634 * is reparse from the beginning, going up only as far as
14635 * this final ok one, thus guaranteeing that the node ends
14636 * in an acceptable character. The reason we reparse is
14637 * that we know how far in the character is, but we don't
14638 * know how to correlate its position with the input parse.
14639 * An alternate implementation would be to build that
14640 * correlation as we go along during the original parse,
14641 * but that would entail extra work for every node, whereas
14642 * this code gets executed only when the string is too
14643 * large for the node, and the final two characters are
14644 * problematic, an infrequent occurrence. Yet another
14645 * possible strategy would be to save the tail of the
14646 * string, and the next time regatom is called, initialize
14647 * with that. The problem with this is that unless you
14648 * back off one more character, you won't be guaranteed
14649 * regatom will get called again, unless regbranch,
14650 * regpiece ... are also changed. If you do back off that
14651 * extra character, so that there is input guaranteed to
14652 * force calling regatom, you can't handle the case where
14653 * just the first character in the node is acceptable. I
14654 * (khw) decided to try this method which doesn't have that
14655 * pitfall; if performance issues are found, we can do a
14656 * combination of the current approach plus that one */
14662 } /* End of verifying node ends with an appropriate char */
14664 loopdone: /* Jumped to when encounters something that shouldn't be
14667 /* Free up any over-allocated space; cast is to silence bogus
14668 * warning in MS VC */
14669 change_engine_size(pRExC_state,
14670 - (Ptrdiff_t) (initial_size - STR_SZ(len)));
14672 /* I (khw) don't know if you can get here with zero length, but the
14673 * old code handled this situation by creating a zero-length EXACT
14674 * node. Might as well be NOTHING instead */
14676 OP(REGNODE_p(ret)) = NOTHING;
14680 /* If the node type is EXACT here, check to see if it
14681 * should be EXACTL, or EXACT_ONLY8. */
14682 if (node_type == EXACT) {
14684 node_type = EXACTL;
14686 else if (requires_utf8_target) {
14687 node_type = EXACT_ONLY8;
14690 if ( UNLIKELY(has_micro_sign || has_ss)
14691 && (node_type == EXACTFU || ( node_type == EXACTF
14692 && maybe_exactfu)))
14693 { /* These two conditions are problematic in non-UTF-8
14696 node_type = EXACTFUP;
14698 else if (node_type == EXACTFL) {
14700 /* 'maybe_exactfu' is deliberately set above to
14701 * indicate this node type, where all code points in it
14703 if (maybe_exactfu) {
14704 node_type = EXACTFLU8;
14707 else if (node_type == EXACTF) { /* Means is /di */
14709 /* If 'maybe_exactfu' is clear, then we need to stay
14710 * /di. If it is set, it means there are no code
14711 * points that match differently depending on UTF8ness
14712 * of the target string, so it can become an EXACTFU
14714 if (! maybe_exactfu) {
14715 RExC_seen_d_op = TRUE;
14717 else if ( isALPHA_FOLD_EQ(* STRING(REGNODE_p(ret)), 's')
14718 || isALPHA_FOLD_EQ(ender, 's'))
14720 /* But, if the node begins or ends in an 's' we
14721 * have to defer changing it into an EXACTFU, as
14722 * the node could later get joined with another one
14723 * that ends or begins with 's' creating an 'ss'
14724 * sequence which would then wrongly match the
14725 * sharp s without the target being UTF-8. We
14726 * create a special node that we resolve later when
14727 * we join nodes together */
14729 node_type = EXACTFU_S_EDGE;
14732 node_type = EXACTFU;
14736 if (requires_utf8_target && node_type == EXACTFU) {
14737 node_type = EXACTFU_ONLY8;
14741 OP(REGNODE_p(ret)) = node_type;
14742 STR_LEN(REGNODE_p(ret)) = len;
14743 RExC_emit += STR_SZ(len);
14745 /* If the node isn't a single character, it can't be SIMPLE */
14746 if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
14750 *flagp |= HASWIDTH | maybe_SIMPLE;
14753 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
14757 /* len is STRLEN which is unsigned, need to copy to signed */
14760 vFAIL("Internal disaster");
14763 } /* End of label 'defchar:' */
14765 } /* End of giant switch on input character */
14767 /* Position parse to next real character */
14768 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
14769 FALSE /* Don't force to /x */ );
14770 if ( *RExC_parse == '{'
14771 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
14773 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
14775 vFAIL("Unescaped left brace in regex is illegal here");
14777 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
14778 " passed through");
14786 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
14788 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
14789 * sets up the bitmap and any flags, removing those code points from the
14790 * inversion list, setting it to NULL should it become completely empty */
14794 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
14795 assert(PL_regkind[OP(node)] == ANYOF);
14797 /* There is no bitmap for this node type */
14798 if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
14802 ANYOF_BITMAP_ZERO(node);
14803 if (*invlist_ptr) {
14805 /* This gets set if we actually need to modify things */
14806 bool change_invlist = FALSE;
14810 /* Start looking through *invlist_ptr */
14811 invlist_iterinit(*invlist_ptr);
14812 while (invlist_iternext(*invlist_ptr, &start, &end)) {
14816 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
14817 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
14820 /* Quit if are above what we should change */
14821 if (start >= NUM_ANYOF_CODE_POINTS) {
14825 change_invlist = TRUE;
14827 /* Set all the bits in the range, up to the max that we are doing */
14828 high = (end < NUM_ANYOF_CODE_POINTS - 1)
14830 : NUM_ANYOF_CODE_POINTS - 1;
14831 for (i = start; i <= (int) high; i++) {
14832 if (! ANYOF_BITMAP_TEST(node, i)) {
14833 ANYOF_BITMAP_SET(node, i);
14837 invlist_iterfinish(*invlist_ptr);
14839 /* Done with loop; remove any code points that are in the bitmap from
14840 * *invlist_ptr; similarly for code points above the bitmap if we have
14841 * a flag to match all of them anyways */
14842 if (change_invlist) {
14843 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
14845 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
14846 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
14849 /* If have completely emptied it, remove it completely */
14850 if (_invlist_len(*invlist_ptr) == 0) {
14851 SvREFCNT_dec_NN(*invlist_ptr);
14852 *invlist_ptr = NULL;
14857 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
14858 Character classes ([:foo:]) can also be negated ([:^foo:]).
14859 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
14860 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
14861 but trigger failures because they are currently unimplemented. */
14863 #define POSIXCC_DONE(c) ((c) == ':')
14864 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
14865 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
14866 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
14868 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
14869 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
14870 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
14872 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
14874 /* 'posix_warnings' and 'warn_text' are names of variables in the following
14876 #define ADD_POSIX_WARNING(p, text) STMT_START { \
14877 if (posix_warnings) { \
14878 if (! RExC_warn_text ) RExC_warn_text = \
14879 (AV *) sv_2mortal((SV *) newAV()); \
14880 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
14884 REPORT_LOCATION_ARGS(p))); \
14887 #define CLEAR_POSIX_WARNINGS() \
14889 if (posix_warnings && RExC_warn_text) \
14890 av_clear(RExC_warn_text); \
14893 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
14895 CLEAR_POSIX_WARNINGS(); \
14900 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
14902 const char * const s, /* Where the putative posix class begins.
14903 Normally, this is one past the '['. This
14904 parameter exists so it can be somewhere
14905 besides RExC_parse. */
14906 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
14908 AV ** posix_warnings, /* Where to place any generated warnings, or
14910 const bool check_only /* Don't die if error */
14913 /* This parses what the caller thinks may be one of the three POSIX
14915 * 1) a character class, like [:blank:]
14916 * 2) a collating symbol, like [. .]
14917 * 3) an equivalence class, like [= =]
14918 * In the latter two cases, it croaks if it finds a syntactically legal
14919 * one, as these are not handled by Perl.
14921 * The main purpose is to look for a POSIX character class. It returns:
14922 * a) the class number
14923 * if it is a completely syntactically and semantically legal class.
14924 * 'updated_parse_ptr', if not NULL, is set to point to just after the
14925 * closing ']' of the class
14926 * b) OOB_NAMEDCLASS
14927 * if it appears that one of the three POSIX constructs was meant, but
14928 * its specification was somehow defective. 'updated_parse_ptr', if
14929 * not NULL, is set to point to the character just after the end
14930 * character of the class. See below for handling of warnings.
14931 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
14932 * if it doesn't appear that a POSIX construct was intended.
14933 * 'updated_parse_ptr' is not changed. No warnings nor errors are
14936 * In b) there may be errors or warnings generated. If 'check_only' is
14937 * TRUE, then any errors are discarded. Warnings are returned to the
14938 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
14939 * instead it is NULL, warnings are suppressed.
14941 * The reason for this function, and its complexity is that a bracketed
14942 * character class can contain just about anything. But it's easy to
14943 * mistype the very specific posix class syntax but yielding a valid
14944 * regular bracketed class, so it silently gets compiled into something
14945 * quite unintended.
14947 * The solution adopted here maintains backward compatibility except that
14948 * it adds a warning if it looks like a posix class was intended but
14949 * improperly specified. The warning is not raised unless what is input
14950 * very closely resembles one of the 14 legal posix classes. To do this,
14951 * it uses fuzzy parsing. It calculates how many single-character edits it
14952 * would take to transform what was input into a legal posix class. Only
14953 * if that number is quite small does it think that the intention was a
14954 * posix class. Obviously these are heuristics, and there will be cases
14955 * where it errs on one side or another, and they can be tweaked as
14956 * experience informs.
14958 * The syntax for a legal posix class is:
14960 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
14962 * What this routine considers syntactically to be an intended posix class
14963 * is this (the comments indicate some restrictions that the pattern
14966 * qr/(?x: \[? # The left bracket, possibly
14968 * \h* # possibly followed by blanks
14969 * (?: \^ \h* )? # possibly a misplaced caret
14970 * [:;]? # The opening class character,
14971 * # possibly omitted. A typo
14972 * # semi-colon can also be used.
14974 * \^? # possibly a correctly placed
14975 * # caret, but not if there was also
14976 * # a misplaced one
14978 * .{3,15} # The class name. If there are
14979 * # deviations from the legal syntax,
14980 * # its edit distance must be close
14981 * # to a real class name in order
14982 * # for it to be considered to be
14983 * # an intended posix class.
14985 * [[:punct:]]? # The closing class character,
14986 * # possibly omitted. If not a colon
14987 * # nor semi colon, the class name
14988 * # must be even closer to a valid
14991 * \]? # The right bracket, possibly
14995 * In the above, \h must be ASCII-only.
14997 * These are heuristics, and can be tweaked as field experience dictates.
14998 * There will be cases when someone didn't intend to specify a posix class
14999 * that this warns as being so. The goal is to minimize these, while
15000 * maximizing the catching of things intended to be a posix class that
15001 * aren't parsed as such.
15005 const char * const e = RExC_end;
15006 unsigned complement = 0; /* If to complement the class */
15007 bool found_problem = FALSE; /* Assume OK until proven otherwise */
15008 bool has_opening_bracket = FALSE;
15009 bool has_opening_colon = FALSE;
15010 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
15012 const char * possible_end = NULL; /* used for a 2nd parse pass */
15013 const char* name_start; /* ptr to class name first char */
15015 /* If the number of single-character typos the input name is away from a
15016 * legal name is no more than this number, it is considered to have meant
15017 * the legal name */
15018 int max_distance = 2;
15020 /* to store the name. The size determines the maximum length before we
15021 * decide that no posix class was intended. Should be at least
15022 * sizeof("alphanumeric") */
15024 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15026 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15028 CLEAR_POSIX_WARNINGS();
15031 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15034 if (*(p - 1) != '[') {
15035 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15036 found_problem = TRUE;
15039 has_opening_bracket = TRUE;
15042 /* They could be confused and think you can put spaces between the
15045 found_problem = TRUE;
15049 } while (p < e && isBLANK(*p));
15051 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15054 /* For [. .] and [= =]. These are quite different internally from [: :],
15055 * so they are handled separately. */
15056 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15057 and 1 for at least one char in it
15060 const char open_char = *p;
15061 const char * temp_ptr = p + 1;
15063 /* These two constructs are not handled by perl, and if we find a
15064 * syntactically valid one, we croak. khw, who wrote this code, finds
15065 * this explanation of them very unclear:
15066 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15067 * And searching the rest of the internet wasn't very helpful either.
15068 * It looks like just about any byte can be in these constructs,
15069 * depending on the locale. But unless the pattern is being compiled
15070 * under /l, which is very rare, Perl runs under the C or POSIX locale.
15071 * In that case, it looks like [= =] isn't allowed at all, and that
15072 * [. .] could be any single code point, but for longer strings the
15073 * constituent characters would have to be the ASCII alphabetics plus
15074 * the minus-hyphen. Any sensible locale definition would limit itself
15075 * to these. And any portable one definitely should. Trying to parse
15076 * the general case is a nightmare (see [perl #127604]). So, this code
15077 * looks only for interiors of these constructs that match:
15079 * Using \w relaxes the apparent rules a little, without adding much
15080 * danger of mistaking something else for one of these constructs.
15082 * [. .] in some implementations described on the internet is usable to
15083 * escape a character that otherwise is special in bracketed character
15084 * classes. For example [.].] means a literal right bracket instead of
15085 * the ending of the class
15087 * [= =] can legitimately contain a [. .] construct, but we don't
15088 * handle this case, as that [. .] construct will later get parsed
15089 * itself and croak then. And [= =] is checked for even when not under
15090 * /l, as Perl has long done so.
15092 * The code below relies on there being a trailing NUL, so it doesn't
15093 * have to keep checking if the parse ptr < e.
15095 if (temp_ptr[1] == open_char) {
15098 else while ( temp_ptr < e
15099 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15104 if (*temp_ptr == open_char) {
15106 if (*temp_ptr == ']') {
15108 if (! found_problem && ! check_only) {
15109 RExC_parse = (char *) temp_ptr;
15110 vFAIL3("POSIX syntax [%c %c] is reserved for future "
15111 "extensions", open_char, open_char);
15114 /* Here, the syntax wasn't completely valid, or else the call
15115 * is to check-only */
15116 if (updated_parse_ptr) {
15117 *updated_parse_ptr = (char *) temp_ptr;
15120 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15124 /* If we find something that started out to look like one of these
15125 * constructs, but isn't, we continue below so that it can be checked
15126 * for being a class name with a typo of '.' or '=' instead of a colon.
15130 /* Here, we think there is a possibility that a [: :] class was meant, and
15131 * we have the first real character. It could be they think the '^' comes
15134 found_problem = TRUE;
15135 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15140 found_problem = TRUE;
15144 } while (p < e && isBLANK(*p));
15146 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15150 /* But the first character should be a colon, which they could have easily
15151 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15152 * distinguish from a colon, so treat that as a colon). */
15155 has_opening_colon = TRUE;
15157 else if (*p == ';') {
15158 found_problem = TRUE;
15160 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15161 has_opening_colon = TRUE;
15164 found_problem = TRUE;
15165 ADD_POSIX_WARNING(p, "there must be a starting ':'");
15167 /* Consider an initial punctuation (not one of the recognized ones) to
15168 * be a left terminator */
15169 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15174 /* They may think that you can put spaces between the components */
15176 found_problem = TRUE;
15180 } while (p < e && isBLANK(*p));
15182 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15187 /* We consider something like [^:^alnum:]] to not have been intended to
15188 * be a posix class, but XXX maybe we should */
15190 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15197 /* Again, they may think that you can put spaces between the components */
15199 found_problem = TRUE;
15203 } while (p < e && isBLANK(*p));
15205 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15210 /* XXX This ']' may be a typo, and something else was meant. But
15211 * treating it as such creates enough complications, that that
15212 * possibility isn't currently considered here. So we assume that the
15213 * ']' is what is intended, and if we've already found an initial '[',
15214 * this leaves this construct looking like [:] or [:^], which almost
15215 * certainly weren't intended to be posix classes */
15216 if (has_opening_bracket) {
15217 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15220 /* But this function can be called when we parse the colon for
15221 * something like qr/[alpha:]]/, so we back up to look for the
15226 found_problem = TRUE;
15227 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15229 else if (*p != ':') {
15231 /* XXX We are currently very restrictive here, so this code doesn't
15232 * consider the possibility that, say, /[alpha.]]/ was intended to
15233 * be a posix class. */
15234 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15237 /* Here we have something like 'foo:]'. There was no initial colon,
15238 * and we back up over 'foo. XXX Unlike the going forward case, we
15239 * don't handle typos of non-word chars in the middle */
15240 has_opening_colon = FALSE;
15243 while (p > RExC_start && isWORDCHAR(*p)) {
15248 /* Here, we have positioned ourselves to where we think the first
15249 * character in the potential class is */
15252 /* Now the interior really starts. There are certain key characters that
15253 * can end the interior, or these could just be typos. To catch both
15254 * cases, we may have to do two passes. In the first pass, we keep on
15255 * going unless we come to a sequence that matches
15256 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15257 * This means it takes a sequence to end the pass, so two typos in a row if
15258 * that wasn't what was intended. If the class is perfectly formed, just
15259 * this one pass is needed. We also stop if there are too many characters
15260 * being accumulated, but this number is deliberately set higher than any
15261 * real class. It is set high enough so that someone who thinks that
15262 * 'alphanumeric' is a correct name would get warned that it wasn't.
15263 * While doing the pass, we keep track of where the key characters were in
15264 * it. If we don't find an end to the class, and one of the key characters
15265 * was found, we redo the pass, but stop when we get to that character.
15266 * Thus the key character was considered a typo in the first pass, but a
15267 * terminator in the second. If two key characters are found, we stop at
15268 * the second one in the first pass. Again this can miss two typos, but
15269 * catches a single one
15271 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15272 * point to the first key character. For the second pass, it starts as -1.
15278 bool has_blank = FALSE;
15279 bool has_upper = FALSE;
15280 bool has_terminating_colon = FALSE;
15281 bool has_terminating_bracket = FALSE;
15282 bool has_semi_colon = FALSE;
15283 unsigned int name_len = 0;
15284 int punct_count = 0;
15288 /* Squeeze out blanks when looking up the class name below */
15289 if (isBLANK(*p) ) {
15291 found_problem = TRUE;
15296 /* The name will end with a punctuation */
15298 const char * peek = p + 1;
15300 /* Treat any non-']' punctuation followed by a ']' (possibly
15301 * with intervening blanks) as trying to terminate the class.
15302 * ']]' is very likely to mean a class was intended (but
15303 * missing the colon), but the warning message that gets
15304 * generated shows the error position better if we exit the
15305 * loop at the bottom (eventually), so skip it here. */
15307 if (peek < e && isBLANK(*peek)) {
15309 found_problem = TRUE;
15312 } while (peek < e && isBLANK(*peek));
15315 if (peek < e && *peek == ']') {
15316 has_terminating_bracket = TRUE;
15318 has_terminating_colon = TRUE;
15320 else if (*p == ';') {
15321 has_semi_colon = TRUE;
15322 has_terminating_colon = TRUE;
15325 found_problem = TRUE;
15332 /* Here we have punctuation we thought didn't end the class.
15333 * Keep track of the position of the key characters that are
15334 * more likely to have been class-enders */
15335 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15337 /* Allow just one such possible class-ender not actually
15338 * ending the class. */
15339 if (possible_end) {
15345 /* If we have too many punctuation characters, no use in
15347 if (++punct_count > max_distance) {
15351 /* Treat the punctuation as a typo. */
15352 input_text[name_len++] = *p;
15355 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15356 input_text[name_len++] = toLOWER(*p);
15358 found_problem = TRUE;
15360 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15361 input_text[name_len++] = *p;
15365 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15369 /* The declaration of 'input_text' is how long we allow a potential
15370 * class name to be, before saying they didn't mean a class name at
15372 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15377 /* We get to here when the possible class name hasn't been properly
15378 * terminated before:
15379 * 1) we ran off the end of the pattern; or
15380 * 2) found two characters, each of which might have been intended to
15381 * be the name's terminator
15382 * 3) found so many punctuation characters in the purported name,
15383 * that the edit distance to a valid one is exceeded
15384 * 4) we decided it was more characters than anyone could have
15385 * intended to be one. */
15387 found_problem = TRUE;
15389 /* In the final two cases, we know that looking up what we've
15390 * accumulated won't lead to a match, even a fuzzy one. */
15391 if ( name_len >= C_ARRAY_LENGTH(input_text)
15392 || punct_count > max_distance)
15394 /* If there was an intermediate key character that could have been
15395 * an intended end, redo the parse, but stop there */
15396 if (possible_end && possible_end != (char *) -1) {
15397 possible_end = (char *) -1; /* Special signal value to say
15398 we've done a first pass */
15403 /* Otherwise, it can't have meant to have been a class */
15404 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15407 /* If we ran off the end, and the final character was a punctuation
15408 * one, back up one, to look at that final one just below. Later, we
15409 * will restore the parse pointer if appropriate */
15410 if (name_len && p == e && isPUNCT(*(p-1))) {
15415 if (p < e && isPUNCT(*p)) {
15417 has_terminating_bracket = TRUE;
15419 /* If this is a 2nd ']', and the first one is just below this
15420 * one, consider that to be the real terminator. This gives a
15421 * uniform and better positioning for the warning message */
15423 && possible_end != (char *) -1
15424 && *possible_end == ']'
15425 && name_len && input_text[name_len - 1] == ']')
15430 /* And this is actually equivalent to having done the 2nd
15431 * pass now, so set it to not try again */
15432 possible_end = (char *) -1;
15437 has_terminating_colon = TRUE;
15439 else if (*p == ';') {
15440 has_semi_colon = TRUE;
15441 has_terminating_colon = TRUE;
15449 /* Here, we have a class name to look up. We can short circuit the
15450 * stuff below for short names that can't possibly be meant to be a
15451 * class name. (We can do this on the first pass, as any second pass
15452 * will yield an even shorter name) */
15453 if (name_len < 3) {
15454 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15457 /* Find which class it is. Initially switch on the length of the name.
15459 switch (name_len) {
15461 if (memEQs(name_start, 4, "word")) {
15462 /* this is not POSIX, this is the Perl \w */
15463 class_number = ANYOF_WORDCHAR;
15467 /* Names all of length 5: alnum alpha ascii blank cntrl digit
15468 * graph lower print punct space upper
15469 * Offset 4 gives the best switch position. */
15470 switch (name_start[4]) {
15472 if (memBEGINs(name_start, 5, "alph")) /* alpha */
15473 class_number = ANYOF_ALPHA;
15476 if (memBEGINs(name_start, 5, "spac")) /* space */
15477 class_number = ANYOF_SPACE;
15480 if (memBEGINs(name_start, 5, "grap")) /* graph */
15481 class_number = ANYOF_GRAPH;
15484 if (memBEGINs(name_start, 5, "asci")) /* ascii */
15485 class_number = ANYOF_ASCII;
15488 if (memBEGINs(name_start, 5, "blan")) /* blank */
15489 class_number = ANYOF_BLANK;
15492 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
15493 class_number = ANYOF_CNTRL;
15496 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
15497 class_number = ANYOF_ALPHANUMERIC;
15500 if (memBEGINs(name_start, 5, "lowe")) /* lower */
15501 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
15502 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
15503 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
15506 if (memBEGINs(name_start, 5, "digi")) /* digit */
15507 class_number = ANYOF_DIGIT;
15508 else if (memBEGINs(name_start, 5, "prin")) /* print */
15509 class_number = ANYOF_PRINT;
15510 else if (memBEGINs(name_start, 5, "punc")) /* punct */
15511 class_number = ANYOF_PUNCT;
15516 if (memEQs(name_start, 6, "xdigit"))
15517 class_number = ANYOF_XDIGIT;
15521 /* If the name exactly matches a posix class name the class number will
15522 * here be set to it, and the input almost certainly was meant to be a
15523 * posix class, so we can skip further checking. If instead the syntax
15524 * is exactly correct, but the name isn't one of the legal ones, we
15525 * will return that as an error below. But if neither of these apply,
15526 * it could be that no posix class was intended at all, or that one
15527 * was, but there was a typo. We tease these apart by doing fuzzy
15528 * matching on the name */
15529 if (class_number == OOB_NAMEDCLASS && found_problem) {
15530 const UV posix_names[][6] = {
15531 { 'a', 'l', 'n', 'u', 'm' },
15532 { 'a', 'l', 'p', 'h', 'a' },
15533 { 'a', 's', 'c', 'i', 'i' },
15534 { 'b', 'l', 'a', 'n', 'k' },
15535 { 'c', 'n', 't', 'r', 'l' },
15536 { 'd', 'i', 'g', 'i', 't' },
15537 { 'g', 'r', 'a', 'p', 'h' },
15538 { 'l', 'o', 'w', 'e', 'r' },
15539 { 'p', 'r', 'i', 'n', 't' },
15540 { 'p', 'u', 'n', 'c', 't' },
15541 { 's', 'p', 'a', 'c', 'e' },
15542 { 'u', 'p', 'p', 'e', 'r' },
15543 { 'w', 'o', 'r', 'd' },
15544 { 'x', 'd', 'i', 'g', 'i', 't' }
15546 /* The names of the above all have added NULs to make them the same
15547 * size, so we need to also have the real lengths */
15548 const UV posix_name_lengths[] = {
15549 sizeof("alnum") - 1,
15550 sizeof("alpha") - 1,
15551 sizeof("ascii") - 1,
15552 sizeof("blank") - 1,
15553 sizeof("cntrl") - 1,
15554 sizeof("digit") - 1,
15555 sizeof("graph") - 1,
15556 sizeof("lower") - 1,
15557 sizeof("print") - 1,
15558 sizeof("punct") - 1,
15559 sizeof("space") - 1,
15560 sizeof("upper") - 1,
15561 sizeof("word") - 1,
15562 sizeof("xdigit")- 1
15565 int temp_max = max_distance; /* Use a temporary, so if we
15566 reparse, we haven't changed the
15569 /* Use a smaller max edit distance if we are missing one of the
15571 if ( has_opening_bracket + has_opening_colon < 2
15572 || has_terminating_bracket + has_terminating_colon < 2)
15577 /* See if the input name is close to a legal one */
15578 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
15580 /* Short circuit call if the lengths are too far apart to be
15582 if (abs( (int) (name_len - posix_name_lengths[i]))
15588 if (edit_distance(input_text,
15591 posix_name_lengths[i],
15595 { /* If it is close, it probably was intended to be a class */
15596 goto probably_meant_to_be;
15600 /* Here the input name is not close enough to a valid class name
15601 * for us to consider it to be intended to be a posix class. If
15602 * we haven't already done so, and the parse found a character that
15603 * could have been terminators for the name, but which we absorbed
15604 * as typos during the first pass, repeat the parse, signalling it
15605 * to stop at that character */
15606 if (possible_end && possible_end != (char *) -1) {
15607 possible_end = (char *) -1;
15612 /* Here neither pass found a close-enough class name */
15613 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15616 probably_meant_to_be:
15618 /* Here we think that a posix specification was intended. Update any
15620 if (updated_parse_ptr) {
15621 *updated_parse_ptr = (char *) p;
15624 /* If a posix class name was intended but incorrectly specified, we
15625 * output or return the warnings */
15626 if (found_problem) {
15628 /* We set flags for these issues in the parse loop above instead of
15629 * adding them to the list of warnings, because we can parse it
15630 * twice, and we only want one warning instance */
15632 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
15635 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15637 if (has_semi_colon) {
15638 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15640 else if (! has_terminating_colon) {
15641 ADD_POSIX_WARNING(p, "there is no terminating ':'");
15643 if (! has_terminating_bracket) {
15644 ADD_POSIX_WARNING(p, "there is no terminating ']'");
15647 if ( posix_warnings
15649 && av_top_index(RExC_warn_text) > -1)
15651 *posix_warnings = RExC_warn_text;
15654 else if (class_number != OOB_NAMEDCLASS) {
15655 /* If it is a known class, return the class. The class number
15656 * #defines are structured so each complement is +1 to the normal
15658 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
15660 else if (! check_only) {
15662 /* Here, it is an unrecognized class. This is an error (unless the
15663 * call is to check only, which we've already handled above) */
15664 const char * const complement_string = (complement)
15667 RExC_parse = (char *) p;
15668 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
15670 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
15674 return OOB_NAMEDCLASS;
15676 #undef ADD_POSIX_WARNING
15678 STATIC unsigned int
15679 S_regex_set_precedence(const U8 my_operator) {
15681 /* Returns the precedence in the (?[...]) construct of the input operator,
15682 * specified by its character representation. The precedence follows
15683 * general Perl rules, but it extends this so that ')' and ']' have (low)
15684 * precedence even though they aren't really operators */
15686 switch (my_operator) {
15702 NOT_REACHED; /* NOTREACHED */
15703 return 0; /* Silence compiler warning */
15706 STATIC regnode_offset
15707 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
15708 I32 *flagp, U32 depth,
15709 char * const oregcomp_parse)
15711 /* Handle the (?[...]) construct to do set operations */
15713 U8 curchar; /* Current character being parsed */
15714 UV start, end; /* End points of code point ranges */
15715 SV* final = NULL; /* The end result inversion list */
15716 SV* result_string; /* 'final' stringified */
15717 AV* stack; /* stack of operators and operands not yet
15719 AV* fence_stack = NULL; /* A stack containing the positions in
15720 'stack' of where the undealt-with left
15721 parens would be if they were actually
15723 /* The 'volatile' is a workaround for an optimiser bug
15724 * in Solaris Studio 12.3. See RT #127455 */
15725 volatile IV fence = 0; /* Position of where most recent undealt-
15726 with left paren in stack is; -1 if none.
15728 STRLEN len; /* Temporary */
15729 regnode_offset node; /* Temporary, and final regnode returned by
15731 const bool save_fold = FOLD; /* Temporary */
15732 char *save_end, *save_parse; /* Temporaries */
15733 const bool in_locale = LOC; /* we turn off /l during processing */
15735 GET_RE_DEBUG_FLAGS_DECL;
15737 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
15739 DEBUG_PARSE("xcls");
15742 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
15745 /* The use of this operator implies /u. This is required so that the
15746 * compile time values are valid in all runtime cases */
15747 REQUIRE_UNI_RULES(flagp, 0);
15749 ckWARNexperimental(RExC_parse,
15750 WARN_EXPERIMENTAL__REGEX_SETS,
15751 "The regex_sets feature is experimental");
15753 /* Everything in this construct is a metacharacter. Operands begin with
15754 * either a '\' (for an escape sequence), or a '[' for a bracketed
15755 * character class. Any other character should be an operator, or
15756 * parenthesis for grouping. Both types of operands are handled by calling
15757 * regclass() to parse them. It is called with a parameter to indicate to
15758 * return the computed inversion list. The parsing here is implemented via
15759 * a stack. Each entry on the stack is a single character representing one
15760 * of the operators; or else a pointer to an operand inversion list. */
15762 #define IS_OPERATOR(a) SvIOK(a)
15763 #define IS_OPERAND(a) (! IS_OPERATOR(a))
15765 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
15766 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
15767 * with pronouncing it called it Reverse Polish instead, but now that YOU
15768 * know how to pronounce it you can use the correct term, thus giving due
15769 * credit to the person who invented it, and impressing your geek friends.
15770 * Wikipedia says that the pronounciation of "Ł" has been changing so that
15771 * it is now more like an English initial W (as in wonk) than an L.)
15773 * This means that, for example, 'a | b & c' is stored on the stack as
15781 * where the numbers in brackets give the stack [array] element number.
15782 * In this implementation, parentheses are not stored on the stack.
15783 * Instead a '(' creates a "fence" so that the part of the stack below the
15784 * fence is invisible except to the corresponding ')' (this allows us to
15785 * replace testing for parens, by using instead subtraction of the fence
15786 * position). As new operands are processed they are pushed onto the stack
15787 * (except as noted in the next paragraph). New operators of higher
15788 * precedence than the current final one are inserted on the stack before
15789 * the lhs operand (so that when the rhs is pushed next, everything will be
15790 * in the correct positions shown above. When an operator of equal or
15791 * lower precedence is encountered in parsing, all the stacked operations
15792 * of equal or higher precedence are evaluated, leaving the result as the
15793 * top entry on the stack. This makes higher precedence operations
15794 * evaluate before lower precedence ones, and causes operations of equal
15795 * precedence to left associate.
15797 * The only unary operator '!' is immediately pushed onto the stack when
15798 * encountered. When an operand is encountered, if the top of the stack is
15799 * a '!", the complement is immediately performed, and the '!' popped. The
15800 * resulting value is treated as a new operand, and the logic in the
15801 * previous paragraph is executed. Thus in the expression
15803 * the stack looks like
15809 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
15816 * A ')' is treated as an operator with lower precedence than all the
15817 * aforementioned ones, which causes all operations on the stack above the
15818 * corresponding '(' to be evaluated down to a single resultant operand.
15819 * Then the fence for the '(' is removed, and the operand goes through the
15820 * algorithm above, without the fence.
15822 * A separate stack is kept of the fence positions, so that the position of
15823 * the latest so-far unbalanced '(' is at the top of it.
15825 * The ']' ending the construct is treated as the lowest operator of all,
15826 * so that everything gets evaluated down to a single operand, which is the
15829 sv_2mortal((SV *)(stack = newAV()));
15830 sv_2mortal((SV *)(fence_stack = newAV()));
15832 while (RExC_parse < RExC_end) {
15833 I32 top_index; /* Index of top-most element in 'stack' */
15834 SV** top_ptr; /* Pointer to top 'stack' element */
15835 SV* current = NULL; /* To contain the current inversion list
15837 SV* only_to_avoid_leaks;
15839 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15840 TRUE /* Force /x */ );
15841 if (RExC_parse >= RExC_end) { /* Fail */
15845 curchar = UCHARAT(RExC_parse);
15849 #ifdef ENABLE_REGEX_SETS_DEBUGGING
15850 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
15851 DEBUG_U(dump_regex_sets_structures(pRExC_state,
15852 stack, fence, fence_stack));
15855 top_index = av_tindex_skip_len_mg(stack);
15858 SV** stacked_ptr; /* Ptr to something already on 'stack' */
15859 char stacked_operator; /* The topmost operator on the 'stack'. */
15860 SV* lhs; /* Operand to the left of the operator */
15861 SV* rhs; /* Operand to the right of the operator */
15862 SV* fence_ptr; /* Pointer to top element of the fence
15867 if ( RExC_parse < RExC_end - 2
15868 && UCHARAT(RExC_parse + 1) == '?'
15869 && UCHARAT(RExC_parse + 2) == '^')
15871 /* If is a '(?', could be an embedded '(?^flags:(?[...])'.
15872 * This happens when we have some thing like
15874 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
15876 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
15878 * Here we would be handling the interpolated
15879 * '$thai_or_lao'. We handle this by a recursive call to
15880 * ourselves which returns the inversion list the
15881 * interpolated expression evaluates to. We use the flags
15882 * from the interpolated pattern. */
15883 U32 save_flags = RExC_flags;
15884 const char * save_parse;
15886 RExC_parse += 2; /* Skip past the '(?' */
15887 save_parse = RExC_parse;
15889 /* Parse the flags for the '(?'. We already know the first
15890 * flag to parse is a '^' */
15891 parse_lparen_question_flags(pRExC_state);
15893 if ( RExC_parse >= RExC_end - 4
15894 || UCHARAT(RExC_parse) != ':'
15895 || UCHARAT(++RExC_parse) != '('
15896 || UCHARAT(++RExC_parse) != '?'
15897 || UCHARAT(++RExC_parse) != '[')
15900 /* In combination with the above, this moves the
15901 * pointer to the point just after the first erroneous
15903 if (RExC_parse >= RExC_end - 4) {
15904 RExC_parse = RExC_end;
15906 else if (RExC_parse != save_parse) {
15907 RExC_parse += (UTF)
15908 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
15911 vFAIL("Expecting '(?flags:(?[...'");
15914 /* Recurse, with the meat of the embedded expression */
15916 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
15917 depth+1, oregcomp_parse);
15919 /* Here, 'current' contains the embedded expression's
15920 * inversion list, and RExC_parse points to the trailing
15921 * ']'; the next character should be the ')' */
15923 if (UCHARAT(RExC_parse) != ')')
15924 vFAIL("Expecting close paren for nested extended charclass");
15926 /* Then the ')' matching the original '(' handled by this
15927 * case: statement */
15929 if (UCHARAT(RExC_parse) != ')')
15930 vFAIL("Expecting close paren for wrapper for nested extended charclass");
15932 RExC_flags = save_flags;
15933 goto handle_operand;
15936 /* A regular '('. Look behind for illegal syntax */
15937 if (top_index - fence >= 0) {
15938 /* If the top entry on the stack is an operator, it had
15939 * better be a '!', otherwise the entry below the top
15940 * operand should be an operator */
15941 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
15942 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
15943 || ( IS_OPERAND(*top_ptr)
15944 && ( top_index - fence < 1
15945 || ! (stacked_ptr = av_fetch(stack,
15948 || ! IS_OPERATOR(*stacked_ptr))))
15951 vFAIL("Unexpected '(' with no preceding operator");
15955 /* Stack the position of this undealt-with left paren */
15956 av_push(fence_stack, newSViv(fence));
15957 fence = top_index + 1;
15961 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15962 * multi-char folds are allowed. */
15963 if (!regclass(pRExC_state, flagp, depth+1,
15964 TRUE, /* means parse just the next thing */
15965 FALSE, /* don't allow multi-char folds */
15966 FALSE, /* don't silence non-portable warnings. */
15968 FALSE, /* Require return to be an ANYOF */
15971 goto regclass_failed;
15974 /* regclass() will return with parsing just the \ sequence,
15975 * leaving the parse pointer at the next thing to parse */
15977 goto handle_operand;
15979 case '[': /* Is a bracketed character class */
15981 /* See if this is a [:posix:] class. */
15982 bool is_posix_class = (OOB_NAMEDCLASS
15983 < handle_possible_posix(pRExC_state,
15987 TRUE /* checking only */));
15988 /* If it is a posix class, leave the parse pointer at the '['
15989 * to fool regclass() into thinking it is part of a
15990 * '[[:posix:]]'. */
15991 if (! is_posix_class) {
15995 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
15996 * multi-char folds are allowed. */
15997 if (!regclass(pRExC_state, flagp, depth+1,
15998 is_posix_class, /* parse the whole char
15999 class only if not a
16001 FALSE, /* don't allow multi-char folds */
16002 TRUE, /* silence non-portable warnings. */
16004 FALSE, /* Require return to be an ANYOF */
16007 goto regclass_failed;
16014 /* function call leaves parse pointing to the ']', except if we
16016 if (is_posix_class) {
16020 goto handle_operand;
16024 if (top_index >= 1) {
16025 goto join_operators;
16028 /* Only a single operand on the stack: are done */
16032 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16033 if (UCHARAT(RExC_parse - 1) == ']') {
16037 vFAIL("Unexpected ')'");
16040 /* If nothing after the fence, is missing an operand */
16041 if (top_index - fence < 0) {
16045 /* If at least two things on the stack, treat this as an
16047 if (top_index - fence >= 1) {
16048 goto join_operators;
16051 /* Here only a single thing on the fenced stack, and there is a
16052 * fence. Get rid of it */
16053 fence_ptr = av_pop(fence_stack);
16055 fence = SvIV(fence_ptr);
16056 SvREFCNT_dec_NN(fence_ptr);
16063 /* Having gotten rid of the fence, we pop the operand at the
16064 * stack top and process it as a newly encountered operand */
16065 current = av_pop(stack);
16066 if (IS_OPERAND(current)) {
16067 goto handle_operand;
16079 /* These binary operators should have a left operand already
16081 if ( top_index - fence < 0
16082 || top_index - fence == 1
16083 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16084 || ! IS_OPERAND(*top_ptr))
16086 goto unexpected_binary;
16089 /* If only the one operand is on the part of the stack visible
16090 * to us, we just place this operator in the proper position */
16091 if (top_index - fence < 2) {
16093 /* Place the operator before the operand */
16095 SV* lhs = av_pop(stack);
16096 av_push(stack, newSVuv(curchar));
16097 av_push(stack, lhs);
16101 /* But if there is something else on the stack, we need to
16102 * process it before this new operator if and only if the
16103 * stacked operation has equal or higher precedence than the
16108 /* The operator on the stack is supposed to be below both its
16110 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16111 || IS_OPERAND(*stacked_ptr))
16113 /* But if not, it's legal and indicates we are completely
16114 * done if and only if we're currently processing a ']',
16115 * which should be the final thing in the expression */
16116 if (curchar == ']') {
16122 vFAIL2("Unexpected binary operator '%c' with no "
16123 "preceding operand", curchar);
16125 stacked_operator = (char) SvUV(*stacked_ptr);
16127 if (regex_set_precedence(curchar)
16128 > regex_set_precedence(stacked_operator))
16130 /* Here, the new operator has higher precedence than the
16131 * stacked one. This means we need to add the new one to
16132 * the stack to await its rhs operand (and maybe more
16133 * stuff). We put it before the lhs operand, leaving
16134 * untouched the stacked operator and everything below it
16136 lhs = av_pop(stack);
16137 assert(IS_OPERAND(lhs));
16139 av_push(stack, newSVuv(curchar));
16140 av_push(stack, lhs);
16144 /* Here, the new operator has equal or lower precedence than
16145 * what's already there. This means the operation already
16146 * there should be performed now, before the new one. */
16148 rhs = av_pop(stack);
16149 if (! IS_OPERAND(rhs)) {
16151 /* This can happen when a ! is not followed by an operand,
16152 * like in /(?[\t &!])/ */
16156 lhs = av_pop(stack);
16158 if (! IS_OPERAND(lhs)) {
16160 /* This can happen when there is an empty (), like in
16161 * /(?[[0]+()+])/ */
16165 switch (stacked_operator) {
16167 _invlist_intersection(lhs, rhs, &rhs);
16172 _invlist_union(lhs, rhs, &rhs);
16176 _invlist_subtract(lhs, rhs, &rhs);
16179 case '^': /* The union minus the intersection */
16184 _invlist_union(lhs, rhs, &u);
16185 _invlist_intersection(lhs, rhs, &i);
16186 _invlist_subtract(u, i, &rhs);
16187 SvREFCNT_dec_NN(i);
16188 SvREFCNT_dec_NN(u);
16194 /* Here, the higher precedence operation has been done, and the
16195 * result is in 'rhs'. We overwrite the stacked operator with
16196 * the result. Then we redo this code to either push the new
16197 * operator onto the stack or perform any higher precedence
16198 * stacked operation */
16199 only_to_avoid_leaks = av_pop(stack);
16200 SvREFCNT_dec(only_to_avoid_leaks);
16201 av_push(stack, rhs);
16204 case '!': /* Highest priority, right associative */
16206 /* If what's already at the top of the stack is another '!",
16207 * they just cancel each other out */
16208 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16209 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16211 only_to_avoid_leaks = av_pop(stack);
16212 SvREFCNT_dec(only_to_avoid_leaks);
16214 else { /* Otherwise, since it's right associative, just push
16216 av_push(stack, newSVuv(curchar));
16221 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16222 if (RExC_parse >= RExC_end) {
16225 vFAIL("Unexpected character");
16229 /* Here 'current' is the operand. If something is already on the
16230 * stack, we have to check if it is a !. But first, the code above
16231 * may have altered the stack in the time since we earlier set
16234 top_index = av_tindex_skip_len_mg(stack);
16235 if (top_index - fence >= 0) {
16236 /* If the top entry on the stack is an operator, it had better
16237 * be a '!', otherwise the entry below the top operand should
16238 * be an operator */
16239 top_ptr = av_fetch(stack, top_index, FALSE);
16241 if (IS_OPERATOR(*top_ptr)) {
16243 /* The only permissible operator at the top of the stack is
16244 * '!', which is applied immediately to this operand. */
16245 curchar = (char) SvUV(*top_ptr);
16246 if (curchar != '!') {
16247 SvREFCNT_dec(current);
16248 vFAIL2("Unexpected binary operator '%c' with no "
16249 "preceding operand", curchar);
16252 _invlist_invert(current);
16254 only_to_avoid_leaks = av_pop(stack);
16255 SvREFCNT_dec(only_to_avoid_leaks);
16257 /* And we redo with the inverted operand. This allows
16258 * handling multiple ! in a row */
16259 goto handle_operand;
16261 /* Single operand is ok only for the non-binary ')'
16263 else if ((top_index - fence == 0 && curchar != ')')
16264 || (top_index - fence > 0
16265 && (! (stacked_ptr = av_fetch(stack,
16268 || IS_OPERAND(*stacked_ptr))))
16270 SvREFCNT_dec(current);
16271 vFAIL("Operand with no preceding operator");
16275 /* Here there was nothing on the stack or the top element was
16276 * another operand. Just add this new one */
16277 av_push(stack, current);
16279 } /* End of switch on next parse token */
16281 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16282 } /* End of loop parsing through the construct */
16284 vFAIL("Syntax error in (?[...])");
16288 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16289 if (RExC_parse < RExC_end) {
16293 vFAIL("Unexpected ']' with no following ')' in (?[...");
16296 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16297 vFAIL("Unmatched (");
16300 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16301 || ((final = av_pop(stack)) == NULL)
16302 || ! IS_OPERAND(final)
16303 || ! is_invlist(final)
16304 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16307 SvREFCNT_dec(final);
16308 vFAIL("Incomplete expression within '(?[ ])'");
16311 /* Here, 'final' is the resultant inversion list from evaluating the
16312 * expression. Return it if so requested */
16313 if (return_invlist) {
16314 *return_invlist = final;
16318 /* Otherwise generate a resultant node, based on 'final'. regclass() is
16319 * expecting a string of ranges and individual code points */
16320 invlist_iterinit(final);
16321 result_string = newSVpvs("");
16322 while (invlist_iternext(final, &start, &end)) {
16323 if (start == end) {
16324 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16327 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}",
16332 /* About to generate an ANYOF (or similar) node from the inversion list we
16333 * have calculated */
16334 save_parse = RExC_parse;
16335 RExC_parse = SvPV(result_string, len);
16336 save_end = RExC_end;
16337 RExC_end = RExC_parse + len;
16338 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16340 /* We turn off folding around the call, as the class we have constructed
16341 * already has all folding taken into consideration, and we don't want
16342 * regclass() to add to that */
16343 RExC_flags &= ~RXf_PMf_FOLD;
16344 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16345 * folds are allowed. */
16346 node = regclass(pRExC_state, flagp, depth+1,
16347 FALSE, /* means parse the whole char class */
16348 FALSE, /* don't allow multi-char folds */
16349 TRUE, /* silence non-portable warnings. The above may very
16350 well have generated non-portable code points, but
16351 they're valid on this machine */
16352 FALSE, /* similarly, no need for strict */
16353 FALSE, /* Require return to be an ANYOF */
16358 RExC_parse = save_parse + 1;
16359 RExC_end = save_end;
16360 SvREFCNT_dec_NN(final);
16361 SvREFCNT_dec_NN(result_string);
16364 RExC_flags |= RXf_PMf_FOLD;
16368 goto regclass_failed;
16370 /* Fix up the node type if we are in locale. (We have pretended we are
16371 * under /u for the purposes of regclass(), as this construct will only
16372 * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
16373 * as to cause any warnings about bad locales to be output in regexec.c),
16374 * and add the flag that indicates to check if not in a UTF-8 locale. The
16375 * reason we above forbid optimization into something other than an ANYOF
16376 * node is simply to minimize the number of code changes in regexec.c.
16377 * Otherwise we would have to create new EXACTish node types and deal with
16378 * them. This decision could be revisited should this construct become
16381 * (One might think we could look at the resulting ANYOF node and suppress
16382 * the flag if everything is above 255, as those would be UTF-8 only,
16383 * but this isn't true, as the components that led to that result could
16384 * have been locale-affected, and just happen to cancel each other out
16385 * under UTF-8 locales.) */
16387 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16389 assert(OP(REGNODE_p(node)) == ANYOF);
16391 OP(REGNODE_p(node)) = ANYOFL;
16392 ANYOF_FLAGS(REGNODE_p(node))
16393 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16396 nextchar(pRExC_state);
16397 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16401 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
16405 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16408 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
16409 AV * stack, const IV fence, AV * fence_stack)
16410 { /* Dumps the stacks in handle_regex_sets() */
16412 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
16413 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
16416 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
16418 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
16420 if (stack_top < 0) {
16421 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
16424 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
16425 for (i = stack_top; i >= 0; i--) {
16426 SV ** element_ptr = av_fetch(stack, i, FALSE);
16427 if (! element_ptr) {
16430 if (IS_OPERATOR(*element_ptr)) {
16431 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
16432 (int) i, (int) SvIV(*element_ptr));
16435 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
16436 sv_dump(*element_ptr);
16441 if (fence_stack_top < 0) {
16442 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
16445 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
16446 for (i = fence_stack_top; i >= 0; i--) {
16447 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
16448 if (! element_ptr) {
16451 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
16452 (int) i, (int) SvIV(*element_ptr));
16463 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
16465 /* This adds the Latin1/above-Latin1 folding rules.
16467 * This should be called only for a Latin1-range code points, cp, which is
16468 * known to be involved in a simple fold with other code points above
16469 * Latin1. It would give false results if /aa has been specified.
16470 * Multi-char folds are outside the scope of this, and must be handled
16473 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
16475 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
16477 /* The rules that are valid for all Unicode versions are hard-coded in */
16482 add_cp_to_invlist(*invlist, KELVIN_SIGN);
16486 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
16489 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
16490 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
16492 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
16493 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
16494 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
16496 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
16497 *invlist = add_cp_to_invlist(*invlist,
16498 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
16501 default: /* Other code points are checked against the data for the
16502 current Unicode version */
16504 Size_t folds_count;
16505 unsigned int first_fold;
16506 const unsigned int * remaining_folds;
16510 folded_cp = toFOLD(cp);
16513 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
16515 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
16518 if (folded_cp > 255) {
16519 *invlist = add_cp_to_invlist(*invlist, folded_cp);
16522 folds_count = _inverse_folds(folded_cp, &first_fold,
16524 if (folds_count == 0) {
16526 /* Use deprecated warning to increase the chances of this being
16528 ckWARN2reg_d(RExC_parse,
16529 "Perl folding rules are not up-to-date for 0x%02X;"
16530 " please use the perlbug utility to report;", cp);
16535 if (first_fold > 255) {
16536 *invlist = add_cp_to_invlist(*invlist, first_fold);
16538 for (i = 0; i < folds_count - 1; i++) {
16539 if (remaining_folds[i] > 255) {
16540 *invlist = add_cp_to_invlist(*invlist,
16541 remaining_folds[i]);
16551 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
16553 /* Output the elements of the array given by '*posix_warnings' as REGEXP
16557 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
16559 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
16561 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
16565 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
16566 if (first_is_fatal) { /* Avoid leaking this */
16567 av_undef(posix_warnings); /* This isn't necessary if the
16568 array is mortal, but is a
16570 (void) sv_2mortal(msg);
16573 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
16574 SvREFCNT_dec_NN(msg);
16577 UPDATE_WARNINGS_LOC(RExC_parse);
16581 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
16583 /* This adds the string scalar <multi_string> to the array
16584 * <multi_char_matches>. <multi_string> is known to have exactly
16585 * <cp_count> code points in it. This is used when constructing a
16586 * bracketed character class and we find something that needs to match more
16587 * than a single character.
16589 * <multi_char_matches> is actually an array of arrays. Each top-level
16590 * element is an array that contains all the strings known so far that are
16591 * the same length. And that length (in number of code points) is the same
16592 * as the index of the top-level array. Hence, the [2] element is an
16593 * array, each element thereof is a string containing TWO code points;
16594 * while element [3] is for strings of THREE characters, and so on. Since
16595 * this is for multi-char strings there can never be a [0] nor [1] element.
16597 * When we rewrite the character class below, we will do so such that the
16598 * longest strings are written first, so that it prefers the longest
16599 * matching strings first. This is done even if it turns out that any
16600 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
16601 * Christiansen has agreed that this is ok. This makes the test for the
16602 * ligature 'ffi' come before the test for 'ff', for example */
16605 AV** this_array_ptr;
16607 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
16609 if (! multi_char_matches) {
16610 multi_char_matches = newAV();
16613 if (av_exists(multi_char_matches, cp_count)) {
16614 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
16615 this_array = *this_array_ptr;
16618 this_array = newAV();
16619 av_store(multi_char_matches, cp_count,
16622 av_push(this_array, multi_string);
16624 return multi_char_matches;
16627 /* The names of properties whose definitions are not known at compile time are
16628 * stored in this SV, after a constant heading. So if the length has been
16629 * changed since initialization, then there is a run-time definition. */
16630 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
16631 (SvCUR(listsv) != initial_listsv_len)
16633 /* There is a restricted set of white space characters that are legal when
16634 * ignoring white space in a bracketed character class. This generates the
16635 * code to skip them.
16637 * There is a line below that uses the same white space criteria but is outside
16638 * this macro. Both here and there must use the same definition */
16639 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
16642 while (isBLANK_A(UCHARAT(p))) \
16649 STATIC regnode_offset
16650 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
16651 const bool stop_at_1, /* Just parse the next thing, don't
16652 look for a full character class */
16653 bool allow_mutiple_chars,
16654 const bool silence_non_portable, /* Don't output warnings
16658 bool optimizable, /* ? Allow a non-ANYOF return
16660 SV** ret_invlist /* Return an inversion list, not a node */
16663 /* parse a bracketed class specification. Most of these will produce an
16664 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
16665 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
16666 * under /i with multi-character folds: it will be rewritten following the
16667 * paradigm of this example, where the <multi-fold>s are characters which
16668 * fold to multiple character sequences:
16669 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
16670 * gets effectively rewritten as:
16671 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
16672 * reg() gets called (recursively) on the rewritten version, and this
16673 * function will return what it constructs. (Actually the <multi-fold>s
16674 * aren't physically removed from the [abcdefghi], it's just that they are
16675 * ignored in the recursion by means of a flag:
16676 * <RExC_in_multi_char_class>.)
16678 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
16679 * characters, with the corresponding bit set if that character is in the
16680 * list. For characters above this, an inversion list is used. There
16681 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
16682 * determinable at compile time
16684 * On success, returns the offset at which any next node should be placed
16685 * into the regex engine program being compiled.
16687 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
16688 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
16693 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
16695 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
16696 regnode_offset ret = -1; /* Initialized to an illegal value */
16698 int namedclass = OOB_NAMEDCLASS;
16699 char *rangebegin = NULL;
16700 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
16701 aren't available at the time this was called */
16702 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
16703 than just initialized. */
16704 SV* properties = NULL; /* Code points that match \p{} \P{} */
16705 SV* posixes = NULL; /* Code points that match classes like [:word:],
16706 extended beyond the Latin1 range. These have to
16707 be kept separate from other code points for much
16708 of this function because their handling is
16709 different under /i, and for most classes under
16711 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
16712 separate for a while from the non-complemented
16713 versions because of complications with /d
16715 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
16716 treated more simply than the general case,
16717 leading to less compilation and execution
16719 UV element_count = 0; /* Number of distinct elements in the class.
16720 Optimizations may be possible if this is tiny */
16721 AV * multi_char_matches = NULL; /* Code points that fold to more than one
16722 character; used under /i */
16724 char * stop_ptr = RExC_end; /* where to stop parsing */
16726 /* ignore unescaped whitespace? */
16727 const bool skip_white = cBOOL( ret_invlist
16728 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
16730 /* inversion list of code points this node matches only when the target
16731 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
16733 SV* upper_latin1_only_utf8_matches = NULL;
16735 /* Inversion list of code points this node matches regardless of things
16736 * like locale, folding, utf8ness of the target string */
16737 SV* cp_list = NULL;
16739 /* Like cp_list, but code points on this list need to be checked for things
16740 * that fold to/from them under /i */
16741 SV* cp_foldable_list = NULL;
16743 /* Like cp_list, but code points on this list are valid only when the
16744 * runtime locale is UTF-8 */
16745 SV* only_utf8_locale_list = NULL;
16747 /* In a range, if one of the endpoints is non-character-set portable,
16748 * meaning that it hard-codes a code point that may mean a different
16749 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
16750 * mnemonic '\t' which each mean the same character no matter which
16751 * character set the platform is on. */
16752 unsigned int non_portable_endpoint = 0;
16754 /* Is the range unicode? which means on a platform that isn't 1-1 native
16755 * to Unicode (i.e. non-ASCII), each code point in it should be considered
16756 * to be a Unicode value. */
16757 bool unicode_range = FALSE;
16758 bool invert = FALSE; /* Is this class to be complemented */
16760 bool warn_super = ALWAYS_WARN_SUPER;
16762 const char * orig_parse = RExC_parse;
16764 /* This variable is used to mark where the end in the input is of something
16765 * that looks like a POSIX construct but isn't. During the parse, when
16766 * something looks like it could be such a construct is encountered, it is
16767 * checked for being one, but not if we've already checked this area of the
16768 * input. Only after this position is reached do we check again */
16769 char *not_posix_region_end = RExC_parse - 1;
16771 AV* posix_warnings = NULL;
16772 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
16773 U8 op = END; /* The returned node-type, initialized to an impossible
16775 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
16776 U32 posixl = 0; /* bit field of posix classes matched under /l */
16779 /* Flags as to what things aren't knowable until runtime. (Note that these are
16780 * mutually exclusive.) */
16781 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
16782 haven't been defined as of yet */
16783 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
16785 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
16786 what gets folded */
16787 U32 has_runtime_dependency = 0; /* OR of the above flags */
16789 GET_RE_DEBUG_FLAGS_DECL;
16791 PERL_ARGS_ASSERT_REGCLASS;
16793 PERL_UNUSED_ARG(depth);
16797 /* If wants an inversion list returned, we can't optimize to something
16800 optimizable = FALSE;
16803 DEBUG_PARSE("clas");
16805 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
16806 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
16807 && UNICODE_DOT_DOT_VERSION == 0)
16808 allow_mutiple_chars = FALSE;
16811 /* We include the /i status at the beginning of this so that we can
16812 * know it at runtime */
16813 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
16814 initial_listsv_len = SvCUR(listsv);
16815 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
16817 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16819 assert(RExC_parse <= RExC_end);
16821 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
16824 allow_mutiple_chars = FALSE;
16826 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16829 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
16830 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
16831 int maybe_class = handle_possible_posix(pRExC_state,
16833 ¬_posix_region_end,
16835 TRUE /* checking only */);
16836 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
16837 ckWARN4reg(not_posix_region_end,
16838 "POSIX syntax [%c %c] belongs inside character classes%s",
16839 *RExC_parse, *RExC_parse,
16840 (maybe_class == OOB_NAMEDCLASS)
16841 ? ((POSIXCC_NOTYET(*RExC_parse))
16842 ? " (but this one isn't implemented)"
16843 : " (but this one isn't fully valid)")
16849 /* If the caller wants us to just parse a single element, accomplish this
16850 * by faking the loop ending condition */
16851 if (stop_at_1 && RExC_end > RExC_parse) {
16852 stop_ptr = RExC_parse + 1;
16855 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
16856 if (UCHARAT(RExC_parse) == ']')
16857 goto charclassloop;
16861 if ( posix_warnings
16862 && av_tindex_skip_len_mg(posix_warnings) >= 0
16863 && RExC_parse > not_posix_region_end)
16865 /* Warnings about posix class issues are considered tentative until
16866 * we are far enough along in the parse that we can no longer
16867 * change our mind, at which point we output them. This is done
16868 * each time through the loop so that a later class won't zap them
16869 * before they have been dealt with. */
16870 output_posix_warnings(pRExC_state, posix_warnings);
16873 if (RExC_parse >= stop_ptr) {
16877 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
16879 if (UCHARAT(RExC_parse) == ']') {
16885 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
16886 save_value = value;
16887 save_prevvalue = prevvalue;
16890 rangebegin = RExC_parse;
16892 non_portable_endpoint = 0;
16894 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
16895 value = utf8n_to_uvchr((U8*)RExC_parse,
16896 RExC_end - RExC_parse,
16897 &numlen, UTF8_ALLOW_DEFAULT);
16898 RExC_parse += numlen;
16901 value = UCHARAT(RExC_parse++);
16903 if (value == '[') {
16904 char * posix_class_end;
16905 namedclass = handle_possible_posix(pRExC_state,
16908 do_posix_warnings ? &posix_warnings : NULL,
16909 FALSE /* die if error */);
16910 if (namedclass > OOB_NAMEDCLASS) {
16912 /* If there was an earlier attempt to parse this particular
16913 * posix class, and it failed, it was a false alarm, as this
16914 * successful one proves */
16915 if ( posix_warnings
16916 && av_tindex_skip_len_mg(posix_warnings) >= 0
16917 && not_posix_region_end >= RExC_parse
16918 && not_posix_region_end <= posix_class_end)
16920 av_undef(posix_warnings);
16923 RExC_parse = posix_class_end;
16925 else if (namedclass == OOB_NAMEDCLASS) {
16926 not_posix_region_end = posix_class_end;
16929 namedclass = OOB_NAMEDCLASS;
16932 else if ( RExC_parse - 1 > not_posix_region_end
16933 && MAYBE_POSIXCC(value))
16935 (void) handle_possible_posix(
16937 RExC_parse - 1, /* -1 because parse has already been
16939 ¬_posix_region_end,
16940 do_posix_warnings ? &posix_warnings : NULL,
16941 TRUE /* checking only */);
16943 else if ( strict && ! skip_white
16944 && ( _generic_isCC(value, _CC_VERTSPACE)
16945 || is_VERTWS_cp_high(value)))
16947 vFAIL("Literal vertical space in [] is illegal except under /x");
16949 else if (value == '\\') {
16950 /* Is a backslash; get the code point of the char after it */
16952 if (RExC_parse >= RExC_end) {
16953 vFAIL("Unmatched [");
16956 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
16957 value = utf8n_to_uvchr((U8*)RExC_parse,
16958 RExC_end - RExC_parse,
16959 &numlen, UTF8_ALLOW_DEFAULT);
16960 RExC_parse += numlen;
16963 value = UCHARAT(RExC_parse++);
16965 /* Some compilers cannot handle switching on 64-bit integer
16966 * values, therefore value cannot be an UV. Yes, this will
16967 * be a problem later if we want switch on Unicode.
16968 * A similar issue a little bit later when switching on
16969 * namedclass. --jhi */
16971 /* If the \ is escaping white space when white space is being
16972 * skipped, it means that that white space is wanted literally, and
16973 * is already in 'value'. Otherwise, need to translate the escape
16974 * into what it signifies. */
16975 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
16977 case 'w': namedclass = ANYOF_WORDCHAR; break;
16978 case 'W': namedclass = ANYOF_NWORDCHAR; break;
16979 case 's': namedclass = ANYOF_SPACE; break;
16980 case 'S': namedclass = ANYOF_NSPACE; break;
16981 case 'd': namedclass = ANYOF_DIGIT; break;
16982 case 'D': namedclass = ANYOF_NDIGIT; break;
16983 case 'v': namedclass = ANYOF_VERTWS; break;
16984 case 'V': namedclass = ANYOF_NVERTWS; break;
16985 case 'h': namedclass = ANYOF_HORIZWS; break;
16986 case 'H': namedclass = ANYOF_NHORIZWS; break;
16987 case 'N': /* Handle \N{NAME} in class */
16989 const char * const backslash_N_beg = RExC_parse - 2;
16992 if (! grok_bslash_N(pRExC_state,
16993 NULL, /* No regnode */
16994 &value, /* Yes single value */
16995 &cp_count, /* Multiple code pt count */
17001 if (*flagp & NEED_UTF8)
17002 FAIL("panic: grok_bslash_N set NEED_UTF8");
17004 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17006 if (cp_count < 0) {
17007 vFAIL("\\N in a character class must be a named character: \\N{...}");
17009 else if (cp_count == 0) {
17010 ckWARNreg(RExC_parse,
17011 "Ignoring zero length \\N{} in character class");
17013 else { /* cp_count > 1 */
17014 assert(cp_count > 1);
17015 if (! RExC_in_multi_char_class) {
17016 if ( ! allow_mutiple_chars
17019 || *RExC_parse == '-')
17023 vFAIL("\\N{} here is restricted to one character");
17025 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17026 break; /* <value> contains the first code
17027 point. Drop out of the switch to
17031 SV * multi_char_N = newSVpvn(backslash_N_beg,
17032 RExC_parse - backslash_N_beg);
17034 = add_multi_match(multi_char_matches,
17039 } /* End of cp_count != 1 */
17041 /* This element should not be processed further in this
17044 value = save_value;
17045 prevvalue = save_prevvalue;
17046 continue; /* Back to top of loop to get next char */
17049 /* Here, is a single code point, and <value> contains it */
17050 unicode_range = TRUE; /* \N{} are Unicode */
17058 /* \p means they want Unicode semantics */
17059 REQUIRE_UNI_RULES(flagp, 0);
17061 if (RExC_parse >= RExC_end)
17062 vFAIL2("Empty \\%c", (U8)value);
17063 if (*RExC_parse == '{') {
17064 const U8 c = (U8)value;
17065 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17068 vFAIL2("Missing right brace on \\%c{}", c);
17073 /* White space is allowed adjacent to the braces and after
17074 * any '^', even when not under /x */
17075 while (isSPACE(*RExC_parse)) {
17079 if (UCHARAT(RExC_parse) == '^') {
17081 /* toggle. (The rhs xor gets the single bit that
17082 * differs between P and p; the other xor inverts just
17084 value ^= 'P' ^ 'p';
17087 while (isSPACE(*RExC_parse)) {
17092 if (e == RExC_parse)
17093 vFAIL2("Empty \\%c{}", c);
17095 n = e - RExC_parse;
17096 while (isSPACE(*(RExC_parse + n - 1)))
17099 } /* The \p isn't immediately followed by a '{' */
17100 else if (! isALPHA(*RExC_parse)) {
17101 RExC_parse += (UTF)
17102 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17104 vFAIL2("Character following \\%c must be '{' or a "
17105 "single-character Unicode property name",
17113 char* name = RExC_parse;
17115 /* Any message returned about expanding the definition */
17116 SV* msg = newSVpvs_flags("", SVs_TEMP);
17118 /* If set TRUE, the property is user-defined as opposed to
17119 * official Unicode */
17120 bool user_defined = FALSE;
17122 SV * prop_definition = parse_uniprop_string(
17123 name, n, UTF, FOLD,
17124 FALSE, /* This is compile-time */
17126 /* We can't defer this defn when
17127 * the full result is required in
17129 ! cBOOL(ret_invlist),
17135 if (SvCUR(msg)) { /* Assumes any error causes a msg */
17136 assert(prop_definition == NULL);
17137 RExC_parse = e + 1;
17138 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
17139 thing so, or else the display is
17143 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17144 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17145 SvCUR(msg), SvPVX(msg)));
17148 if (! is_invlist(prop_definition)) {
17150 /* Here, the definition isn't known, so we have gotten
17151 * returned a string that will be evaluated if and when
17152 * encountered at runtime. We add it to the list of
17153 * such properties, along with whether it should be
17154 * complemented or not */
17155 if (value == 'P') {
17156 sv_catpvs(listsv, "!");
17159 sv_catpvs(listsv, "+");
17161 sv_catsv(listsv, prop_definition);
17163 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17165 /* We don't know yet what this matches, so have to flag
17167 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17170 assert (prop_definition && is_invlist(prop_definition));
17172 /* Here we do have the complete property definition
17174 * Temporary workaround for [perl #133136]. For this
17175 * precise input that is in the .t that is failing,
17176 * load utf8.pm, which is what the test wants, so that
17177 * that .t passes */
17178 if ( memEQs(RExC_start, e + 1 - RExC_start,
17180 && ! hv_common(GvHVn(PL_incgv),
17182 "utf8.pm", sizeof("utf8.pm") - 1,
17183 0, HV_FETCH_ISEXISTS, NULL, 0))
17185 require_pv("utf8.pm");
17188 if (! user_defined &&
17189 /* We warn on matching an above-Unicode code point
17190 * if the match would return true, except don't
17191 * warn for \p{All}, which has exactly one element
17193 (_invlist_contains_cp(prop_definition, 0x110000)
17194 && (! (_invlist_len(prop_definition) == 1
17195 && *invlist_array(prop_definition) == 0))))
17200 /* Invert if asking for the complement */
17201 if (value == 'P') {
17202 _invlist_union_complement_2nd(properties,
17207 _invlist_union(properties, prop_definition, &properties);
17212 RExC_parse = e + 1;
17213 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17217 case 'n': value = '\n'; break;
17218 case 'r': value = '\r'; break;
17219 case 't': value = '\t'; break;
17220 case 'f': value = '\f'; break;
17221 case 'b': value = '\b'; break;
17222 case 'e': value = ESC_NATIVE; break;
17223 case 'a': value = '\a'; break;
17225 RExC_parse--; /* function expects to be pointed at the 'o' */
17227 const char* error_msg;
17228 bool valid = grok_bslash_o(&RExC_parse,
17232 TO_OUTPUT_WARNINGS(RExC_parse),
17234 silence_non_portable,
17239 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17241 non_portable_endpoint++;
17244 RExC_parse--; /* function expects to be pointed at the 'x' */
17246 const char* error_msg;
17247 bool valid = grok_bslash_x(&RExC_parse,
17251 TO_OUTPUT_WARNINGS(RExC_parse),
17253 silence_non_portable,
17258 UPDATE_WARNINGS_LOC(RExC_parse - 1);
17260 non_portable_endpoint++;
17263 value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
17264 UPDATE_WARNINGS_LOC(RExC_parse);
17266 non_portable_endpoint++;
17268 case '0': case '1': case '2': case '3': case '4':
17269 case '5': case '6': case '7':
17271 /* Take 1-3 octal digits */
17272 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
17273 numlen = (strict) ? 4 : 3;
17274 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17275 RExC_parse += numlen;
17278 RExC_parse += (UTF)
17279 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17281 vFAIL("Need exactly 3 octal digits");
17283 else if ( numlen < 3 /* like \08, \178 */
17284 && RExC_parse < RExC_end
17285 && isDIGIT(*RExC_parse)
17286 && ckWARN(WARN_REGEXP))
17288 reg_warn_non_literal_string(
17290 form_short_octal_warning(RExC_parse, numlen));
17293 non_portable_endpoint++;
17297 /* Allow \_ to not give an error */
17298 if (isWORDCHAR(value) && value != '_') {
17300 vFAIL2("Unrecognized escape \\%c in character class",
17304 ckWARN2reg(RExC_parse,
17305 "Unrecognized escape \\%c in character class passed through",
17310 } /* End of switch on char following backslash */
17311 } /* end of handling backslash escape sequences */
17313 /* Here, we have the current token in 'value' */
17315 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
17318 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
17319 * literal, as is the character that began the false range, i.e.
17320 * the 'a' in the examples */
17322 const int w = (RExC_parse >= rangebegin)
17323 ? RExC_parse - rangebegin
17327 "False [] range \"%" UTF8f "\"",
17328 UTF8fARG(UTF, w, rangebegin));
17331 ckWARN2reg(RExC_parse,
17332 "False [] range \"%" UTF8f "\"",
17333 UTF8fARG(UTF, w, rangebegin));
17334 cp_list = add_cp_to_invlist(cp_list, '-');
17335 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
17339 range = 0; /* this was not a true range */
17340 element_count += 2; /* So counts for three values */
17343 classnum = namedclass_to_classnum(namedclass);
17345 if (LOC && namedclass < ANYOF_POSIXL_MAX
17346 #ifndef HAS_ISASCII
17347 && classnum != _CC_ASCII
17350 SV* scratch_list = NULL;
17352 /* What the Posix classes (like \w, [:space:]) match isn't
17353 * generally knowable under locale until actual match time. A
17354 * special node is used for these which has extra space for a
17355 * bitmap, with a bit reserved for each named class that is to
17356 * be matched against. (This isn't needed for \p{} and
17357 * pseudo-classes, as they are not affected by locale, and
17358 * hence are dealt with separately.) However, if a named class
17359 * and its complement are both present, then it matches
17360 * everything, and there is no runtime dependency. Odd numbers
17361 * are the complements of the next lower number, so xor works.
17362 * (Note that something like [\w\D] should match everything,
17363 * because \d should be a proper subset of \w. But rather than
17364 * trust that the locale is well behaved, we leave this to
17365 * runtime to sort out) */
17366 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
17367 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
17368 POSIXL_ZERO(posixl);
17369 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
17370 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
17371 continue; /* We could ignore the rest of the class, but
17372 best to parse it for any errors */
17374 else { /* Here, isn't the complement of any already parsed
17376 POSIXL_SET(posixl, namedclass);
17377 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
17378 anyof_flags |= ANYOF_MATCHES_POSIXL;
17380 /* The above-Latin1 characters are not subject to locale
17381 * rules. Just add them to the unconditionally-matched
17384 /* Get the list of the above-Latin1 code points this
17386 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
17387 PL_XPosix_ptrs[classnum],
17389 /* Odd numbers are complements,
17390 * like NDIGIT, NASCII, ... */
17391 namedclass % 2 != 0,
17393 /* Checking if 'cp_list' is NULL first saves an extra
17394 * clone. Its reference count will be decremented at the
17395 * next union, etc, or if this is the only instance, at the
17396 * end of the routine */
17398 cp_list = scratch_list;
17401 _invlist_union(cp_list, scratch_list, &cp_list);
17402 SvREFCNT_dec_NN(scratch_list);
17404 continue; /* Go get next character */
17409 /* Here, is not /l, or is a POSIX class for which /l doesn't
17410 * matter (or is a Unicode property, which is skipped here). */
17411 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
17412 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
17414 /* Here, should be \h, \H, \v, or \V. None of /d, /i
17415 * nor /l make a difference in what these match,
17416 * therefore we just add what they match to cp_list. */
17417 if (classnum != _CC_VERTSPACE) {
17418 assert( namedclass == ANYOF_HORIZWS
17419 || namedclass == ANYOF_NHORIZWS);
17421 /* It turns out that \h is just a synonym for
17423 classnum = _CC_BLANK;
17426 _invlist_union_maybe_complement_2nd(
17428 PL_XPosix_ptrs[classnum],
17429 namedclass % 2 != 0, /* Complement if odd
17430 (NHORIZWS, NVERTWS)
17435 else if ( AT_LEAST_UNI_SEMANTICS
17436 || classnum == _CC_ASCII
17437 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
17438 || classnum == _CC_XDIGIT)))
17440 /* We usually have to worry about /d affecting what POSIX
17441 * classes match, with special code needed because we won't
17442 * know until runtime what all matches. But there is no
17443 * extra work needed under /u and /a; and [:ascii:] is
17444 * unaffected by /d; and :digit: and :xdigit: don't have
17445 * runtime differences under /d. So we can special case
17446 * these, and avoid some extra work below, and at runtime.
17448 _invlist_union_maybe_complement_2nd(
17450 ((AT_LEAST_ASCII_RESTRICTED)
17451 ? PL_Posix_ptrs[classnum]
17452 : PL_XPosix_ptrs[classnum]),
17453 namedclass % 2 != 0,
17456 else { /* Garden variety class. If is NUPPER, NALPHA, ...
17457 complement and use nposixes */
17458 SV** posixes_ptr = namedclass % 2 == 0
17461 _invlist_union_maybe_complement_2nd(
17463 PL_XPosix_ptrs[classnum],
17464 namedclass % 2 != 0,
17468 } /* end of namedclass \blah */
17470 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
17472 /* If 'range' is set, 'value' is the ending of a range--check its
17473 * validity. (If value isn't a single code point in the case of a
17474 * range, we should have figured that out above in the code that
17475 * catches false ranges). Later, we will handle each individual code
17476 * point in the range. If 'range' isn't set, this could be the
17477 * beginning of a range, so check for that by looking ahead to see if
17478 * the next real character to be processed is the range indicator--the
17483 /* For unicode ranges, we have to test that the Unicode as opposed
17484 * to the native values are not decreasing. (Above 255, there is
17485 * no difference between native and Unicode) */
17486 if (unicode_range && prevvalue < 255 && value < 255) {
17487 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
17488 goto backwards_range;
17493 if (prevvalue > value) /* b-a */ {
17498 w = RExC_parse - rangebegin;
17500 "Invalid [] range \"%" UTF8f "\"",
17501 UTF8fARG(UTF, w, rangebegin));
17502 NOT_REACHED; /* NOTREACHED */
17506 prevvalue = value; /* save the beginning of the potential range */
17507 if (! stop_at_1 /* Can't be a range if parsing just one thing */
17508 && *RExC_parse == '-')
17510 char* next_char_ptr = RExC_parse + 1;
17512 /* Get the next real char after the '-' */
17513 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
17515 /* If the '-' is at the end of the class (just before the ']',
17516 * it is a literal minus; otherwise it is a range */
17517 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
17518 RExC_parse = next_char_ptr;
17520 /* a bad range like \w-, [:word:]- ? */
17521 if (namedclass > OOB_NAMEDCLASS) {
17522 if (strict || ckWARN(WARN_REGEXP)) {
17523 const int w = RExC_parse >= rangebegin
17524 ? RExC_parse - rangebegin
17527 vFAIL4("False [] range \"%*.*s\"",
17532 "False [] range \"%*.*s\"",
17536 cp_list = add_cp_to_invlist(cp_list, '-');
17539 range = 1; /* yeah, it's a range! */
17540 continue; /* but do it the next time */
17545 if (namedclass > OOB_NAMEDCLASS) {
17549 /* Here, we have a single value this time through the loop, and
17550 * <prevvalue> is the beginning of the range, if any; or <value> if
17553 /* non-Latin1 code point implies unicode semantics. */
17555 REQUIRE_UNI_RULES(flagp, 0);
17558 /* Ready to process either the single value, or the completed range.
17559 * For single-valued non-inverted ranges, we consider the possibility
17560 * of multi-char folds. (We made a conscious decision to not do this
17561 * for the other cases because it can often lead to non-intuitive
17562 * results. For example, you have the peculiar case that:
17563 * "s s" =~ /^[^\xDF]+$/i => Y
17564 * "ss" =~ /^[^\xDF]+$/i => N
17566 * See [perl #89750] */
17567 if (FOLD && allow_mutiple_chars && value == prevvalue) {
17568 if ( value == LATIN_SMALL_LETTER_SHARP_S
17569 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
17572 /* Here <value> is indeed a multi-char fold. Get what it is */
17574 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17577 UV folded = _to_uni_fold_flags(
17581 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
17582 ? FOLD_FLAGS_NOMIX_ASCII
17586 /* Here, <folded> should be the first character of the
17587 * multi-char fold of <value>, with <foldbuf> containing the
17588 * whole thing. But, if this fold is not allowed (because of
17589 * the flags), <fold> will be the same as <value>, and should
17590 * be processed like any other character, so skip the special
17592 if (folded != value) {
17594 /* Skip if we are recursed, currently parsing the class
17595 * again. Otherwise add this character to the list of
17596 * multi-char folds. */
17597 if (! RExC_in_multi_char_class) {
17598 STRLEN cp_count = utf8_length(foldbuf,
17599 foldbuf + foldlen);
17600 SV* multi_fold = sv_2mortal(newSVpvs(""));
17602 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
17605 = add_multi_match(multi_char_matches,
17611 /* This element should not be processed further in this
17614 value = save_value;
17615 prevvalue = save_prevvalue;
17621 if (strict && ckWARN(WARN_REGEXP)) {
17624 /* If the range starts above 255, everything is portable and
17625 * likely to be so for any forseeable character set, so don't
17627 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
17628 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
17630 else if (prevvalue != value) {
17632 /* Under strict, ranges that stop and/or end in an ASCII
17633 * printable should have each end point be a portable value
17634 * for it (preferably like 'A', but we don't warn if it is
17635 * a (portable) Unicode name or code point), and the range
17636 * must be be all digits or all letters of the same case.
17637 * Otherwise, the range is non-portable and unclear as to
17638 * what it contains */
17639 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
17640 && ( non_portable_endpoint
17641 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
17642 || (isLOWER_A(prevvalue) && isLOWER_A(value))
17643 || (isUPPER_A(prevvalue) && isUPPER_A(value))
17645 vWARN(RExC_parse, "Ranges of ASCII printables should"
17646 " be some subset of \"0-9\","
17647 " \"A-Z\", or \"a-z\"");
17649 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
17650 SSize_t index_start;
17651 SSize_t index_final;
17653 /* But the nature of Unicode and languages mean we
17654 * can't do the same checks for above-ASCII ranges,
17655 * except in the case of digit ones. These should
17656 * contain only digits from the same group of 10. The
17657 * ASCII case is handled just above. Hence here, the
17658 * range could be a range of digits. First some
17659 * unlikely special cases. Grandfather in that a range
17660 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
17661 * if its starting value is one of the 10 digits prior
17662 * to it. This is because it is an alternate way of
17663 * writing 19D1, and some people may expect it to be in
17664 * that group. But it is bad, because it won't give
17665 * the expected results. In Unicode 5.2 it was
17666 * considered to be in that group (of 11, hence), but
17667 * this was fixed in the next version */
17669 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
17670 goto warn_bad_digit_range;
17672 else if (UNLIKELY( prevvalue >= 0x1D7CE
17673 && value <= 0x1D7FF))
17675 /* This is the only other case currently in Unicode
17676 * where the algorithm below fails. The code
17677 * points just above are the end points of a single
17678 * range containing only decimal digits. It is 5
17679 * different series of 0-9. All other ranges of
17680 * digits currently in Unicode are just a single
17681 * series. (And mktables will notify us if a later
17682 * Unicode version breaks this.)
17684 * If the range being checked is at most 9 long,
17685 * and the digit values represented are in
17686 * numerical order, they are from the same series.
17688 if ( value - prevvalue > 9
17689 || ((( value - 0x1D7CE) % 10)
17690 <= (prevvalue - 0x1D7CE) % 10))
17692 goto warn_bad_digit_range;
17697 /* For all other ranges of digits in Unicode, the
17698 * algorithm is just to check if both end points
17699 * are in the same series, which is the same range.
17701 index_start = _invlist_search(
17702 PL_XPosix_ptrs[_CC_DIGIT],
17705 /* Warn if the range starts and ends with a digit,
17706 * and they are not in the same group of 10. */
17707 if ( index_start >= 0
17708 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
17710 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
17711 value)) != index_start
17712 && index_final >= 0
17713 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
17715 warn_bad_digit_range:
17716 vWARN(RExC_parse, "Ranges of digits should be"
17717 " from the same group of"
17724 if ((! range || prevvalue == value) && non_portable_endpoint) {
17725 if (isPRINT_A(value)) {
17728 if (isBACKSLASHED_PUNCT(value)) {
17729 literal[d++] = '\\';
17731 literal[d++] = (char) value;
17732 literal[d++] = '\0';
17735 "\"%.*s\" is more clearly written simply as \"%s\"",
17736 (int) (RExC_parse - rangebegin),
17741 else if (isMNEMONIC_CNTRL(value)) {
17743 "\"%.*s\" is more clearly written simply as \"%s\"",
17744 (int) (RExC_parse - rangebegin),
17746 cntrl_to_mnemonic((U8) value)
17752 /* Deal with this element of the class */
17755 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17758 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
17759 * that don't require special handling, we can just add the range like
17760 * we do for ASCII platforms */
17761 if ((UNLIKELY(prevvalue == 0) && value >= 255)
17762 || ! (prevvalue < 256
17764 || (! non_portable_endpoint
17765 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
17766 || (isUPPER_A(prevvalue)
17767 && isUPPER_A(value)))))))
17769 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17773 /* Here, requires special handling. This can be because it is a
17774 * range whose code points are considered to be Unicode, and so
17775 * must be individually translated into native, or because its a
17776 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
17777 * EBCDIC, but we have defined them to include only the "expected"
17778 * upper or lower case ASCII alphabetics. Subranges above 255 are
17779 * the same in native and Unicode, so can be added as a range */
17780 U8 start = NATIVE_TO_LATIN1(prevvalue);
17782 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
17783 for (j = start; j <= end; j++) {
17784 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
17787 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
17793 range = 0; /* this range (if it was one) is done now */
17794 } /* End of loop through all the text within the brackets */
17796 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
17797 output_posix_warnings(pRExC_state, posix_warnings);
17800 /* If anything in the class expands to more than one character, we have to
17801 * deal with them by building up a substitute parse string, and recursively
17802 * calling reg() on it, instead of proceeding */
17803 if (multi_char_matches) {
17804 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
17807 char *save_end = RExC_end;
17808 char *save_parse = RExC_parse;
17809 char *save_start = RExC_start;
17810 Size_t constructed_prefix_len = 0; /* This gives the length of the
17811 constructed portion of the
17812 substitute parse. */
17813 bool first_time = TRUE; /* First multi-char occurrence doesn't get
17818 /* Only one level of recursion allowed */
17819 assert(RExC_copy_start_in_constructed == RExC_precomp);
17821 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
17822 because too confusing */
17824 sv_catpvs(substitute_parse, "(?:");
17828 /* Look at the longest folds first */
17829 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
17834 if (av_exists(multi_char_matches, cp_count)) {
17835 AV** this_array_ptr;
17838 this_array_ptr = (AV**) av_fetch(multi_char_matches,
17840 while ((this_sequence = av_pop(*this_array_ptr)) !=
17843 if (! first_time) {
17844 sv_catpvs(substitute_parse, "|");
17846 first_time = FALSE;
17848 sv_catpv(substitute_parse, SvPVX(this_sequence));
17853 /* If the character class contains anything else besides these
17854 * multi-character folds, have to include it in recursive parsing */
17855 if (element_count) {
17856 sv_catpvs(substitute_parse, "|[");
17857 constructed_prefix_len = SvCUR(substitute_parse);
17858 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
17860 /* Put in a closing ']' only if not going off the end, as otherwise
17861 * we are adding something that really isn't there */
17862 if (RExC_parse < RExC_end) {
17863 sv_catpvs(substitute_parse, "]");
17867 sv_catpvs(substitute_parse, ")");
17870 /* This is a way to get the parse to skip forward a whole named
17871 * sequence instead of matching the 2nd character when it fails the
17873 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
17877 /* Set up the data structure so that any errors will be properly
17878 * reported. See the comments at the definition of
17879 * REPORT_LOCATION_ARGS for details */
17880 RExC_copy_start_in_input = (char *) orig_parse;
17881 RExC_start = RExC_parse = SvPV(substitute_parse, len);
17882 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
17883 RExC_end = RExC_parse + len;
17884 RExC_in_multi_char_class = 1;
17886 ret = reg(pRExC_state, 1, ®_flags, depth+1);
17888 *flagp |= reg_flags & (HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PARSE|NEED_UTF8);
17890 /* And restore so can parse the rest of the pattern */
17891 RExC_parse = save_parse;
17892 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
17893 RExC_end = save_end;
17894 RExC_in_multi_char_class = 0;
17895 SvREFCNT_dec_NN(multi_char_matches);
17899 /* If folding, we calculate all characters that could fold to or from the
17900 * ones already on the list */
17901 if (cp_foldable_list) {
17903 UV start, end; /* End points of code point ranges */
17905 SV* fold_intersection = NULL;
17908 /* Our calculated list will be for Unicode rules. For locale
17909 * matching, we have to keep a separate list that is consulted at
17910 * runtime only when the locale indicates Unicode rules (and we
17911 * don't include potential matches in the ASCII/Latin1 range, as
17912 * any code point could fold to any other, based on the run-time
17913 * locale). For non-locale, we just use the general list */
17915 use_list = &only_utf8_locale_list;
17918 use_list = &cp_list;
17921 /* Only the characters in this class that participate in folds need
17922 * be checked. Get the intersection of this class and all the
17923 * possible characters that are foldable. This can quickly narrow
17924 * down a large class */
17925 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
17926 &fold_intersection);
17928 /* Now look at the foldable characters in this class individually */
17929 invlist_iterinit(fold_intersection);
17930 while (invlist_iternext(fold_intersection, &start, &end)) {
17934 /* Look at every character in the range */
17935 for (j = start; j <= end; j++) {
17936 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
17939 Size_t folds_count;
17940 unsigned int first_fold;
17941 const unsigned int * remaining_folds;
17945 /* Under /l, we don't know what code points below 256
17946 * fold to, except we do know the MICRO SIGN folds to
17947 * an above-255 character if the locale is UTF-8, so we
17948 * add it to the special list (in *use_list) Otherwise
17949 * we know now what things can match, though some folds
17950 * are valid under /d only if the target is UTF-8.
17951 * Those go in a separate list */
17952 if ( IS_IN_SOME_FOLD_L1(j)
17953 && ! (LOC && j != MICRO_SIGN))
17956 /* ASCII is always matched; non-ASCII is matched
17957 * only under Unicode rules (which could happen
17958 * under /l if the locale is a UTF-8 one */
17959 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
17960 *use_list = add_cp_to_invlist(*use_list,
17961 PL_fold_latin1[j]);
17963 else if (j != PL_fold_latin1[j]) {
17964 upper_latin1_only_utf8_matches
17965 = add_cp_to_invlist(
17966 upper_latin1_only_utf8_matches,
17967 PL_fold_latin1[j]);
17971 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
17972 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
17974 add_above_Latin1_folds(pRExC_state,
17981 /* Here is an above Latin1 character. We don't have the
17982 * rules hard-coded for it. First, get its fold. This is
17983 * the simple fold, as the multi-character folds have been
17984 * handled earlier and separated out */
17985 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
17986 (ASCII_FOLD_RESTRICTED)
17987 ? FOLD_FLAGS_NOMIX_ASCII
17990 /* Single character fold of above Latin1. Add everything
17991 * in its fold closure to the list that this node should
17993 folds_count = _inverse_folds(folded, &first_fold,
17995 for (k = 0; k <= folds_count; k++) {
17996 UV c = (k == 0) /* First time through use itself */
17998 : (k == 1) /* 2nd time use, the first fold */
18001 /* Then the remaining ones */
18002 : remaining_folds[k-2];
18004 /* /aa doesn't allow folds between ASCII and non- */
18005 if (( ASCII_FOLD_RESTRICTED
18006 && (isASCII(c) != isASCII(j))))
18011 /* Folds under /l which cross the 255/256 boundary are
18012 * added to a separate list. (These are valid only
18013 * when the locale is UTF-8.) */
18014 if (c < 256 && LOC) {
18015 *use_list = add_cp_to_invlist(*use_list, c);
18019 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18021 cp_list = add_cp_to_invlist(cp_list, c);
18024 /* Similarly folds involving non-ascii Latin1
18025 * characters under /d are added to their list */
18026 upper_latin1_only_utf8_matches
18027 = add_cp_to_invlist(
18028 upper_latin1_only_utf8_matches,
18034 SvREFCNT_dec_NN(fold_intersection);
18037 /* Now that we have finished adding all the folds, there is no reason
18038 * to keep the foldable list separate */
18039 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18040 SvREFCNT_dec_NN(cp_foldable_list);
18043 /* And combine the result (if any) with any inversion lists from posix
18044 * classes. The lists are kept separate up to now because we don't want to
18045 * fold the classes */
18046 if (simple_posixes) { /* These are the classes known to be unaffected by
18049 _invlist_union(cp_list, simple_posixes, &cp_list);
18050 SvREFCNT_dec_NN(simple_posixes);
18053 cp_list = simple_posixes;
18056 if (posixes || nposixes) {
18057 if (! DEPENDS_SEMANTICS) {
18059 /* For everything but /d, we can just add the current 'posixes' and
18060 * 'nposixes' to the main list */
18063 _invlist_union(cp_list, posixes, &cp_list);
18064 SvREFCNT_dec_NN(posixes);
18072 _invlist_union(cp_list, nposixes, &cp_list);
18073 SvREFCNT_dec_NN(nposixes);
18076 cp_list = nposixes;
18081 /* Under /d, things like \w match upper Latin1 characters only if
18082 * the target string is in UTF-8. But things like \W match all the
18083 * upper Latin1 characters if the target string is not in UTF-8.
18085 * Handle the case with something like \W separately */
18087 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18089 /* A complemented posix class matches all upper Latin1
18090 * characters if not in UTF-8. And it matches just certain
18091 * ones when in UTF-8. That means those certain ones are
18092 * matched regardless, so can just be added to the
18093 * unconditional list */
18095 _invlist_union(cp_list, nposixes, &cp_list);
18096 SvREFCNT_dec_NN(nposixes);
18100 cp_list = nposixes;
18103 /* Likewise for 'posixes' */
18104 _invlist_union(posixes, cp_list, &cp_list);
18106 /* Likewise for anything else in the range that matched only
18108 if (upper_latin1_only_utf8_matches) {
18109 _invlist_union(cp_list,
18110 upper_latin1_only_utf8_matches,
18112 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18113 upper_latin1_only_utf8_matches = NULL;
18116 /* If we don't match all the upper Latin1 characters regardless
18117 * of UTF-8ness, we have to set a flag to match the rest when
18119 _invlist_subtract(only_non_utf8_list, cp_list,
18120 &only_non_utf8_list);
18121 if (_invlist_len(only_non_utf8_list) != 0) {
18122 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18124 SvREFCNT_dec_NN(only_non_utf8_list);
18127 /* Here there were no complemented posix classes. That means
18128 * the upper Latin1 characters in 'posixes' match only when the
18129 * target string is in UTF-8. So we have to add them to the
18130 * list of those types of code points, while adding the
18131 * remainder to the unconditional list.
18133 * First calculate what they are */
18134 SV* nonascii_but_latin1_properties = NULL;
18135 _invlist_intersection(posixes, PL_UpperLatin1,
18136 &nonascii_but_latin1_properties);
18138 /* And add them to the final list of such characters. */
18139 _invlist_union(upper_latin1_only_utf8_matches,
18140 nonascii_but_latin1_properties,
18141 &upper_latin1_only_utf8_matches);
18143 /* Remove them from what now becomes the unconditional list */
18144 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18147 /* And add those unconditional ones to the final list */
18149 _invlist_union(cp_list, posixes, &cp_list);
18150 SvREFCNT_dec_NN(posixes);
18157 SvREFCNT_dec(nonascii_but_latin1_properties);
18159 /* Get rid of any characters from the conditional list that we
18160 * now know are matched unconditionally, which may make that
18162 _invlist_subtract(upper_latin1_only_utf8_matches,
18164 &upper_latin1_only_utf8_matches);
18165 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18166 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18167 upper_latin1_only_utf8_matches = NULL;
18173 /* And combine the result (if any) with any inversion list from properties.
18174 * The lists are kept separate up to now so that we can distinguish the two
18175 * in regards to matching above-Unicode. A run-time warning is generated
18176 * if a Unicode property is matched against a non-Unicode code point. But,
18177 * we allow user-defined properties to match anything, without any warning,
18178 * and we also suppress the warning if there is a portion of the character
18179 * class that isn't a Unicode property, and which matches above Unicode, \W
18180 * or [\x{110000}] for example.
18181 * (Note that in this case, unlike the Posix one above, there is no
18182 * <upper_latin1_only_utf8_matches>, because having a Unicode property
18183 * forces Unicode semantics */
18187 /* If it matters to the final outcome, see if a non-property
18188 * component of the class matches above Unicode. If so, the
18189 * warning gets suppressed. This is true even if just a single
18190 * such code point is specified, as, though not strictly correct if
18191 * another such code point is matched against, the fact that they
18192 * are using above-Unicode code points indicates they should know
18193 * the issues involved */
18195 warn_super = ! (invert
18196 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18199 _invlist_union(properties, cp_list, &cp_list);
18200 SvREFCNT_dec_NN(properties);
18203 cp_list = properties;
18208 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18210 /* Because an ANYOF node is the only one that warns, this node
18211 * can't be optimized into something else */
18212 optimizable = FALSE;
18216 /* Here, we have calculated what code points should be in the character
18219 * Now we can see about various optimizations. Fold calculation (which we
18220 * did above) needs to take place before inversion. Otherwise /[^k]/i
18221 * would invert to include K, which under /i would match k, which it
18222 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18223 * folded until runtime */
18225 /* If we didn't do folding, it's because some information isn't available
18226 * until runtime; set the run-time fold flag for these We know to set the
18227 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18228 * at least one 0-255 range code point */
18231 /* Some things on the list might be unconditionally included because of
18232 * other components. Remove them, and clean up the list if it goes to
18234 if (only_utf8_locale_list && cp_list) {
18235 _invlist_subtract(only_utf8_locale_list, cp_list,
18236 &only_utf8_locale_list);
18238 if (_invlist_len(only_utf8_locale_list) == 0) {
18239 SvREFCNT_dec_NN(only_utf8_locale_list);
18240 only_utf8_locale_list = NULL;
18243 if ( only_utf8_locale_list
18244 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18245 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18247 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18250 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18252 else if (cp_list) { /* Look to see if a 0-255 code point is in list */
18254 invlist_iterinit(cp_list);
18255 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
18256 anyof_flags |= ANYOFL_FOLD;
18257 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18259 invlist_iterfinish(cp_list);
18262 else if ( DEPENDS_SEMANTICS
18263 && ( upper_latin1_only_utf8_matches
18264 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18266 RExC_seen_d_op = TRUE;
18267 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18270 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
18274 && ! has_runtime_dependency)
18276 _invlist_invert(cp_list);
18278 /* Clear the invert flag since have just done it here */
18283 *ret_invlist = cp_list;
18288 /* All possible optimizations below still have these characteristics.
18289 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
18291 *flagp |= HASWIDTH|SIMPLE;
18293 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
18294 RExC_contains_locale = 1;
18297 /* Some character classes are equivalent to other nodes. Such nodes take
18298 * up less room, and some nodes require fewer operations to execute, than
18299 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
18300 * improve efficiency. */
18303 PERL_UINT_FAST8_T i;
18304 Size_t partial_cp_count = 0;
18305 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
18306 UV end[MAX_FOLD_FROMS+1] = { 0 };
18308 if (cp_list) { /* Count the code points in enough ranges that we would
18309 see all the ones possible in any fold in this version
18312 invlist_iterinit(cp_list);
18313 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
18314 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
18317 partial_cp_count += end[i] - start[i] + 1;
18320 invlist_iterfinish(cp_list);
18323 /* If we know at compile time that this matches every possible code
18324 * point, any run-time dependencies don't matter */
18325 if (start[0] == 0 && end[0] == UV_MAX) {
18327 ret = reganode(pRExC_state, OPFAIL, 0);
18330 ret = reg_node(pRExC_state, SANY);
18336 /* Similarly, for /l posix classes, if both a class and its
18337 * complement match, any run-time dependencies don't matter */
18339 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
18342 if ( POSIXL_TEST(posixl, namedclass) /* class */
18343 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
18346 ret = reganode(pRExC_state, OPFAIL, 0);
18349 ret = reg_node(pRExC_state, SANY);
18355 /* For well-behaved locales, some classes are subsets of others,
18356 * so complementing the subset and including the non-complemented
18357 * superset should match everything, like [\D[:alnum:]], and
18358 * [[:^alpha:][:alnum:]], but some implementations of locales are
18359 * buggy, and khw thinks its a bad idea to have optimization change
18360 * behavior, even if it avoids an OS bug in a given case */
18362 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
18364 /* If is a single posix /l class, can optimize to just that op.
18365 * Such a node will not match anything in the Latin1 range, as that
18366 * is not determinable until runtime, but will match whatever the
18367 * class does outside that range. (Note that some classes won't
18368 * match anything outside the range, like [:ascii:]) */
18369 if ( isSINGLE_BIT_SET(posixl)
18370 && (partial_cp_count == 0 || start[0] > 255))
18373 SV * class_above_latin1 = NULL;
18374 bool already_inverted;
18375 bool are_equivalent;
18377 /* Compute which bit is set, which is the same thing as, e.g.,
18378 * ANYOF_CNTRL. From
18379 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
18381 static const int MultiplyDeBruijnBitPosition2[32] =
18383 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
18384 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
18387 namedclass = MultiplyDeBruijnBitPosition2[(posixl
18388 * 0x077CB531U) >> 27];
18389 classnum = namedclass_to_classnum(namedclass);
18391 /* The named classes are such that the inverted number is one
18392 * larger than the non-inverted one */
18393 already_inverted = namedclass
18394 - classnum_to_namedclass(classnum);
18396 /* Create an inversion list of the official property, inverted
18397 * if the constructed node list is inverted, and restricted to
18398 * only the above latin1 code points, which are the only ones
18399 * known at compile time */
18400 _invlist_intersection_maybe_complement_2nd(
18402 PL_XPosix_ptrs[classnum],
18404 &class_above_latin1);
18405 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
18407 SvREFCNT_dec_NN(class_above_latin1);
18409 if (are_equivalent) {
18411 /* Resolve the run-time inversion flag with this possibly
18412 * inverted class */
18413 invert = invert ^ already_inverted;
18415 ret = reg_node(pRExC_state,
18416 POSIXL + invert * (NPOSIXL - POSIXL));
18417 FLAGS(REGNODE_p(ret)) = classnum;
18423 /* khw can't think of any other possible transformation involving
18425 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
18429 if (! has_runtime_dependency) {
18431 /* If the list is empty, nothing matches. This happens, for
18432 * example, when a Unicode property that doesn't match anything is
18433 * the only element in the character class (perluniprops.pod notes
18434 * such properties). */
18435 if (partial_cp_count == 0) {
18437 ret = reg_node(pRExC_state, SANY);
18440 ret = reganode(pRExC_state, OPFAIL, 0);
18446 /* If matches everything but \n */
18447 if ( start[0] == 0 && end[0] == '\n' - 1
18448 && start[1] == '\n' + 1 && end[1] == UV_MAX)
18451 ret = reg_node(pRExC_state, REG_ANY);
18457 /* Next see if can optimize classes that contain just a few code points
18458 * into an EXACTish node. The reason to do this is to let the
18459 * optimizer join this node with adjacent EXACTish ones.
18461 * An EXACTFish node can be generated even if not under /i, and vice
18462 * versa. But care must be taken. An EXACTFish node has to be such
18463 * that it only matches precisely the code points in the class, but we
18464 * want to generate the least restrictive one that does that, to
18465 * increase the odds of being able to join with an adjacent node. For
18466 * example, if the class contains [kK], we have to make it an EXACTFAA
18467 * node to prevent the KELVIN SIGN from matching. Whether we are under
18468 * /i or not is irrelevant in this case. Less obvious is the pattern
18469 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
18470 * supposed to match the single character U+0149 LATIN SMALL LETTER N
18471 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
18472 * that includes \X{02BC}, there is a multi-char fold that does, and so
18473 * the node generated for it must be an EXACTFish one. On the other
18474 * hand qr/:/i should generate a plain EXACT node since the colon
18475 * participates in no fold whatsoever, and having it EXACT tells the
18476 * optimizer the target string cannot match unless it has a colon in
18479 * We don't typically generate an EXACTish node if doing so would
18480 * require changing the pattern to UTF-8, as that affects /d and
18481 * otherwise is slower. However, under /i, not changing to UTF-8 can
18482 * miss some potential multi-character folds. We calculate the
18483 * EXACTish node, and then decide if something would be missed if we
18488 /* Only try if there are no more code points in the class than
18489 * in the max possible fold */
18490 && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
18492 && (start[0] < 256 || UTF || FOLD))
18494 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
18496 /* We can always make a single code point class into an
18497 * EXACTish node. */
18501 /* Here is /l: Use EXACTL, except /li indicates EXACTFL,
18502 * as that means there is a fold not known until runtime so
18503 * shows as only a single code point here. */
18504 op = (FOLD) ? EXACTFL : EXACTL;
18506 else if (! FOLD) { /* Not /l and not /i */
18507 op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
18509 else if (start[0] < 256) { /* /i, not /l, and the code point is
18512 /* Under /i, it gets a little tricky. A code point that
18513 * doesn't participate in a fold should be an EXACT node.
18514 * We know this one isn't the result of a simple fold, or
18515 * there'd be more than one code point in the list, but it
18516 * could be part of a multi- character fold. In that case
18517 * we better not create an EXACT node, as we would wrongly
18518 * be telling the optimizer that this code point must be in
18519 * the target string, and that is wrong. This is because
18520 * if the sequence around this code point forms a
18521 * multi-char fold, what needs to be in the string could be
18522 * the code point that folds to the sequence.
18524 * This handles the case of below-255 code points, as we
18525 * have an easy look up for those. The next clause handles
18526 * the above-256 one */
18527 op = IS_IN_SOME_FOLD_L1(start[0])
18531 else { /* /i, larger code point. Since we are under /i, and
18532 have just this code point, we know that it can't
18533 fold to something else, so PL_InMultiCharFold
18535 op = _invlist_contains_cp(PL_InMultiCharFold,
18543 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
18544 && _invlist_contains_cp(PL_in_some_fold, start[0]))
18546 /* Here, the only runtime dependency, if any, is from /d, and
18547 * the class matches more than one code point, and the lowest
18548 * code point participates in some fold. It might be that the
18549 * other code points are /i equivalent to this one, and hence
18550 * they would representable by an EXACTFish node. Above, we
18551 * eliminated classes that contain too many code points to be
18552 * EXACTFish, with the test for MAX_FOLD_FROMS
18554 * First, special case the ASCII fold pairs, like 'B' and 'b'.
18555 * We do this because we have EXACTFAA at our disposal for the
18557 if (partial_cp_count == 2 && isASCII(start[0])) {
18559 /* The only ASCII characters that participate in folds are
18561 assert(isALPHA(start[0]));
18562 if ( end[0] == start[0] /* First range is a single
18563 character, so 2nd exists */
18564 && isALPHA_FOLD_EQ(start[0], start[1]))
18567 /* Here, is part of an ASCII fold pair */
18569 if ( ASCII_FOLD_RESTRICTED
18570 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
18572 /* If the second clause just above was true, it
18573 * means we can't be under /i, or else the list
18574 * would have included more than this fold pair.
18575 * Therefore we have to exclude the possibility of
18576 * whatever else it is that folds to these, by
18577 * using EXACTFAA */
18580 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
18582 /* Here, there's no simple fold that start[0] is part
18583 * of, but there is a multi-character one. If we
18584 * are not under /i, we want to exclude that
18585 * possibility; if under /i, we want to include it
18587 op = (FOLD) ? EXACTFU : EXACTFAA;
18591 /* Here, the only possible fold start[0] particpates in
18592 * is with start[1]. /i or not isn't relevant */
18596 value = toFOLD(start[0]);
18599 else if ( ! upper_latin1_only_utf8_matches
18600 || ( _invlist_len(upper_latin1_only_utf8_matches)
18603 invlist_highest(upper_latin1_only_utf8_matches)]
18606 /* Here, the smallest character is non-ascii or there are
18607 * more than 2 code points matched by this node. Also, we
18608 * either don't have /d UTF-8 dependent matches, or if we
18609 * do, they look like they could be a single character that
18610 * is the fold of the lowest one in the always-match list.
18611 * This test quickly excludes most of the false positives
18612 * when there are /d UTF-8 depdendent matches. These are
18613 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
18614 * SMALL LETTER A WITH GRAVE iff the target string is
18615 * UTF-8. (We don't have to worry above about exceeding
18616 * the array bounds of PL_fold_latin1[] because any code
18617 * point in 'upper_latin1_only_utf8_matches' is below 256.)
18619 * EXACTFAA would apply only to pairs (hence exactly 2 code
18620 * points) in the ASCII range, so we can't use it here to
18621 * artificially restrict the fold domain, so we check if
18622 * the class does or does not match some EXACTFish node.
18623 * Further, if we aren't under /i, and and the folded-to
18624 * character is part of a multi-character fold, we can't do
18625 * this optimization, as the sequence around it could be
18626 * that multi-character fold, and we don't here know the
18627 * context, so we have to assume it is that multi-char
18628 * fold, to prevent potential bugs.
18630 * To do the general case, we first find the fold of the
18631 * lowest code point (which may be higher than the lowest
18632 * one), then find everything that folds to it. (The data
18633 * structure we have only maps from the folded code points,
18634 * so we have to do the earlier step.) */
18637 U8 foldbuf[UTF8_MAXBYTES_CASE];
18638 UV folded = _to_uni_fold_flags(start[0],
18639 foldbuf, &foldlen, 0);
18640 unsigned int first_fold;
18641 const unsigned int * remaining_folds;
18642 Size_t folds_to_this_cp_count = _inverse_folds(
18646 Size_t folds_count = folds_to_this_cp_count + 1;
18647 SV * fold_list = _new_invlist(folds_count);
18650 /* If there are UTF-8 dependent matches, create a temporary
18651 * list of what this node matches, including them. */
18652 SV * all_cp_list = NULL;
18653 SV ** use_this_list = &cp_list;
18655 if (upper_latin1_only_utf8_matches) {
18656 all_cp_list = _new_invlist(0);
18657 use_this_list = &all_cp_list;
18658 _invlist_union(cp_list,
18659 upper_latin1_only_utf8_matches,
18663 /* Having gotten everything that participates in the fold
18664 * containing the lowest code point, we turn that into an
18665 * inversion list, making sure everything is included. */
18666 fold_list = add_cp_to_invlist(fold_list, start[0]);
18667 fold_list = add_cp_to_invlist(fold_list, folded);
18668 if (folds_to_this_cp_count > 0) {
18669 fold_list = add_cp_to_invlist(fold_list, first_fold);
18670 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
18671 fold_list = add_cp_to_invlist(fold_list,
18672 remaining_folds[i]);
18676 /* If the fold list is identical to what's in this ANYOF
18677 * node, the node can be represented by an EXACTFish one
18679 if (_invlistEQ(*use_this_list, fold_list,
18680 0 /* Don't complement */ )
18683 /* But, we have to be careful, as mentioned above.
18684 * Just the right sequence of characters could match
18685 * this if it is part of a multi-character fold. That
18686 * IS what we want if we are under /i. But it ISN'T
18687 * what we want if not under /i, as it could match when
18688 * it shouldn't. So, when we aren't under /i and this
18689 * character participates in a multi-char fold, we
18690 * don't optimize into an EXACTFish node. So, for each
18691 * case below we have to check if we are folding
18692 * and if not, if it is not part of a multi-char fold.
18694 if (start[0] > 255) { /* Highish code point */
18695 if (FOLD || ! _invlist_contains_cp(
18696 PL_InMultiCharFold, folded))
18700 : (ASCII_FOLD_RESTRICTED)
18705 } /* Below, the lowest code point < 256 */
18708 && DEPENDS_SEMANTICS)
18709 { /* An EXACTF node containing a single character
18710 's', can be an EXACTFU if it doesn't get
18711 joined with an adjacent 's' */
18712 op = EXACTFU_S_EDGE;
18716 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
18718 if (upper_latin1_only_utf8_matches) {
18721 /* We can't use the fold, as that only matches
18725 else if ( UNLIKELY(start[0] == MICRO_SIGN)
18727 { /* EXACTFUP is a special node for this
18729 op = (ASCII_FOLD_RESTRICTED)
18732 value = MICRO_SIGN;
18734 else if ( ASCII_FOLD_RESTRICTED
18735 && ! isASCII(start[0]))
18736 { /* For ASCII under /iaa, we can use EXACTFU
18748 SvREFCNT_dec_NN(fold_list);
18749 SvREFCNT_dec(all_cp_list);
18755 /* Here, we have calculated what EXACTish node we would use.
18756 * But we don't use it if it would require converting the
18757 * pattern to UTF-8, unless not using it could cause us to miss
18758 * some folds (hence be buggy) */
18760 if (! UTF && value > 255) {
18761 SV * in_multis = NULL;
18765 /* If there is no code point that is part of a multi-char
18766 * fold, then there aren't any matches, so we don't do this
18767 * optimization. Otherwise, it could match depending on
18768 * the context around us, so we do upgrade */
18769 _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
18770 if (UNLIKELY(_invlist_len(in_multis) != 0)) {
18771 REQUIRE_UTF8(flagp);
18779 U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
18781 ret = regnode_guts(pRExC_state, op, len, "exact");
18782 FILL_NODE(ret, op);
18783 RExC_emit += 1 + STR_SZ(len);
18784 STR_LEN(REGNODE_p(ret)) = len;
18786 *STRING(REGNODE_p(ret)) = (U8) value;
18789 uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
18796 if (! has_runtime_dependency) {
18798 /* See if this can be turned into an ANYOFM node. Think about the
18799 * bit patterns in two different bytes. In some positions, the
18800 * bits in each will be 1; and in other positions both will be 0;
18801 * and in some positions the bit will be 1 in one byte, and 0 in
18802 * the other. Let 'n' be the number of positions where the bits
18803 * differ. We create a mask which has exactly 'n' 0 bits, each in
18804 * a position where the two bytes differ. Now take the set of all
18805 * bytes that when ANDed with the mask yield the same result. That
18806 * set has 2**n elements, and is representable by just two 8 bit
18807 * numbers: the result and the mask. Importantly, matching the set
18808 * can be vectorized by creating a word full of the result bytes,
18809 * and a word full of the mask bytes, yielding a significant speed
18810 * up. Here, see if this node matches such a set. As a concrete
18811 * example consider [01], and the byte representing '0' which is
18812 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
18813 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
18814 * 0x30. Any other bytes ANDed yield something else. So [01],
18815 * which is a common usage, is optimizable into ANYOFM, and can
18816 * benefit from the speed up. We can only do this on UTF-8
18817 * invariant bytes, because they have the same bit patterns under
18819 PERL_UINT_FAST8_T inverted = 0;
18821 const PERL_UINT_FAST8_T max_permissible = 0xFF;
18823 const PERL_UINT_FAST8_T max_permissible = 0x7F;
18825 /* If doesn't fit the criteria for ANYOFM, invert and try again.
18826 * If that works we will instead later generate an NANYOFM, and
18827 * invert back when through */
18828 if (invlist_highest(cp_list) > max_permissible) {
18829 _invlist_invert(cp_list);
18833 if (invlist_highest(cp_list) <= max_permissible) {
18834 UV this_start, this_end;
18835 UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */
18836 U8 bits_differing = 0;
18837 Size_t full_cp_count = 0;
18838 bool first_time = TRUE;
18840 /* Go through the bytes and find the bit positions that differ
18842 invlist_iterinit(cp_list);
18843 while (invlist_iternext(cp_list, &this_start, &this_end)) {
18844 unsigned int i = this_start;
18847 if (! UVCHR_IS_INVARIANT(i)) {
18851 first_time = FALSE;
18852 lowest_cp = this_start;
18854 /* We have set up the code point to compare with.
18855 * Don't compare it with itself */
18859 /* Find the bit positions that differ from the lowest code
18860 * point in the node. Keep track of all such positions by
18862 for (; i <= this_end; i++) {
18863 if (! UVCHR_IS_INVARIANT(i)) {
18867 bits_differing |= i ^ lowest_cp;
18870 full_cp_count += this_end - this_start + 1;
18872 invlist_iterfinish(cp_list);
18874 /* At the end of the loop, we count how many bits differ from
18875 * the bits in lowest code point, call the count 'd'. If the
18876 * set we found contains 2**d elements, it is the closure of
18877 * all code points that differ only in those bit positions. To
18878 * convince yourself of that, first note that the number in the
18879 * closure must be a power of 2, which we test for. The only
18880 * way we could have that count and it be some differing set,
18881 * is if we got some code points that don't differ from the
18882 * lowest code point in any position, but do differ from each
18883 * other in some other position. That means one code point has
18884 * a 1 in that position, and another has a 0. But that would
18885 * mean that one of them differs from the lowest code point in
18886 * that position, which possibility we've already excluded. */
18887 if ( (inverted || full_cp_count > 1)
18888 && full_cp_count == 1U << PL_bitcount[bits_differing])
18892 op = ANYOFM + inverted;;
18894 /* We need to make the bits that differ be 0's */
18895 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
18897 /* The argument is the lowest code point */
18898 ret = reganode(pRExC_state, op, lowest_cp);
18899 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
18905 _invlist_invert(cp_list);
18913 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
18914 PERL_UINT_FAST8_T type;
18915 SV * intersection = NULL;
18916 SV* d_invlist = NULL;
18918 /* See if this matches any of the POSIX classes. The POSIXA and
18919 * POSIXD ones are about the same speed as ANYOF ops, but take less
18920 * room; the ones that have above-Latin1 code point matches are
18921 * somewhat faster than ANYOF. */
18923 for (type = POSIXA; type >= POSIXD; type--) {
18926 if (type == POSIXL) { /* But not /l posix classes */
18930 for (posix_class = 0;
18931 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
18934 SV** our_code_points = &cp_list;
18935 SV** official_code_points;
18938 if (type == POSIXA) {
18939 official_code_points = &PL_Posix_ptrs[posix_class];
18942 official_code_points = &PL_XPosix_ptrs[posix_class];
18945 /* Skip non-existent classes of this type. e.g. \v only
18946 * has an entry in PL_XPosix_ptrs */
18947 if (! *official_code_points) {
18951 /* Try both the regular class, and its inversion */
18952 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
18953 bool this_inverted = invert ^ try_inverted;
18955 if (type != POSIXD) {
18957 /* This class that isn't /d can't match if we have
18958 * /d dependencies */
18959 if (has_runtime_dependency
18960 & HAS_D_RUNTIME_DEPENDENCY)
18965 else /* is /d */ if (! this_inverted) {
18967 /* /d classes don't match anything non-ASCII below
18968 * 256 unconditionally (which cp_list contains) */
18969 _invlist_intersection(cp_list, PL_UpperLatin1,
18971 if (_invlist_len(intersection) != 0) {
18975 SvREFCNT_dec(d_invlist);
18976 d_invlist = invlist_clone(cp_list, NULL);
18978 /* But under UTF-8 it turns into using /u rules.
18979 * Add the things it matches under these conditions
18980 * so that we check below that these are identical
18981 * to what the tested class should match */
18982 if (upper_latin1_only_utf8_matches) {
18985 upper_latin1_only_utf8_matches,
18988 our_code_points = &d_invlist;
18990 else { /* POSIXD, inverted. If this doesn't have this
18991 flag set, it isn't /d. */
18992 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
18996 our_code_points = &cp_list;
18999 /* Here, have weeded out some things. We want to see
19000 * if the list of characters this node contains
19001 * ('*our_code_points') precisely matches those of the
19002 * class we are currently checking against
19003 * ('*official_code_points'). */
19004 if (_invlistEQ(*our_code_points,
19005 *official_code_points,
19008 /* Here, they precisely match. Optimize this ANYOF
19009 * node into its equivalent POSIX one of the
19010 * correct type, possibly inverted */
19011 ret = reg_node(pRExC_state, (try_inverted)
19015 FLAGS(REGNODE_p(ret)) = posix_class;
19016 SvREFCNT_dec(d_invlist);
19017 SvREFCNT_dec(intersection);
19023 SvREFCNT_dec(d_invlist);
19024 SvREFCNT_dec(intersection);
19027 /* If didn't find an optimization and there is no need for a bitmap,
19028 * optimize to indicate that */
19029 if ( start[0] >= NUM_ANYOF_CODE_POINTS
19031 && ! upper_latin1_only_utf8_matches
19032 && anyof_flags == 0)
19034 U8 low_utf8[UTF8_MAXBYTES+1];
19035 UV highest_cp = invlist_highest(cp_list);
19039 /* Currently the maximum allowed code point by the system is
19040 * IV_MAX. Higher ones are reserved for future internal use. This
19041 * particular regnode can be used for higher ones, but we can't
19042 * calculate the code point of those. IV_MAX suffices though, as
19043 * it will be a large first byte */
19044 (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
19046 /* We store the lowest possible first byte of the UTF-8
19047 * representation, using the flags field. This allows for quick
19048 * ruling out of some inputs without having to convert from UTF-8
19049 * to code point. For EBCDIC, this has to be I8. */
19050 anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19052 /* If the first UTF-8 start byte for the highest code point in the
19053 * range is suitably small, we may be able to get an upper bound as
19055 if (highest_cp <= IV_MAX) {
19056 U8 high_utf8[UTF8_MAXBYTES+1];
19058 (void) uvchr_to_utf8(high_utf8, highest_cp);
19060 /* If the lowest and highest are the same, we can get an exact
19061 * first byte instead of a just minimum. We signal this with a
19062 * different regnode */
19063 if (low_utf8[0] == high_utf8[0]) {
19065 /* No need to convert to I8 for EBCDIC as this is an exact
19067 anyof_flags = low_utf8[0];
19070 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19073 /* Here, the high byte is not the same as the low, but is
19074 * small enough that its reasonable to have a loose upper
19075 * bound, which is packed in with the strict lower bound.
19076 * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19077 * On EBCDIC platforms, I8 is used. On ASCII platforms I8
19078 * is the same thing as UTF-8 */
19081 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19082 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19085 if (range_diff <= max_range_diff / 8) {
19088 else if (range_diff <= max_range_diff / 4) {
19091 else if (range_diff <= max_range_diff / 2) {
19094 anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19099 goto done_finding_op;
19101 } /* End of seeing if can optimize it into a different node */
19103 is_anyof: /* It's going to be an ANYOF node. */
19104 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19114 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19115 FILL_NODE(ret, op); /* We set the argument later */
19116 RExC_emit += 1 + regarglen[op];
19117 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19119 /* Here, <cp_list> contains all the code points we can determine at
19120 * compile time that match under all conditions. Go through it, and
19121 * for things that belong in the bitmap, put them there, and delete from
19122 * <cp_list>. While we are at it, see if everything above 255 is in the
19123 * list, and if so, set a flag to speed up execution */
19125 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19128 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19132 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19135 /* Here, the bitmap has been populated with all the Latin1 code points that
19136 * always match. Can now add to the overall list those that match only
19137 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19139 if (upper_latin1_only_utf8_matches) {
19141 _invlist_union(cp_list,
19142 upper_latin1_only_utf8_matches,
19144 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19147 cp_list = upper_latin1_only_utf8_matches;
19149 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19152 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19153 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19155 only_utf8_locale_list);
19160 /* Here, the node is getting optimized into something that's not an ANYOF
19161 * one. Finish up. */
19163 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19164 RExC_parse - orig_parse);;
19165 SvREFCNT_dec(cp_list);;
19169 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19172 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19173 regnode* const node,
19175 SV* const runtime_defns,
19176 SV* const only_utf8_locale_list)
19178 /* Sets the arg field of an ANYOF-type node 'node', using information about
19179 * the node passed-in. If there is nothing outside the node's bitmap, the
19180 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
19181 * the count returned by add_data(), having allocated and stored an array,
19184 * av[0] stores the inversion list defining this class as far as known at
19185 * this time, or PL_sv_undef if nothing definite is now known.
19186 * av[1] stores the inversion list of code points that match only if the
19187 * current locale is UTF-8, or if none, PL_sv_undef if there is an
19188 * av[2], or no entry otherwise.
19189 * av[2] stores the list of user-defined properties whose subroutine
19190 * definitions aren't known at this time, or no entry if none. */
19194 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
19196 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
19197 assert(! (ANYOF_FLAGS(node)
19198 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
19199 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
19202 AV * const av = newAV();
19206 av_store(av, INVLIST_INDEX, cp_list);
19209 if (only_utf8_locale_list) {
19210 av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
19213 if (runtime_defns) {
19214 av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns));
19217 rv = newRV_noinc(MUTABLE_SV(av));
19218 n = add_data(pRExC_state, STR_WITH_LEN("s"));
19219 RExC_rxi->data->data[n] = (void*)rv;
19224 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
19226 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
19227 const regnode* node,
19230 SV** only_utf8_locale_ptr,
19231 SV** output_invlist)
19234 /* For internal core use only.
19235 * Returns the inversion list for the input 'node' in the regex 'prog'.
19236 * If <doinit> is 'true', will attempt to create the inversion list if not
19238 * If <listsvp> is non-null, will return the printable contents of the
19239 * property definition. This can be used to get debugging information
19240 * even before the inversion list exists, by calling this function with
19241 * 'doinit' set to false, in which case the components that will be used
19242 * to eventually create the inversion list are returned (in a printable
19244 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
19245 * store an inversion list of code points that should match only if the
19246 * execution-time locale is a UTF-8 one.
19247 * If <output_invlist> is not NULL, it is where this routine is to store an
19248 * inversion list of the code points that would be instead returned in
19249 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
19250 * when this parameter is used, is just the non-code point data that
19251 * will go into creating the inversion list. This currently should be just
19252 * user-defined properties whose definitions were not known at compile
19253 * time. Using this parameter allows for easier manipulation of the
19254 * inversion list's data by the caller. It is illegal to call this
19255 * function with this parameter set, but not <listsvp>
19257 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
19258 * that, in spite of this function's name, the inversion list it returns
19259 * may include the bitmap data as well */
19261 SV *si = NULL; /* Input initialization string */
19262 SV* invlist = NULL;
19264 RXi_GET_DECL(prog, progi);
19265 const struct reg_data * const data = prog ? progi->data : NULL;
19267 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
19268 assert(! output_invlist || listsvp);
19270 if (data && data->count) {
19271 const U32 n = ARG(node);
19273 if (data->what[n] == 's') {
19274 SV * const rv = MUTABLE_SV(data->data[n]);
19275 AV * const av = MUTABLE_AV(SvRV(rv));
19276 SV **const ary = AvARRAY(av);
19278 invlist = ary[INVLIST_INDEX];
19280 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
19281 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
19284 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
19285 si = ary[DEFERRED_USER_DEFINED_INDEX];
19288 if (doinit && (si || invlist)) {
19291 SV * msg = newSVpvs_flags("", SVs_TEMP);
19293 SV * prop_definition = handle_user_defined_property(
19294 "", 0, FALSE, /* There is no \p{}, \P{} */
19295 SvPVX_const(si)[1] - '0', /* /i or not has been
19296 stored here for just
19298 TRUE, /* run time */
19299 FALSE, /* This call must find the defn */
19300 si, /* The property definition */
19303 0 /* base level call */
19307 assert(prop_definition == NULL);
19309 Perl_croak(aTHX_ "%" UTF8f,
19310 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
19314 _invlist_union(invlist, prop_definition, &invlist);
19315 SvREFCNT_dec_NN(prop_definition);
19318 invlist = prop_definition;
19321 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
19322 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
19324 av_store(av, INVLIST_INDEX, invlist);
19325 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
19326 ? ONLY_LOCALE_MATCHES_INDEX:
19334 /* If requested, return a printable version of what this ANYOF node matches
19337 SV* matches_string = NULL;
19339 /* This function can be called at compile-time, before everything gets
19340 * resolved, in which case we return the currently best available
19341 * information, which is the string that will eventually be used to do
19342 * that resolving, 'si' */
19344 /* Here, we only have 'si' (and possibly some passed-in data in
19345 * 'invlist', which is handled below) If the caller only wants
19346 * 'si', use that. */
19347 if (! output_invlist) {
19348 matches_string = newSVsv(si);
19351 /* But if the caller wants an inversion list of the node, we
19352 * need to parse 'si' and place as much as possible in the
19353 * desired output inversion list, making 'matches_string' only
19354 * contain the currently unresolvable things */
19355 const char *si_string = SvPVX(si);
19356 STRLEN remaining = SvCUR(si);
19360 /* Ignore everything before the first new-line */
19361 while (*si_string != '\n' && remaining > 0) {
19365 assert(remaining > 0);
19370 while (remaining > 0) {
19372 /* The data consists of just strings defining user-defined
19373 * property names, but in prior incarnations, and perhaps
19374 * somehow from pluggable regex engines, it could still
19375 * hold hex code point definitions. Each component of a
19376 * range would be separated by a tab, and each range by a
19377 * new-line. If these are found, instead add them to the
19378 * inversion list */
19379 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
19380 |PERL_SCAN_SILENT_NON_PORTABLE;
19381 STRLEN len = remaining;
19382 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
19384 /* If the hex decode routine found something, it should go
19385 * up to the next \n */
19386 if ( *(si_string + len) == '\n') {
19387 if (count) { /* 2nd code point on line */
19388 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
19391 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
19394 goto prepare_for_next_iteration;
19397 /* If the hex decode was instead for the lower range limit,
19398 * save it, and go parse the upper range limit */
19399 if (*(si_string + len) == '\t') {
19400 assert(count == 0);
19404 prepare_for_next_iteration:
19405 si_string += len + 1;
19406 remaining -= len + 1;
19410 /* Here, didn't find a legal hex number. Just add it from
19411 * here to the next \n */
19414 while (*(si_string + len) != '\n' && remaining > 0) {
19418 if (*(si_string + len) == '\n') {
19422 if (matches_string) {
19423 sv_catpvn(matches_string, si_string, len - 1);
19426 matches_string = newSVpvn(si_string, len - 1);
19429 sv_catpvs(matches_string, " ");
19430 } /* end of loop through the text */
19432 assert(matches_string);
19433 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
19434 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
19436 } /* end of has an 'si' */
19439 /* Add the stuff that's already known */
19442 /* Again, if the caller doesn't want the output inversion list, put
19443 * everything in 'matches-string' */
19444 if (! output_invlist) {
19445 if ( ! matches_string) {
19446 matches_string = newSVpvs("\n");
19448 sv_catsv(matches_string, invlist_contents(invlist,
19449 TRUE /* traditional style */
19452 else if (! *output_invlist) {
19453 *output_invlist = invlist_clone(invlist, NULL);
19456 _invlist_union(*output_invlist, invlist, output_invlist);
19460 *listsvp = matches_string;
19465 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
19467 /* reg_skipcomment()
19469 Absorbs an /x style # comment from the input stream,
19470 returning a pointer to the first character beyond the comment, or if the
19471 comment terminates the pattern without anything following it, this returns
19472 one past the final character of the pattern (in other words, RExC_end) and
19473 sets the REG_RUN_ON_COMMENT_SEEN flag.
19475 Note it's the callers responsibility to ensure that we are
19476 actually in /x mode
19480 PERL_STATIC_INLINE char*
19481 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
19483 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
19487 while (p < RExC_end) {
19488 if (*(++p) == '\n') {
19493 /* we ran off the end of the pattern without ending the comment, so we have
19494 * to add an \n when wrapping */
19495 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
19500 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
19502 const bool force_to_xmod
19505 /* If the text at the current parse position '*p' is a '(?#...)' comment,
19506 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
19507 * is /x whitespace, advance '*p' so that on exit it points to the first
19508 * byte past all such white space and comments */
19510 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
19512 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
19514 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
19517 if (RExC_end - (*p) >= 3
19519 && *(*p + 1) == '?'
19520 && *(*p + 2) == '#')
19522 while (*(*p) != ')') {
19523 if ((*p) == RExC_end)
19524 FAIL("Sequence (?#... not terminated");
19532 const char * save_p = *p;
19533 while ((*p) < RExC_end) {
19535 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
19538 else if (*(*p) == '#') {
19539 (*p) = reg_skipcomment(pRExC_state, (*p));
19545 if (*p != save_p) {
19558 Advances the parse position by one byte, unless that byte is the beginning
19559 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
19560 those two cases, the parse position is advanced beyond all such comments and
19563 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
19567 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
19569 PERL_ARGS_ASSERT_NEXTCHAR;
19571 if (RExC_parse < RExC_end) {
19573 || UTF8_IS_INVARIANT(*RExC_parse)
19574 || UTF8_IS_START(*RExC_parse));
19576 RExC_parse += (UTF)
19577 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
19580 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
19581 FALSE /* Don't force /x */ );
19586 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
19588 /* 'size' is the delta to add or subtract from the current memory allocated
19589 * to the regex engine being constructed */
19591 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
19596 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
19597 /* +1 for REG_MAGIC */
19600 if ( RExC_rxi == NULL )
19601 FAIL("Regexp out of space");
19602 RXi_SET(RExC_rx, RExC_rxi);
19604 RExC_emit_start = RExC_rxi->program;
19606 Zero(REGNODE_p(RExC_emit), size, regnode);
19609 #ifdef RE_TRACK_PATTERN_OFFSETS
19610 Renew(RExC_offsets, 2*RExC_size+1, U32);
19612 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
19614 RExC_offsets[0] = RExC_size;
19618 STATIC regnode_offset
19619 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
19621 /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns
19622 * and increments RExC_size and RExC_emit
19624 * It returns the regnode's offset into the regex engine program */
19626 const regnode_offset ret = RExC_emit;
19628 GET_RE_DEBUG_FLAGS_DECL;
19630 PERL_ARGS_ASSERT_REGNODE_GUTS;
19632 SIZE_ALIGN(RExC_size);
19633 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
19634 NODE_ALIGN_FILL(REGNODE_p(ret));
19635 #ifndef RE_TRACK_PATTERN_OFFSETS
19636 PERL_UNUSED_ARG(name);
19637 PERL_UNUSED_ARG(op);
19639 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
19641 if (RExC_offsets) { /* MJD */
19643 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
19646 (UV)(RExC_emit) > RExC_offsets[0]
19647 ? "Overwriting end of array!\n" : "OK",
19649 (UV)(RExC_parse - RExC_start),
19650 (UV)RExC_offsets[0]));
19651 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
19658 - reg_node - emit a node
19660 STATIC regnode_offset /* Location. */
19661 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
19663 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
19664 regnode_offset ptr = ret;
19666 PERL_ARGS_ASSERT_REG_NODE;
19668 assert(regarglen[op] == 0);
19670 FILL_ADVANCE_NODE(ptr, op);
19676 - reganode - emit a node with an argument
19678 STATIC regnode_offset /* Location. */
19679 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
19681 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
19682 regnode_offset ptr = ret;
19684 PERL_ARGS_ASSERT_REGANODE;
19686 /* ANYOF are special cased to allow non-length 1 args */
19687 assert(regarglen[op] == 1);
19689 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
19694 STATIC regnode_offset
19695 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
19697 /* emit a node with U32 and I32 arguments */
19699 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
19700 regnode_offset ptr = ret;
19702 PERL_ARGS_ASSERT_REG2LANODE;
19704 assert(regarglen[op] == 2);
19706 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
19712 - reginsert - insert an operator in front of already-emitted operand
19714 * That means that on exit 'operand' is the offset of the newly inserted
19715 * operator, and the original operand has been relocated.
19717 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
19718 * set up NEXT_OFF() of the inserted node if needed. Something like this:
19720 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
19721 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
19723 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
19726 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
19727 const regnode_offset operand, const U32 depth)
19732 const int offset = regarglen[(U8)op];
19733 const int size = NODE_STEP_REGNODE + offset;
19734 GET_RE_DEBUG_FLAGS_DECL;
19736 PERL_ARGS_ASSERT_REGINSERT;
19737 PERL_UNUSED_CONTEXT;
19738 PERL_UNUSED_ARG(depth);
19739 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
19740 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
19741 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
19742 studying. If this is wrong then we need to adjust RExC_recurse
19743 below like we do with RExC_open_parens/RExC_close_parens. */
19744 change_engine_size(pRExC_state, (Ptrdiff_t) size);
19745 src = REGNODE_p(RExC_emit);
19747 dst = REGNODE_p(RExC_emit);
19749 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
19750 * and [perl #133871] shows this can lead to problems, so skip this
19751 * realignment of parens until a later pass when they are reliable */
19752 if (! IN_PARENS_PASS && RExC_open_parens) {
19754 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
19755 /* remember that RExC_npar is rex->nparens + 1,
19756 * iow it is 1 more than the number of parens seen in
19757 * the pattern so far. */
19758 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
19759 /* note, RExC_open_parens[0] is the start of the
19760 * regex, it can't move. RExC_close_parens[0] is the end
19761 * of the regex, it *can* move. */
19762 if ( paren && RExC_open_parens[paren] >= operand ) {
19763 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
19764 RExC_open_parens[paren] += size;
19766 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
19768 if ( RExC_close_parens[paren] >= operand ) {
19769 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
19770 RExC_close_parens[paren] += size;
19772 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
19777 RExC_end_op += size;
19779 while (src > REGNODE_p(operand)) {
19780 StructCopy(--src, --dst, regnode);
19781 #ifdef RE_TRACK_PATTERN_OFFSETS
19782 if (RExC_offsets) { /* MJD 20010112 */
19784 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
19788 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
19789 ? "Overwriting end of array!\n" : "OK",
19790 (UV)REGNODE_OFFSET(src),
19791 (UV)REGNODE_OFFSET(dst),
19792 (UV)RExC_offsets[0]));
19793 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
19794 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
19799 place = REGNODE_p(operand); /* Op node, where operand used to be. */
19800 #ifdef RE_TRACK_PATTERN_OFFSETS
19801 if (RExC_offsets) { /* MJD */
19803 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
19807 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
19808 ? "Overwriting end of array!\n" : "OK",
19809 (UV)REGNODE_OFFSET(place),
19810 (UV)(RExC_parse - RExC_start),
19811 (UV)RExC_offsets[0]));
19812 Set_Node_Offset(place, RExC_parse);
19813 Set_Node_Length(place, 1);
19816 src = NEXTOPER(place);
19818 FILL_NODE(operand, op);
19820 /* Zero out any arguments in the new node */
19821 Zero(src, offset, regnode);
19825 - regtail - set the next-pointer at the end of a node chain of p to val. If
19826 that value won't fit in the space available, instead returns FALSE.
19827 (Except asserts if we can't fit in the largest space the regex
19828 engine is designed for.)
19829 - SEE ALSO: regtail_study
19832 S_regtail(pTHX_ RExC_state_t * pRExC_state,
19833 const regnode_offset p,
19834 const regnode_offset val,
19837 regnode_offset scan;
19838 GET_RE_DEBUG_FLAGS_DECL;
19840 PERL_ARGS_ASSERT_REGTAIL;
19842 PERL_UNUSED_ARG(depth);
19845 /* Find last node. */
19846 scan = (regnode_offset) p;
19848 regnode * const temp = regnext(REGNODE_p(scan));
19850 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
19851 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19852 Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
19853 SvPV_nolen_const(RExC_mysv), scan,
19854 (temp == NULL ? "->" : ""),
19855 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
19860 scan = REGNODE_OFFSET(temp);
19863 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19864 assert((UV) (val - scan) <= U32_MAX);
19865 ARG_SET(REGNODE_p(scan), val - scan);
19868 if (val - scan > U16_MAX) {
19869 /* Since not all callers check the return value, populate this with
19870 * something that won't loop and will likely lead to a crash if
19871 * execution continues */
19872 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
19875 NEXT_OFF(REGNODE_p(scan)) = val - scan;
19883 - regtail_study - set the next-pointer at the end of a node chain of p to val.
19884 - Look for optimizable sequences at the same time.
19885 - currently only looks for EXACT chains.
19887 This is experimental code. The idea is to use this routine to perform
19888 in place optimizations on branches and groups as they are constructed,
19889 with the long term intention of removing optimization from study_chunk so
19890 that it is purely analytical.
19892 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
19893 to control which is which.
19895 This used to return a value that was ignored. It was a problem that it is
19896 #ifdef'd to be another function that didn't return a value. khw has changed it
19897 so both currently return a pass/fail return.
19900 /* TODO: All four parms should be const */
19903 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
19904 const regnode_offset val, U32 depth)
19906 regnode_offset scan;
19908 #ifdef EXPERIMENTAL_INPLACESCAN
19911 GET_RE_DEBUG_FLAGS_DECL;
19913 PERL_ARGS_ASSERT_REGTAIL_STUDY;
19916 /* Find last node. */
19920 regnode * const temp = regnext(REGNODE_p(scan));
19921 #ifdef EXPERIMENTAL_INPLACESCAN
19922 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
19923 bool unfolded_multi_char; /* Unexamined in this routine */
19924 if (join_exact(pRExC_state, scan, &min,
19925 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
19926 return TRUE; /* Was return EXACT */
19930 switch (OP(REGNODE_p(scan))) {
19935 case EXACTFU_S_EDGE:
19936 case EXACTFAA_NO_TRIE:
19939 case EXACTFU_ONLY8:
19943 if( exact == PSEUDO )
19944 exact= OP(REGNODE_p(scan));
19945 else if ( exact != OP(REGNODE_p(scan)) )
19954 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
19955 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
19956 Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
19957 SvPV_nolen_const(RExC_mysv),
19959 PL_reg_name[exact]);
19963 scan = REGNODE_OFFSET(temp);
19966 DEBUG_PARSE_MSG("");
19967 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
19968 Perl_re_printf( aTHX_
19969 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
19970 SvPV_nolen_const(RExC_mysv),
19975 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
19976 assert((UV) (val - scan) <= U32_MAX);
19977 ARG_SET(REGNODE_p(scan), val - scan);
19980 if (val - scan > U16_MAX) {
19981 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
19984 NEXT_OFF(REGNODE_p(scan)) = val - scan;
19987 return TRUE; /* Was 'return exact' */
19992 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
19994 /* Returns an inversion list of all the code points matched by the
19995 * ANYOFM/NANYOFM node 'n' */
19997 SV * cp_list = _new_invlist(-1);
19998 const U8 lowest = (U8) ARG(n);
20001 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20003 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20005 /* Starting with the lowest code point, any code point that ANDed with the
20006 * mask yields the lowest code point is in the set */
20007 for (i = lowest; i <= 0xFF; i++) {
20008 if ((i & FLAGS(n)) == ARG(n)) {
20009 cp_list = add_cp_to_invlist(cp_list, i);
20012 /* We know how many code points (a power of two) that are in the
20013 * set. No use looking once we've got that number */
20014 if (count >= needed) break;
20018 if (OP(n) == NANYOFM) {
20019 _invlist_invert(cp_list);
20025 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20030 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20035 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20037 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20038 if (flags & (1<<bit)) {
20039 if (!set++ && lead)
20040 Perl_re_printf( aTHX_ "%s", lead);
20041 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
20046 Perl_re_printf( aTHX_ "\n");
20048 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20053 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20059 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20061 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20062 if (flags & (1<<bit)) {
20063 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
20066 if (!set++ && lead)
20067 Perl_re_printf( aTHX_ "%s", lead);
20068 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
20071 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20072 if (!set++ && lead) {
20073 Perl_re_printf( aTHX_ "%s", lead);
20076 case REGEX_UNICODE_CHARSET:
20077 Perl_re_printf( aTHX_ "UNICODE");
20079 case REGEX_LOCALE_CHARSET:
20080 Perl_re_printf( aTHX_ "LOCALE");
20082 case REGEX_ASCII_RESTRICTED_CHARSET:
20083 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
20085 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20086 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
20089 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
20095 Perl_re_printf( aTHX_ "\n");
20097 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20103 Perl_regdump(pTHX_ const regexp *r)
20107 SV * const sv = sv_newmortal();
20108 SV *dsv= sv_newmortal();
20109 RXi_GET_DECL(r, ri);
20110 GET_RE_DEBUG_FLAGS_DECL;
20112 PERL_ARGS_ASSERT_REGDUMP;
20114 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20116 /* Header fields of interest. */
20117 for (i = 0; i < 2; i++) {
20118 if (r->substrs->data[i].substr) {
20119 RE_PV_QUOTED_DECL(s, 0, dsv,
20120 SvPVX_const(r->substrs->data[i].substr),
20121 RE_SV_DUMPLEN(r->substrs->data[i].substr),
20122 PL_dump_re_max_len);
20123 Perl_re_printf( aTHX_
20124 "%s %s%s at %" IVdf "..%" UVuf " ",
20125 i ? "floating" : "anchored",
20127 RE_SV_TAIL(r->substrs->data[i].substr),
20128 (IV)r->substrs->data[i].min_offset,
20129 (UV)r->substrs->data[i].max_offset);
20131 else if (r->substrs->data[i].utf8_substr) {
20132 RE_PV_QUOTED_DECL(s, 1, dsv,
20133 SvPVX_const(r->substrs->data[i].utf8_substr),
20134 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20136 Perl_re_printf( aTHX_
20137 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20138 i ? "floating" : "anchored",
20140 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20141 (IV)r->substrs->data[i].min_offset,
20142 (UV)r->substrs->data[i].max_offset);
20146 if (r->check_substr || r->check_utf8)
20147 Perl_re_printf( aTHX_
20149 ( r->check_substr == r->substrs->data[1].substr
20150 && r->check_utf8 == r->substrs->data[1].utf8_substr
20151 ? "(checking floating" : "(checking anchored"));
20152 if (r->intflags & PREGf_NOSCAN)
20153 Perl_re_printf( aTHX_ " noscan");
20154 if (r->extflags & RXf_CHECK_ALL)
20155 Perl_re_printf( aTHX_ " isall");
20156 if (r->check_substr || r->check_utf8)
20157 Perl_re_printf( aTHX_ ") ");
20159 if (ri->regstclass) {
20160 regprop(r, sv, ri->regstclass, NULL, NULL);
20161 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
20163 if (r->intflags & PREGf_ANCH) {
20164 Perl_re_printf( aTHX_ "anchored");
20165 if (r->intflags & PREGf_ANCH_MBOL)
20166 Perl_re_printf( aTHX_ "(MBOL)");
20167 if (r->intflags & PREGf_ANCH_SBOL)
20168 Perl_re_printf( aTHX_ "(SBOL)");
20169 if (r->intflags & PREGf_ANCH_GPOS)
20170 Perl_re_printf( aTHX_ "(GPOS)");
20171 Perl_re_printf( aTHX_ " ");
20173 if (r->intflags & PREGf_GPOS_SEEN)
20174 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
20175 if (r->intflags & PREGf_SKIP)
20176 Perl_re_printf( aTHX_ "plus ");
20177 if (r->intflags & PREGf_IMPLICIT)
20178 Perl_re_printf( aTHX_ "implicit ");
20179 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
20180 if (r->extflags & RXf_EVAL_SEEN)
20181 Perl_re_printf( aTHX_ "with eval ");
20182 Perl_re_printf( aTHX_ "\n");
20184 regdump_extflags("r->extflags: ", r->extflags);
20185 regdump_intflags("r->intflags: ", r->intflags);
20188 PERL_ARGS_ASSERT_REGDUMP;
20189 PERL_UNUSED_CONTEXT;
20190 PERL_UNUSED_ARG(r);
20191 #endif /* DEBUGGING */
20194 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
20197 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
20198 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
20199 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
20200 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
20201 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
20202 || _CC_VERTSPACE != 15
20203 # error Need to adjust order of anyofs[]
20205 static const char * const anyofs[] = {
20242 - regprop - printable representation of opcode, with run time support
20246 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
20251 RXi_GET_DECL(prog, progi);
20252 GET_RE_DEBUG_FLAGS_DECL;
20254 PERL_ARGS_ASSERT_REGPROP;
20258 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
20259 /* It would be nice to FAIL() here, but this may be called from
20260 regexec.c, and it would be hard to supply pRExC_state. */
20261 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
20262 (int)OP(o), (int)REGNODE_MAX);
20263 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
20265 k = PL_regkind[OP(o)];
20268 sv_catpvs(sv, " ");
20269 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
20270 * is a crude hack but it may be the best for now since
20271 * we have no flag "this EXACTish node was UTF-8"
20273 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
20274 PL_colors[0], PL_colors[1],
20275 PERL_PV_ESCAPE_UNI_DETECT |
20276 PERL_PV_ESCAPE_NONASCII |
20277 PERL_PV_PRETTY_ELLIPSES |
20278 PERL_PV_PRETTY_LTGT |
20279 PERL_PV_PRETTY_NOCLEAR
20281 } else if (k == TRIE) {
20282 /* print the details of the trie in dumpuntil instead, as
20283 * progi->data isn't available here */
20284 const char op = OP(o);
20285 const U32 n = ARG(o);
20286 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
20287 (reg_ac_data *)progi->data->data[n] :
20289 const reg_trie_data * const trie
20290 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
20292 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
20293 DEBUG_TRIE_COMPILE_r({
20295 sv_catpvs(sv, "(JUMP)");
20296 Perl_sv_catpvf(aTHX_ sv,
20297 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
20298 (UV)trie->startstate,
20299 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
20300 (UV)trie->wordcount,
20303 (UV)TRIE_CHARCOUNT(trie),
20304 (UV)trie->uniquecharcount
20307 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
20308 sv_catpvs(sv, "[");
20309 (void) put_charclass_bitmap_innards(sv,
20310 ((IS_ANYOF_TRIE(op))
20312 : TRIE_BITMAP(trie)),
20318 sv_catpvs(sv, "]");
20320 } else if (k == CURLY) {
20321 U32 lo = ARG1(o), hi = ARG2(o);
20322 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
20323 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
20324 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
20325 if (hi == REG_INFTY)
20326 sv_catpvs(sv, "INFTY");
20328 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
20329 sv_catpvs(sv, "}");
20331 else if (k == WHILEM && o->flags) /* Ordinal/of */
20332 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
20333 else if (k == REF || k == OPEN || k == CLOSE
20334 || k == GROUPP || OP(o)==ACCEPT)
20336 AV *name_list= NULL;
20337 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
20338 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
20339 if ( RXp_PAREN_NAMES(prog) ) {
20340 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20341 } else if ( pRExC_state ) {
20342 name_list= RExC_paren_name_list;
20345 if ( k != REF || (OP(o) < REFN)) {
20346 SV **name= av_fetch(name_list, parno, 0 );
20348 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20351 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
20352 I32 *nums=(I32*)SvPVX(sv_dat);
20353 SV **name= av_fetch(name_list, nums[0], 0 );
20356 for ( n=0; n<SvIVX(sv_dat); n++ ) {
20357 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
20358 (n ? "," : ""), (IV)nums[n]);
20360 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20364 if ( k == REF && reginfo) {
20365 U32 n = ARG(o); /* which paren pair */
20366 I32 ln = prog->offs[n].start;
20367 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
20368 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
20369 else if (ln == prog->offs[n].end)
20370 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
20372 const char *s = reginfo->strbeg + ln;
20373 Perl_sv_catpvf(aTHX_ sv, ": ");
20374 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
20375 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
20378 } else if (k == GOSUB) {
20379 AV *name_list= NULL;
20380 if ( RXp_PAREN_NAMES(prog) ) {
20381 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
20382 } else if ( pRExC_state ) {
20383 name_list= RExC_paren_name_list;
20386 /* Paren and offset */
20387 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
20388 (int)((o + (int)ARG2L(o)) - progi->program) );
20390 SV **name= av_fetch(name_list, ARG(o), 0 );
20392 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
20395 else if (k == LOGICAL)
20396 /* 2: embedded, otherwise 1 */
20397 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
20398 else if (k == ANYOF) {
20399 const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
20402 bool do_sep = FALSE; /* Do we need to separate various components of
20404 /* Set if there is still an unresolved user-defined property */
20405 SV *unresolved = NULL;
20407 /* Things that are ignored except when the runtime locale is UTF-8 */
20408 SV *only_utf8_locale_invlist = NULL;
20410 /* Code points that don't fit in the bitmap */
20411 SV *nonbitmap_invlist = NULL;
20413 /* And things that aren't in the bitmap, but are small enough to be */
20414 SV* bitmap_range_not_in_bitmap = NULL;
20416 const bool inverted = flags & ANYOF_INVERT;
20418 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
20419 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
20420 sv_catpvs(sv, "{utf8-locale-reqd}");
20422 if (flags & ANYOFL_FOLD) {
20423 sv_catpvs(sv, "{i}");
20427 /* If there is stuff outside the bitmap, get it */
20428 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
20429 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
20431 &only_utf8_locale_invlist,
20432 &nonbitmap_invlist);
20433 /* The non-bitmap data may contain stuff that could fit in the
20434 * bitmap. This could come from a user-defined property being
20435 * finally resolved when this call was done; or much more likely
20436 * because there are matches that require UTF-8 to be valid, and so
20437 * aren't in the bitmap. This is teased apart later */
20438 _invlist_intersection(nonbitmap_invlist,
20440 &bitmap_range_not_in_bitmap);
20441 /* Leave just the things that don't fit into the bitmap */
20442 _invlist_subtract(nonbitmap_invlist,
20444 &nonbitmap_invlist);
20447 /* Obey this flag to add all above-the-bitmap code points */
20448 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
20449 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
20450 NUM_ANYOF_CODE_POINTS,
20454 /* Ready to start outputting. First, the initial left bracket */
20455 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20457 if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
20458 /* Then all the things that could fit in the bitmap */
20459 do_sep = put_charclass_bitmap_innards(sv,
20461 bitmap_range_not_in_bitmap,
20462 only_utf8_locale_invlist,
20465 /* Can't try inverting for a
20466 * better display if there
20467 * are things that haven't
20469 unresolved != NULL);
20470 SvREFCNT_dec(bitmap_range_not_in_bitmap);
20472 /* If there are user-defined properties which haven't been defined
20473 * yet, output them. If the result is not to be inverted, it is
20474 * clearest to output them in a separate [] from the bitmap range
20475 * stuff. If the result is to be complemented, we have to show
20476 * everything in one [], as the inversion applies to the whole
20477 * thing. Use {braces} to separate them from anything in the
20478 * bitmap and anything above the bitmap. */
20481 if (! do_sep) { /* If didn't output anything in the bitmap
20483 sv_catpvs(sv, "^");
20485 sv_catpvs(sv, "{");
20488 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
20491 sv_catsv(sv, unresolved);
20493 sv_catpvs(sv, "}");
20495 do_sep = ! inverted;
20499 /* And, finally, add the above-the-bitmap stuff */
20500 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
20503 /* See if truncation size is overridden */
20504 const STRLEN dump_len = (PL_dump_re_max_len > 256)
20505 ? PL_dump_re_max_len
20508 /* This is output in a separate [] */
20510 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
20513 /* And, for easy of understanding, it is shown in the
20514 * uncomplemented form if possible. The one exception being if
20515 * there are unresolved items, where the inversion has to be
20516 * delayed until runtime */
20517 if (inverted && ! unresolved) {
20518 _invlist_invert(nonbitmap_invlist);
20519 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
20522 contents = invlist_contents(nonbitmap_invlist,
20523 FALSE /* output suitable for catsv */
20526 /* If the output is shorter than the permissible maximum, just do it. */
20527 if (SvCUR(contents) <= dump_len) {
20528 sv_catsv(sv, contents);
20531 const char * contents_string = SvPVX(contents);
20532 STRLEN i = dump_len;
20534 /* Otherwise, start at the permissible max and work back to the
20535 * first break possibility */
20536 while (i > 0 && contents_string[i] != ' ') {
20539 if (i == 0) { /* Fail-safe. Use the max if we couldn't
20540 find a legal break */
20544 sv_catpvn(sv, contents_string, i);
20545 sv_catpvs(sv, "...");
20548 SvREFCNT_dec_NN(contents);
20549 SvREFCNT_dec_NN(nonbitmap_invlist);
20552 /* And finally the matching, closing ']' */
20553 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20555 if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
20556 U8 lowest = (OP(o) != ANYOFHr)
20558 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
20559 U8 highest = (OP(o) == ANYOFHb)
20563 : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
20564 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
20565 if (lowest != highest) {
20566 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
20568 Perl_sv_catpvf(aTHX_ sv, ")");
20571 SvREFCNT_dec(unresolved);
20573 else if (k == ANYOFM) {
20574 SV * cp_list = get_ANYOFM_contents(o);
20576 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
20577 if (OP(o) == NANYOFM) {
20578 _invlist_invert(cp_list);
20581 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
20582 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
20584 SvREFCNT_dec(cp_list);
20586 else if (k == POSIXD || k == NPOSIXD) {
20587 U8 index = FLAGS(o) * 2;
20588 if (index < C_ARRAY_LENGTH(anyofs)) {
20589 if (*anyofs[index] != '[') {
20590 sv_catpvs(sv, "[");
20592 sv_catpv(sv, anyofs[index]);
20593 if (*anyofs[index] != '[') {
20594 sv_catpvs(sv, "]");
20598 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
20601 else if (k == BOUND || k == NBOUND) {
20602 /* Must be synced with order of 'bound_type' in regcomp.h */
20603 const char * const bounds[] = {
20604 "", /* Traditional */
20610 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
20611 sv_catpv(sv, bounds[FLAGS(o)]);
20613 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
20614 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
20616 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
20618 Perl_sv_catpvf(aTHX_ sv, "]");
20620 else if (OP(o) == SBOL)
20621 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
20623 /* add on the verb argument if there is one */
20624 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
20626 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
20627 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
20629 sv_catpvs(sv, ":NULL");
20632 PERL_UNUSED_CONTEXT;
20633 PERL_UNUSED_ARG(sv);
20634 PERL_UNUSED_ARG(o);
20635 PERL_UNUSED_ARG(prog);
20636 PERL_UNUSED_ARG(reginfo);
20637 PERL_UNUSED_ARG(pRExC_state);
20638 #endif /* DEBUGGING */
20644 Perl_re_intuit_string(pTHX_ REGEXP * const r)
20645 { /* Assume that RE_INTUIT is set */
20646 struct regexp *const prog = ReANY(r);
20647 GET_RE_DEBUG_FLAGS_DECL;
20649 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
20650 PERL_UNUSED_CONTEXT;
20654 const char * const s = SvPV_nolen_const(RX_UTF8(r)
20655 ? prog->check_utf8 : prog->check_substr);
20657 if (!PL_colorset) reginitcolors();
20658 Perl_re_printf( aTHX_
20659 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
20661 RX_UTF8(r) ? "utf8 " : "",
20662 PL_colors[5], PL_colors[0],
20665 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
20668 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
20669 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
20675 handles refcounting and freeing the perl core regexp structure. When
20676 it is necessary to actually free the structure the first thing it
20677 does is call the 'free' method of the regexp_engine associated to
20678 the regexp, allowing the handling of the void *pprivate; member
20679 first. (This routine is not overridable by extensions, which is why
20680 the extensions free is called first.)
20682 See regdupe and regdupe_internal if you change anything here.
20684 #ifndef PERL_IN_XSUB_RE
20686 Perl_pregfree(pTHX_ REGEXP *r)
20692 Perl_pregfree2(pTHX_ REGEXP *rx)
20694 struct regexp *const r = ReANY(rx);
20695 GET_RE_DEBUG_FLAGS_DECL;
20697 PERL_ARGS_ASSERT_PREGFREE2;
20702 if (r->mother_re) {
20703 ReREFCNT_dec(r->mother_re);
20705 CALLREGFREE_PVT(rx); /* free the private data */
20706 SvREFCNT_dec(RXp_PAREN_NAMES(r));
20710 for (i = 0; i < 2; i++) {
20711 SvREFCNT_dec(r->substrs->data[i].substr);
20712 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
20714 Safefree(r->substrs);
20716 RX_MATCH_COPY_FREE(rx);
20717 #ifdef PERL_ANY_COW
20718 SvREFCNT_dec(r->saved_copy);
20721 SvREFCNT_dec(r->qr_anoncv);
20722 if (r->recurse_locinput)
20723 Safefree(r->recurse_locinput);
20729 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
20730 except that dsv will be created if NULL.
20732 This function is used in two main ways. First to implement
20733 $r = qr/....; $s = $$r;
20735 Secondly, it is used as a hacky workaround to the structural issue of
20737 being stored in the regexp structure which is in turn stored in
20738 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
20739 could be PL_curpm in multiple contexts, and could require multiple
20740 result sets being associated with the pattern simultaneously, such
20741 as when doing a recursive match with (??{$qr})
20743 The solution is to make a lightweight copy of the regexp structure
20744 when a qr// is returned from the code executed by (??{$qr}) this
20745 lightweight copy doesn't actually own any of its data except for
20746 the starp/end and the actual regexp structure itself.
20752 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
20754 struct regexp *drx;
20755 struct regexp *const srx = ReANY(ssv);
20756 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
20758 PERL_ARGS_ASSERT_REG_TEMP_COPY;
20761 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
20763 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
20765 /* our only valid caller, sv_setsv_flags(), should have done
20766 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
20767 assert(!SvOOK(dsv));
20768 assert(!SvIsCOW(dsv));
20769 assert(!SvROK(dsv));
20771 if (SvPVX_const(dsv)) {
20773 Safefree(SvPVX(dsv));
20778 SvOK_off((SV *)dsv);
20781 /* For PVLVs, the head (sv_any) points to an XPVLV, while
20782 * the LV's xpvlenu_rx will point to a regexp body, which
20783 * we allocate here */
20784 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
20785 assert(!SvPVX(dsv));
20786 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
20787 temp->sv_any = NULL;
20788 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
20789 SvREFCNT_dec_NN(temp);
20790 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
20791 ing below will not set it. */
20792 SvCUR_set(dsv, SvCUR(ssv));
20795 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
20796 sv_force_normal(sv) is called. */
20800 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
20801 SvPV_set(dsv, RX_WRAPPED(ssv));
20802 /* We share the same string buffer as the original regexp, on which we
20803 hold a reference count, incremented when mother_re is set below.
20804 The string pointer is copied here, being part of the regexp struct.
20806 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
20807 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
20811 const I32 npar = srx->nparens+1;
20812 Newx(drx->offs, npar, regexp_paren_pair);
20813 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
20815 if (srx->substrs) {
20817 Newx(drx->substrs, 1, struct reg_substr_data);
20818 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
20820 for (i = 0; i < 2; i++) {
20821 SvREFCNT_inc_void(drx->substrs->data[i].substr);
20822 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
20825 /* check_substr and check_utf8, if non-NULL, point to either their
20826 anchored or float namesakes, and don't hold a second reference. */
20828 RX_MATCH_COPIED_off(dsv);
20829 #ifdef PERL_ANY_COW
20830 drx->saved_copy = NULL;
20832 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
20833 SvREFCNT_inc_void(drx->qr_anoncv);
20834 if (srx->recurse_locinput)
20835 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
20842 /* regfree_internal()
20844 Free the private data in a regexp. This is overloadable by
20845 extensions. Perl takes care of the regexp structure in pregfree(),
20846 this covers the *pprivate pointer which technically perl doesn't
20847 know about, however of course we have to handle the
20848 regexp_internal structure when no extension is in use.
20850 Note this is called before freeing anything in the regexp
20855 Perl_regfree_internal(pTHX_ REGEXP * const rx)
20857 struct regexp *const r = ReANY(rx);
20858 RXi_GET_DECL(r, ri);
20859 GET_RE_DEBUG_FLAGS_DECL;
20861 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
20871 SV *dsv= sv_newmortal();
20872 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
20873 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
20874 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
20875 PL_colors[4], PL_colors[5], s);
20879 #ifdef RE_TRACK_PATTERN_OFFSETS
20881 Safefree(ri->u.offsets); /* 20010421 MJD */
20883 if (ri->code_blocks)
20884 S_free_codeblocks(aTHX_ ri->code_blocks);
20887 int n = ri->data->count;
20890 /* If you add a ->what type here, update the comment in regcomp.h */
20891 switch (ri->data->what[n]) {
20897 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
20900 Safefree(ri->data->data[n]);
20906 { /* Aho Corasick add-on structure for a trie node.
20907 Used in stclass optimization only */
20909 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
20910 #ifdef USE_ITHREADS
20914 refcount = --aho->refcount;
20917 PerlMemShared_free(aho->states);
20918 PerlMemShared_free(aho->fail);
20919 /* do this last!!!! */
20920 PerlMemShared_free(ri->data->data[n]);
20921 /* we should only ever get called once, so
20922 * assert as much, and also guard the free
20923 * which /might/ happen twice. At the least
20924 * it will make code anlyzers happy and it
20925 * doesn't cost much. - Yves */
20926 assert(ri->regstclass);
20927 if (ri->regstclass) {
20928 PerlMemShared_free(ri->regstclass);
20929 ri->regstclass = 0;
20936 /* trie structure. */
20938 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
20939 #ifdef USE_ITHREADS
20943 refcount = --trie->refcount;
20946 PerlMemShared_free(trie->charmap);
20947 PerlMemShared_free(trie->states);
20948 PerlMemShared_free(trie->trans);
20950 PerlMemShared_free(trie->bitmap);
20952 PerlMemShared_free(trie->jump);
20953 PerlMemShared_free(trie->wordinfo);
20954 /* do this last!!!! */
20955 PerlMemShared_free(ri->data->data[n]);
20960 Perl_croak(aTHX_ "panic: regfree data code '%c'",
20961 ri->data->what[n]);
20964 Safefree(ri->data->what);
20965 Safefree(ri->data);
20971 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
20972 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
20973 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
20976 re_dup_guts - duplicate a regexp.
20978 This routine is expected to clone a given regexp structure. It is only
20979 compiled under USE_ITHREADS.
20981 After all of the core data stored in struct regexp is duplicated
20982 the regexp_engine.dupe method is used to copy any private data
20983 stored in the *pprivate pointer. This allows extensions to handle
20984 any duplication it needs to do.
20986 See pregfree() and regfree_internal() if you change anything here.
20988 #if defined(USE_ITHREADS)
20989 #ifndef PERL_IN_XSUB_RE
20991 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
20995 const struct regexp *r = ReANY(sstr);
20996 struct regexp *ret = ReANY(dstr);
20998 PERL_ARGS_ASSERT_RE_DUP_GUTS;
21000 npar = r->nparens+1;
21001 Newx(ret->offs, npar, regexp_paren_pair);
21002 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21004 if (ret->substrs) {
21005 /* Do it this way to avoid reading from *r after the StructCopy().
21006 That way, if any of the sv_dup_inc()s dislodge *r from the L1
21007 cache, it doesn't matter. */
21009 const bool anchored = r->check_substr
21010 ? r->check_substr == r->substrs->data[0].substr
21011 : r->check_utf8 == r->substrs->data[0].utf8_substr;
21012 Newx(ret->substrs, 1, struct reg_substr_data);
21013 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21015 for (i = 0; i < 2; i++) {
21016 ret->substrs->data[i].substr =
21017 sv_dup_inc(ret->substrs->data[i].substr, param);
21018 ret->substrs->data[i].utf8_substr =
21019 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21022 /* check_substr and check_utf8, if non-NULL, point to either their
21023 anchored or float namesakes, and don't hold a second reference. */
21025 if (ret->check_substr) {
21027 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21029 ret->check_substr = ret->substrs->data[0].substr;
21030 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21032 assert(r->check_substr == r->substrs->data[1].substr);
21033 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
21035 ret->check_substr = ret->substrs->data[1].substr;
21036 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21038 } else if (ret->check_utf8) {
21040 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21042 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21047 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21048 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21049 if (r->recurse_locinput)
21050 Newx(ret->recurse_locinput, r->nparens + 1, char *);
21053 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21055 if (RX_MATCH_COPIED(dstr))
21056 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
21058 ret->subbeg = NULL;
21059 #ifdef PERL_ANY_COW
21060 ret->saved_copy = NULL;
21063 /* Whether mother_re be set or no, we need to copy the string. We
21064 cannot refrain from copying it when the storage points directly to
21065 our mother regexp, because that's
21066 1: a buffer in a different thread
21067 2: something we no longer hold a reference on
21068 so we need to copy it locally. */
21069 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21070 /* set malloced length to a non-zero value so it will be freed
21071 * (otherwise in combination with SVf_FAKE it looks like an alien
21072 * buffer). It doesn't have to be the actual malloced size, since it
21073 * should never be grown */
21074 SvLEN_set(dstr, SvCUR(sstr)+1);
21075 ret->mother_re = NULL;
21077 #endif /* PERL_IN_XSUB_RE */
21082 This is the internal complement to regdupe() which is used to copy
21083 the structure pointed to by the *pprivate pointer in the regexp.
21084 This is the core version of the extension overridable cloning hook.
21085 The regexp structure being duplicated will be copied by perl prior
21086 to this and will be provided as the regexp *r argument, however
21087 with the /old/ structures pprivate pointer value. Thus this routine
21088 may override any copying normally done by perl.
21090 It returns a pointer to the new regexp_internal structure.
21094 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21097 struct regexp *const r = ReANY(rx);
21098 regexp_internal *reti;
21100 RXi_GET_DECL(r, ri);
21102 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21106 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
21107 char, regexp_internal);
21108 Copy(ri->program, reti->program, len+1, regnode);
21111 if (ri->code_blocks) {
21113 Newx(reti->code_blocks, 1, struct reg_code_blocks);
21114 Newx(reti->code_blocks->cb, ri->code_blocks->count,
21115 struct reg_code_block);
21116 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
21117 ri->code_blocks->count, struct reg_code_block);
21118 for (n = 0; n < ri->code_blocks->count; n++)
21119 reti->code_blocks->cb[n].src_regex = (REGEXP*)
21120 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
21121 reti->code_blocks->count = ri->code_blocks->count;
21122 reti->code_blocks->refcnt = 1;
21125 reti->code_blocks = NULL;
21127 reti->regstclass = NULL;
21130 struct reg_data *d;
21131 const int count = ri->data->count;
21134 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
21135 char, struct reg_data);
21136 Newx(d->what, count, U8);
21139 for (i = 0; i < count; i++) {
21140 d->what[i] = ri->data->what[i];
21141 switch (d->what[i]) {
21142 /* see also regcomp.h and regfree_internal() */
21143 case 'a': /* actually an AV, but the dup function is identical.
21144 values seem to be "plain sv's" generally. */
21145 case 'r': /* a compiled regex (but still just another SV) */
21146 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
21147 this use case should go away, the code could have used
21148 'a' instead - see S_set_ANYOF_arg() for array contents. */
21149 case 'S': /* actually an SV, but the dup function is identical. */
21150 case 'u': /* actually an HV, but the dup function is identical.
21151 values are "plain sv's" */
21152 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
21155 /* Synthetic Start Class - "Fake" charclass we generate to optimize
21156 * patterns which could start with several different things. Pre-TRIE
21157 * this was more important than it is now, however this still helps
21158 * in some places, for instance /x?a+/ might produce a SSC equivalent
21159 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
21162 /* This is cheating. */
21163 Newx(d->data[i], 1, regnode_ssc);
21164 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
21165 reti->regstclass = (regnode*)d->data[i];
21168 /* AHO-CORASICK fail table */
21169 /* Trie stclasses are readonly and can thus be shared
21170 * without duplication. We free the stclass in pregfree
21171 * when the corresponding reg_ac_data struct is freed.
21173 reti->regstclass= ri->regstclass;
21176 /* TRIE transition table */
21178 ((reg_trie_data*)ri->data->data[i])->refcount++;
21181 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
21182 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
21183 is not from another regexp */
21184 d->data[i] = ri->data->data[i];
21187 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
21188 ri->data->what[i]);
21197 reti->name_list_idx = ri->name_list_idx;
21199 #ifdef RE_TRACK_PATTERN_OFFSETS
21200 if (ri->u.offsets) {
21201 Newx(reti->u.offsets, 2*len+1, U32);
21202 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
21205 SetProgLen(reti, len);
21208 return (void*)reti;
21211 #endif /* USE_ITHREADS */
21213 #ifndef PERL_IN_XSUB_RE
21216 - regnext - dig the "next" pointer out of a node
21219 Perl_regnext(pTHX_ regnode *p)
21226 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
21227 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
21228 (int)OP(p), (int)REGNODE_MAX);
21231 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
21241 S_re_croak2(pTHX_ bool utf8, const char* pat1, const char* pat2,...)
21244 STRLEN l1 = strlen(pat1);
21245 STRLEN l2 = strlen(pat2);
21248 const char *message;
21250 PERL_ARGS_ASSERT_RE_CROAK2;
21256 Copy(pat1, buf, l1 , char);
21257 Copy(pat2, buf + l1, l2 , char);
21258 buf[l1 + l2] = '\n';
21259 buf[l1 + l2 + 1] = '\0';
21260 va_start(args, pat2);
21261 msv = vmess(buf, &args);
21263 message = SvPV_const(msv, l1);
21266 Copy(message, buf, l1 , char);
21267 /* l1-1 to avoid \n */
21268 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf));
21271 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
21273 #ifndef PERL_IN_XSUB_RE
21275 Perl_save_re_context(pTHX)
21280 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
21283 const REGEXP * const rx = PM_GETRE(PL_curpm);
21285 nparens = RX_NPARENS(rx);
21288 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
21289 * that PL_curpm will be null, but that utf8.pm and the modules it
21290 * loads will only use $1..$3.
21291 * The t/porting/re_context.t test file checks this assumption.
21296 for (i = 1; i <= nparens; i++) {
21297 char digits[TYPE_CHARS(long)];
21298 const STRLEN len = my_snprintf(digits, sizeof(digits),
21300 GV *const *const gvp
21301 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
21304 GV * const gv = *gvp;
21305 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
21315 S_put_code_point(pTHX_ SV *sv, UV c)
21317 PERL_ARGS_ASSERT_PUT_CODE_POINT;
21320 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
21322 else if (isPRINT(c)) {
21323 const char string = (char) c;
21325 /* We use {phrase} as metanotation in the class, so also escape literal
21327 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
21328 sv_catpvs(sv, "\\");
21329 sv_catpvn(sv, &string, 1);
21331 else if (isMNEMONIC_CNTRL(c)) {
21332 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
21335 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
21339 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
21342 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
21344 /* Appends to 'sv' a displayable version of the range of code points from
21345 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
21346 * that have them, when they occur at the beginning or end of the range.
21347 * It uses hex to output the remaining code points, unless 'allow_literals'
21348 * is true, in which case the printable ASCII ones are output as-is (though
21349 * some of these will be escaped by put_code_point()).
21351 * NOTE: This is designed only for printing ranges of code points that fit
21352 * inside an ANYOF bitmap. Higher code points are simply suppressed
21355 const unsigned int min_range_count = 3;
21357 assert(start <= end);
21359 PERL_ARGS_ASSERT_PUT_RANGE;
21361 while (start <= end) {
21363 const char * format;
21365 if (end - start < min_range_count) {
21367 /* Output chars individually when they occur in short ranges */
21368 for (; start <= end; start++) {
21369 put_code_point(sv, start);
21374 /* If permitted by the input options, and there is a possibility that
21375 * this range contains a printable literal, look to see if there is
21377 if (allow_literals && start <= MAX_PRINT_A) {
21379 /* If the character at the beginning of the range isn't an ASCII
21380 * printable, effectively split the range into two parts:
21381 * 1) the portion before the first such printable,
21383 * and output them separately. */
21384 if (! isPRINT_A(start)) {
21385 UV temp_end = start + 1;
21387 /* There is no point looking beyond the final possible
21388 * printable, in MAX_PRINT_A */
21389 UV max = MIN(end, MAX_PRINT_A);
21391 while (temp_end <= max && ! isPRINT_A(temp_end)) {
21395 /* Here, temp_end points to one beyond the first printable if
21396 * found, or to one beyond 'max' if not. If none found, make
21397 * sure that we use the entire range */
21398 if (temp_end > MAX_PRINT_A) {
21399 temp_end = end + 1;
21402 /* Output the first part of the split range: the part that
21403 * doesn't have printables, with the parameter set to not look
21404 * for literals (otherwise we would infinitely recurse) */
21405 put_range(sv, start, temp_end - 1, FALSE);
21407 /* The 2nd part of the range (if any) starts here. */
21410 /* We do a continue, instead of dropping down, because even if
21411 * the 2nd part is non-empty, it could be so short that we want
21412 * to output it as individual characters, as tested for at the
21413 * top of this loop. */
21417 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
21418 * output a sub-range of just the digits or letters, then process
21419 * the remaining portion as usual. */
21420 if (isALPHANUMERIC_A(start)) {
21421 UV mask = (isDIGIT_A(start))
21426 UV temp_end = start + 1;
21428 /* Find the end of the sub-range that includes just the
21429 * characters in the same class as the first character in it */
21430 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
21435 /* For short ranges, don't duplicate the code above to output
21436 * them; just call recursively */
21437 if (temp_end - start < min_range_count) {
21438 put_range(sv, start, temp_end, FALSE);
21440 else { /* Output as a range */
21441 put_code_point(sv, start);
21442 sv_catpvs(sv, "-");
21443 put_code_point(sv, temp_end);
21445 start = temp_end + 1;
21449 /* We output any other printables as individual characters */
21450 if (isPUNCT_A(start) || isSPACE_A(start)) {
21451 while (start <= end && (isPUNCT_A(start)
21452 || isSPACE_A(start)))
21454 put_code_point(sv, start);
21459 } /* End of looking for literals */
21461 /* Here is not to output as a literal. Some control characters have
21462 * mnemonic names. Split off any of those at the beginning and end of
21463 * the range to print mnemonically. It isn't possible for many of
21464 * these to be in a row, so this won't overwhelm with output */
21466 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
21468 while (isMNEMONIC_CNTRL(start) && start <= end) {
21469 put_code_point(sv, start);
21473 /* If this didn't take care of the whole range ... */
21474 if (start <= end) {
21476 /* Look backwards from the end to find the final non-mnemonic
21479 while (isMNEMONIC_CNTRL(temp_end)) {
21483 /* And separately output the interior range that doesn't start
21484 * or end with mnemonics */
21485 put_range(sv, start, temp_end, FALSE);
21487 /* Then output the mnemonic trailing controls */
21488 start = temp_end + 1;
21489 while (start <= end) {
21490 put_code_point(sv, start);
21497 /* As a final resort, output the range or subrange as hex. */
21499 this_end = (end < NUM_ANYOF_CODE_POINTS)
21501 : NUM_ANYOF_CODE_POINTS - 1;
21502 #if NUM_ANYOF_CODE_POINTS > 256
21503 format = (this_end < 256)
21504 ? "\\x%02" UVXf "-\\x%02" UVXf
21505 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
21507 format = "\\x%02" UVXf "-\\x%02" UVXf;
21509 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
21510 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
21511 GCC_DIAG_RESTORE_STMT;
21517 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
21519 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
21523 bool allow_literals = TRUE;
21525 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
21527 /* Generally, it is more readable if printable characters are output as
21528 * literals, but if a range (nearly) spans all of them, it's best to output
21529 * it as a single range. This code will use a single range if all but 2
21530 * ASCII printables are in it */
21531 invlist_iterinit(invlist);
21532 while (invlist_iternext(invlist, &start, &end)) {
21534 /* If the range starts beyond the final printable, it doesn't have any
21536 if (start > MAX_PRINT_A) {
21540 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
21541 * all but two, the range must start and end no later than 2 from
21543 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
21544 if (end > MAX_PRINT_A) {
21550 if (end - start >= MAX_PRINT_A - ' ' - 2) {
21551 allow_literals = FALSE;
21556 invlist_iterfinish(invlist);
21558 /* Here we have figured things out. Output each range */
21559 invlist_iterinit(invlist);
21560 while (invlist_iternext(invlist, &start, &end)) {
21561 if (start >= NUM_ANYOF_CODE_POINTS) {
21564 put_range(sv, start, end, allow_literals);
21566 invlist_iterfinish(invlist);
21572 S_put_charclass_bitmap_innards_common(pTHX_
21573 SV* invlist, /* The bitmap */
21574 SV* posixes, /* Under /l, things like [:word:], \S */
21575 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
21576 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
21577 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
21578 const bool invert /* Is the result to be inverted? */
21581 /* Create and return an SV containing a displayable version of the bitmap
21582 * and associated information determined by the input parameters. If the
21583 * output would have been only the inversion indicator '^', NULL is instead
21589 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
21592 output = newSVpvs("^");
21595 output = newSVpvs("");
21598 /* First, the code points in the bitmap that are unconditionally there */
21599 put_charclass_bitmap_innards_invlist(output, invlist);
21601 /* Traditionally, these have been placed after the main code points */
21603 sv_catsv(output, posixes);
21606 if (only_utf8 && _invlist_len(only_utf8)) {
21607 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
21608 put_charclass_bitmap_innards_invlist(output, only_utf8);
21611 if (not_utf8 && _invlist_len(not_utf8)) {
21612 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
21613 put_charclass_bitmap_innards_invlist(output, not_utf8);
21616 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
21617 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
21618 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
21620 /* This is the only list in this routine that can legally contain code
21621 * points outside the bitmap range. The call just above to
21622 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
21623 * output them here. There's about a half-dozen possible, and none in
21624 * contiguous ranges longer than 2 */
21625 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21627 SV* above_bitmap = NULL;
21629 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
21631 invlist_iterinit(above_bitmap);
21632 while (invlist_iternext(above_bitmap, &start, &end)) {
21635 for (i = start; i <= end; i++) {
21636 put_code_point(output, i);
21639 invlist_iterfinish(above_bitmap);
21640 SvREFCNT_dec_NN(above_bitmap);
21644 if (invert && SvCUR(output) == 1) {
21652 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
21654 SV *nonbitmap_invlist,
21655 SV *only_utf8_locale_invlist,
21656 const regnode * const node,
21657 const bool force_as_is_display)
21659 /* Appends to 'sv' a displayable version of the innards of the bracketed
21660 * character class defined by the other arguments:
21661 * 'bitmap' points to the bitmap, or NULL if to ignore that.
21662 * 'nonbitmap_invlist' is an inversion list of the code points that are in
21663 * the bitmap range, but for some reason aren't in the bitmap; NULL if
21664 * none. The reasons for this could be that they require some
21665 * condition such as the target string being or not being in UTF-8
21666 * (under /d), or because they came from a user-defined property that
21667 * was not resolved at the time of the regex compilation (under /u)
21668 * 'only_utf8_locale_invlist' is an inversion list of the code points that
21669 * are valid only if the runtime locale is a UTF-8 one; NULL if none
21670 * 'node' is the regex pattern ANYOF node. It is needed only when the
21671 * above two parameters are not null, and is passed so that this
21672 * routine can tease apart the various reasons for them.
21673 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
21674 * to invert things to see if that leads to a cleaner display. If
21675 * FALSE, this routine is free to use its judgment about doing this.
21677 * It returns TRUE if there was actually something output. (It may be that
21678 * the bitmap, etc is empty.)
21680 * When called for outputting the bitmap of a non-ANYOF node, just pass the
21681 * bitmap, with the succeeding parameters set to NULL, and the final one to
21685 /* In general, it tries to display the 'cleanest' representation of the
21686 * innards, choosing whether to display them inverted or not, regardless of
21687 * whether the class itself is to be inverted. However, there are some
21688 * cases where it can't try inverting, as what actually matches isn't known
21689 * until runtime, and hence the inversion isn't either. */
21692 bool inverting_allowed = ! force_as_is_display;
21695 STRLEN orig_sv_cur = SvCUR(sv);
21697 SV* invlist; /* Inversion list we accumulate of code points that
21698 are unconditionally matched */
21699 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
21701 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
21703 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
21704 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
21707 SV* as_is_display; /* The output string when we take the inputs
21709 SV* inverted_display; /* The output string when we invert the inputs */
21711 U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
21713 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
21715 /* We are biased in favor of displaying things without them being inverted,
21716 * as that is generally easier to understand */
21717 const int bias = 5;
21719 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
21721 /* Start off with whatever code points are passed in. (We clone, so we
21722 * don't change the caller's list) */
21723 if (nonbitmap_invlist) {
21724 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
21725 invlist = invlist_clone(nonbitmap_invlist, NULL);
21727 else { /* Worst case size is every other code point is matched */
21728 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
21732 if (OP(node) == ANYOFD) {
21734 /* This flag indicates that the code points below 0x100 in the
21735 * nonbitmap list are precisely the ones that match only when the
21736 * target is UTF-8 (they should all be non-ASCII). */
21737 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
21739 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
21740 _invlist_subtract(invlist, only_utf8, &invlist);
21743 /* And this flag for matching all non-ASCII 0xFF and below */
21744 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
21746 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
21749 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
21751 /* If either of these flags are set, what matches isn't
21752 * determinable except during execution, so don't know enough here
21754 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
21755 inverting_allowed = FALSE;
21758 /* What the posix classes match also varies at runtime, so these
21759 * will be output symbolically. */
21760 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
21763 posixes = newSVpvs("");
21764 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
21765 if (ANYOF_POSIXL_TEST(node, i)) {
21766 sv_catpv(posixes, anyofs[i]);
21773 /* Accumulate the bit map into the unconditional match list */
21775 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
21776 if (BITMAP_TEST(bitmap, i)) {
21779 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
21782 invlist = _add_range_to_invlist(invlist, start, i-1);
21787 /* Make sure that the conditional match lists don't have anything in them
21788 * that match unconditionally; otherwise the output is quite confusing.
21789 * This could happen if the code that populates these misses some
21792 _invlist_subtract(only_utf8, invlist, &only_utf8);
21795 _invlist_subtract(not_utf8, invlist, ¬_utf8);
21798 if (only_utf8_locale_invlist) {
21800 /* Since this list is passed in, we have to make a copy before
21802 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
21804 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
21806 /* And, it can get really weird for us to try outputting an inverted
21807 * form of this list when it has things above the bitmap, so don't even
21809 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
21810 inverting_allowed = FALSE;
21814 /* Calculate what the output would be if we take the input as-is */
21815 as_is_display = put_charclass_bitmap_innards_common(invlist,
21822 /* If have to take the output as-is, just do that */
21823 if (! inverting_allowed) {
21824 if (as_is_display) {
21825 sv_catsv(sv, as_is_display);
21826 SvREFCNT_dec_NN(as_is_display);
21829 else { /* But otherwise, create the output again on the inverted input, and
21830 use whichever version is shorter */
21832 int inverted_bias, as_is_bias;
21834 /* We will apply our bias to whichever of the the results doesn't have
21844 inverted_bias = bias;
21847 /* Now invert each of the lists that contribute to the output,
21848 * excluding from the result things outside the possible range */
21850 /* For the unconditional inversion list, we have to add in all the
21851 * conditional code points, so that when inverted, they will be gone
21853 _invlist_union(only_utf8, invlist, &invlist);
21854 _invlist_union(not_utf8, invlist, &invlist);
21855 _invlist_union(only_utf8_locale, invlist, &invlist);
21856 _invlist_invert(invlist);
21857 _invlist_intersection(invlist, PL_InBitmap, &invlist);
21860 _invlist_invert(only_utf8);
21861 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
21863 else if (not_utf8) {
21865 /* If a code point matches iff the target string is not in UTF-8,
21866 * then complementing the result has it not match iff not in UTF-8,
21867 * which is the same thing as matching iff it is UTF-8. */
21868 only_utf8 = not_utf8;
21872 if (only_utf8_locale) {
21873 _invlist_invert(only_utf8_locale);
21874 _invlist_intersection(only_utf8_locale,
21876 &only_utf8_locale);
21879 inverted_display = put_charclass_bitmap_innards_common(
21884 only_utf8_locale, invert);
21886 /* Use the shortest representation, taking into account our bias
21887 * against showing it inverted */
21888 if ( inverted_display
21889 && ( ! as_is_display
21890 || ( SvCUR(inverted_display) + inverted_bias
21891 < SvCUR(as_is_display) + as_is_bias)))
21893 sv_catsv(sv, inverted_display);
21895 else if (as_is_display) {
21896 sv_catsv(sv, as_is_display);
21899 SvREFCNT_dec(as_is_display);
21900 SvREFCNT_dec(inverted_display);
21903 SvREFCNT_dec_NN(invlist);
21904 SvREFCNT_dec(only_utf8);
21905 SvREFCNT_dec(not_utf8);
21906 SvREFCNT_dec(posixes);
21907 SvREFCNT_dec(only_utf8_locale);
21909 return SvCUR(sv) > orig_sv_cur;
21912 #define CLEAR_OPTSTART \
21913 if (optstart) STMT_START { \
21914 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
21915 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
21919 #define DUMPUNTIL(b,e) \
21921 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
21923 STATIC const regnode *
21924 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
21925 const regnode *last, const regnode *plast,
21926 SV* sv, I32 indent, U32 depth)
21928 U8 op = PSEUDO; /* Arbitrary non-END op. */
21929 const regnode *next;
21930 const regnode *optstart= NULL;
21932 RXi_GET_DECL(r, ri);
21933 GET_RE_DEBUG_FLAGS_DECL;
21935 PERL_ARGS_ASSERT_DUMPUNTIL;
21937 #ifdef DEBUG_DUMPUNTIL
21938 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
21939 last ? last-start : 0, plast ? plast-start : 0);
21942 if (plast && plast < last)
21945 while (PL_regkind[op] != END && (!last || node < last)) {
21947 /* While that wasn't END last time... */
21950 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
21952 next = regnext((regnode *)node);
21955 if (OP(node) == OPTIMIZED) {
21956 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
21963 regprop(r, sv, node, NULL, NULL);
21964 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
21965 (int)(2*indent + 1), "", SvPVX_const(sv));
21967 if (OP(node) != OPTIMIZED) {
21968 if (next == NULL) /* Next ptr. */
21969 Perl_re_printf( aTHX_ " (0)");
21970 else if (PL_regkind[(U8)op] == BRANCH
21971 && PL_regkind[OP(next)] != BRANCH )
21972 Perl_re_printf( aTHX_ " (FAIL)");
21974 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
21975 Perl_re_printf( aTHX_ "\n");
21979 if (PL_regkind[(U8)op] == BRANCHJ) {
21982 const regnode *nnode = (OP(next) == LONGJMP
21983 ? regnext((regnode *)next)
21985 if (last && nnode > last)
21987 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
21990 else if (PL_regkind[(U8)op] == BRANCH) {
21992 DUMPUNTIL(NEXTOPER(node), next);
21994 else if ( PL_regkind[(U8)op] == TRIE ) {
21995 const regnode *this_trie = node;
21996 const char op = OP(node);
21997 const U32 n = ARG(node);
21998 const reg_ac_data * const ac = op>=AHOCORASICK ?
21999 (reg_ac_data *)ri->data->data[n] :
22001 const reg_trie_data * const trie =
22002 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22004 AV *const trie_words
22005 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22007 const regnode *nextbranch= NULL;
22010 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22011 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22013 Perl_re_indentf( aTHX_ "%s ",
22016 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22017 SvCUR(*elem_ptr), PL_dump_re_max_len,
22018 PL_colors[0], PL_colors[1],
22020 ? PERL_PV_ESCAPE_UNI
22022 | PERL_PV_PRETTY_ELLIPSES
22023 | PERL_PV_PRETTY_LTGT
22028 U16 dist= trie->jump[word_idx+1];
22029 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
22030 (UV)((dist ? this_trie + dist : next) - start));
22033 nextbranch= this_trie + trie->jump[0];
22034 DUMPUNTIL(this_trie + dist, nextbranch);
22036 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22037 nextbranch= regnext((regnode *)nextbranch);
22039 Perl_re_printf( aTHX_ "\n");
22042 if (last && next > last)
22047 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
22048 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22049 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22051 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22053 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22055 else if ( op == PLUS || op == STAR) {
22056 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22058 else if (PL_regkind[(U8)op] == EXACT) {
22059 /* Literal string, where present. */
22060 node += NODE_SZ_STR(node) - 1;
22061 node = NEXTOPER(node);
22064 node = NEXTOPER(node);
22065 node += regarglen[(U8)op];
22067 if (op == CURLYX || op == OPEN || op == SROPEN)
22071 #ifdef DEBUG_DUMPUNTIL
22072 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
22077 #endif /* DEBUGGING */
22079 #ifndef PERL_IN_XSUB_RE
22081 #include "uni_keywords.h"
22084 Perl_init_uniprops(pTHX)
22088 PL_user_def_props = newHV();
22090 #ifdef USE_ITHREADS
22092 HvSHAREKEYS_off(PL_user_def_props);
22093 PL_user_def_props_aTHX = aTHX;
22097 /* Set up the inversion list global variables */
22099 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22100 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
22101 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
22102 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
22103 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
22104 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
22105 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
22106 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
22107 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
22108 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
22109 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
22110 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
22111 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
22112 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
22113 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
22114 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
22116 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
22117 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
22118 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
22119 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
22120 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
22121 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
22122 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
22123 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
22124 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
22125 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
22126 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
22127 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
22128 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
22129 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
22130 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
22131 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
22133 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
22134 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
22135 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
22136 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
22137 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
22139 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
22140 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
22141 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
22143 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
22145 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
22146 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
22148 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
22149 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
22151 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
22152 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22153 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
22154 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
22155 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
22156 PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
22157 UNI__PERL_NON_FINAL_FOLDS]);
22159 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
22160 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
22161 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
22162 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
22163 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
22164 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
22165 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
22166 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
22167 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
22170 /* The below are used only by deprecated functions. They could be removed */
22171 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
22172 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
22173 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
22179 This code was mainly added for backcompat to give a warning for non-portable
22180 code points in user-defined properties. But experiments showed that the
22181 warning in earlier perls were only omitted on overflow, which should be an
22182 error, so there really isnt a backcompat issue, and actually adding the
22183 warning when none was present before might cause breakage, for little gain. So
22184 khw left this code in, but not enabled. Tests were never added.
22187 Ei |const char *|get_extended_utf8_msg|const UV cp
22189 PERL_STATIC_INLINE const char *
22190 S_get_extended_utf8_msg(pTHX_ const UV cp)
22192 U8 dummy[UTF8_MAXBYTES + 1];
22196 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
22199 msg = hv_fetchs(msgs, "text", 0);
22202 (void) sv_2mortal((SV *) msgs);
22204 return SvPVX(*msg);
22210 Perl_handle_user_defined_property(pTHX_
22212 /* Parses the contents of a user-defined property definition; returning the
22213 * expanded definition if possible. If so, the return is an inversion
22216 * If there are subroutines that are part of the expansion and which aren't
22217 * known at the time of the call to this function, this returns what
22218 * parse_uniprop_string() returned for the first one encountered.
22220 * If an error was found, NULL is returned, and 'msg' gets a suitable
22221 * message appended to it. (Appending allows the back trace of how we got
22222 * to the faulty definition to be displayed through nested calls of
22223 * user-defined subs.)
22225 * The caller IS responsible for freeing any returned SV.
22227 * The syntax of the contents is pretty much described in perlunicode.pod,
22228 * but we also allow comments on each line */
22230 const char * name, /* Name of property */
22231 const STRLEN name_len, /* The name's length in bytes */
22232 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
22233 const bool to_fold, /* ? Is this under /i */
22234 const bool runtime, /* ? Are we in compile- or run-time */
22235 const bool deferrable, /* Is it ok for this property's full definition
22236 to be deferred until later? */
22237 SV* contents, /* The property's definition */
22238 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
22239 getting called unless this is thought to be
22240 a user-defined property */
22241 SV * msg, /* Any error or warning msg(s) are appended to
22243 const STRLEN level) /* Recursion level of this call */
22246 const char * string = SvPV_const(contents, len);
22247 const char * const e = string + len;
22248 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
22249 const STRLEN msgs_length_on_entry = SvCUR(msg);
22251 const char * s0 = string; /* Points to first byte in the current line
22252 being parsed in 'string' */
22253 const char overflow_msg[] = "Code point too large in \"";
22254 SV* running_definition = NULL;
22256 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
22258 *user_defined_ptr = TRUE;
22260 /* Look at each line */
22262 const char * s; /* Current byte */
22263 char op = '+'; /* Default operation is 'union' */
22264 IV min = 0; /* range begin code point */
22265 IV max = -1; /* and range end */
22266 SV* this_definition;
22268 /* Skip comment lines */
22270 s0 = strchr(s0, '\n');
22278 /* For backcompat, allow an empty first line */
22284 /* First character in the line may optionally be the operation */
22293 /* If the line is one or two hex digits separated by blank space, its
22294 * a range; otherwise it is either another user-defined property or an
22299 if (! isXDIGIT(*s)) {
22300 goto check_if_property;
22303 do { /* Each new hex digit will add 4 bits. */
22304 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
22305 s = strchr(s, '\n');
22309 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22310 sv_catpv(msg, overflow_msg);
22311 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22312 UTF8fARG(is_contents_utf8, s - s0, s0));
22313 sv_catpvs(msg, "\"");
22314 goto return_failure;
22317 /* Accumulate this digit into the value */
22318 min = (min << 4) + READ_XDIGIT(s);
22319 } while (isXDIGIT(*s));
22321 while (isBLANK(*s)) { s++; }
22323 /* We allow comments at the end of the line */
22325 s = strchr(s, '\n');
22331 else if (s < e && *s != '\n') {
22332 if (! isXDIGIT(*s)) {
22333 goto check_if_property;
22336 /* Look for the high point of the range */
22339 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
22340 s = strchr(s, '\n');
22344 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22345 sv_catpv(msg, overflow_msg);
22346 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22347 UTF8fARG(is_contents_utf8, s - s0, s0));
22348 sv_catpvs(msg, "\"");
22349 goto return_failure;
22352 max = (max << 4) + READ_XDIGIT(s);
22353 } while (isXDIGIT(*s));
22355 while (isBLANK(*s)) { s++; }
22358 s = strchr(s, '\n');
22363 else if (s < e && *s != '\n') {
22364 goto check_if_property;
22368 if (max == -1) { /* The line only had one entry */
22371 else if (max < min) {
22372 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22373 sv_catpvs(msg, "Illegal range in \"");
22374 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22375 UTF8fARG(is_contents_utf8, s - s0, s0));
22376 sv_catpvs(msg, "\"");
22377 goto return_failure;
22380 #if 0 /* See explanation at definition above of get_extended_utf8_msg() */
22382 if ( UNICODE_IS_PERL_EXTENDED(min)
22383 || UNICODE_IS_PERL_EXTENDED(max))
22385 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
22387 /* If both code points are non-portable, warn only on the lower
22389 sv_catpv(msg, get_extended_utf8_msg(
22390 (UNICODE_IS_PERL_EXTENDED(min))
22392 sv_catpvs(msg, " in \"");
22393 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
22394 UTF8fARG(is_contents_utf8, s - s0, s0));
22395 sv_catpvs(msg, "\"");
22400 /* Here, this line contains a legal range */
22401 this_definition = sv_2mortal(_new_invlist(2));
22402 this_definition = _add_range_to_invlist(this_definition, min, max);
22407 /* Here it isn't a legal range line. See if it is a legal property
22408 * line. First find the end of the meat of the line */
22409 s = strpbrk(s, "#\n");
22414 /* Ignore trailing blanks in keeping with the requirements of
22415 * parse_uniprop_string() */
22417 while (s > s0 && isBLANK_A(*s)) {
22422 this_definition = parse_uniprop_string(s0, s - s0,
22423 is_utf8, to_fold, runtime,
22425 user_defined_ptr, msg,
22427 ? level /* Don't increase level
22428 if input is empty */
22431 if (this_definition == NULL) {
22432 goto return_failure; /* 'msg' should have had the reason
22433 appended to it by the above call */
22436 if (! is_invlist(this_definition)) { /* Unknown at this time */
22437 return newSVsv(this_definition);
22441 s = strchr(s, '\n');
22451 _invlist_union(running_definition, this_definition,
22452 &running_definition);
22455 _invlist_subtract(running_definition, this_definition,
22456 &running_definition);
22459 _invlist_intersection(running_definition, this_definition,
22460 &running_definition);
22463 _invlist_union_complement_2nd(running_definition,
22464 this_definition, &running_definition);
22467 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
22468 __FILE__, __LINE__, op);
22472 /* Position past the '\n' */
22474 } /* End of loop through the lines of 'contents' */
22476 /* Here, we processed all the lines in 'contents' without error. If we
22477 * didn't add any warnings, simply return success */
22478 if (msgs_length_on_entry == SvCUR(msg)) {
22480 /* If the expansion was empty, the answer isn't nothing: its an empty
22481 * inversion list */
22482 if (running_definition == NULL) {
22483 running_definition = _new_invlist(1);
22486 return running_definition;
22489 /* Otherwise, add some explanatory text, but we will return success */
22493 running_definition = NULL;
22497 if (name_len > 0) {
22498 sv_catpvs(msg, " in expansion of ");
22499 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
22502 return running_definition;
22505 /* As explained below, certain operations need to take place in the first
22506 * thread created. These macros switch contexts */
22507 #ifdef USE_ITHREADS
22508 # define DECLARATION_FOR_GLOBAL_CONTEXT \
22509 PerlInterpreter * save_aTHX = aTHX;
22510 # define SWITCH_TO_GLOBAL_CONTEXT \
22511 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
22512 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
22513 # define CUR_CONTEXT aTHX
22514 # define ORIGINAL_CONTEXT save_aTHX
22516 # define DECLARATION_FOR_GLOBAL_CONTEXT
22517 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
22518 # define RESTORE_CONTEXT NOOP
22519 # define CUR_CONTEXT NULL
22520 # define ORIGINAL_CONTEXT NULL
22524 S_delete_recursion_entry(pTHX_ void *key)
22526 /* Deletes the entry used to detect recursion when expanding user-defined
22527 * properties. This is a function so it can be set up to be called even if
22528 * the program unexpectedly quits */
22531 SV ** current_entry;
22532 const STRLEN key_len = strlen((const char *) key);
22533 DECLARATION_FOR_GLOBAL_CONTEXT;
22535 SWITCH_TO_GLOBAL_CONTEXT;
22537 /* If the entry is one of these types, it is a permanent entry, and not the
22538 * one used to detect recursions. This function should delete only the
22539 * recursion entry */
22540 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
22542 && ! is_invlist(*current_entry)
22543 && ! SvPOK(*current_entry))
22545 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
22553 S_get_fq_name(pTHX_
22554 const char * const name, /* The first non-blank in the \p{}, \P{} */
22555 const Size_t name_len, /* Its length in bytes, not including any trailing space */
22556 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
22557 const bool has_colon_colon
22560 /* Returns a mortal SV containing the fully qualified version of the input
22565 fq_name = newSVpvs_flags("", SVs_TEMP);
22567 /* Use the current package if it wasn't included in our input */
22568 if (! has_colon_colon) {
22569 const HV * pkg = (IN_PERL_COMPILETIME)
22571 : CopSTASH(PL_curcop);
22572 const char* pkgname = HvNAME(pkg);
22574 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
22575 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
22576 sv_catpvs(fq_name, "::");
22579 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
22580 UTF8fARG(is_utf8, name_len, name));
22585 Perl_parse_uniprop_string(pTHX_
22587 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
22588 * now. If so, the return is an inversion list.
22590 * If the property is user-defined, it is a subroutine, which in turn
22591 * may call other subroutines. This function will call the whole nest of
22592 * them to get the definition they return; if some aren't known at the time
22593 * of the call to this function, the fully qualified name of the highest
22594 * level sub is returned. It is an error to call this function at runtime
22595 * without every sub defined.
22597 * If an error was found, NULL is returned, and 'msg' gets a suitable
22598 * message appended to it. (Appending allows the back trace of how we got
22599 * to the faulty definition to be displayed through nested calls of
22600 * user-defined subs.)
22602 * The caller should NOT try to free any returned inversion list.
22604 * Other parameters will be set on return as described below */
22606 const char * const name, /* The first non-blank in the \p{}, \P{} */
22607 const Size_t name_len, /* Its length in bytes, not including any
22609 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
22610 const bool to_fold, /* ? Is this under /i */
22611 const bool runtime, /* TRUE if this is being called at run time */
22612 const bool deferrable, /* TRUE if it's ok for the definition to not be
22613 known at this call */
22614 bool *user_defined_ptr, /* Upon return from this function it will be
22615 set to TRUE if any component is a
22616 user-defined property */
22617 SV * msg, /* Any error or warning msg(s) are appended to
22619 const STRLEN level) /* Recursion level of this call */
22622 char* lookup_name; /* normalized name for lookup in our tables */
22623 unsigned lookup_len; /* Its length */
22624 bool stricter = FALSE; /* Some properties have stricter name
22625 normalization rules, which we decide upon
22626 based on parsing */
22628 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
22629 * (though it requires extra effort to download them from Unicode and
22630 * compile perl to know about them) */
22631 bool is_nv_type = FALSE;
22633 unsigned int i, j = 0;
22634 int equals_pos = -1; /* Where the '=' is found, or negative if none */
22635 int slash_pos = -1; /* Where the '/' is found, or negative if none */
22636 int table_index = 0; /* The entry number for this property in the table
22637 of all Unicode property names */
22638 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
22639 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
22640 the normalized name in certain situations */
22641 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
22642 part of a package name */
22643 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
22644 property rather than a Unicode
22646 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
22647 if an error. If it is an inversion list,
22648 it is the definition. Otherwise it is a
22649 string containing the fully qualified sub
22651 SV * fq_name = NULL; /* For user-defined properties, the fully
22653 bool invert_return = FALSE; /* ? Do we need to complement the result before
22656 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
22658 /* The input will be normalized into 'lookup_name' */
22659 Newx(lookup_name, name_len, char);
22660 SAVEFREEPV(lookup_name);
22662 /* Parse the input. */
22663 for (i = 0; i < name_len; i++) {
22664 char cur = name[i];
22666 /* Most of the characters in the input will be of this ilk, being parts
22668 if (isIDCONT_A(cur)) {
22670 /* Case differences are ignored. Our lookup routine assumes
22671 * everything is lowercase, so normalize to that */
22672 if (isUPPER_A(cur)) {
22673 lookup_name[j++] = toLOWER_A(cur);
22677 if (cur == '_') { /* Don't include these in the normalized name */
22681 lookup_name[j++] = cur;
22683 /* The first character in a user-defined name must be of this type.
22685 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
22686 could_be_user_defined = FALSE;
22692 /* Here, the character is not something typically in a name, But these
22693 * two types of characters (and the '_' above) can be freely ignored in
22694 * most situations. Later it may turn out we shouldn't have ignored
22695 * them, and we have to reparse, but we don't have enough information
22696 * yet to make that decision */
22697 if (cur == '-' || isSPACE_A(cur)) {
22698 could_be_user_defined = FALSE;
22702 /* An equals sign or single colon mark the end of the first part of
22703 * the property name */
22705 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
22707 lookup_name[j++] = '='; /* Treat the colon as an '=' */
22708 equals_pos = j; /* Note where it occurred in the input */
22709 could_be_user_defined = FALSE;
22713 /* Otherwise, this character is part of the name. */
22714 lookup_name[j++] = cur;
22716 /* Here it isn't a single colon, so if it is a colon, it must be a
22720 /* A double colon should be a package qualifier. We note its
22721 * position and continue. Note that one could have
22722 * pkg1::pkg2::...::foo
22723 * so that the position at the end of the loop will be just after
22724 * the final qualifier */
22727 non_pkg_begin = i + 1;
22728 lookup_name[j++] = ':';
22730 else { /* Only word chars (and '::') can be in a user-defined name */
22731 could_be_user_defined = FALSE;
22733 } /* End of parsing through the lhs of the property name (or all of it if
22736 #define STRLENs(s) (sizeof("" s "") - 1)
22738 /* If there is a single package name 'utf8::', it is ambiguous. It could
22739 * be for a user-defined property, or it could be a Unicode property, as
22740 * all of them are considered to be for that package. For the purposes of
22741 * parsing the rest of the property, strip it off */
22742 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
22743 lookup_name += STRLENs("utf8::");
22744 j -= STRLENs("utf8::");
22745 equals_pos -= STRLENs("utf8::");
22748 /* Here, we are either done with the whole property name, if it was simple;
22749 * or are positioned just after the '=' if it is compound. */
22751 if (equals_pos >= 0) {
22752 assert(! stricter); /* We shouldn't have set this yet */
22754 /* Space immediately after the '=' is ignored */
22756 for (; i < name_len; i++) {
22757 if (! isSPACE_A(name[i])) {
22762 /* Most punctuation after the equals indicates a subpattern, like
22764 if ( isPUNCT_A(name[i])
22770 /* Find the property. The table includes the equals sign, so we
22772 table_index = match_uniprop((U8 *) lookup_name, j);
22774 const char * const * prop_values
22775 = UNI_prop_value_ptrs[table_index];
22777 Size_t subpattern_len;
22778 REGEXP * subpattern_re;
22779 char open = name[i++];
22781 const char * pos_in_brackets;
22784 /* A backslash means the real delimitter is the next character.
22786 if (open == '\\') {
22791 /* This data structure is constructed so that the matching
22792 * closing bracket is 3 past its matching opening. The second
22793 * set of closing is so that if the opening is something like
22794 * ']', the closing will be that as well. Something similar is
22795 * done in toke.c */
22796 pos_in_brackets = strchr("([<)]>)]>", open);
22797 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
22800 || name[name_len-1] != close
22801 || (escaped && name[name_len-2] != '\\'))
22803 sv_catpvs(msg, "Unicode property wildcard not terminated");
22804 goto append_name_to_msg;
22807 Perl_ck_warner_d(aTHX_
22808 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
22809 "The Unicode property wildcards feature is experimental");
22811 /* Now create and compile the wildcard subpattern. Use /iaa
22812 * because nothing outside of ASCII will match, and it the
22813 * property values should all match /i. Note that when the
22814 * pattern fails to compile, our added text to the user's
22815 * pattern will be displayed to the user, which is not so
22817 subpattern_len = name_len - i - 1 - escaped;
22818 subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
22819 (unsigned) subpattern_len,
22821 subpattern = sv_2mortal(subpattern);
22822 subpattern_re = re_compile(subpattern, 0);
22823 assert(subpattern_re); /* Should have died if didn't compile
22826 /* For each legal property value, see if the supplied pattern
22828 while (*prop_values) {
22829 const char * const entry = *prop_values;
22830 const Size_t len = strlen(entry);
22831 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
22833 if (pregexec(subpattern_re,
22835 (char *) entry + len,
22839 { /* Here, matched. Add to the returned list */
22840 Size_t total_len = j + len;
22841 SV * sub_invlist = NULL;
22842 char * this_string;
22844 /* We know this is a legal \p{property=value}. Call
22845 * the function to return the list of code points that
22847 Newxz(this_string, total_len + 1, char);
22848 Copy(lookup_name, this_string, j, char);
22849 my_strlcat(this_string, entry, total_len + 1);
22850 SAVEFREEPV(this_string);
22851 sub_invlist = parse_uniprop_string(this_string,
22860 _invlist_union(prop_definition, sub_invlist,
22864 prop_values++; /* Next iteration, look at next propvalue */
22865 } /* End of looking through property values; (the data
22866 structure is terminated by a NULL ptr) */
22868 SvREFCNT_dec_NN(subpattern_re);
22870 if (prop_definition) {
22871 return prop_definition;
22874 sv_catpvs(msg, "No Unicode property value wildcard matches:");
22875 goto append_name_to_msg;
22878 /* Here's how khw thinks we should proceed to handle the properties
22879 * not yet done: Bidi Mirroring Glyph
22880 Bidi Paired Bracket
22881 Case Folding (both full and simple)
22882 Decomposition Mapping
22883 Equivalent Unified Ideograph
22886 Lowercase Mapping (both full and simple)
22888 Titlecase Mapping (both full and simple)
22889 Uppercase Mapping (both full and simple)
22890 * Move the part that looks at the property values into a perl
22891 * script, like utf8_heavy.pl is done. This makes things somewhat
22892 * easier, but most importantly, it avoids always adding all these
22893 * strings to the memory usage when the feature is little-used.
22895 * The property values would all be concatenated into a single
22896 * string per property with each value on a separate line, and the
22897 * code point it's for on alternating lines. Then we match the
22898 * user's input pattern m//mg, without having to worry about their
22899 * uses of '^' and '$'. Only the values that aren't the default
22900 * would be in the strings. Code points would be in UTF-8. The
22901 * search pattern that we would construct would look like
22902 * (?: \n (code-point_re) \n (?aam: user-re ) \n )
22903 * And so $1 would contain the code point that matched the user-re.
22904 * For properties where the default is the code point itself, such
22905 * as any of the case changing mappings, the string would otherwise
22906 * consist of all Unicode code points in UTF-8 strung together.
22907 * This would be impractical. So instead, examine their compiled
22908 * pattern, looking at the ssc. If none, reject the pattern as an
22909 * error. Otherwise run the pattern against every code point in
22910 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
22911 * And it might be good to create an API to return the ssc.
22913 * For the name properties, a new function could be created in
22914 * charnames which essentially does the same thing as above,
22915 * sharing Name.pl with the other charname functions. Don't know
22916 * about loose name matching, or algorithmically determined names.
22917 * Decomposition.pl similarly.
22919 * It might be that a new pattern modifier would have to be
22920 * created, like /t for resTricTed, which changed the behavior of
22921 * some constructs in their subpattern, like \A. */
22922 } /* End of is a wildcard subppattern */
22925 /* Certain properties whose values are numeric need special handling.
22926 * They may optionally be prefixed by 'is'. Ignore that prefix for the
22927 * purposes of checking if this is one of those properties */
22928 if (memBEGINPs(lookup_name, name_len, "is")) {
22932 /* Then check if it is one of these specially-handled properties. The
22933 * possibilities are hard-coded because easier this way, and the list
22934 * is unlikely to change.
22936 * All numeric value type properties are of this ilk, and are also
22937 * special in a different way later on. So find those first. There
22938 * are several numeric value type properties in the Unihan DB (which is
22939 * unlikely to be compiled with perl, but we handle it here in case it
22940 * does get compiled). They all end with 'numeric'. The interiors
22941 * aren't checked for the precise property. This would stop working if
22942 * a cjk property were to be created that ended with 'numeric' and
22943 * wasn't a numeric type */
22944 is_nv_type = memEQs(lookup_name + lookup_offset,
22945 j - 1 - lookup_offset, "numericvalue")
22946 || memEQs(lookup_name + lookup_offset,
22947 j - 1 - lookup_offset, "nv")
22948 || ( memENDPs(lookup_name + lookup_offset,
22949 j - 1 - lookup_offset, "numeric")
22950 && ( memBEGINPs(lookup_name + lookup_offset,
22951 j - 1 - lookup_offset, "cjk")
22952 || memBEGINPs(lookup_name + lookup_offset,
22953 j - 1 - lookup_offset, "k")));
22955 || memEQs(lookup_name + lookup_offset,
22956 j - 1 - lookup_offset, "canonicalcombiningclass")
22957 || memEQs(lookup_name + lookup_offset,
22958 j - 1 - lookup_offset, "ccc")
22959 || memEQs(lookup_name + lookup_offset,
22960 j - 1 - lookup_offset, "age")
22961 || memEQs(lookup_name + lookup_offset,
22962 j - 1 - lookup_offset, "in")
22963 || memEQs(lookup_name + lookup_offset,
22964 j - 1 - lookup_offset, "presentin"))
22968 /* Since the stuff after the '=' is a number, we can't throw away
22969 * '-' willy-nilly, as those could be a minus sign. Other stricter
22970 * rules also apply. However, these properties all can have the
22971 * rhs not be a number, in which case they contain at least one
22972 * alphabetic. In those cases, the stricter rules don't apply.
22973 * But the numeric type properties can have the alphas [Ee] to
22974 * signify an exponent, and it is still a number with stricter
22975 * rules. So look for an alpha that signifies not-strict */
22977 for (k = i; k < name_len; k++) {
22978 if ( isALPHA_A(name[k])
22979 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
22989 /* A number may have a leading '+' or '-'. The latter is retained
22991 if (name[i] == '+') {
22994 else if (name[i] == '-') {
22995 lookup_name[j++] = '-';
22999 /* Skip leading zeros including single underscores separating the
23000 * zeros, or between the final leading zero and the first other
23002 for (; i < name_len - 1; i++) {
23003 if ( name[i] != '0'
23004 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23011 else { /* No '=' */
23013 /* Only a few properties without an '=' should be parsed with stricter
23014 * rules. The list is unlikely to change. */
23015 if ( memBEGINPs(lookup_name, j, "perl")
23016 && memNEs(lookup_name + 4, j - 4, "space")
23017 && memNEs(lookup_name + 4, j - 4, "word"))
23021 /* We set the inputs back to 0 and the code below will reparse,
23027 /* Here, we have either finished the property, or are positioned to parse
23028 * the remainder, and we know if stricter rules apply. Finish out, if not
23030 for (; i < name_len; i++) {
23031 char cur = name[i];
23033 /* In all instances, case differences are ignored, and we normalize to
23035 if (isUPPER_A(cur)) {
23036 lookup_name[j++] = toLOWER(cur);
23040 /* An underscore is skipped, but not under strict rules unless it
23041 * separates two digits */
23044 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
23045 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
23047 lookup_name[j++] = '_';
23052 /* Hyphens are skipped except under strict */
23053 if (cur == '-' && ! stricter) {
23057 /* XXX Bug in documentation. It says white space skipped adjacent to
23058 * non-word char. Maybe we should, but shouldn't skip it next to a dot
23060 if (isSPACE_A(cur) && ! stricter) {
23064 lookup_name[j++] = cur;
23066 /* Unless this is a non-trailing slash, we are done with it */
23067 if (i >= name_len - 1 || cur != '/') {
23073 /* A slash in the 'numeric value' property indicates that what follows
23074 * is a denominator. It can have a leading '+' and '0's that should be
23075 * skipped. But we have never allowed a negative denominator, so treat
23076 * a minus like every other character. (No need to rule out a second
23077 * '/', as that won't match anything anyway */
23080 if (i < name_len && name[i] == '+') {
23084 /* Skip leading zeros including underscores separating digits */
23085 for (; i < name_len - 1; i++) {
23086 if ( name[i] != '0'
23087 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
23093 /* Store the first real character in the denominator */
23094 lookup_name[j++] = name[i];
23098 /* Here are completely done parsing the input 'name', and 'lookup_name'
23099 * contains a copy, normalized.
23101 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
23102 * different from without the underscores. */
23103 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
23104 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
23105 && UNLIKELY(name[name_len-1] == '_'))
23107 lookup_name[j++] = '&';
23110 /* If the original input began with 'In' or 'Is', it could be a subroutine
23111 * call to a user-defined property instead of a Unicode property name. */
23112 if ( non_pkg_begin + name_len > 2
23113 && name[non_pkg_begin+0] == 'I'
23114 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
23116 /* Names that start with In have different characterstics than those
23117 * that start with Is */
23118 if (name[non_pkg_begin+1] == 's') {
23119 starts_with_Is = TRUE;
23123 could_be_user_defined = FALSE;
23126 if (could_be_user_defined) {
23129 /* If the user defined property returns the empty string, it could
23130 * easily be because the pattern is being compiled before the data it
23131 * actually needs to compile is available. This could be argued to be
23132 * a bug in the perl code, but this is a change of behavior for Perl,
23133 * so we handle it. This means that intentionally returning nothing
23134 * will not be resolved until runtime */
23135 bool empty_return = FALSE;
23137 /* Here, the name could be for a user defined property, which are
23138 * implemented as subs. */
23139 user_sub = get_cvn_flags(name, name_len, 0);
23141 const char insecure[] = "Insecure user-defined property";
23143 /* Here, there is a sub by the correct name. Normally we call it
23144 * to get the property definition */
23146 SV * user_sub_sv = MUTABLE_SV(user_sub);
23147 SV * error; /* Any error returned by calling 'user_sub' */
23148 SV * key; /* The key into the hash of user defined sub names
23151 SV ** saved_user_prop_ptr; /* Hash entry for this property */
23153 /* How many times to retry when another thread is in the middle of
23154 * expanding the same definition we want */
23155 PERL_INT_FAST8_T retry_countdown = 10;
23157 DECLARATION_FOR_GLOBAL_CONTEXT;
23159 /* If we get here, we know this property is user-defined */
23160 *user_defined_ptr = TRUE;
23162 /* We refuse to call a potentially tainted subroutine; returning an
23165 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23166 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23167 goto append_name_to_msg;
23170 /* In principal, we only call each subroutine property definition
23171 * once during the life of the program. This guarantees that the
23172 * property definition never changes. The results of the single
23173 * sub call are stored in a hash, which is used instead for future
23174 * references to this property. The property definition is thus
23175 * immutable. But, to allow the user to have a /i-dependent
23176 * definition, we call the sub once for non-/i, and once for /i,
23177 * should the need arise, passing the /i status as a parameter.
23179 * We start by constructing the hash key name, consisting of the
23180 * fully qualified subroutine name, preceded by the /i status, so
23181 * that there is a key for /i and a different key for non-/i */
23182 key = newSVpvn(((to_fold) ? "1" : "0"), 1);
23183 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23184 non_pkg_begin != 0);
23185 sv_catsv(key, fq_name);
23188 /* We only call the sub once throughout the life of the program
23189 * (with the /i, non-/i exception noted above). That means the
23190 * hash must be global and accessible to all threads. It is
23191 * created at program start-up, before any threads are created, so
23192 * is accessible to all children. But this creates some
23195 * 1) The keys can't be shared, or else problems arise; sharing is
23196 * turned off at hash creation time
23197 * 2) All SVs in it are there for the remainder of the life of the
23198 * program, and must be created in the same interpreter context
23199 * as the hash, or else they will be freed from the wrong pool
23200 * at global destruction time. This is handled by switching to
23201 * the hash's context to create each SV going into it, and then
23202 * immediately switching back
23203 * 3) All accesses to the hash must be controlled by a mutex, to
23204 * prevent two threads from getting an unstable state should
23205 * they simultaneously be accessing it. The code below is
23206 * crafted so that the mutex is locked whenever there is an
23207 * access and unlocked only when the next stable state is
23210 * The hash stores either the definition of the property if it was
23211 * valid, or, if invalid, the error message that was raised. We
23212 * use the type of SV to distinguish.
23214 * There's also the need to guard against the definition expansion
23215 * from infinitely recursing. This is handled by storing the aTHX
23216 * of the expanding thread during the expansion. Again the SV type
23217 * is used to distinguish this from the other two cases. If we
23218 * come to here and the hash entry for this property is our aTHX,
23219 * it means we have recursed, and the code assumes that we would
23220 * infinitely recurse, so instead stops and raises an error.
23221 * (Any recursion has always been treated as infinite recursion in
23224 * If instead, the entry is for a different aTHX, it means that
23225 * that thread has gotten here first, and hasn't finished expanding
23226 * the definition yet. We just have to wait until it is done. We
23227 * sleep and retry a few times, returning an error if the other
23228 * thread doesn't complete. */
23231 USER_PROP_MUTEX_LOCK;
23233 /* If we have an entry for this key, the subroutine has already
23234 * been called once with this /i status. */
23235 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
23236 SvPVX(key), SvCUR(key), 0);
23237 if (saved_user_prop_ptr) {
23239 /* If the saved result is an inversion list, it is the valid
23240 * definition of this property */
23241 if (is_invlist(*saved_user_prop_ptr)) {
23242 prop_definition = *saved_user_prop_ptr;
23244 /* The SV in the hash won't be removed until global
23245 * destruction, so it is stable and we can unlock */
23246 USER_PROP_MUTEX_UNLOCK;
23248 /* The caller shouldn't try to free this SV */
23249 return prop_definition;
23252 /* Otherwise, if it is a string, it is the error message
23253 * that was returned when we first tried to evaluate this
23254 * property. Fail, and append the message */
23255 if (SvPOK(*saved_user_prop_ptr)) {
23256 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23257 sv_catsv(msg, *saved_user_prop_ptr);
23259 /* The SV in the hash won't be removed until global
23260 * destruction, so it is stable and we can unlock */
23261 USER_PROP_MUTEX_UNLOCK;
23266 assert(SvIOK(*saved_user_prop_ptr));
23268 /* Here, we have an unstable entry in the hash. Either another
23269 * thread is in the middle of expanding the property's
23270 * definition, or we are ourselves recursing. We use the aTHX
23271 * in it to distinguish */
23272 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
23274 /* Here, it's another thread doing the expanding. We've
23275 * looked as much as we are going to at the contents of the
23276 * hash entry. It's safe to unlock. */
23277 USER_PROP_MUTEX_UNLOCK;
23279 /* Retry a few times */
23280 if (retry_countdown-- > 0) {
23285 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23286 sv_catpvs(msg, "Timeout waiting for another thread to "
23288 goto append_name_to_msg;
23291 /* Here, we are recursing; don't dig any deeper */
23292 USER_PROP_MUTEX_UNLOCK;
23294 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23296 "Infinite recursion in user-defined property");
23297 goto append_name_to_msg;
23300 /* Here, this thread has exclusive control, and there is no entry
23301 * for this property in the hash. So we have the go ahead to
23302 * expand the definition ourselves. */
23304 PUSHSTACKi(PERLSI_MAGIC);
23307 /* Create a temporary placeholder in the hash to detect recursion
23309 SWITCH_TO_GLOBAL_CONTEXT;
23310 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
23311 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
23314 /* Now that we have a placeholder, we can let other threads
23316 USER_PROP_MUTEX_UNLOCK;
23318 /* Make sure the placeholder always gets destroyed */
23319 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
23324 /* Call the user's function, with the /i status as a parameter.
23325 * Note that we have gone to a lot of trouble to keep this call
23326 * from being within the locked mutex region. */
23327 XPUSHs(boolSV(to_fold));
23330 /* The following block was taken from swash_init(). Presumably
23331 * they apply to here as well, though we no longer use a swash --
23335 /* We might get here via a subroutine signature which uses a utf8
23336 * parameter name, at which point PL_subname will have been set
23337 * but not yet used. */
23338 save_item(PL_subname);
23340 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
23345 if (TAINT_get || SvTRUE(error)) {
23346 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23347 if (SvTRUE(error)) {
23348 sv_catpvs(msg, "Error \"");
23349 sv_catsv(msg, error);
23350 sv_catpvs(msg, "\"");
23353 if (SvTRUE(error)) sv_catpvs(msg, "; ");
23354 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
23357 if (name_len > 0) {
23358 sv_catpvs(msg, " in expansion of ");
23359 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
23365 prop_definition = NULL;
23367 else { /* G_SCALAR guarantees a single return value */
23368 SV * contents = POPs;
23370 /* The contents is supposed to be the expansion of the property
23371 * definition. If the definition is deferrable, and we got an
23372 * empty string back, set a flag to later defer it (after clean
23375 && (! SvPOK(contents) || SvCUR(contents) == 0))
23377 empty_return = TRUE;
23379 else { /* Otherwise, call a function to check for valid syntax,
23382 prop_definition = handle_user_defined_property(
23384 is_utf8, to_fold, runtime,
23386 contents, user_defined_ptr,
23392 /* Here, we have the results of the expansion. Delete the
23393 * placeholder, and if the definition is now known, replace it with
23394 * that definition. We need exclusive access to the hash, and we
23395 * can't let anyone else in, between when we delete the placeholder
23396 * and add the permanent entry */
23397 USER_PROP_MUTEX_LOCK;
23399 S_delete_recursion_entry(aTHX_ SvPVX(key));
23401 if ( ! empty_return
23402 && (! prop_definition || is_invlist(prop_definition)))
23404 /* If we got success we use the inversion list defining the
23405 * property; otherwise use the error message */
23406 SWITCH_TO_GLOBAL_CONTEXT;
23407 (void) hv_store_ent(PL_user_def_props,
23410 ? newSVsv(prop_definition)
23416 /* All done, and the hash now has a permanent entry for this
23417 * property. Give up exclusive control */
23418 USER_PROP_MUTEX_UNLOCK;
23424 if (empty_return) {
23425 goto definition_deferred;
23428 if (prop_definition) {
23430 /* If the definition is for something not known at this time,
23431 * we toss it, and go return the main property name, as that's
23432 * the one the user will be aware of */
23433 if (! is_invlist(prop_definition)) {
23434 SvREFCNT_dec_NN(prop_definition);
23435 goto definition_deferred;
23438 sv_2mortal(prop_definition);
23442 return prop_definition;
23444 } /* End of calling the subroutine for the user-defined property */
23445 } /* End of it could be a user-defined property */
23447 /* Here it wasn't a user-defined property that is known at this time. See
23448 * if it is a Unicode property */
23450 lookup_len = j; /* This is a more mnemonic name than 'j' */
23452 /* Get the index into our pointer table of the inversion list corresponding
23453 * to the property */
23454 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23456 /* If it didn't find the property ... */
23457 if (table_index == 0) {
23459 /* Try again stripping off any initial 'Is'. This is because we
23460 * promise that an initial Is is optional. The same isn't true of
23461 * names that start with 'In'. Those can match only blocks, and the
23462 * lookup table already has those accounted for. */
23463 if (starts_with_Is) {
23469 table_index = match_uniprop((U8 *) lookup_name, lookup_len);
23472 if (table_index == 0) {
23475 /* Here, we didn't find it. If not a numeric type property, and
23476 * can't be a user-defined one, it isn't a legal property */
23477 if (! is_nv_type) {
23478 if (! could_be_user_defined) {
23482 /* Here, the property name is legal as a user-defined one. At
23483 * compile time, it might just be that the subroutine for that
23484 * property hasn't been encountered yet, but at runtime, it's
23485 * an error to try to use an undefined one */
23486 if (! deferrable) {
23487 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23488 sv_catpvs(msg, "Unknown user-defined property name");
23489 goto append_name_to_msg;
23492 goto definition_deferred;
23493 } /* End of isn't a numeric type property */
23495 /* The numeric type properties need more work to decide. What we
23496 * do is make sure we have the number in canonical form and look
23499 if (slash_pos < 0) { /* No slash */
23501 /* When it isn't a rational, take the input, convert it to a
23502 * NV, then create a canonical string representation of that
23506 SSize_t value_len = lookup_len - equals_pos;
23508 /* Get the value */
23509 if ( value_len <= 0
23510 || my_atof3(lookup_name + equals_pos, &value,
23512 != lookup_name + lookup_len)
23517 /* If the value is an integer, the canonical value is integral
23519 if (Perl_ceil(value) == value) {
23520 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
23521 equals_pos, lookup_name, value);
23523 else { /* Otherwise, it is %e with a known precision */
23526 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
23527 equals_pos, lookup_name,
23528 PL_E_FORMAT_PRECISION, value);
23530 /* The exponent generated is expecting two digits, whereas
23531 * %e on some systems will generate three. Remove leading
23532 * zeros in excess of 2 from the exponent. We start
23533 * looking for them after the '=' */
23534 exp_ptr = strchr(canonical + equals_pos, 'e');
23536 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
23537 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
23539 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
23541 if (excess_exponent_len > 0) {
23542 SSize_t leading_zeros = strspn(cur_ptr, "0");
23543 SSize_t excess_leading_zeros
23544 = MIN(leading_zeros, excess_exponent_len);
23545 if (excess_leading_zeros > 0) {
23546 Move(cur_ptr + excess_leading_zeros,
23548 strlen(cur_ptr) - excess_leading_zeros
23549 + 1, /* Copy the NUL as well */
23556 else { /* Has a slash. Create a rational in canonical form */
23557 UV numerator, denominator, gcd, trial;
23558 const char * end_ptr;
23559 const char * sign = "";
23561 /* We can't just find the numerator, denominator, and do the
23562 * division, then use the method above, because that is
23563 * inexact. And the input could be a rational that is within
23564 * epsilon (given our precision) of a valid rational, and would
23565 * then incorrectly compare valid.
23567 * We're only interested in the part after the '=' */
23568 const char * this_lookup_name = lookup_name + equals_pos;
23569 lookup_len -= equals_pos;
23570 slash_pos -= equals_pos;
23572 /* Handle any leading minus */
23573 if (this_lookup_name[0] == '-') {
23575 this_lookup_name++;
23580 /* Convert the numerator to numeric */
23581 end_ptr = this_lookup_name + slash_pos;
23582 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
23586 /* It better have included all characters before the slash */
23587 if (*end_ptr != '/') {
23591 /* Set to look at just the denominator */
23592 this_lookup_name += slash_pos;
23593 lookup_len -= slash_pos;
23594 end_ptr = this_lookup_name + lookup_len;
23596 /* Convert the denominator to numeric */
23597 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
23601 /* It better be the rest of the characters, and don't divide by
23603 if ( end_ptr != this_lookup_name + lookup_len
23604 || denominator == 0)
23609 /* Get the greatest common denominator using
23610 http://en.wikipedia.org/wiki/Euclidean_algorithm */
23612 trial = denominator;
23613 while (trial != 0) {
23615 trial = gcd % trial;
23619 /* If already in lowest possible terms, we have already tried
23620 * looking this up */
23625 /* Reduce the rational, which should put it in canonical form
23628 denominator /= gcd;
23630 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
23631 equals_pos, lookup_name, sign, numerator, denominator);
23634 /* Here, we have the number in canonical form. Try that */
23635 table_index = match_uniprop((U8 *) canonical, strlen(canonical));
23636 if (table_index == 0) {
23639 } /* End of still didn't find the property in our table */
23640 } /* End of didn't find the property in our table */
23642 /* Here, we have a non-zero return, which is an index into a table of ptrs.
23643 * A negative return signifies that the real index is the absolute value,
23644 * but the result needs to be inverted */
23645 if (table_index < 0) {
23646 invert_return = TRUE;
23647 table_index = -table_index;
23650 /* Out-of band indices indicate a deprecated property. The proper index is
23651 * modulo it with the table size. And dividing by the table size yields
23652 * an offset into a table constructed by regen/mk_invlists.pl to contain
23653 * the corresponding warning message */
23654 if (table_index > MAX_UNI_KEYWORD_INDEX) {
23655 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
23656 table_index %= MAX_UNI_KEYWORD_INDEX;
23657 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
23658 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
23659 (int) name_len, name, deprecated_property_msgs[warning_offset]);
23662 /* In a few properties, a different property is used under /i. These are
23663 * unlikely to change, so are hard-coded here. */
23665 if ( table_index == UNI_XPOSIXUPPER
23666 || table_index == UNI_XPOSIXLOWER
23667 || table_index == UNI_TITLE)
23669 table_index = UNI_CASED;
23671 else if ( table_index == UNI_UPPERCASELETTER
23672 || table_index == UNI_LOWERCASELETTER
23673 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
23674 || table_index == UNI_TITLECASELETTER
23677 table_index = UNI_CASEDLETTER;
23679 else if ( table_index == UNI_POSIXUPPER
23680 || table_index == UNI_POSIXLOWER)
23682 table_index = UNI_POSIXALPHA;
23686 /* Create and return the inversion list */
23687 prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
23688 sv_2mortal(prop_definition);
23691 /* See if there is a private use override to add to this definition */
23693 COPHH * hinthash = (IN_PERL_COMPILETIME)
23694 ? CopHINTHASH_get(&PL_compiling)
23695 : CopHINTHASH_get(PL_curcop);
23696 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
23698 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
23700 /* See if there is an element in the hints hash for this table */
23701 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
23702 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
23706 SV * pu_definition;
23708 SV * expanded_prop_definition =
23709 sv_2mortal(invlist_clone(prop_definition, NULL));
23711 /* If so, it's definition is the string from here to the next
23712 * \a character. And its format is the same as a user-defined
23714 pos += SvCUR(pu_lookup);
23715 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
23716 pu_invlist = handle_user_defined_property(lookup_name,
23719 0, /* Not folded */
23727 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23728 sv_catpvs(msg, "Insecure private-use override");
23729 goto append_name_to_msg;
23732 /* For now, as a safety measure, make sure that it doesn't
23733 * override non-private use code points */
23734 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
23736 /* Add it to the list to be returned */
23737 _invlist_union(prop_definition, pu_invlist,
23738 &expanded_prop_definition);
23739 prop_definition = expanded_prop_definition;
23740 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
23745 if (invert_return) {
23746 _invlist_invert(prop_definition);
23748 return prop_definition;
23752 if (non_pkg_begin != 0) {
23753 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23754 sv_catpvs(msg, "Illegal user-defined property name");
23757 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23758 sv_catpvs(msg, "Can't find Unicode property definition");
23762 append_name_to_msg:
23764 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
23765 const char * suffix = (runtime && level == 0) ? "}" : "\"";
23767 sv_catpv(msg, prefix);
23768 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23769 sv_catpv(msg, suffix);
23774 definition_deferred:
23776 /* Here it could yet to be defined, so defer evaluation of this
23777 * until its needed at runtime. We need the fully qualified property name
23778 * to avoid ambiguity, and a trailing newline */
23780 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
23781 non_pkg_begin != 0 /* If has "::" */
23784 sv_catpvs(fq_name, "\n");
23786 *user_defined_ptr = TRUE;
23793 * ex: set ts=8 sts=4 sw=4 et: