5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 /* Note on debug output:
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
131 #define PERL_IN_REGCOMP_C
135 #ifdef PERL_IN_XSUB_RE
136 # include "re_comp.h"
137 EXTERN_C const struct regexp_engine my_reg_engine;
138 EXTERN_C const struct regexp_engine wild_reg_engine;
140 # include "regcomp.h"
143 #include "invlist_inline.h"
144 #include "unicode_constants.h"
146 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
148 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
150 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
151 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
154 #define STATIC static
157 /* this is a chain of data about sub patterns we are processing that
158 need to be handled separately/specially in study_chunk. Its so
159 we can simulate recursion without losing state. */
161 typedef struct scan_frame {
162 regnode *last_regnode; /* last node to process in this frame */
163 regnode *next_regnode; /* next node to process when last is reached */
164 U32 prev_recursed_depth;
165 I32 stopparen; /* what stopparen do we use */
166 bool in_gosub; /* this or an outer frame is for GOSUB */
168 struct scan_frame *this_prev_frame; /* this previous frame */
169 struct scan_frame *prev_frame; /* previous frame */
170 struct scan_frame *next_frame; /* next frame */
173 /* Certain characters are output as a sequence with the first being a
175 #define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
178 struct RExC_state_t {
179 U32 flags; /* RXf_* are we folding, multilining? */
180 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
181 char *precomp; /* uncompiled string. */
182 char *precomp_end; /* pointer to end of uncompiled string. */
183 REGEXP *rx_sv; /* The SV that is the regexp. */
184 regexp *rx; /* perl core regexp structure */
185 regexp_internal *rxi; /* internal data for regexp object
187 char *start; /* Start of input for compile */
188 char *end; /* End of input for compile */
189 char *parse; /* Input-scan pointer. */
190 char *copy_start; /* start of copy of input within
191 constructed parse string */
192 char *save_copy_start; /* Provides one level of saving
193 and restoring 'copy_start' */
194 char *copy_start_in_input; /* Position in input string
195 corresponding to copy_start */
196 SSize_t whilem_seen; /* number of WHILEM in this expr */
197 regnode *emit_start; /* Start of emitted-code area */
198 regnode_offset emit; /* Code-emit pointer */
199 I32 naughty; /* How bad is this pattern? */
200 I32 sawback; /* Did we see \1, ...? */
201 SSize_t size; /* Number of regnode equivalents in
203 Size_t sets_depth; /* Counts recursion depth of already-
204 compiled regex set patterns */
207 I32 parens_buf_size; /* #slots malloced open/close_parens */
208 regnode_offset *open_parens; /* offsets to open parens */
209 regnode_offset *close_parens; /* offsets to close parens */
210 HV *paren_names; /* Paren names */
212 /* position beyond 'precomp' of the warning message furthest away from
213 * 'precomp'. During the parse, no warnings are raised for any problems
214 * earlier in the parse than this position. This works if warnings are
215 * raised the first time a given spot is parsed, and if only one
216 * independent warning is raised for any given spot */
217 Size_t latest_warn_offset;
219 I32 npar; /* Capture buffer count so far in the
220 parse, (OPEN) plus one. ("par" 0 is
222 I32 total_par; /* During initial parse, is either 0,
223 or -1; the latter indicating a
224 reparse is needed. After that pass,
225 it is what 'npar' became after the
226 pass. Hence, it being > 0 indicates
227 we are in a reparse situation */
228 I32 nestroot; /* root parens we are in - used by
231 regnode *end_op; /* END node in program */
232 I32 utf8; /* whether the pattern is utf8 or not */
233 I32 orig_utf8; /* whether the pattern was originally in utf8 */
234 /* XXX use this for future optimisation of case
235 * where pattern must be upgraded to utf8. */
236 I32 uni_semantics; /* If a d charset modifier should use unicode
237 rules, even if the pattern is not in
240 I32 recurse_count; /* Number of recurse regops we have generated */
241 regnode **recurse; /* Recurse regops */
242 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
244 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
248 I32 override_recoding;
249 I32 recode_x_to_native;
250 I32 in_multi_char_class;
251 int code_index; /* next code_blocks[] slot */
252 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
254 SSize_t maxlen; /* mininum possible number of chars in string to match */
255 scan_frame *frame_head;
256 scan_frame *frame_last;
260 SV *runtime_code_qr; /* qr with the runtime code blocks */
262 const char *lastparse;
264 U32 study_chunk_recursed_count;
265 AV *paren_name_list; /* idx -> name */
269 #define RExC_lastparse (pRExC_state->lastparse)
270 #define RExC_lastnum (pRExC_state->lastnum)
271 #define RExC_paren_name_list (pRExC_state->paren_name_list)
272 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
273 #define RExC_mysv (pRExC_state->mysv1)
274 #define RExC_mysv1 (pRExC_state->mysv1)
275 #define RExC_mysv2 (pRExC_state->mysv2)
283 bool sWARN_EXPERIMENTAL__VLB;
284 bool sWARN_EXPERIMENTAL__REGEX_SETS;
287 #define RExC_flags (pRExC_state->flags)
288 #define RExC_pm_flags (pRExC_state->pm_flags)
289 #define RExC_precomp (pRExC_state->precomp)
290 #define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
291 #define RExC_copy_start_in_constructed (pRExC_state->copy_start)
292 #define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
293 #define RExC_precomp_end (pRExC_state->precomp_end)
294 #define RExC_rx_sv (pRExC_state->rx_sv)
295 #define RExC_rx (pRExC_state->rx)
296 #define RExC_rxi (pRExC_state->rxi)
297 #define RExC_start (pRExC_state->start)
298 #define RExC_end (pRExC_state->end)
299 #define RExC_parse (pRExC_state->parse)
300 #define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
301 #define RExC_whilem_seen (pRExC_state->whilem_seen)
302 #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
303 under /d from /u ? */
305 #ifdef RE_TRACK_PATTERN_OFFSETS
306 # define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
309 #define RExC_emit (pRExC_state->emit)
310 #define RExC_emit_start (pRExC_state->emit_start)
311 #define RExC_sawback (pRExC_state->sawback)
312 #define RExC_seen (pRExC_state->seen)
313 #define RExC_size (pRExC_state->size)
314 #define RExC_maxlen (pRExC_state->maxlen)
315 #define RExC_npar (pRExC_state->npar)
316 #define RExC_total_parens (pRExC_state->total_par)
317 #define RExC_parens_buf_size (pRExC_state->parens_buf_size)
318 #define RExC_nestroot (pRExC_state->nestroot)
319 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
320 #define RExC_utf8 (pRExC_state->utf8)
321 #define RExC_uni_semantics (pRExC_state->uni_semantics)
322 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
323 #define RExC_open_parens (pRExC_state->open_parens)
324 #define RExC_close_parens (pRExC_state->close_parens)
325 #define RExC_end_op (pRExC_state->end_op)
326 #define RExC_paren_names (pRExC_state->paren_names)
327 #define RExC_recurse (pRExC_state->recurse)
328 #define RExC_recurse_count (pRExC_state->recurse_count)
329 #define RExC_sets_depth (pRExC_state->sets_depth)
330 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
331 #define RExC_study_chunk_recursed_bytes \
332 (pRExC_state->study_chunk_recursed_bytes)
333 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
334 #define RExC_in_lookahead (pRExC_state->in_lookahead)
335 #define RExC_contains_locale (pRExC_state->contains_locale)
336 #define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
339 # define SET_recode_x_to_native(x) \
340 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
342 # define SET_recode_x_to_native(x) NOOP
345 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
346 #define RExC_frame_head (pRExC_state->frame_head)
347 #define RExC_frame_last (pRExC_state->frame_last)
348 #define RExC_frame_count (pRExC_state->frame_count)
349 #define RExC_strict (pRExC_state->strict)
350 #define RExC_study_started (pRExC_state->study_started)
351 #define RExC_warn_text (pRExC_state->warn_text)
352 #define RExC_in_script_run (pRExC_state->in_script_run)
353 #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
354 #define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
355 #define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
356 #define RExC_unlexed_names (pRExC_state->unlexed_names)
358 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
359 * a flag to disable back-off on the fixed/floating substrings - if it's
360 * a high complexity pattern we assume the benefit of avoiding a full match
361 * is worth the cost of checking for the substrings even if they rarely help.
363 #define RExC_naughty (pRExC_state->naughty)
364 #define TOO_NAUGHTY (10)
365 #define MARK_NAUGHTY(add) \
366 if (RExC_naughty < TOO_NAUGHTY) \
367 RExC_naughty += (add)
368 #define MARK_NAUGHTY_EXP(exp, add) \
369 if (RExC_naughty < TOO_NAUGHTY) \
370 RExC_naughty += RExC_naughty / (exp) + (add)
372 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
373 #define ISMULT2(s) (ISMULT1(*s) || ((*s) == '{' && regcurly(s)))
376 * Flags to be passed up and down.
378 #define HASWIDTH 0x01 /* Known to not match null strings, could match
380 #define SIMPLE 0x02 /* Exactly one character wide */
381 /* (or LNBREAK as a special case) */
382 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
383 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
384 #define RESTART_PARSE 0x20 /* Need to redo the parse */
385 #define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
386 calcuate sizes as UTF-8 */
388 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
390 /* whether trie related optimizations are enabled */
391 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
392 #define TRIE_STUDY_OPT
393 #define FULL_TRIE_STUDY
399 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
400 #define PBITVAL(paren) (1 << ((paren) & 7))
401 #define PAREN_OFFSET(depth) \
402 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
403 #define PAREN_TEST(depth, paren) \
404 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
405 #define PAREN_SET(depth, paren) \
406 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
407 #define PAREN_UNSET(depth, paren) \
408 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
410 #define REQUIRE_UTF8(flagp) STMT_START { \
412 *flagp = RESTART_PARSE|NEED_UTF8; \
417 /* /u is to be chosen if we are supposed to use Unicode rules, or if the
418 * pattern is in UTF-8. This latter condition is in case the outermost rules
419 * are locale. See GH #17278 */
420 #define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF)
422 /* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
423 * a flag that indicates we need to override /d with /u as a result of
424 * something in the pattern. It should only be used in regards to calling
425 * set_regex_charset() or get_regex_charset() */
426 #define REQUIRE_UNI_RULES(flagp, restart_retval) \
428 if (DEPENDS_SEMANTICS) { \
429 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
430 RExC_uni_semantics = 1; \
431 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
432 /* No need to restart the parse if we haven't seen \
433 * anything that differs between /u and /d, and no need \
434 * to restart immediately if we're going to reparse \
435 * anyway to count parens */ \
436 *flagp |= RESTART_PARSE; \
437 return restart_retval; \
442 #define REQUIRE_BRANCHJ(flagp, restart_retval) \
444 RExC_use_BRANCHJ = 1; \
445 *flagp |= RESTART_PARSE; \
446 return restart_retval; \
449 /* Until we have completed the parse, we leave RExC_total_parens at 0 or
450 * less. After that, it must always be positive, because the whole re is
451 * considered to be surrounded by virtual parens. Setting it to negative
452 * indicates there is some construct that needs to know the actual number of
453 * parens to be properly handled. And that means an extra pass will be
454 * required after we've counted them all */
455 #define ALL_PARENS_COUNTED (RExC_total_parens > 0)
456 #define REQUIRE_PARENS_PASS \
457 STMT_START { /* No-op if have completed a pass */ \
458 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
460 #define IN_PARENS_PASS (RExC_total_parens < 0)
463 /* This is used to return failure (zero) early from the calling function if
464 * various flags in 'flags' are set. Two flags always cause a return:
465 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
466 * additional flags that should cause a return; 0 if none. If the return will
467 * be done, '*flagp' is first set to be all of the flags that caused the
469 #define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
471 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
472 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
477 #define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
479 #define RETURN_FAIL_ON_RESTART(flags,flagp) \
480 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
481 #define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
482 if (MUST_RESTART(*(flagp))) return 0
484 /* This converts the named class defined in regcomp.h to its equivalent class
485 * number defined in handy.h. */
486 #define namedclass_to_classnum(class) ((int) ((class) / 2))
487 #define classnum_to_namedclass(classnum) ((classnum) * 2)
489 #define _invlist_union_complement_2nd(a, b, output) \
490 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
491 #define _invlist_intersection_complement_2nd(a, b, output) \
492 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
494 /* We add a marker if we are deferring expansion of a property that is both
495 * 1) potentiallly user-defined; and
496 * 2) could also be an official Unicode property.
498 * Without this marker, any deferred expansion can only be for a user-defined
499 * one. This marker shouldn't conflict with any that could be in a legal name,
500 * and is appended to its name to indicate this. There is a string and
502 #define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
503 #define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
505 /* What is infinity for optimization purposes */
506 #define OPTIMIZE_INFTY SSize_t_MAX
508 /* About scan_data_t.
510 During optimisation we recurse through the regexp program performing
511 various inplace (keyhole style) optimisations. In addition study_chunk
512 and scan_commit populate this data structure with information about
513 what strings MUST appear in the pattern. We look for the longest
514 string that must appear at a fixed location, and we look for the
515 longest string that may appear at a floating location. So for instance
520 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
521 strings (because they follow a .* construct). study_chunk will identify
522 both FOO and BAR as being the longest fixed and floating strings respectively.
524 The strings can be composites, for instance
528 will result in a composite fixed substring 'foo'.
530 For each string some basic information is maintained:
533 This is the position the string must appear at, or not before.
534 It also implicitly (when combined with minlenp) tells us how many
535 characters must match before the string we are searching for.
536 Likewise when combined with minlenp and the length of the string it
537 tells us how many characters must appear after the string we have
541 Only used for floating strings. This is the rightmost point that
542 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
543 string can occur infinitely far to the right.
544 For fixed strings, it is equal to min_offset.
547 A pointer to the minimum number of characters of the pattern that the
548 string was found inside. This is important as in the case of positive
549 lookahead or positive lookbehind we can have multiple patterns
554 The minimum length of the pattern overall is 3, the minimum length
555 of the lookahead part is 3, but the minimum length of the part that
556 will actually match is 1. So 'FOO's minimum length is 3, but the
557 minimum length for the F is 1. This is important as the minimum length
558 is used to determine offsets in front of and behind the string being
559 looked for. Since strings can be composites this is the length of the
560 pattern at the time it was committed with a scan_commit. Note that
561 the length is calculated by study_chunk, so that the minimum lengths
562 are not known until the full pattern has been compiled, thus the
563 pointer to the value.
567 In the case of lookbehind the string being searched for can be
568 offset past the start point of the final matching string.
569 If this value was just blithely removed from the min_offset it would
570 invalidate some of the calculations for how many chars must match
571 before or after (as they are derived from min_offset and minlen and
572 the length of the string being searched for).
573 When the final pattern is compiled and the data is moved from the
574 scan_data_t structure into the regexp structure the information
575 about lookbehind is factored in, with the information that would
576 have been lost precalculated in the end_shift field for the
579 The fields pos_min and pos_delta are used to store the minimum offset
580 and the delta to the maximum offset at the current point in the pattern.
584 struct scan_data_substrs {
585 SV *str; /* longest substring found in pattern */
586 SSize_t min_offset; /* earliest point in string it can appear */
587 SSize_t max_offset; /* latest point in string it can appear */
588 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
589 SSize_t lookbehind; /* is the pos of the string modified by LB */
590 I32 flags; /* per substring SF_* and SCF_* flags */
593 typedef struct scan_data_t {
594 /*I32 len_min; unused */
595 /*I32 len_delta; unused */
599 SSize_t last_end; /* min value, <0 unless valid. */
600 SSize_t last_start_min;
601 SSize_t last_start_max;
602 U8 cur_is_floating; /* whether the last_* values should be set as
603 * the next fixed (0) or floating (1)
606 /* [0] is longest fixed substring so far, [1] is longest float so far */
607 struct scan_data_substrs substrs[2];
609 I32 flags; /* common SF_* and SCF_* flags */
611 SSize_t *last_closep;
612 regnode_ssc *start_class;
616 * Forward declarations for pregcomp()'s friends.
619 static const scan_data_t zero_scan_data = {
620 0, 0, NULL, 0, 0, 0, 0,
622 { NULL, 0, 0, 0, 0, 0 },
623 { NULL, 0, 0, 0, 0, 0 },
630 #define SF_BEFORE_SEOL 0x0001
631 #define SF_BEFORE_MEOL 0x0002
632 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
634 #define SF_IS_INF 0x0040
635 #define SF_HAS_PAR 0x0080
636 #define SF_IN_PAR 0x0100
637 #define SF_HAS_EVAL 0x0200
640 /* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
641 * longest substring in the pattern. When it is not set the optimiser keeps
642 * track of position, but does not keep track of the actual strings seen,
644 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
647 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
648 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
649 * turned off because of the alternation (BRANCH). */
650 #define SCF_DO_SUBSTR 0x0400
652 #define SCF_DO_STCLASS_AND 0x0800
653 #define SCF_DO_STCLASS_OR 0x1000
654 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
655 #define SCF_WHILEM_VISITED_POS 0x2000
657 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
658 #define SCF_SEEN_ACCEPT 0x8000
659 #define SCF_TRIE_DOING_RESTUDY 0x10000
660 #define SCF_IN_DEFINE 0x20000
665 #define UTF cBOOL(RExC_utf8)
667 /* The enums for all these are ordered so things work out correctly */
668 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
669 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
670 == REGEX_DEPENDS_CHARSET)
671 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
672 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
673 >= REGEX_UNICODE_CHARSET)
674 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
675 == REGEX_ASCII_RESTRICTED_CHARSET)
676 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
677 >= REGEX_ASCII_RESTRICTED_CHARSET)
678 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
679 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
681 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
683 /* For programs that want to be strictly Unicode compatible by dying if any
684 * attempt is made to match a non-Unicode code point against a Unicode
686 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
688 #define OOB_NAMEDCLASS -1
690 /* There is no code point that is out-of-bounds, so this is problematic. But
691 * its only current use is to initialize a variable that is always set before
693 #define OOB_UNICODE 0xDEADBEEF
695 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
698 /* length of regex to show in messages that don't mark a position within */
699 #define RegexLengthToShowInErrorMessages 127
702 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
703 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
704 * op/pragma/warn/regcomp.
706 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
707 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
709 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
710 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
712 /* The code in this file in places uses one level of recursion with parsing
713 * rebased to an alternate string constructed by us in memory. This can take
714 * the form of something that is completely different from the input, or
715 * something that uses the input as part of the alternate. In the first case,
716 * there should be no possibility of an error, as we are in complete control of
717 * the alternate string. But in the second case we don't completely control
718 * the input portion, so there may be errors in that. Here's an example:
720 * is handled specially because \x{df} folds to a sequence of more than one
721 * character: 'ss'. What is done is to create and parse an alternate string,
722 * which looks like this:
723 * /(?:\x{DF}|[abc\x{DF}def])/ui
724 * where it uses the input unchanged in the middle of something it constructs,
725 * which is a branch for the DF outside the character class, and clustering
726 * parens around the whole thing. (It knows enough to skip the DF inside the
727 * class while in this substitute parse.) 'abc' and 'def' may have errors that
728 * need to be reported. The general situation looks like this:
730 * |<------- identical ------>|
732 * Input: ---------------------------------------------------------------
733 * Constructed: ---------------------------------------------------
735 * |<------- identical ------>|
737 * sI..eI is the portion of the input pattern we are concerned with here.
738 * sC..EC is the constructed substitute parse string.
739 * sC..tC is constructed by us
740 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
741 * In the diagram, these are vertically aligned.
742 * eC..EC is also constructed by us.
743 * xC is the position in the substitute parse string where we found a
745 * xI is the position in the original pattern corresponding to xC.
747 * We want to display a message showing the real input string. Thus we need to
748 * translate from xC to xI. We know that xC >= tC, since the portion of the
749 * string sC..tC has been constructed by us, and so shouldn't have errors. We
751 * xI = tI + (xC - tC)
753 * When the substitute parse is constructed, the code needs to set:
756 * RExC_copy_start_in_input (tI)
757 * RExC_copy_start_in_constructed (tC)
758 * and restore them when done.
760 * During normal processing of the input pattern, both
761 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
762 * sI, so that xC equals xI.
765 #define sI RExC_precomp
766 #define eI RExC_precomp_end
767 #define sC RExC_start
769 #define tI RExC_copy_start_in_input
770 #define tC RExC_copy_start_in_constructed
771 #define xI(xC) (tI + (xC - tC))
772 #define xI_offset(xC) (xI(xC) - sI)
774 #define REPORT_LOCATION_ARGS(xC) \
776 (xI(xC) > eI) /* Don't run off end */ \
777 ? eI - sI /* Length before the <--HERE */ \
778 : ((xI_offset(xC) >= 0) \
780 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
781 IVdf " trying to output message for " \
783 __FILE__, __LINE__, (IV) xI_offset(xC), \
784 ((int) (eC - sC)), sC), 0)), \
785 sI), /* The input pattern printed up to the <--HERE */ \
787 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
788 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
790 /* Used to point after bad bytes for an error message, but avoid skipping
791 * past a nul byte. */
792 #define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
794 /* Set up to clean up after our imminent demise */
795 #define PREPARE_TO_DIE \
798 SAVEFREESV(RExC_rx_sv); \
799 if (RExC_open_parens) \
800 SAVEFREEPV(RExC_open_parens); \
801 if (RExC_close_parens) \
802 SAVEFREEPV(RExC_close_parens); \
806 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
807 * arg. Show regex, up to a maximum length. If it's too long, chop and add
810 #define _FAIL(code) STMT_START { \
811 const char *ellipses = ""; \
812 IV len = RExC_precomp_end - RExC_precomp; \
815 if (len > RegexLengthToShowInErrorMessages) { \
816 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
817 len = RegexLengthToShowInErrorMessages - 10; \
823 #define FAIL(msg) _FAIL( \
824 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
825 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
827 #define FAIL2(msg,arg) _FAIL( \
828 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
829 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
831 #define FAIL3(msg,arg1,arg2) _FAIL( \
832 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
833 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
836 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
838 #define Simple_vFAIL(m) STMT_START { \
839 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
840 m, REPORT_LOCATION_ARGS(RExC_parse)); \
844 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
846 #define vFAIL(m) STMT_START { \
852 * Like Simple_vFAIL(), but accepts two arguments.
854 #define Simple_vFAIL2(m,a1) STMT_START { \
855 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
856 REPORT_LOCATION_ARGS(RExC_parse)); \
860 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
862 #define vFAIL2(m,a1) STMT_START { \
864 Simple_vFAIL2(m, a1); \
869 * Like Simple_vFAIL(), but accepts three arguments.
871 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
872 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
873 REPORT_LOCATION_ARGS(RExC_parse)); \
877 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
879 #define vFAIL3(m,a1,a2) STMT_START { \
881 Simple_vFAIL3(m, a1, a2); \
885 * Like Simple_vFAIL(), but accepts four arguments.
887 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
888 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
889 REPORT_LOCATION_ARGS(RExC_parse)); \
892 #define vFAIL4(m,a1,a2,a3) STMT_START { \
894 Simple_vFAIL4(m, a1, a2, a3); \
897 /* A specialized version of vFAIL2 that works with UTF8f */
898 #define vFAIL2utf8f(m, a1) STMT_START { \
900 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
901 REPORT_LOCATION_ARGS(RExC_parse)); \
904 #define vFAIL3utf8f(m, a1, a2) STMT_START { \
906 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
907 REPORT_LOCATION_ARGS(RExC_parse)); \
910 /* Setting this to NULL is a signal to not output warnings */
911 #define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
913 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
914 RExC_copy_start_in_constructed = NULL; \
916 #define RESTORE_WARNINGS \
917 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
919 /* Since a warning can be generated multiple times as the input is reparsed, we
920 * output it the first time we come to that point in the parse, but suppress it
921 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
922 * generate any warnings */
923 #define TO_OUTPUT_WARNINGS(loc) \
924 ( RExC_copy_start_in_constructed \
925 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
927 /* After we've emitted a warning, we save the position in the input so we don't
929 #define UPDATE_WARNINGS_LOC(loc) \
931 if (TO_OUTPUT_WARNINGS(loc)) { \
932 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
937 /* 'warns' is the output of the packWARNx macro used in 'code' */
938 #define _WARN_HELPER(loc, warns, code) \
940 if (! RExC_copy_start_in_constructed) { \
941 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
942 " expected at '%s'", \
943 __FILE__, __LINE__, loc); \
945 if (TO_OUTPUT_WARNINGS(loc)) { \
949 UPDATE_WARNINGS_LOC(loc); \
953 /* m is not necessarily a "literal string", in this macro */
954 #define warn_non_literal_string(loc, packed_warn, m) \
955 _WARN_HELPER(loc, packed_warn, \
956 Perl_warner(aTHX_ packed_warn, \
957 "%s" REPORT_LOCATION, \
958 m, REPORT_LOCATION_ARGS(loc)))
959 #define reg_warn_non_literal_string(loc, m) \
960 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
962 #define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
965 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
966 Newx(format, format_size, char); \
967 my_strlcpy(format, m, format_size); \
968 my_strlcat(format, REPORT_LOCATION, format_size); \
969 SAVEFREEPV(format); \
970 _WARN_HELPER(loc, packwarn, \
971 Perl_ck_warner(aTHX_ packwarn, \
973 a1, REPORT_LOCATION_ARGS(loc))); \
976 #define ckWARNreg(loc,m) \
977 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
978 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
980 REPORT_LOCATION_ARGS(loc)))
982 #define vWARN(loc, m) \
983 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
984 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
986 REPORT_LOCATION_ARGS(loc))) \
988 #define vWARN_dep(loc, m) \
989 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
990 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
992 REPORT_LOCATION_ARGS(loc)))
994 #define ckWARNdep(loc,m) \
995 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
996 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
998 REPORT_LOCATION_ARGS(loc)))
1000 #define ckWARNregdep(loc,m) \
1001 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
1002 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
1004 m REPORT_LOCATION, \
1005 REPORT_LOCATION_ARGS(loc)))
1007 #define ckWARN2reg_d(loc,m, a1) \
1008 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1009 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
1010 m REPORT_LOCATION, \
1011 a1, REPORT_LOCATION_ARGS(loc)))
1013 #define ckWARN2reg(loc, m, a1) \
1014 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1015 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1016 m REPORT_LOCATION, \
1017 a1, REPORT_LOCATION_ARGS(loc)))
1019 #define vWARN3(loc, m, a1, a2) \
1020 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1021 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1022 m REPORT_LOCATION, \
1023 a1, a2, REPORT_LOCATION_ARGS(loc)))
1025 #define ckWARN3reg(loc, m, a1, a2) \
1026 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1027 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1028 m REPORT_LOCATION, \
1030 REPORT_LOCATION_ARGS(loc)))
1032 #define vWARN4(loc, m, a1, a2, a3) \
1033 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1034 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1035 m REPORT_LOCATION, \
1037 REPORT_LOCATION_ARGS(loc)))
1039 #define ckWARN4reg(loc, m, a1, a2, a3) \
1040 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1041 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
1042 m REPORT_LOCATION, \
1044 REPORT_LOCATION_ARGS(loc)))
1046 #define vWARN5(loc, m, a1, a2, a3, a4) \
1047 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1048 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
1049 m REPORT_LOCATION, \
1051 REPORT_LOCATION_ARGS(loc)))
1053 #define ckWARNexperimental(loc, class, m) \
1055 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1056 RExC_warned_ ## class = 1; \
1057 _WARN_HELPER(loc, packWARN(class), \
1058 Perl_ck_warner_d(aTHX_ packWARN(class), \
1059 m REPORT_LOCATION, \
1060 REPORT_LOCATION_ARGS(loc)));\
1064 /* Convert between a pointer to a node and its offset from the beginning of the
1066 #define REGNODE_p(offset) (RExC_emit_start + (offset))
1067 #define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1069 /* Macros for recording node offsets. 20001227 mjd@plover.com
1070 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1071 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1072 * Element 0 holds the number n.
1073 * Position is 1 indexed.
1075 #ifndef RE_TRACK_PATTERN_OFFSETS
1076 #define Set_Node_Offset_To_R(offset,byte)
1077 #define Set_Node_Offset(node,byte)
1078 #define Set_Cur_Node_Offset
1079 #define Set_Node_Length_To_R(node,len)
1080 #define Set_Node_Length(node,len)
1081 #define Set_Node_Cur_Length(node,start)
1082 #define Node_Offset(n)
1083 #define Node_Length(n)
1084 #define Set_Node_Offset_Length(node,offset,len)
1085 #define ProgLen(ri) ri->u.proglen
1086 #define SetProgLen(ri,x) ri->u.proglen = x
1087 #define Track_Code(code)
1089 #define ProgLen(ri) ri->u.offsets[0]
1090 #define SetProgLen(ri,x) ri->u.offsets[0] = x
1091 #define Set_Node_Offset_To_R(offset,byte) STMT_START { \
1092 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
1093 __LINE__, (int)(offset), (int)(byte))); \
1094 if((offset) < 0) { \
1095 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
1098 RExC_offsets[2*(offset)-1] = (byte); \
1102 #define Set_Node_Offset(node,byte) \
1103 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
1104 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1106 #define Set_Node_Length_To_R(node,len) STMT_START { \
1107 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
1108 __LINE__, (int)(node), (int)(len))); \
1110 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1113 RExC_offsets[2*(node)] = (len); \
1117 #define Set_Node_Length(node,len) \
1118 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
1119 #define Set_Node_Cur_Length(node, start) \
1120 Set_Node_Length(node, RExC_parse - start)
1122 /* Get offsets and lengths */
1123 #define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1124 #define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
1126 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1127 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1128 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
1131 #define Track_Code(code) STMT_START { code } STMT_END
1134 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1135 #define EXPERIMENTAL_INPLACESCAN
1136 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
1140 Perl_re_printf(pTHX_ const char *fmt, ...)
1144 PerlIO *f= Perl_debug_log;
1145 PERL_ARGS_ASSERT_RE_PRINTF;
1147 result = PerlIO_vprintf(f, fmt, ap);
1153 Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
1157 PerlIO *f= Perl_debug_log;
1158 PERL_ARGS_ASSERT_RE_INDENTF;
1159 va_start(ap, depth);
1160 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
1161 result = PerlIO_vprintf(f, fmt, ap);
1165 #endif /* DEBUGGING */
1167 #define DEBUG_RExC_seen() \
1168 DEBUG_OPTIMISE_MORE_r({ \
1169 Perl_re_printf( aTHX_ "RExC_seen: "); \
1171 if (RExC_seen & REG_ZERO_LEN_SEEN) \
1172 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
1174 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
1175 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
1177 if (RExC_seen & REG_GPOS_SEEN) \
1178 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
1180 if (RExC_seen & REG_RECURSE_SEEN) \
1181 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
1183 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
1184 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
1186 if (RExC_seen & REG_VERBARG_SEEN) \
1187 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
1189 if (RExC_seen & REG_CUTGROUP_SEEN) \
1190 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
1192 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
1193 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
1195 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
1196 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
1198 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
1199 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
1201 Perl_re_printf( aTHX_ "\n"); \
1204 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
1205 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
1210 S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1211 const char *close_str)
1216 Perl_re_printf( aTHX_ "%s", open_str);
1217 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1218 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
1219 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1220 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1221 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1222 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1223 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1224 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1225 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1226 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1227 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1228 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1229 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1230 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1231 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1232 Perl_re_printf( aTHX_ "%s", close_str);
1237 S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1238 U32 depth, int is_inf)
1240 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1242 DEBUG_OPTIMISE_MORE_r({
1245 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1249 (IV)data->pos_delta,
1253 S_debug_show_study_flags(aTHX_ data->flags," [","]");
1255 Perl_re_printf( aTHX_
1256 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1258 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1259 is_inf ? "INF " : ""
1262 if (data->last_found) {
1264 Perl_re_printf(aTHX_
1265 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1266 SvPVX_const(data->last_found),
1268 (IV)data->last_start_min,
1269 (IV)data->last_start_max
1272 for (i = 0; i < 2; i++) {
1273 Perl_re_printf(aTHX_
1274 " %s%s: '%s' @ %" IVdf "/%" IVdf,
1275 data->cur_is_floating == i ? "*" : "",
1276 i ? "Float" : "Fixed",
1277 SvPVX_const(data->substrs[i].str),
1278 (IV)data->substrs[i].min_offset,
1279 (IV)data->substrs[i].max_offset
1281 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1285 Perl_re_printf( aTHX_ "\n");
1291 S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1292 regnode *scan, U32 depth, U32 flags)
1294 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1301 Next = regnext(scan);
1302 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1303 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1306 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1307 Next ? (REG_NODE_NUM(Next)) : 0 );
1308 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1309 Perl_re_printf( aTHX_ "\n");
1314 # define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1315 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1317 # define DEBUG_PEEP(str, scan, depth, flags) \
1318 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1321 # define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1322 # define DEBUG_PEEP(str, scan, depth, flags) NOOP
1326 /* =========================================================
1327 * BEGIN edit_distance stuff.
1329 * This calculates how many single character changes of any type are needed to
1330 * transform a string into another one. It is taken from version 3.1 of
1332 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1335 /* Our unsorted dictionary linked list. */
1336 /* Note we use UVs, not chars. */
1341 struct dictionary* next;
1343 typedef struct dictionary item;
1346 PERL_STATIC_INLINE item*
1347 push(UV key, item* curr)
1350 Newx(head, 1, item);
1358 PERL_STATIC_INLINE item*
1359 find(item* head, UV key)
1361 item* iterator = head;
1363 if (iterator->key == key){
1366 iterator = iterator->next;
1372 PERL_STATIC_INLINE item*
1373 uniquePush(item* head, UV key)
1375 item* iterator = head;
1378 if (iterator->key == key) {
1381 iterator = iterator->next;
1384 return push(key, head);
1387 PERL_STATIC_INLINE void
1388 dict_free(item* head)
1390 item* iterator = head;
1393 item* temp = iterator;
1394 iterator = iterator->next;
1401 /* End of Dictionary Stuff */
1403 /* All calculations/work are done here */
1405 S_edit_distance(const UV* src,
1407 const STRLEN x, /* length of src[] */
1408 const STRLEN y, /* length of tgt[] */
1409 const SSize_t maxDistance
1413 UV swapCount, swapScore, targetCharCount, i, j;
1415 UV score_ceil = x + y;
1417 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1419 /* intialize matrix start values */
1420 Newx(scores, ( (x + 2) * (y + 2)), UV);
1421 scores[0] = score_ceil;
1422 scores[1 * (y + 2) + 0] = score_ceil;
1423 scores[0 * (y + 2) + 1] = score_ceil;
1424 scores[1 * (y + 2) + 1] = 0;
1425 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
1430 for (i=1;i<=x;i++) {
1432 head = uniquePush(head, src[i]);
1433 scores[(i+1) * (y + 2) + 1] = i;
1434 scores[(i+1) * (y + 2) + 0] = score_ceil;
1437 for (j=1;j<=y;j++) {
1440 head = uniquePush(head, tgt[j]);
1441 scores[1 * (y + 2) + (j + 1)] = j;
1442 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1445 targetCharCount = find(head, tgt[j-1])->value;
1446 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1448 if (src[i-1] != tgt[j-1]){
1449 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1453 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1457 find(head, src[i-1])->value = i;
1461 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1464 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1468 /* END of edit_distance() stuff
1469 * ========================================================= */
1471 /* Mark that we cannot extend a found fixed substring at this point.
1472 Update the longest found anchored substring or the longest found
1473 floating substrings if needed. */
1476 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1477 SSize_t *minlenp, int is_inf)
1479 const STRLEN l = CHR_SVLEN(data->last_found);
1480 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
1481 const STRLEN old_l = CHR_SVLEN(longest_sv);
1482 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1484 PERL_ARGS_ASSERT_SCAN_COMMIT;
1486 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
1487 const U8 i = data->cur_is_floating;
1488 SvSetMagicSV(longest_sv, data->last_found);
1489 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1492 data->substrs[0].max_offset = data->substrs[0].min_offset;
1494 data->substrs[1].max_offset =
1498 ? data->last_start_max
1499 /* temporary underflow guard for 5.32 */
1500 : data->pos_delta < 0 ? OPTIMIZE_INFTY
1501 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1503 : data->pos_min + data->pos_delta));
1506 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1507 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
1508 data->substrs[i].minlenp = minlenp;
1509 data->substrs[i].lookbehind = 0;
1512 SvCUR_set(data->last_found, 0);
1514 SV * const sv = data->last_found;
1515 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1516 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1521 data->last_end = -1;
1522 data->flags &= ~SF_BEFORE_EOL;
1523 DEBUG_STUDYDATA("commit", data, 0, is_inf);
1526 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1527 * list that describes which code points it matches */
1530 S_ssc_anything(pTHX_ regnode_ssc *ssc)
1532 /* Set the SSC 'ssc' to match an empty string or any code point */
1534 PERL_ARGS_ASSERT_SSC_ANYTHING;
1536 assert(is_ANYOF_SYNTHETIC(ssc));
1538 /* mortalize so won't leak */
1539 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
1540 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1544 S_ssc_is_anything(const regnode_ssc *ssc)
1546 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1547 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1548 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1549 * in any way, so there's no point in using it */
1554 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1556 assert(is_ANYOF_SYNTHETIC(ssc));
1558 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1562 /* See if the list consists solely of the range 0 - Infinity */
1563 invlist_iterinit(ssc->invlist);
1564 ret = invlist_iternext(ssc->invlist, &start, &end)
1568 invlist_iterfinish(ssc->invlist);
1574 /* If e.g., both \w and \W are set, matches everything */
1575 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1577 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1578 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1588 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1590 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1591 * string, any code point, or any posix class under locale */
1593 PERL_ARGS_ASSERT_SSC_INIT;
1595 Zero(ssc, 1, regnode_ssc);
1596 set_ANYOF_SYNTHETIC(ssc);
1597 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1600 /* If any portion of the regex is to operate under locale rules that aren't
1601 * fully known at compile time, initialization includes it. The reason
1602 * this isn't done for all regexes is that the optimizer was written under
1603 * the assumption that locale was all-or-nothing. Given the complexity and
1604 * lack of documentation in the optimizer, and that there are inadequate
1605 * test cases for locale, many parts of it may not work properly, it is
1606 * safest to avoid locale unless necessary. */
1607 if (RExC_contains_locale) {
1608 ANYOF_POSIXL_SETALL(ssc);
1611 ANYOF_POSIXL_ZERO(ssc);
1616 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1617 const regnode_ssc *ssc)
1619 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1620 * to the list of code points matched, and locale posix classes; hence does
1621 * not check its flags) */
1626 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1628 assert(is_ANYOF_SYNTHETIC(ssc));
1630 invlist_iterinit(ssc->invlist);
1631 ret = invlist_iternext(ssc->invlist, &start, &end)
1635 invlist_iterfinish(ssc->invlist);
1641 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1648 #define INVLIST_INDEX 0
1649 #define ONLY_LOCALE_MATCHES_INDEX 1
1650 #define DEFERRED_USER_DEFINED_INDEX 2
1653 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1654 const regnode_charclass* const node)
1656 /* Returns a mortal inversion list defining which code points are matched
1657 * by 'node', which is of type ANYOF. Handles complementing the result if
1658 * appropriate. If some code points aren't knowable at this time, the
1659 * returned list must, and will, contain every code point that is a
1663 SV* only_utf8_locale_invlist = NULL;
1665 const U32 n = ARG(node);
1666 bool new_node_has_latin1 = FALSE;
1667 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
1669 : ANYOF_FLAGS(node);
1671 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1673 /* Look at the data structure created by S_set_ANYOF_arg() */
1674 if (n != ANYOF_ONLY_HAS_BITMAP) {
1675 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1676 AV * const av = MUTABLE_AV(SvRV(rv));
1677 SV **const ary = AvARRAY(av);
1678 assert(RExC_rxi->data->what[n] == 's');
1680 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
1682 /* Here there are things that won't be known until runtime -- we
1683 * have to assume it could be anything */
1684 invlist = sv_2mortal(_new_invlist(1));
1685 return _add_range_to_invlist(invlist, 0, UV_MAX);
1687 else if (ary[INVLIST_INDEX]) {
1689 /* Use the node's inversion list */
1690 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1693 /* Get the code points valid only under UTF-8 locales */
1694 if ( (flags & ANYOFL_FOLD)
1695 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1697 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
1702 invlist = sv_2mortal(_new_invlist(0));
1705 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1706 * code points, and an inversion list for the others, but if there are code
1707 * points that should match only conditionally on the target string being
1708 * UTF-8, those are placed in the inversion list, and not the bitmap.
1709 * Since there are circumstances under which they could match, they are
1710 * included in the SSC. But if the ANYOF node is to be inverted, we have
1711 * to exclude them here, so that when we invert below, the end result
1712 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1713 * have to do this here before we add the unconditionally matched code
1715 if (flags & ANYOF_INVERT) {
1716 _invlist_intersection_complement_2nd(invlist,
1721 /* Add in the points from the bit map */
1722 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
1723 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1724 if (ANYOF_BITMAP_TEST(node, i)) {
1725 unsigned int start = i++;
1727 for (; i < NUM_ANYOF_CODE_POINTS
1728 && ANYOF_BITMAP_TEST(node, i); ++i)
1732 invlist = _add_range_to_invlist(invlist, start, i-1);
1733 new_node_has_latin1 = TRUE;
1738 /* If this can match all upper Latin1 code points, have to add them
1739 * as well. But don't add them if inverting, as when that gets done below,
1740 * it would exclude all these characters, including the ones it shouldn't
1741 * that were added just above */
1742 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1743 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1745 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1748 /* Similarly for these */
1749 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1750 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1753 if (flags & ANYOF_INVERT) {
1754 _invlist_invert(invlist);
1756 else if (flags & ANYOFL_FOLD) {
1757 if (new_node_has_latin1) {
1759 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1760 * the locale. We can skip this if there are no 0-255 at all. */
1761 _invlist_union(invlist, PL_Latin1, &invlist);
1763 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1764 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1767 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1768 invlist = add_cp_to_invlist(invlist, 'I');
1770 if (_invlist_contains_cp(invlist,
1771 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1773 invlist = add_cp_to_invlist(invlist, 'i');
1778 /* Similarly add the UTF-8 locale possible matches. These have to be
1779 * deferred until after the non-UTF-8 locale ones are taken care of just
1780 * above, or it leads to wrong results under ANYOF_INVERT */
1781 if (only_utf8_locale_invlist) {
1782 _invlist_union_maybe_complement_2nd(invlist,
1783 only_utf8_locale_invlist,
1784 flags & ANYOF_INVERT,
1791 /* These two functions currently do the exact same thing */
1792 #define ssc_init_zero ssc_init
1794 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1795 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1797 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1798 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1799 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1802 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1803 const regnode_charclass *and_with)
1805 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1806 * another SSC or a regular ANYOF class. Can create false positives. */
1809 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
1811 : ANYOF_FLAGS(and_with);
1814 PERL_ARGS_ASSERT_SSC_AND;
1816 assert(is_ANYOF_SYNTHETIC(ssc));
1818 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1819 * the code point inversion list and just the relevant flags */
1820 if (is_ANYOF_SYNTHETIC(and_with)) {
1821 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1822 anded_flags = and_with_flags;
1824 /* XXX This is a kludge around what appears to be deficiencies in the
1825 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1826 * there are paths through the optimizer where it doesn't get weeded
1827 * out when it should. And if we don't make some extra provision for
1828 * it like the code just below, it doesn't get added when it should.
1829 * This solution is to add it only when AND'ing, which is here, and
1830 * only when what is being AND'ed is the pristine, original node
1831 * matching anything. Thus it is like adding it to ssc_anything() but
1832 * only when the result is to be AND'ed. Probably the same solution
1833 * could be adopted for the same problem we have with /l matching,
1834 * which is solved differently in S_ssc_init(), and that would lead to
1835 * fewer false positives than that solution has. But if this solution
1836 * creates bugs, the consequences are only that a warning isn't raised
1837 * that should be; while the consequences for having /l bugs is
1838 * incorrect matches */
1839 if (ssc_is_anything((regnode_ssc *)and_with)) {
1840 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
1844 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1845 if (OP(and_with) == ANYOFD) {
1846 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
1849 anded_flags = and_with_flags
1850 &( ANYOF_COMMON_FLAGS
1851 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1852 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
1853 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
1855 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1860 ANYOF_FLAGS(ssc) &= anded_flags;
1862 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1863 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1864 * 'and_with' may be inverted. When not inverted, we have the situation of
1866 * (C1 | P1) & (C2 | P2)
1867 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1868 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1869 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1870 * <= ((C1 & C2) | P1 | P2)
1871 * Alternatively, the last few steps could be:
1872 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1873 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1874 * <= (C1 | C2 | (P1 & P2))
1875 * We favor the second approach if either P1 or P2 is non-empty. This is
1876 * because these components are a barrier to doing optimizations, as what
1877 * they match cannot be known until the moment of matching as they are
1878 * dependent on the current locale, 'AND"ing them likely will reduce or
1880 * But we can do better if we know that C1,P1 are in their initial state (a
1881 * frequent occurrence), each matching everything:
1882 * (<everything>) & (C2 | P2) = C2 | P2
1883 * Similarly, if C2,P2 are in their initial state (again a frequent
1884 * occurrence), the result is a no-op
1885 * (C1 | P1) & (<everything>) = C1 | P1
1888 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1889 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1890 * <= (C1 & ~C2) | (P1 & ~P2)
1893 if ((and_with_flags & ANYOF_INVERT)
1894 && ! is_ANYOF_SYNTHETIC(and_with))
1898 ssc_intersection(ssc,
1900 FALSE /* Has already been inverted */
1903 /* If either P1 or P2 is empty, the intersection will be also; can skip
1905 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
1906 ANYOF_POSIXL_ZERO(ssc);
1908 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1910 /* Note that the Posix class component P from 'and_with' actually
1912 * P = Pa | Pb | ... | Pn
1913 * where each component is one posix class, such as in [\w\s].
1915 * ~P = ~(Pa | Pb | ... | Pn)
1916 * = ~Pa & ~Pb & ... & ~Pn
1917 * <= ~Pa | ~Pb | ... | ~Pn
1918 * The last is something we can easily calculate, but unfortunately
1919 * is likely to have many false positives. We could do better
1920 * in some (but certainly not all) instances if two classes in
1921 * P have known relationships. For example
1922 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1924 * :lower: & :print: = :lower:
1925 * And similarly for classes that must be disjoint. For example,
1926 * since \s and \w can have no elements in common based on rules in
1927 * the POSIX standard,
1928 * \w & ^\S = nothing
1929 * Unfortunately, some vendor locales do not meet the Posix
1930 * standard, in particular almost everything by Microsoft.
1931 * The loop below just changes e.g., \w into \W and vice versa */
1933 regnode_charclass_posixl temp;
1934 int add = 1; /* To calculate the index of the complement */
1936 Zero(&temp, 1, regnode_charclass_posixl);
1937 ANYOF_POSIXL_ZERO(&temp);
1938 for (i = 0; i < ANYOF_MAX; i++) {
1940 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1941 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1943 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1944 ANYOF_POSIXL_SET(&temp, i + add);
1946 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1948 ANYOF_POSIXL_AND(&temp, ssc);
1950 } /* else ssc already has no posixes */
1951 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1952 in its initial state */
1953 else if (! is_ANYOF_SYNTHETIC(and_with)
1954 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1956 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1957 * copy it over 'ssc' */
1958 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1959 if (is_ANYOF_SYNTHETIC(and_with)) {
1960 StructCopy(and_with, ssc, regnode_ssc);
1963 ssc->invlist = anded_cp_list;
1964 ANYOF_POSIXL_ZERO(ssc);
1965 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1966 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1970 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1971 || (and_with_flags & ANYOF_MATCHES_POSIXL))
1973 /* One or the other of P1, P2 is non-empty. */
1974 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1975 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1977 ssc_union(ssc, anded_cp_list, FALSE);
1979 else { /* P1 = P2 = empty */
1980 ssc_intersection(ssc, anded_cp_list, FALSE);
1986 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1987 const regnode_charclass *or_with)
1989 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1990 * another SSC or a regular ANYOF class. Can create false positives if
1991 * 'or_with' is to be inverted. */
1995 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
1997 : ANYOF_FLAGS(or_with);
1999 PERL_ARGS_ASSERT_SSC_OR;
2001 assert(is_ANYOF_SYNTHETIC(ssc));
2003 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2004 * the code point inversion list and just the relevant flags */
2005 if (is_ANYOF_SYNTHETIC(or_with)) {
2006 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
2007 ored_flags = or_with_flags;
2010 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
2011 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
2012 if (OP(or_with) != ANYOFD) {
2015 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2016 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
2017 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
2019 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2024 ANYOF_FLAGS(ssc) |= ored_flags;
2026 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2027 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2028 * 'or_with' may be inverted. When not inverted, we have the simple
2029 * situation of computing:
2030 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2031 * If P1|P2 yields a situation with both a class and its complement are
2032 * set, like having both \w and \W, this matches all code points, and we
2033 * can delete these from the P component of the ssc going forward. XXX We
2034 * might be able to delete all the P components, but I (khw) am not certain
2035 * about this, and it is better to be safe.
2038 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2039 * <= (C1 | P1) | ~C2
2040 * <= (C1 | ~C2) | P1
2041 * (which results in actually simpler code than the non-inverted case)
2044 if ((or_with_flags & ANYOF_INVERT)
2045 && ! is_ANYOF_SYNTHETIC(or_with))
2047 /* We ignore P2, leaving P1 going forward */
2048 } /* else Not inverted */
2049 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
2050 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
2051 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2053 for (i = 0; i < ANYOF_MAX; i += 2) {
2054 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2056 ssc_match_all_cp(ssc);
2057 ANYOF_POSIXL_CLEAR(ssc, i);
2058 ANYOF_POSIXL_CLEAR(ssc, i+1);
2066 FALSE /* Already has been inverted */
2071 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2073 PERL_ARGS_ASSERT_SSC_UNION;
2075 assert(is_ANYOF_SYNTHETIC(ssc));
2077 _invlist_union_maybe_complement_2nd(ssc->invlist,
2084 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2086 const bool invert2nd)
2088 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2090 assert(is_ANYOF_SYNTHETIC(ssc));
2092 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2099 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2101 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2103 assert(is_ANYOF_SYNTHETIC(ssc));
2105 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2109 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2111 /* AND just the single code point 'cp' into the SSC 'ssc' */
2113 SV* cp_list = _new_invlist(2);
2115 PERL_ARGS_ASSERT_SSC_CP_AND;
2117 assert(is_ANYOF_SYNTHETIC(ssc));
2119 cp_list = add_cp_to_invlist(cp_list, cp);
2120 ssc_intersection(ssc, cp_list,
2121 FALSE /* Not inverted */
2123 SvREFCNT_dec_NN(cp_list);
2127 S_ssc_clear_locale(regnode_ssc *ssc)
2129 /* Set the SSC 'ssc' to not match any locale things */
2130 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2132 assert(is_ANYOF_SYNTHETIC(ssc));
2134 ANYOF_POSIXL_ZERO(ssc);
2135 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2138 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2141 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2143 /* The synthetic start class is used to hopefully quickly winnow down
2144 * places where a pattern could start a match in the target string. If it
2145 * doesn't really narrow things down that much, there isn't much point to
2146 * having the overhead of using it. This function uses some very crude
2147 * heuristics to decide if to use the ssc or not.
2149 * It returns TRUE if 'ssc' rules out more than half what it considers to
2150 * be the "likely" possible matches, but of course it doesn't know what the
2151 * actual things being matched are going to be; these are only guesses
2153 * For /l matches, it assumes that the only likely matches are going to be
2154 * in the 0-255 range, uniformly distributed, so half of that is 127
2155 * For /a and /d matches, it assumes that the likely matches will be just
2156 * the ASCII range, so half of that is 63
2157 * For /u and there isn't anything matching above the Latin1 range, it
2158 * assumes that that is the only range likely to be matched, and uses
2159 * half that as the cut-off: 127. If anything matches above Latin1,
2160 * it assumes that all of Unicode could match (uniformly), except for
2161 * non-Unicode code points and things in the General Category "Other"
2162 * (unassigned, private use, surrogates, controls and formats). This
2163 * is a much large number. */
2165 U32 count = 0; /* Running total of number of code points matched by
2167 UV start, end; /* Start and end points of current range in inversion
2168 XXX outdated. UTF-8 locales are common, what about invert? list */
2169 const U32 max_code_points = (LOC)
2171 : (( ! UNI_SEMANTICS
2172 || invlist_highest(ssc->invlist) < 256)
2175 const U32 max_match = max_code_points / 2;
2177 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2179 invlist_iterinit(ssc->invlist);
2180 while (invlist_iternext(ssc->invlist, &start, &end)) {
2181 if (start >= max_code_points) {
2184 end = MIN(end, max_code_points - 1);
2185 count += end - start + 1;
2186 if (count >= max_match) {
2187 invlist_iterfinish(ssc->invlist);
2197 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2199 /* The inversion list in the SSC is marked mortal; now we need a more
2200 * permanent copy, which is stored the same way that is done in a regular
2201 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2204 SV* invlist = invlist_clone(ssc->invlist, NULL);
2206 PERL_ARGS_ASSERT_SSC_FINALIZE;
2208 assert(is_ANYOF_SYNTHETIC(ssc));
2210 /* The code in this file assumes that all but these flags aren't relevant
2211 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2212 * by the time we reach here */
2213 assert(! (ANYOF_FLAGS(ssc)
2214 & ~( ANYOF_COMMON_FLAGS
2215 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2216 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
2218 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2220 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
2221 SvREFCNT_dec(invlist);
2223 /* Make sure is clone-safe */
2224 ssc->invlist = NULL;
2226 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
2227 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
2228 OP(ssc) = ANYOFPOSIXL;
2230 else if (RExC_contains_locale) {
2234 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
2237 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2238 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2239 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
2240 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2241 ? (TRIE_LIST_CUR( idx ) - 1) \
2247 dump_trie(trie,widecharmap,revcharmap)
2248 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2249 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
2251 These routines dump out a trie in a somewhat readable format.
2252 The _interim_ variants are used for debugging the interim
2253 tables that are used to generate the final compressed
2254 representation which is what dump_trie expects.
2256 Part of the reason for their existence is to provide a form
2257 of documentation as to how the different representations function.
2262 Dumps the final compressed table form of the trie to Perl_debug_log.
2263 Used for debugging make_trie().
2267 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2268 AV *revcharmap, U32 depth)
2271 SV *sv=sv_newmortal();
2272 int colwidth= widecharmap ? 6 : 4;
2274 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2276 PERL_ARGS_ASSERT_DUMP_TRIE;
2278 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
2279 depth+1, "Match","Base","Ofs" );
2281 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2282 SV ** const tmp = av_fetch( revcharmap, state, 0);
2284 Perl_re_printf( aTHX_ "%*s",
2286 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2287 PL_colors[0], PL_colors[1],
2288 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2289 PERL_PV_ESCAPE_FIRSTCHAR
2294 Perl_re_printf( aTHX_ "\n");
2295 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
2297 for( state = 0 ; state < trie->uniquecharcount ; state++ )
2298 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2299 Perl_re_printf( aTHX_ "\n");
2301 for( state = 1 ; state < trie->statecount ; state++ ) {
2302 const U32 base = trie->states[ state ].trans.base;
2304 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
2306 if ( trie->states[ state ].wordnum ) {
2307 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
2309 Perl_re_printf( aTHX_ "%6s", "" );
2312 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
2317 while( ( base + ofs < trie->uniquecharcount ) ||
2318 ( base + ofs - trie->uniquecharcount < trie->lasttrans
2319 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2323 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
2325 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2326 if ( ( base + ofs >= trie->uniquecharcount )
2327 && ( base + ofs - trie->uniquecharcount
2329 && trie->trans[ base + ofs
2330 - trie->uniquecharcount ].check == state )
2332 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
2333 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2336 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
2340 Perl_re_printf( aTHX_ "]");
2343 Perl_re_printf( aTHX_ "\n" );
2345 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
2347 for (word=1; word <= trie->wordcount; word++) {
2348 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2349 (int)word, (int)(trie->wordinfo[word].prev),
2350 (int)(trie->wordinfo[word].len));
2352 Perl_re_printf( aTHX_ "\n" );
2355 Dumps a fully constructed but uncompressed trie in list form.
2356 List tries normally only are used for construction when the number of
2357 possible chars (trie->uniquecharcount) is very high.
2358 Used for debugging make_trie().
2361 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2362 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2366 SV *sv=sv_newmortal();
2367 int colwidth= widecharmap ? 6 : 4;
2368 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2370 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2372 /* print out the table precompression. */
2373 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
2375 Perl_re_indentf( aTHX_ "%s",
2376 depth+1, "------:-----+-----------------\n" );
2378 for( state=1 ; state < next_alloc ; state ++ ) {
2381 Perl_re_indentf( aTHX_ " %4" UVXf " :",
2382 depth+1, (UV)state );
2383 if ( ! trie->states[ state ].wordnum ) {
2384 Perl_re_printf( aTHX_ "%5s| ","");
2386 Perl_re_printf( aTHX_ "W%4x| ",
2387 trie->states[ state ].wordnum
2390 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2391 SV ** const tmp = av_fetch( revcharmap,
2392 TRIE_LIST_ITEM(state, charid).forid, 0);
2394 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
2396 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2398 PL_colors[0], PL_colors[1],
2399 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2400 | PERL_PV_ESCAPE_FIRSTCHAR
2402 TRIE_LIST_ITEM(state, charid).forid,
2403 (UV)TRIE_LIST_ITEM(state, charid).newstate
2406 Perl_re_printf( aTHX_ "\n%*s| ",
2407 (int)((depth * 2) + 14), "");
2410 Perl_re_printf( aTHX_ "\n");
2415 Dumps a fully constructed but uncompressed trie in table form.
2416 This is the normal DFA style state transition table, with a few
2417 twists to facilitate compression later.
2418 Used for debugging make_trie().
2421 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2422 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2427 SV *sv=sv_newmortal();
2428 int colwidth= widecharmap ? 6 : 4;
2429 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2431 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
2434 print out the table precompression so that we can do a visual check
2435 that they are identical.
2438 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
2440 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2441 SV ** const tmp = av_fetch( revcharmap, charid, 0);
2443 Perl_re_printf( aTHX_ "%*s",
2445 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
2446 PL_colors[0], PL_colors[1],
2447 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2448 PERL_PV_ESCAPE_FIRSTCHAR
2454 Perl_re_printf( aTHX_ "\n");
2455 Perl_re_indentf( aTHX_ "State+-", depth+1 );
2457 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
2458 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
2461 Perl_re_printf( aTHX_ "\n" );
2463 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2465 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
2467 (UV)TRIE_NODENUM( state ) );
2469 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2470 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2472 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
2474 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
2476 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
2477 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
2478 (UV)trie->trans[ state ].check );
2480 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
2481 (UV)trie->trans[ state ].check,
2482 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2490 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2491 startbranch: the first branch in the whole branch sequence
2492 first : start branch of sequence of branch-exact nodes.
2493 May be the same as startbranch
2494 last : Thing following the last branch.
2495 May be the same as tail.
2496 tail : item following the branch sequence
2497 count : words in the sequence
2498 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
2499 depth : indent depth
2501 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
2503 A trie is an N'ary tree where the branches are determined by digital
2504 decomposition of the key. IE, at the root node you look up the 1st character and
2505 follow that branch repeat until you find the end of the branches. Nodes can be
2506 marked as "accepting" meaning they represent a complete word. Eg:
2510 would convert into the following structure. Numbers represent states, letters
2511 following numbers represent valid transitions on the letter from that state, if
2512 the number is in square brackets it represents an accepting state, otherwise it
2513 will be in parenthesis.
2515 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2519 (1) +-i->(6)-+-s->[7]
2521 +-s->(3)-+-h->(4)-+-e->[5]
2523 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2525 This shows that when matching against the string 'hers' we will begin at state 1
2526 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2527 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2528 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2529 single traverse. We store a mapping from accepting to state to which word was
2530 matched, and then when we have multiple possibilities we try to complete the
2531 rest of the regex in the order in which they occurred in the alternation.
2533 The only prior NFA like behaviour that would be changed by the TRIE support is
2534 the silent ignoring of duplicate alternations which are of the form:
2536 / (DUPE|DUPE) X? (?{ ... }) Y /x
2538 Thus EVAL blocks following a trie may be called a different number of times with
2539 and without the optimisation. With the optimisations dupes will be silently
2540 ignored. This inconsistent behaviour of EVAL type nodes is well established as
2541 the following demonstrates:
2543 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2545 which prints out 'word' three times, but
2547 'words'=~/(word|word|word)(?{ print $1 })S/
2549 which doesnt print it out at all. This is due to other optimisations kicking in.
2551 Example of what happens on a structural level:
2553 The regexp /(ac|ad|ab)+/ will produce the following debug output:
2555 1: CURLYM[1] {1,32767}(18)
2566 This would be optimizable with startbranch=5, first=5, last=16, tail=16
2567 and should turn into:
2569 1: CURLYM[1] {1,32767}(18)
2571 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2579 Cases where tail != last would be like /(?foo|bar)baz/:
2589 which would be optimizable with startbranch=1, first=1, last=7, tail=8
2590 and would end up looking like:
2593 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2600 d = uvchr_to_utf8_flags(d, uv, 0);
2602 is the recommended Unicode-aware way of saying
2607 #define TRIE_STORE_REVCHAR(val) \
2610 SV *zlopp = newSV(UTF8_MAXBYTES); \
2611 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2612 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2614 SvCUR_set(zlopp, kapow - flrbbbbb); \
2617 av_push(revcharmap, zlopp); \
2619 char ooooff = (char)val; \
2620 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2624 /* This gets the next character from the input, folding it if not already
2626 #define TRIE_READ_CHAR STMT_START { \
2629 /* if it is UTF then it is either already folded, or does not need \
2631 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2633 else if (folder == PL_fold_latin1) { \
2634 /* This folder implies Unicode rules, which in the range expressible \
2635 * by not UTF is the lower case, with the two exceptions, one of \
2636 * which should have been taken care of before calling this */ \
2637 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2638 uvc = toLOWER_L1(*uc); \
2639 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2642 /* raw data, will be folded later if needed */ \
2650 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2651 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2652 U32 ging = TRIE_LIST_LEN( state ) * 2; \
2653 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2654 TRIE_LIST_LEN( state ) = ging; \
2656 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2657 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2658 TRIE_LIST_CUR( state )++; \
2661 #define TRIE_LIST_NEW(state) STMT_START { \
2662 Newx( trie->states[ state ].trans.list, \
2663 4, reg_trie_trans_le ); \
2664 TRIE_LIST_CUR( state ) = 1; \
2665 TRIE_LIST_LEN( state ) = 4; \
2668 #define TRIE_HANDLE_WORD(state) STMT_START { \
2669 U16 dupe= trie->states[ state ].wordnum; \
2670 regnode * const noper_next = regnext( noper ); \
2673 /* store the word for dumping */ \
2675 if (OP(noper) != NOTHING) \
2676 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2678 tmp = newSVpvn_utf8( "", 0, UTF ); \
2679 av_push( trie_words, tmp ); \
2683 trie->wordinfo[curword].prev = 0; \
2684 trie->wordinfo[curword].len = wordlen; \
2685 trie->wordinfo[curword].accept = state; \
2687 if ( noper_next < tail ) { \
2689 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2691 trie->jump[curword] = (U16)(noper_next - convert); \
2693 jumper = noper_next; \
2695 nextbranch= regnext(cur); \
2699 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2700 /* chain, so that when the bits of chain are later */\
2701 /* linked together, the dups appear in the chain */\
2702 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2703 trie->wordinfo[dupe].prev = curword; \
2705 /* we haven't inserted this word yet. */ \
2706 trie->states[ state ].wordnum = curword; \
2711 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2712 ( ( base + charid >= ucharcount \
2713 && base + charid < ubound \
2714 && state == trie->trans[ base - ucharcount + charid ].check \
2715 && trie->trans[ base - ucharcount + charid ].next ) \
2716 ? trie->trans[ base - ucharcount + charid ].next \
2717 : ( state==1 ? special : 0 ) \
2720 #define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2722 TRIE_BITMAP_SET(trie, uvc); \
2723 /* store the folded codepoint */ \
2725 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2728 /* store first byte of utf8 representation of */ \
2729 /* variant codepoints */ \
2730 if (! UVCHR_IS_INVARIANT(uvc)) { \
2731 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2736 #define MADE_JUMP_TRIE 2
2737 #define MADE_EXACT_TRIE 4
2740 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2741 regnode *first, regnode *last, regnode *tail,
2742 U32 word_count, U32 flags, U32 depth)
2744 /* first pass, loop through and scan words */
2745 reg_trie_data *trie;
2746 HV *widecharmap = NULL;
2747 AV *revcharmap = newAV();
2753 regnode *jumper = NULL;
2754 regnode *nextbranch = NULL;
2755 regnode *convert = NULL;
2756 U32 *prev_states; /* temp array mapping each state to previous one */
2757 /* we just use folder as a flag in utf8 */
2758 const U8 * folder = NULL;
2760 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2761 * which stands for one trie structure, one hash, optionally followed
2764 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2765 AV *trie_words = NULL;
2766 /* along with revcharmap, this only used during construction but both are
2767 * useful during debugging so we store them in the struct when debugging.
2770 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2771 STRLEN trie_charcount=0;
2773 SV *re_trie_maxbuff;
2774 DECLARE_AND_GET_RE_DEBUG_FLAGS;
2776 PERL_ARGS_ASSERT_MAKE_TRIE;
2778 PERL_UNUSED_ARG(depth);
2782 case EXACT: case EXACT_REQ8: case EXACTL: break;
2786 case EXACTFLU8: folder = PL_fold_latin1; break;
2787 case EXACTF: folder = PL_fold; break;
2788 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2791 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2793 trie->startstate = 1;
2794 trie->wordcount = word_count;
2795 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2796 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2797 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
2798 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2799 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2800 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2803 trie_words = newAV();
2806 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
2807 assert(re_trie_maxbuff);
2808 if (!SvIOK(re_trie_maxbuff)) {
2809 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2811 DEBUG_TRIE_COMPILE_r({
2812 Perl_re_indentf( aTHX_
2813 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2815 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
2816 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2819 /* Find the node we are going to overwrite */
2820 if ( first == startbranch && OP( last ) != BRANCH ) {
2821 /* whole branch chain */
2824 /* branch sub-chain */
2825 convert = NEXTOPER( first );
2828 /* -- First loop and Setup --
2830 We first traverse the branches and scan each word to determine if it
2831 contains widechars, and how many unique chars there are, this is
2832 important as we have to build a table with at least as many columns as we
2835 We use an array of integers to represent the character codes 0..255
2836 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2837 the native representation of the character value as the key and IV's for
2840 *TODO* If we keep track of how many times each character is used we can
2841 remap the columns so that the table compression later on is more
2842 efficient in terms of memory by ensuring the most common value is in the
2843 middle and the least common are on the outside. IMO this would be better
2844 than a most to least common mapping as theres a decent chance the most
2845 common letter will share a node with the least common, meaning the node
2846 will not be compressible. With a middle is most common approach the worst
2847 case is when we have the least common nodes twice.
2851 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2852 regnode *noper = NEXTOPER( cur );
2856 U32 wordlen = 0; /* required init */
2857 STRLEN minchars = 0;
2858 STRLEN maxchars = 0;
2859 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2862 if (OP(noper) == NOTHING) {
2863 /* skip past a NOTHING at the start of an alternation
2864 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2866 * If the next node is not something we are supposed to process
2867 * we will just ignore it due to the condition guarding the
2871 regnode *noper_next= regnext(noper);
2872 if (noper_next < tail)
2877 && ( OP(noper) == flags
2878 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2879 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
2880 || OP(noper) == EXACTFUP))))
2882 uc= (U8*)STRING(noper);
2883 e= uc + STR_LEN(noper);
2890 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2891 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2892 regardless of encoding */
2893 if (OP( noper ) == EXACTFUP) {
2894 /* false positives are ok, so just set this */
2895 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2899 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2901 TRIE_CHARCOUNT(trie)++;
2904 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2905 * is in effect. Under /i, this character can match itself, or
2906 * anything that folds to it. If not under /i, it can match just
2907 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2908 * all fold to k, and all are single characters. But some folds
2909 * expand to more than one character, so for example LATIN SMALL
2910 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2911 * the string beginning at 'uc' is 'ffi', it could be matched by
2912 * three characters, or just by the one ligature character. (It
2913 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2914 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2915 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2916 * match.) The trie needs to know the minimum and maximum number
2917 * of characters that could match so that it can use size alone to
2918 * quickly reject many match attempts. The max is simple: it is
2919 * the number of folded characters in this branch (since a fold is
2920 * never shorter than what folds to it. */
2924 /* And the min is equal to the max if not under /i (indicated by
2925 * 'folder' being NULL), or there are no multi-character folds. If
2926 * there is a multi-character fold, the min is incremented just
2927 * once, for the character that folds to the sequence. Each
2928 * character in the sequence needs to be added to the list below of
2929 * characters in the trie, but we count only the first towards the
2930 * min number of characters needed. This is done through the
2931 * variable 'foldlen', which is returned by the macros that look
2932 * for these sequences as the number of bytes the sequence
2933 * occupies. Each time through the loop, we decrement 'foldlen' by
2934 * how many bytes the current char occupies. Only when it reaches
2935 * 0 do we increment 'minchars' or look for another multi-character
2937 if (folder == NULL) {
2940 else if (foldlen > 0) {
2941 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2946 /* See if *uc is the beginning of a multi-character fold. If
2947 * so, we decrement the length remaining to look at, to account
2948 * for the current character this iteration. (We can use 'uc'
2949 * instead of the fold returned by TRIE_READ_CHAR because the
2950 * macro is smart enough to account for any unfolded
2953 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2954 foldlen -= UTF8SKIP(uc);
2957 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2962 /* The current character (and any potential folds) should be added
2963 * to the possible matching characters for this position in this
2967 U8 folded= folder[ (U8) uvc ];
2968 if ( !trie->charmap[ folded ] ) {
2969 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2970 TRIE_STORE_REVCHAR( folded );
2973 if ( !trie->charmap[ uvc ] ) {
2974 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2975 TRIE_STORE_REVCHAR( uvc );
2978 /* store the codepoint in the bitmap, and its folded
2980 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
2981 set_bit = 0; /* We've done our bit :-) */
2985 /* XXX We could come up with the list of code points that fold
2986 * to this using PL_utf8_foldclosures, except not for
2987 * multi-char folds, as there may be multiple combinations
2988 * there that could work, which needs to wait until runtime to
2989 * resolve (The comment about LIGATURE FFI above is such an
2994 widecharmap = newHV();
2996 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2999 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
3001 if ( !SvTRUE( *svpp ) ) {
3002 sv_setiv( *svpp, ++trie->uniquecharcount );
3003 TRIE_STORE_REVCHAR(uvc);
3006 } /* end loop through characters in this branch of the trie */
3008 /* We take the min and max for this branch and combine to find the min
3009 * and max for all branches processed so far */
3010 if( cur == first ) {
3011 trie->minlen = minchars;
3012 trie->maxlen = maxchars;
3013 } else if (minchars < trie->minlen) {
3014 trie->minlen = minchars;
3015 } else if (maxchars > trie->maxlen) {
3016 trie->maxlen = maxchars;
3018 } /* end first pass */
3019 DEBUG_TRIE_COMPILE_r(
3020 Perl_re_indentf( aTHX_
3021 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3023 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
3024 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3025 (int)trie->minlen, (int)trie->maxlen )
3029 We now know what we are dealing with in terms of unique chars and
3030 string sizes so we can calculate how much memory a naive
3031 representation using a flat table will take. If it's over a reasonable
3032 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
3033 conservative but potentially much slower representation using an array
3036 At the end we convert both representations into the same compressed
3037 form that will be used in regexec.c for matching with. The latter
3038 is a form that cannot be used to construct with but has memory
3039 properties similar to the list form and access properties similar
3040 to the table form making it both suitable for fast searches and
3041 small enough that its feasable to store for the duration of a program.
3043 See the comment in the code where the compressed table is produced
3044 inplace from the flat tabe representation for an explanation of how
3045 the compression works.
3050 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3053 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3054 > SvIV(re_trie_maxbuff) )
3057 Second Pass -- Array Of Lists Representation
3059 Each state will be represented by a list of charid:state records
3060 (reg_trie_trans_le) the first such element holds the CUR and LEN
3061 points of the allocated array. (See defines above).
3063 We build the initial structure using the lists, and then convert
3064 it into the compressed table form which allows faster lookups
3065 (but cant be modified once converted).
3068 STRLEN transcount = 1;
3070 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
3073 trie->states = (reg_trie_state *)
3074 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3075 sizeof(reg_trie_state) );
3079 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3081 regnode *noper = NEXTOPER( cur );
3082 U32 state = 1; /* required init */
3083 U16 charid = 0; /* sanity init */
3084 U32 wordlen = 0; /* required init */
3086 if (OP(noper) == NOTHING) {
3087 regnode *noper_next= regnext(noper);
3088 if (noper_next < tail)
3090 /* we will undo this assignment if noper does not
3091 * point at a trieable type in the else clause of
3092 * the following statement. */
3096 && ( OP(noper) == flags
3097 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3098 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3099 || OP(noper) == EXACTFUP))))
3101 const U8 *uc= (U8*)STRING(noper);
3102 const U8 *e= uc + STR_LEN(noper);
3104 for ( ; uc < e ; uc += len ) {
3109 charid = trie->charmap[ uvc ];
3111 SV** const svpp = hv_fetch( widecharmap,
3118 charid=(U16)SvIV( *svpp );
3121 /* charid is now 0 if we dont know the char read, or
3122 * nonzero if we do */
3129 if ( !trie->states[ state ].trans.list ) {
3130 TRIE_LIST_NEW( state );
3133 check <= TRIE_LIST_USED( state );
3136 if ( TRIE_LIST_ITEM( state, check ).forid
3139 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3144 newstate = next_alloc++;
3145 prev_states[newstate] = state;
3146 TRIE_LIST_PUSH( state, charid, newstate );
3151 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3155 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3156 * on a trieable type. So we need to reset noper back to point at the first regop
3157 * in the branch before we call TRIE_HANDLE_WORD()
3159 noper= NEXTOPER(cur);
3161 TRIE_HANDLE_WORD(state);
3163 } /* end second pass */
3165 /* next alloc is the NEXT state to be allocated */
3166 trie->statecount = next_alloc;
3167 trie->states = (reg_trie_state *)
3168 PerlMemShared_realloc( trie->states,
3170 * sizeof(reg_trie_state) );
3172 /* and now dump it out before we compress it */
3173 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3174 revcharmap, next_alloc,
3178 trie->trans = (reg_trie_trans *)
3179 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
3186 for( state=1 ; state < next_alloc ; state ++ ) {
3190 DEBUG_TRIE_COMPILE_MORE_r(
3191 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
3195 if (trie->states[state].trans.list) {
3196 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3200 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3201 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3202 if ( forid < minid ) {
3204 } else if ( forid > maxid ) {
3208 if ( transcount < tp + maxid - minid + 1) {
3210 trie->trans = (reg_trie_trans *)
3211 PerlMemShared_realloc( trie->trans,
3213 * sizeof(reg_trie_trans) );
3214 Zero( trie->trans + (transcount / 2),
3218 base = trie->uniquecharcount + tp - minid;
3219 if ( maxid == minid ) {
3221 for ( ; zp < tp ; zp++ ) {
3222 if ( ! trie->trans[ zp ].next ) {
3223 base = trie->uniquecharcount + zp - minid;
3224 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3226 trie->trans[ zp ].check = state;
3232 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3234 trie->trans[ tp ].check = state;
3239 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
3240 const U32 tid = base
3241 - trie->uniquecharcount
3242 + TRIE_LIST_ITEM( state, idx ).forid;
3243 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3245 trie->trans[ tid ].check = state;
3247 tp += ( maxid - minid + 1 );
3249 Safefree(trie->states[ state ].trans.list);
3252 DEBUG_TRIE_COMPILE_MORE_r(
3253 Perl_re_printf( aTHX_ " base: %d\n",base);
3256 trie->states[ state ].trans.base=base;
3258 trie->lasttrans = tp + 1;
3262 Second Pass -- Flat Table Representation.
3264 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3265 each. We know that we will need Charcount+1 trans at most to store
3266 the data (one row per char at worst case) So we preallocate both
3267 structures assuming worst case.
3269 We then construct the trie using only the .next slots of the entry
3272 We use the .check field of the first entry of the node temporarily
3273 to make compression both faster and easier by keeping track of how
3274 many non zero fields are in the node.
3276 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3279 There are two terms at use here: state as a TRIE_NODEIDX() which is
3280 a number representing the first entry of the node, and state as a
3281 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3282 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3283 if there are 2 entrys per node. eg:
3291 The table is internally in the right hand, idx form. However as we
3292 also have to deal with the states array which is indexed by nodenum
3293 we have to use TRIE_NODENUM() to convert.
3296 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
3299 trie->trans = (reg_trie_trans *)
3300 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3301 * trie->uniquecharcount + 1,
3302 sizeof(reg_trie_trans) );
3303 trie->states = (reg_trie_state *)
3304 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3305 sizeof(reg_trie_state) );
3306 next_alloc = trie->uniquecharcount + 1;
3309 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3311 regnode *noper = NEXTOPER( cur );
3313 U32 state = 1; /* required init */
3315 U16 charid = 0; /* sanity init */
3316 U32 accept_state = 0; /* sanity init */
3318 U32 wordlen = 0; /* required init */
3320 if (OP(noper) == NOTHING) {
3321 regnode *noper_next= regnext(noper);
3322 if (noper_next < tail)
3324 /* we will undo this assignment if noper does not
3325 * point at a trieable type in the else clause of
3326 * the following statement. */
3330 && ( OP(noper) == flags
3331 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3332 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
3333 || OP(noper) == EXACTFUP))))
3335 const U8 *uc= (U8*)STRING(noper);
3336 const U8 *e= uc + STR_LEN(noper);
3338 for ( ; uc < e ; uc += len ) {
3343 charid = trie->charmap[ uvc ];
3345 SV* const * const svpp = hv_fetch( widecharmap,
3349 charid = svpp ? (U16)SvIV(*svpp) : 0;
3353 if ( !trie->trans[ state + charid ].next ) {
3354 trie->trans[ state + charid ].next = next_alloc;
3355 trie->trans[ state ].check++;
3356 prev_states[TRIE_NODENUM(next_alloc)]
3357 = TRIE_NODENUM(state);
3358 next_alloc += trie->uniquecharcount;
3360 state = trie->trans[ state + charid ].next;
3362 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
3364 /* charid is now 0 if we dont know the char read, or
3365 * nonzero if we do */
3368 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3369 * on a trieable type. So we need to reset noper back to point at the first regop
3370 * in the branch before we call TRIE_HANDLE_WORD().
3372 noper= NEXTOPER(cur);
3374 accept_state = TRIE_NODENUM( state );
3375 TRIE_HANDLE_WORD(accept_state);
3377 } /* end second pass */
3379 /* and now dump it out before we compress it */
3380 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3382 next_alloc, depth+1));
3386 * Inplace compress the table.*
3388 For sparse data sets the table constructed by the trie algorithm will
3389 be mostly 0/FAIL transitions or to put it another way mostly empty.
3390 (Note that leaf nodes will not contain any transitions.)
3392 This algorithm compresses the tables by eliminating most such
3393 transitions, at the cost of a modest bit of extra work during lookup:
3395 - Each states[] entry contains a .base field which indicates the
3396 index in the state[] array wheres its transition data is stored.
3398 - If .base is 0 there are no valid transitions from that node.
3400 - If .base is nonzero then charid is added to it to find an entry in
3403 -If trans[states[state].base+charid].check!=state then the
3404 transition is taken to be a 0/Fail transition. Thus if there are fail
3405 transitions at the front of the node then the .base offset will point
3406 somewhere inside the previous nodes data (or maybe even into a node
3407 even earlier), but the .check field determines if the transition is
3411 The following process inplace converts the table to the compressed
3412 table: We first do not compress the root node 1,and mark all its
3413 .check pointers as 1 and set its .base pointer as 1 as well. This
3414 allows us to do a DFA construction from the compressed table later,
3415 and ensures that any .base pointers we calculate later are greater
3418 - We set 'pos' to indicate the first entry of the second node.
3420 - We then iterate over the columns of the node, finding the first and
3421 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3422 and set the .check pointers accordingly, and advance pos
3423 appropriately and repreat for the next node. Note that when we copy
3424 the next pointers we have to convert them from the original
3425 NODEIDX form to NODENUM form as the former is not valid post
3428 - If a node has no transitions used we mark its base as 0 and do not
3429 advance the pos pointer.
3431 - If a node only has one transition we use a second pointer into the
3432 structure to fill in allocated fail transitions from other states.
3433 This pointer is independent of the main pointer and scans forward
3434 looking for null transitions that are allocated to a state. When it
3435 finds one it writes the single transition into the "hole". If the
3436 pointer doesnt find one the single transition is appended as normal.
3438 - Once compressed we can Renew/realloc the structures to release the
3441 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3442 specifically Fig 3.47 and the associated pseudocode.
3446 const U32 laststate = TRIE_NODENUM( next_alloc );
3449 trie->statecount = laststate;
3451 for ( state = 1 ; state < laststate ; state++ ) {
3453 const U32 stateidx = TRIE_NODEIDX( state );
3454 const U32 o_used = trie->trans[ stateidx ].check;
3455 U32 used = trie->trans[ stateidx ].check;
3456 trie->trans[ stateidx ].check = 0;
3459 used && charid < trie->uniquecharcount;
3462 if ( flag || trie->trans[ stateidx + charid ].next ) {
3463 if ( trie->trans[ stateidx + charid ].next ) {
3465 for ( ; zp < pos ; zp++ ) {
3466 if ( ! trie->trans[ zp ].next ) {
3470 trie->states[ state ].trans.base
3472 + trie->uniquecharcount
3474 trie->trans[ zp ].next
3475 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3477 trie->trans[ zp ].check = state;
3478 if ( ++zp > pos ) pos = zp;
3485 trie->states[ state ].trans.base
3486 = pos + trie->uniquecharcount - charid ;
3488 trie->trans[ pos ].next
3489 = SAFE_TRIE_NODENUM(
3490 trie->trans[ stateidx + charid ].next );
3491 trie->trans[ pos ].check = state;
3496 trie->lasttrans = pos + 1;
3497 trie->states = (reg_trie_state *)
3498 PerlMemShared_realloc( trie->states, laststate
3499 * sizeof(reg_trie_state) );
3500 DEBUG_TRIE_COMPILE_MORE_r(
3501 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
3503 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3507 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
3510 } /* end table compress */
3512 DEBUG_TRIE_COMPILE_MORE_r(
3513 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
3515 (UV)trie->statecount,
3516 (UV)trie->lasttrans)
3518 /* resize the trans array to remove unused space */
3519 trie->trans = (reg_trie_trans *)
3520 PerlMemShared_realloc( trie->trans, trie->lasttrans
3521 * sizeof(reg_trie_trans) );
3523 { /* Modify the program and insert the new TRIE node */
3524 U8 nodetype =(U8)(flags & 0xFF);
3528 regnode *optimize = NULL;
3529 #ifdef RE_TRACK_PATTERN_OFFSETS
3532 U32 mjd_nodelen = 0;
3533 #endif /* RE_TRACK_PATTERN_OFFSETS */
3534 #endif /* DEBUGGING */
3536 This means we convert either the first branch or the first Exact,
3537 depending on whether the thing following (in 'last') is a branch
3538 or not and whther first is the startbranch (ie is it a sub part of
3539 the alternation or is it the whole thing.)
3540 Assuming its a sub part we convert the EXACT otherwise we convert
3541 the whole branch sequence, including the first.
3543 /* Find the node we are going to overwrite */
3544 if ( first != startbranch || OP( last ) == BRANCH ) {
3545 /* branch sub-chain */
3546 NEXT_OFF( first ) = (U16)(last - first);
3547 #ifdef RE_TRACK_PATTERN_OFFSETS
3549 mjd_offset= Node_Offset((convert));
3550 mjd_nodelen= Node_Length((convert));
3553 /* whole branch chain */
3555 #ifdef RE_TRACK_PATTERN_OFFSETS
3558 const regnode *nop = NEXTOPER( convert );
3559 mjd_offset= Node_Offset((nop));
3560 mjd_nodelen= Node_Length((nop));
3564 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
3566 (UV)mjd_offset, (UV)mjd_nodelen)
3569 /* But first we check to see if there is a common prefix we can
3570 split out as an EXACT and put in front of the TRIE node. */
3571 trie->startstate= 1;
3572 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3573 /* we want to find the first state that has more than
3574 * one transition, if that state is not the first state
3575 * then we have a common prefix which we can remove.
3578 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
3580 I32 first_ofs = -1; /* keeps track of the ofs of the first
3581 transition, -1 means none */
3583 const U32 base = trie->states[ state ].trans.base;
3585 /* does this state terminate an alternation? */
3586 if ( trie->states[state].wordnum )
3589 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
3590 if ( ( base + ofs >= trie->uniquecharcount ) &&
3591 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
3592 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3594 if ( ++count > 1 ) {
3595 /* we have more than one transition */
3598 /* if this is the first state there is no common prefix
3599 * to extract, so we can exit */
3600 if ( state == 1 ) break;
3601 tmp = av_fetch( revcharmap, ofs, 0);
3602 ch = (U8*)SvPV_nolen_const( *tmp );
3604 /* if we are on count 2 then we need to initialize the
3605 * bitmap, and store the previous char if there was one
3608 /* clear the bitmap */
3609 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3611 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
3614 if (first_ofs >= 0) {
3615 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
3616 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
3618 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3620 Perl_re_printf( aTHX_ "%s", (char*)ch)
3624 /* store the current firstchar in the bitmap */
3625 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3626 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
3632 /* This state has only one transition, its transition is part
3633 * of a common prefix - we need to concatenate the char it
3634 * represents to what we have so far. */
3635 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
3637 char *ch = SvPV( *tmp, len );
3639 SV *sv=sv_newmortal();
3640 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
3642 (UV)state, (UV)first_ofs,
3643 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
3644 PL_colors[0], PL_colors[1],
3645 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
3646 PERL_PV_ESCAPE_FIRSTCHAR
3651 OP( convert ) = nodetype;
3652 str=STRING(convert);
3653 setSTR_LEN(convert, 0);
3655 assert( ( STR_LEN(convert) + len ) < 256 );
3656 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
3662 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
3667 trie->prefixlen = (state-1);
3669 regnode *n = convert+NODE_SZ_STR(convert);
3670 assert( NODE_SZ_STR(convert) <= U16_MAX );
3671 NEXT_OFF(convert) = (U16)(NODE_SZ_STR(convert));
3672 trie->startstate = state;
3673 trie->minlen -= (state - 1);
3674 trie->maxlen -= (state - 1);
3676 /* At least the UNICOS C compiler choked on this
3677 * being argument to DEBUG_r(), so let's just have
3680 #ifdef PERL_EXT_RE_BUILD
3686 regnode *fix = convert;
3687 U32 word = trie->wordcount;
3688 #ifdef RE_TRACK_PATTERN_OFFSETS
3691 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3692 while( ++fix < n ) {
3693 Set_Node_Offset_Length(fix, 0, 0);
3696 SV ** const tmp = av_fetch( trie_words, word, 0 );
3698 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3699 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3701 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3709 NEXT_OFF(convert) = (U16)(tail - convert);
3710 DEBUG_r(optimize= n);
3716 if ( trie->maxlen ) {
3717 NEXT_OFF( convert ) = (U16)(tail - convert);
3718 ARG_SET( convert, data_slot );
3719 /* Store the offset to the first unabsorbed branch in
3720 jump[0], which is otherwise unused by the jump logic.
3721 We use this when dumping a trie and during optimisation. */
3723 trie->jump[0] = (U16)(nextbranch - convert);
3725 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3726 * and there is a bitmap
3727 * and the first "jump target" node we found leaves enough room
3728 * then convert the TRIE node into a TRIEC node, with the bitmap
3729 * embedded inline in the opcode - this is hypothetically faster.
3731 if ( !trie->states[trie->startstate].wordnum
3733 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3735 OP( convert ) = TRIEC;
3736 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3737 PerlMemShared_free(trie->bitmap);
3740 OP( convert ) = TRIE;
3742 /* store the type in the flags */
3743 convert->flags = nodetype;
3747 + regarglen[ OP( convert ) ];
3749 /* XXX We really should free up the resource in trie now,
3750 as we won't use them - (which resources?) dmq */
3752 /* needed for dumping*/
3753 DEBUG_r(if (optimize) {
3754 regnode *opt = convert;
3756 while ( ++opt < optimize) {
3757 Set_Node_Offset_Length(opt, 0, 0);
3760 Try to clean up some of the debris left after the
3763 while( optimize < jumper ) {
3764 Track_Code( mjd_nodelen += Node_Length((optimize)); );
3765 OP( optimize ) = OPTIMIZED;
3766 Set_Node_Offset_Length(optimize, 0, 0);
3769 Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen);
3771 } /* end node insert */
3773 /* Finish populating the prev field of the wordinfo array. Walk back
3774 * from each accept state until we find another accept state, and if
3775 * so, point the first word's .prev field at the second word. If the
3776 * second already has a .prev field set, stop now. This will be the
3777 * case either if we've already processed that word's accept state,
3778 * or that state had multiple words, and the overspill words were
3779 * already linked up earlier.
3786 for (word=1; word <= trie->wordcount; word++) {
3788 if (trie->wordinfo[word].prev)
3790 state = trie->wordinfo[word].accept;
3792 state = prev_states[state];
3795 prev = trie->states[state].wordnum;
3799 trie->wordinfo[word].prev = prev;
3801 Safefree(prev_states);
3805 /* and now dump out the compressed format */
3806 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3808 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3810 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3811 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3813 SvREFCNT_dec_NN(revcharmap);
3817 : trie->startstate>1
3823 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3825 /* The Trie is constructed and compressed now so we can build a fail array if
3828 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3830 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3834 We find the fail state for each state in the trie, this state is the longest
3835 proper suffix of the current state's 'word' that is also a proper prefix of
3836 another word in our trie. State 1 represents the word '' and is thus the
3837 default fail state. This allows the DFA not to have to restart after its
3838 tried and failed a word at a given point, it simply continues as though it
3839 had been matching the other word in the first place.
3841 'abcdgu'=~/abcdefg|cdgu/
3842 When we get to 'd' we are still matching the first word, we would encounter
3843 'g' which would fail, which would bring us to the state representing 'd' in
3844 the second word where we would try 'g' and succeed, proceeding to match
3847 /* add a fail transition */
3848 const U32 trie_offset = ARG(source);
3849 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3851 const U32 ucharcount = trie->uniquecharcount;
3852 const U32 numstates = trie->statecount;
3853 const U32 ubound = trie->lasttrans + ucharcount;
3857 U32 base = trie->states[ 1 ].trans.base;
3860 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3862 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3864 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3865 PERL_UNUSED_CONTEXT;
3867 PERL_UNUSED_ARG(depth);
3870 if ( OP(source) == TRIE ) {
3871 struct regnode_1 *op = (struct regnode_1 *)
3872 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3873 StructCopy(source, op, struct regnode_1);
3874 stclass = (regnode *)op;
3876 struct regnode_charclass *op = (struct regnode_charclass *)
3877 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3878 StructCopy(source, op, struct regnode_charclass);
3879 stclass = (regnode *)op;
3881 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3883 ARG_SET( stclass, data_slot );
3884 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3885 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3886 aho->trie=trie_offset;
3887 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3888 Copy( trie->states, aho->states, numstates, reg_trie_state );
3889 Newx( q, numstates, U32);
3890 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3893 /* initialize fail[0..1] to be 1 so that we always have
3894 a valid final fail state */
3895 fail[ 0 ] = fail[ 1 ] = 1;
3897 for ( charid = 0; charid < ucharcount ; charid++ ) {
3898 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3900 q[ q_write ] = newstate;
3901 /* set to point at the root */
3902 fail[ q[ q_write++ ] ]=1;
3905 while ( q_read < q_write) {
3906 const U32 cur = q[ q_read++ % numstates ];
3907 base = trie->states[ cur ].trans.base;
3909 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3910 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3912 U32 fail_state = cur;
3915 fail_state = fail[ fail_state ];
3916 fail_base = aho->states[ fail_state ].trans.base;
3917 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3919 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3920 fail[ ch_state ] = fail_state;
3921 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3923 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3925 q[ q_write++ % numstates] = ch_state;
3929 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3930 when we fail in state 1, this allows us to use the
3931 charclass scan to find a valid start char. This is based on the principle
3932 that theres a good chance the string being searched contains lots of stuff
3933 that cant be a start char.
3935 fail[ 0 ] = fail[ 1 ] = 0;
3936 DEBUG_TRIE_COMPILE_r({
3937 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
3938 depth, (UV)numstates
3940 for( q_read=1; q_read<numstates; q_read++ ) {
3941 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
3943 Perl_re_printf( aTHX_ "\n");
3946 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3951 /* The below joins as many adjacent EXACTish nodes as possible into a single
3952 * one. The regop may be changed if the node(s) contain certain sequences that
3953 * require special handling. The joining is only done if:
3954 * 1) there is room in the current conglomerated node to entirely contain the
3956 * 2) they are compatible node types
3958 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3959 * these get optimized out
3961 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3962 * as possible, even if that means splitting an existing node so that its first
3963 * part is moved to the preceeding node. This would maximise the efficiency of
3964 * memEQ during matching.
3966 * If a node is to match under /i (folded), the number of characters it matches
3967 * can be different than its character length if it contains a multi-character
3968 * fold. *min_subtract is set to the total delta number of characters of the
3971 * And *unfolded_multi_char is set to indicate whether or not the node contains
3972 * an unfolded multi-char fold. This happens when it won't be known until
3973 * runtime whether the fold is valid or not; namely
3974 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
3975 * target string being matched against turns out to be UTF-8 is that fold
3977 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
3979 * (Multi-char folds whose components are all above the Latin1 range are not
3980 * run-time locale dependent, and have already been folded by the time this
3981 * function is called.)
3983 * This is as good a place as any to discuss the design of handling these
3984 * multi-character fold sequences. It's been wrong in Perl for a very long
3985 * time. There are three code points in Unicode whose multi-character folds
3986 * were long ago discovered to mess things up. The previous designs for
3987 * dealing with these involved assigning a special node for them. This
3988 * approach doesn't always work, as evidenced by this example:
3989 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3990 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3991 * would match just the \xDF, it won't be able to handle the case where a
3992 * successful match would have to cross the node's boundary. The new approach
3993 * that hopefully generally solves the problem generates an EXACTFUP node
3994 * that is "sss" in this case.
3996 * It turns out that there are problems with all multi-character folds, and not
3997 * just these three. Now the code is general, for all such cases. The
3998 * approach taken is:
3999 * 1) This routine examines each EXACTFish node that could contain multi-
4000 * character folded sequences. Since a single character can fold into
4001 * such a sequence, the minimum match length for this node is less than
4002 * the number of characters in the node. This routine returns in
4003 * *min_subtract how many characters to subtract from the actual
4004 * length of the string to get a real minimum match length; it is 0 if
4005 * there are no multi-char foldeds. This delta is used by the caller to
4006 * adjust the min length of the match, and the delta between min and max,
4007 * so that the optimizer doesn't reject these possibilities based on size
4010 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
4011 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
4012 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
4013 * EXACTFU nodes. The node type of such nodes is then changed to
4014 * EXACTFUP, indicating it is problematic, and needs careful handling.
4015 * (The procedures in step 1) above are sufficient to handle this case in
4016 * UTF-8 encoded nodes.) The reason this is problematic is that this is
4017 * the only case where there is a possible fold length change in non-UTF-8
4018 * patterns. By reserving a special node type for problematic cases, the
4019 * far more common regular EXACTFU nodes can be processed faster.
4020 * regexec.c takes advantage of this.
4022 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
4023 * problematic cases. These all only occur when the pattern is not
4024 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
4025 * length change, it handles the situation where the string cannot be
4026 * entirely folded. The strings in an EXACTFish node are folded as much
4027 * as possible during compilation in regcomp.c. This saves effort in
4028 * regex matching. By using an EXACTFUP node when it is not possible to
4029 * fully fold at compile time, regexec.c can know that everything in an
4030 * EXACTFU node is folded, so folding can be skipped at runtime. The only
4031 * case where folding in EXACTFU nodes can't be done at compile time is
4032 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
4033 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
4034 * handle two very different cases. Alternatively, there could have been
4035 * a node type where there are length changes, one for unfolded, and one
4036 * for both. If yet another special case needed to be created, the number
4037 * of required node types would have to go to 7. khw figures that even
4038 * though there are plenty of node types to spare, that the maintenance
4039 * cost wasn't worth the small speedup of doing it that way, especially
4040 * since he thinks the MICRO SIGN is rarely encountered in practice.
4042 * There are other cases where folding isn't done at compile time, but
4043 * none of them are under /u, and hence not for EXACTFU nodes. The folds
4044 * in EXACTFL nodes aren't known until runtime, and vary as the locale
4045 * changes. Some folds in EXACTF depend on if the runtime target string
4046 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
4047 * when no fold in it depends on the UTF-8ness of the target string.)
4049 * 3) A problem remains for unfolded multi-char folds. (These occur when the
4050 * validity of the fold won't be known until runtime, and so must remain
4051 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
4052 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
4053 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
4054 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
4055 * The reason this is a problem is that the optimizer part of regexec.c
4056 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
4057 * that a character in the pattern corresponds to at most a single
4058 * character in the target string. (And I do mean character, and not byte
4059 * here, unlike other parts of the documentation that have never been
4060 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
4061 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
4062 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
4063 * EXACTFL nodes, violate the assumption, and they are the only instances
4064 * where it is violated. I'm reluctant to try to change the assumption,
4065 * as the code involved is impenetrable to me (khw), so instead the code
4066 * here punts. This routine examines EXACTFL nodes, and (when the pattern
4067 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
4068 * boolean indicating whether or not the node contains such a fold. When
4069 * it is true, the caller sets a flag that later causes the optimizer in
4070 * this file to not set values for the floating and fixed string lengths,
4071 * and thus avoids the optimizer code in regexec.c that makes the invalid
4072 * assumption. Thus, there is no optimization based on string lengths for
4073 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
4074 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
4075 * assumption is wrong only in these cases is that all other non-UTF-8
4076 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
4077 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
4078 * EXACTF nodes because we don't know at compile time if it actually
4079 * matches 'ss' or not. For EXACTF nodes it will match iff the target
4080 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
4081 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
4082 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
4083 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
4084 * string would require the pattern to be forced into UTF-8, the overhead
4085 * of which we want to avoid. Similarly the unfolded multi-char folds in
4086 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
4089 * Similarly, the code that generates tries doesn't currently handle
4090 * not-already-folded multi-char folds, and it looks like a pain to change
4091 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
4092 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
4093 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
4094 * using /iaa matching will be doing so almost entirely with ASCII
4095 * strings, so this should rarely be encountered in practice */
4098 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
4099 UV *min_subtract, bool *unfolded_multi_char,
4100 U32 flags, regnode *val, U32 depth)
4102 /* Merge several consecutive EXACTish nodes into one. */
4104 regnode *n = regnext(scan);
4106 regnode *next = scan + NODE_SZ_STR(scan);
4110 regnode *stop = scan;
4111 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4113 PERL_UNUSED_ARG(depth);
4116 PERL_ARGS_ASSERT_JOIN_EXACT;
4117 #ifndef EXPERIMENTAL_INPLACESCAN
4118 PERL_UNUSED_ARG(flags);
4119 PERL_UNUSED_ARG(val);
4121 DEBUG_PEEP("join", scan, depth, 0);
4123 assert(PL_regkind[OP(scan)] == EXACT);
4125 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
4126 * EXACT ones that are mergeable to the current one. */
4128 && ( PL_regkind[OP(n)] == NOTHING
4129 || (stringok && PL_regkind[OP(n)] == EXACT))
4131 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
4134 if (OP(n) == TAIL || n > next)
4136 if (PL_regkind[OP(n)] == NOTHING) {
4137 DEBUG_PEEP("skip:", n, depth, 0);
4138 NEXT_OFF(scan) += NEXT_OFF(n);
4139 next = n + NODE_STEP_REGNODE;
4146 else if (stringok) {
4147 const unsigned int oldl = STR_LEN(scan);
4148 regnode * const nnext = regnext(n);
4150 /* XXX I (khw) kind of doubt that this works on platforms (should
4151 * Perl ever run on one) where U8_MAX is above 255 because of lots
4152 * of other assumptions */
4153 /* Don't join if the sum can't fit into a single node */
4154 if (oldl + STR_LEN(n) > U8_MAX)
4157 /* Joining something that requires UTF-8 with something that
4158 * doesn't, means the result requires UTF-8. */
4159 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
4160 OP(scan) = EXACT_REQ8;
4162 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
4163 ; /* join is compatible, no need to change OP */
4165 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
4166 OP(scan) = EXACTFU_REQ8;
4168 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
4169 ; /* join is compatible, no need to change OP */
4171 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
4172 ; /* join is compatible, no need to change OP */
4174 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
4176 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
4177 * which can join with EXACTFU ones. We check for this case
4178 * here. These need to be resolved to either EXACTFU or
4179 * EXACTF at joining time. They have nothing in them that
4180 * would forbid them from being the more desirable EXACTFU
4181 * nodes except that they begin and/or end with a single [Ss].
4182 * The reason this is problematic is because they could be
4183 * joined in this loop with an adjacent node that ends and/or
4184 * begins with [Ss] which would then form the sequence 'ss',
4185 * which matches differently under /di than /ui, in which case
4186 * EXACTFU can't be used. If the 'ss' sequence doesn't get
4187 * formed, the nodes get absorbed into any adjacent EXACTFU
4188 * node. And if the only adjacent node is EXACTF, they get
4189 * absorbed into that, under the theory that a longer node is
4190 * better than two shorter ones, even if one is EXACTFU. Note
4191 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
4192 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
4194 if (STRING(n)[STR_LEN(n)-1] == 's') {
4196 /* Here the joined node would end with 's'. If the node
4197 * following the combination is an EXACTF one, it's better to
4198 * join this trailing edge 's' node with that one, leaving the
4199 * current one in 'scan' be the more desirable EXACTFU */
4200 if (OP(nnext) == EXACTF) {
4204 OP(scan) = EXACTFU_S_EDGE;
4206 } /* Otherwise, the beginning 's' of the 2nd node just
4207 becomes an interior 's' in 'scan' */
4209 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
4210 ; /* join is compatible, no need to change OP */
4212 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
4214 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
4215 * nodes. But the latter nodes can be also joined with EXACTFU
4216 * ones, and that is a better outcome, so if the node following
4217 * 'n' is EXACTFU, quit now so that those two can be joined
4219 if (OP(nnext) == EXACTFU) {
4223 /* The join is compatible, and the combined node will be
4224 * EXACTF. (These don't care if they begin or end with 's' */
4226 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
4227 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
4228 && STRING(n)[0] == 's')
4230 /* When combined, we have the sequence 'ss', which means we
4231 * have to remain /di */
4235 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
4236 if (STRING(n)[0] == 's') {
4237 ; /* Here the join is compatible and the combined node
4238 starts with 's', no need to change OP */
4240 else { /* Now the trailing 's' is in the interior */
4244 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
4246 /* The join is compatible, and the combined node will be
4247 * EXACTF. (These don't care if they begin or end with 's' */
4250 else if (OP(scan) != OP(n)) {
4252 /* The only other compatible joinings are the same node type */
4256 DEBUG_PEEP("merg", n, depth, 0);
4259 NEXT_OFF(scan) += NEXT_OFF(n);
4260 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
4261 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
4262 next = n + NODE_SZ_STR(n);
4263 /* Now we can overwrite *n : */
4264 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
4272 #ifdef EXPERIMENTAL_INPLACESCAN
4273 if (flags && !NEXT_OFF(n)) {
4274 DEBUG_PEEP("atch", val, depth, 0);
4275 if (reg_off_by_arg[OP(n)]) {
4276 ARG_SET(n, val - n);
4279 NEXT_OFF(n) = val - n;
4286 /* This temporary node can now be turned into EXACTFU, and must, as
4287 * regexec.c doesn't handle it */
4288 if (OP(scan) == EXACTFU_S_EDGE) {
4293 *unfolded_multi_char = FALSE;
4295 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
4296 * can now analyze for sequences of problematic code points. (Prior to
4297 * this final joining, sequences could have been split over boundaries, and
4298 * hence missed). The sequences only happen in folding, hence for any
4299 * non-EXACT EXACTish node */
4300 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
4301 U8* s0 = (U8*) STRING(scan);
4303 U8* s_end = s0 + STR_LEN(scan);
4305 int total_count_delta = 0; /* Total delta number of characters that
4306 multi-char folds expand to */
4308 /* One pass is made over the node's string looking for all the
4309 * possibilities. To avoid some tests in the loop, there are two main
4310 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
4315 if (OP(scan) == EXACTFL) {
4318 /* An EXACTFL node would already have been changed to another
4319 * node type unless there is at least one character in it that
4320 * is problematic; likely a character whose fold definition
4321 * won't be known until runtime, and so has yet to be folded.
4322 * For all but the UTF-8 locale, folds are 1-1 in length, but
4323 * to handle the UTF-8 case, we need to create a temporary
4324 * folded copy using UTF-8 locale rules in order to analyze it.
4325 * This is because our macros that look to see if a sequence is
4326 * a multi-char fold assume everything is folded (otherwise the
4327 * tests in those macros would be too complicated and slow).
4328 * Note that here, the non-problematic folds will have already
4329 * been done, so we can just copy such characters. We actually
4330 * don't completely fold the EXACTFL string. We skip the
4331 * unfolded multi-char folds, as that would just create work
4332 * below to figure out the size they already are */
4334 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
4337 STRLEN s_len = UTF8SKIP(s);
4338 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
4339 Copy(s, d, s_len, U8);
4342 else if (is_FOLDS_TO_MULTI_utf8(s)) {
4343 *unfolded_multi_char = TRUE;
4344 Copy(s, d, s_len, U8);
4347 else if (isASCII(*s)) {
4348 *(d++) = toFOLD(*s);
4352 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
4358 /* Point the remainder of the routine to look at our temporary
4362 } /* End of creating folded copy of EXACTFL string */
4364 /* Examine the string for a multi-character fold sequence. UTF-8
4365 * patterns have all characters pre-folded by the time this code is
4367 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
4368 length sequence we are looking for is 2 */
4370 int count = 0; /* How many characters in a multi-char fold */
4371 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
4372 if (! len) { /* Not a multi-char fold: get next char */
4377 { /* Here is a generic multi-char fold. */
4378 U8* multi_end = s + len;
4380 /* Count how many characters are in it. In the case of
4381 * /aa, no folds which contain ASCII code points are
4382 * allowed, so check for those, and skip if found. */
4383 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
4384 count = utf8_length(s, multi_end);
4388 while (s < multi_end) {
4391 goto next_iteration;
4401 /* The delta is how long the sequence is minus 1 (1 is how long
4402 * the character that folds to the sequence is) */
4403 total_count_delta += count - 1;
4407 /* We created a temporary folded copy of the string in EXACTFL
4408 * nodes. Therefore we need to be sure it doesn't go below zero,
4409 * as the real string could be shorter */
4410 if (OP(scan) == EXACTFL) {
4411 int total_chars = utf8_length((U8*) STRING(scan),
4412 (U8*) STRING(scan) + STR_LEN(scan));
4413 if (total_count_delta > total_chars) {
4414 total_count_delta = total_chars;
4418 *min_subtract += total_count_delta;
4421 else if (OP(scan) == EXACTFAA) {
4423 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
4424 * fold to the ASCII range (and there are no existing ones in the
4425 * upper latin1 range). But, as outlined in the comments preceding
4426 * this function, we need to flag any occurrences of the sharp s.
4427 * This character forbids trie formation (because of added
4429 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
4430 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
4431 || UNICODE_DOT_DOT_VERSION > 0)
4433 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
4434 OP(scan) = EXACTFAA_NO_TRIE;
4435 *unfolded_multi_char = TRUE;
4441 else if (OP(scan) != EXACTFAA_NO_TRIE) {
4443 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
4444 * folds that are all Latin1. As explained in the comments
4445 * preceding this function, we look also for the sharp s in EXACTF
4446 * and EXACTFL nodes; it can be in the final position. Otherwise
4447 * we can stop looking 1 byte earlier because have to find at least
4448 * two characters for a multi-fold */
4449 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
4454 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
4455 if (! len) { /* Not a multi-char fold. */
4456 if (*s == LATIN_SMALL_LETTER_SHARP_S
4457 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
4459 *unfolded_multi_char = TRUE;
4466 && isALPHA_FOLD_EQ(*s, 's')
4467 && isALPHA_FOLD_EQ(*(s+1), 's'))
4470 /* EXACTF nodes need to know that the minimum length
4471 * changed so that a sharp s in the string can match this
4472 * ss in the pattern, but they remain EXACTF nodes, as they
4473 * won't match this unless the target string is in UTF-8,
4474 * which we don't know until runtime. EXACTFL nodes can't
4475 * transform into EXACTFU nodes */
4476 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
4477 OP(scan) = EXACTFUP;
4481 *min_subtract += len - 1;
4489 /* Allow dumping but overwriting the collection of skipped
4490 * ops and/or strings with fake optimized ops */
4491 n = scan + NODE_SZ_STR(scan);
4499 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
4503 /* REx optimizer. Converts nodes into quicker variants "in place".
4504 Finds fixed substrings. */
4506 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
4507 to the position after last scanned or to NULL. */
4509 #define INIT_AND_WITHP \
4510 assert(!and_withp); \
4511 Newx(and_withp, 1, regnode_ssc); \
4512 SAVEFREEPV(and_withp)
4516 S_unwind_scan_frames(pTHX_ const void *p)
4518 scan_frame *f= (scan_frame *)p;
4520 scan_frame *n= f->next_frame;
4526 /* Follow the next-chain of the current node and optimize away
4527 all the NOTHINGs from it.
4530 S_rck_elide_nothing(pTHX_ regnode *node)
4532 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
4534 if (OP(node) != CURLYX) {
4535 const int max = (reg_off_by_arg[OP(node)]
4537 /* I32 may be smaller than U16 on CRAYs! */
4538 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
4539 int off = (reg_off_by_arg[OP(node)] ? ARG(node) : NEXT_OFF(node));
4543 /* Skip NOTHING and LONGJMP. */
4547 (PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
4548 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
4554 if (reg_off_by_arg[OP(node)])
4557 NEXT_OFF(node) = off;
4562 /* the return from this sub is the minimum length that could possibly match */
4564 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
4565 SSize_t *minlenp, SSize_t *deltap,
4570 regnode_ssc *and_withp,
4571 U32 flags, U32 depth, bool was_mutate_ok)
4572 /* scanp: Start here (read-write). */
4573 /* deltap: Write maxlen-minlen here. */
4574 /* last: Stop before this one. */
4575 /* data: string data about the pattern */
4576 /* stopparen: treat close N as END */
4577 /* recursed: which subroutines have we recursed into */
4578 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
4580 SSize_t final_minlen;
4581 /* There must be at least this number of characters to match */
4584 regnode *scan = *scanp, *next;
4586 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
4587 int is_inf_internal = 0; /* The studied chunk is infinite */
4588 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
4589 scan_data_t data_fake;
4590 SV *re_trie_maxbuff = NULL;
4591 regnode *first_non_open = scan;
4592 SSize_t stopmin = OPTIMIZE_INFTY;
4593 scan_frame *frame = NULL;
4594 DECLARE_AND_GET_RE_DEBUG_FLAGS;
4596 PERL_ARGS_ASSERT_STUDY_CHUNK;
4597 RExC_study_started= 1;
4599 Zero(&data_fake, 1, scan_data_t);
4602 while (first_non_open && OP(first_non_open) == OPEN)
4603 first_non_open=regnext(first_non_open);
4609 RExC_study_chunk_recursed_count++;
4611 DEBUG_OPTIMISE_MORE_r(
4613 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
4614 depth, (long)stopparen,
4615 (unsigned long)RExC_study_chunk_recursed_count,
4616 (unsigned long)depth, (unsigned long)recursed_depth,
4619 if (recursed_depth) {
4622 for ( j = 0 ; j < recursed_depth ; j++ ) {
4623 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
4624 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
4625 Perl_re_printf( aTHX_ " %d",(int)i);
4629 if ( j + 1 < recursed_depth ) {
4630 Perl_re_printf( aTHX_ ",");
4634 Perl_re_printf( aTHX_ "\n");
4637 while ( scan && OP(scan) != END && scan < last ){
4638 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
4639 node length to get a real minimum (because
4640 the folded version may be shorter) */
4641 bool unfolded_multi_char = FALSE;
4642 /* avoid mutating ops if we are anywhere within the recursed or
4643 * enframed handling for a GOSUB: the outermost level will handle it.
4645 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
4646 /* Peephole optimizer: */
4647 DEBUG_STUDYDATA("Peep", data, depth, is_inf);
4648 DEBUG_PEEP("Peep", scan, depth, flags);
4651 /* The reason we do this here is that we need to deal with things like
4652 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
4653 * parsing code, as each (?:..) is handled by a different invocation of
4656 if (PL_regkind[OP(scan)] == EXACT
4657 && OP(scan) != LEXACT
4658 && OP(scan) != LEXACT_REQ8
4661 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
4662 0, NULL, depth + 1);
4665 /* Follow the next-chain of the current node and optimize
4666 away all the NOTHINGs from it.
4668 rck_elide_nothing(scan);
4670 /* The principal pseudo-switch. Cannot be a switch, since we look into
4671 * several different things. */
4672 if ( OP(scan) == DEFINEP ) {
4674 SSize_t deltanext = 0;
4675 SSize_t fake_last_close = 0;
4676 I32 f = SCF_IN_DEFINE;
4678 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4679 scan = regnext(scan);
4680 assert( OP(scan) == IFTHEN );
4681 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
4683 data_fake.last_closep= &fake_last_close;
4685 next = regnext(scan);
4686 scan = NEXTOPER(NEXTOPER(scan));
4687 DEBUG_PEEP("scan", scan, depth, flags);
4688 DEBUG_PEEP("next", next, depth, flags);
4690 /* we suppose the run is continuous, last=next...
4691 * NOTE we dont use the return here! */
4692 /* DEFINEP study_chunk() recursion */
4693 (void)study_chunk(pRExC_state, &scan, &minlen,
4694 &deltanext, next, &data_fake, stopparen,
4695 recursed_depth, NULL, f, depth+1, mutate_ok);
4700 OP(scan) == BRANCH ||
4701 OP(scan) == BRANCHJ ||
4704 next = regnext(scan);
4707 /* The op(next)==code check below is to see if we
4708 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
4709 * IFTHEN is special as it might not appear in pairs.
4710 * Not sure whether BRANCH-BRANCHJ is possible, regardless
4711 * we dont handle it cleanly. */
4712 if (OP(next) == code || code == IFTHEN) {
4713 /* NOTE - There is similar code to this block below for
4714 * handling TRIE nodes on a re-study. If you change stuff here
4715 * check there too. */
4716 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
4718 regnode * const startbranch=scan;
4720 if (flags & SCF_DO_SUBSTR) {
4721 /* Cannot merge strings after this. */
4722 scan_commit(pRExC_state, data, minlenp, is_inf);
4725 if (flags & SCF_DO_STCLASS)
4726 ssc_init_zero(pRExC_state, &accum);
4728 while (OP(scan) == code) {
4729 SSize_t deltanext, minnext, fake;
4731 regnode_ssc this_class;
4733 DEBUG_PEEP("Branch", scan, depth, flags);
4736 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
4738 data_fake.whilem_c = data->whilem_c;
4739 data_fake.last_closep = data->last_closep;
4742 data_fake.last_closep = &fake;
4744 data_fake.pos_delta = delta;
4745 next = regnext(scan);
4747 scan = NEXTOPER(scan); /* everything */
4748 if (code != BRANCH) /* everything but BRANCH */
4749 scan = NEXTOPER(scan);
4751 if (flags & SCF_DO_STCLASS) {
4752 ssc_init(pRExC_state, &this_class);
4753 data_fake.start_class = &this_class;
4754 f = SCF_DO_STCLASS_AND;
4756 if (flags & SCF_WHILEM_VISITED_POS)
4757 f |= SCF_WHILEM_VISITED_POS;
4759 /* we suppose the run is continuous, last=next...*/
4760 /* recurse study_chunk() for each BRANCH in an alternation */
4761 minnext = study_chunk(pRExC_state, &scan, minlenp,
4762 &deltanext, next, &data_fake, stopparen,
4763 recursed_depth, NULL, f, depth+1,
4768 if (deltanext == OPTIMIZE_INFTY) {
4769 is_inf = is_inf_internal = 1;
4770 max1 = OPTIMIZE_INFTY;
4771 } else if (max1 < minnext + deltanext)
4772 max1 = minnext + deltanext;
4774 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4776 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4777 if ( stopmin > minnext)
4778 stopmin = min + min1;
4779 flags &= ~SCF_DO_SUBSTR;
4781 data->flags |= SCF_SEEN_ACCEPT;
4784 if (data_fake.flags & SF_HAS_EVAL)
4785 data->flags |= SF_HAS_EVAL;
4786 data->whilem_c = data_fake.whilem_c;
4788 if (flags & SCF_DO_STCLASS)
4789 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
4791 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
4793 if (flags & SCF_DO_SUBSTR) {
4794 data->pos_min += min1;
4795 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
4796 data->pos_delta = OPTIMIZE_INFTY;
4798 data->pos_delta += max1 - min1;
4799 if (max1 != min1 || is_inf)
4800 data->cur_is_floating = 1;
4803 if (delta == OPTIMIZE_INFTY
4804 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
4805 delta = OPTIMIZE_INFTY;
4807 delta += max1 - min1;
4808 if (flags & SCF_DO_STCLASS_OR) {
4809 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4811 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4812 flags &= ~SCF_DO_STCLASS;
4815 else if (flags & SCF_DO_STCLASS_AND) {
4817 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4818 flags &= ~SCF_DO_STCLASS;
4821 /* Switch to OR mode: cache the old value of
4822 * data->start_class */
4824 StructCopy(data->start_class, and_withp, regnode_ssc);
4825 flags &= ~SCF_DO_STCLASS_AND;
4826 StructCopy(&accum, data->start_class, regnode_ssc);
4827 flags |= SCF_DO_STCLASS_OR;
4831 if (PERL_ENABLE_TRIE_OPTIMISATION
4832 && OP(startbranch) == BRANCH
4837 Assuming this was/is a branch we are dealing with: 'scan'
4838 now points at the item that follows the branch sequence,
4839 whatever it is. We now start at the beginning of the
4840 sequence and look for subsequences of
4846 which would be constructed from a pattern like
4849 If we can find such a subsequence we need to turn the first
4850 element into a trie and then add the subsequent branch exact
4851 strings to the trie.
4855 1. patterns where the whole set of branches can be
4858 2. patterns where only a subset can be converted.
4860 In case 1 we can replace the whole set with a single regop
4861 for the trie. In case 2 we need to keep the start and end
4864 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4865 becomes BRANCH TRIE; BRANCH X;
4867 There is an additional case, that being where there is a
4868 common prefix, which gets split out into an EXACT like node
4869 preceding the TRIE node.
4871 If x(1..n)==tail then we can do a simple trie, if not we make
4872 a "jump" trie, such that when we match the appropriate word
4873 we "jump" to the appropriate tail node. Essentially we turn
4874 a nested if into a case structure of sorts.
4879 if (!re_trie_maxbuff) {
4880 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4881 if (!SvIOK(re_trie_maxbuff))
4882 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4884 if ( SvIV(re_trie_maxbuff)>=0 ) {
4886 regnode *first = (regnode *)NULL;
4887 regnode *prev = (regnode *)NULL;
4888 regnode *tail = scan;
4892 /* var tail is used because there may be a TAIL
4893 regop in the way. Ie, the exacts will point to the
4894 thing following the TAIL, but the last branch will
4895 point at the TAIL. So we advance tail. If we
4896 have nested (?:) we may have to move through several
4900 while ( OP( tail ) == TAIL ) {
4901 /* this is the TAIL generated by (?:) */
4902 tail = regnext( tail );
4906 DEBUG_TRIE_COMPILE_r({
4907 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4908 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
4910 "Looking for TRIE'able sequences. Tail node is ",
4911 (UV) REGNODE_OFFSET(tail),
4912 SvPV_nolen_const( RExC_mysv )
4918 Step through the branches
4919 cur represents each branch,
4920 noper is the first thing to be matched as part
4922 noper_next is the regnext() of that node.
4924 We normally handle a case like this
4925 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4926 support building with NOJUMPTRIE, which restricts
4927 the trie logic to structures like /FOO|BAR/.
4929 If noper is a trieable nodetype then the branch is
4930 a possible optimization target. If we are building
4931 under NOJUMPTRIE then we require that noper_next is
4932 the same as scan (our current position in the regex
4935 Once we have two or more consecutive such branches
4936 we can create a trie of the EXACT's contents and
4937 stitch it in place into the program.
4939 If the sequence represents all of the branches in
4940 the alternation we replace the entire thing with a
4943 Otherwise when it is a subsequence we need to
4944 stitch it in place and replace only the relevant
4945 branches. This means the first branch has to remain
4946 as it is used by the alternation logic, and its
4947 next pointer, and needs to be repointed at the item
4948 on the branch chain following the last branch we
4949 have optimized away.
4951 This could be either a BRANCH, in which case the
4952 subsequence is internal, or it could be the item
4953 following the branch sequence in which case the
4954 subsequence is at the end (which does not
4955 necessarily mean the first node is the start of the
4958 TRIE_TYPE(X) is a define which maps the optype to a
4962 ----------------+-----------
4967 EXACTFU_REQ8 | EXACTFU
4971 EXACTFLU8 | EXACTFLU8
4975 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4977 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
4979 : ( EXACTFU == (X) \
4980 || EXACTFU_REQ8 == (X) \
4981 || EXACTFUP == (X) ) \
4983 : ( EXACTFAA == (X) ) \
4985 : ( EXACTL == (X) ) \
4987 : ( EXACTFLU8 == (X) ) \
4991 /* dont use tail as the end marker for this traverse */
4992 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4993 regnode * const noper = NEXTOPER( cur );
4994 U8 noper_type = OP( noper );
4995 U8 noper_trietype = TRIE_TYPE( noper_type );
4996 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4997 regnode * const noper_next = regnext( noper );
4998 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
4999 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
5002 DEBUG_TRIE_COMPILE_r({
5003 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5004 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
5006 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
5008 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
5009 Perl_re_printf( aTHX_ " -> %d:%s",
5010 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
5013 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
5014 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
5015 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
5017 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
5018 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5019 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
5023 /* Is noper a trieable nodetype that can be merged
5024 * with the current trie (if there is one)? */
5028 ( noper_trietype == NOTHING )
5029 || ( trietype == NOTHING )
5030 || ( trietype == noper_trietype )
5033 && noper_next >= tail
5037 /* Handle mergable triable node Either we are
5038 * the first node in a new trieable sequence,
5039 * in which case we do some bookkeeping,
5040 * otherwise we update the end pointer. */
5043 if ( noper_trietype == NOTHING ) {
5044 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
5045 regnode * const noper_next = regnext( noper );
5046 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
5047 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
5050 if ( noper_next_trietype ) {
5051 trietype = noper_next_trietype;
5052 } else if (noper_next_type) {
5053 /* a NOTHING regop is 1 regop wide.
5054 * We need at least two for a trie
5055 * so we can't merge this in */
5059 trietype = noper_trietype;
5062 if ( trietype == NOTHING )
5063 trietype = noper_trietype;
5068 } /* end handle mergable triable node */
5070 /* handle unmergable node -
5071 * noper may either be a triable node which can
5072 * not be tried together with the current trie,
5073 * or a non triable node */
5075 /* If last is set and trietype is not
5076 * NOTHING then we have found at least two
5077 * triable branch sequences in a row of a
5078 * similar trietype so we can turn them
5079 * into a trie. If/when we allow NOTHING to
5080 * start a trie sequence this condition
5081 * will be required, and it isn't expensive
5082 * so we leave it in for now. */
5083 if ( trietype && trietype != NOTHING )
5084 make_trie( pRExC_state,
5085 startbranch, first, cur, tail,
5086 count, trietype, depth+1 );
5087 prev = NULL; /* note: we clear/update
5088 first, trietype etc below,
5089 so we dont do it here */
5093 && noper_next >= tail
5096 /* noper is triable, so we can start a new
5100 trietype = noper_trietype;
5102 /* if we already saw a first but the
5103 * current node is not triable then we have
5104 * to reset the first information. */
5109 } /* end handle unmergable node */
5110 } /* loop over branches */
5111 DEBUG_TRIE_COMPILE_r({
5112 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5113 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
5114 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5115 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
5116 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
5117 PL_reg_name[trietype]
5121 if ( prev && trietype ) {
5122 if ( trietype != NOTHING ) {
5123 /* the last branch of the sequence was part of
5124 * a trie, so we have to construct it here
5125 * outside of the loop */
5126 made= make_trie( pRExC_state, startbranch,
5127 first, scan, tail, count,
5128 trietype, depth+1 );
5129 #ifdef TRIE_STUDY_OPT
5130 if ( ((made == MADE_EXACT_TRIE &&
5131 startbranch == first)
5132 || ( first_non_open == first )) &&
5134 flags |= SCF_TRIE_RESTUDY;
5135 if ( startbranch == first
5138 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
5143 /* at this point we know whatever we have is a
5144 * NOTHING sequence/branch AND if 'startbranch'
5145 * is 'first' then we can turn the whole thing
5148 if ( startbranch == first ) {
5150 /* the entire thing is a NOTHING sequence,
5151 * something like this: (?:|) So we can
5152 * turn it into a plain NOTHING op. */
5153 DEBUG_TRIE_COMPILE_r({
5154 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
5155 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
5157 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
5160 OP(startbranch)= NOTHING;
5161 NEXT_OFF(startbranch)= tail - startbranch;
5162 for ( opt= startbranch + 1; opt < tail ; opt++ )
5166 } /* end if ( prev) */
5167 } /* TRIE_MAXBUF is non zero */
5171 else if ( code == BRANCHJ ) { /* single branch is optimized. */
5172 scan = NEXTOPER(NEXTOPER(scan));
5173 } else /* single branch is optimized. */
5174 scan = NEXTOPER(scan);
5176 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
5178 regnode *start = NULL;
5179 regnode *end = NULL;
5180 U32 my_recursed_depth= recursed_depth;
5182 if (OP(scan) != SUSPEND) { /* GOSUB */
5183 /* Do setup, note this code has side effects beyond
5184 * the rest of this block. Specifically setting
5185 * RExC_recurse[] must happen at least once during
5188 RExC_recurse[ARG2L(scan)] = scan;
5189 start = REGNODE_p(RExC_open_parens[paren]);
5190 end = REGNODE_p(RExC_close_parens[paren]);
5192 /* NOTE we MUST always execute the above code, even
5193 * if we do nothing with a GOSUB */
5195 ( flags & SCF_IN_DEFINE )
5198 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
5200 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
5203 /* no need to do anything here if we are in a define. */
5204 /* or we are after some kind of infinite construct
5205 * so we can skip recursing into this item.
5206 * Since it is infinite we will not change the maxlen
5207 * or delta, and if we miss something that might raise
5208 * the minlen it will merely pessimise a little.
5210 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
5211 * might result in a minlen of 1 and not of 4,
5212 * but this doesn't make us mismatch, just try a bit
5213 * harder than we should.
5215 * However we must assume this GOSUB is infinite, to
5216 * avoid wrongly applying other optimizations in the
5217 * enclosing scope - see GH 18096, for example.
5219 is_inf = is_inf_internal = 1;
5220 scan= regnext(scan);
5226 || !PAREN_TEST(recursed_depth - 1, paren)
5228 /* it is quite possible that there are more efficient ways
5229 * to do this. We maintain a bitmap per level of recursion
5230 * of which patterns we have entered so we can detect if a
5231 * pattern creates a possible infinite loop. When we
5232 * recurse down a level we copy the previous levels bitmap
5233 * down. When we are at recursion level 0 we zero the top
5234 * level bitmap. It would be nice to implement a different
5235 * more efficient way of doing this. In particular the top
5236 * level bitmap may be unnecessary.
5238 if (!recursed_depth) {
5239 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
5241 Copy(PAREN_OFFSET(recursed_depth - 1),
5242 PAREN_OFFSET(recursed_depth),
5243 RExC_study_chunk_recursed_bytes, U8);
5245 /* we havent recursed into this paren yet, so recurse into it */
5246 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
5247 PAREN_SET(recursed_depth, paren);
5248 my_recursed_depth= recursed_depth + 1;
5250 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
5251 /* some form of infinite recursion, assume infinite length
5253 if (flags & SCF_DO_SUBSTR) {
5254 scan_commit(pRExC_state, data, minlenp, is_inf);
5255 data->cur_is_floating = 1;
5257 is_inf = is_inf_internal = 1;
5258 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5259 ssc_anything(data->start_class);
5260 flags &= ~SCF_DO_STCLASS;
5262 start= NULL; /* reset start so we dont recurse later on. */
5267 end = regnext(scan);
5270 scan_frame *newframe;
5272 if (!RExC_frame_last) {
5273 Newxz(newframe, 1, scan_frame);
5274 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
5275 RExC_frame_head= newframe;
5277 } else if (!RExC_frame_last->next_frame) {
5278 Newxz(newframe, 1, scan_frame);
5279 RExC_frame_last->next_frame= newframe;
5280 newframe->prev_frame= RExC_frame_last;
5283 newframe= RExC_frame_last->next_frame;
5285 RExC_frame_last= newframe;
5287 newframe->next_regnode = regnext(scan);
5288 newframe->last_regnode = last;
5289 newframe->stopparen = stopparen;
5290 newframe->prev_recursed_depth = recursed_depth;
5291 newframe->this_prev_frame= frame;
5292 newframe->in_gosub = (
5293 (frame && frame->in_gosub) || OP(scan) == GOSUB
5296 DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
5297 DEBUG_PEEP("fnew", scan, depth, flags);
5304 recursed_depth= my_recursed_depth;
5309 else if (PL_regkind[OP(scan)] == EXACT && ! isEXACTFish(OP(scan))) {
5310 SSize_t bytelen = STR_LEN(scan), charlen;
5314 const U8 * const s = (U8*)STRING(scan);
5315 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
5316 charlen = utf8_length(s, s + bytelen);
5318 uc = *((U8*)STRING(scan));
5322 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
5323 /* The code below prefers earlier match for fixed
5324 offset, later match for variable offset. */
5325 if (data->last_end == -1) { /* Update the start info. */
5326 data->last_start_min = data->pos_min;
5327 data->last_start_max =
5328 is_inf ? OPTIMIZE_INFTY
5329 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
5330 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
5332 sv_catpvn(data->last_found, STRING(scan), bytelen);
5334 SvUTF8_on(data->last_found);
5336 SV * const sv = data->last_found;
5337 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5338 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5339 if (mg && mg->mg_len >= 0)
5340 mg->mg_len += charlen;
5342 data->last_end = data->pos_min + charlen;
5343 data->pos_min += charlen; /* As in the first entry. */
5344 data->flags &= ~SF_BEFORE_EOL;
5347 /* ANDing the code point leaves at most it, and not in locale, and
5348 * can't match null string */
5349 if (flags & SCF_DO_STCLASS_AND) {
5350 ssc_cp_and(data->start_class, uc);
5351 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5352 ssc_clear_locale(data->start_class);
5354 else if (flags & SCF_DO_STCLASS_OR) {
5355 ssc_add_cp(data->start_class, uc);
5356 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5358 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5359 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5361 flags &= ~SCF_DO_STCLASS;
5363 else if (PL_regkind[OP(scan)] == EXACT) {
5364 /* But OP != EXACT!, so is EXACTFish */
5365 SSize_t bytelen = STR_LEN(scan), charlen;
5366 const U8 * s = (U8*)STRING(scan);
5368 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
5369 * with the mask set to the complement of the bit that differs
5370 * between upper and lower case, and the lowest code point of the
5371 * pair (which the '&' forces) */
5374 && ( OP(scan) == EXACTFAA
5375 || ( OP(scan) == EXACTFU
5376 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
5379 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
5382 ARG_SET(scan, *s & mask);
5384 /* we're not EXACTFish any more, so restudy */
5388 /* Search for fixed substrings supports EXACT only. */
5389 if (flags & SCF_DO_SUBSTR) {
5391 scan_commit(pRExC_state, data, minlenp, is_inf);
5393 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
5394 if (unfolded_multi_char) {
5395 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
5397 min += charlen - min_subtract;
5399 delta += min_subtract;
5400 if (flags & SCF_DO_SUBSTR) {
5401 data->pos_min += charlen - min_subtract;
5402 if (data->pos_min < 0) {
5405 data->pos_delta += min_subtract;
5407 data->cur_is_floating = 1; /* float */
5411 if (flags & SCF_DO_STCLASS) {
5412 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
5414 assert(EXACTF_invlist);
5415 if (flags & SCF_DO_STCLASS_AND) {
5416 if (OP(scan) != EXACTFL)
5417 ssc_clear_locale(data->start_class);
5418 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5419 ANYOF_POSIXL_ZERO(data->start_class);
5420 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
5422 else { /* SCF_DO_STCLASS_OR */
5423 ssc_union(data->start_class, EXACTF_invlist, FALSE);
5424 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5426 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5427 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5429 flags &= ~SCF_DO_STCLASS;
5430 SvREFCNT_dec(EXACTF_invlist);
5433 else if (REGNODE_VARIES(OP(scan))) {
5434 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
5435 I32 fl = 0, f = flags;
5436 regnode * const oscan = scan;
5437 regnode_ssc this_class;
5438 regnode_ssc *oclass = NULL;
5439 I32 next_is_eval = 0;
5441 switch (PL_regkind[OP(scan)]) {
5442 case WHILEM: /* End of (?:...)* . */
5443 scan = NEXTOPER(scan);
5446 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
5447 next = NEXTOPER(scan);
5448 if ( ( PL_regkind[OP(next)] == EXACT
5449 && ! isEXACTFish(OP(next)))
5450 || (flags & SCF_DO_STCLASS))
5453 maxcount = REG_INFTY;
5454 next = regnext(scan);
5455 scan = NEXTOPER(scan);
5459 if (flags & SCF_DO_SUBSTR)
5461 /* This will bypass the formal 'min += minnext * mincount'
5462 * calculation in the do_curly path, so assumes min width
5463 * of the PLUS payload is exactly one. */
5467 next = NEXTOPER(scan);
5469 /* This temporary node can now be turned into EXACTFU, and
5470 * must, as regexec.c doesn't handle it */
5471 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
5475 if ( STR_LEN(next) == 1
5476 && isALPHA_A(* STRING(next))
5477 && ( OP(next) == EXACTFAA
5478 || ( OP(next) == EXACTFU
5479 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
5482 /* These differ in just one bit */
5483 U8 mask = ~ ('A' ^ 'a');
5485 assert(isALPHA_A(* STRING(next)));
5487 /* Then replace it by an ANYOFM node, with
5488 * the mask set to the complement of the
5489 * bit that differs between upper and lower
5490 * case, and the lowest code point of the
5491 * pair (which the '&' forces) */
5493 ARG_SET(next, *STRING(next) & mask);
5497 if (flags & SCF_DO_STCLASS) {
5499 maxcount = REG_INFTY;
5500 next = regnext(scan);
5501 scan = NEXTOPER(scan);
5504 if (flags & SCF_DO_SUBSTR) {
5505 scan_commit(pRExC_state, data, minlenp, is_inf);
5506 /* Cannot extend fixed substrings */
5507 data->cur_is_floating = 1; /* float */
5509 is_inf = is_inf_internal = 1;
5510 scan = regnext(scan);
5511 goto optimize_curly_tail;
5513 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
5514 && (scan->flags == stopparen))
5519 mincount = ARG1(scan);
5520 maxcount = ARG2(scan);
5522 next = regnext(scan);
5523 if (OP(scan) == CURLYX) {
5524 I32 lp = (data ? *(data->last_closep) : 0);
5525 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
5527 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
5528 next_is_eval = (OP(scan) == EVAL);
5530 if (flags & SCF_DO_SUBSTR) {
5532 scan_commit(pRExC_state, data, minlenp, is_inf);
5533 /* Cannot extend fixed substrings */
5534 pos_before = data->pos_min;
5538 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
5540 data->flags |= SF_IS_INF;
5542 if (flags & SCF_DO_STCLASS) {
5543 ssc_init(pRExC_state, &this_class);
5544 oclass = data->start_class;
5545 data->start_class = &this_class;
5546 f |= SCF_DO_STCLASS_AND;
5547 f &= ~SCF_DO_STCLASS_OR;
5549 /* Exclude from super-linear cache processing any {n,m}
5550 regops for which the combination of input pos and regex
5551 pos is not enough information to determine if a match
5554 For example, in the regex /foo(bar\s*){4,8}baz/ with the
5555 regex pos at the \s*, the prospects for a match depend not
5556 only on the input position but also on how many (bar\s*)
5557 repeats into the {4,8} we are. */
5558 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
5559 f &= ~SCF_WHILEM_VISITED_POS;
5561 /* This will finish on WHILEM, setting scan, or on NULL: */
5562 /* recurse study_chunk() on loop bodies */
5563 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
5564 last, data, stopparen, recursed_depth, NULL,
5566 ? (f & ~SCF_DO_SUBSTR)
5568 , depth+1, mutate_ok);
5570 if (flags & SCF_DO_STCLASS)
5571 data->start_class = oclass;
5572 if (mincount == 0 || minnext == 0) {
5573 if (flags & SCF_DO_STCLASS_OR) {
5574 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5576 else if (flags & SCF_DO_STCLASS_AND) {
5577 /* Switch to OR mode: cache the old value of
5578 * data->start_class */
5580 StructCopy(data->start_class, and_withp, regnode_ssc);
5581 flags &= ~SCF_DO_STCLASS_AND;
5582 StructCopy(&this_class, data->start_class, regnode_ssc);
5583 flags |= SCF_DO_STCLASS_OR;
5584 ANYOF_FLAGS(data->start_class)
5585 |= SSC_MATCHES_EMPTY_STRING;
5587 } else { /* Non-zero len */
5588 if (flags & SCF_DO_STCLASS_OR) {
5589 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5590 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5592 else if (flags & SCF_DO_STCLASS_AND)
5593 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
5594 flags &= ~SCF_DO_STCLASS;
5596 if (!scan) /* It was not CURLYX, but CURLY. */
5598 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
5599 /* ? quantifier ok, except for (?{ ... }) */
5600 && (next_is_eval || !(mincount == 0 && maxcount == 1))
5601 && (minnext == 0) && (deltanext == 0)
5602 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
5603 && maxcount <= REG_INFTY/3) /* Complement check for big
5606 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
5607 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
5608 "Quantifier unexpected on zero-length expression "
5609 "in regex m/%" UTF8f "/",
5610 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
5614 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
5615 || min >= SSize_t_MAX - minnext * mincount )
5617 FAIL("Regexp out of space");
5620 min += minnext * mincount;
5621 is_inf_internal |= deltanext == OPTIMIZE_INFTY
5622 || (maxcount == REG_INFTY && minnext + deltanext > 0);
5623 is_inf |= is_inf_internal;
5625 delta = OPTIMIZE_INFTY;
5627 delta += (minnext + deltanext) * maxcount
5628 - minnext * mincount;
5630 /* Try powerful optimization CURLYX => CURLYN. */
5631 if ( OP(oscan) == CURLYX && data
5632 && data->flags & SF_IN_PAR
5633 && !(data->flags & SF_HAS_EVAL)
5634 && !deltanext && minnext == 1
5637 /* Try to optimize to CURLYN. */
5638 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
5639 regnode * const nxt1 = nxt;
5646 if (!REGNODE_SIMPLE(OP(nxt))
5647 && !(PL_regkind[OP(nxt)] == EXACT
5648 && STR_LEN(nxt) == 1))
5654 if (OP(nxt) != CLOSE)
5656 if (RExC_open_parens) {
5659 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5662 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt) + 2;
5664 /* Now we know that nxt2 is the only contents: */
5665 oscan->flags = (U8)ARG(nxt);
5667 OP(nxt1) = NOTHING; /* was OPEN. */
5670 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5671 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
5672 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
5673 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5674 OP(nxt + 1) = OPTIMIZED; /* was count. */
5675 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
5680 /* Try optimization CURLYX => CURLYM. */
5681 if ( OP(oscan) == CURLYX && data
5682 && !(data->flags & SF_HAS_PAR)
5683 && !(data->flags & SF_HAS_EVAL)
5684 && !deltanext /* atom is fixed width */
5685 && minnext != 0 /* CURLYM can't handle zero width */
5686 /* Nor characters whose fold at run-time may be
5687 * multi-character */
5688 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
5691 /* XXXX How to optimize if data == 0? */
5692 /* Optimize to a simpler form. */
5693 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
5697 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
5698 && (OP(nxt2) != WHILEM))
5700 OP(nxt2) = SUCCEED; /* Whas WHILEM */
5701 /* Need to optimize away parenths. */
5702 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
5703 /* Set the parenth number. */
5704 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
5706 oscan->flags = (U8)ARG(nxt);
5707 if (RExC_open_parens) {
5709 RExC_open_parens[ARG(nxt1)] = REGNODE_OFFSET(oscan);
5712 RExC_close_parens[ARG(nxt1)] = REGNODE_OFFSET(nxt2)
5715 OP(nxt1) = OPTIMIZED; /* was OPEN. */
5716 OP(nxt) = OPTIMIZED; /* was CLOSE. */
5719 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
5720 OP(nxt + 1) = OPTIMIZED; /* was count. */
5721 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
5722 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
5725 while ( nxt1 && (OP(nxt1) != WHILEM)) {
5726 regnode *nnxt = regnext(nxt1);
5728 if (reg_off_by_arg[OP(nxt1)])
5729 ARG_SET(nxt1, nxt2 - nxt1);
5730 else if (nxt2 - nxt1 < U16_MAX)
5731 NEXT_OFF(nxt1) = nxt2 - nxt1;
5733 OP(nxt) = NOTHING; /* Cannot beautify */
5738 /* Optimize again: */
5739 /* recurse study_chunk() on optimised CURLYX => CURLYM */
5740 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
5741 NULL, stopparen, recursed_depth, NULL, 0,
5742 depth+1, mutate_ok);
5747 else if ((OP(oscan) == CURLYX)
5748 && (flags & SCF_WHILEM_VISITED_POS)
5749 /* See the comment on a similar expression above.
5750 However, this time it's not a subexpression
5751 we care about, but the expression itself. */
5752 && (maxcount == REG_INFTY)
5754 /* This stays as CURLYX, we can put the count/of pair. */
5755 /* Find WHILEM (as in regexec.c) */
5756 regnode *nxt = oscan + NEXT_OFF(oscan);
5758 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
5760 nxt = PREVOPER(nxt);
5761 if (nxt->flags & 0xf) {
5762 /* we've already set whilem count on this node */
5763 } else if (++data->whilem_c < 16) {
5764 assert(data->whilem_c <= RExC_whilem_seen);
5765 nxt->flags = (U8)(data->whilem_c
5766 | (RExC_whilem_seen << 4)); /* On WHILEM */
5769 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
5771 if (flags & SCF_DO_SUBSTR) {
5772 SV *last_str = NULL;
5773 STRLEN last_chrs = 0;
5774 int counted = mincount != 0;
5776 if (data->last_end > 0 && mincount != 0) { /* Ends with a
5778 SSize_t b = pos_before >= data->last_start_min
5779 ? pos_before : data->last_start_min;
5781 const char * const s = SvPV_const(data->last_found, l);
5782 SSize_t old = b - data->last_start_min;
5786 old = utf8_hop_forward((U8*)s, old,
5787 (U8 *) SvEND(data->last_found))
5790 /* Get the added string: */
5791 last_str = newSVpvn_utf8(s + old, l, UTF);
5792 last_chrs = UTF ? utf8_length((U8*)(s + old),
5793 (U8*)(s + old + l)) : l;
5794 if (deltanext == 0 && pos_before == b) {
5795 /* What was added is a constant string */
5798 SvGROW(last_str, (mincount * l) + 1);
5799 repeatcpy(SvPVX(last_str) + l,
5800 SvPVX_const(last_str), l,
5802 SvCUR_set(last_str, SvCUR(last_str) * mincount);
5803 /* Add additional parts. */
5804 SvCUR_set(data->last_found,
5805 SvCUR(data->last_found) - l);
5806 sv_catsv(data->last_found, last_str);
5808 SV * sv = data->last_found;
5810 SvUTF8(sv) && SvMAGICAL(sv) ?
5811 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5812 if (mg && mg->mg_len >= 0)
5813 mg->mg_len += last_chrs * (mincount-1);
5815 last_chrs *= mincount;
5816 data->last_end += l * (mincount - 1);
5819 /* start offset must point into the last copy */
5820 data->last_start_min += minnext * (mincount - 1);
5821 data->last_start_max =
5824 : data->last_start_max +
5825 (maxcount - 1) * (minnext + data->pos_delta);
5828 /* It is counted once already... */
5829 data->pos_min += minnext * (mincount - counted);
5831 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
5832 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
5833 " maxcount=%" UVuf " mincount=%" UVuf "\n",
5834 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext, (UV)maxcount,
5836 if (deltanext != OPTIMIZE_INFTY)
5837 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
5838 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
5839 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
5841 if (deltanext == OPTIMIZE_INFTY
5842 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
5843 data->pos_delta = OPTIMIZE_INFTY;
5845 data->pos_delta += - counted * deltanext +
5846 (minnext + deltanext) * maxcount - minnext * mincount;
5847 if (mincount != maxcount) {
5848 /* Cannot extend fixed substrings found inside
5850 scan_commit(pRExC_state, data, minlenp, is_inf);
5851 if (mincount && last_str) {
5852 SV * const sv = data->last_found;
5853 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
5854 mg_find(sv, PERL_MAGIC_utf8) : NULL;
5858 sv_setsv(sv, last_str);
5859 data->last_end = data->pos_min;
5860 data->last_start_min = data->pos_min - last_chrs;
5861 data->last_start_max = is_inf
5863 : data->pos_min + data->pos_delta - last_chrs;
5865 data->cur_is_floating = 1; /* float */
5867 SvREFCNT_dec(last_str);
5869 if (data && (fl & SF_HAS_EVAL))
5870 data->flags |= SF_HAS_EVAL;
5871 optimize_curly_tail:
5872 rck_elide_nothing(oscan);
5876 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
5880 if (flags & SCF_DO_SUBSTR) {
5881 /* Cannot expect anything... */
5882 scan_commit(pRExC_state, data, minlenp, is_inf);
5883 data->cur_is_floating = 1; /* float */
5885 is_inf = is_inf_internal = 1;
5886 if (flags & SCF_DO_STCLASS_OR) {
5887 if (OP(scan) == CLUMP) {
5888 /* Actually is any start char, but very few code points
5889 * aren't start characters */
5890 ssc_match_all_cp(data->start_class);
5893 ssc_anything(data->start_class);
5896 flags &= ~SCF_DO_STCLASS;
5900 else if (OP(scan) == LNBREAK) {
5901 if (flags & SCF_DO_STCLASS) {
5902 if (flags & SCF_DO_STCLASS_AND) {
5903 ssc_intersection(data->start_class,
5904 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5905 ssc_clear_locale(data->start_class);
5906 ANYOF_FLAGS(data->start_class)
5907 &= ~SSC_MATCHES_EMPTY_STRING;
5909 else if (flags & SCF_DO_STCLASS_OR) {
5910 ssc_union(data->start_class,
5911 PL_XPosix_ptrs[_CC_VERTSPACE],
5913 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5915 /* See commit msg for
5916 * 749e076fceedeb708a624933726e7989f2302f6a */
5917 ANYOF_FLAGS(data->start_class)
5918 &= ~SSC_MATCHES_EMPTY_STRING;
5920 flags &= ~SCF_DO_STCLASS;
5923 if (delta != OPTIMIZE_INFTY)
5924 delta++; /* Because of the 2 char string cr-lf */
5925 if (flags & SCF_DO_SUBSTR) {
5926 /* Cannot expect anything... */
5927 scan_commit(pRExC_state, data, minlenp, is_inf);
5929 if (data->pos_delta != OPTIMIZE_INFTY) {
5930 data->pos_delta += 1;
5932 data->cur_is_floating = 1; /* float */
5935 else if (REGNODE_SIMPLE(OP(scan))) {
5937 if (flags & SCF_DO_SUBSTR) {
5938 scan_commit(pRExC_state, data, minlenp, is_inf);
5942 if (flags & SCF_DO_STCLASS) {
5944 SV* my_invlist = NULL;
5947 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5948 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5950 /* Some of the logic below assumes that switching
5951 locale on will only add false positives. */
5956 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5960 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5961 ssc_match_all_cp(data->start_class);
5966 SV* REG_ANY_invlist = _new_invlist(2);
5967 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5969 if (flags & SCF_DO_STCLASS_OR) {
5970 ssc_union(data->start_class,
5972 TRUE /* TRUE => invert, hence all but \n
5976 else if (flags & SCF_DO_STCLASS_AND) {
5977 ssc_intersection(data->start_class,
5979 TRUE /* TRUE => invert */
5981 ssc_clear_locale(data->start_class);
5983 SvREFCNT_dec_NN(REG_ANY_invlist);
5995 if (flags & SCF_DO_STCLASS_AND)
5996 ssc_and(pRExC_state, data->start_class,
5997 (regnode_charclass *) scan);
5999 ssc_or(pRExC_state, data->start_class,
6000 (regnode_charclass *) scan);
6003 case NANYOFM: /* NANYOFM already contains the inversion of the
6004 input ANYOF data, so, unlike things like
6005 NPOSIXA, don't change 'invert' to TRUE */
6009 SV* cp_list = get_ANYOFM_contents(scan);
6011 if (flags & SCF_DO_STCLASS_OR) {
6012 ssc_union(data->start_class, cp_list, invert);
6014 else if (flags & SCF_DO_STCLASS_AND) {
6015 ssc_intersection(data->start_class, cp_list, invert);
6018 SvREFCNT_dec_NN(cp_list);
6027 cp_list = _add_range_to_invlist(cp_list,
6029 ANYOFRbase(scan) + ANYOFRdelta(scan));
6031 if (flags & SCF_DO_STCLASS_OR) {
6032 ssc_union(data->start_class, cp_list, invert);
6034 else if (flags & SCF_DO_STCLASS_AND) {
6035 ssc_intersection(data->start_class, cp_list, invert);
6038 SvREFCNT_dec_NN(cp_list);
6047 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
6048 if (flags & SCF_DO_STCLASS_AND) {
6049 bool was_there = cBOOL(
6050 ANYOF_POSIXL_TEST(data->start_class,
6052 ANYOF_POSIXL_ZERO(data->start_class);
6053 if (was_there) { /* Do an AND */
6054 ANYOF_POSIXL_SET(data->start_class, namedclass);
6056 /* No individual code points can now match */
6057 data->start_class->invlist
6058 = sv_2mortal(_new_invlist(0));
6061 int complement = namedclass + ((invert) ? -1 : 1);
6063 assert(flags & SCF_DO_STCLASS_OR);
6065 /* If the complement of this class was already there,
6066 * the result is that they match all code points,
6067 * (\d + \D == everything). Remove the classes from
6068 * future consideration. Locale is not relevant in
6070 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
6071 ssc_match_all_cp(data->start_class);
6072 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
6073 ANYOF_POSIXL_CLEAR(data->start_class, complement);
6075 else { /* The usual case; just add this class to the
6077 ANYOF_POSIXL_SET(data->start_class, namedclass);
6082 case NPOSIXA: /* For these, we always know the exact set of
6087 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
6088 goto join_posix_and_ascii;
6096 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
6098 /* NPOSIXD matches all upper Latin1 code points unless the
6099 * target string being matched is UTF-8, which is
6100 * unknowable until match time. Since we are going to
6101 * invert, we want to get rid of all of them so that the
6102 * inversion will match all */
6103 if (OP(scan) == NPOSIXD) {
6104 _invlist_subtract(my_invlist, PL_UpperLatin1,
6108 join_posix_and_ascii:
6110 if (flags & SCF_DO_STCLASS_AND) {
6111 ssc_intersection(data->start_class, my_invlist, invert);
6112 ssc_clear_locale(data->start_class);
6115 assert(flags & SCF_DO_STCLASS_OR);
6116 ssc_union(data->start_class, my_invlist, invert);
6118 SvREFCNT_dec(my_invlist);
6120 if (flags & SCF_DO_STCLASS_OR)
6121 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6122 flags &= ~SCF_DO_STCLASS;
6125 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
6126 data->flags |= (OP(scan) == MEOL
6129 scan_commit(pRExC_state, data, minlenp, is_inf);
6132 else if ( PL_regkind[OP(scan)] == BRANCHJ
6133 /* Lookbehind, or need to calculate parens/evals/stclass: */
6134 && (scan->flags || data || (flags & SCF_DO_STCLASS))
6135 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
6137 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6138 || OP(scan) == UNLESSM )
6140 /* Negative Lookahead/lookbehind
6141 In this case we can't do fixed string optimisation.
6144 SSize_t deltanext, minnext, fake = 0;
6149 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6151 data_fake.whilem_c = data->whilem_c;
6152 data_fake.last_closep = data->last_closep;
6155 data_fake.last_closep = &fake;
6156 data_fake.pos_delta = delta;
6157 if ( flags & SCF_DO_STCLASS && !scan->flags
6158 && OP(scan) == IFMATCH ) { /* Lookahead */
6159 ssc_init(pRExC_state, &intrnl);
6160 data_fake.start_class = &intrnl;
6161 f |= SCF_DO_STCLASS_AND;
6163 if (flags & SCF_WHILEM_VISITED_POS)
6164 f |= SCF_WHILEM_VISITED_POS;
6165 next = regnext(scan);
6166 nscan = NEXTOPER(NEXTOPER(scan));
6168 /* recurse study_chunk() for lookahead body */
6169 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
6170 last, &data_fake, stopparen,
6171 recursed_depth, NULL, f, depth+1,
6175 || deltanext > (I32) U8_MAX
6176 || minnext > (I32)U8_MAX
6177 || minnext + deltanext > (I32)U8_MAX)
6179 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6183 /* The 'next_off' field has been repurposed to count the
6184 * additional starting positions to try beyond the initial
6185 * one. (This leaves it at 0 for non-variable length
6186 * matches to avoid breakage for those not using this
6189 scan->next_off = deltanext;
6190 ckWARNexperimental(RExC_parse,
6191 WARN_EXPERIMENTAL__VLB,
6192 "Variable length lookbehind is experimental");
6194 scan->flags = (U8)minnext + deltanext;
6197 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6199 if (data_fake.flags & SF_HAS_EVAL)
6200 data->flags |= SF_HAS_EVAL;
6201 data->whilem_c = data_fake.whilem_c;
6203 if (f & SCF_DO_STCLASS_AND) {
6204 if (flags & SCF_DO_STCLASS_OR) {
6205 /* OR before, AND after: ideally we would recurse with
6206 * data_fake to get the AND applied by study of the
6207 * remainder of the pattern, and then derecurse;
6208 * *** HACK *** for now just treat as "no information".
6209 * See [perl #56690].
6211 ssc_init(pRExC_state, data->start_class);
6213 /* AND before and after: combine and continue. These
6214 * assertions are zero-length, so can match an EMPTY
6216 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6217 ANYOF_FLAGS(data->start_class)
6218 |= SSC_MATCHES_EMPTY_STRING;
6222 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
6224 /* Positive Lookahead/lookbehind
6225 In this case we can do fixed string optimisation,
6226 but we must be careful about it. Note in the case of
6227 lookbehind the positions will be offset by the minimum
6228 length of the pattern, something we won't know about
6229 until after the recurse.
6231 SSize_t deltanext, fake = 0;
6235 /* We use SAVEFREEPV so that when the full compile
6236 is finished perl will clean up the allocated
6237 minlens when it's all done. This way we don't
6238 have to worry about freeing them when we know
6239 they wont be used, which would be a pain.
6242 Newx( minnextp, 1, SSize_t );
6243 SAVEFREEPV(minnextp);
6246 StructCopy(data, &data_fake, scan_data_t);
6247 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
6250 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
6251 data_fake.last_found=newSVsv(data->last_found);
6255 data_fake.last_closep = &fake;
6256 data_fake.flags = 0;
6257 data_fake.substrs[0].flags = 0;
6258 data_fake.substrs[1].flags = 0;
6259 data_fake.pos_delta = delta;
6261 data_fake.flags |= SF_IS_INF;
6262 if ( flags & SCF_DO_STCLASS && !scan->flags
6263 && OP(scan) == IFMATCH ) { /* Lookahead */
6264 ssc_init(pRExC_state, &intrnl);
6265 data_fake.start_class = &intrnl;
6266 f |= SCF_DO_STCLASS_AND;
6268 if (flags & SCF_WHILEM_VISITED_POS)
6269 f |= SCF_WHILEM_VISITED_POS;
6270 next = regnext(scan);
6271 nscan = NEXTOPER(NEXTOPER(scan));
6273 /* positive lookahead study_chunk() recursion */
6274 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
6275 &deltanext, last, &data_fake,
6276 stopparen, recursed_depth, NULL,
6277 f, depth+1, mutate_ok);
6279 assert(0); /* This code has never been tested since this
6280 is normally not compiled */
6282 || deltanext > (I32) U8_MAX
6283 || *minnextp > (I32)U8_MAX
6284 || *minnextp + deltanext > (I32)U8_MAX)
6286 FAIL2("Lookbehind longer than %" UVuf " not implemented",
6291 scan->next_off = deltanext;
6293 scan->flags = (U8)*minnextp + deltanext;
6298 if (f & SCF_DO_STCLASS_AND) {
6299 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
6300 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
6303 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6305 if (data_fake.flags & SF_HAS_EVAL)
6306 data->flags |= SF_HAS_EVAL;
6307 data->whilem_c = data_fake.whilem_c;
6308 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
6310 if (RExC_rx->minlen<*minnextp)
6311 RExC_rx->minlen=*minnextp;
6312 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
6313 SvREFCNT_dec_NN(data_fake.last_found);
6315 for (i = 0; i < 2; i++) {
6316 if (data_fake.substrs[i].minlenp != minlenp) {
6317 data->substrs[i].min_offset =
6318 data_fake.substrs[i].min_offset;
6319 data->substrs[i].max_offset =
6320 data_fake.substrs[i].max_offset;
6321 data->substrs[i].minlenp =
6322 data_fake.substrs[i].minlenp;
6323 data->substrs[i].lookbehind += scan->flags;
6331 else if (OP(scan) == OPEN) {
6332 if (stopparen != (I32)ARG(scan))
6335 else if (OP(scan) == CLOSE) {
6336 if (stopparen == (I32)ARG(scan)) {
6339 if ((I32)ARG(scan) == is_par) {
6340 next = regnext(scan);
6342 if ( next && (OP(next) != WHILEM) && next < last)
6343 is_par = 0; /* Disable optimization */
6346 *(data->last_closep) = ARG(scan);
6348 else if (OP(scan) == EVAL) {
6350 data->flags |= SF_HAS_EVAL;
6352 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
6353 if (flags & SCF_DO_SUBSTR) {
6354 scan_commit(pRExC_state, data, minlenp, is_inf);
6355 flags &= ~SCF_DO_SUBSTR;
6357 if (data && OP(scan)==ACCEPT) {
6358 data->flags |= SCF_SEEN_ACCEPT;
6363 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
6365 if (flags & SCF_DO_SUBSTR) {
6366 scan_commit(pRExC_state, data, minlenp, is_inf);
6367 data->cur_is_floating = 1; /* float */
6369 is_inf = is_inf_internal = 1;
6370 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
6371 ssc_anything(data->start_class);
6372 flags &= ~SCF_DO_STCLASS;
6374 else if (OP(scan) == GPOS) {
6375 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
6376 !(delta || is_inf || (data && data->pos_delta)))
6378 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
6379 RExC_rx->intflags |= PREGf_ANCH_GPOS;
6380 if (RExC_rx->gofs < (STRLEN)min)
6381 RExC_rx->gofs = min;
6383 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
6387 #ifdef TRIE_STUDY_OPT
6388 #ifdef FULL_TRIE_STUDY
6389 else if (PL_regkind[OP(scan)] == TRIE) {
6390 /* NOTE - There is similar code to this block above for handling
6391 BRANCH nodes on the initial study. If you change stuff here
6393 regnode *trie_node= scan;
6394 regnode *tail= regnext(scan);
6395 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6396 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
6399 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
6400 /* Cannot merge strings after this. */
6401 scan_commit(pRExC_state, data, minlenp, is_inf);
6403 if (flags & SCF_DO_STCLASS)
6404 ssc_init_zero(pRExC_state, &accum);
6410 const regnode *nextbranch= NULL;
6413 for ( word=1 ; word <= trie->wordcount ; word++)
6415 SSize_t deltanext=0, minnext=0, f = 0, fake;
6416 regnode_ssc this_class;
6418 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
6420 data_fake.whilem_c = data->whilem_c;
6421 data_fake.last_closep = data->last_closep;
6424 data_fake.last_closep = &fake;
6425 data_fake.pos_delta = delta;
6426 if (flags & SCF_DO_STCLASS) {
6427 ssc_init(pRExC_state, &this_class);
6428 data_fake.start_class = &this_class;
6429 f = SCF_DO_STCLASS_AND;
6431 if (flags & SCF_WHILEM_VISITED_POS)
6432 f |= SCF_WHILEM_VISITED_POS;
6434 if (trie->jump[word]) {
6436 nextbranch = trie_node + trie->jump[0];
6437 scan= trie_node + trie->jump[word];
6438 /* We go from the jump point to the branch that follows
6439 it. Note this means we need the vestigal unused
6440 branches even though they arent otherwise used. */
6441 /* optimise study_chunk() for TRIE */
6442 minnext = study_chunk(pRExC_state, &scan, minlenp,
6443 &deltanext, (regnode *)nextbranch, &data_fake,
6444 stopparen, recursed_depth, NULL, f, depth+1,
6447 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
6448 nextbranch= regnext((regnode*)nextbranch);
6450 if (min1 > (SSize_t)(minnext + trie->minlen))
6451 min1 = minnext + trie->minlen;
6452 if (deltanext == OPTIMIZE_INFTY) {
6453 is_inf = is_inf_internal = 1;
6454 max1 = OPTIMIZE_INFTY;
6455 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
6456 max1 = minnext + deltanext + trie->maxlen;
6458 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
6460 if (data_fake.flags & SCF_SEEN_ACCEPT) {
6461 if ( stopmin > min + min1)
6462 stopmin = min + min1;
6463 flags &= ~SCF_DO_SUBSTR;
6465 data->flags |= SCF_SEEN_ACCEPT;
6468 if (data_fake.flags & SF_HAS_EVAL)
6469 data->flags |= SF_HAS_EVAL;
6470 data->whilem_c = data_fake.whilem_c;
6472 if (flags & SCF_DO_STCLASS)
6473 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
6476 if (flags & SCF_DO_SUBSTR) {
6477 data->pos_min += min1;
6478 data->pos_delta += max1 - min1;
6479 if (max1 != min1 || is_inf)
6480 data->cur_is_floating = 1; /* float */
6483 if (delta != OPTIMIZE_INFTY) {
6484 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
6485 delta += max1 - min1;
6487 delta = OPTIMIZE_INFTY;
6489 if (flags & SCF_DO_STCLASS_OR) {
6490 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6492 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6493 flags &= ~SCF_DO_STCLASS;
6496 else if (flags & SCF_DO_STCLASS_AND) {
6498 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
6499 flags &= ~SCF_DO_STCLASS;
6502 /* Switch to OR mode: cache the old value of
6503 * data->start_class */
6505 StructCopy(data->start_class, and_withp, regnode_ssc);
6506 flags &= ~SCF_DO_STCLASS_AND;
6507 StructCopy(&accum, data->start_class, regnode_ssc);
6508 flags |= SCF_DO_STCLASS_OR;
6515 else if (PL_regkind[OP(scan)] == TRIE) {
6516 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
6519 min += trie->minlen;
6520 delta += (trie->maxlen - trie->minlen);
6521 flags &= ~SCF_DO_STCLASS; /* xxx */
6522 if (flags & SCF_DO_SUBSTR) {
6523 /* Cannot expect anything... */
6524 scan_commit(pRExC_state, data, minlenp, is_inf);
6525 data->pos_min += trie->minlen;
6526 data->pos_delta += (trie->maxlen - trie->minlen);
6527 if (trie->maxlen != trie->minlen)
6528 data->cur_is_floating = 1; /* float */
6530 if (trie->jump) /* no more substrings -- for now /grr*/
6531 flags &= ~SCF_DO_SUBSTR;
6533 else if (OP(scan) == REGEX_SET) {
6534 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6535 " before optimization", reg_name[REGEX_SET]);
6538 #endif /* old or new */
6539 #endif /* TRIE_STUDY_OPT */
6541 /* Else: zero-length, ignore. */
6542 scan = regnext(scan);
6547 /* we need to unwind recursion. */
6550 DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
6551 DEBUG_PEEP("fend", scan, depth, flags);
6553 /* restore previous context */
6554 last = frame->last_regnode;
6555 scan = frame->next_regnode;
6556 stopparen = frame->stopparen;
6557 recursed_depth = frame->prev_recursed_depth;
6559 RExC_frame_last = frame->prev_frame;
6560 frame = frame->this_prev_frame;
6561 goto fake_study_recurse;
6565 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
6568 *deltap = is_inf_internal ? OPTIMIZE_INFTY : delta;
6570 if (flags & SCF_DO_SUBSTR && is_inf)
6571 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
6572 if (is_par > (I32)U8_MAX)
6574 if (is_par && pars==1 && data) {
6575 data->flags |= SF_IN_PAR;
6576 data->flags &= ~SF_HAS_PAR;
6578 else if (pars && data) {
6579 data->flags |= SF_HAS_PAR;
6580 data->flags &= ~SF_IN_PAR;
6582 if (flags & SCF_DO_STCLASS_OR)
6583 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
6584 if (flags & SCF_TRIE_RESTUDY)
6585 data->flags |= SCF_TRIE_RESTUDY;
6587 DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
6589 final_minlen = min < stopmin
6592 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
6593 if (final_minlen > OPTIMIZE_INFTY - delta)
6594 RExC_maxlen = OPTIMIZE_INFTY;
6595 else if (RExC_maxlen < final_minlen + delta)
6596 RExC_maxlen = final_minlen + delta;
6598 return final_minlen;
6602 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
6604 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
6606 PERL_ARGS_ASSERT_ADD_DATA;
6608 Renewc(RExC_rxi->data,
6609 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
6610 char, struct reg_data);
6612 Renew(RExC_rxi->data->what, count + n, U8);
6614 Newx(RExC_rxi->data->what, n, U8);
6615 RExC_rxi->data->count = count + n;
6616 Copy(s, RExC_rxi->data->what + count, n, U8);
6620 /*XXX: todo make this not included in a non debugging perl, but appears to be
6621 * used anyway there, in 'use re' */
6622 #ifndef PERL_IN_XSUB_RE
6624 Perl_reginitcolors(pTHX)
6626 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
6628 char *t = savepv(s);
6632 t = strchr(t, '\t');
6638 PL_colors[i] = t = (char *)"";
6643 PL_colors[i++] = (char *)"";
6650 #ifdef TRIE_STUDY_OPT
6651 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
6654 (data.flags & SCF_TRIE_RESTUDY) \
6662 #define CHECK_RESTUDY_GOTO_butfirst
6666 * pregcomp - compile a regular expression into internal code
6668 * Decides which engine's compiler to call based on the hint currently in
6672 #ifndef PERL_IN_XSUB_RE
6674 /* return the currently in-scope regex engine (or the default if none) */
6676 regexp_engine const *
6677 Perl_current_re_engine(pTHX)
6679 if (IN_PERL_COMPILETIME) {
6680 HV * const table = GvHV(PL_hintgv);
6683 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
6684 return &PL_core_reg_engine;
6685 ptr = hv_fetchs(table, "regcomp", FALSE);
6686 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
6687 return &PL_core_reg_engine;
6688 return INT2PTR(regexp_engine*, SvIV(*ptr));
6692 if (!PL_curcop->cop_hints_hash)
6693 return &PL_core_reg_engine;
6694 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
6695 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
6696 return &PL_core_reg_engine;
6697 return INT2PTR(regexp_engine*, SvIV(ptr));
6703 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
6705 regexp_engine const *eng = current_re_engine();
6706 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6708 PERL_ARGS_ASSERT_PREGCOMP;
6710 /* Dispatch a request to compile a regexp to correct regexp engine. */
6712 Perl_re_printf( aTHX_ "Using engine %" UVxf "\n",
6715 return CALLREGCOMP_ENG(eng, pattern, flags);
6719 /* public(ish) entry point for the perl core's own regex compiling code.
6720 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
6721 * pattern rather than a list of OPs, and uses the internal engine rather
6722 * than the current one */
6725 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
6727 SV *pat = pattern; /* defeat constness! */
6729 PERL_ARGS_ASSERT_RE_COMPILE;
6731 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
6732 #ifdef PERL_IN_XSUB_RE
6735 &PL_core_reg_engine,
6737 NULL, NULL, rx_flags, 0);
6741 S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs)
6745 if (--cbs->refcnt > 0)
6747 for (n = 0; n < cbs->count; n++) {
6748 REGEXP *rx = cbs->cb[n].src_regex;
6750 cbs->cb[n].src_regex = NULL;
6751 SvREFCNT_dec_NN(rx);
6759 static struct reg_code_blocks *
6760 S_alloc_code_blocks(pTHX_ int ncode)
6762 struct reg_code_blocks *cbs;
6763 Newx(cbs, 1, struct reg_code_blocks);
6766 SAVEDESTRUCTOR_X(S_free_codeblocks, cbs);
6768 Newx(cbs->cb, ncode, struct reg_code_block);
6775 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
6776 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
6777 * point to the realloced string and length.
6779 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
6783 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
6784 char **pat_p, STRLEN *plen_p, int num_code_blocks)
6786 U8 *const src = (U8*)*pat_p;
6791 DECLARE_AND_GET_RE_DEBUG_FLAGS;
6793 DEBUG_PARSE_r(Perl_re_printf( aTHX_
6794 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
6796 /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
6797 Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
6800 while (s < *plen_p) {
6801 append_utf8_from_native_byte(src[s], &d);
6803 if (n < num_code_blocks) {
6804 assert(pRExC_state->code_blocks);
6805 if (!do_end && pRExC_state->code_blocks->cb[n].start == s) {
6806 pRExC_state->code_blocks->cb[n].start = d - dst - 1;
6807 assert(*(d - 1) == '(');
6810 else if (do_end && pRExC_state->code_blocks->cb[n].end == s) {
6811 pRExC_state->code_blocks->cb[n].end = d - dst - 1;
6812 assert(*(d - 1) == ')');
6821 *pat_p = (char*) dst;
6823 RExC_orig_utf8 = RExC_utf8 = 1;
6828 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
6829 * while recording any code block indices, and handling overloading,
6830 * nested qr// objects etc. If pat is null, it will allocate a new
6831 * string, or just return the first arg, if there's only one.
6833 * Returns the malloced/updated pat.
6834 * patternp and pat_count is the array of SVs to be concatted;
6835 * oplist is the optional list of ops that generated the SVs;
6836 * recompile_p is a pointer to a boolean that will be set if
6837 * the regex will need to be recompiled.
6838 * delim, if non-null is an SV that will be inserted between each element
6842 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
6843 SV *pat, SV ** const patternp, int pat_count,
6844 OP *oplist, bool *recompile_p, SV *delim)
6848 bool use_delim = FALSE;
6849 bool alloced = FALSE;
6851 /* if we know we have at least two args, create an empty string,
6852 * then concatenate args to that. For no args, return an empty string */
6853 if (!pat && pat_count != 1) {
6859 for (svp = patternp; svp < patternp + pat_count; svp++) {
6862 STRLEN orig_patlen = 0;
6864 SV *msv = use_delim ? delim : *svp;
6865 if (!msv) msv = &PL_sv_undef;
6867 /* if we've got a delimiter, we go round the loop twice for each
6868 * svp slot (except the last), using the delimiter the second
6877 if (SvTYPE(msv) == SVt_PVAV) {
6878 /* we've encountered an interpolated array within
6879 * the pattern, e.g. /...@a..../. Expand the list of elements,
6880 * then recursively append elements.
6881 * The code in this block is based on S_pushav() */
6883 AV *const av = (AV*)msv;
6884 const SSize_t maxarg = AvFILL(av) + 1;
6888 assert(oplist->op_type == OP_PADAV
6889 || oplist->op_type == OP_RV2AV);
6890 oplist = OpSIBLING(oplist);
6893 if (SvRMAGICAL(av)) {
6896 Newx(array, maxarg, SV*);
6898 for (i=0; i < maxarg; i++) {
6899 SV ** const svp = av_fetch(av, i, FALSE);
6900 array[i] = svp ? *svp : &PL_sv_undef;
6904 array = AvARRAY(av);
6906 pat = S_concat_pat(aTHX_ pRExC_state, pat,
6907 array, maxarg, NULL, recompile_p,
6909 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
6915 /* we make the assumption here that each op in the list of
6916 * op_siblings maps to one SV pushed onto the stack,
6917 * except for code blocks, with have both an OP_NULL and
6919 * This allows us to match up the list of SVs against the
6920 * list of OPs to find the next code block.
6922 * Note that PUSHMARK PADSV PADSV ..
6924 * PADRANGE PADSV PADSV ..
6925 * so the alignment still works. */
6928 if (oplist->op_type == OP_NULL
6929 && (oplist->op_flags & OPf_SPECIAL))
6931 assert(n < pRExC_state->code_blocks->count);
6932 pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0;
6933 pRExC_state->code_blocks->cb[n].block = oplist;
6934 pRExC_state->code_blocks->cb[n].src_regex = NULL;
6937 oplist = OpSIBLING(oplist); /* skip CONST */
6940 oplist = OpSIBLING(oplist);;
6943 /* apply magic and QR overloading to arg */
6946 if (SvROK(msv) && SvAMAGIC(msv)) {
6947 SV *sv = AMG_CALLunary(msv, regexp_amg);
6951 if (SvTYPE(sv) != SVt_REGEXP)
6952 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
6957 /* try concatenation overload ... */
6958 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
6959 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
6962 /* overloading involved: all bets are off over literal
6963 * code. Pretend we haven't seen it */
6965 pRExC_state->code_blocks->count -= n;
6969 /* ... or failing that, try "" overload */
6970 while (SvAMAGIC(msv)
6971 && (sv = AMG_CALLunary(msv, string_amg))
6975 && SvRV(msv) == SvRV(sv))
6980 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
6984 /* this is a partially unrolled
6985 * sv_catsv_nomg(pat, msv);
6986 * that allows us to adjust code block indices if
6989 char *dst = SvPV_force_nomg(pat, dlen);
6991 if (SvUTF8(msv) && !SvUTF8(pat)) {
6992 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6993 sv_setpvn(pat, dst, dlen);
6996 sv_catsv_nomg(pat, msv);
7000 /* We have only one SV to process, but we need to verify
7001 * it is properly null terminated or we will fail asserts
7002 * later. In theory we probably shouldn't get such SV's,
7003 * but if we do we should handle it gracefully. */
7004 if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) || SvIsCOW_shared_hash(msv) ) {
7005 /* not a string, or a string with a trailing null */
7008 /* a string with no trailing null, we need to copy it
7009 * so it has a trailing null */
7010 pat = sv_2mortal(newSVsv(msv));
7015 pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1;
7018 /* extract any code blocks within any embedded qr//'s */
7019 if (rx && SvTYPE(rx) == SVt_REGEXP
7020 && RX_ENGINE((REGEXP*)rx)->op_comp)
7023 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
7024 if (ri->code_blocks && ri->code_blocks->count) {
7026 /* the presence of an embedded qr// with code means
7027 * we should always recompile: the text of the
7028 * qr// may not have changed, but it may be a
7029 * different closure than last time */
7031 if (pRExC_state->code_blocks) {
7032 int new_count = pRExC_state->code_blocks->count
7033 + ri->code_blocks->count;
7034 Renew(pRExC_state->code_blocks->cb,
7035 new_count, struct reg_code_block);
7036 pRExC_state->code_blocks->count = new_count;
7039 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_
7040 ri->code_blocks->count);
7042 for (i=0; i < ri->code_blocks->count; i++) {
7043 struct reg_code_block *src, *dst;
7044 STRLEN offset = orig_patlen
7045 + ReANY((REGEXP *)rx)->pre_prefix;
7046 assert(n < pRExC_state->code_blocks->count);
7047 src = &ri->code_blocks->cb[i];
7048 dst = &pRExC_state->code_blocks->cb[n];
7049 dst->start = src->start + offset;
7050 dst->end = src->end + offset;
7051 dst->block = src->block;
7052 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
7061 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
7070 /* see if there are any run-time code blocks in the pattern.
7071 * False positives are allowed */
7074 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7075 char *pat, STRLEN plen)
7080 PERL_UNUSED_CONTEXT;
7082 for (s = 0; s < plen; s++) {
7083 if ( pRExC_state->code_blocks
7084 && n < pRExC_state->code_blocks->count
7085 && s == pRExC_state->code_blocks->cb[n].start)
7087 s = pRExC_state->code_blocks->cb[n].end;
7091 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
7093 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
7095 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
7102 /* Handle run-time code blocks. We will already have compiled any direct
7103 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
7104 * copy of it, but with any literal code blocks blanked out and
7105 * appropriate chars escaped; then feed it into
7107 * eval "qr'modified_pattern'"
7111 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
7115 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
7117 * After eval_sv()-ing that, grab any new code blocks from the returned qr
7118 * and merge them with any code blocks of the original regexp.
7120 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
7121 * instead, just save the qr and return FALSE; this tells our caller that
7122 * the original pattern needs upgrading to utf8.
7126 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
7127 char *pat, STRLEN plen)
7131 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7133 if (pRExC_state->runtime_code_qr) {
7134 /* this is the second time we've been called; this should
7135 * only happen if the main pattern got upgraded to utf8
7136 * during compilation; re-use the qr we compiled first time
7137 * round (which should be utf8 too)
7139 qr = pRExC_state->runtime_code_qr;
7140 pRExC_state->runtime_code_qr = NULL;
7141 assert(RExC_utf8 && SvUTF8(qr));
7147 int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */
7151 /* determine how many extra chars we need for ' and \ escaping */
7152 for (s = 0; s < plen; s++) {
7153 if (pat[s] == '\'' || pat[s] == '\\')
7157 Newx(newpat, newlen, char);
7159 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
7161 for (s = 0; s < plen; s++) {
7162 if ( pRExC_state->code_blocks
7163 && n < pRExC_state->code_blocks->count
7164 && s == pRExC_state->code_blocks->cb[n].start)
7166 /* blank out literal code block so that they aren't
7167 * recompiled: eg change from/to:
7177 assert(pat[s] == '(');
7178 assert(pat[s+1] == '?');
7182 while (s < pRExC_state->code_blocks->cb[n].end) {
7190 if (pat[s] == '\'' || pat[s] == '\\')
7195 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) {
7197 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) {
7203 Perl_re_printf( aTHX_
7204 "%sre-parsing pattern for runtime code:%s %s\n",
7205 PL_colors[4], PL_colors[5], newpat);
7208 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
7214 PUSHSTACKi(PERLSI_REQUIRE);
7215 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
7216 * parsing qr''; normally only q'' does this. It also alters
7218 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
7219 SvREFCNT_dec_NN(sv);
7224 SV * const errsv = ERRSV;
7225 if (SvTRUE_NN(errsv))
7226 /* use croak_sv ? */
7227 Perl_croak_nocontext("%" SVf, SVfARG(errsv));
7229 assert(SvROK(qr_ref));
7231 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
7232 /* the leaving below frees the tmp qr_ref.
7233 * Give qr a life of its own */
7241 if (!RExC_utf8 && SvUTF8(qr)) {
7242 /* first time through; the pattern got upgraded; save the
7243 * qr for the next time through */
7244 assert(!pRExC_state->runtime_code_qr);
7245 pRExC_state->runtime_code_qr = qr;
7250 /* extract any code blocks within the returned qr// */
7253 /* merge the main (r1) and run-time (r2) code blocks into one */
7255 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
7256 struct reg_code_block *new_block, *dst;
7257 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
7261 if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */
7263 SvREFCNT_dec_NN(qr);
7267 if (!r1->code_blocks)
7268 r1->code_blocks = S_alloc_code_blocks(aTHX_ 0);
7270 r1c = r1->code_blocks->count;
7271 r2c = r2->code_blocks->count;
7273 Newx(new_block, r1c + r2c, struct reg_code_block);
7277 while (i1 < r1c || i2 < r2c) {
7278 struct reg_code_block *src;
7282 src = &r2->code_blocks->cb[i2++];
7286 src = &r1->code_blocks->cb[i1++];
7287 else if ( r1->code_blocks->cb[i1].start
7288 < r2->code_blocks->cb[i2].start)
7290 src = &r1->code_blocks->cb[i1++];
7291 assert(src->end < r2->code_blocks->cb[i2].start);
7294 assert( r1->code_blocks->cb[i1].start
7295 > r2->code_blocks->cb[i2].start);
7296 src = &r2->code_blocks->cb[i2++];
7298 assert(src->end < r1->code_blocks->cb[i1].start);
7301 assert(pat[src->start] == '(');
7302 assert(pat[src->end] == ')');
7303 dst->start = src->start;
7304 dst->end = src->end;
7305 dst->block = src->block;
7306 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
7310 r1->code_blocks->count += r2c;
7311 Safefree(r1->code_blocks->cb);
7312 r1->code_blocks->cb = new_block;
7315 SvREFCNT_dec_NN(qr);
7321 S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
7322 struct reg_substr_datum *rsd,
7323 struct scan_data_substrs *sub,
7324 STRLEN longest_length)
7326 /* This is the common code for setting up the floating and fixed length
7327 * string data extracted from Perl_re_op_compile() below. Returns a boolean
7328 * as to whether succeeded or not */
7332 bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
7333 bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
7335 if (! (longest_length
7336 || (eol /* Can't have SEOL and MULTI */
7337 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
7339 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
7340 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
7345 /* copy the information about the longest from the reg_scan_data
7346 over to the program. */
7347 if (SvUTF8(sub->str)) {
7349 rsd->utf8_substr = sub->str;
7351 rsd->substr = sub->str;
7352 rsd->utf8_substr = NULL;
7354 /* end_shift is how many chars that must be matched that
7355 follow this item. We calculate it ahead of time as once the
7356 lookbehind offset is added in we lose the ability to correctly
7358 ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
7359 rsd->end_shift = ml - sub->min_offset
7361 /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
7363 + (SvTAIL(sub->str) != 0)
7367 t = (eol/* Can't have SEOL and MULTI */
7368 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
7369 fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
7375 S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
7377 /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
7378 * properly wrapped with the right modifiers */
7380 bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
7381 bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
7382 != REGEX_DEPENDS_CHARSET);
7384 /* The caret is output if there are any defaults: if not all the STD
7385 * flags are set, or if no character set specifier is needed */
7387 (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
7389 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
7390 == REG_RUN_ON_COMMENT_SEEN);
7391 U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
7392 >> RXf_PMf_STD_PMMOD_SHIFT);
7393 const char *fptr = STD_PAT_MODS; /*"msixxn"*/
7395 STRLEN pat_len = RExC_precomp_end - RExC_precomp;
7397 /* We output all the necessary flags; we never output a minus, as all
7398 * those are defaults, so are
7399 * covered by the caret */
7400 const STRLEN wraplen = pat_len + has_p + has_runon
7401 + has_default /* If needs a caret */
7402 + PL_bitcount[reganch] /* 1 char for each set standard flag */
7404 /* If needs a character set specifier */
7405 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
7406 + (sizeof("(?:)") - 1);
7408 PERL_ARGS_ASSERT_SET_REGEX_PV;
7410 /* make sure PL_bitcount bounds not exceeded */
7411 STATIC_ASSERT_STMT(sizeof(STD_PAT_MODS) <= 8);
7413 p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
7416 SvFLAGS(Rx) |= SVf_UTF8;
7419 /* If a default, cover it using the caret */
7421 *p++= DEFAULT_PAT_MOD;
7427 name = get_regex_charset_name(RExC_rx->extflags, &len);
7428 if (strEQ(name, DEPENDS_PAT_MODS)) { /* /d under UTF-8 => /u */
7430 name = UNICODE_PAT_MODS;
7431 len = sizeof(UNICODE_PAT_MODS) - 1;
7433 Copy(name, p, len, char);
7437 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
7440 while((ch = *fptr++)) {
7448 Copy(RExC_precomp, p, pat_len, char);
7449 assert ((RX_WRAPPED(Rx) - p) < 16);
7450 RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
7453 /* Adding a trailing \n causes this to compile properly:
7454 my $R = qr / A B C # D E/x; /($R)/
7455 Otherwise the parens are considered part of the comment */
7460 SvCUR_set(Rx, p - RX_WRAPPED(Rx));
7464 * Perl_re_op_compile - the perl internal RE engine's function to compile a
7465 * regular expression into internal code.
7466 * The pattern may be passed either as:
7467 * a list of SVs (patternp plus pat_count)
7468 * a list of OPs (expr)
7469 * If both are passed, the SV list is used, but the OP list indicates
7470 * which SVs are actually pre-compiled code blocks
7472 * The SVs in the list have magic and qr overloading applied to them (and
7473 * the list may be modified in-place with replacement SVs in the latter
7476 * If the pattern hasn't changed from old_re, then old_re will be
7479 * eng is the current engine. If that engine has an op_comp method, then
7480 * handle directly (i.e. we assume that op_comp was us); otherwise, just
7481 * do the initial concatenation of arguments and pass on to the external
7484 * If is_bare_re is not null, set it to a boolean indicating whether the
7485 * arg list reduced (after overloading) to a single bare regex which has
7486 * been returned (i.e. /$qr/).
7488 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
7490 * pm_flags contains the PMf_* flags, typically based on those from the
7491 * pm_flags field of the related PMOP. Currently we're only interested in
7492 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL, PMf_WILDCARD.
7494 * For many years this code had an initial sizing pass that calculated
7495 * (sometimes incorrectly, leading to security holes) the size needed for the
7496 * compiled pattern. That was changed by commit
7497 * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
7498 * node at a time, as parsing goes along. Patches welcome to fix any obsolete
7499 * references to this sizing pass.
7501 * Now, an initial crude guess as to the size needed is made, based on the
7502 * length of the pattern. Patches welcome to improve that guess. That amount
7503 * of space is malloc'd and then immediately freed, and then clawed back node
7504 * by node. This design is to minimze, to the extent possible, memory churn
7505 * when doing the reallocs.
7507 * A separate parentheses counting pass may be needed in some cases.
7508 * (Previously the sizing pass did this.) Patches welcome to reduce the number
7511 * The existence of a sizing pass necessitated design decisions that are no
7512 * longer needed. There are potential areas of simplification.
7514 * Beware that the optimization-preparation code in here knows about some
7515 * of the structure of the compiled regexp. [I'll say.]
7519 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7520 OP *expr, const regexp_engine* eng, REGEXP *old_re,
7521 bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
7523 REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
7531 SV** new_patternp = patternp;
7533 /* these are all flags - maybe they should be turned
7534 * into a single int with different bit masks */
7535 I32 sawlookahead = 0;
7540 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
7542 bool runtime_code = 0;
7544 RExC_state_t RExC_state;
7545 RExC_state_t * const pRExC_state = &RExC_state;
7546 #ifdef TRIE_STUDY_OPT
7548 RExC_state_t copyRExC_state;
7550 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7552 PERL_ARGS_ASSERT_RE_OP_COMPILE;
7554 DEBUG_r(if (!PL_colorset) reginitcolors());
7557 pRExC_state->warn_text = NULL;
7558 pRExC_state->unlexed_names = NULL;
7559 pRExC_state->code_blocks = NULL;
7562 *is_bare_re = FALSE;
7564 if (expr && (expr->op_type == OP_LIST ||
7565 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
7566 /* allocate code_blocks if needed */
7570 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
7571 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
7572 ncode++; /* count of DO blocks */
7575 pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode);
7579 /* compile-time pattern with just OP_CONSTs and DO blocks */
7584 /* find how many CONSTs there are */
7587 if (expr->op_type == OP_CONST)
7590 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7591 if (o->op_type == OP_CONST)
7595 /* fake up an SV array */
7597 assert(!new_patternp);
7598 Newx(new_patternp, n, SV*);
7599 SAVEFREEPV(new_patternp);
7603 if (expr->op_type == OP_CONST)
7604 new_patternp[n] = cSVOPx_sv(expr);
7606 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
7607 if (o->op_type == OP_CONST)
7608 new_patternp[n++] = cSVOPo_sv;
7613 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7614 "Assembling pattern from %d elements%s\n", pat_count,
7615 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7617 /* set expr to the first arg op */
7619 if (pRExC_state->code_blocks && pRExC_state->code_blocks->count
7620 && expr->op_type != OP_CONST)
7622 expr = cLISTOPx(expr)->op_first;
7623 assert( expr->op_type == OP_PUSHMARK
7624 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
7625 || expr->op_type == OP_PADRANGE);
7626 expr = OpSIBLING(expr);
7629 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
7630 expr, &recompile, NULL);
7632 /* handle bare (possibly after overloading) regex: foo =~ $re */
7637 if (SvTYPE(re) == SVt_REGEXP) {
7641 DEBUG_PARSE_r(Perl_re_printf( aTHX_
7642 "Precompiled pattern%s\n",
7643 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
7649 exp = SvPV_nomg(pat, plen);
7651 if (!eng->op_comp) {
7652 if ((SvUTF8(pat) && IN_BYTES)
7653 || SvGMAGICAL(pat) || SvAMAGIC(pat))
7655 /* make a temporary copy; either to convert to bytes,
7656 * or to avoid repeating get-magic / overloaded stringify */
7657 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
7658 (IN_BYTES ? 0 : SvUTF8(pat)));
7660 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
7663 /* ignore the utf8ness if the pattern is 0 length */
7664 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
7665 RExC_uni_semantics = 0;
7666 RExC_contains_locale = 0;
7667 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
7668 RExC_in_script_run = 0;
7669 RExC_study_started = 0;
7670 pRExC_state->runtime_code_qr = NULL;
7671 RExC_frame_head= NULL;
7672 RExC_frame_last= NULL;
7673 RExC_frame_count= 0;
7674 RExC_latest_warn_offset = 0;
7675 RExC_use_BRANCHJ = 0;
7676 RExC_warned_WARN_EXPERIMENTAL__VLB = 0;
7677 RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS = 0;
7678 RExC_total_parens = 0;
7679 RExC_open_parens = NULL;
7680 RExC_close_parens = NULL;
7681 RExC_paren_names = NULL;
7683 RExC_seen_d_op = FALSE;
7685 RExC_paren_name_list = NULL;
7689 RExC_mysv1= sv_newmortal();
7690 RExC_mysv2= sv_newmortal();
7694 SV *dsv= sv_newmortal();
7695 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7696 Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
7697 PL_colors[4], PL_colors[5], s);
7700 /* we jump here if we have to recompile, e.g., from upgrading the pattern
7703 if ((pm_flags & PMf_USE_RE_EVAL)
7704 /* this second condition covers the non-regex literal case,
7705 * i.e. $foo =~ '(?{})'. */
7706 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
7708 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
7711 /* return old regex if pattern hasn't changed */
7712 /* XXX: note in the below we have to check the flags as well as the
7715 * Things get a touch tricky as we have to compare the utf8 flag
7716 * independently from the compile flags. */
7720 && !!RX_UTF8(old_re) == !!RExC_utf8
7721 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
7722 && RX_PRECOMP(old_re)
7723 && RX_PRELEN(old_re) == plen
7724 && memEQ(RX_PRECOMP(old_re), exp, plen)
7725 && !runtime_code /* with runtime code, always recompile */ )
7728 SV *dsv= sv_newmortal();
7729 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
7730 Perl_re_printf( aTHX_ "%sSkipping recompilation of unchanged REx%s %s\n",
7731 PL_colors[4], PL_colors[5], s);
7736 /* Allocate the pattern's SV */
7737 RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
7738 RExC_rx = ReANY(Rx);
7739 if ( RExC_rx == NULL )
7740 FAIL("Regexp out of space");
7742 rx_flags = orig_rx_flags;
7744 if ( toUSE_UNI_CHARSET_NOT_DEPENDS
7745 && initial_charset == REGEX_DEPENDS_CHARSET)
7748 /* Set to use unicode semantics if the pattern is in utf8 and has the
7749 * 'depends' charset specified, as it means unicode when utf8 */
7750 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
7751 RExC_uni_semantics = 1;
7754 RExC_pm_flags = pm_flags;
7757 assert(TAINTING_get || !TAINT_get);
7759 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
7761 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
7762 /* whoops, we have a non-utf8 pattern, whilst run-time code
7763 * got compiled as utf8. Try again with a utf8 pattern */
7764 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7765 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7769 assert(!pRExC_state->runtime_code_qr);
7775 RExC_in_lookbehind = 0;
7776 RExC_in_lookahead = 0;
7777 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
7778 RExC_recode_x_to_native = 0;
7779 RExC_in_multi_char_class = 0;
7781 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
7782 RExC_precomp_end = RExC_end = exp + plen;
7784 RExC_whilem_seen = 0;
7786 RExC_recurse = NULL;
7787 RExC_study_chunk_recursed = NULL;
7788 RExC_study_chunk_recursed_bytes= 0;
7789 RExC_recurse_count = 0;
7790 RExC_sets_depth = 0;
7791 pRExC_state->code_index = 0;
7793 /* Initialize the string in the compiled pattern. This is so that there is
7794 * something to output if necessary */
7795 set_regex_pv(pRExC_state, Rx);
7798 Perl_re_printf( aTHX_
7799 "Starting parse and generation\n");
7801 RExC_lastparse=NULL;
7804 /* Allocate space and zero-initialize. Note, the two step process
7805 of zeroing when in debug mode, thus anything assigned has to
7806 happen after that */
7809 /* On the first pass of the parse, we guess how big this will be. Then
7810 * we grow in one operation to that amount and then give it back. As
7811 * we go along, we re-allocate what we need.
7813 * XXX Currently the guess is essentially that the pattern will be an
7814 * EXACT node with one byte input, one byte output. This is crude, and
7815 * better heuristics are welcome.
7817 * On any subsequent passes, we guess what we actually computed in the
7818 * latest earlier pass. Such a pass probably didn't complete so is
7819 * missing stuff. We could improve those guesses by knowing where the
7820 * parse stopped, and use the length so far plus apply the above
7821 * assumption to what's left. */
7822 RExC_size = STR_SZ(RExC_end - RExC_start);
7825 Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
7826 if ( RExC_rxi == NULL )
7827 FAIL("Regexp out of space");
7829 Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
7830 RXi_SET( RExC_rx, RExC_rxi );
7832 /* We start from 0 (over from 0 in the case this is a reparse. The first
7833 * node parsed will give back any excess memory we have allocated so far).
7837 /* non-zero initialization begins here */
7838 RExC_rx->engine= eng;
7839 RExC_rx->extflags = rx_flags;
7840 RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
7842 if (pm_flags & PMf_IS_QR) {
7843 RExC_rxi->code_blocks = pRExC_state->code_blocks;
7844 if (RExC_rxi->code_blocks) {
7845 RExC_rxi->code_blocks->refcnt++;
7849 RExC_rx->intflags = 0;
7851 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
7854 /* This NUL is guaranteed because the pattern comes from an SV*, and the sv
7855 * code makes sure the final byte is an uncounted NUL. But should this
7856 * ever not be the case, lots of things could read beyond the end of the
7857 * buffer: loops like
7858 * while(isFOO(*RExC_parse)) RExC_parse++;
7859 * strchr(RExC_parse, "foo");
7860 * etc. So it is worth noting. */
7861 assert(*RExC_end == '\0');
7865 RExC_parens_buf_size = 0;
7866 RExC_emit_start = RExC_rxi->program;
7867 pRExC_state->code_index = 0;
7869 *((char*) RExC_emit_start) = (char) REG_MAGIC;
7873 if (reg(pRExC_state, 0, &flags, 1)) {
7875 /* Success!, But we may need to redo the parse knowing how many parens
7876 * there actually are */
7877 if (IN_PARENS_PASS) {
7878 flags |= RESTART_PARSE;
7881 /* We have that number in RExC_npar */
7882 RExC_total_parens = RExC_npar;
7884 else if (! MUST_RESTART(flags)) {
7886 Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
7889 /* Here, we either have success, or we have to redo the parse for some reason */
7890 if (MUST_RESTART(flags)) {
7892 /* It's possible to write a regexp in ascii that represents Unicode
7893 codepoints outside of the byte range, such as via \x{100}. If we
7894 detect such a sequence we have to convert the entire pattern to utf8
7895 and then recompile, as our sizing calculation will have been based
7896 on 1 byte == 1 character, but we will need to use utf8 to encode
7897 at least some part of the pattern, and therefore must convert the whole
7900 if (flags & NEED_UTF8) {
7902 /* We have stored the offset of the final warning output so far.
7903 * That must be adjusted. Any variant characters between the start
7904 * of the pattern and this warning count for 2 bytes in the final,
7905 * so just add them again */
7906 if (UNLIKELY(RExC_latest_warn_offset > 0)) {
7907 RExC_latest_warn_offset +=
7908 variant_under_utf8_count((U8 *) exp, (U8 *) exp
7909 + RExC_latest_warn_offset);
7911 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
7912 pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
7913 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
7916 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
7919 if (ALL_PARENS_COUNTED) {
7920 /* Make enough room for all the known parens, and zero it */
7921 Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
7922 Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
7923 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
7925 Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
7926 Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
7928 else { /* Parse did not complete. Reinitialize the parentheses
7930 RExC_total_parens = 0;
7931 if (RExC_open_parens) {
7932 Safefree(RExC_open_parens);
7933 RExC_open_parens = NULL;
7935 if (RExC_close_parens) {
7936 Safefree(RExC_close_parens);
7937 RExC_close_parens = NULL;
7941 /* Clean up what we did in this parse */
7942 SvREFCNT_dec_NN(RExC_rx_sv);
7947 /* Here, we have successfully parsed and generated the pattern's program
7948 * for the regex engine. We are ready to finish things up and look for
7951 /* Update the string to compile, with correct modifiers, etc */
7952 set_regex_pv(pRExC_state, Rx);
7954 RExC_rx->nparens = RExC_total_parens - 1;
7956 /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
7957 if (RExC_whilem_seen > 15)
7958 RExC_whilem_seen = 15;
7961 Perl_re_printf( aTHX_
7962 "Required size %" IVdf " nodes\n", (IV)RExC_size);
7964 RExC_lastparse=NULL;
7967 #ifdef RE_TRACK_PATTERN_OFFSETS
7968 DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
7969 "%s %" UVuf " bytes for offset annotations.\n",
7970 RExC_offsets ? "Got" : "Couldn't get",
7971 (UV)((RExC_offsets[0] * 2 + 1))));
7972 DEBUG_OFFSETS_r(if (RExC_offsets) {
7973 const STRLEN len = RExC_offsets[0];
7975 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7976 Perl_re_printf( aTHX_
7977 "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
7978 for (i = 1; i <= len; i++) {
7979 if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
7980 Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
7981 (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
7983 Perl_re_printf( aTHX_ "\n");
7987 SetProgLen(RExC_rxi,RExC_size);
7990 DEBUG_DUMP_PRE_OPTIMIZE_r({
7991 SV * const sv = sv_newmortal();
7992 RXi_GET_DECL(RExC_rx, ri);
7994 Perl_re_printf( aTHX_ "Program before optimization:\n");
7996 (void)dumpuntil(RExC_rx, ri->program, ri->program + 1, NULL, NULL,
8001 Perl_re_printf( aTHX_ "Starting post parse optimization\n");
8004 /* XXXX To minimize changes to RE engine we always allocate
8005 3-units-long substrs field. */
8006 Newx(RExC_rx->substrs, 1, struct reg_substr_data);
8007 if (RExC_recurse_count) {
8008 Newx(RExC_recurse, RExC_recurse_count, regnode *);
8009 SAVEFREEPV(RExC_recurse);
8012 if (RExC_seen & REG_RECURSE_SEEN) {
8013 /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
8014 * So its 1 if there are no parens. */
8015 RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
8016 ((RExC_total_parens & 0x07) != 0);
8017 Newx(RExC_study_chunk_recursed,
8018 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8019 SAVEFREEPV(RExC_study_chunk_recursed);
8023 RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
8025 RExC_study_chunk_recursed_count= 0;
8027 Zero(RExC_rx->substrs, 1, struct reg_substr_data);
8028 if (RExC_study_chunk_recursed) {
8029 Zero(RExC_study_chunk_recursed,
8030 RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
8034 #ifdef TRIE_STUDY_OPT
8036 StructCopy(&zero_scan_data, &data, scan_data_t);
8037 copyRExC_state = RExC_state;
8040 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n"));
8042 RExC_state = copyRExC_state;
8043 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
8044 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
8046 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
8047 StructCopy(&zero_scan_data, &data, scan_data_t);
8050 StructCopy(&zero_scan_data, &data, scan_data_t);
8053 /* Dig out information for optimizations. */
8054 RExC_rx->extflags = RExC_flags; /* was pm_op */
8055 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
8058 SvUTF8_on(Rx); /* Unicode in it? */
8059 RExC_rxi->regstclass = NULL;
8060 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
8061 RExC_rx->intflags |= PREGf_NAUGHTY;
8062 scan = RExC_rxi->program + 1; /* First BRANCH. */
8064 /* testing for BRANCH here tells us whether there is "must appear"
8065 data in the pattern. If there is then we can use it for optimisations */
8066 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
8069 STRLEN longest_length[2];
8070 regnode_ssc ch_class; /* pointed to by data */
8072 SSize_t last_close = 0; /* pointed to by data */
8073 regnode *first= scan;
8074 regnode *first_next= regnext(first);
8078 * Skip introductions and multiplicators >= 1
8079 * so that we can extract the 'meat' of the pattern that must
8080 * match in the large if() sequence following.
8081 * NOTE that EXACT is NOT covered here, as it is normally
8082 * picked up by the optimiser separately.
8084 * This is unfortunate as the optimiser isnt handling lookahead
8085 * properly currently.
8088 while ((OP(first) == OPEN && (sawopen = 1)) ||
8089 /* An OR of *one* alternative - should not happen now. */
8090 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
8091 /* for now we can't handle lookbehind IFMATCH*/
8092 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
8093 (OP(first) == PLUS) ||
8094 (OP(first) == MINMOD) ||
8095 /* An {n,m} with n>0 */
8096 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
8097 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
8100 * the only op that could be a regnode is PLUS, all the rest
8101 * will be regnode_1 or regnode_2.
8103 * (yves doesn't think this is true)
8105 if (OP(first) == PLUS)
8108 if (OP(first) == MINMOD)
8110 first += regarglen[OP(first)];
8112 first = NEXTOPER(first);
8113 first_next= regnext(first);
8116 /* Starting-point info. */
8118 DEBUG_PEEP("first:", first, 0, 0);
8119 /* Ignore EXACT as we deal with it later. */
8120 if (PL_regkind[OP(first)] == EXACT) {
8121 if (! isEXACTFish(OP(first))) {
8122 NOOP; /* Empty, get anchored substr later. */
8125 RExC_rxi->regstclass = first;
8128 else if (PL_regkind[OP(first)] == TRIE &&
8129 ((reg_trie_data *)RExC_rxi->data->data[ ARG(first) ])->minlen>0)
8131 /* this can happen only on restudy */
8132 RExC_rxi->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
8135 else if (REGNODE_SIMPLE(OP(first)))
8136 RExC_rxi->regstclass = first;
8137 else if (PL_regkind[OP(first)] == BOUND ||
8138 PL_regkind[OP(first)] == NBOUND)
8139 RExC_rxi->regstclass = first;
8140 else if (PL_regkind[OP(first)] == BOL) {
8141 RExC_rx->intflags |= (OP(first) == MBOL
8144 first = NEXTOPER(first);
8147 else if (OP(first) == GPOS) {
8148 RExC_rx->intflags |= PREGf_ANCH_GPOS;
8149 first = NEXTOPER(first);
8152 else if ((!sawopen || !RExC_sawback) &&
8154 (OP(first) == STAR &&
8155 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
8156 !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
8158 /* turn .* into ^.* with an implied $*=1 */
8160 (OP(NEXTOPER(first)) == REG_ANY)
8163 RExC_rx->intflags |= (type | PREGf_IMPLICIT);
8164 first = NEXTOPER(first);
8167 if (sawplus && !sawminmod && !sawlookahead
8168 && (!sawopen || !RExC_sawback)
8169 && !pRExC_state->code_blocks) /* May examine pos and $& */
8170 /* x+ must match at the 1st pos of run of x's */
8171 RExC_rx->intflags |= PREGf_SKIP;
8173 /* Scan is after the zeroth branch, first is atomic matcher. */
8174 #ifdef TRIE_STUDY_OPT
8177 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8178 (IV)(first - scan + 1))
8182 Perl_re_printf( aTHX_ "first at %" IVdf "\n",
8183 (IV)(first - scan + 1))
8189 * If there's something expensive in the r.e., find the
8190 * longest literal string that must appear and make it the
8191 * regmust. Resolve ties in favor of later strings, since
8192 * the regstart check works with the beginning of the r.e.
8193 * and avoiding duplication strengthens checking. Not a
8194 * strong reason, but sufficient in the absence of others.
8195 * [Now we resolve ties in favor of the earlier string if
8196 * it happens that c_offset_min has been invalidated, since the
8197 * earlier string may buy us something the later one won't.]
8200 data.substrs[0].str = newSVpvs("");
8201 data.substrs[1].str = newSVpvs("");
8202 data.last_found = newSVpvs("");
8203 data.cur_is_floating = 0; /* initially any found substring is fixed */
8204 ENTER_with_name("study_chunk");
8205 SAVEFREESV(data.substrs[0].str);
8206 SAVEFREESV(data.substrs[1].str);
8207 SAVEFREESV(data.last_found);
8209 if (!RExC_rxi->regstclass) {
8210 ssc_init(pRExC_state, &ch_class);
8211 data.start_class = &ch_class;
8212 stclass_flag = SCF_DO_STCLASS_AND;
8213 } else /* XXXX Check for BOUND? */
8215 data.last_closep = &last_close;
8219 * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
8220 * (NO top level branches)
8222 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
8223 scan + RExC_size, /* Up to end */
8225 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
8226 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
8230 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
8233 if ( RExC_total_parens == 1 && !data.cur_is_floating
8234 && data.last_start_min == 0 && data.last_end > 0
8235 && !RExC_seen_zerolen
8236 && !(RExC_seen & REG_VERBARG_SEEN)
8237 && !(RExC_seen & REG_GPOS_SEEN)
8239 RExC_rx->extflags |= RXf_CHECK_ALL;
8241 scan_commit(pRExC_state, &data,&minlen, 0);
8244 /* XXX this is done in reverse order because that's the way the
8245 * code was before it was parameterised. Don't know whether it
8246 * actually needs doing in reverse order. DAPM */
8247 for (i = 1; i >= 0; i--) {
8248 longest_length[i] = CHR_SVLEN(data.substrs[i].str);
8251 && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
8252 && data.substrs[0].min_offset
8253 == data.substrs[1].min_offset
8254 && SvCUR(data.substrs[0].str)
8255 == SvCUR(data.substrs[1].str)
8257 && S_setup_longest (aTHX_ pRExC_state,
8258 &(RExC_rx->substrs->data[i]),
8262 RExC_rx->substrs->data[i].min_offset =
8263 data.substrs[i].min_offset - data.substrs[i].lookbehind;
8265 RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
8266 /* Don't offset infinity */
8267 if (data.substrs[i].max_offset < OPTIMIZE_INFTY)
8268 RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
8269 SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
8272 RExC_rx->substrs->data[i].substr = NULL;
8273 RExC_rx->substrs->data[i].utf8_substr = NULL;
8274 longest_length[i] = 0;
8278 LEAVE_with_name("study_chunk");
8280 if (RExC_rxi->regstclass
8281 && (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
8282 RExC_rxi->regstclass = NULL;
8284 if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
8285 || RExC_rx->substrs->data[0].min_offset)
8287 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8288 && is_ssc_worth_it(pRExC_state, data.start_class))
8290 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8292 ssc_finalize(pRExC_state, data.start_class);
8294 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8295 StructCopy(data.start_class,
8296 (regnode_ssc*)RExC_rxi->data->data[n],
8298 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8299 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8300 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
8301 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8302 Perl_re_printf( aTHX_
8303 "synthetic stclass \"%s\".\n",
8304 SvPVX_const(sv));});
8305 data.start_class = NULL;
8308 /* A temporary algorithm prefers floated substr to fixed one of
8309 * same length to dig more info. */
8310 i = (longest_length[0] <= longest_length[1]);
8311 RExC_rx->substrs->check_ix = i;
8312 RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
8313 RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
8314 RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
8315 RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
8316 RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
8317 if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
8318 RExC_rx->intflags |= PREGf_NOSCAN;
8320 if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
8321 RExC_rx->extflags |= RXf_USE_INTUIT;
8322 if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
8323 RExC_rx->extflags |= RXf_INTUIT_TAIL;
8326 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
8327 if ( (STRLEN)minlen < longest_length[1] )
8328 minlen= longest_length[1];
8329 if ( (STRLEN)minlen < longest_length[0] )
8330 minlen= longest_length[0];
8334 /* Several toplevels. Best we can is to set minlen. */
8336 regnode_ssc ch_class;
8337 SSize_t last_close = 0;
8339 DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n"));
8341 scan = RExC_rxi->program + 1;
8342 ssc_init(pRExC_state, &ch_class);
8343 data.start_class = &ch_class;
8344 data.last_closep = &last_close;
8348 * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
8349 * (patterns WITH top level branches)
8351 minlen = study_chunk(pRExC_state,
8352 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
8353 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
8354 ? SCF_TRIE_DOING_RESTUDY
8358 CHECK_RESTUDY_GOTO_butfirst(NOOP);
8360 RExC_rx->check_substr = NULL;
8361 RExC_rx->check_utf8 = NULL;
8362 RExC_rx->substrs->data[0].substr = NULL;
8363 RExC_rx->substrs->data[0].utf8_substr = NULL;
8364 RExC_rx->substrs->data[1].substr = NULL;
8365 RExC_rx->substrs->data[1].utf8_substr = NULL;
8367 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
8368 && is_ssc_worth_it(pRExC_state, data.start_class))
8370 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
8372 ssc_finalize(pRExC_state, data.start_class);
8374 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
8375 StructCopy(data.start_class,
8376 (regnode_ssc*)RExC_rxi->data->data[n],
8378 RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
8379 RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
8380 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
8381 regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
8382 Perl_re_printf( aTHX_
8383 "synthetic stclass \"%s\".\n",
8384 SvPVX_const(sv));});
8385 data.start_class = NULL;
8389 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
8390 RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
8391 RExC_rx->maxlen = REG_INFTY;
8394 RExC_rx->maxlen = RExC_maxlen;
8397 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
8398 the "real" pattern. */
8400 Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
8401 (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
8403 RExC_rx->minlenret = minlen;
8404 if (RExC_rx->minlen < minlen)
8405 RExC_rx->minlen = minlen;
8407 if (RExC_seen & REG_RECURSE_SEEN ) {
8408 RExC_rx->intflags |= PREGf_RECURSE_SEEN;
8409 Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
8411 if (RExC_seen & REG_GPOS_SEEN)
8412 RExC_rx->intflags |= PREGf_GPOS_SEEN;
8413 if (RExC_seen & REG_LOOKBEHIND_SEEN)
8414 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
8416 if (pRExC_state->code_blocks)
8417 RExC_rx->extflags |= RXf_EVAL_SEEN;
8418 if (RExC_seen & REG_VERBARG_SEEN)
8420 RExC_rx->intflags |= PREGf_VERBARG_SEEN;
8421 RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
8423 if (RExC_seen & REG_CUTGROUP_SEEN)
8424 RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
8425 if (pm_flags & PMf_USE_RE_EVAL)
8426 RExC_rx->intflags |= PREGf_USE_RE_EVAL;
8427 if (RExC_paren_names)
8428 RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
8430 RXp_PAREN_NAMES(RExC_rx) = NULL;
8432 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
8433 * so it can be used in pp.c */
8434 if (RExC_rx->intflags & PREGf_ANCH)
8435 RExC_rx->extflags |= RXf_IS_ANCHORED;
8439 /* this is used to identify "special" patterns that might result
8440 * in Perl NOT calling the regex engine and instead doing the match "itself",
8441 * particularly special cases in split//. By having the regex compiler
8442 * do this pattern matching at a regop level (instead of by inspecting the pattern)
8443 * we avoid weird issues with equivalent patterns resulting in different behavior,
8444 * AND we allow non Perl engines to get the same optimizations by the setting the
8445 * flags appropriately - Yves */
8446 regnode *first = RExC_rxi->program + 1;
8448 regnode *next = regnext(first);
8451 if (PL_regkind[fop] == NOTHING && nop == END)
8452 RExC_rx->extflags |= RXf_NULL;
8453 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
8454 /* when fop is SBOL first->flags will be true only when it was
8455 * produced by parsing /\A/, and not when parsing /^/. This is
8456 * very important for the split code as there we want to
8457 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
8458 * See rt #122761 for more details. -- Yves */
8459 RExC_rx->extflags |= RXf_START_ONLY;
8460 else if (fop == PLUS
8461 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
8463 RExC_rx->extflags |= RXf_WHITE;
8464 else if ( RExC_rx->extflags & RXf_SPLIT
8465 && (PL_regkind[fop] == EXACT && ! isEXACTFish(fop))
8466 && STR_LEN(first) == 1
8467 && *(STRING(first)) == ' '
8469 RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
8473 if (RExC_contains_locale) {
8474 RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
8478 if (RExC_paren_names) {
8479 RExC_rxi->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
8480 RExC_rxi->data->data[RExC_rxi->name_list_idx]
8481 = (void*)SvREFCNT_inc(RExC_paren_name_list);
8484 RExC_rxi->name_list_idx = 0;
8486 while ( RExC_recurse_count > 0 ) {
8487 const regnode *scan = RExC_recurse[ --RExC_recurse_count ];
8489 * This data structure is set up in study_chunk() and is used
8490 * to calculate the distance between a GOSUB regopcode and
8491 * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's)
8494 * If for some reason someone writes code that optimises
8495 * away a GOSUB opcode then the assert should be changed to
8496 * an if(scan) to guard the ARG2L_SET() - Yves
8499 assert(scan && OP(scan) == GOSUB);
8500 ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
8503 Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
8504 /* assume we don't need to swap parens around before we match */
8506 Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
8507 (unsigned long)RExC_study_chunk_recursed_count);
8511 Perl_re_printf( aTHX_ "Final program:\n");
8515 if (RExC_open_parens) {
8516 Safefree(RExC_open_parens);
8517 RExC_open_parens = NULL;
8519 if (RExC_close_parens) {
8520 Safefree(RExC_close_parens);
8521 RExC_close_parens = NULL;
8525 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
8526 * by setting the regexp SV to readonly-only instead. If the
8527 * pattern's been recompiled, the USEDness should remain. */
8528 if (old_re && SvREADONLY(old_re))
8536 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
8539 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
8541 PERL_UNUSED_ARG(value);
8543 if (flags & RXapif_FETCH) {
8544 return reg_named_buff_fetch(rx, key, flags);
8545 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
8546 Perl_croak_no_modify();
8548 } else if (flags & RXapif_EXISTS) {
8549 return reg_named_buff_exists(rx, key, flags)
8552 } else if (flags & RXapif_REGNAMES) {
8553 return reg_named_buff_all(rx, flags);
8554 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
8555 return reg_named_buff_scalar(rx, flags);
8557 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
8563 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
8566 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
8567 PERL_UNUSED_ARG(lastkey);
8569 if (flags & RXapif_FIRSTKEY)
8570 return reg_named_buff_firstkey(rx, flags);
8571 else if (flags & RXapif_NEXTKEY)
8572 return reg_named_buff_nextkey(rx, flags);
8574 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
8581 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
8585 struct regexp *const rx = ReANY(r);
8587 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
8589 if (rx && RXp_PAREN_NAMES(rx)) {
8590 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
8593 SV* sv_dat=HeVAL(he_str);
8594 I32 *nums=(I32*)SvPVX(sv_dat);
8595 AV * const retarray = (flags & RXapif_ALL) ? newAV() : NULL;
8596 for ( i=0; i<SvIVX(sv_dat); i++ ) {
8597 if ((I32)(rx->nparens) >= nums[i]
8598 && rx->offs[nums[i]].start != -1
8599 && rx->offs[nums[i]].end != -1)
8602 CALLREG_NUMBUF_FETCH(r, nums[i], ret);
8607 ret = newSVsv(&PL_sv_undef);
8610 av_push(retarray, ret);
8613 return newRV_noinc(MUTABLE_SV(retarray));
8620 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
8623 struct regexp *const rx = ReANY(r);
8625 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
8627 if (rx && RXp_PAREN_NAMES(rx)) {
8628 if (flags & RXapif_ALL) {
8629 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
8631 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
8633 SvREFCNT_dec_NN(sv);
8645 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
8647 struct regexp *const rx = ReANY(r);
8649 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
8651 if ( rx && RXp_PAREN_NAMES(rx) ) {
8652 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
8654 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
8661 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
8663 struct regexp *const rx = ReANY(r);
8664 DECLARE_AND_GET_RE_DEBUG_FLAGS;
8666 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
8668 if (rx && RXp_PAREN_NAMES(rx)) {
8669 HV *hv = RXp_PAREN_NAMES(rx);
8671 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8674 SV* sv_dat = HeVAL(temphe);
8675 I32 *nums = (I32*)SvPVX(sv_dat);
8676 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8677 if ((I32)(rx->lastparen) >= nums[i] &&
8678 rx->offs[nums[i]].start != -1 &&
8679 rx->offs[nums[i]].end != -1)
8685 if (parno || flags & RXapif_ALL) {
8686 return newSVhek(HeKEY_hek(temphe));
8694 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
8699 struct regexp *const rx = ReANY(r);
8701 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
8703 if (rx && RXp_PAREN_NAMES(rx)) {
8704 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
8705 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
8706 } else if (flags & RXapif_ONE) {
8707 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
8708 av = MUTABLE_AV(SvRV(ret));
8709 length = av_count(av);
8710 SvREFCNT_dec_NN(ret);
8711 return newSViv(length);
8713 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
8718 return &PL_sv_undef;
8722 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
8724 struct regexp *const rx = ReANY(r);
8727 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
8729 if (rx && RXp_PAREN_NAMES(rx)) {
8730 HV *hv= RXp_PAREN_NAMES(rx);
8732 (void)hv_iterinit(hv);
8733 while ( (temphe = hv_iternext_flags(hv, 0)) ) {
8736 SV* sv_dat = HeVAL(temphe);
8737 I32 *nums = (I32*)SvPVX(sv_dat);
8738 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
8739 if ((I32)(rx->lastparen) >= nums[i] &&
8740 rx->offs[nums[i]].start != -1 &&
8741 rx->offs[nums[i]].end != -1)
8747 if (parno || flags & RXapif_ALL) {
8748 av_push(av, newSVhek(HeKEY_hek(temphe)));
8753 return newRV_noinc(MUTABLE_SV(av));
8757 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
8760 struct regexp *const rx = ReANY(r);
8766 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
8768 if ( n == RX_BUFF_IDX_CARET_PREMATCH
8769 || n == RX_BUFF_IDX_CARET_FULLMATCH
8770 || n == RX_BUFF_IDX_CARET_POSTMATCH
8773 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8775 /* on something like
8778 * the KEEPCOPY is set on the PMOP rather than the regex */
8779 if (PL_curpm && r == PM_GETRE(PL_curpm))
8780 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8789 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
8790 /* no need to distinguish between them any more */
8791 n = RX_BUFF_IDX_FULLMATCH;
8793 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
8794 && rx->offs[0].start != -1)
8796 /* $`, ${^PREMATCH} */
8797 i = rx->offs[0].start;
8801 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
8802 && rx->offs[0].end != -1)
8804 /* $', ${^POSTMATCH} */
8805 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
8806 i = rx->sublen + rx->suboffset - rx->offs[0].end;
8809 if (inRANGE(n, 0, (I32)rx->nparens) &&
8810 (s1 = rx->offs[n].start) != -1 &&
8811 (t1 = rx->offs[n].end) != -1)
8813 /* $&, ${^MATCH}, $1 ... */
8815 s = rx->subbeg + s1 - rx->suboffset;
8820 assert(s >= rx->subbeg);
8821 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
8823 #ifdef NO_TAINT_SUPPORT
8824 sv_setpvn(sv, s, i);
8826 const int oldtainted = TAINT_get;
8828 sv_setpvn(sv, s, i);
8829 TAINT_set(oldtainted);
8831 if (RXp_MATCH_UTF8(rx))
8836 if (RXp_MATCH_TAINTED(rx)) {
8837 if (SvTYPE(sv) >= SVt_PVMG) {
8838 MAGIC* const mg = SvMAGIC(sv);
8841 SvMAGIC_set(sv, mg->mg_moremagic);
8843 if ((mgt = SvMAGIC(sv))) {
8844 mg->mg_moremagic = mgt;
8845 SvMAGIC_set(sv, mg);
8862 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
8863 SV const * const value)
8865 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
8867 PERL_UNUSED_ARG(rx);
8868 PERL_UNUSED_ARG(paren);
8869 PERL_UNUSED_ARG(value);
8872 Perl_croak_no_modify();
8876 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
8879 struct regexp *const rx = ReANY(r);
8883 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
8885 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
8886 || paren == RX_BUFF_IDX_CARET_FULLMATCH
8887 || paren == RX_BUFF_IDX_CARET_POSTMATCH
8890 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
8892 /* on something like
8895 * the KEEPCOPY is set on the PMOP rather than the regex */
8896 if (PL_curpm && r == PM_GETRE(PL_curpm))
8897 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
8903 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
8905 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
8906 case RX_BUFF_IDX_PREMATCH: /* $` */
8907 if (rx->offs[0].start != -1) {
8908 i = rx->offs[0].start;
8917 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
8918 case RX_BUFF_IDX_POSTMATCH: /* $' */
8919 if (rx->offs[0].end != -1) {
8920 i = rx->sublen - rx->offs[0].end;
8922 s1 = rx->offs[0].end;
8929 default: /* $& / ${^MATCH}, $1, $2, ... */
8930 if (paren <= (I32)rx->nparens &&
8931 (s1 = rx->offs[paren].start) != -1 &&
8932 (t1 = rx->offs[paren].end) != -1)
8938 if (ckWARN(WARN_UNINITIALIZED))
8939 report_uninit((const SV *)sv);
8944 if (i > 0 && RXp_MATCH_UTF8(rx)) {
8945 const char * const s = rx->subbeg - rx->suboffset + s1;
8950 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
8957 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
8959 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
8960 PERL_UNUSED_ARG(rx);
8964 return newSVpvs("Regexp");
8967 /* Scans the name of a named buffer from the pattern.
8968 * If flags is REG_RSN_RETURN_NULL returns null.
8969 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
8970 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
8971 * to the parsed name as looked up in the RExC_paren_names hash.
8972 * If there is an error throws a vFAIL().. type exception.
8975 #define REG_RSN_RETURN_NULL 0
8976 #define REG_RSN_RETURN_NAME 1
8977 #define REG_RSN_RETURN_DATA 2
8980 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
8982 char *name_start = RExC_parse;
8985 PERL_ARGS_ASSERT_REG_SCAN_NAME;
8987 assert (RExC_parse <= RExC_end);
8988 if (RExC_parse == RExC_end) NOOP;
8989 else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
8990 /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by
8991 * using do...while */
8994 RExC_parse += UTF8SKIP(RExC_parse);
8995 } while ( RExC_parse < RExC_end
8996 && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
9000 } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
9002 RExC_parse++; /* so the <- from the vFAIL is after the offending
9004 vFAIL("Group name must start with a non-digit word character");
9006 sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
9007 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
9008 if ( flags == REG_RSN_RETURN_NAME)
9010 else if (flags==REG_RSN_RETURN_DATA) {
9013 if ( ! sv_name ) /* should not happen*/
9014 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
9015 if (RExC_paren_names)
9016 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
9018 sv_dat = HeVAL(he_str);
9019 if ( ! sv_dat ) { /* Didn't find group */
9021 /* It might be a forward reference; we can't fail until we
9022 * know, by completing the parse to get all the groups, and
9024 if (ALL_PARENS_COUNTED) {
9025 vFAIL("Reference to nonexistent named group");
9028 REQUIRE_PARENS_PASS;
9034 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
9035 (unsigned long) flags);
9038 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
9039 if (RExC_lastparse!=RExC_parse) { \
9040 Perl_re_printf( aTHX_ "%s", \
9041 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
9042 RExC_end - RExC_parse, 16, \
9044 PERL_PV_ESCAPE_UNI_DETECT | \
9045 PERL_PV_PRETTY_ELLIPSES | \
9046 PERL_PV_PRETTY_LTGT | \
9047 PERL_PV_ESCAPE_RE | \
9048 PERL_PV_PRETTY_EXACTSIZE \
9052 Perl_re_printf( aTHX_ "%16s",""); \
9054 if (RExC_lastnum!=RExC_emit) \
9055 Perl_re_printf( aTHX_ "|%4zu", RExC_emit); \
9057 Perl_re_printf( aTHX_ "|%4s",""); \
9058 Perl_re_printf( aTHX_ "|%*s%-4s", \
9059 (int)((depth*2)), "", \
9062 RExC_lastnum=RExC_emit; \
9063 RExC_lastparse=RExC_parse; \
9068 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
9069 DEBUG_PARSE_MSG((funcname)); \
9070 Perl_re_printf( aTHX_ "%4s","\n"); \
9072 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\
9073 DEBUG_PARSE_MSG((funcname)); \
9074 Perl_re_printf( aTHX_ fmt "\n",args); \
9077 /* This section of code defines the inversion list object and its methods. The
9078 * interfaces are highly subject to change, so as much as possible is static to
9079 * this file. An inversion list is here implemented as a malloc'd C UV array
9080 * as an SVt_INVLIST scalar.
9082 * An inversion list for Unicode is an array of code points, sorted by ordinal
9083 * number. Each element gives the code point that begins a range that extends
9084 * up-to but not including the code point given by the next element. The final
9085 * element gives the first code point of a range that extends to the platform's
9086 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
9087 * ...) give ranges whose code points are all in the inversion list. We say
9088 * that those ranges are in the set. The odd-numbered elements give ranges
9089 * whose code points are not in the inversion list, and hence not in the set.
9090 * Thus, element [0] is the first code point in the list. Element [1]
9091 * is the first code point beyond that not in the list; and element [2] is the
9092 * first code point beyond that that is in the list. In other words, the first
9093 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
9094 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
9095 * all code points in that range are not in the inversion list. The third
9096 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
9097 * list, and so forth. Thus every element whose index is divisible by two
9098 * gives the beginning of a range that is in the list, and every element whose
9099 * index is not divisible by two gives the beginning of a range not in the
9100 * list. If the final element's index is divisible by two, the inversion list
9101 * extends to the platform's infinity; otherwise the highest code point in the
9102 * inversion list is the contents of that element minus 1.
9104 * A range that contains just a single code point N will look like
9106 * invlist[i+1] == N+1
9108 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
9109 * impossible to represent, so element [i+1] is omitted. The single element
9111 * invlist[0] == UV_MAX
9112 * contains just UV_MAX, but is interpreted as matching to infinity.
9114 * Taking the complement (inverting) an inversion list is quite simple, if the
9115 * first element is 0, remove it; otherwise add a 0 element at the beginning.
9116 * This implementation reserves an element at the beginning of each inversion
9117 * list to always contain 0; there is an additional flag in the header which
9118 * indicates if the list begins at the 0, or is offset to begin at the next
9119 * element. This means that the inversion list can be inverted without any
9120 * copying; just flip the flag.
9122 * More about inversion lists can be found in "Unicode Demystified"
9123 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
9125 * The inversion list data structure is currently implemented as an SV pointing
9126 * to an array of UVs that the SV thinks are bytes. This allows us to have an
9127 * array of UV whose memory management is automatically handled by the existing
9128 * facilities for SV's.
9130 * Some of the methods should always be private to the implementation, and some
9131 * should eventually be made public */
9133 /* The header definitions are in F<invlist_inline.h> */
9135 #ifndef PERL_IN_XSUB_RE
9137 PERL_STATIC_INLINE UV*
9138 S__invlist_array_init(SV* const invlist, const bool will_have_0)
9140 /* Returns a pointer to the first element in the inversion list's array.
9141 * This is called upon initialization of an inversion list. Where the
9142 * array begins depends on whether the list has the code point U+0000 in it
9143 * or not. The other parameter tells it whether the code that follows this
9144 * call is about to put a 0 in the inversion list or not. The first
9145 * element is either the element reserved for 0, if TRUE, or the element
9146 * after it, if FALSE */
9148 bool* offset = get_invlist_offset_addr(invlist);
9149 UV* zero_addr = (UV *) SvPVX(invlist);
9151 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
9154 assert(! _invlist_len(invlist));
9158 /* 1^1 = 0; 1^0 = 1 */
9159 *offset = 1 ^ will_have_0;
9160 return zero_addr + *offset;
9164 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
9166 /* Replaces the inversion list in 'dest' with the one from 'src'. It
9167 * steals the list from 'src', so 'src' is made to have a NULL list. This
9168 * is similar to what SvSetMagicSV() would do, if it were implemented on
9169 * inversion lists, though this routine avoids a copy */
9171 const UV src_len = _invlist_len(src);
9172 const bool src_offset = *get_invlist_offset_addr(src);
9173 const STRLEN src_byte_len = SvLEN(src);
9174 char * array = SvPVX(src);
9176 const int oldtainted = TAINT_get;
9178 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
9180 assert(is_invlist(src));
9181 assert(is_invlist(dest));
9182 assert(! invlist_is_iterating(src));
9183 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
9185 /* Make sure it ends in the right place with a NUL, as our inversion list
9186 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
9188 array[src_byte_len - 1] = '\0';
9190 TAINT_NOT; /* Otherwise it breaks */
9191 sv_usepvn_flags(dest,
9195 /* This flag is documented to cause a copy to be avoided */
9196 SV_HAS_TRAILING_NUL);
9197 TAINT_set(oldtainted);
9202 /* Finish up copying over the other fields in an inversion list */
9203 *get_invlist_offset_addr(dest) = src_offset;
9204 invlist_set_len(dest, src_len, src_offset);
9205 *get_invlist_previous_index_addr(dest) = 0;
9206 invlist_iterfinish(dest);
9209 PERL_STATIC_INLINE IV*
9210 S_get_invlist_previous_index_addr(SV* invlist)
9212 /* Return the address of the IV that is reserved to hold the cached index
9214 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
9216 assert(is_invlist(invlist));
9218 return &(((XINVLIST*) SvANY(invlist))->prev_index);
9221 PERL_STATIC_INLINE IV
9222 S_invlist_previous_index(SV* const invlist)
9224 /* Returns cached index of previous search */
9226 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
9228 return *get_invlist_previous_index_addr(invlist);
9231 PERL_STATIC_INLINE void
9232 S_invlist_set_previous_index(SV* const invlist, const IV index)
9234 /* Caches <index> for later retrieval */
9236 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
9238 assert(index == 0 || index < (int) _invlist_len(invlist));
9240 *get_invlist_previous_index_addr(invlist) = index;
9243 PERL_STATIC_INLINE void
9244 S_invlist_trim(SV* invlist)
9246 /* Free the not currently-being-used space in an inversion list */
9248 /* But don't free up the space needed for the 0 UV that is always at the
9249 * beginning of the list, nor the trailing NUL */
9250 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
9252 PERL_ARGS_ASSERT_INVLIST_TRIM;
9254 assert(is_invlist(invlist));
9256 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
9259 PERL_STATIC_INLINE void
9260 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
9262 PERL_ARGS_ASSERT_INVLIST_CLEAR;
9264 assert(is_invlist(invlist));
9266 invlist_set_len(invlist, 0, 0);
9267 invlist_trim(invlist);
9270 #endif /* ifndef PERL_IN_XSUB_RE */
9272 PERL_STATIC_INLINE bool
9273 S_invlist_is_iterating(SV* const invlist)
9275 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9277 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9280 #ifndef PERL_IN_XSUB_RE
9282 PERL_STATIC_INLINE UV
9283 S_invlist_max(SV* const invlist)
9285 /* Returns the maximum number of elements storable in the inversion list's
9286 * array, without having to realloc() */
9288 PERL_ARGS_ASSERT_INVLIST_MAX;
9290 assert(is_invlist(invlist));
9292 /* Assumes worst case, in which the 0 element is not counted in the
9293 * inversion list, so subtracts 1 for that */
9294 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
9295 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
9296 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
9300 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
9302 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
9304 /* First 1 is in case the zero element isn't in the list; second 1 is for
9306 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
9307 invlist_set_len(invlist, 0, 0);
9309 /* Force iterinit() to be used to get iteration to work */
9310 invlist_iterfinish(invlist);
9312 *get_invlist_previous_index_addr(invlist) = 0;
9313 SvPOK_on(invlist); /* This allows B to extract the PV */
9317 Perl__new_invlist(pTHX_ IV initial_size)
9320 /* Return a pointer to a newly constructed inversion list, with enough
9321 * space to store 'initial_size' elements. If that number is negative, a
9322 * system default is used instead */
9326 if (initial_size < 0) {
9330 new_list = newSV_type(SVt_INVLIST);
9331 initialize_invlist_guts(new_list, initial_size);
9337 Perl__new_invlist_C_array(pTHX_ const UV* const list)
9339 /* Return a pointer to a newly constructed inversion list, initialized to
9340 * point to <list>, which has to be in the exact correct inversion list
9341 * form, including internal fields. Thus this is a dangerous routine that
9342 * should not be used in the wrong hands. The passed in 'list' contains
9343 * several header fields at the beginning that are not part of the
9344 * inversion list body proper */
9346 const STRLEN length = (STRLEN) list[0];
9347 const UV version_id = list[1];
9348 const bool offset = cBOOL(list[2]);
9349 #define HEADER_LENGTH 3
9350 /* If any of the above changes in any way, you must change HEADER_LENGTH
9351 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
9352 * perl -E 'say int(rand 2**31-1)'
9354 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
9355 data structure type, so that one being
9356 passed in can be validated to be an
9357 inversion list of the correct vintage.
9360 SV* invlist = newSV_type(SVt_INVLIST);
9362 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
9364 if (version_id != INVLIST_VERSION_ID) {
9365 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
9368 /* The generated array passed in includes header elements that aren't part
9369 * of the list proper, so start it just after them */
9370 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
9372 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
9373 shouldn't touch it */
9375 *(get_invlist_offset_addr(invlist)) = offset;
9377 /* The 'length' passed to us is the physical number of elements in the
9378 * inversion list. But if there is an offset the logical number is one
9380 invlist_set_len(invlist, length - offset, offset);
9382 invlist_set_previous_index(invlist, 0);
9384 /* Initialize the iteration pointer. */
9385 invlist_iterfinish(invlist);
9387 SvREADONLY_on(invlist);
9394 S__append_range_to_invlist(pTHX_ SV* const invlist,
9395 const UV start, const UV end)
9397 /* Subject to change or removal. Append the range from 'start' to 'end' at
9398 * the end of the inversion list. The range must be above any existing
9402 UV max = invlist_max(invlist);
9403 UV len = _invlist_len(invlist);
9406 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
9408 if (len == 0) { /* Empty lists must be initialized */
9409 offset = start != 0;
9410 array = _invlist_array_init(invlist, ! offset);
9413 /* Here, the existing list is non-empty. The current max entry in the
9414 * list is generally the first value not in the set, except when the
9415 * set extends to the end of permissible values, in which case it is
9416 * the first entry in that final set, and so this call is an attempt to
9417 * append out-of-order */
9419 UV final_element = len - 1;
9420 array = invlist_array(invlist);
9421 if ( array[final_element] > start
9422 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
9424 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
9425 array[final_element], start,
9426 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
9429 /* Here, it is a legal append. If the new range begins 1 above the end
9430 * of the range below it, it is extending the range below it, so the
9431 * new first value not in the set is one greater than the newly
9432 * extended range. */
9433 offset = *get_invlist_offset_addr(invlist);
9434 if (array[final_element] == start) {
9435 if (end != UV_MAX) {
9436 array[final_element] = end + 1;
9439 /* But if the end is the maximum representable on the machine,
9440 * assume that infinity was actually what was meant. Just let
9441 * the range that this would extend to have no end */
9442 invlist_set_len(invlist, len - 1, offset);
9448 /* Here the new range doesn't extend any existing set. Add it */
9450 len += 2; /* Includes an element each for the start and end of range */
9452 /* If wll overflow the existing space, extend, which may cause the array to
9455 invlist_extend(invlist, len);
9457 /* Have to set len here to avoid assert failure in invlist_array() */
9458 invlist_set_len(invlist, len, offset);
9460 array = invlist_array(invlist);
9463 invlist_set_len(invlist, len, offset);
9466 /* The next item on the list starts the range, the one after that is
9467 * one past the new range. */
9468 array[len - 2] = start;
9469 if (end != UV_MAX) {
9470 array[len - 1] = end + 1;
9473 /* But if the end is the maximum representable on the machine, just let
9474 * the range have no end */
9475 invlist_set_len(invlist, len - 1, offset);
9480 Perl__invlist_search(SV* const invlist, const UV cp)
9482 /* Searches the inversion list for the entry that contains the input code
9483 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
9484 * return value is the index into the list's array of the range that
9485 * contains <cp>, that is, 'i' such that
9486 * array[i] <= cp < array[i+1]
9491 IV high = _invlist_len(invlist);
9492 const IV highest_element = high - 1;
9495 PERL_ARGS_ASSERT__INVLIST_SEARCH;
9497 /* If list is empty, return failure. */
9502 /* (We can't get the array unless we know the list is non-empty) */
9503 array = invlist_array(invlist);
9505 mid = invlist_previous_index(invlist);
9507 if (mid > highest_element) {
9508 mid = highest_element;
9511 /* <mid> contains the cache of the result of the previous call to this
9512 * function (0 the first time). See if this call is for the same result,
9513 * or if it is for mid-1. This is under the theory that calls to this
9514 * function will often be for related code points that are near each other.
9515 * And benchmarks show that caching gives better results. We also test
9516 * here if the code point is within the bounds of the list. These tests
9517 * replace others that would have had to be made anyway to make sure that
9518 * the array bounds were not exceeded, and these give us extra information
9519 * at the same time */
9520 if (cp >= array[mid]) {
9521 if (cp >= array[highest_element]) {
9522 return highest_element;
9525 /* Here, array[mid] <= cp < array[highest_element]. This means that
9526 * the final element is not the answer, so can exclude it; it also
9527 * means that <mid> is not the final element, so can refer to 'mid + 1'
9529 if (cp < array[mid + 1]) {
9535 else { /* cp < aray[mid] */
9536 if (cp < array[0]) { /* Fail if outside the array */
9540 if (cp >= array[mid - 1]) {
9545 /* Binary search. What we are looking for is <i> such that
9546 * array[i] <= cp < array[i+1]
9547 * The loop below converges on the i+1. Note that there may not be an
9548 * (i+1)th element in the array, and things work nonetheless */
9549 while (low < high) {
9550 mid = (low + high) / 2;
9551 assert(mid <= highest_element);
9552 if (array[mid] <= cp) { /* cp >= array[mid] */
9555 /* We could do this extra test to exit the loop early.
9556 if (cp < array[low]) {
9561 else { /* cp < array[mid] */
9568 invlist_set_previous_index(invlist, high);
9573 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9574 const bool complement_b, SV** output)
9576 /* Take the union of two inversion lists and point '*output' to it. On
9577 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9578 * even 'a' or 'b'). If to an inversion list, the contents of the original
9579 * list will be replaced by the union. The first list, 'a', may be
9580 * NULL, in which case a copy of the second list is placed in '*output'.
9581 * If 'complement_b' is TRUE, the union is taken of the complement
9582 * (inversion) of 'b' instead of b itself.
9584 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9585 * Richard Gillam, published by Addison-Wesley, and explained at some
9586 * length there. The preface says to incorporate its examples into your
9587 * code at your own risk.
9589 * The algorithm is like a merge sort. */
9591 const UV* array_a; /* a's array */
9593 UV len_a; /* length of a's array */
9596 SV* u; /* the resulting union */
9600 UV i_a = 0; /* current index into a's array */
9604 /* running count, as explained in the algorithm source book; items are
9605 * stopped accumulating and are output when the count changes to/from 0.
9606 * The count is incremented when we start a range that's in an input's set,
9607 * and decremented when we start a range that's not in a set. So this
9608 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
9609 * and hence nothing goes into the union; 1, just one of the inputs is in
9610 * its set (and its current range gets added to the union); and 2 when both
9611 * inputs are in their sets. */
9614 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
9616 assert(*output == NULL || is_invlist(*output));
9618 len_b = _invlist_len(b);
9621 /* Here, 'b' is empty, hence it's complement is all possible code
9622 * points. So if the union includes the complement of 'b', it includes
9623 * everything, and we need not even look at 'a'. It's easiest to
9624 * create a new inversion list that matches everything. */
9626 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
9628 if (*output == NULL) { /* If the output didn't exist, just point it
9630 *output = everything;
9632 else { /* Otherwise, replace its contents with the new list */
9633 invlist_replace_list_destroys_src(*output, everything);
9634 SvREFCNT_dec_NN(everything);
9640 /* Here, we don't want the complement of 'b', and since 'b' is empty,
9641 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
9642 * output will be empty */
9644 if (a == NULL || _invlist_len(a) == 0) {
9645 if (*output == NULL) {
9646 *output = _new_invlist(0);
9649 invlist_clear(*output);
9654 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
9655 * union. We can just return a copy of 'a' if '*output' doesn't point
9656 * to an existing list */
9657 if (*output == NULL) {
9658 *output = invlist_clone(a, NULL);
9662 /* If the output is to overwrite 'a', we have a no-op, as it's
9668 /* Here, '*output' is to be overwritten by 'a' */
9669 u = invlist_clone(a, NULL);
9670 invlist_replace_list_destroys_src(*output, u);
9676 /* Here 'b' is not empty. See about 'a' */
9678 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
9680 /* Here, 'a' is empty (and b is not). That means the union will come
9681 * entirely from 'b'. If '*output' is NULL, we can directly return a
9682 * clone of 'b'. Otherwise, we replace the contents of '*output' with
9685 SV ** dest = (*output == NULL) ? output : &u;
9686 *dest = invlist_clone(b, NULL);
9688 _invlist_invert(*dest);
9692 invlist_replace_list_destroys_src(*output, u);
9699 /* Here both lists exist and are non-empty */
9700 array_a = invlist_array(a);
9701 array_b = invlist_array(b);
9703 /* If are to take the union of 'a' with the complement of b, set it
9704 * up so are looking at b's complement. */
9707 /* To complement, we invert: if the first element is 0, remove it. To
9708 * do this, we just pretend the array starts one later */
9709 if (array_b[0] == 0) {
9715 /* But if the first element is not zero, we pretend the list starts
9716 * at the 0 that is always stored immediately before the array. */
9722 /* Size the union for the worst case: that the sets are completely
9724 u = _new_invlist(len_a + len_b);
9726 /* Will contain U+0000 if either component does */
9727 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
9728 || (len_b > 0 && array_b[0] == 0));
9730 /* Go through each input list item by item, stopping when have exhausted
9732 while (i_a < len_a && i_b < len_b) {
9733 UV cp; /* The element to potentially add to the union's array */
9734 bool cp_in_set; /* is it in the input list's set or not */
9736 /* We need to take one or the other of the two inputs for the union.
9737 * Since we are merging two sorted lists, we take the smaller of the
9738 * next items. In case of a tie, we take first the one that is in its
9739 * set. If we first took the one not in its set, it would decrement
9740 * the count, possibly to 0 which would cause it to be output as ending
9741 * the range, and the next time through we would take the same number,
9742 * and output it again as beginning the next range. By doing it the
9743 * opposite way, there is no possibility that the count will be
9744 * momentarily decremented to 0, and thus the two adjoining ranges will
9745 * be seamlessly merged. (In a tie and both are in the set or both not
9746 * in the set, it doesn't matter which we take first.) */
9747 if ( array_a[i_a] < array_b[i_b]
9748 || ( array_a[i_a] == array_b[i_b]
9749 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9751 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9752 cp = array_a[i_a++];
9755 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9756 cp = array_b[i_b++];
9759 /* Here, have chosen which of the two inputs to look at. Only output
9760 * if the running count changes to/from 0, which marks the
9761 * beginning/end of a range that's in the set */
9764 array_u[i_u++] = cp;
9771 array_u[i_u++] = cp;
9777 /* The loop above increments the index into exactly one of the input lists
9778 * each iteration, and ends when either index gets to its list end. That
9779 * means the other index is lower than its end, and so something is
9780 * remaining in that one. We decrement 'count', as explained below, if
9781 * that list is in its set. (i_a and i_b each currently index the element
9782 * beyond the one we care about.) */
9783 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
9784 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
9789 /* Above we decremented 'count' if the list that had unexamined elements in
9790 * it was in its set. This has made it so that 'count' being non-zero
9791 * means there isn't anything left to output; and 'count' equal to 0 means
9792 * that what is left to output is precisely that which is left in the
9793 * non-exhausted input list.
9795 * To see why, note first that the exhausted input obviously has nothing
9796 * left to add to the union. If it was in its set at its end, that means
9797 * the set extends from here to the platform's infinity, and hence so does
9798 * the union and the non-exhausted set is irrelevant. The exhausted set
9799 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
9800 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
9801 * 'count' remains at 1. This is consistent with the decremented 'count'
9802 * != 0 meaning there's nothing left to add to the union.
9804 * But if the exhausted input wasn't in its set, it contributed 0 to
9805 * 'count', and the rest of the union will be whatever the other input is.
9806 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
9807 * otherwise it gets decremented to 0. This is consistent with 'count'
9808 * == 0 meaning the remainder of the union is whatever is left in the
9809 * non-exhausted list. */
9814 IV copy_count = len_a - i_a;
9815 if (copy_count > 0) { /* The non-exhausted input is 'a' */
9816 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
9818 else { /* The non-exhausted input is b */
9819 copy_count = len_b - i_b;
9820 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
9822 len_u = i_u + copy_count;
9825 /* Set the result to the final length, which can change the pointer to
9826 * array_u, so re-find it. (Note that it is unlikely that this will
9827 * change, as we are shrinking the space, not enlarging it) */
9828 if (len_u != _invlist_len(u)) {
9829 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
9831 array_u = invlist_array(u);
9834 if (*output == NULL) { /* Simply return the new inversion list */
9838 /* Otherwise, overwrite the inversion list that was in '*output'. We
9839 * could instead free '*output', and then set it to 'u', but experience
9840 * has shown [perl #127392] that if the input is a mortal, we can get a
9841 * huge build-up of these during regex compilation before they get
9843 invlist_replace_list_destroys_src(*output, u);
9851 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
9852 const bool complement_b, SV** i)
9854 /* Take the intersection of two inversion lists and point '*i' to it. On
9855 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
9856 * even 'a' or 'b'). If to an inversion list, the contents of the original
9857 * list will be replaced by the intersection. The first list, 'a', may be
9858 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
9859 * TRUE, the result will be the intersection of 'a' and the complement (or
9860 * inversion) of 'b' instead of 'b' directly.
9862 * The basis for this comes from "Unicode Demystified" Chapter 13 by
9863 * Richard Gillam, published by Addison-Wesley, and explained at some
9864 * length there. The preface says to incorporate its examples into your
9865 * code at your own risk. In fact, it had bugs
9867 * The algorithm is like a merge sort, and is essentially the same as the
9871 const UV* array_a; /* a's array */
9873 UV len_a; /* length of a's array */
9876 SV* r; /* the resulting intersection */
9880 UV i_a = 0; /* current index into a's array */
9884 /* running count of how many of the two inputs are postitioned at ranges
9885 * that are in their sets. As explained in the algorithm source book,
9886 * items are stopped accumulating and are output when the count changes
9887 * to/from 2. The count is incremented when we start a range that's in an
9888 * input's set, and decremented when we start a range that's not in a set.
9889 * Only when it is 2 are we in the intersection. */
9892 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
9894 assert(*i == NULL || is_invlist(*i));
9896 /* Special case if either one is empty */
9897 len_a = (a == NULL) ? 0 : _invlist_len(a);
9898 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
9899 if (len_a != 0 && complement_b) {
9901 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
9902 * must be empty. Here, also we are using 'b's complement, which
9903 * hence must be every possible code point. Thus the intersection
9906 if (*i == a) { /* No-op */
9911 *i = invlist_clone(a, NULL);
9915 r = invlist_clone(a, NULL);
9916 invlist_replace_list_destroys_src(*i, r);
9921 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
9922 * intersection must be empty */
9924 *i = _new_invlist(0);
9932 /* Here both lists exist and are non-empty */
9933 array_a = invlist_array(a);
9934 array_b = invlist_array(b);
9936 /* If are to take the intersection of 'a' with the complement of b, set it
9937 * up so are looking at b's complement. */
9940 /* To complement, we invert: if the first element is 0, remove it. To
9941 * do this, we just pretend the array starts one later */
9942 if (array_b[0] == 0) {
9948 /* But if the first element is not zero, we pretend the list starts
9949 * at the 0 that is always stored immediately before the array. */
9955 /* Size the intersection for the worst case: that the intersection ends up
9956 * fragmenting everything to be completely disjoint */
9957 r= _new_invlist(len_a + len_b);
9959 /* Will contain U+0000 iff both components do */
9960 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
9961 && len_b > 0 && array_b[0] == 0);
9963 /* Go through each list item by item, stopping when have exhausted one of
9965 while (i_a < len_a && i_b < len_b) {
9966 UV cp; /* The element to potentially add to the intersection's
9968 bool cp_in_set; /* Is it in the input list's set or not */
9970 /* We need to take one or the other of the two inputs for the
9971 * intersection. Since we are merging two sorted lists, we take the
9972 * smaller of the next items. In case of a tie, we take first the one
9973 * that is not in its set (a difference from the union algorithm). If
9974 * we first took the one in its set, it would increment the count,
9975 * possibly to 2 which would cause it to be output as starting a range
9976 * in the intersection, and the next time through we would take that
9977 * same number, and output it again as ending the set. By doing the
9978 * opposite of this, there is no possibility that the count will be
9979 * momentarily incremented to 2. (In a tie and both are in the set or
9980 * both not in the set, it doesn't matter which we take first.) */
9981 if ( array_a[i_a] < array_b[i_b]
9982 || ( array_a[i_a] == array_b[i_b]
9983 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
9985 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
9986 cp = array_a[i_a++];
9989 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
9993 /* Here, have chosen which of the two inputs to look at. Only output
9994 * if the running count changes to/from 2, which marks the
9995 * beginning/end of a range that's in the intersection */
9999 array_r[i_r++] = cp;
10004 array_r[i_r++] = cp;
10011 /* The loop above increments the index into exactly one of the input lists
10012 * each iteration, and ends when either index gets to its list end. That
10013 * means the other index is lower than its end, and so something is
10014 * remaining in that one. We increment 'count', as explained below, if the
10015 * exhausted list was in its set. (i_a and i_b each currently index the
10016 * element beyond the one we care about.) */
10017 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
10018 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
10023 /* Above we incremented 'count' if the exhausted list was in its set. This
10024 * has made it so that 'count' being below 2 means there is nothing left to
10025 * output; otheriwse what's left to add to the intersection is precisely
10026 * that which is left in the non-exhausted input list.
10028 * To see why, note first that the exhausted input obviously has nothing
10029 * left to affect the intersection. If it was in its set at its end, that
10030 * means the set extends from here to the platform's infinity, and hence
10031 * anything in the non-exhausted's list will be in the intersection, and
10032 * anything not in it won't be. Hence, the rest of the intersection is
10033 * precisely what's in the non-exhausted list The exhausted set also
10034 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
10035 * it means 'count' is now at least 2. This is consistent with the
10036 * incremented 'count' being >= 2 means to add the non-exhausted list to
10037 * the intersection.
10039 * But if the exhausted input wasn't in its set, it contributed 0 to
10040 * 'count', and the intersection can't include anything further; the
10041 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
10042 * incremented. This is consistent with 'count' being < 2 meaning nothing
10043 * further to add to the intersection. */
10044 if (count < 2) { /* Nothing left to put in the intersection. */
10047 else { /* copy the non-exhausted list, unchanged. */
10048 IV copy_count = len_a - i_a;
10049 if (copy_count > 0) { /* a is the one with stuff left */
10050 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
10052 else { /* b is the one with stuff left */
10053 copy_count = len_b - i_b;
10054 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
10056 len_r = i_r + copy_count;
10059 /* Set the result to the final length, which can change the pointer to
10060 * array_r, so re-find it. (Note that it is unlikely that this will
10061 * change, as we are shrinking the space, not enlarging it) */
10062 if (len_r != _invlist_len(r)) {
10063 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
10065 array_r = invlist_array(r);
10068 if (*i == NULL) { /* Simply return the calculated intersection */
10071 else { /* Otherwise, replace the existing inversion list in '*i'. We could
10072 instead free '*i', and then set it to 'r', but experience has
10073 shown [perl #127392] that if the input is a mortal, we can get a
10074 huge build-up of these during regex compilation before they get
10077 invlist_replace_list_destroys_src(*i, r);
10082 SvREFCNT_dec_NN(r);
10089 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
10091 /* Add the range from 'start' to 'end' inclusive to the inversion list's
10092 * set. A pointer to the inversion list is returned. This may actually be
10093 * a new list, in which case the passed in one has been destroyed. The
10094 * passed-in inversion list can be NULL, in which case a new one is created
10095 * with just the one range in it. The new list is not necessarily
10096 * NUL-terminated. Space is not freed if the inversion list shrinks as a
10097 * result of this function. The gain would not be large, and in many
10098 * cases, this is called multiple times on a single inversion list, so
10099 * anything freed may almost immediately be needed again.
10101 * This used to mostly call the 'union' routine, but that is much more
10102 * heavyweight than really needed for a single range addition */
10104 UV* array; /* The array implementing the inversion list */
10105 UV len; /* How many elements in 'array' */
10106 SSize_t i_s; /* index into the invlist array where 'start'
10108 SSize_t i_e = 0; /* And the index where 'end' should go */
10109 UV cur_highest; /* The highest code point in the inversion list
10110 upon entry to this function */
10112 /* This range becomes the whole inversion list if none already existed */
10113 if (invlist == NULL) {
10114 invlist = _new_invlist(2);
10115 _append_range_to_invlist(invlist, start, end);
10119 /* Likewise, if the inversion list is currently empty */
10120 len = _invlist_len(invlist);
10122 _append_range_to_invlist(invlist, start, end);
10126 /* Starting here, we have to know the internals of the list */
10127 array = invlist_array(invlist);
10129 /* If the new range ends higher than the current highest ... */
10130 cur_highest = invlist_highest(invlist);
10131 if (end > cur_highest) {
10133 /* If the whole range is higher, we can just append it */
10134 if (start > cur_highest) {
10135 _append_range_to_invlist(invlist, start, end);
10139 /* Otherwise, add the portion that is higher ... */
10140 _append_range_to_invlist(invlist, cur_highest + 1, end);
10142 /* ... and continue on below to handle the rest. As a result of the
10143 * above append, we know that the index of the end of the range is the
10144 * final even numbered one of the array. Recall that the final element
10145 * always starts a range that extends to infinity. If that range is in
10146 * the set (meaning the set goes from here to infinity), it will be an
10147 * even index, but if it isn't in the set, it's odd, and the final
10148 * range in the set is one less, which is even. */
10149 if (end == UV_MAX) {
10157 /* We have dealt with appending, now see about prepending. If the new
10158 * range starts lower than the current lowest ... */
10159 if (start < array[0]) {
10161 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
10162 * Let the union code handle it, rather than having to know the
10163 * trickiness in two code places. */
10164 if (UNLIKELY(start == 0)) {
10167 range_invlist = _new_invlist(2);
10168 _append_range_to_invlist(range_invlist, start, end);
10170 _invlist_union(invlist, range_invlist, &invlist);
10172 SvREFCNT_dec_NN(range_invlist);
10177 /* If the whole new range comes before the first entry, and doesn't
10178 * extend it, we have to insert it as an additional range */
10179 if (end < array[0] - 1) {
10181 goto splice_in_new_range;
10184 /* Here the new range adjoins the existing first range, extending it
10188 /* And continue on below to handle the rest. We know that the index of
10189 * the beginning of the range is the first one of the array */
10192 else { /* Not prepending any part of the new range to the existing list.
10193 * Find where in the list it should go. This finds i_s, such that:
10194 * invlist[i_s] <= start < array[i_s+1]
10196 i_s = _invlist_search(invlist, start);
10199 /* At this point, any extending before the beginning of the inversion list
10200 * and/or after the end has been done. This has made it so that, in the
10201 * code below, each endpoint of the new range is either in a range that is
10202 * in the set, or is in a gap between two ranges that are. This means we
10203 * don't have to worry about exceeding the array bounds.
10205 * Find where in the list the new range ends (but we can skip this if we
10206 * have already determined what it is, or if it will be the same as i_s,
10207 * which we already have computed) */
10209 i_e = (start == end)
10211 : _invlist_search(invlist, end);
10214 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
10215 * is a range that goes to infinity there is no element at invlist[i_e+1],
10216 * so only the first relation holds. */
10218 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10220 /* Here, the ranges on either side of the beginning of the new range
10221 * are in the set, and this range starts in the gap between them.
10223 * The new range extends the range above it downwards if the new range
10224 * ends at or above that range's start */
10225 const bool extends_the_range_above = ( end == UV_MAX
10226 || end + 1 >= array[i_s+1]);
10228 /* The new range extends the range below it upwards if it begins just
10229 * after where that range ends */
10230 if (start == array[i_s]) {
10232 /* If the new range fills the entire gap between the other ranges,
10233 * they will get merged together. Other ranges may also get
10234 * merged, depending on how many of them the new range spans. In
10235 * the general case, we do the merge later, just once, after we
10236 * figure out how many to merge. But in the case where the new
10237 * range exactly spans just this one gap (possibly extending into
10238 * the one above), we do the merge here, and an early exit. This
10239 * is done here to avoid having to special case later. */
10240 if (i_e - i_s <= 1) {
10242 /* If i_e - i_s == 1, it means that the new range terminates
10243 * within the range above, and hence 'extends_the_range_above'
10244 * must be true. (If the range above it extends to infinity,
10245 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
10246 * will be 0, so no harm done.) */
10247 if (extends_the_range_above) {
10248 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
10249 invlist_set_len(invlist,
10251 *(get_invlist_offset_addr(invlist)));
10255 /* Here, i_e must == i_s. We keep them in sync, as they apply
10256 * to the same range, and below we are about to decrement i_s
10261 /* Here, the new range is adjacent to the one below. (It may also
10262 * span beyond the range above, but that will get resolved later.)
10263 * Extend the range below to include this one. */
10264 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
10266 start = array[i_s];
10268 else if (extends_the_range_above) {
10270 /* Here the new range only extends the range above it, but not the
10271 * one below. It merges with the one above. Again, we keep i_e
10272 * and i_s in sync if they point to the same range */
10277 array[i_s] = start;
10281 /* Here, we've dealt with the new range start extending any adjoining
10284 * If the new range extends to infinity, it is now the final one,
10285 * regardless of what was there before */
10286 if (UNLIKELY(end == UV_MAX)) {
10287 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
10291 /* If i_e started as == i_s, it has also been dealt with,
10292 * and been updated to the new i_s, which will fail the following if */
10293 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
10295 /* Here, the ranges on either side of the end of the new range are in
10296 * the set, and this range ends in the gap between them.
10298 * If this range is adjacent to (hence extends) the range above it, it
10299 * becomes part of that range; likewise if it extends the range below,
10300 * it becomes part of that range */
10301 if (end + 1 == array[i_e+1]) {
10303 array[i_e] = start;
10305 else if (start <= array[i_e]) {
10306 array[i_e] = end + 1;
10313 /* If the range fits entirely in an existing range (as possibly already
10314 * extended above), it doesn't add anything new */
10315 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
10319 /* Here, no part of the range is in the list. Must add it. It will
10320 * occupy 2 more slots */
10321 splice_in_new_range:
10323 invlist_extend(invlist, len + 2);
10324 array = invlist_array(invlist);
10325 /* Move the rest of the array down two slots. Don't include any
10327 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
10329 /* Do the actual splice */
10330 array[i_e+1] = start;
10331 array[i_e+2] = end + 1;
10332 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
10336 /* Here the new range crossed the boundaries of a pre-existing range. The
10337 * code above has adjusted things so that both ends are in ranges that are
10338 * in the set. This means everything in between must also be in the set.
10339 * Just squash things together */
10340 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
10341 invlist_set_len(invlist,
10343 *(get_invlist_offset_addr(invlist)));
10349 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
10350 UV** other_elements_ptr)
10352 /* Create and return an inversion list whose contents are to be populated
10353 * by the caller. The caller gives the number of elements (in 'size') and
10354 * the very first element ('element0'). This function will set
10355 * '*other_elements_ptr' to an array of UVs, where the remaining elements
10356 * are to be placed.
10358 * Obviously there is some trust involved that the caller will properly
10359 * fill in the other elements of the array.
10361 * (The first element needs to be passed in, as the underlying code does
10362 * things differently depending on whether it is zero or non-zero) */
10364 SV* invlist = _new_invlist(size);
10367 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
10369 invlist = add_cp_to_invlist(invlist, element0);
10370 offset = *get_invlist_offset_addr(invlist);
10372 invlist_set_len(invlist, size, offset);
10373 *other_elements_ptr = invlist_array(invlist) + 1;
10379 #ifndef PERL_IN_XSUB_RE
10381 Perl__invlist_invert(pTHX_ SV* const invlist)
10383 /* Complement the input inversion list. This adds a 0 if the list didn't
10384 * have a zero; removes it otherwise. As described above, the data
10385 * structure is set up so that this is very efficient */
10387 PERL_ARGS_ASSERT__INVLIST_INVERT;
10389 assert(! invlist_is_iterating(invlist));
10391 /* The inverse of matching nothing is matching everything */
10392 if (_invlist_len(invlist) == 0) {
10393 _append_range_to_invlist(invlist, 0, UV_MAX);
10397 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
10401 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
10403 /* Return a new inversion list that is a copy of the input one, which is
10404 * unchanged. The new list will not be mortal even if the old one was. */
10406 const STRLEN nominal_length = _invlist_len(invlist);
10407 const STRLEN physical_length = SvCUR(invlist);
10408 const bool offset = *(get_invlist_offset_addr(invlist));
10410 PERL_ARGS_ASSERT_INVLIST_CLONE;
10412 if (new_invlist == NULL) {
10413 new_invlist = _new_invlist(nominal_length);
10416 sv_upgrade(new_invlist, SVt_INVLIST);
10417 initialize_invlist_guts(new_invlist, nominal_length);
10420 *(get_invlist_offset_addr(new_invlist)) = offset;
10421 invlist_set_len(new_invlist, nominal_length, offset);
10422 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
10424 return new_invlist;
10429 PERL_STATIC_INLINE UV
10430 S_invlist_lowest(SV* const invlist)
10432 /* Returns the lowest code point that matches an inversion list. This API
10433 * has an ambiguity, as it returns 0 under either the lowest is actually
10434 * 0, or if the list is empty. If this distinction matters to you, check
10435 * for emptiness before calling this function */
10437 UV len = _invlist_len(invlist);
10440 PERL_ARGS_ASSERT_INVLIST_LOWEST;
10446 array = invlist_array(invlist);
10452 S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
10454 /* Get the contents of an inversion list into a string SV so that they can
10455 * be printed out. If 'traditional_style' is TRUE, it uses the format
10456 * traditionally done for debug tracing; otherwise it uses a format
10457 * suitable for just copying to the output, with blanks between ranges and
10458 * a dash between range components */
10462 const char intra_range_delimiter = (traditional_style ? '\t' : '-');
10463 const char inter_range_delimiter = (traditional_style ? '\n' : ' ');
10465 if (traditional_style) {
10466 output = newSVpvs("\n");
10469 output = newSVpvs("");
10472 PERL_ARGS_ASSERT_INVLIST_CONTENTS;
10474 assert(! invlist_is_iterating(invlist));
10476 invlist_iterinit(invlist);
10477 while (invlist_iternext(invlist, &start, &end)) {
10478 if (end == UV_MAX) {
10479 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
10480 start, intra_range_delimiter,
10481 inter_range_delimiter);
10483 else if (end != start) {
10484 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c",
10486 intra_range_delimiter,
10487 end, inter_range_delimiter);
10490 Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c",
10491 start, inter_range_delimiter);
10495 if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */
10496 SvCUR_set(output, SvCUR(output) - 1);
10502 #ifndef PERL_IN_XSUB_RE
10504 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
10505 const char * const indent, SV* const invlist)
10507 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
10508 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
10509 * the string 'indent'. The output looks like this:
10510 [0] 0x000A .. 0x000D
10512 [4] 0x2028 .. 0x2029
10513 [6] 0x3104 .. INFTY
10514 * This means that the first range of code points matched by the list are
10515 * 0xA through 0xD; the second range contains only the single code point
10516 * 0x85, etc. An inversion list is an array of UVs. Two array elements
10517 * are used to define each range (except if the final range extends to
10518 * infinity, only a single element is needed). The array index of the
10519 * first element for the corresponding range is given in brackets. */
10524 PERL_ARGS_ASSERT__INVLIST_DUMP;
10526 if (invlist_is_iterating(invlist)) {
10527 Perl_dump_indent(aTHX_ level, file,
10528 "%sCan't dump inversion list because is in middle of iterating\n",
10533 invlist_iterinit(invlist);
10534 while (invlist_iternext(invlist, &start, &end)) {
10535 if (end == UV_MAX) {
10536 Perl_dump_indent(aTHX_ level, file,
10537 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
10538 indent, (UV)count, start);
10540 else if (end != start) {
10541 Perl_dump_indent(aTHX_ level, file,
10542 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
10543 indent, (UV)count, start, end);
10546 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
10547 indent, (UV)count, start);
10555 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
10557 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
10559 /* Return a boolean as to if the two passed in inversion lists are
10560 * identical. The final argument, if TRUE, says to take the complement of
10561 * the second inversion list before doing the comparison */
10563 const UV len_a = _invlist_len(a);
10564 UV len_b = _invlist_len(b);
10566 const UV* array_a = NULL;
10567 const UV* array_b = NULL;
10569 PERL_ARGS_ASSERT__INVLISTEQ;
10571 /* This code avoids accessing the arrays unless it knows the length is
10576 return ! complement_b;
10580 array_a = invlist_array(a);
10584 array_b = invlist_array(b);
10587 /* If are to compare 'a' with the complement of b, set it
10588 * up so are looking at b's complement. */
10589 if (complement_b) {
10591 /* The complement of nothing is everything, so <a> would have to have
10592 * just one element, starting at zero (ending at infinity) */
10594 return (len_a == 1 && array_a[0] == 0);
10596 if (array_b[0] == 0) {
10598 /* Otherwise, to complement, we invert. Here, the first element is
10599 * 0, just remove it. To do this, we just pretend the array starts
10607 /* But if the first element is not zero, we pretend the list starts
10608 * at the 0 that is always stored immediately before the array. */
10614 return len_a == len_b
10615 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
10621 * As best we can, determine the characters that can match the start of
10622 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
10623 * can be false positive matches
10625 * Returns the invlist as a new SV*; it is the caller's responsibility to
10626 * call SvREFCNT_dec() when done with it.
10629 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
10631 const U8 * s = (U8*)STRING(node);
10632 SSize_t bytelen = STR_LEN(node);
10634 /* Start out big enough for 2 separate code points */
10635 SV* invlist = _new_invlist(4);
10637 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
10642 /* We punt and assume can match anything if the node begins
10643 * with a multi-character fold. Things are complicated. For
10644 * example, /ffi/i could match any of:
10645 * "\N{LATIN SMALL LIGATURE FFI}"
10646 * "\N{LATIN SMALL LIGATURE FF}I"
10647 * "F\N{LATIN SMALL LIGATURE FI}"
10648 * plus several other things; and making sure we have all the
10649 * possibilities is hard. */
10650 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
10651 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10654 /* Any Latin1 range character can potentially match any
10655 * other depending on the locale, and in Turkic locales, U+130 and
10657 if (OP(node) == EXACTFL) {
10658 _invlist_union(invlist, PL_Latin1, &invlist);
10659 invlist = add_cp_to_invlist(invlist,
10660 LATIN_SMALL_LETTER_DOTLESS_I);
10661 invlist = add_cp_to_invlist(invlist,
10662 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10665 /* But otherwise, it matches at least itself. We can
10666 * quickly tell if it has a distinct fold, and if so,
10667 * it matches that as well */
10668 invlist = add_cp_to_invlist(invlist, uc);
10669 if (IS_IN_SOME_FOLD_L1(uc))
10670 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
10673 /* Some characters match above-Latin1 ones under /i. This
10674 * is true of EXACTFL ones when the locale is UTF-8 */
10675 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
10676 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
10677 EXACTFAA_NO_TRIE)))
10679 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
10683 else { /* Pattern is UTF-8 */
10684 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
10685 const U8* e = s + bytelen;
10688 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
10690 /* The only code points that aren't folded in a UTF EXACTFish
10691 * node are the problematic ones in EXACTFL nodes */
10692 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
10693 /* We need to check for the possibility that this EXACTFL
10694 * node begins with a multi-char fold. Therefore we fold
10695 * the first few characters of it so that we can make that
10701 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
10703 *(d++) = (U8) toFOLD(*s);
10704 if (fc < 0) { /* Save the first fold */
10711 UV fold = toFOLD_utf8_safe(s, e, d, &len);
10712 if (fc < 0) { /* Save the first fold */
10720 /* And set up so the code below that looks in this folded
10721 * buffer instead of the node's string */
10726 /* When we reach here 's' points to the fold of the first
10727 * character(s) of the node; and 'e' points to far enough along
10728 * the folded string to be just past any possible multi-char
10731 * Like the non-UTF case above, we punt if the node begins with a
10732 * multi-char fold */
10734 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
10735 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
10737 else { /* Single char fold */
10740 const U32 * remaining_folds;
10741 Size_t folds_count;
10743 /* It matches itself */
10744 invlist = add_cp_to_invlist(invlist, fc);
10746 /* ... plus all the things that fold to it, which are found in
10747 * PL_utf8_foldclosures */
10748 folds_count = _inverse_folds(fc, &first_fold,
10750 for (k = 0; k < folds_count; k++) {
10751 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
10753 /* /aa doesn't allow folds between ASCII and non- */
10754 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
10755 && isASCII(c) != isASCII(fc))
10760 invlist = add_cp_to_invlist(invlist, c);
10763 if (OP(node) == EXACTFL) {
10765 /* If either [iI] are present in an EXACTFL node the above code
10766 * should have added its normal case pair, but under a Turkish
10767 * locale they could match instead the case pairs from it. Add
10768 * those as potential matches as well */
10769 if (isALPHA_FOLD_EQ(fc, 'I')) {
10770 invlist = add_cp_to_invlist(invlist,
10771 LATIN_SMALL_LETTER_DOTLESS_I);
10772 invlist = add_cp_to_invlist(invlist,
10773 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
10775 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
10776 invlist = add_cp_to_invlist(invlist, 'I');
10778 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
10779 invlist = add_cp_to_invlist(invlist, 'i');
10788 #undef HEADER_LENGTH
10789 #undef TO_INTERNAL_SIZE
10790 #undef FROM_INTERNAL_SIZE
10791 #undef INVLIST_VERSION_ID
10793 /* End of inversion list object */
10796 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
10798 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
10799 * constructs, and updates RExC_flags with them. On input, RExC_parse
10800 * should point to the first flag; it is updated on output to point to the
10801 * final ')' or ':'. There needs to be at least one flag, or this will
10804 /* for (?g), (?gc), and (?o) warnings; warning
10805 about (?c) will warn about (?g) -- japhy */
10807 #define WASTED_O 0x01
10808 #define WASTED_G 0x02
10809 #define WASTED_C 0x04
10810 #define WASTED_GC (WASTED_G|WASTED_C)
10811 I32 wastedflags = 0x00;
10812 U32 posflags = 0, negflags = 0;
10813 U32 *flagsp = &posflags;
10814 char has_charset_modifier = '\0';
10816 bool has_use_defaults = FALSE;
10817 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
10818 int x_mod_count = 0;
10820 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
10822 /* '^' as an initial flag sets certain defaults */
10823 if (UCHARAT(RExC_parse) == '^') {
10825 has_use_defaults = TRUE;
10826 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
10827 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10828 ? REGEX_UNICODE_CHARSET
10829 : REGEX_DEPENDS_CHARSET;
10830 set_regex_charset(&RExC_flags, cs);
10833 cs = get_regex_charset(RExC_flags);
10834 if ( cs == REGEX_DEPENDS_CHARSET
10835 && (toUSE_UNI_CHARSET_NOT_DEPENDS))
10837 cs = REGEX_UNICODE_CHARSET;
10841 while (RExC_parse < RExC_end) {
10842 /* && memCHRs("iogcmsx", *RExC_parse) */
10843 /* (?g), (?gc) and (?o) are useless here
10844 and must be globally applied -- japhy */
10845 if ((RExC_pm_flags & PMf_WILDCARD)) {
10846 if (flagsp == & negflags) {
10847 if (*RExC_parse == 'm') {
10849 /* diag_listed_as: Use of %s is not allowed in Unicode
10850 property wildcard subpatterns in regex; marked by <--
10852 vFAIL("Use of modifier '-m' is not allowed in Unicode"
10853 " property wildcard subpatterns");
10857 if (*RExC_parse == 's') {
10858 goto modifier_illegal_in_wildcard;
10863 switch (*RExC_parse) {
10865 /* Code for the imsxn flags */
10866 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
10868 case LOCALE_PAT_MOD:
10869 if (has_charset_modifier) {
10870 goto excess_modifier;
10872 else if (flagsp == &negflags) {
10875 cs = REGEX_LOCALE_CHARSET;
10876 has_charset_modifier = LOCALE_PAT_MOD;
10878 case UNICODE_PAT_MOD:
10879 if (has_charset_modifier) {
10880 goto excess_modifier;
10882 else if (flagsp == &negflags) {
10885 cs = REGEX_UNICODE_CHARSET;
10886 has_charset_modifier = UNICODE_PAT_MOD;
10888 case ASCII_RESTRICT_PAT_MOD:
10889 if (flagsp == &negflags) {
10892 if (has_charset_modifier) {
10893 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
10894 goto excess_modifier;
10896 /* Doubled modifier implies more restricted */
10897 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
10900 cs = REGEX_ASCII_RESTRICTED_CHARSET;
10902 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
10904 case DEPENDS_PAT_MOD:
10905 if (has_use_defaults) {
10906 goto fail_modifiers;
10908 else if (flagsp == &negflags) {
10911 else if (has_charset_modifier) {
10912 goto excess_modifier;
10915 /* The dual charset means unicode semantics if the
10916 * pattern (or target, not known until runtime) are
10917 * utf8, or something in the pattern indicates unicode
10919 cs = (toUSE_UNI_CHARSET_NOT_DEPENDS)
10920 ? REGEX_UNICODE_CHARSET
10921 : REGEX_DEPENDS_CHARSET;
10922 has_charset_modifier = DEPENDS_PAT_MOD;
10926 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
10927 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
10929 else if (has_charset_modifier == *(RExC_parse - 1)) {
10930 vFAIL2("Regexp modifier \"%c\" may not appear twice",
10931 *(RExC_parse - 1));
10934 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
10936 NOT_REACHED; /*NOTREACHED*/
10939 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
10940 *(RExC_parse - 1));
10941 NOT_REACHED; /*NOTREACHED*/
10942 case GLOBAL_PAT_MOD: /* 'g' */
10943 if (RExC_pm_flags & PMf_WILDCARD) {
10944 goto modifier_illegal_in_wildcard;
10947 case ONCE_PAT_MOD: /* 'o' */
10948 if (ckWARN(WARN_REGEXP)) {
10949 const I32 wflagbit = *RExC_parse == 'o'
10952 if (! (wastedflags & wflagbit) ) {
10953 wastedflags |= wflagbit;
10954 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10957 "Useless (%s%c) - %suse /%c modifier",
10958 flagsp == &negflags ? "?-" : "?",
10960 flagsp == &negflags ? "don't " : "",
10967 case CONTINUE_PAT_MOD: /* 'c' */
10968 if (RExC_pm_flags & PMf_WILDCARD) {
10969 goto modifier_illegal_in_wildcard;
10971 if (ckWARN(WARN_REGEXP)) {
10972 if (! (wastedflags & WASTED_C) ) {
10973 wastedflags |= WASTED_GC;
10974 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
10977 "Useless (%sc) - %suse /gc modifier",
10978 flagsp == &negflags ? "?-" : "?",
10979 flagsp == &negflags ? "don't " : ""
10984 case KEEPCOPY_PAT_MOD: /* 'p' */
10985 if (RExC_pm_flags & PMf_WILDCARD) {
10986 goto modifier_illegal_in_wildcard;
10988 if (flagsp == &negflags) {
10989 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
10991 *flagsp |= RXf_PMf_KEEPCOPY;
10995 /* A flag is a default iff it is following a minus, so
10996 * if there is a minus, it means will be trying to
10997 * re-specify a default which is an error */
10998 if (has_use_defaults || flagsp == &negflags) {
10999 goto fail_modifiers;
11001 flagsp = &negflags;
11002 wastedflags = 0; /* reset so (?g-c) warns twice */
11008 if ( (RExC_pm_flags & PMf_WILDCARD)
11009 && cs != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
11012 /* diag_listed_as: Use of %s is not allowed in Unicode
11013 property wildcard subpatterns in regex; marked by <--
11015 vFAIL2("Use of modifier '%c' is not allowed in Unicode"
11016 " property wildcard subpatterns",
11017 has_charset_modifier);
11020 if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) {
11021 negflags |= RXf_PMf_EXTENDED_MORE;
11023 RExC_flags |= posflags;
11025 if (negflags & RXf_PMf_EXTENDED) {
11026 negflags |= RXf_PMf_EXTENDED_MORE;
11028 RExC_flags &= ~negflags;
11029 set_regex_charset(&RExC_flags, cs);
11034 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11035 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11036 vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
11037 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11038 NOT_REACHED; /*NOTREACHED*/
11041 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11044 vFAIL("Sequence (?... not terminated");
11046 modifier_illegal_in_wildcard:
11048 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
11049 subpatterns in regex; marked by <-- HERE in m/%s/ */
11050 vFAIL2("Use of modifier '%c' is not allowed in Unicode property wildcard"
11051 " subpatterns", *(RExC_parse - 1));
11055 - reg - regular expression, i.e. main body or parenthesized thing
11057 * Caller must absorb opening parenthesis.
11059 * Combining parenthesis handling with the base level of regular expression
11060 * is a trifle forced, but the need to tie the tails of the branches to what
11061 * follows makes it hard to avoid.
11063 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
11065 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
11067 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
11070 STATIC regnode_offset
11071 S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
11073 char * parse_start,
11077 regnode_offset ret;
11078 char* name_start = RExC_parse;
11080 SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11081 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11083 PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
11085 if (RExC_parse == name_start || *RExC_parse != ch) {
11086 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11087 vFAIL2("Sequence %.3s... not terminated", parse_start);
11091 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11092 RExC_rxi->data->data[num]=(void*)sv_dat;
11093 SvREFCNT_inc_simple_void_NN(sv_dat);
11096 ret = reganode(pRExC_state,
11099 : (ASCII_FOLD_RESTRICTED)
11101 : (AT_LEAST_UNI_SEMANTICS)
11107 *flagp |= HASWIDTH;
11109 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
11110 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
11112 nextchar(pRExC_state);
11116 /* On success, returns the offset at which any next node should be placed into
11117 * the regex engine program being compiled.
11119 * Returns 0 otherwise, with *flagp set to indicate why:
11120 * TRYAGAIN at the end of (?) that only sets flags.
11121 * RESTART_PARSE if the parse needs to be restarted, or'd with
11122 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
11123 * Otherwise would only return 0 if regbranch() returns 0, which cannot
11125 STATIC regnode_offset
11126 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
11127 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
11128 * 2 is like 1, but indicates that nextchar() has been called to advance
11129 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
11130 * this flag alerts us to the need to check for that */
11132 regnode_offset ret = 0; /* Will be the head of the group. */
11134 regnode_offset lastbr;
11135 regnode_offset ender = 0;
11138 U32 oregflags = RExC_flags;
11139 bool have_branch = 0;
11141 I32 freeze_paren = 0;
11142 I32 after_freeze = 0;
11143 I32 num; /* numeric backreferences */
11144 SV * max_open; /* Max number of unclosed parens */
11146 char * parse_start = RExC_parse; /* MJD */
11147 char * const oregcomp_parse = RExC_parse;
11149 DECLARE_AND_GET_RE_DEBUG_FLAGS;
11151 PERL_ARGS_ASSERT_REG;
11152 DEBUG_PARSE("reg ");
11154 max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
11156 if (!SvIOK(max_open)) {
11157 sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
11159 if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
11161 vFAIL("Too many nested open parens");
11164 *flagp = 0; /* Initialize. */
11166 if (RExC_in_lookbehind) {
11167 RExC_in_lookbehind++;
11169 if (RExC_in_lookahead) {
11170 RExC_in_lookahead++;
11173 /* Having this true makes it feasible to have a lot fewer tests for the
11174 * parse pointer being in scope. For example, we can write
11175 * while(isFOO(*RExC_parse)) RExC_parse++;
11177 * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++;
11179 assert(*RExC_end == '\0');
11181 /* Make an OPEN node, if parenthesized. */
11184 /* Under /x, space and comments can be gobbled up between the '(' and
11185 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
11186 * intervening space, as the sequence is a token, and a token should be
11188 bool has_intervening_patws = (paren == 2)
11189 && *(RExC_parse - 1) != '(';
11191 if (RExC_parse >= RExC_end) {
11192 vFAIL("Unmatched (");
11195 if (paren == 'r') { /* Atomic script run */
11199 else if ( *RExC_parse == '*') { /* (*VERB:ARG), (*construct:...) */
11200 char *start_verb = RExC_parse + 1;
11202 char *start_arg = NULL;
11203 unsigned char op = 0;
11204 int arg_required = 0;
11205 int internal_argval = -1; /* if >-1 we are not allowed an argument*/
11206 bool has_upper = FALSE;
11208 if (has_intervening_patws) {
11209 RExC_parse++; /* past the '*' */
11211 /* For strict backwards compatibility, don't change the message
11212 * now that we also have lowercase operands */
11213 if (isUPPER(*RExC_parse)) {
11214 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
11217 vFAIL("In '(*...)', the '(' and '*' must be adjacent");
11220 while (RExC_parse < RExC_end && *RExC_parse != ')' ) {
11221 if ( *RExC_parse == ':' ) {
11222 start_arg = RExC_parse + 1;
11226 if (isUPPER(*RExC_parse)) {
11232 RExC_parse += UTF8SKIP(RExC_parse);
11235 verb_len = RExC_parse - start_verb;
11237 if (RExC_parse >= RExC_end) {
11238 goto unterminated_verb_pattern;
11241 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11242 while ( RExC_parse < RExC_end && *RExC_parse != ')' ) {
11243 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11245 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11246 unterminated_verb_pattern:
11248 vFAIL("Unterminated verb pattern argument");
11251 vFAIL("Unterminated '(*...' argument");
11255 if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) {
11257 vFAIL("Unterminated verb pattern");
11260 vFAIL("Unterminated '(*...' construct");
11265 /* Here, we know that RExC_parse < RExC_end */
11267 switch ( *start_verb ) {
11268 case 'A': /* (*ACCEPT) */
11269 if ( memEQs(start_verb, verb_len,"ACCEPT") ) {
11271 internal_argval = RExC_nestroot;
11274 case 'C': /* (*COMMIT) */
11275 if ( memEQs(start_verb, verb_len,"COMMIT") )
11278 case 'F': /* (*FAIL) */
11279 if ( verb_len==1 || memEQs(start_verb, verb_len,"FAIL") ) {
11283 case ':': /* (*:NAME) */
11284 case 'M': /* (*MARK:NAME) */
11285 if ( verb_len==0 || memEQs(start_verb, verb_len,"MARK") ) {
11290 case 'P': /* (*PRUNE) */
11291 if ( memEQs(start_verb, verb_len,"PRUNE") )
11294 case 'S': /* (*SKIP) */
11295 if ( memEQs(start_verb, verb_len,"SKIP") )
11298 case 'T': /* (*THEN) */
11299 /* [19:06] <TimToady> :: is then */
11300 if ( memEQs(start_verb, verb_len,"THEN") ) {
11302 RExC_seen |= REG_CUTGROUP_SEEN;
11306 if ( memEQs(start_verb, verb_len, "asr")
11307 || memEQs(start_verb, verb_len, "atomic_script_run"))
11309 paren = 'r'; /* Mnemonic: recursed run */
11312 else if (memEQs(start_verb, verb_len, "atomic")) {
11313 paren = 't'; /* AtOMIC */
11314 goto alpha_assertions;
11318 if ( memEQs(start_verb, verb_len, "plb")
11319 || memEQs(start_verb, verb_len, "positive_lookbehind"))
11322 goto lookbehind_alpha_assertions;
11324 else if ( memEQs(start_verb, verb_len, "pla")
11325 || memEQs(start_verb, verb_len, "positive_lookahead"))
11328 goto alpha_assertions;
11332 if ( memEQs(start_verb, verb_len, "nlb")
11333 || memEQs(start_verb, verb_len, "negative_lookbehind"))
11336 goto lookbehind_alpha_assertions;
11338 else if ( memEQs(start_verb, verb_len, "nla")
11339 || memEQs(start_verb, verb_len, "negative_lookahead"))
11342 goto alpha_assertions;
11346 if ( memEQs(start_verb, verb_len, "sr")
11347 || memEQs(start_verb, verb_len, "script_run"))
11349 regnode_offset atomic;
11355 /* This indicates Unicode rules. */
11356 REQUIRE_UNI_RULES(flagp, 0);
11362 RExC_parse = start_arg;
11364 if (RExC_in_script_run) {
11366 /* Nested script runs are treated as no-ops, because
11367 * if the nested one fails, the outer one must as
11368 * well. It could fail sooner, and avoid (??{} with
11369 * side effects, but that is explicitly documented as
11370 * undefined behavior. */
11374 if (paren == 's') {
11379 /* But, the atomic part of a nested atomic script run
11380 * isn't a no-op, but can be treated just like a '(?>'
11386 if (paren == 's') {
11387 /* Here, we're starting a new regular script run */
11388 ret = reg_node(pRExC_state, SROPEN);
11389 RExC_in_script_run = 1;
11394 /* Here, we are starting an atomic script run. This is
11395 * handled by recursing to deal with the atomic portion
11396 * separately, enclosed in SROPEN ... SRCLOSE nodes */
11398 ret = reg_node(pRExC_state, SROPEN);
11400 RExC_in_script_run = 1;
11402 atomic = reg(pRExC_state, 'r', &flags, depth);
11403 if (flags & (RESTART_PARSE|NEED_UTF8)) {
11404 *flagp = flags & (RESTART_PARSE|NEED_UTF8);
11408 if (! REGTAIL(pRExC_state, ret, atomic)) {
11409 REQUIRE_BRANCHJ(flagp, 0);
11412 if (! REGTAIL(pRExC_state, atomic, reg_node(pRExC_state,
11415 REQUIRE_BRANCHJ(flagp, 0);
11418 RExC_in_script_run = 0;
11424 lookbehind_alpha_assertions:
11425 RExC_seen |= REG_LOOKBEHIND_SEEN;
11426 RExC_in_lookbehind++;
11431 RExC_seen_zerolen++;
11437 /* An empty negative lookahead assertion simply is failure */
11438 if (paren == 'A' && RExC_parse == start_arg) {
11439 ret=reganode(pRExC_state, OPFAIL, 0);
11440 nextchar(pRExC_state);
11444 RExC_parse = start_arg;
11449 "'(*%" UTF8f "' requires a terminating ':'",
11450 UTF8fARG(UTF, verb_len, start_verb));
11451 NOT_REACHED; /*NOTREACHED*/
11453 } /* End of switch */
11456 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
11458 if (has_upper || verb_len == 0) {
11460 "Unknown verb pattern '%" UTF8f "'",
11461 UTF8fARG(UTF, verb_len, start_verb));
11465 "Unknown '(*...)' construct '%" UTF8f "'",
11466 UTF8fARG(UTF, verb_len, start_verb));
11469 if ( RExC_parse == start_arg ) {
11472 if ( arg_required && !start_arg ) {
11473 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
11474 (int) verb_len, start_verb);
11476 if (internal_argval == -1) {
11477 ret = reganode(pRExC_state, op, 0);
11479 ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
11481 RExC_seen |= REG_VERBARG_SEEN;
11483 SV *sv = newSVpvn( start_arg,
11484 RExC_parse - start_arg);
11485 ARG(REGNODE_p(ret)) = add_data( pRExC_state,
11486 STR_WITH_LEN("S"));
11487 RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
11488 FLAGS(REGNODE_p(ret)) = 1;
11490 FLAGS(REGNODE_p(ret)) = 0;
11492 if ( internal_argval != -1 )
11493 ARG2L_SET(REGNODE_p(ret), internal_argval);
11494 nextchar(pRExC_state);
11497 else if (*RExC_parse == '?') { /* (?...) */
11498 bool is_logical = 0;
11499 const char * const seqstart = RExC_parse;
11500 const char * endptr;
11501 const char non_existent_group_msg[]
11502 = "Reference to nonexistent group";
11503 const char impossible_group[] = "Invalid reference to group";
11505 if (has_intervening_patws) {
11507 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
11510 RExC_parse++; /* past the '?' */
11511 paren = *RExC_parse; /* might be a trailing NUL, if not
11513 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
11514 if (RExC_parse > RExC_end) {
11517 ret = 0; /* For look-ahead/behind. */
11520 case 'P': /* (?P...) variants for those used to PCRE/Python */
11521 paren = *RExC_parse;
11522 if ( paren == '<') { /* (?P<...>) named capture */
11524 if (RExC_parse >= RExC_end) {
11525 vFAIL("Sequence (?P<... not terminated");
11527 goto named_capture;
11529 else if (paren == '>') { /* (?P>name) named recursion */
11531 if (RExC_parse >= RExC_end) {
11532 vFAIL("Sequence (?P>... not terminated");
11534 goto named_recursion;
11536 else if (paren == '=') { /* (?P=...) named backref */
11538 return handle_named_backref(pRExC_state, flagp,
11541 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11542 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11543 vFAIL3("Sequence (%.*s...) not recognized",
11544 (int) (RExC_parse - seqstart), seqstart);
11545 NOT_REACHED; /*NOTREACHED*/
11546 case '<': /* (?<...) */
11547 /* If you want to support (?<*...), first reconcile with GH #17363 */
11548 if (*RExC_parse == '!')
11550 else if (*RExC_parse != '=')
11557 case '\'': /* (?'...') */
11558 name_start = RExC_parse;
11559 svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
11560 if ( RExC_parse == name_start
11561 || RExC_parse >= RExC_end
11562 || *RExC_parse != paren)
11564 vFAIL2("Sequence (?%c... not terminated",
11565 paren=='>' ? '<' : (char) paren);
11570 if (!svname) /* shouldn't happen */
11572 "panic: reg_scan_name returned NULL");
11573 if (!RExC_paren_names) {
11574 RExC_paren_names= newHV();
11575 sv_2mortal(MUTABLE_SV(RExC_paren_names));
11577 RExC_paren_name_list= newAV();
11578 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
11581 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
11583 sv_dat = HeVAL(he_str);
11585 /* croak baby croak */
11587 "panic: paren_name hash element allocation failed");
11588 } else if ( SvPOK(sv_dat) ) {
11589 /* (?|...) can mean we have dupes so scan to check
11590 its already been stored. Maybe a flag indicating
11591 we are inside such a construct would be useful,
11592 but the arrays are likely to be quite small, so
11593 for now we punt -- dmq */
11594 IV count = SvIV(sv_dat);
11595 I32 *pv = (I32*)SvPVX(sv_dat);
11597 for ( i = 0 ; i < count ; i++ ) {
11598 if ( pv[i] == RExC_npar ) {
11604 pv = (I32*)SvGROW(sv_dat,
11605 SvCUR(sv_dat) + sizeof(I32)+1);
11606 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
11607 pv[count] = RExC_npar;
11608 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
11611 (void)SvUPGRADE(sv_dat, SVt_PVNV);
11612 sv_setpvn(sv_dat, (char *)&(RExC_npar),
11615 SvIV_set(sv_dat, 1);
11618 /* Yes this does cause a memory leak in debugging Perls
11620 if (!av_store(RExC_paren_name_list,
11621 RExC_npar, SvREFCNT_inc_NN(svname)))
11622 SvREFCNT_dec_NN(svname);
11625 /*sv_dump(sv_dat);*/
11627 nextchar(pRExC_state);
11629 goto capturing_parens;
11632 RExC_seen |= REG_LOOKBEHIND_SEEN;
11633 RExC_in_lookbehind++;
11635 if (RExC_parse >= RExC_end) {
11636 vFAIL("Sequence (?... not terminated");
11638 RExC_seen_zerolen++;
11640 case '=': /* (?=...) */
11641 RExC_seen_zerolen++;
11642 RExC_in_lookahead++;
11644 case '!': /* (?!...) */
11645 RExC_seen_zerolen++;
11646 /* check if we're really just a "FAIL" assertion */
11647 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
11648 FALSE /* Don't force to /x */ );
11649 if (*RExC_parse == ')') {
11650 ret=reganode(pRExC_state, OPFAIL, 0);
11651 nextchar(pRExC_state);
11655 case '|': /* (?|...) */
11656 /* branch reset, behave like a (?:...) except that
11657 buffers in alternations share the same numbers */
11659 after_freeze = freeze_paren = RExC_npar;
11661 /* XXX This construct currently requires an extra pass.
11662 * Investigation would be required to see if that could be
11664 REQUIRE_PARENS_PASS;
11666 case ':': /* (?:...) */
11667 case '>': /* (?>...) */
11669 case '$': /* (?$...) */
11670 case '@': /* (?@...) */
11671 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
11673 case '0' : /* (?0) */
11674 case 'R' : /* (?R) */
11675 if (RExC_parse == RExC_end || *RExC_parse != ')')
11676 FAIL("Sequence (?R) not terminated");
11678 RExC_seen |= REG_RECURSE_SEEN;
11680 /* XXX These constructs currently require an extra pass.
11681 * It probably could be changed */
11682 REQUIRE_PARENS_PASS;
11684 *flagp |= POSTPONED;
11685 goto gen_recurse_regop;
11687 /* named and numeric backreferences */
11688 case '&': /* (?&NAME) */
11689 parse_start = RExC_parse - 1;
11692 SV *sv_dat = reg_scan_name(pRExC_state,
11693 REG_RSN_RETURN_DATA);
11694 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
11696 if (RExC_parse >= RExC_end || *RExC_parse != ')')
11697 vFAIL("Sequence (?&... not terminated");
11698 goto gen_recurse_regop;
11701 if (! inRANGE(RExC_parse[0], '1', '9')) {
11703 vFAIL("Illegal pattern");
11705 goto parse_recursion;
11707 case '-': /* (?-1) */
11708 if (! inRANGE(RExC_parse[0], '1', '9')) {
11709 RExC_parse--; /* rewind to let it be handled later */
11713 case '1': case '2': case '3': case '4': /* (?1) */
11714 case '5': case '6': case '7': case '8': case '9':
11715 RExC_parse = (char *) seqstart + 1; /* Point to the digit */
11718 bool is_neg = FALSE;
11720 parse_start = RExC_parse - 1; /* MJD */
11721 if (*RExC_parse == '-') {
11726 if (grok_atoUV(RExC_parse, &unum, &endptr)
11730 RExC_parse = (char*)endptr;
11732 else { /* Overflow, or something like that. Position
11733 beyond all digits for the message */
11734 while (RExC_parse < RExC_end && isDIGIT(*RExC_parse)) {
11737 vFAIL(impossible_group);
11740 /* -num is always representable on 1 and 2's complement
11745 if (*RExC_parse!=')')
11746 vFAIL("Expecting close bracket");
11749 if (paren == '-' || paren == '+') {
11751 /* Don't overflow */
11752 if (UNLIKELY(I32_MAX - RExC_npar < num)) {
11754 vFAIL(impossible_group);
11758 Diagram of capture buffer numbering.
11759 Top line is the normal capture buffer numbers
11760 Bottom line is the negative indexing as from
11764 /(a(x)y)(a(b(c(?+2)d)e)f)(g(h))/
11765 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
11766 - 5 4 3 2 1 X Y x x
11768 Resolve to absolute group. Recall that RExC_npar is +1 of
11769 the actual parenthesis group number. For lookahead, we
11770 have to compensate for that. Using the above example, when
11771 we get to Y in the parse, num is 2 and RExC_npar is 6. We
11772 want 7 for +2, and 4 for -2.
11774 if ( paren == '+' ) {
11780 if (paren == '-' && num < 1) {
11782 vFAIL(non_existent_group_msg);
11786 if (num >= RExC_npar) {
11788 /* It might be a forward reference; we can't fail until we
11789 * know, by completing the parse to get all the groups, and
11790 * then reparsing */
11791 if (ALL_PARENS_COUNTED) {
11792 if (num >= RExC_total_parens) {
11794 vFAIL(non_existent_group_msg);
11798 REQUIRE_PARENS_PASS;
11802 /* We keep track how many GOSUB items we have produced.
11803 To start off the ARG2L() of the GOSUB holds its "id",
11804 which is used later in conjunction with RExC_recurse
11805 to calculate the offset we need to jump for the GOSUB,
11806 which it will store in the final representation.
11807 We have to defer the actual calculation until much later
11808 as the regop may move.
11810 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
11811 RExC_recurse_count++;
11812 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
11813 "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
11814 22, "| |", (int)(depth * 2 + 1), "",
11815 (UV)ARG(REGNODE_p(ret)),
11816 (IV)ARG2L(REGNODE_p(ret))));
11817 RExC_seen |= REG_RECURSE_SEEN;
11819 Set_Node_Length(REGNODE_p(ret),
11820 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */
11821 Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */
11823 *flagp |= POSTPONED;
11824 assert(*RExC_parse == ')');
11825 nextchar(pRExC_state);
11830 case '?': /* (??...) */
11832 if (*RExC_parse != '{') {
11833 RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
11834 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
11836 "Sequence (%" UTF8f "...) not recognized",
11837 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
11838 NOT_REACHED; /*NOTREACHED*/
11840 *flagp |= POSTPONED;
11844 case '{': /* (?{...}) */
11847 struct reg_code_block *cb;
11850 RExC_seen_zerolen++;
11852 if ( !pRExC_state->code_blocks
11853 || pRExC_state->code_index
11854 >= pRExC_state->code_blocks->count
11855 || pRExC_state->code_blocks->cb[pRExC_state->code_index].start
11856 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
11859 if (RExC_pm_flags & PMf_USE_RE_EVAL)
11860 FAIL("panic: Sequence (?{...}): no code block found\n");
11861 FAIL("Eval-group not allowed at runtime, use re 'eval'");
11863 /* this is a pre-compiled code block (?{...}) */
11864 cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
11865 RExC_parse = RExC_start + cb->end;
11867 if (cb->src_regex) {
11868 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
11869 RExC_rxi->data->data[n] =
11870 (void*)SvREFCNT_inc((SV*)cb->src_regex);
11871 RExC_rxi->data->data[n+1] = (void*)o;
11874 n = add_data(pRExC_state,
11875 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
11876 RExC_rxi->data->data[n] = (void*)o;
11878 pRExC_state->code_index++;
11879 nextchar(pRExC_state);
11882 regnode_offset eval;
11883 ret = reg_node(pRExC_state, LOGICAL);
11885 eval = reg2Lanode(pRExC_state, EVAL,
11888 /* for later propagation into (??{})
11890 RExC_flags & RXf_PMf_COMPILETIME
11892 FLAGS(REGNODE_p(ret)) = 2;
11893 if (! REGTAIL(pRExC_state, ret, eval)) {
11894 REQUIRE_BRANCHJ(flagp, 0);
11896 /* deal with the length of this later - MJD */
11899 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
11900 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1);
11901 Set_Node_Offset(REGNODE_p(ret), parse_start);
11904 case '(': /* (?(?{...})...) and (?(?=...)...) */
11907 const int DEFINE_len = sizeof("DEFINE") - 1;
11908 if ( RExC_parse < RExC_end - 1
11909 && ( ( RExC_parse[0] == '?' /* (?(?...)) */
11910 && ( RExC_parse[1] == '='
11911 || RExC_parse[1] == '!'
11912 || RExC_parse[1] == '<'
11913 || RExC_parse[1] == '{'))
11914 || ( RExC_parse[0] == '*' /* (?(*...)) */
11915 && ( memBEGINs(RExC_parse + 1,
11916 (Size_t) (RExC_end - (RExC_parse + 1)),
11918 || memBEGINs(RExC_parse + 1,
11919 (Size_t) (RExC_end - (RExC_parse + 1)),
11921 || memBEGINs(RExC_parse + 1,
11922 (Size_t) (RExC_end - (RExC_parse + 1)),
11924 || memBEGINs(RExC_parse + 1,
11925 (Size_t) (RExC_end - (RExC_parse + 1)),
11927 || memBEGINs(RExC_parse + 1,
11928 (Size_t) (RExC_end - (RExC_parse + 1)),
11929 "positive_lookahead:")
11930 || memBEGINs(RExC_parse + 1,
11931 (Size_t) (RExC_end - (RExC_parse + 1)),
11932 "positive_lookbehind:")
11933 || memBEGINs(RExC_parse + 1,
11934 (Size_t) (RExC_end - (RExC_parse + 1)),
11935 "negative_lookahead:")
11936 || memBEGINs(RExC_parse + 1,
11937 (Size_t) (RExC_end - (RExC_parse + 1)),
11938 "negative_lookbehind:"))))
11939 ) { /* Lookahead or eval. */
11941 regnode_offset tail;
11943 ret = reg_node(pRExC_state, LOGICAL);
11944 FLAGS(REGNODE_p(ret)) = 1;
11946 tail = reg(pRExC_state, 1, &flag, depth+1);
11947 RETURN_FAIL_ON_RESTART(flag, flagp);
11948 if (! REGTAIL(pRExC_state, ret, tail)) {
11949 REQUIRE_BRANCHJ(flagp, 0);
11953 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
11954 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
11956 char ch = RExC_parse[0] == '<' ? '>' : '\'';
11957 char *name_start= RExC_parse++;
11959 SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
11960 if ( RExC_parse == name_start
11961 || RExC_parse >= RExC_end
11962 || *RExC_parse != ch)
11964 vFAIL2("Sequence (?(%c... not terminated",
11965 (ch == '>' ? '<' : ch));
11969 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11970 RExC_rxi->data->data[num]=(void*)sv_dat;
11971 SvREFCNT_inc_simple_void_NN(sv_dat);
11973 ret = reganode(pRExC_state, GROUPPN, num);
11974 goto insert_if_check_paren;
11976 else if (memBEGINs(RExC_parse,
11977 (STRLEN) (RExC_end - RExC_parse),
11980 ret = reganode(pRExC_state, DEFINEP, 0);
11981 RExC_parse += DEFINE_len;
11983 goto insert_if_check_paren;
11985 else if (RExC_parse[0] == 'R') {
11987 /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval"
11988 * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)"
11989 * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)"
11992 if (RExC_parse[0] == '0') {
11996 else if (inRANGE(RExC_parse[0], '1', '9')) {
11999 if (grok_atoUV(RExC_parse, &uv, &endptr)
12002 parno = (I32)uv + 1;
12003 RExC_parse = (char*)endptr;
12005 /* else "Switch condition not recognized" below */
12006 } else if (RExC_parse[0] == '&') {
12009 sv_dat = reg_scan_name(pRExC_state,
12010 REG_RSN_RETURN_DATA);
12012 parno = 1 + *((I32 *)SvPVX(sv_dat));
12014 ret = reganode(pRExC_state, INSUBP, parno);
12015 goto insert_if_check_paren;
12017 else if (inRANGE(RExC_parse[0], '1', '9')) {
12022 if (grok_atoUV(RExC_parse, &uv, &endptr)
12026 RExC_parse = (char*)endptr;
12029 vFAIL("panic: grok_atoUV returned FALSE");
12031 ret = reganode(pRExC_state, GROUPP, parno);
12033 insert_if_check_paren:
12034 if (UCHARAT(RExC_parse) != ')') {
12036 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12038 vFAIL("Switch condition not recognized");
12040 nextchar(pRExC_state);
12042 if (! REGTAIL(pRExC_state, ret, reganode(pRExC_state,
12045 REQUIRE_BRANCHJ(flagp, 0);
12047 br = regbranch(pRExC_state, &flags, 1, depth+1);
12049 RETURN_FAIL_ON_RESTART(flags,flagp);
12050 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12053 if (! REGTAIL(pRExC_state, br, reganode(pRExC_state,
12056 REQUIRE_BRANCHJ(flagp, 0);
12058 c = UCHARAT(RExC_parse);
12059 nextchar(pRExC_state);
12060 if (flags&HASWIDTH)
12061 *flagp |= HASWIDTH;
12064 vFAIL("(?(DEFINE)....) does not allow branches");
12066 /* Fake one for optimizer. */
12067 lastbr = reganode(pRExC_state, IFTHEN, 0);
12069 if (!regbranch(pRExC_state, &flags, 1, depth+1)) {
12070 RETURN_FAIL_ON_RESTART(flags, flagp);
12071 FAIL2("panic: regbranch returned failure, flags=%#" UVxf,
12074 if (! REGTAIL(pRExC_state, ret, lastbr)) {
12075 REQUIRE_BRANCHJ(flagp, 0);
12077 if (flags&HASWIDTH)
12078 *flagp |= HASWIDTH;
12079 c = UCHARAT(RExC_parse);
12080 nextchar(pRExC_state);
12085 if (RExC_parse >= RExC_end)
12086 vFAIL("Switch (?(condition)... not terminated");
12088 vFAIL("Switch (?(condition)... contains too many branches");
12090 ender = reg_node(pRExC_state, TAIL);
12091 if (! REGTAIL(pRExC_state, br, ender)) {
12092 REQUIRE_BRANCHJ(flagp, 0);
12095 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12096 REQUIRE_BRANCHJ(flagp, 0);
12098 if (! REGTAIL(pRExC_state,
12101 NEXTOPER(REGNODE_p(lastbr)))),
12104 REQUIRE_BRANCHJ(flagp, 0);
12108 if (! REGTAIL(pRExC_state, ret, ender)) {
12109 REQUIRE_BRANCHJ(flagp, 0);
12111 #if 0 /* Removing this doesn't cause failures in the test suite -- khw */
12112 RExC_size++; /* XXX WHY do we need this?!!
12113 For large programs it seems to be required
12114 but I can't figure out why. -- dmq*/
12119 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
12121 vFAIL("Unknown switch condition (?(...))");
12123 case '[': /* (?[ ... ]) */
12124 return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
12126 case 0: /* A NUL */
12127 RExC_parse--; /* for vFAIL to print correctly */
12128 vFAIL("Sequence (? incomplete");
12132 if (RExC_strict) { /* [perl #132851] */
12133 ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
12136 case '*': /* If you want to support (?*...), first reconcile with GH #17363 */
12138 default: /* e.g., (?i) */
12139 RExC_parse = (char *) seqstart + 1;
12141 parse_lparen_question_flags(pRExC_state);
12142 if (UCHARAT(RExC_parse) != ':') {
12143 if (RExC_parse < RExC_end)
12144 nextchar(pRExC_state);
12149 nextchar(pRExC_state);
12154 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
12158 if (! ALL_PARENS_COUNTED) {
12159 /* If we are in our first pass through (and maybe only pass),
12160 * we need to allocate memory for the capturing parentheses
12164 if (!RExC_parens_buf_size) {
12165 /* first guess at number of parens we might encounter */
12166 RExC_parens_buf_size = 10;
12168 /* setup RExC_open_parens, which holds the address of each
12169 * OPEN tag, and to make things simpler for the 0 index the
12170 * start of the program - this is used later for offsets */
12171 Newxz(RExC_open_parens, RExC_parens_buf_size,
12173 RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
12175 /* setup RExC_close_parens, which holds the address of each
12176 * CLOSE tag, and to make things simpler for the 0 index
12177 * the end of the program - this is used later for offsets
12179 Newxz(RExC_close_parens, RExC_parens_buf_size,
12181 /* we dont know where end op starts yet, so we dont need to
12182 * set RExC_close_parens[0] like we do RExC_open_parens[0]
12185 else if (RExC_npar > RExC_parens_buf_size) {
12186 I32 old_size = RExC_parens_buf_size;
12188 RExC_parens_buf_size *= 2;
12190 Renew(RExC_open_parens, RExC_parens_buf_size,
12192 Zero(RExC_open_parens + old_size,
12193 RExC_parens_buf_size - old_size, regnode_offset);
12195 Renew(RExC_close_parens, RExC_parens_buf_size,
12197 Zero(RExC_close_parens + old_size,
12198 RExC_parens_buf_size - old_size, regnode_offset);
12202 ret = reganode(pRExC_state, OPEN, parno);
12203 if (!RExC_nestroot)
12204 RExC_nestroot = parno;
12205 if (RExC_open_parens && !RExC_open_parens[parno])
12207 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12208 "%*s%*s Setting open paren #%" IVdf " to %zu\n",
12209 22, "| |", (int)(depth * 2 + 1), "",
12211 RExC_open_parens[parno]= ret;
12214 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
12215 Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
12218 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
12227 /* Pick up the branches, linking them together. */
12228 parse_start = RExC_parse; /* MJD */
12229 br = regbranch(pRExC_state, &flags, 1, depth+1);
12231 /* branch_len = (paren != 0); */
12234 RETURN_FAIL_ON_RESTART(flags, flagp);
12235 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12237 if (*RExC_parse == '|') {
12238 if (RExC_use_BRANCHJ) {
12239 reginsert(pRExC_state, BRANCHJ, br, depth+1);
12242 reginsert(pRExC_state, BRANCH, br, depth+1);
12243 Set_Node_Length(REGNODE_p(br), paren != 0);
12244 Set_Node_Offset_To_R(br, parse_start-RExC_start);
12248 else if (paren == ':') {
12249 *flagp |= flags&SIMPLE;
12251 if (is_open) { /* Starts with OPEN. */
12252 if (! REGTAIL(pRExC_state, ret, br)) { /* OPEN -> first. */
12253 REQUIRE_BRANCHJ(flagp, 0);
12256 else if (paren != '?') /* Not Conditional */
12258 *flagp |= flags & (HASWIDTH | POSTPONED);
12260 while (*RExC_parse == '|') {
12261 if (RExC_use_BRANCHJ) {
12264 ender = reganode(pRExC_state, LONGJMP, 0);
12266 /* Append to the previous. */
12267 shut_gcc_up = REGTAIL(pRExC_state,
12268 REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
12270 PERL_UNUSED_VAR(shut_gcc_up);
12272 nextchar(pRExC_state);
12273 if (freeze_paren) {
12274 if (RExC_npar > after_freeze)
12275 after_freeze = RExC_npar;
12276 RExC_npar = freeze_paren;
12278 br = regbranch(pRExC_state, &flags, 0, depth+1);
12281 RETURN_FAIL_ON_RESTART(flags, flagp);
12282 FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
12284 if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
12285 REQUIRE_BRANCHJ(flagp, 0);
12288 *flagp |= flags & (HASWIDTH | POSTPONED);
12291 if (have_branch || paren != ':') {
12294 /* Make a closing node, and hook it on the end. */
12297 ender = reg_node(pRExC_state, TAIL);
12300 ender = reganode(pRExC_state, CLOSE, parno);
12301 if ( RExC_close_parens ) {
12302 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12303 "%*s%*s Setting close paren #%" IVdf " to %zu\n",
12304 22, "| |", (int)(depth * 2 + 1), "",
12305 (IV)parno, ender));
12306 RExC_close_parens[parno]= ender;
12307 if (RExC_nestroot == parno)
12310 Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */
12311 Set_Node_Length(REGNODE_p(ender), 1); /* MJD */
12314 ender = reg_node(pRExC_state, SRCLOSE);
12315 RExC_in_script_run = 0;
12325 *flagp &= ~HASWIDTH;
12327 case 't': /* aTomic */
12329 ender = reg_node(pRExC_state, SUCCEED);
12332 ender = reg_node(pRExC_state, END);
12333 assert(!RExC_end_op); /* there can only be one! */
12334 RExC_end_op = REGNODE_p(ender);
12335 if (RExC_close_parens) {
12336 DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
12337 "%*s%*s Setting close paren #0 (END) to %zu\n",
12338 22, "| |", (int)(depth * 2 + 1), "",
12341 RExC_close_parens[0]= ender;
12346 DEBUG_PARSE_MSG("lsbr");
12347 regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
12348 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
12349 Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12350 SvPV_nolen_const(RExC_mysv1),
12352 SvPV_nolen_const(RExC_mysv2),
12354 (IV)(ender - lastbr)
12357 if (! REGTAIL(pRExC_state, lastbr, ender)) {
12358 REQUIRE_BRANCHJ(flagp, 0);
12362 char is_nothing= 1;
12364 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
12366 /* Hook the tails of the branches to the closing node. */
12367 for (br = REGNODE_p(ret); br; br = regnext(br)) {
12368 const U8 op = PL_regkind[OP(br)];
12369 if (op == BRANCH) {
12370 if (! REGTAIL_STUDY(pRExC_state,
12371 REGNODE_OFFSET(NEXTOPER(br)),
12374 REQUIRE_BRANCHJ(flagp, 0);
12376 if ( OP(NEXTOPER(br)) != NOTHING
12377 || regnext(NEXTOPER(br)) != REGNODE_p(ender))
12380 else if (op == BRANCHJ) {
12381 bool shut_gcc_up = REGTAIL_STUDY(pRExC_state,
12382 REGNODE_OFFSET(NEXTOPER(NEXTOPER(br))),
12384 PERL_UNUSED_VAR(shut_gcc_up);
12385 /* for now we always disable this optimisation * /
12386 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
12387 || regnext(NEXTOPER(NEXTOPER(br))) != REGNODE_p(ender))
12393 regnode * ret_as_regnode = REGNODE_p(ret);
12394 br= PL_regkind[OP(ret_as_regnode)] != BRANCH
12395 ? regnext(ret_as_regnode)
12398 DEBUG_PARSE_MSG("NADA");
12399 regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
12400 NULL, pRExC_state);
12401 regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender),
12402 NULL, pRExC_state);
12403 Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
12404 SvPV_nolen_const(RExC_mysv1),
12405 (IV)REG_NODE_NUM(ret_as_regnode),
12406 SvPV_nolen_const(RExC_mysv2),
12412 if (OP(REGNODE_p(ender)) == TAIL) {
12414 RExC_emit= REGNODE_OFFSET(br) + 1;
12417 for ( opt= br + 1; opt < REGNODE_p(ender) ; opt++ )
12418 OP(opt)= OPTIMIZED;
12419 NEXT_OFF(br)= REGNODE_p(ender) - br;
12427 /* Even/odd or x=don't care: 010101x10x */
12428 static const char parens[] = "=!aA<,>Bbt";
12429 /* flag below is set to 0 up through 'A'; 1 for larger */
12431 if (paren && (p = strchr(parens, paren))) {
12432 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
12433 int flag = (p - parens) > 3;
12435 if (paren == '>' || paren == 't') {
12436 node = SUSPEND, flag = 0;
12439 reginsert(pRExC_state, node, ret, depth+1);
12440 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12441 Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
12442 FLAGS(REGNODE_p(ret)) = flag;
12443 if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
12445 REQUIRE_BRANCHJ(flagp, 0);
12450 /* Check for proper termination. */
12452 /* restore original flags, but keep (?p) and, if we've encountered
12453 * something in the parse that changes /d rules into /u, keep the /u */
12454 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
12455 if (DEPENDS_SEMANTICS && toUSE_UNI_CHARSET_NOT_DEPENDS) {
12456 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
12458 if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
12459 RExC_parse = oregcomp_parse;
12460 vFAIL("Unmatched (");
12462 nextchar(pRExC_state);
12464 else if (!paren && RExC_parse < RExC_end) {
12465 if (*RExC_parse == ')') {
12467 vFAIL("Unmatched )");
12470 FAIL("Junk on end of regexp"); /* "Can't happen". */
12471 NOT_REACHED; /* NOTREACHED */
12474 if (RExC_in_lookbehind) {
12475 RExC_in_lookbehind--;
12477 if (RExC_in_lookahead) {
12478 RExC_in_lookahead--;
12480 if (after_freeze > RExC_npar)
12481 RExC_npar = after_freeze;
12486 - regbranch - one alternative of an | operator
12488 * Implements the concatenation operator.
12490 * On success, returns the offset at which any next node should be placed into
12491 * the regex engine program being compiled.
12493 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
12494 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
12497 STATIC regnode_offset
12498 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
12500 regnode_offset ret;
12501 regnode_offset chain = 0;
12502 regnode_offset latest;
12503 I32 flags = 0, c = 0;
12504 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12506 PERL_ARGS_ASSERT_REGBRANCH;
12508 DEBUG_PARSE("brnc");
12513 if (RExC_use_BRANCHJ)
12514 ret = reganode(pRExC_state, BRANCHJ, 0);
12516 ret = reg_node(pRExC_state, BRANCH);
12517 Set_Node_Length(REGNODE_p(ret), 1);
12521 *flagp = 0; /* Initialize. */
12523 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
12524 FALSE /* Don't force to /x */ );
12525 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
12526 flags &= ~TRYAGAIN;
12527 latest = regpiece(pRExC_state, &flags, depth+1);
12529 if (flags & TRYAGAIN)
12531 RETURN_FAIL_ON_RESTART(flags, flagp);
12532 FAIL2("panic: regpiece returned failure, flags=%#" UVxf, (UV) flags);
12536 *flagp |= flags&(HASWIDTH|POSTPONED);
12538 /* FIXME adding one for every branch after the first is probably
12539 * excessive now we have TRIE support. (hv) */
12541 if (! REGTAIL(pRExC_state, chain, latest)) {
12542 /* XXX We could just redo this branch, but figuring out what
12543 * bookkeeping needs to be reset is a pain, and it's likely
12544 * that other branches that goto END will also be too large */
12545 REQUIRE_BRANCHJ(flagp, 0);
12551 if (chain == 0) { /* Loop ran zero times. */
12552 chain = reg_node(pRExC_state, NOTHING);
12557 *flagp |= flags&SIMPLE;
12564 - regpiece - something followed by possible quantifier * + ? {n,m}
12566 * Note that the branching code sequences used for ? and the general cases
12567 * of * and + are somewhat optimized: they use the same NOTHING node as
12568 * both the endmarker for their branch list and the body of the last branch.
12569 * It might seem that this node could be dispensed with entirely, but the
12570 * endmarker role is not redundant.
12572 * On success, returns the offset at which any next node should be placed into
12573 * the regex engine program being compiled.
12575 * Returns 0 otherwise, with *flagp set to indicate why:
12576 * TRYAGAIN if regatom() returns 0 with TRYAGAIN.
12577 * RESTART_PARSE if the parse needs to be restarted, or'd with
12578 * NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
12580 STATIC regnode_offset
12581 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
12583 regnode_offset ret;
12587 const char * const origparse = RExC_parse;
12589 I32 max = REG_INFTY;
12590 #ifdef RE_TRACK_PATTERN_OFFSETS
12593 const char *maxpos = NULL;
12596 /* Save the original in case we change the emitted regop to a FAIL. */
12597 const regnode_offset orig_emit = RExC_emit;
12599 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12601 PERL_ARGS_ASSERT_REGPIECE;
12603 DEBUG_PARSE("piec");
12605 ret = regatom(pRExC_state, &flags, depth+1);
12607 RETURN_FAIL_ON_RESTART_OR_FLAGS(flags, flagp, TRYAGAIN);
12608 FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags);
12611 #ifdef RE_TRACK_PATTERN_OFFSETS
12612 parse_start = RExC_parse;
12619 nextchar(pRExC_state);
12624 nextchar(pRExC_state);
12629 nextchar(pRExC_state);
12633 case '{': /* A '{' may or may not indicate a quantifier; call regcurly()
12634 to determine which */
12635 if (regcurly(RExC_parse)) {
12636 const char* endptr;
12638 /* Here is a quantifier, parse for min and max values */
12640 next = RExC_parse + 1;
12641 while (isDIGIT(*next) || *next == ',') {
12642 if (*next == ',') {
12651 assert(*next == '}');
12656 if (isDIGIT(*RExC_parse)) {
12658 if (!grok_atoUV(RExC_parse, &uv, &endptr))
12659 vFAIL("Invalid quantifier in {,}");
12660 if (uv >= REG_INFTY)
12661 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12666 if (*maxpos == ',')
12669 maxpos = RExC_parse;
12670 if (isDIGIT(*maxpos)) {
12672 if (!grok_atoUV(maxpos, &uv, &endptr))
12673 vFAIL("Invalid quantifier in {,}");
12674 if (uv >= REG_INFTY)
12675 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
12678 max = REG_INFTY; /* meaning "infinity" */
12682 nextchar(pRExC_state);
12683 if (max < min) { /* If can't match, warn and optimize to fail
12685 reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
12686 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
12687 NEXT_OFF(REGNODE_p(orig_emit)) =
12688 regarglen[OPFAIL] + NODE_STEP_REGNODE;
12691 else if (min == max && *RExC_parse == '?')
12693 ckWARN2reg(RExC_parse + 1,
12694 "Useless use of greediness modifier '%c'",
12699 } /* End of is regcurly() */
12701 /* Here was a '{', but what followed it didn't form a quantifier. */
12707 NOT_REACHED; /*NOTREACHED*/
12710 /* Here we have a quantifier, and have calculated 'min' and 'max'.
12712 * Check and possibly adjust a zero width operand */
12713 if (! (flags & (HASWIDTH|POSTPONED))) {
12714 if (max > REG_INFTY/3) {
12715 if (origparse[0] == '\\' && origparse[1] == 'K') {
12717 "%" UTF8f " is forbidden - matches null string"
12719 UTF8fARG(UTF, (RExC_parse >= origparse
12720 ? RExC_parse - origparse
12724 ckWARN2reg(RExC_parse,
12725 "%" UTF8f " matches null string many times",
12726 UTF8fARG(UTF, (RExC_parse >= origparse
12727 ? RExC_parse - origparse
12733 /* There's no point in trying to match something 0 length more than
12734 * once except for extra side effects, which we don't have here since
12744 /* If this is a code block pass it up */
12745 *flagp |= (flags & POSTPONED);
12748 *flagp |= (flags & HASWIDTH);
12749 if (max == REG_INFTY)
12750 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
12753 /* 'SIMPLE' operands don't require full generality */
12754 if ((flags&SIMPLE)) {
12755 if (max == REG_INFTY) {
12757 if (UNLIKELY(RExC_pm_flags & PMf_WILDCARD)) {
12758 goto min0_maxINF_wildcard_forbidden;
12761 reginsert(pRExC_state, STAR, ret, depth+1);
12765 else if (min == 1) {
12766 reginsert(pRExC_state, PLUS, ret, depth+1);
12772 /* Here, SIMPLE, but not the '*' and '+' special cases */
12774 MARK_NAUGHTY_EXP(2, 2);
12775 reginsert(pRExC_state, CURLY, ret, depth+1);
12776 Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */
12777 Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
12779 else { /* not SIMPLE */
12780 const regnode_offset w = reg_node(pRExC_state, WHILEM);
12782 FLAGS(REGNODE_p(w)) = 0;
12783 if (! REGTAIL(pRExC_state, ret, w)) {
12784 REQUIRE_BRANCHJ(flagp, 0);
12786 if (RExC_use_BRANCHJ) {
12787 reginsert(pRExC_state, LONGJMP, ret, depth+1);
12788 reginsert(pRExC_state, NOTHING, ret, depth+1);
12789 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
12791 reginsert(pRExC_state, CURLYX, ret, depth+1);
12793 Set_Node_Offset(REGNODE_p(ret), parse_start+1);
12794 Set_Node_Length(REGNODE_p(ret),
12795 op == '{' ? (RExC_parse - parse_start) : 1);
12797 if (RExC_use_BRANCHJ)
12798 NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
12800 if (! REGTAIL(pRExC_state, ret, reg_node(pRExC_state,
12803 REQUIRE_BRANCHJ(flagp, 0);
12805 RExC_whilem_seen++;
12806 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
12809 /* Finish up the CURLY/CURLYX case */
12810 FLAGS(REGNODE_p(ret)) = 0;
12812 ARG1_SET(REGNODE_p(ret), (U16)min);
12813 ARG2_SET(REGNODE_p(ret), (U16)max);
12817 /* Process any greediness modifiers */
12818 if (*RExC_parse == '?') {
12819 nextchar(pRExC_state);
12820 reginsert(pRExC_state, MINMOD, ret, depth+1);
12821 if (! REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE)) {
12822 REQUIRE_BRANCHJ(flagp, 0);
12825 else if (*RExC_parse == '+') {
12826 regnode_offset ender;
12827 nextchar(pRExC_state);
12828 ender = reg_node(pRExC_state, SUCCEED);
12829 if (! REGTAIL(pRExC_state, ret, ender)) {
12830 REQUIRE_BRANCHJ(flagp, 0);
12832 reginsert(pRExC_state, SUSPEND, ret, depth+1);
12833 ender = reg_node(pRExC_state, TAIL);
12834 if (! REGTAIL(pRExC_state, ret, ender)) {
12835 REQUIRE_BRANCHJ(flagp, 0);
12839 /* Forbid extra quantifiers */
12840 if (ISMULT2(RExC_parse)) {
12842 vFAIL("Nested quantifiers");
12847 min0_maxINF_wildcard_forbidden:
12849 /* Here we are in a wildcard match, and the minimum match length is 0, and
12850 * the max could be infinity. This is currently forbidden. The only
12851 * reason is to make it harder to write patterns that take a long long time
12852 * to halt, and because the use of this construct isn't necessary in
12853 * matching Unicode property values */
12855 /* diag_listed_as: Use of %s is not allowed in Unicode property wildcard
12856 subpatterns in regex; marked by <-- HERE in m/%s/
12858 vFAIL("Use of quantifier '*' is not allowed in Unicode property wildcard"
12861 /* Note, don't need to worry about the input being '{0,}', as a '}' isn't
12862 * legal at all in wildcards, so can't get this far */
12864 NOT_REACHED; /*NOTREACHED*/
12868 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
12869 regnode_offset * node_p,
12877 /* This routine teases apart the various meanings of \N and returns
12878 * accordingly. The input parameters constrain which meaning(s) is/are valid
12879 * in the current context.
12881 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
12883 * If <code_point_p> is not NULL, the context is expecting the result to be a
12884 * single code point. If this \N instance turns out to a single code point,
12885 * the function returns TRUE and sets *code_point_p to that code point.
12887 * If <node_p> is not NULL, the context is expecting the result to be one of
12888 * the things representable by a regnode. If this \N instance turns out to be
12889 * one such, the function generates the regnode, returns TRUE and sets *node_p
12890 * to point to the offset of that regnode into the regex engine program being
12893 * If this instance of \N isn't legal in any context, this function will
12894 * generate a fatal error and not return.
12896 * On input, RExC_parse should point to the first char following the \N at the
12897 * time of the call. On successful return, RExC_parse will have been updated
12898 * to point to just after the sequence identified by this routine. Also
12899 * *flagp has been updated as needed.
12901 * When there is some problem with the current context and this \N instance,
12902 * the function returns FALSE, without advancing RExC_parse, nor setting
12903 * *node_p, nor *code_point_p, nor *flagp.
12905 * If <cp_count> is not NULL, the caller wants to know the length (in code
12906 * points) that this \N sequence matches. This is set, and the input is
12907 * parsed for errors, even if the function returns FALSE, as detailed below.
12909 * There are 6 possibilities here, as detailed in the next 6 paragraphs.
12911 * Probably the most common case is for the \N to specify a single code point.
12912 * *cp_count will be set to 1, and *code_point_p will be set to that code
12915 * Another possibility is for the input to be an empty \N{}. This is no
12916 * longer accepted, and will generate a fatal error.
12918 * Another possibility is for a custom charnames handler to be in effect which
12919 * translates the input name to an empty string. *cp_count will be set to 0.
12920 * *node_p will be set to a generated NOTHING node.
12922 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
12923 * set to 0. *node_p will be set to a generated REG_ANY node.
12925 * The fifth possibility is that \N resolves to a sequence of more than one
12926 * code points. *cp_count will be set to the number of code points in the
12927 * sequence. *node_p will be set to a generated node returned by this
12928 * function calling S_reg().
12930 * The final possibility is that it is premature to be calling this function;
12931 * the parse needs to be restarted. This can happen when this changes from
12932 * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
12933 * latter occurs only when the fifth possibility would otherwise be in
12934 * effect, and is because one of those code points requires the pattern to be
12935 * recompiled as UTF-8. The function returns FALSE, and sets the
12936 * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
12937 * happens, the caller needs to desist from continuing parsing, and return
12938 * this information to its caller. This is not set for when there is only one
12939 * code point, as this can be called as part of an ANYOF node, and they can
12940 * store above-Latin1 code points without the pattern having to be in UTF-8.
12942 * For non-single-quoted regexes, the tokenizer has resolved character and
12943 * sequence names inside \N{...} into their Unicode values, normalizing the
12944 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
12945 * hex-represented code points in the sequence. This is done there because
12946 * the names can vary based on what charnames pragma is in scope at the time,
12947 * so we need a way to take a snapshot of what they resolve to at the time of
12948 * the original parse. [perl #56444].
12950 * That parsing is skipped for single-quoted regexes, so here we may get
12951 * '\N{NAME}', which is parsed now. If the single-quoted regex is something
12952 * like '\N{U+41}', that code point is Unicode, and has to be translated into
12953 * the native character set for non-ASCII platforms. The other possibilities
12954 * are already native, so no translation is done. */
12956 char * endbrace; /* points to '}' following the name */
12957 char* p = RExC_parse; /* Temporary */
12959 SV * substitute_parse = NULL;
12964 DECLARE_AND_GET_RE_DEBUG_FLAGS;
12966 PERL_ARGS_ASSERT_GROK_BSLASH_N;
12968 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
12969 assert(! (node_p && cp_count)); /* At most 1 should be set */
12971 if (cp_count) { /* Initialize return for the most common case */
12975 /* The [^\n] meaning of \N ignores spaces and comments under the /x
12976 * modifier. The other meanings do not, so use a temporary until we find
12977 * out which we are being called with */
12978 skip_to_be_ignored_text(pRExC_state, &p,
12979 FALSE /* Don't force to /x */ );
12981 /* Disambiguate between \N meaning a named character versus \N meaning
12982 * [^\n]. The latter is assumed when the {...} following the \N is a legal
12983 * quantifier, or if there is no '{' at all */
12984 if (*p != '{' || regcurly(p)) {
12994 *node_p = reg_node(pRExC_state, REG_ANY);
12995 *flagp |= HASWIDTH|SIMPLE;
12997 Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */
13001 /* The test above made sure that the next real character is a '{', but
13002 * under the /x modifier, it could be separated by space (or a comment and
13003 * \n) and this is not allowed (for consistency with \x{...} and the
13004 * tokenizer handling of \N{NAME}). */
13005 if (*RExC_parse != '{') {
13006 vFAIL("Missing braces on \\N{}");
13009 RExC_parse++; /* Skip past the '{' */
13011 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13012 if (! endbrace) { /* no trailing brace */
13013 vFAIL2("Missing right brace on \\%c{}", 'N');
13016 /* Here, we have decided it should be a named character or sequence. These
13017 * imply Unicode semantics */
13018 REQUIRE_UNI_RULES(flagp, FALSE);
13020 /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
13021 * nothing at all (not allowed under strict) */
13022 if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
13023 RExC_parse = endbrace;
13025 RExC_parse++; /* Position after the "}" */
13026 vFAIL("Zero length \\N{}");
13032 nextchar(pRExC_state);
13037 *node_p = reg_node(pRExC_state, NOTHING);
13041 if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
13043 /* Here, the name isn't of the form U+.... This can happen if the
13044 * pattern is single-quoted, so didn't get evaluated in toke.c. Now
13045 * is the time to find out what the name means */
13047 const STRLEN name_len = endbrace - RExC_parse;
13048 SV * value_sv; /* What does this name evaluate to */
13050 const U8 * value; /* string of name's value */
13051 STRLEN value_len; /* and its length */
13053 /* RExC_unlexed_names is a hash of names that weren't evaluated by
13054 * toke.c, and their values. Make sure is initialized */
13055 if (! RExC_unlexed_names) {
13056 RExC_unlexed_names = newHV();
13059 /* If we have already seen this name in this pattern, use that. This
13060 * allows us to only call the charnames handler once per name per
13061 * pattern. A broken or malicious handler could return something
13062 * different each time, which could cause the results to vary depending
13063 * on if something gets added or subtracted from the pattern that
13064 * causes the number of passes to change, for example */
13065 if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
13068 value_sv = *value_svp;
13070 else { /* Otherwise we have to go out and get the name */
13071 const char * error_msg = NULL;
13072 value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
13076 RExC_parse = endbrace;
13080 /* If no error message, should have gotten a valid return */
13083 /* Save the name's meaning for later use */
13084 if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
13087 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
13091 /* Here, we have the value the name evaluates to in 'value_sv' */
13092 value = (U8 *) SvPV(value_sv, value_len);
13094 /* See if the result is one code point vs 0 or multiple */
13095 if (inRANGE(value_len, 1, ((UV) SvUTF8(value_sv)
13099 /* Here, exactly one code point. If that isn't what is wanted,
13101 if (! code_point_p) {
13106 /* Convert from string to numeric code point */
13107 *code_point_p = (SvUTF8(value_sv))
13108 ? valid_utf8_to_uvchr(value, NULL)
13111 /* Have parsed this entire single code point \N{...}. *cp_count
13112 * has already been set to 1, so don't do it again. */
13113 RExC_parse = endbrace;
13114 nextchar(pRExC_state);
13116 } /* End of is a single code point */
13118 /* Count the code points, if caller desires. The API says to do this
13119 * even if we will later return FALSE */
13123 *cp_count = (SvUTF8(value_sv))
13124 ? utf8_length(value, value + value_len)
13128 /* Fail if caller doesn't want to handle a multi-code-point sequence.
13129 * But don't back the pointer up if the caller wants to know how many
13130 * code points there are (they need to handle it themselves in this
13139 /* Convert this to a sub-pattern of the form "(?: ... )", and then call
13140 * reg recursively to parse it. That way, it retains its atomicness,
13141 * while not having to worry about any special handling that some code
13142 * points may have. */
13144 substitute_parse = newSVpvs("?:");
13145 sv_catsv(substitute_parse, value_sv);
13146 sv_catpv(substitute_parse, ")");
13148 /* The value should already be native, so no need to convert on EBCDIC
13150 assert(! RExC_recode_x_to_native);
13153 else { /* \N{U+...} */
13154 Size_t count = 0; /* code point count kept internally */
13156 /* We can get to here when the input is \N{U+...} or when toke.c has
13157 * converted a name to the \N{U+...} form. This include changing a
13158 * name that evaluates to multiple code points to \N{U+c1.c2.c3 ...} */
13160 RExC_parse += 2; /* Skip past the 'U+' */
13162 /* Code points are separated by dots. The '}' terminates the whole
13165 do { /* Loop until the ending brace */
13166 I32 flags = PERL_SCAN_SILENT_OVERFLOW
13167 | PERL_SCAN_SILENT_ILLDIGIT
13168 | PERL_SCAN_NOTIFY_ILLDIGIT
13169 | PERL_SCAN_ALLOW_MEDIAL_UNDERSCORES
13170 | PERL_SCAN_DISALLOW_PREFIX;
13171 STRLEN len = endbrace - RExC_parse;
13173 char * start_digit = RExC_parse;
13174 UV cp = grok_hex(RExC_parse, &len, &flags, &overflow_value);
13179 vFAIL("Invalid hexadecimal number in \\N{U+...}");
13184 if (cp > MAX_LEGAL_CP) {
13185 vFAIL(form_cp_too_large_msg(16, start_digit, len, 0));
13188 if (RExC_parse >= endbrace) { /* Got to the closing '}' */
13193 /* Here, is a single code point; fail if doesn't want that */
13194 if (! code_point_p) {
13199 /* A single code point is easy to handle; just return it */
13200 *code_point_p = UNI_TO_NATIVE(cp);
13201 RExC_parse = endbrace;
13202 nextchar(pRExC_state);
13206 /* Here, the parse stopped bfore the ending brace. This is legal
13207 * only if that character is a dot separating code points, like a
13208 * multiple character sequence (of the form "\N{U+c1.c2. ... }".
13209 * So the next character must be a dot (and the one after that
13210 * can't be the endbrace, or we'd have something like \N{U+100.} )
13212 if (*RExC_parse != '.' || RExC_parse + 1 >= endbrace) {
13213 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
13214 ? UTF8SKIP(RExC_parse)
13216 RExC_parse = MIN(endbrace, RExC_parse);/* Guard against
13221 /* Here, looks like its really a multiple character sequence. Fail
13222 * if that's not what the caller wants. But continue with counting
13223 * and error checking if they still want a count */
13224 if (! node_p && ! cp_count) {
13228 /* What is done here is to convert this to a sub-pattern of the
13229 * form \x{char1}\x{char2}... and then call reg recursively to
13230 * parse it (enclosing in "(?: ... )" ). That way, it retains its
13231 * atomicness, while not having to worry about special handling
13232 * that some code points may have. We don't create a subpattern,
13233 * but go through the motions of code point counting and error
13234 * checking, if the caller doesn't want a node returned. */
13236 if (node_p && ! substitute_parse) {
13237 substitute_parse = newSVpvs("?:");
13243 /* Convert to notation the rest of the code understands */
13244 sv_catpvs(substitute_parse, "\\x{");
13245 sv_catpvn(substitute_parse, start_digit,
13246 RExC_parse - start_digit);
13247 sv_catpvs(substitute_parse, "}");
13250 /* Move to after the dot (or ending brace the final time through.)
13255 } while (RExC_parse < endbrace);
13257 if (! node_p) { /* Doesn't want the node */
13264 sv_catpvs(substitute_parse, ")");
13266 /* The values are Unicode, and therefore have to be converted to native
13267 * on a non-Unicode (meaning non-ASCII) platform. */
13268 SET_recode_x_to_native(1);
13271 /* Here, we have the string the name evaluates to, ready to be parsed,
13272 * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
13273 * constructs. This can be called from within a substitute parse already.
13274 * The error reporting mechanism doesn't work for 2 levels of this, but the
13275 * code above has validated this new construct, so there should be no
13276 * errors generated by the below. And this isn' an exact copy, so the
13277 * mechanism to seamlessly deal with this won't work, so turn off warnings
13279 save_start = RExC_start;
13280 orig_end = RExC_end;
13282 RExC_parse = RExC_start = SvPVX(substitute_parse);
13283 RExC_end = RExC_parse + SvCUR(substitute_parse);
13284 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
13286 *node_p = reg(pRExC_state, 1, &flags, depth+1);
13288 /* Restore the saved values */
13290 RExC_start = save_start;
13291 RExC_parse = endbrace;
13292 RExC_end = orig_end;
13293 SET_recode_x_to_native(0);
13295 SvREFCNT_dec_NN(substitute_parse);
13298 RETURN_FAIL_ON_RESTART(flags, flagp);
13299 FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
13302 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13304 nextchar(pRExC_state);
13311 S_compute_EXACTish(RExC_state_t *pRExC_state)
13315 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
13323 op = get_regex_charset(RExC_flags);
13324 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
13325 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
13326 been, so there is no hole */
13329 return op + EXACTF;
13333 S_new_regcurly(const char *s, const char *e)
13335 /* This is a temporary function designed to match the most lenient form of
13336 * a {m,n} quantifier we ever envision, with either number omitted, and
13337 * spaces anywhere between/before/after them.
13339 * If this function fails, then the string it matches is very unlikely to
13340 * ever be considered a valid quantifier, so we can allow the '{' that
13341 * begins it to be considered as a literal */
13343 bool has_min = FALSE;
13344 bool has_max = FALSE;
13346 PERL_ARGS_ASSERT_NEW_REGCURLY;
13348 if (s >= e || *s++ != '{')
13351 while (s < e && isSPACE(*s)) {
13354 while (s < e && isDIGIT(*s)) {
13358 while (s < e && isSPACE(*s)) {
13364 while (s < e && isSPACE(*s)) {
13367 while (s < e && isDIGIT(*s)) {
13371 while (s < e && isSPACE(*s)) {
13376 return s < e && *s == '}' && (has_min || has_max);
13379 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
13380 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
13383 S_backref_value(char *p, char *e)
13385 const char* endptr = e;
13387 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
13394 - regatom - the lowest level
13396 Try to identify anything special at the start of the current parse position.
13397 If there is, then handle it as required. This may involve generating a
13398 single regop, such as for an assertion; or it may involve recursing, such as
13399 to handle a () structure.
13401 If the string doesn't start with something special then we gobble up
13402 as much literal text as we can. If we encounter a quantifier, we have to
13403 back off the final literal character, as that quantifier applies to just it
13404 and not to the whole string of literals.
13406 Once we have been able to handle whatever type of thing started the
13407 sequence, we return the offset into the regex engine program being compiled
13408 at which any next regnode should be placed.
13410 Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
13411 Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
13412 restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
13413 Otherwise does not return 0.
13415 Note: we have to be careful with escapes, as they can be both literal
13416 and special, and in the case of \10 and friends, context determines which.
13418 A summary of the code structure is:
13420 switch (first_byte) {
13421 cases for each special:
13422 handle this special;
13425 switch (2nd byte) {
13426 cases for each unambiguous special:
13427 handle this special;
13429 cases for each ambigous special/literal:
13431 if (special) handle here
13433 default: // unambiguously literal:
13436 default: // is a literal char
13439 create EXACTish node for literal;
13440 while (more input and node isn't full) {
13441 switch (input_byte) {
13442 cases for each special;
13443 make sure parse pointer is set so that the next call to
13444 regatom will see this special first
13445 goto loopdone; // EXACTish node terminated by prev. char
13447 append char to EXACTISH node;
13449 get next input byte;
13453 return the generated node;
13455 Specifically there are two separate switches for handling
13456 escape sequences, with the one for handling literal escapes requiring
13457 a dummy entry for all of the special escapes that are actually handled
13462 STATIC regnode_offset
13463 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
13465 regnode_offset ret = 0;
13471 DECLARE_AND_GET_RE_DEBUG_FLAGS;
13473 *flagp = 0; /* Initialize. */
13475 DEBUG_PARSE("atom");
13477 PERL_ARGS_ASSERT_REGATOM;
13480 parse_start = RExC_parse;
13481 assert(RExC_parse < RExC_end);
13482 switch ((U8)*RExC_parse) {
13484 RExC_seen_zerolen++;
13485 nextchar(pRExC_state);
13486 if (RExC_flags & RXf_PMf_MULTILINE)
13487 ret = reg_node(pRExC_state, MBOL);
13489 ret = reg_node(pRExC_state, SBOL);
13490 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13493 nextchar(pRExC_state);
13495 RExC_seen_zerolen++;
13496 if (RExC_flags & RXf_PMf_MULTILINE)
13497 ret = reg_node(pRExC_state, MEOL);
13499 ret = reg_node(pRExC_state, SEOL);
13500 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13503 nextchar(pRExC_state);
13504 if (RExC_flags & RXf_PMf_SINGLELINE)
13505 ret = reg_node(pRExC_state, SANY);
13507 ret = reg_node(pRExC_state, REG_ANY);
13508 *flagp |= HASWIDTH|SIMPLE;
13510 Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
13514 char * const oregcomp_parse = ++RExC_parse;
13515 ret = regclass(pRExC_state, flagp, depth+1,
13516 FALSE, /* means parse the whole char class */
13517 TRUE, /* allow multi-char folds */
13518 FALSE, /* don't silence non-portable warnings. */
13519 (bool) RExC_strict,
13520 TRUE, /* Allow an optimized regnode result */
13523 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13524 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13527 if (*RExC_parse != ']') {
13528 RExC_parse = oregcomp_parse;
13529 vFAIL("Unmatched [");
13531 nextchar(pRExC_state);
13532 Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */
13536 nextchar(pRExC_state);
13537 ret = reg(pRExC_state, 2, &flags, depth+1);
13539 if (flags & TRYAGAIN) {
13540 if (RExC_parse >= RExC_end) {
13541 /* Make parent create an empty node if needed. */
13542 *flagp |= TRYAGAIN;
13547 RETURN_FAIL_ON_RESTART(flags, flagp);
13548 FAIL2("panic: reg returned failure to regatom, flags=%#" UVxf,
13551 *flagp |= flags&(HASWIDTH|SIMPLE|POSTPONED);
13555 if (flags & TRYAGAIN) {
13556 *flagp |= TRYAGAIN;
13559 vFAIL("Internal urp");
13560 /* Supposed to be caught earlier. */
13566 vFAIL("Quantifier follows nothing");
13571 This switch handles escape sequences that resolve to some kind
13572 of special regop and not to literal text. Escape sequences that
13573 resolve to literal text are handled below in the switch marked
13576 Every entry in this switch *must* have a corresponding entry
13577 in the literal escape switch. However, the opposite is not
13578 required, as the default for this switch is to jump to the
13579 literal text handling code.
13582 switch ((U8)*RExC_parse) {
13583 /* Special Escapes */
13585 RExC_seen_zerolen++;
13586 /* Under wildcards, this is changed to match \n; should be
13587 * invisible to the user, as they have to compile under /m */
13588 if (RExC_pm_flags & PMf_WILDCARD) {
13589 ret = reg_node(pRExC_state, MBOL);
13592 ret = reg_node(pRExC_state, SBOL);
13593 /* SBOL is shared with /^/ so we set the flags so we can tell
13594 * /\A/ from /^/ in split. */
13595 FLAGS(REGNODE_p(ret)) = 1;
13597 goto finish_meta_pat;
13599 if (RExC_pm_flags & PMf_WILDCARD) {
13601 /* diag_listed_as: Use of %s is not allowed in Unicode property
13602 wildcard subpatterns in regex; marked by <-- HERE in m/%s/
13604 vFAIL("Use of '\\G' is not allowed in Unicode property"
13605 " wildcard subpatterns");
13607 ret = reg_node(pRExC_state, GPOS);
13608 RExC_seen |= REG_GPOS_SEEN;
13609 goto finish_meta_pat;
13611 if (!RExC_in_lookbehind && !RExC_in_lookahead) {
13612 RExC_seen_zerolen++;
13613 ret = reg_node(pRExC_state, KEEPS);
13614 /* XXX:dmq : disabling in-place substitution seems to
13615 * be necessary here to avoid cases of memory corruption, as
13616 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
13618 RExC_seen |= REG_LOOKBEHIND_SEEN;
13619 goto finish_meta_pat;
13622 ++RExC_parse; /* advance past the 'K' */
13623 vFAIL("\\K not permitted in lookahead/lookbehind");
13626 if (RExC_pm_flags & PMf_WILDCARD) {
13627 /* See comment under \A above */
13628 ret = reg_node(pRExC_state, MEOL);
13631 ret = reg_node(pRExC_state, SEOL);
13633 RExC_seen_zerolen++; /* Do not optimize RE away */
13634 goto finish_meta_pat;
13636 if (RExC_pm_flags & PMf_WILDCARD) {
13637 /* See comment under \A above */
13638 ret = reg_node(pRExC_state, MEOL);
13641 ret = reg_node(pRExC_state, EOS);
13643 RExC_seen_zerolen++; /* Do not optimize RE away */
13644 goto finish_meta_pat;
13646 vFAIL("\\C no longer supported");
13648 ret = reg_node(pRExC_state, CLUMP);
13649 *flagp |= HASWIDTH;
13650 goto finish_meta_pat;
13658 regex_charset charset = get_regex_charset(RExC_flags);
13660 RExC_seen_zerolen++;
13661 RExC_seen |= REG_LOOKBEHIND_SEEN;
13662 op = BOUND + charset;
13664 if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
13665 flags = TRADITIONAL_BOUND;
13666 if (op > BOUNDA) { /* /aa is same as /a */
13672 char name = *RExC_parse;
13673 char * endbrace = NULL;
13675 endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
13678 vFAIL2("Missing right brace on \\%c{}", name);
13680 /* XXX Need to decide whether to take spaces or not. Should be
13681 * consistent with \p{}, but that currently is SPACE, which
13682 * means vertical too, which seems wrong
13683 * while (isBLANK(*RExC_parse)) {
13686 if (endbrace == RExC_parse) {
13687 RExC_parse++; /* After the '}' */
13688 vFAIL2("Empty \\%c{}", name);
13690 length = endbrace - RExC_parse;
13691 /*while (isBLANK(*(RExC_parse + length - 1))) {
13694 switch (*RExC_parse) {
13697 && (memNEs(RExC_parse + 1, length - 1, "cb")))
13699 goto bad_bound_type;
13704 if (length != 2 || *(RExC_parse + 1) != 'b') {
13705 goto bad_bound_type;
13710 if (length != 2 || *(RExC_parse + 1) != 'b') {
13711 goto bad_bound_type;
13716 if (length != 2 || *(RExC_parse + 1) != 'b') {
13717 goto bad_bound_type;
13723 RExC_parse = endbrace;
13725 "'%" UTF8f "' is an unknown bound type",
13726 UTF8fARG(UTF, length, endbrace - length));
13727 NOT_REACHED; /*NOTREACHED*/
13729 RExC_parse = endbrace;
13730 REQUIRE_UNI_RULES(flagp, 0);
13735 else if (op >= BOUNDA) { /* /aa is same as /a */
13739 /* Don't have to worry about UTF-8, in this message because
13740 * to get here the contents of the \b must be ASCII */
13741 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
13742 "Using /u for '%.*s' instead of /%s",
13744 endbrace - length + 1,
13745 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
13746 ? ASCII_RESTRICT_PAT_MODS
13747 : ASCII_MORE_RESTRICT_PAT_MODS);
13752 RExC_seen_d_op = TRUE;
13754 else if (op == BOUNDL) {
13755 RExC_contains_locale = 1;
13759 op += NBOUND - BOUND;
13762 ret = reg_node(pRExC_state, op);
13763 FLAGS(REGNODE_p(ret)) = flags;
13765 goto finish_meta_pat;
13769 ret = reg_node(pRExC_state, LNBREAK);
13770 *flagp |= HASWIDTH|SIMPLE;
13771 goto finish_meta_pat;
13785 /* These all have the same meaning inside [brackets], and it knows
13786 * how to do the best optimizations for them. So, pretend we found
13787 * these within brackets, and let it do the work */
13790 ret = regclass(pRExC_state, flagp, depth+1,
13791 TRUE, /* means just parse this element */
13792 FALSE, /* don't allow multi-char folds */
13793 FALSE, /* don't silence non-portable warnings. It
13794 would be a bug if these returned
13796 (bool) RExC_strict,
13797 TRUE, /* Allow an optimized regnode result */
13799 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13800 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
13801 * multi-char folds are allowed. */
13803 FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
13806 RExC_parse--; /* regclass() leaves this one too far ahead */
13809 /* The escapes above that don't take a parameter can't be
13810 * followed by a '{'. But 'pX', 'p{foo}' and
13811 * correspondingly 'P' can be */
13812 if ( RExC_parse - parse_start == 1
13813 && UCHARAT(RExC_parse + 1) == '{'
13814 && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
13817 vFAIL("Unescaped left brace in regex is illegal here");
13819 Set_Node_Offset(REGNODE_p(ret), parse_start);
13820 Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */
13821 nextchar(pRExC_state);
13824 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
13825 * \N{...} evaluates to a sequence of more than one code points).
13826 * The function call below returns a regnode, which is our result.
13827 * The parameters cause it to fail if the \N{} evaluates to a
13828 * single code point; we handle those like any other literal. The
13829 * reason that the multicharacter case is handled here and not as
13830 * part of the EXACtish code is because of quantifiers. In
13831 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
13832 * this way makes that Just Happen. dmq.
13833 * join_exact() will join this up with adjacent EXACTish nodes
13834 * later on, if appropriate. */
13836 if (grok_bslash_N(pRExC_state,
13837 &ret, /* Want a regnode returned */
13838 NULL, /* Fail if evaluates to a single code
13840 NULL, /* Don't need a count of how many code
13849 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
13851 /* Here, evaluates to a single code point. Go get that */
13852 RExC_parse = parse_start;
13855 case 'k': /* Handle \k<NAME> and \k'NAME' */
13859 if ( RExC_parse >= RExC_end - 1
13860 || (( ch = RExC_parse[1]) != '<'
13865 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
13866 vFAIL2("Sequence %.2s... not terminated", parse_start);
13869 ret = handle_named_backref(pRExC_state,
13881 case '1': case '2': case '3': case '4':
13882 case '5': case '6': case '7': case '8': case '9':
13887 if (*RExC_parse == 'g') {
13891 if (*RExC_parse == '{') {
13895 if (*RExC_parse == '-') {
13899 if (hasbrace && !isDIGIT(*RExC_parse)) {
13900 if (isrel) RExC_parse--;
13902 goto parse_named_seq;
13905 if (RExC_parse >= RExC_end) {
13906 goto unterminated_g;
13908 num = S_backref_value(RExC_parse, RExC_end);
13910 vFAIL("Reference to invalid group 0");
13911 else if (num == I32_MAX) {
13912 if (isDIGIT(*RExC_parse))
13913 vFAIL("Reference to nonexistent group");
13916 vFAIL("Unterminated \\g... pattern");
13920 num = RExC_npar - num;
13922 vFAIL("Reference to nonexistent or unclosed group");
13926 num = S_backref_value(RExC_parse, RExC_end);
13927 /* bare \NNN might be backref or octal - if it is larger
13928 * than or equal RExC_npar then it is assumed to be an
13929 * octal escape. Note RExC_npar is +1 from the actual
13930 * number of parens. */
13931 /* Note we do NOT check if num == I32_MAX here, as that is
13932 * handled by the RExC_npar check */
13935 /* any numeric escape < 10 is always a backref */
13937 /* any numeric escape < RExC_npar is a backref */
13938 && num >= RExC_npar
13939 /* cannot be an octal escape if it starts with [89] */
13940 && ! inRANGE(*RExC_parse, '8', '9')
13942 /* Probably not meant to be a backref, instead likely
13943 * to be an octal character escape, e.g. \35 or \777.
13944 * The above logic should make it obvious why using
13945 * octal escapes in patterns is problematic. - Yves */
13946 RExC_parse = parse_start;
13951 /* At this point RExC_parse points at a numeric escape like
13952 * \12 or \88 or something similar, which we should NOT treat
13953 * as an octal escape. It may or may not be a valid backref
13954 * escape. For instance \88888888 is unlikely to be a valid
13956 while (isDIGIT(*RExC_parse))
13959 if (*RExC_parse != '}')
13960 vFAIL("Unterminated \\g{...} pattern");
13963 if (num >= (I32)RExC_npar) {
13965 /* It might be a forward reference; we can't fail until we
13966 * know, by completing the parse to get all the groups, and
13967 * then reparsing */
13968 if (ALL_PARENS_COUNTED) {
13969 if (num >= RExC_total_parens) {
13970 vFAIL("Reference to nonexistent group");
13974 REQUIRE_PARENS_PASS;
13978 ret = reganode(pRExC_state,
13981 : (ASCII_FOLD_RESTRICTED)
13983 : (AT_LEAST_UNI_SEMANTICS)
13989 if (OP(REGNODE_p(ret)) == REFF) {
13990 RExC_seen_d_op = TRUE;
13992 *flagp |= HASWIDTH;
13994 /* override incorrect value set in reganode MJD */
13995 Set_Node_Offset(REGNODE_p(ret), parse_start);
13996 Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1);
13997 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
13998 FALSE /* Don't force to /x */ );
14002 if (RExC_parse >= RExC_end)
14003 FAIL("Trailing \\");
14006 /* Do not generate "unrecognized" warnings here, we fall
14007 back into the quick-grab loop below */
14008 RExC_parse = parse_start;
14010 } /* end of switch on a \foo sequence */
14015 /* '#' comments should have been spaced over before this function was
14017 assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
14019 if (RExC_flags & RXf_PMf_EXTENDED) {
14020 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
14021 if (RExC_parse < RExC_end)
14031 /* Here, we have determined that the next thing is probably a
14032 * literal character. RExC_parse points to the first byte of its
14033 * definition. (It still may be an escape sequence that evaluates
14034 * to a single character) */
14039 char *s, *old_s = NULL, *old_old_s = NULL;
14041 U32 max_string_len = 255;
14043 /* We may have to reparse the node, artificially stopping filling
14044 * it early, based on info gleaned in the first parse. This
14045 * variable gives where we stop. Make it above the normal stopping
14046 * place first time through; otherwise it would stop too early */
14047 U32 upper_fill = max_string_len + 1;
14049 /* We start out as an EXACT node, even if under /i, until we find a
14050 * character which is in a fold. The algorithm now segregates into
14051 * separate nodes, characters that fold from those that don't under
14052 * /i. (This hopefully will create nodes that are fixed strings
14053 * even under /i, giving the optimizer something to grab on to.)
14054 * So, if a node has something in it and the next character is in
14055 * the opposite category, that node is closed up, and the function
14056 * returns. Then regatom is called again, and a new node is
14057 * created for the new category. */
14058 U8 node_type = EXACT;
14060 /* Assume the node will be fully used; the excess is given back at
14061 * the end. Under /i, we may need to temporarily add the fold of
14062 * an extra character or two at the end to check for splitting
14063 * multi-char folds, so allocate extra space for that. We can't
14064 * make any other length assumptions, as a byte input sequence
14065 * could shrink down. */
14066 Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
14070 ? UTF8_MAXBYTES_CASE
14071 /* Max non-UTF-8 expansion is 2 */ : 2)));
14073 bool next_is_quantifier;
14074 char * oldp = NULL;
14076 /* We can convert EXACTF nodes to EXACTFU if they contain only
14077 * characters that match identically regardless of the target
14078 * string's UTF8ness. The reason to do this is that EXACTF is not
14079 * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
14082 * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
14083 * contain only above-Latin1 characters (hence must be in UTF8),
14084 * which don't participate in folds with Latin1-range characters,
14085 * as the latter's folds aren't known until runtime. */
14086 bool maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14088 /* Single-character EXACTish nodes are almost always SIMPLE. This
14089 * allows us to override this as encountered */
14090 U8 maybe_SIMPLE = SIMPLE;
14092 /* Does this node contain something that can't match unless the
14093 * target string is (also) in UTF-8 */
14094 bool requires_utf8_target = FALSE;
14096 /* The sequence 'ss' is problematic in non-UTF-8 patterns. */
14097 bool has_ss = FALSE;
14099 /* So is the MICRO SIGN */
14100 bool has_micro_sign = FALSE;
14102 /* Set when we fill up the current node and there is still more
14103 * text to process */
14106 /* Allocate an EXACT node. The node_type may change below to
14107 * another EXACTish node, but since the size of the node doesn't
14108 * change, it works */
14109 ret = regnode_guts(pRExC_state, node_type, current_string_nodes,
14111 FILL_NODE(ret, node_type);
14114 s = STRING(REGNODE_p(ret));
14125 maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
14126 maybe_SIMPLE = SIMPLE;
14127 requires_utf8_target = FALSE;
14129 has_micro_sign = FALSE;
14133 /* This breaks under rare circumstances. If folding, we do not
14134 * want to split a node at a character that is a non-final in a
14135 * multi-char fold, as an input string could just happen to want to
14136 * match across the node boundary. The code at the end of the loop
14137 * looks for this, and backs off until it finds not such a
14138 * character, but it is possible (though extremely, extremely
14139 * unlikely) for all characters in the node to be non-final fold
14140 * ones, in which case we just leave the node fully filled, and
14141 * hope that it doesn't match the string in just the wrong place */
14143 assert( ! UTF /* Is at the beginning of a character */
14144 || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
14145 || UTF8_IS_START(UCHARAT(RExC_parse)));
14147 overflowed = FALSE;
14149 /* Here, we have a literal character. Find the maximal string of
14150 * them in the input that we can fit into a single EXACTish node.
14151 * We quit at the first non-literal or when the node gets full, or
14152 * under /i the categorization of folding/non-folding character
14154 while (p < RExC_end && len < upper_fill) {
14156 /* In most cases each iteration adds one byte to the output.
14157 * The exceptions override this */
14158 Size_t added_len = 1;
14164 /* White space has already been ignored */
14165 assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
14166 || ! is_PATWS_safe((p), RExC_end, UTF));
14169 const char* message;
14182 /* Literal Escapes Switch
14184 This switch is meant to handle escape sequences that
14185 resolve to a literal character.
14187 Every escape sequence that represents something
14188 else, like an assertion or a char class, is handled
14189 in the switch marked 'Special Escapes' above in this
14190 routine, but also has an entry here as anything that
14191 isn't explicitly mentioned here will be treated as
14192 an unescaped equivalent literal.
14195 switch ((U8)*++p) {
14197 /* These are all the special escapes. */
14198 case 'A': /* Start assertion */
14199 case 'b': case 'B': /* Word-boundary assertion*/
14200 case 'C': /* Single char !DANGEROUS! */
14201 case 'd': case 'D': /* digit class */
14202 case 'g': case 'G': /* generic-backref, pos assertion */
14203 case 'h': case 'H': /* HORIZWS */
14204 case 'k': case 'K': /* named backref, keep marker */
14205 case 'p': case 'P': /* Unicode property */
14206 case 'R': /* LNBREAK */
14207 case 's': case 'S': /* space class */
14208 case 'v': case 'V': /* VERTWS */
14209 case 'w': case 'W': /* word class */
14210 case 'X': /* eXtended Unicode "combining
14211 character sequence" */
14212 case 'z': case 'Z': /* End of line/string assertion */
14216 /* Anything after here is an escape that resolves to a
14217 literal. (Except digits, which may or may not)
14223 case 'N': /* Handle a single-code point named character. */
14224 RExC_parse = p + 1;
14225 if (! grok_bslash_N(pRExC_state,
14226 NULL, /* Fail if evaluates to
14227 anything other than a
14228 single code point */
14229 &ender, /* The returned single code
14231 NULL, /* Don't need a count of
14232 how many code points */
14237 if (*flagp & NEED_UTF8)
14238 FAIL("panic: grok_bslash_N set NEED_UTF8");
14239 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
14241 /* Here, it wasn't a single code point. Go close
14242 * up this EXACTish node. The switch() prior to
14243 * this switch handles the other cases */
14244 RExC_parse = p = oldp;
14248 RExC_parse = parse_start;
14250 /* The \N{} means the pattern, if previously /d,
14251 * becomes /u. That means it can't be an EXACTF node,
14252 * but an EXACTFU */
14253 if (node_type == EXACTF) {
14254 node_type = EXACTFU;
14256 /* If the node already contains something that
14257 * differs between EXACTF and EXACTFU, reparse it
14259 if (! maybe_exactfu) {
14280 ender = ESC_NATIVE;
14288 if (! grok_bslash_o(&p,
14293 (bool) RExC_strict,
14294 FALSE, /* No illegal cp's */
14297 RExC_parse = p; /* going to die anyway; point to
14298 exact spot of failure */
14302 if (message && TO_OUTPUT_WARNINGS(p)) {
14303 warn_non_literal_string(p, packed_warn, message);
14307 if (! grok_bslash_x(&p,
14312 (bool) RExC_strict,
14313 FALSE, /* No illegal cp's */
14316 RExC_parse = p; /* going to die anyway; point
14317 to exact spot of failure */
14321 if (message && TO_OUTPUT_WARNINGS(p)) {
14322 warn_non_literal_string(p, packed_warn, message);
14326 if (ender < 0x100) {
14327 if (RExC_recode_x_to_native) {
14328 ender = LATIN1_TO_NATIVE(ender);
14335 if (! grok_bslash_c(*p, &grok_c_char,
14336 &message, &packed_warn))
14338 /* going to die anyway; point to exact spot of
14340 RExC_parse = p + ((UTF)
14341 ? UTF8_SAFE_SKIP(p, RExC_end)
14346 ender = grok_c_char;
14348 if (message && TO_OUTPUT_WARNINGS(p)) {
14349 warn_non_literal_string(p, packed_warn, message);
14353 case '8': case '9': /* must be a backreference */
14355 /* we have an escape like \8 which cannot be an octal escape
14356 * so we exit the loop, and let the outer loop handle this
14357 * escape which may or may not be a legitimate backref. */
14359 case '1': case '2': case '3':case '4':
14360 case '5': case '6': case '7':
14361 /* When we parse backslash escapes there is ambiguity
14362 * between backreferences and octal escapes. Any escape
14363 * from \1 - \9 is a backreference, any multi-digit
14364 * escape which does not start with 0 and which when
14365 * evaluated as decimal could refer to an already
14366 * parsed capture buffer is a back reference. Anything
14369 * Note this implies that \118 could be interpreted as
14370 * 118 OR as "\11" . "8" depending on whether there
14371 * were 118 capture buffers defined already in the
14374 /* NOTE, RExC_npar is 1 more than the actual number of
14375 * parens we have seen so far, hence the "<" as opposed
14377 if ( !isDIGIT(p[1]) || S_backref_value(p, RExC_end) < RExC_npar)
14378 { /* Not to be treated as an octal constant, go
14386 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
14387 | PERL_SCAN_NOTIFY_ILLDIGIT;
14389 ender = grok_oct(p, &numlen, &flags, NULL);
14391 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
14392 && isDIGIT(*p) /* like \08, \178 */
14393 && ckWARN(WARN_REGEXP))
14395 reg_warn_non_literal_string(
14397 form_alien_digit_msg(8, numlen, p,
14398 RExC_end, UTF, FALSE));
14404 FAIL("Trailing \\");
14407 if (isALPHANUMERIC(*p)) {
14408 /* An alpha followed by '{' is going to fail next
14409 * iteration, so don't output this warning in that
14411 if (! isALPHA(*p) || *(p + 1) != '{') {
14412 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
14413 " passed through", p);
14416 goto normal_default;
14417 } /* End of switch on '\' */
14420 /* Trying to gain new uses for '{' without breaking too
14421 * much existing code is hard. The solution currently
14423 * 1) If there is no ambiguity that a '{' should always
14424 * be taken literally, at the start of a construct, we
14426 * 2) If the literal '{' conflicts with our desired use
14427 * of it as a metacharacter, we die. The deprecation
14428 * cycles for this have come and gone.
14429 * 3) If there is ambiguity, we raise a simple warning.
14430 * This could happen, for example, if the user
14431 * intended it to introduce a quantifier, but slightly
14432 * misspelled the quantifier. Without this warning,
14433 * the quantifier would silently be taken as a literal
14434 * string of characters instead of a meta construct */
14435 if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
14437 || ( p > parse_start + 1
14438 && isALPHA_A(*(p - 1))
14439 && *(p - 2) == '\\')
14440 || new_regcurly(p, RExC_end))
14442 RExC_parse = p + 1;
14443 vFAIL("Unescaped left brace in regex is "
14446 ckWARNreg(p + 1, "Unescaped left brace in regex is"
14447 " passed through");
14449 goto normal_default;
14452 if (p > RExC_parse && RExC_strict) {
14453 ckWARN2reg(p + 1, "Unescaped literal '%c'", *p);
14456 default: /* A literal character */
14458 if (! UTF8_IS_INVARIANT(*p) && UTF) {
14460 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
14461 &numlen, UTF8_ALLOW_DEFAULT);
14467 } /* End of switch on the literal */
14469 /* Here, have looked at the literal character, and <ender>
14470 * contains its ordinal; <p> points to the character after it.
14474 REQUIRE_UTF8(flagp);
14475 if ( UNICODE_IS_PERL_EXTENDED(ender)
14476 && TO_OUTPUT_WARNINGS(p))
14478 ckWARN2_non_literal_string(p,
14479 packWARN(WARN_PORTABLE),
14480 PL_extended_cp_format,
14485 /* We need to check if the next non-ignored thing is a
14486 * quantifier. Move <p> to after anything that should be
14487 * ignored, which, as a side effect, positions <p> for the next
14488 * loop iteration */
14489 skip_to_be_ignored_text(pRExC_state, &p,
14490 FALSE /* Don't force to /x */ );
14492 /* If the next thing is a quantifier, it applies to this
14493 * character only, which means that this character has to be in
14494 * its own node and can't just be appended to the string in an
14495 * existing node, so if there are already other characters in
14496 * the node, close the node with just them, and set up to do
14497 * this character again next time through, when it will be the
14498 * only thing in its new node */
14500 next_is_quantifier = LIKELY(p < RExC_end)
14501 && UNLIKELY(ISMULT2(p));
14503 if (next_is_quantifier && LIKELY(len)) {
14508 /* Ready to add 'ender' to the node */
14510 if (! FOLD) { /* The simple case, just append the literal */
14513 /* Don't output if it would overflow */
14514 if (UNLIKELY(len > max_string_len - ((UTF)
14515 ? UVCHR_SKIP(ender)
14522 if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
14523 *(s++) = (char) ender;
14526 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
14527 added_len = (char *) new_s - s;
14528 s = (char *) new_s;
14531 requires_utf8_target = TRUE;
14535 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
14537 /* Here are folding under /l, and the code point is
14538 * problematic. If this is the first character in the
14539 * node, change the node type to folding. Otherwise, if
14540 * this is the first problematic character, close up the
14541 * existing node, so can start a new node with this one */
14543 node_type = EXACTFL;
14544 RExC_contains_locale = 1;
14546 else if (node_type == EXACT) {
14551 /* This problematic code point means we can't simplify
14553 maybe_exactfu = FALSE;
14555 /* Although these two characters have folds that are
14556 * locale-problematic, they also have folds to above Latin1
14557 * that aren't a problem. Doing these now helps at
14559 if (UNLIKELY( ender == GREEK_CAPITAL_LETTER_MU
14560 || ender == LATIN_CAPITAL_LETTER_SHARP_S))
14565 /* Here, we are adding a problematic fold character.
14566 * "Problematic" in this context means that its fold isn't
14567 * known until runtime. (The non-problematic code points
14568 * are the above-Latin1 ones that fold to also all
14569 * above-Latin1. Their folds don't vary no matter what the
14570 * locale is.) But here we have characters whose fold
14571 * depends on the locale. We just add in the unfolded
14572 * character, and wait until runtime to fold it */
14573 goto not_fold_common;
14575 else /* regular fold; see if actually is in a fold */
14576 if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
14578 && ! _invlist_contains_cp(PL_in_some_fold, ender)))
14580 /* Here, folding, but the character isn't in a fold.
14582 * Start a new node if previous characters in the node were
14584 if (len && node_type != EXACT) {
14589 /* Here, continuing a node with non-folded characters. Add
14591 goto not_fold_common;
14593 else { /* Here, does participate in some fold */
14595 /* If this is the first character in the node, change its
14596 * type to folding. Otherwise, if this is the first
14597 * folding character in the node, close up the existing
14598 * node, so can start a new node with this one. */
14600 node_type = compute_EXACTish(pRExC_state);
14602 else if (node_type == EXACT) {
14607 if (UTF) { /* Alway use the folded value for UTF-8
14609 if (UVCHR_IS_INVARIANT(ender)) {
14610 if (UNLIKELY(len + 1 > max_string_len)) {
14615 *(s)++ = (U8) toFOLD(ender);
14621 folded = _to_uni_fold_flags(
14623 (U8 *) s, /* We have allocated extra space
14624 in 's' so can't run off the
14628 | (( ASCII_FOLD_RESTRICTED
14629 || node_type == EXACTFL)
14630 ? FOLD_FLAGS_NOMIX_ASCII
14632 if (UNLIKELY(len + added_len > max_string_len)) {
14640 && LIKELY(folded != GREEK_SMALL_LETTER_MU))
14642 /* U+B5 folds to the MU, so its possible for a
14643 * non-UTF-8 target to match it */
14644 requires_utf8_target = TRUE;
14648 else { /* Here is non-UTF8. */
14650 /* The fold will be one or (rarely) two characters.
14651 * Check that there's room for at least a single one
14652 * before setting any flags, etc. Because otherwise an
14653 * overflowing character could cause a flag to be set
14654 * even though it doesn't end up in this node. (For
14655 * the two character fold, we check again, before
14656 * setting any flags) */
14657 if (UNLIKELY(len + 1 > max_string_len)) {
14662 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
14663 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
14664 || UNICODE_DOT_DOT_VERSION > 0)
14666 /* On non-ancient Unicodes, check for the only possible
14667 * multi-char fold */
14668 if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
14670 /* This potential multi-char fold means the node
14671 * can't be simple (because it could match more
14672 * than a single char). And in some cases it will
14673 * match 'ss', so set that flag */
14677 /* It can't change to be an EXACTFU (unless already
14678 * is one). We fold it iff under /u rules. */
14679 if (node_type != EXACTFU) {
14680 maybe_exactfu = FALSE;
14683 if (UNLIKELY(len + 2 > max_string_len)) {
14692 goto done_with_this_char;
14695 else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
14697 && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
14699 /* Also, the sequence 'ss' is special when not
14700 * under /u. If the target string is UTF-8, it
14701 * should match SHARP S; otherwise it won't. So,
14702 * here we have to exclude the possibility of this
14703 * node moving to /u.*/
14705 maybe_exactfu = FALSE;
14708 /* Here, the fold will be a single character */
14710 if (UNLIKELY(ender == MICRO_SIGN)) {
14711 has_micro_sign = TRUE;
14713 else if (PL_fold[ender] != PL_fold_latin1[ender]) {
14715 /* If the character's fold differs between /d and
14716 * /u, this can't change to be an EXACTFU node */
14717 maybe_exactfu = FALSE;
14720 *(s++) = (DEPENDS_SEMANTICS)
14721 ? (char) toFOLD(ender)
14723 /* Under /u, the fold of any character in
14724 * the 0-255 range happens to be its
14725 * lowercase equivalent, except for LATIN
14726 * SMALL LETTER SHARP S, which was handled
14727 * above, and the MICRO SIGN, whose fold
14728 * requires UTF-8 to represent. */
14729 : (char) toLOWER_L1(ender);
14731 } /* End of adding current character to the node */
14733 done_with_this_char:
14737 if (next_is_quantifier) {
14739 /* Here, the next input is a quantifier, and to get here,
14740 * the current character is the only one in the node. */
14744 } /* End of loop through literal characters */
14746 /* Here we have either exhausted the input or run out of room in
14747 * the node. If the former, we are done. (If we encountered a
14748 * character that can't be in the node, transfer is made directly
14749 * to <loopdone>, and so we wouldn't have fallen off the end of the
14751 if (LIKELY(! overflowed)) {
14755 /* Here we have run out of room. We can grow plain EXACT and
14756 * LEXACT nodes. If the pattern is gigantic enough, though,
14757 * eventually we'll have to artificially chunk the pattern into
14758 * multiple nodes. */
14759 if (! LOC && (node_type == EXACT || node_type == LEXACT)) {
14760 Size_t overhead = 1 + regarglen[OP(REGNODE_p(ret))];
14761 Size_t overhead_expansion = 0;
14763 Size_t max_nodes_for_string;
14767 /* Here we couldn't fit the final character in the current
14768 * node, so it will have to be reparsed, no matter what else we
14772 /* If would have overflowed a regular EXACT node, switch
14773 * instead to an LEXACT. The code below is structured so that
14774 * the actual growing code is common to changing from an EXACT
14775 * or just increasing the LEXACT size. This means that we have
14776 * to save the string in the EXACT case before growing, and
14777 * then copy it afterwards to its new location */
14778 if (node_type == EXACT) {
14779 overhead_expansion = regarglen[LEXACT] - regarglen[EXACT];
14780 RExC_emit += overhead_expansion;
14781 Copy(s0, temp, len, char);
14784 /* Ready to grow. If it was a plain EXACT, the string was
14785 * saved, and the first few bytes of it overwritten by adding
14786 * an argument field. We assume, as we do elsewhere in this
14787 * file, that one byte of remaining input will translate into
14788 * one byte of output, and if that's too small, we grow again,
14789 * if too large the excess memory is freed at the end */
14791 max_nodes_for_string = U16_MAX - overhead - overhead_expansion;
14792 achievable = MIN(max_nodes_for_string,
14793 current_string_nodes + STR_SZ(RExC_end - p));
14794 delta = achievable - current_string_nodes;
14796 /* If there is just no more room, go finish up this chunk of
14802 change_engine_size(pRExC_state, delta + overhead_expansion);
14803 current_string_nodes += delta;
14805 = sizeof(struct regnode) * current_string_nodes;
14806 upper_fill = max_string_len + 1;
14808 /* If the length was small, we know this was originally an
14809 * EXACT node now converted to LEXACT, and the string has to be
14810 * restored. Otherwise the string was untouched. 260 is just
14811 * a number safely above 255 so don't have to worry about
14812 * getting it precise */
14814 node_type = LEXACT;
14815 FILL_NODE(ret, node_type);
14816 s0 = STRING(REGNODE_p(ret));
14817 Copy(temp, s0, len, char);
14821 goto continue_parse;
14824 bool splittable = FALSE;
14825 bool backed_up = FALSE;
14826 char * e; /* should this be U8? */
14827 char * s_start; /* should this be U8? */
14829 /* Here is /i. Running out of room creates a problem if we are
14830 * folding, and the split happens in the middle of a
14831 * multi-character fold, as a match that should have occurred,
14832 * won't, due to the way nodes are matched, and our artificial
14833 * boundary. So back off until we aren't splitting such a
14834 * fold. If there is no such place to back off to, we end up
14835 * taking the entire node as-is. This can happen if the node
14836 * consists entirely of 'f' or entirely of 's' characters (or
14837 * things that fold to them) as 'ff' and 'ss' are
14838 * multi-character folds.
14840 * The Unicode standard says that multi character folds consist
14841 * of either two or three characters. That means we would be
14842 * splitting one if the final character in the node is at the
14843 * beginning of either type, or is the second of a three
14847 * ender is the code point of the character that won't fit
14849 * s points to just beyond the final byte in the node.
14850 * It's where we would place ender if there were
14851 * room, and where in fact we do place ender's fold
14852 * in the code below, as we've over-allocated space
14853 * for s0 (hence s) to allow for this
14854 * e starts at 's' and advances as we append things.
14855 * old_s is the same as 's'. (If ender had fit, 's' would
14856 * have been advanced to beyond it).
14857 * old_old_s points to the beginning byte of the final
14858 * character in the node
14859 * p points to the beginning byte in the input of the
14860 * character beyond 'ender'.
14861 * oldp points to the beginning byte in the input of
14864 * In the case of /il, we haven't folded anything that could be
14865 * affected by the locale. That means only above-Latin1
14866 * characters that fold to other above-latin1 characters get
14867 * folded at compile time. To check where a good place to
14868 * split nodes is, everything in it will have to be folded.
14869 * The boolean 'maybe_exactfu' keeps track in /il if there are
14870 * any unfolded characters in the node. */
14871 bool need_to_fold_loc = LOC && ! maybe_exactfu;
14873 /* If we do need to fold the node, we need a place to store the
14874 * folded copy, and a way to map back to the unfolded original
14876 char * locfold_buf = NULL;
14877 Size_t * loc_correspondence = NULL;
14879 if (! need_to_fold_loc) { /* The normal case. Just
14880 initialize to the actual node */
14883 s = old_old_s; /* Point to the beginning of the final char
14884 that fits in the node */
14888 /* Here, we have filled a /il node, and there are unfolded
14889 * characters in it. If the runtime locale turns out to be
14890 * UTF-8, there are possible multi-character folds, just
14891 * like when not under /l. The node hence can't terminate
14892 * in the middle of such a fold. To determine this, we
14893 * have to create a folded copy of this node. That means
14894 * reparsing the node, folding everything assuming a UTF-8
14895 * locale. (If at runtime it isn't such a locale, the
14896 * actions here wouldn't have been necessary, but we have
14897 * to assume the worst case.) If we find we need to back
14898 * off the folded string, we do so, and then map that
14899 * position back to the original unfolded node, which then
14900 * gets output, truncated at that spot */
14902 char * redo_p = RExC_parse;
14906 /* Allow enough space assuming a single byte input folds to
14907 * a single byte output, plus assume that the two unparsed
14908 * characters (that we may need) fold to the largest number
14909 * of bytes possible, plus extra for one more worst case
14910 * scenario. In the loop below, if we start eating into
14911 * that final spare space, we enlarge this initial space */
14912 Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
14914 Newxz(locfold_buf, size, char);
14915 Newxz(loc_correspondence, size, Size_t);
14917 /* Redo this node's parse, folding into 'locfold_buf' */
14918 redo_p = RExC_parse;
14919 old_redo_e = redo_e = locfold_buf;
14920 while (redo_p <= oldp) {
14922 old_redo_e = redo_e;
14923 loc_correspondence[redo_e - locfold_buf]
14924 = redo_p - RExC_parse;
14929 (void) _to_utf8_fold_flags((U8 *) redo_p,
14934 redo_e += added_len;
14935 redo_p += UTF8SKIP(redo_p);
14939 /* Note that if this code is run on some ancient
14940 * Unicode versions, SHARP S doesn't fold to 'ss',
14941 * but rather than clutter the code with #ifdef's,
14942 * as is done above, we ignore that possibility.
14943 * This is ok because this code doesn't affect what
14944 * gets matched, but merely where the node gets
14946 if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
14947 *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
14957 /* If we're getting so close to the end that a
14958 * worst-case fold in the next character would cause us
14959 * to overflow, increase, assuming one byte output byte
14960 * per one byte input one, plus room for another worst
14962 if ( redo_p <= oldp
14963 && redo_e > locfold_buf + size
14964 - (UTF8_MAXBYTES_CASE + 1))
14966 Size_t new_size = size
14968 + UTF8_MAXBYTES_CASE + 1;
14969 Ptrdiff_t e_offset = redo_e - locfold_buf;
14971 Renew(locfold_buf, new_size, char);
14972 Renew(loc_correspondence, new_size, Size_t);
14975 redo_e = locfold_buf + e_offset;
14979 /* Set so that things are in terms of the folded, temporary
14982 s_start = locfold_buf;
14987 /* Here, we have 's', 's_start' and 'e' set up to point to the
14988 * input that goes into the node, folded.
14990 * If the final character of the node and the fold of ender
14991 * form the first two characters of a three character fold, we
14992 * need to peek ahead at the next (unparsed) character in the
14993 * input to determine if the three actually do form such a
14994 * fold. Just looking at that character is not generally
14995 * sufficient, as it could be, for example, an escape sequence
14996 * that evaluates to something else, and it needs to be folded.
14998 * khw originally thought to just go through the parse loop one
14999 * extra time, but that doesn't work easily as that iteration
15000 * could cause things to think that the parse is over and to
15001 * goto loopdone. The character could be a '$' for example, or
15002 * the character beyond could be a quantifier, and other
15003 * glitches as well.
15005 * The solution used here for peeking ahead is to look at that
15006 * next character. If it isn't ASCII punctuation, then it will
15007 * be something that would continue on in an EXACTish node if
15008 * there were space. We append the fold of it to s, having
15009 * reserved enough room in s0 for the purpose. If we can't
15010 * reasonably peek ahead, we instead assume the worst case:
15011 * that it is something that would form the completion of a
15014 * If we can't split between s and ender, we work backwards
15015 * character-by-character down to s0. At each current point
15016 * see if we are at the beginning of a multi-char fold. If so,
15017 * that means we would be splitting the fold across nodes, and
15018 * so we back up one and try again.
15020 * If we're not at the beginning, we still could be at the
15021 * final two characters of a (rare) three character fold. We
15022 * check if the sequence starting at the character before the
15023 * current position (and including the current and next
15024 * characters) is a three character fold. If not, the node can
15025 * be split here. If it is, we have to backup two characters
15028 * Otherwise, the node can be split at the current position.
15030 * The same logic is used for UTF-8 patterns and not */
15034 /* Append the fold of ender */
15035 (void) _to_uni_fold_flags(
15039 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15040 ? FOLD_FLAGS_NOMIX_ASCII
15044 /* 's' and the character folded to by ender may be the
15045 * first two of a three-character fold, in which case the
15046 * node should not be split here. That may mean examining
15047 * the so-far unparsed character starting at 'p'. But if
15048 * ender folded to more than one character, we already have
15049 * three characters to look at. Also, we first check if
15050 * the sequence consisting of s and the next character form
15051 * the first two of some three character fold. If not,
15052 * there's no need to peek ahead. */
15053 if ( added_len <= UTF8SKIP(e - added_len)
15054 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
15056 /* Here, the two do form the beginning of a potential
15057 * three character fold. The unexamined character may
15058 * or may not complete it. Peek at it. It might be
15059 * something that ends the node or an escape sequence,
15060 * in which case we don't know without a lot of work
15061 * what it evaluates to, so we have to assume the worst
15062 * case: that it does complete the fold, and so we
15063 * can't split here. All such instances will have
15064 * that character be an ASCII punctuation character,
15065 * like a backslash. So, for that case, backup one and
15066 * drop down to try at that position */
15068 s = (char *) utf8_hop_back((U8 *) s, -1,
15073 /* Here, since it's not punctuation, it must be a
15074 * real character, and we can append its fold to
15075 * 'e' (having deliberately reserved enough space
15076 * for this eventuality) and drop down to check if
15077 * the three actually do form a folded sequence */
15078 (void) _to_utf8_fold_flags(
15079 (U8 *) p, (U8 *) RExC_end,
15082 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
15083 ? FOLD_FLAGS_NOMIX_ASCII
15089 /* Here, we either have three characters available in
15090 * sequence starting at 's', or we have two characters and
15091 * know that the following one can't possibly be part of a
15092 * three character fold. We go through the node backwards
15093 * until we find a place where we can split it without
15094 * breaking apart a multi-character fold. At any given
15095 * point we have to worry about if such a fold begins at
15096 * the current 's', and also if a three-character fold
15097 * begins at s-1, (containing s and s+1). Splitting in
15098 * either case would break apart a fold */
15100 char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
15103 /* If is a multi-char fold, can't split here. Backup
15104 * one char and try again */
15105 if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
15111 /* If the two characters beginning at 's' are part of a
15112 * three character fold starting at the character
15113 * before s, we can't split either before or after s.
15114 * Backup two chars and try again */
15115 if ( LIKELY(s > s_start)
15116 && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
15119 s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
15124 /* Here there's no multi-char fold between s and the
15125 * next character following it. We can split */
15129 } while (s > s_start); /* End of loops backing up through the node */
15131 /* Here we either couldn't find a place to split the node,
15132 * or else we broke out of the loop setting 'splittable' to
15133 * true. In the latter case, the place to split is between
15134 * the first and second characters in the sequence starting
15140 else { /* Pattern not UTF-8 */
15141 if ( ender != LATIN_SMALL_LETTER_SHARP_S
15142 || ASCII_FOLD_RESTRICTED)
15144 assert( toLOWER_L1(ender) < 256 );
15145 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15153 && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
15160 if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
15161 || ASCII_FOLD_RESTRICTED)
15163 assert( toLOWER_L1(ender) < 256 );
15164 *e++ = (char)(toLOWER_L1(ender)); /* should e and the cast be U8? */
15174 if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
15180 if ( LIKELY(s > s_start)
15181 && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
15191 } while (s > s_start);
15198 /* Here, we are done backing up. If we didn't backup at all
15199 * (the likely case), just proceed */
15202 /* If we did find a place to split, reparse the entire node
15203 * stopping where we have calculated. */
15206 /* If we created a temporary folded string under /l, we
15207 * have to map that back to the original */
15208 if (need_to_fold_loc) {
15209 upper_fill = loc_correspondence[s - s_start];
15210 if (upper_fill == 0) {
15211 FAIL2("panic: loc_correspondence[%d] is 0",
15212 (int) (s - s_start));
15214 Safefree(locfold_buf);
15215 Safefree(loc_correspondence);
15218 upper_fill = s - s0;
15223 /* Here the node consists entirely of non-final multi-char
15224 * folds. (Likely it is all 'f's or all 's's.) There's no
15225 * decent place to split it, so give up and just take the
15230 if (need_to_fold_loc) {
15231 Safefree(locfold_buf);
15232 Safefree(loc_correspondence);
15234 } /* End of verifying node ends with an appropriate char */
15236 /* We need to start the next node at the character that didn't fit
15240 loopdone: /* Jumped to when encounters something that shouldn't be
15243 /* Free up any over-allocated space; cast is to silence bogus
15244 * warning in MS VC */
15245 change_engine_size(pRExC_state,
15246 - (Ptrdiff_t) (current_string_nodes - STR_SZ(len)));
15248 /* I (khw) don't know if you can get here with zero length, but the
15249 * old code handled this situation by creating a zero-length EXACT
15250 * node. Might as well be NOTHING instead */
15252 OP(REGNODE_p(ret)) = NOTHING;
15256 /* If the node type is EXACT here, check to see if it
15257 * should be EXACTL, or EXACT_REQ8. */
15258 if (node_type == EXACT) {
15260 node_type = EXACTL;
15262 else if (requires_utf8_target) {
15263 node_type = EXACT_REQ8;
15266 else if (node_type == LEXACT) {
15267 if (requires_utf8_target) {
15268 node_type = LEXACT_REQ8;
15272 if ( UNLIKELY(has_micro_sign || has_ss)
15273 && (node_type == EXACTFU || ( node_type == EXACTF
15274 && maybe_exactfu)))
15275 { /* These two conditions are problematic in non-UTF-8
15278 node_type = EXACTFUP;
15280 else if (node_type == EXACTFL) {
15282 /* 'maybe_exactfu' is deliberately set above to
15283 * indicate this node type, where all code points in it
15285 if (maybe_exactfu) {
15286 node_type = EXACTFLU8;
15289 _invlist_contains_cp(PL_HasMultiCharFold, ender)))
15291 /* A character that folds to more than one will
15292 * match multiple characters, so can't be SIMPLE.
15293 * We don't have to worry about this with EXACTFLU8
15294 * nodes just above, as they have already been
15295 * folded (since the fold doesn't vary at run
15296 * time). Here, if the final character in the node
15297 * folds to multiple, it can't be simple. (This
15298 * only has an effect if the node has only a single
15299 * character, hence the final one, as elsewhere we
15300 * turn off simple for nodes whose length > 1 */
15304 else if (node_type == EXACTF) { /* Means is /di */
15306 /* This intermediate variable is needed solely because
15307 * the asserts in the macro where used exceed Win32's
15308 * literal string capacity */
15309 char first_char = * STRING(REGNODE_p(ret));
15311 /* If 'maybe_exactfu' is clear, then we need to stay
15312 * /di. If it is set, it means there are no code
15313 * points that match differently depending on UTF8ness
15314 * of the target string, so it can become an EXACTFU
15316 if (! maybe_exactfu) {
15317 RExC_seen_d_op = TRUE;
15319 else if ( isALPHA_FOLD_EQ(first_char, 's')
15320 || isALPHA_FOLD_EQ(ender, 's'))
15322 /* But, if the node begins or ends in an 's' we
15323 * have to defer changing it into an EXACTFU, as
15324 * the node could later get joined with another one
15325 * that ends or begins with 's' creating an 'ss'
15326 * sequence which would then wrongly match the
15327 * sharp s without the target being UTF-8. We
15328 * create a special node that we resolve later when
15329 * we join nodes together */
15331 node_type = EXACTFU_S_EDGE;
15334 node_type = EXACTFU;
15338 if (requires_utf8_target && node_type == EXACTFU) {
15339 node_type = EXACTFU_REQ8;
15343 OP(REGNODE_p(ret)) = node_type;
15344 setSTR_LEN(REGNODE_p(ret), len);
15345 RExC_emit += STR_SZ(len);
15347 /* If the node isn't a single character, it can't be SIMPLE */
15348 if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
15352 *flagp |= HASWIDTH | maybe_SIMPLE;
15355 Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
15359 /* len is STRLEN which is unsigned, need to copy to signed */
15362 vFAIL("Internal disaster");
15365 } /* End of label 'defchar:' */
15367 } /* End of giant switch on input character */
15369 /* Position parse to next real character */
15370 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
15371 FALSE /* Don't force to /x */ );
15372 if ( *RExC_parse == '{'
15373 && OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
15375 if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
15377 vFAIL("Unescaped left brace in regex is illegal here");
15379 ckWARNreg(RExC_parse + 1, "Unescaped left brace in regex is"
15380 " passed through");
15388 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
15390 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
15391 * sets up the bitmap and any flags, removing those code points from the
15392 * inversion list, setting it to NULL should it become completely empty */
15395 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
15396 assert(PL_regkind[OP(node)] == ANYOF);
15398 /* There is no bitmap for this node type */
15399 if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
15403 ANYOF_BITMAP_ZERO(node);
15404 if (*invlist_ptr) {
15406 /* This gets set if we actually need to modify things */
15407 bool change_invlist = FALSE;
15411 /* Start looking through *invlist_ptr */
15412 invlist_iterinit(*invlist_ptr);
15413 while (invlist_iternext(*invlist_ptr, &start, &end)) {
15417 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
15418 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
15421 /* Quit if are above what we should change */
15422 if (start >= NUM_ANYOF_CODE_POINTS) {
15426 change_invlist = TRUE;
15428 /* Set all the bits in the range, up to the max that we are doing */
15429 high = (end < NUM_ANYOF_CODE_POINTS - 1)
15431 : NUM_ANYOF_CODE_POINTS - 1;
15432 for (i = start; i <= (int) high; i++) {
15433 ANYOF_BITMAP_SET(node, i);
15436 invlist_iterfinish(*invlist_ptr);
15438 /* Done with loop; remove any code points that are in the bitmap from
15439 * *invlist_ptr; similarly for code points above the bitmap if we have
15440 * a flag to match all of them anyways */
15441 if (change_invlist) {
15442 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
15444 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
15445 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
15448 /* If have completely emptied it, remove it completely */
15449 if (_invlist_len(*invlist_ptr) == 0) {
15450 SvREFCNT_dec_NN(*invlist_ptr);
15451 *invlist_ptr = NULL;
15456 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
15457 Character classes ([:foo:]) can also be negated ([:^foo:]).
15458 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
15459 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
15460 but trigger failures because they are currently unimplemented. */
15462 #define POSIXCC_DONE(c) ((c) == ':')
15463 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
15464 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
15465 #define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';')
15467 #define WARNING_PREFIX "Assuming NOT a POSIX class since "
15468 #define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one"
15469 #define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon"
15471 #define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1)
15473 /* 'posix_warnings' and 'warn_text' are names of variables in the following
15475 #define ADD_POSIX_WARNING(p, text) STMT_START { \
15476 if (posix_warnings) { \
15477 if (! RExC_warn_text ) RExC_warn_text = \
15478 (AV *) sv_2mortal((SV *) newAV()); \
15479 av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \
15483 REPORT_LOCATION_ARGS(p))); \
15486 #define CLEAR_POSIX_WARNINGS() \
15488 if (posix_warnings && RExC_warn_text) \
15489 av_clear(RExC_warn_text); \
15492 #define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
15494 CLEAR_POSIX_WARNINGS(); \
15499 S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
15501 const char * const s, /* Where the putative posix class begins.
15502 Normally, this is one past the '['. This
15503 parameter exists so it can be somewhere
15504 besides RExC_parse. */
15505 char ** updated_parse_ptr, /* Where to set the updated parse pointer, or
15507 AV ** posix_warnings, /* Where to place any generated warnings, or
15509 const bool check_only /* Don't die if error */
15512 /* This parses what the caller thinks may be one of the three POSIX
15514 * 1) a character class, like [:blank:]
15515 * 2) a collating symbol, like [. .]
15516 * 3) an equivalence class, like [= =]
15517 * In the latter two cases, it croaks if it finds a syntactically legal
15518 * one, as these are not handled by Perl.
15520 * The main purpose is to look for a POSIX character class. It returns:
15521 * a) the class number
15522 * if it is a completely syntactically and semantically legal class.
15523 * 'updated_parse_ptr', if not NULL, is set to point to just after the
15524 * closing ']' of the class
15525 * b) OOB_NAMEDCLASS
15526 * if it appears that one of the three POSIX constructs was meant, but
15527 * its specification was somehow defective. 'updated_parse_ptr', if
15528 * not NULL, is set to point to the character just after the end
15529 * character of the class. See below for handling of warnings.
15530 * c) NOT_MEANT_TO_BE_A_POSIX_CLASS
15531 * if it doesn't appear that a POSIX construct was intended.
15532 * 'updated_parse_ptr' is not changed. No warnings nor errors are
15535 * In b) there may be errors or warnings generated. If 'check_only' is
15536 * TRUE, then any errors are discarded. Warnings are returned to the
15537 * caller via an AV* created into '*posix_warnings' if it is not NULL. If
15538 * instead it is NULL, warnings are suppressed.
15540 * The reason for this function, and its complexity is that a bracketed
15541 * character class can contain just about anything. But it's easy to
15542 * mistype the very specific posix class syntax but yielding a valid
15543 * regular bracketed class, so it silently gets compiled into something
15544 * quite unintended.
15546 * The solution adopted here maintains backward compatibility except that
15547 * it adds a warning if it looks like a posix class was intended but
15548 * improperly specified. The warning is not raised unless what is input
15549 * very closely resembles one of the 14 legal posix classes. To do this,
15550 * it uses fuzzy parsing. It calculates how many single-character edits it
15551 * would take to transform what was input into a legal posix class. Only
15552 * if that number is quite small does it think that the intention was a
15553 * posix class. Obviously these are heuristics, and there will be cases
15554 * where it errs on one side or another, and they can be tweaked as
15555 * experience informs.
15557 * The syntax for a legal posix class is:
15559 * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
15561 * What this routine considers syntactically to be an intended posix class
15562 * is this (the comments indicate some restrictions that the pattern
15565 * qr/(?x: \[? # The left bracket, possibly
15567 * \h* # possibly followed by blanks
15568 * (?: \^ \h* )? # possibly a misplaced caret
15569 * [:;]? # The opening class character,
15570 * # possibly omitted. A typo
15571 * # semi-colon can also be used.
15573 * \^? # possibly a correctly placed
15574 * # caret, but not if there was also
15575 * # a misplaced one
15577 * .{3,15} # The class name. If there are
15578 * # deviations from the legal syntax,
15579 * # its edit distance must be close
15580 * # to a real class name in order
15581 * # for it to be considered to be
15582 * # an intended posix class.
15584 * [[:punct:]]? # The closing class character,
15585 * # possibly omitted. If not a colon
15586 * # nor semi colon, the class name
15587 * # must be even closer to a valid
15590 * \]? # The right bracket, possibly
15594 * In the above, \h must be ASCII-only.
15596 * These are heuristics, and can be tweaked as field experience dictates.
15597 * There will be cases when someone didn't intend to specify a posix class
15598 * that this warns as being so. The goal is to minimize these, while
15599 * maximizing the catching of things intended to be a posix class that
15600 * aren't parsed as such.
15604 const char * const e = RExC_end;
15605 unsigned complement = 0; /* If to complement the class */
15606 bool found_problem = FALSE; /* Assume OK until proven otherwise */
15607 bool has_opening_bracket = FALSE;
15608 bool has_opening_colon = FALSE;
15609 int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find
15611 const char * possible_end = NULL; /* used for a 2nd parse pass */
15612 const char* name_start; /* ptr to class name first char */
15614 /* If the number of single-character typos the input name is away from a
15615 * legal name is no more than this number, it is considered to have meant
15616 * the legal name */
15617 int max_distance = 2;
15619 /* to store the name. The size determines the maximum length before we
15620 * decide that no posix class was intended. Should be at least
15621 * sizeof("alphanumeric") */
15623 STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
15625 PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
15627 CLEAR_POSIX_WARNINGS();
15630 return NOT_MEANT_TO_BE_A_POSIX_CLASS;
15633 if (*(p - 1) != '[') {
15634 ADD_POSIX_WARNING(p, "it doesn't start with a '['");
15635 found_problem = TRUE;
15638 has_opening_bracket = TRUE;
15641 /* They could be confused and think you can put spaces between the
15644 found_problem = TRUE;
15648 } while (p < e && isBLANK(*p));
15650 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15653 /* For [. .] and [= =]. These are quite different internally from [: :],
15654 * so they are handled separately. */
15655 if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']'
15656 and 1 for at least one char in it
15659 const char open_char = *p;
15660 const char * temp_ptr = p + 1;
15662 /* These two constructs are not handled by perl, and if we find a
15663 * syntactically valid one, we croak. khw, who wrote this code, finds
15664 * this explanation of them very unclear:
15665 * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html
15666 * And searching the rest of the internet wasn't very helpful either.
15667 * It looks like just about any byte can be in these constructs,
15668 * depending on the locale. But unless the pattern is being compiled
15669 * under /l, which is very rare, Perl runs under the C or POSIX locale.
15670 * In that case, it looks like [= =] isn't allowed at all, and that
15671 * [. .] could be any single code point, but for longer strings the
15672 * constituent characters would have to be the ASCII alphabetics plus
15673 * the minus-hyphen. Any sensible locale definition would limit itself
15674 * to these. And any portable one definitely should. Trying to parse
15675 * the general case is a nightmare (see [perl #127604]). So, this code
15676 * looks only for interiors of these constructs that match:
15678 * Using \w relaxes the apparent rules a little, without adding much
15679 * danger of mistaking something else for one of these constructs.
15681 * [. .] in some implementations described on the internet is usable to
15682 * escape a character that otherwise is special in bracketed character
15683 * classes. For example [.].] means a literal right bracket instead of
15684 * the ending of the class
15686 * [= =] can legitimately contain a [. .] construct, but we don't
15687 * handle this case, as that [. .] construct will later get parsed
15688 * itself and croak then. And [= =] is checked for even when not under
15689 * /l, as Perl has long done so.
15691 * The code below relies on there being a trailing NUL, so it doesn't
15692 * have to keep checking if the parse ptr < e.
15694 if (temp_ptr[1] == open_char) {
15697 else while ( temp_ptr < e
15698 && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-'))
15703 if (*temp_ptr == open_char) {
15705 if (*temp_ptr == ']') {
15707 if (! found_problem && ! check_only) {
15708 RExC_parse = (char *) temp_ptr;
15709 vFAIL3("POSIX syntax [%c %c] is reserved for future "
15710 "extensions", open_char, open_char);
15713 /* Here, the syntax wasn't completely valid, or else the call
15714 * is to check-only */
15715 if (updated_parse_ptr) {
15716 *updated_parse_ptr = (char *) temp_ptr;
15719 CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
15723 /* If we find something that started out to look like one of these
15724 * constructs, but isn't, we continue below so that it can be checked
15725 * for being a class name with a typo of '.' or '=' instead of a colon.
15729 /* Here, we think there is a possibility that a [: :] class was meant, and
15730 * we have the first real character. It could be they think the '^' comes
15733 found_problem = TRUE;
15734 ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon");
15739 found_problem = TRUE;
15743 } while (p < e && isBLANK(*p));
15745 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15749 /* But the first character should be a colon, which they could have easily
15750 * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to
15751 * distinguish from a colon, so treat that as a colon). */
15754 has_opening_colon = TRUE;
15756 else if (*p == ';') {
15757 found_problem = TRUE;
15759 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15760 has_opening_colon = TRUE;
15763 found_problem = TRUE;
15764 ADD_POSIX_WARNING(p, "there must be a starting ':'");
15766 /* Consider an initial punctuation (not one of the recognized ones) to
15767 * be a left terminator */
15768 if (*p != '^' && *p != ']' && isPUNCT(*p)) {
15773 /* They may think that you can put spaces between the components */
15775 found_problem = TRUE;
15779 } while (p < e && isBLANK(*p));
15781 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15786 /* We consider something like [^:^alnum:]] to not have been intended to
15787 * be a posix class, but XXX maybe we should */
15789 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15796 /* Again, they may think that you can put spaces between the components */
15798 found_problem = TRUE;
15802 } while (p < e && isBLANK(*p));
15804 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
15809 /* XXX This ']' may be a typo, and something else was meant. But
15810 * treating it as such creates enough complications, that that
15811 * possibility isn't currently considered here. So we assume that the
15812 * ']' is what is intended, and if we've already found an initial '[',
15813 * this leaves this construct looking like [:] or [:^], which almost
15814 * certainly weren't intended to be posix classes */
15815 if (has_opening_bracket) {
15816 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15819 /* But this function can be called when we parse the colon for
15820 * something like qr/[alpha:]]/, so we back up to look for the
15825 found_problem = TRUE;
15826 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
15828 else if (*p != ':') {
15830 /* XXX We are currently very restrictive here, so this code doesn't
15831 * consider the possibility that, say, /[alpha.]]/ was intended to
15832 * be a posix class. */
15833 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
15836 /* Here we have something like 'foo:]'. There was no initial colon,
15837 * and we back up over 'foo. XXX Unlike the going forward case, we
15838 * don't handle typos of non-word chars in the middle */
15839 has_opening_colon = FALSE;
15842 while (p > RExC_start && isWORDCHAR(*p)) {
15847 /* Here, we have positioned ourselves to where we think the first
15848 * character in the potential class is */
15851 /* Now the interior really starts. There are certain key characters that
15852 * can end the interior, or these could just be typos. To catch both
15853 * cases, we may have to do two passes. In the first pass, we keep on
15854 * going unless we come to a sequence that matches
15855 * qr/ [[:punct:]] [[:blank:]]* \] /xa
15856 * This means it takes a sequence to end the pass, so two typos in a row if
15857 * that wasn't what was intended. If the class is perfectly formed, just
15858 * this one pass is needed. We also stop if there are too many characters
15859 * being accumulated, but this number is deliberately set higher than any
15860 * real class. It is set high enough so that someone who thinks that
15861 * 'alphanumeric' is a correct name would get warned that it wasn't.
15862 * While doing the pass, we keep track of where the key characters were in
15863 * it. If we don't find an end to the class, and one of the key characters
15864 * was found, we redo the pass, but stop when we get to that character.
15865 * Thus the key character was considered a typo in the first pass, but a
15866 * terminator in the second. If two key characters are found, we stop at
15867 * the second one in the first pass. Again this can miss two typos, but
15868 * catches a single one
15870 * In the first pass, 'possible_end' starts as NULL, and then gets set to
15871 * point to the first key character. For the second pass, it starts as -1.
15877 bool has_blank = FALSE;
15878 bool has_upper = FALSE;
15879 bool has_terminating_colon = FALSE;
15880 bool has_terminating_bracket = FALSE;
15881 bool has_semi_colon = FALSE;
15882 unsigned int name_len = 0;
15883 int punct_count = 0;
15887 /* Squeeze out blanks when looking up the class name below */
15888 if (isBLANK(*p) ) {
15890 found_problem = TRUE;
15895 /* The name will end with a punctuation */
15897 const char * peek = p + 1;
15899 /* Treat any non-']' punctuation followed by a ']' (possibly
15900 * with intervening blanks) as trying to terminate the class.
15901 * ']]' is very likely to mean a class was intended (but
15902 * missing the colon), but the warning message that gets
15903 * generated shows the error position better if we exit the
15904 * loop at the bottom (eventually), so skip it here. */
15906 if (peek < e && isBLANK(*peek)) {
15908 found_problem = TRUE;
15911 } while (peek < e && isBLANK(*peek));
15914 if (peek < e && *peek == ']') {
15915 has_terminating_bracket = TRUE;
15917 has_terminating_colon = TRUE;
15919 else if (*p == ';') {
15920 has_semi_colon = TRUE;
15921 has_terminating_colon = TRUE;
15924 found_problem = TRUE;
15931 /* Here we have punctuation we thought didn't end the class.
15932 * Keep track of the position of the key characters that are
15933 * more likely to have been class-enders */
15934 if (*p == ']' || *p == '[' || *p == ':' || *p == ';') {
15936 /* Allow just one such possible class-ender not actually
15937 * ending the class. */
15938 if (possible_end) {
15944 /* If we have too many punctuation characters, no use in
15946 if (++punct_count > max_distance) {
15950 /* Treat the punctuation as a typo. */
15951 input_text[name_len++] = *p;
15954 else if (isUPPER(*p)) { /* Use lowercase for lookup */
15955 input_text[name_len++] = toLOWER(*p);
15957 found_problem = TRUE;
15959 } else if (! UTF || UTF8_IS_INVARIANT(*p)) {
15960 input_text[name_len++] = *p;
15964 input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL);
15968 /* The declaration of 'input_text' is how long we allow a potential
15969 * class name to be, before saying they didn't mean a class name at
15971 if (name_len >= C_ARRAY_LENGTH(input_text)) {
15976 /* We get to here when the possible class name hasn't been properly
15977 * terminated before:
15978 * 1) we ran off the end of the pattern; or
15979 * 2) found two characters, each of which might have been intended to
15980 * be the name's terminator
15981 * 3) found so many punctuation characters in the purported name,
15982 * that the edit distance to a valid one is exceeded
15983 * 4) we decided it was more characters than anyone could have
15984 * intended to be one. */
15986 found_problem = TRUE;
15988 /* In the final two cases, we know that looking up what we've
15989 * accumulated won't lead to a match, even a fuzzy one. */
15990 if ( name_len >= C_ARRAY_LENGTH(input_text)
15991 || punct_count > max_distance)
15993 /* If there was an intermediate key character that could have been
15994 * an intended end, redo the parse, but stop there */
15995 if (possible_end && possible_end != (char *) -1) {
15996 possible_end = (char *) -1; /* Special signal value to say
15997 we've done a first pass */
16002 /* Otherwise, it can't have meant to have been a class */
16003 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16006 /* If we ran off the end, and the final character was a punctuation
16007 * one, back up one, to look at that final one just below. Later, we
16008 * will restore the parse pointer if appropriate */
16009 if (name_len && p == e && isPUNCT(*(p-1))) {
16014 if (p < e && isPUNCT(*p)) {
16016 has_terminating_bracket = TRUE;
16018 /* If this is a 2nd ']', and the first one is just below this
16019 * one, consider that to be the real terminator. This gives a
16020 * uniform and better positioning for the warning message */
16022 && possible_end != (char *) -1
16023 && *possible_end == ']'
16024 && name_len && input_text[name_len - 1] == ']')
16029 /* And this is actually equivalent to having done the 2nd
16030 * pass now, so set it to not try again */
16031 possible_end = (char *) -1;
16036 has_terminating_colon = TRUE;
16038 else if (*p == ';') {
16039 has_semi_colon = TRUE;
16040 has_terminating_colon = TRUE;
16048 /* Here, we have a class name to look up. We can short circuit the
16049 * stuff below for short names that can't possibly be meant to be a
16050 * class name. (We can do this on the first pass, as any second pass
16051 * will yield an even shorter name) */
16052 if (name_len < 3) {
16053 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16056 /* Find which class it is. Initially switch on the length of the name.
16058 switch (name_len) {
16060 if (memEQs(name_start, 4, "word")) {
16061 /* this is not POSIX, this is the Perl \w */
16062 class_number = ANYOF_WORDCHAR;
16066 /* Names all of length 5: alnum alpha ascii blank cntrl digit
16067 * graph lower print punct space upper
16068 * Offset 4 gives the best switch position. */
16069 switch (name_start[4]) {
16071 if (memBEGINs(name_start, 5, "alph")) /* alpha */
16072 class_number = ANYOF_ALPHA;
16075 if (memBEGINs(name_start, 5, "spac")) /* space */
16076 class_number = ANYOF_SPACE;
16079 if (memBEGINs(name_start, 5, "grap")) /* graph */
16080 class_number = ANYOF_GRAPH;
16083 if (memBEGINs(name_start, 5, "asci")) /* ascii */
16084 class_number = ANYOF_ASCII;
16087 if (memBEGINs(name_start, 5, "blan")) /* blank */
16088 class_number = ANYOF_BLANK;
16091 if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
16092 class_number = ANYOF_CNTRL;
16095 if (memBEGINs(name_start, 5, "alnu")) /* alnum */
16096 class_number = ANYOF_ALPHANUMERIC;
16099 if (memBEGINs(name_start, 5, "lowe")) /* lower */
16100 class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
16101 else if (memBEGINs(name_start, 5, "uppe")) /* upper */
16102 class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
16105 if (memBEGINs(name_start, 5, "digi")) /* digit */
16106 class_number = ANYOF_DIGIT;
16107 else if (memBEGINs(name_start, 5, "prin")) /* print */
16108 class_number = ANYOF_PRINT;
16109 else if (memBEGINs(name_start, 5, "punc")) /* punct */
16110 class_number = ANYOF_PUNCT;
16115 if (memEQs(name_start, 6, "xdigit"))
16116 class_number = ANYOF_XDIGIT;
16120 /* If the name exactly matches a posix class name the class number will
16121 * here be set to it, and the input almost certainly was meant to be a
16122 * posix class, so we can skip further checking. If instead the syntax
16123 * is exactly correct, but the name isn't one of the legal ones, we
16124 * will return that as an error below. But if neither of these apply,
16125 * it could be that no posix class was intended at all, or that one
16126 * was, but there was a typo. We tease these apart by doing fuzzy
16127 * matching on the name */
16128 if (class_number == OOB_NAMEDCLASS && found_problem) {
16129 const UV posix_names[][6] = {
16130 { 'a', 'l', 'n', 'u', 'm' },
16131 { 'a', 'l', 'p', 'h', 'a' },
16132 { 'a', 's', 'c', 'i', 'i' },
16133 { 'b', 'l', 'a', 'n', 'k' },
16134 { 'c', 'n', 't', 'r', 'l' },
16135 { 'd', 'i', 'g', 'i', 't' },
16136 { 'g', 'r', 'a', 'p', 'h' },
16137 { 'l', 'o', 'w', 'e', 'r' },
16138 { 'p', 'r', 'i', 'n', 't' },
16139 { 'p', 'u', 'n', 'c', 't' },
16140 { 's', 'p', 'a', 'c', 'e' },
16141 { 'u', 'p', 'p', 'e', 'r' },
16142 { 'w', 'o', 'r', 'd' },
16143 { 'x', 'd', 'i', 'g', 'i', 't' }
16145 /* The names of the above all have added NULs to make them the same
16146 * size, so we need to also have the real lengths */
16147 const UV posix_name_lengths[] = {
16148 sizeof("alnum") - 1,
16149 sizeof("alpha") - 1,
16150 sizeof("ascii") - 1,
16151 sizeof("blank") - 1,
16152 sizeof("cntrl") - 1,
16153 sizeof("digit") - 1,
16154 sizeof("graph") - 1,
16155 sizeof("lower") - 1,
16156 sizeof("print") - 1,
16157 sizeof("punct") - 1,
16158 sizeof("space") - 1,
16159 sizeof("upper") - 1,
16160 sizeof("word") - 1,
16161 sizeof("xdigit")- 1
16164 int temp_max = max_distance; /* Use a temporary, so if we
16165 reparse, we haven't changed the
16168 /* Use a smaller max edit distance if we are missing one of the
16170 if ( has_opening_bracket + has_opening_colon < 2
16171 || has_terminating_bracket + has_terminating_colon < 2)
16176 /* See if the input name is close to a legal one */
16177 for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) {
16179 /* Short circuit call if the lengths are too far apart to be
16181 if (abs( (int) (name_len - posix_name_lengths[i]))
16187 if (edit_distance(input_text,
16190 posix_name_lengths[i],
16194 { /* If it is close, it probably was intended to be a class */
16195 goto probably_meant_to_be;
16199 /* Here the input name is not close enough to a valid class name
16200 * for us to consider it to be intended to be a posix class. If
16201 * we haven't already done so, and the parse found a character that
16202 * could have been terminators for the name, but which we absorbed
16203 * as typos during the first pass, repeat the parse, signalling it
16204 * to stop at that character */
16205 if (possible_end && possible_end != (char *) -1) {
16206 possible_end = (char *) -1;
16211 /* Here neither pass found a close-enough class name */
16212 CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
16215 probably_meant_to_be:
16217 /* Here we think that a posix specification was intended. Update any
16219 if (updated_parse_ptr) {
16220 *updated_parse_ptr = (char *) p;
16223 /* If a posix class name was intended but incorrectly specified, we
16224 * output or return the warnings */
16225 if (found_problem) {
16227 /* We set flags for these issues in the parse loop above instead of
16228 * adding them to the list of warnings, because we can parse it
16229 * twice, and we only want one warning instance */
16231 ADD_POSIX_WARNING(p, "the name must be all lowercase letters");
16234 ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING);
16236 if (has_semi_colon) {
16237 ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING);
16239 else if (! has_terminating_colon) {
16240 ADD_POSIX_WARNING(p, "there is no terminating ':'");
16242 if (! has_terminating_bracket) {
16243 ADD_POSIX_WARNING(p, "there is no terminating ']'");
16246 if ( posix_warnings
16248 && av_count(RExC_warn_text) > 0)
16250 *posix_warnings = RExC_warn_text;
16253 else if (class_number != OOB_NAMEDCLASS) {
16254 /* If it is a known class, return the class. The class number
16255 * #defines are structured so each complement is +1 to the normal
16257 CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
16259 else if (! check_only) {
16261 /* Here, it is an unrecognized class. This is an error (unless the
16262 * call is to check only, which we've already handled above) */
16263 const char * const complement_string = (complement)
16266 RExC_parse = (char *) p;
16267 vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown",
16269 UTF8fARG(UTF, RExC_parse - name_start - 2, name_start));
16273 return OOB_NAMEDCLASS;
16275 #undef ADD_POSIX_WARNING
16277 STATIC unsigned int
16278 S_regex_set_precedence(const U8 my_operator) {
16280 /* Returns the precedence in the (?[...]) construct of the input operator,
16281 * specified by its character representation. The precedence follows
16282 * general Perl rules, but it extends this so that ')' and ']' have (low)
16283 * precedence even though they aren't really operators */
16285 switch (my_operator) {
16301 NOT_REACHED; /* NOTREACHED */
16302 return 0; /* Silence compiler warning */
16305 STATIC regnode_offset
16306 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16307 I32 *flagp, U32 depth,
16308 char * const oregcomp_parse)
16310 /* Handle the (?[...]) construct to do set operations */
16312 U8 curchar; /* Current character being parsed */
16313 UV start, end; /* End points of code point ranges */
16314 SV* final = NULL; /* The end result inversion list */
16315 SV* result_string; /* 'final' stringified */
16316 AV* stack; /* stack of operators and operands not yet
16318 AV* fence_stack = NULL; /* A stack containing the positions in
16319 'stack' of where the undealt-with left
16320 parens would be if they were actually
16322 /* The 'volatile' is a workaround for an optimiser bug
16323 * in Solaris Studio 12.3. See RT #127455 */
16324 volatile IV fence = 0; /* Position of where most recent undealt-
16325 with left paren in stack is; -1 if none.
16327 STRLEN len; /* Temporary */
16328 regnode_offset node; /* Temporary, and final regnode returned by
16330 const bool save_fold = FOLD; /* Temporary */
16331 char *save_end, *save_parse; /* Temporaries */
16332 const bool in_locale = LOC; /* we turn off /l during processing */
16334 DECLARE_AND_GET_RE_DEBUG_FLAGS;
16336 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
16337 PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */
16339 DEBUG_PARSE("xcls");
16342 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
16345 /* The use of this operator implies /u. This is required so that the
16346 * compile time values are valid in all runtime cases */
16347 REQUIRE_UNI_RULES(flagp, 0);
16349 ckWARNexperimental(RExC_parse,
16350 WARN_EXPERIMENTAL__REGEX_SETS,
16351 "The regex_sets feature is experimental");
16353 /* Everything in this construct is a metacharacter. Operands begin with
16354 * either a '\' (for an escape sequence), or a '[' for a bracketed
16355 * character class. Any other character should be an operator, or
16356 * parenthesis for grouping. Both types of operands are handled by calling
16357 * regclass() to parse them. It is called with a parameter to indicate to
16358 * return the computed inversion list. The parsing here is implemented via
16359 * a stack. Each entry on the stack is a single character representing one
16360 * of the operators; or else a pointer to an operand inversion list. */
16362 #define IS_OPERATOR(a) SvIOK(a)
16363 #define IS_OPERAND(a) (! IS_OPERATOR(a))
16365 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
16366 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
16367 * with pronouncing it called it Reverse Polish instead, but now that YOU
16368 * know how to pronounce it you can use the correct term, thus giving due
16369 * credit to the person who invented it, and impressing your geek friends.
16370 * Wikipedia says that the pronounciation of "Ł" has been changing so that
16371 * it is now more like an English initial W (as in wonk) than an L.)
16373 * This means that, for example, 'a | b & c' is stored on the stack as
16381 * where the numbers in brackets give the stack [array] element number.
16382 * In this implementation, parentheses are not stored on the stack.
16383 * Instead a '(' creates a "fence" so that the part of the stack below the
16384 * fence is invisible except to the corresponding ')' (this allows us to
16385 * replace testing for parens, by using instead subtraction of the fence
16386 * position). As new operands are processed they are pushed onto the stack
16387 * (except as noted in the next paragraph). New operators of higher
16388 * precedence than the current final one are inserted on the stack before
16389 * the lhs operand (so that when the rhs is pushed next, everything will be
16390 * in the correct positions shown above. When an operator of equal or
16391 * lower precedence is encountered in parsing, all the stacked operations
16392 * of equal or higher precedence are evaluated, leaving the result as the
16393 * top entry on the stack. This makes higher precedence operations
16394 * evaluate before lower precedence ones, and causes operations of equal
16395 * precedence to left associate.
16397 * The only unary operator '!' is immediately pushed onto the stack when
16398 * encountered. When an operand is encountered, if the top of the stack is
16399 * a '!", the complement is immediately performed, and the '!' popped. The
16400 * resulting value is treated as a new operand, and the logic in the
16401 * previous paragraph is executed. Thus in the expression
16403 * the stack looks like
16409 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
16416 * A ')' is treated as an operator with lower precedence than all the
16417 * aforementioned ones, which causes all operations on the stack above the
16418 * corresponding '(' to be evaluated down to a single resultant operand.
16419 * Then the fence for the '(' is removed, and the operand goes through the
16420 * algorithm above, without the fence.
16422 * A separate stack is kept of the fence positions, so that the position of
16423 * the latest so-far unbalanced '(' is at the top of it.
16425 * The ']' ending the construct is treated as the lowest operator of all,
16426 * so that everything gets evaluated down to a single operand, which is the
16429 sv_2mortal((SV *)(stack = newAV()));
16430 sv_2mortal((SV *)(fence_stack = newAV()));
16432 while (RExC_parse < RExC_end) {
16433 I32 top_index; /* Index of top-most element in 'stack' */
16434 SV** top_ptr; /* Pointer to top 'stack' element */
16435 SV* current = NULL; /* To contain the current inversion list
16437 SV* only_to_avoid_leaks;
16439 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
16440 TRUE /* Force /x */ );
16441 if (RExC_parse >= RExC_end) { /* Fail */
16445 curchar = UCHARAT(RExC_parse);
16449 #ifdef ENABLE_REGEX_SETS_DEBUGGING
16450 /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
16451 DEBUG_U(dump_regex_sets_structures(pRExC_state,
16452 stack, fence, fence_stack));
16455 top_index = av_tindex_skip_len_mg(stack);
16458 SV** stacked_ptr; /* Ptr to something already on 'stack' */
16459 char stacked_operator; /* The topmost operator on the 'stack'. */
16460 SV* lhs; /* Operand to the left of the operator */
16461 SV* rhs; /* Operand to the right of the operator */
16462 SV* fence_ptr; /* Pointer to top element of the fence
16466 if ( RExC_parse < RExC_end - 2
16467 && UCHARAT(RExC_parse + 1) == '?'
16468 && UCHARAT(RExC_parse + 2) == '^')
16470 const regnode_offset orig_emit = RExC_emit;
16471 SV * resultant_invlist;
16473 /* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16474 * This happens when we have some thing like
16476 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
16478 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
16480 * Here we would be handling the interpolated
16481 * '$thai_or_lao'. We handle this by a recursive call to
16482 * reg which returns the inversion list the
16483 * interpolated expression evaluates to. Actually, the
16484 * return is a special regnode containing a pointer to that
16485 * inversion list. If the return isn't that regnode alone,
16486 * we know that this wasn't such an interpolation, which is
16487 * an error: we need to get a single inversion list back
16488 * from the recursion */
16493 node = reg(pRExC_state, 2, flagp, depth+1);
16494 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16496 if ( OP(REGNODE_p(node)) != REGEX_SET
16497 /* If more than a single node returned, the nested
16498 * parens evaluated to more than just a (?[...]),
16499 * which isn't legal */
16500 || RExC_emit != orig_emit
16501 + NODE_STEP_REGNODE
16502 + regarglen[REGEX_SET])
16504 vFAIL("Expecting interpolated extended charclass");
16506 resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16507 current = invlist_clone(resultant_invlist, NULL);
16508 SvREFCNT_dec(resultant_invlist);
16511 RExC_emit = orig_emit;
16512 goto handle_operand;
16515 /* A regular '('. Look behind for illegal syntax */
16516 if (top_index - fence >= 0) {
16517 /* If the top entry on the stack is an operator, it had
16518 * better be a '!', otherwise the entry below the top
16519 * operand should be an operator */
16520 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
16521 || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
16522 || ( IS_OPERAND(*top_ptr)
16523 && ( top_index - fence < 1
16524 || ! (stacked_ptr = av_fetch(stack,
16527 || ! IS_OPERATOR(*stacked_ptr))))
16530 vFAIL("Unexpected '(' with no preceding operator");
16534 /* Stack the position of this undealt-with left paren */
16535 av_push(fence_stack, newSViv(fence));
16536 fence = top_index + 1;
16540 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16541 * multi-char folds are allowed. */
16542 if (!regclass(pRExC_state, flagp, depth+1,
16543 TRUE, /* means parse just the next thing */
16544 FALSE, /* don't allow multi-char folds */
16545 FALSE, /* don't silence non-portable warnings. */
16547 FALSE, /* Require return to be an ANYOF */
16550 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16551 goto regclass_failed;
16556 /* regclass() will return with parsing just the \ sequence,
16557 * leaving the parse pointer at the next thing to parse */
16559 goto handle_operand;
16561 case '[': /* Is a bracketed character class */
16563 /* See if this is a [:posix:] class. */
16564 bool is_posix_class = (OOB_NAMEDCLASS
16565 < handle_possible_posix(pRExC_state,
16569 TRUE /* checking only */));
16570 /* If it is a posix class, leave the parse pointer at the '['
16571 * to fool regclass() into thinking it is part of a
16572 * '[[:posix:]]'. */
16573 if (! is_posix_class) {
16577 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if
16578 * multi-char folds are allowed. */
16579 if (!regclass(pRExC_state, flagp, depth+1,
16580 is_posix_class, /* parse the whole char
16581 class only if not a
16583 FALSE, /* don't allow multi-char folds */
16584 TRUE, /* silence non-portable warnings. */
16586 FALSE, /* Require return to be an ANYOF */
16589 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16590 goto regclass_failed;
16595 /* function call leaves parse pointing to the ']', except if we
16597 if (is_posix_class) {
16601 goto handle_operand;
16605 if (top_index >= 1) {
16606 goto join_operators;
16609 /* Only a single operand on the stack: are done */
16613 if (av_tindex_skip_len_mg(fence_stack) < 0) {
16614 if (UCHARAT(RExC_parse - 1) == ']') {
16618 vFAIL("Unexpected ')'");
16621 /* If nothing after the fence, is missing an operand */
16622 if (top_index - fence < 0) {
16626 /* If at least two things on the stack, treat this as an
16628 if (top_index - fence >= 1) {
16629 goto join_operators;
16632 /* Here only a single thing on the fenced stack, and there is a
16633 * fence. Get rid of it */
16634 fence_ptr = av_pop(fence_stack);
16636 fence = SvIV(fence_ptr);
16637 SvREFCNT_dec_NN(fence_ptr);
16644 /* Having gotten rid of the fence, we pop the operand at the
16645 * stack top and process it as a newly encountered operand */
16646 current = av_pop(stack);
16647 if (IS_OPERAND(current)) {
16648 goto handle_operand;
16660 /* These binary operators should have a left operand already
16662 if ( top_index - fence < 0
16663 || top_index - fence == 1
16664 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
16665 || ! IS_OPERAND(*top_ptr))
16667 goto unexpected_binary;
16670 /* If only the one operand is on the part of the stack visible
16671 * to us, we just place this operator in the proper position */
16672 if (top_index - fence < 2) {
16674 /* Place the operator before the operand */
16676 SV* lhs = av_pop(stack);
16677 av_push(stack, newSVuv(curchar));
16678 av_push(stack, lhs);
16682 /* But if there is something else on the stack, we need to
16683 * process it before this new operator if and only if the
16684 * stacked operation has equal or higher precedence than the
16689 /* The operator on the stack is supposed to be below both its
16691 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
16692 || IS_OPERAND(*stacked_ptr))
16694 /* But if not, it's legal and indicates we are completely
16695 * done if and only if we're currently processing a ']',
16696 * which should be the final thing in the expression */
16697 if (curchar == ']') {
16703 vFAIL2("Unexpected binary operator '%c' with no "
16704 "preceding operand", curchar);
16706 stacked_operator = (char) SvUV(*stacked_ptr);
16708 if (regex_set_precedence(curchar)
16709 > regex_set_precedence(stacked_operator))
16711 /* Here, the new operator has higher precedence than the
16712 * stacked one. This means we need to add the new one to
16713 * the stack to await its rhs operand (and maybe more
16714 * stuff). We put it before the lhs operand, leaving
16715 * untouched the stacked operator and everything below it
16717 lhs = av_pop(stack);
16718 assert(IS_OPERAND(lhs));
16720 av_push(stack, newSVuv(curchar));
16721 av_push(stack, lhs);
16725 /* Here, the new operator has equal or lower precedence than
16726 * what's already there. This means the operation already
16727 * there should be performed now, before the new one. */
16729 rhs = av_pop(stack);
16730 if (! IS_OPERAND(rhs)) {
16732 /* This can happen when a ! is not followed by an operand,
16733 * like in /(?[\t &!])/ */
16737 lhs = av_pop(stack);
16739 if (! IS_OPERAND(lhs)) {
16741 /* This can happen when there is an empty (), like in
16742 * /(?[[0]+()+])/ */
16746 switch (stacked_operator) {
16748 _invlist_intersection(lhs, rhs, &rhs);
16753 _invlist_union(lhs, rhs, &rhs);
16757 _invlist_subtract(lhs, rhs, &rhs);
16760 case '^': /* The union minus the intersection */
16765 _invlist_union(lhs, rhs, &u);
16766 _invlist_intersection(lhs, rhs, &i);
16767 _invlist_subtract(u, i, &rhs);
16768 SvREFCNT_dec_NN(i);
16769 SvREFCNT_dec_NN(u);
16775 /* Here, the higher precedence operation has been done, and the
16776 * result is in 'rhs'. We overwrite the stacked operator with
16777 * the result. Then we redo this code to either push the new
16778 * operator onto the stack or perform any higher precedence
16779 * stacked operation */
16780 only_to_avoid_leaks = av_pop(stack);
16781 SvREFCNT_dec(only_to_avoid_leaks);
16782 av_push(stack, rhs);
16785 case '!': /* Highest priority, right associative */
16787 /* If what's already at the top of the stack is another '!",
16788 * they just cancel each other out */
16789 if ( (top_ptr = av_fetch(stack, top_index, FALSE))
16790 && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
16792 only_to_avoid_leaks = av_pop(stack);
16793 SvREFCNT_dec(only_to_avoid_leaks);
16795 else { /* Otherwise, since it's right associative, just push
16797 av_push(stack, newSVuv(curchar));
16802 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16803 if (RExC_parse >= RExC_end) {
16806 vFAIL("Unexpected character");
16810 /* Here 'current' is the operand. If something is already on the
16811 * stack, we have to check if it is a !. But first, the code above
16812 * may have altered the stack in the time since we earlier set
16815 top_index = av_tindex_skip_len_mg(stack);
16816 if (top_index - fence >= 0) {
16817 /* If the top entry on the stack is an operator, it had better
16818 * be a '!', otherwise the entry below the top operand should
16819 * be an operator */
16820 top_ptr = av_fetch(stack, top_index, FALSE);
16822 if (IS_OPERATOR(*top_ptr)) {
16824 /* The only permissible operator at the top of the stack is
16825 * '!', which is applied immediately to this operand. */
16826 curchar = (char) SvUV(*top_ptr);
16827 if (curchar != '!') {
16828 SvREFCNT_dec(current);
16829 vFAIL2("Unexpected binary operator '%c' with no "
16830 "preceding operand", curchar);
16833 _invlist_invert(current);
16835 only_to_avoid_leaks = av_pop(stack);
16836 SvREFCNT_dec(only_to_avoid_leaks);
16838 /* And we redo with the inverted operand. This allows
16839 * handling multiple ! in a row */
16840 goto handle_operand;
16842 /* Single operand is ok only for the non-binary ')'
16844 else if ((top_index - fence == 0 && curchar != ')')
16845 || (top_index - fence > 0
16846 && (! (stacked_ptr = av_fetch(stack,
16849 || IS_OPERAND(*stacked_ptr))))
16851 SvREFCNT_dec(current);
16852 vFAIL("Operand with no preceding operator");
16856 /* Here there was nothing on the stack or the top element was
16857 * another operand. Just add this new one */
16858 av_push(stack, current);
16860 } /* End of switch on next parse token */
16862 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
16863 } /* End of loop parsing through the construct */
16865 vFAIL("Syntax error in (?[...])");
16869 if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
16870 if (RExC_parse < RExC_end) {
16874 vFAIL("Unexpected ']' with no following ')' in (?[...");
16877 if (av_tindex_skip_len_mg(fence_stack) >= 0) {
16878 vFAIL("Unmatched (");
16881 if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */
16882 || ((final = av_pop(stack)) == NULL)
16883 || ! IS_OPERAND(final)
16884 || ! is_invlist(final)
16885 || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */
16888 SvREFCNT_dec(final);
16889 vFAIL("Incomplete expression within '(?[ ])'");
16892 /* Here, 'final' is the resultant inversion list from evaluating the
16893 * expression. Return it if so requested */
16894 if (return_invlist) {
16895 *return_invlist = final;
16899 if (RExC_sets_depth) { /* If within a recursive call, return in a special
16902 node = regpnode(pRExC_state, REGEX_SET, final);
16906 /* Otherwise generate a resultant node, based on 'final'. regclass()
16907 * is expecting a string of ranges and individual code points */
16908 invlist_iterinit(final);
16909 result_string = newSVpvs("");
16910 while (invlist_iternext(final, &start, &end)) {
16911 if (start == end) {
16912 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start);
16915 Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%"
16916 UVXf "}", start, end);
16920 /* About to generate an ANYOF (or similar) node from the inversion list
16921 * we have calculated */
16922 save_parse = RExC_parse;
16923 RExC_parse = SvPV(result_string, len);
16924 save_end = RExC_end;
16925 RExC_end = RExC_parse + len;
16926 TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
16928 /* We turn off folding around the call, as the class we have
16929 * constructed already has all folding taken into consideration, and we
16930 * don't want regclass() to add to that */
16931 RExC_flags &= ~RXf_PMf_FOLD;
16932 /* regclass() can only return RESTART_PARSE and NEED_UTF8 if multi-char
16933 * folds are allowed. */
16934 node = regclass(pRExC_state, flagp, depth+1,
16935 FALSE, /* means parse the whole char class */
16936 FALSE, /* don't allow multi-char folds */
16937 TRUE, /* silence non-portable warnings. The above may
16938 very well have generated non-portable code
16939 points, but they're valid on this machine */
16940 FALSE, /* similarly, no need for strict */
16942 /* We can optimize into something besides an ANYOF,
16943 * except under /l, which needs to be ANYOF because of
16944 * runtime checks for locale sanity, etc */
16950 RExC_parse = save_parse + 1;
16951 RExC_end = save_end;
16952 SvREFCNT_dec_NN(final);
16953 SvREFCNT_dec_NN(result_string);
16956 RExC_flags |= RXf_PMf_FOLD;
16960 RETURN_FAIL_ON_RESTART(*flagp, flagp);
16961 goto regclass_failed;
16964 /* Fix up the node type if we are in locale. (We have pretended we are
16965 * under /u for the purposes of regclass(), as this construct will only
16966 * work under UTF-8 locales. But now we change the opcode to be ANYOFL
16967 * (so as to cause any warnings about bad locales to be output in
16968 * regexec.c), and add the flag that indicates to check if not in a
16969 * UTF-8 locale. The reason we above forbid optimization into
16970 * something other than an ANYOF node is simply to minimize the number
16971 * of code changes in regexec.c. Otherwise we would have to create new
16972 * EXACTish node types and deal with them. This decision could be
16973 * revisited should this construct become popular.
16975 * (One might think we could look at the resulting ANYOF node and
16976 * suppress the flag if everything is above 255, as those would be
16977 * UTF-8 only, but this isn't true, as the components that led to that
16978 * result could have been locale-affected, and just happen to cancel
16979 * each other out under UTF-8 locales.) */
16981 set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
16983 assert(OP(REGNODE_p(node)) == ANYOF);
16985 OP(REGNODE_p(node)) = ANYOFL;
16986 ANYOF_FLAGS(REGNODE_p(node))
16987 |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16991 nextchar(pRExC_state);
16992 Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
16996 FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
17000 #ifdef ENABLE_REGEX_SETS_DEBUGGING
17003 S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
17004 AV * stack, const IV fence, AV * fence_stack)
17005 { /* Dumps the stacks in handle_regex_sets() */
17007 const SSize_t stack_top = av_tindex_skip_len_mg(stack);
17008 const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack);
17011 PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
17013 PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
17015 if (stack_top < 0) {
17016 PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
17019 PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
17020 for (i = stack_top; i >= 0; i--) {
17021 SV ** element_ptr = av_fetch(stack, i, FALSE);
17022 if (! element_ptr) {
17025 if (IS_OPERATOR(*element_ptr)) {
17026 PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
17027 (int) i, (int) SvIV(*element_ptr));
17030 PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
17031 sv_dump(*element_ptr);
17036 if (fence_stack_top < 0) {
17037 PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
17040 PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
17041 for (i = fence_stack_top; i >= 0; i--) {
17042 SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
17043 if (! element_ptr) {
17046 PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
17047 (int) i, (int) SvIV(*element_ptr));
17058 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
17060 /* This adds the Latin1/above-Latin1 folding rules.
17062 * This should be called only for a Latin1-range code points, cp, which is
17063 * known to be involved in a simple fold with other code points above
17064 * Latin1. It would give false results if /aa has been specified.
17065 * Multi-char folds are outside the scope of this, and must be handled
17068 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
17070 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
17072 /* The rules that are valid for all Unicode versions are hard-coded in */
17077 add_cp_to_invlist(*invlist, KELVIN_SIGN);
17081 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
17084 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
17085 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
17087 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
17088 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
17089 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
17091 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
17092 *invlist = add_cp_to_invlist(*invlist,
17093 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
17096 default: /* Other code points are checked against the data for the
17097 current Unicode version */
17099 Size_t folds_count;
17101 const U32 * remaining_folds;
17105 folded_cp = toFOLD(cp);
17108 U8 dummy_fold[UTF8_MAXBYTES_CASE+1];
17110 folded_cp = _to_fold_latin1(cp, dummy_fold, &dummy_len, 0);
17113 if (folded_cp > 255) {
17114 *invlist = add_cp_to_invlist(*invlist, folded_cp);
17117 folds_count = _inverse_folds(folded_cp, &first_fold,
17119 if (folds_count == 0) {
17121 /* Use deprecated warning to increase the chances of this being
17123 ckWARN2reg_d(RExC_parse,
17124 "Perl folding rules are not up-to-date for 0x%02X;"
17125 " please use the perlbug utility to report;", cp);
17130 if (first_fold > 255) {
17131 *invlist = add_cp_to_invlist(*invlist, first_fold);
17133 for (i = 0; i < folds_count - 1; i++) {
17134 if (remaining_folds[i] > 255) {
17135 *invlist = add_cp_to_invlist(*invlist,
17136 remaining_folds[i]);
17146 S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
17148 /* Output the elements of the array given by '*posix_warnings' as REGEXP
17152 const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
17154 PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
17156 if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
17157 CLEAR_POSIX_WARNINGS();
17161 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
17162 if (first_is_fatal) { /* Avoid leaking this */
17163 av_undef(posix_warnings); /* This isn't necessary if the
17164 array is mortal, but is a
17166 (void) sv_2mortal(msg);
17169 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
17170 SvREFCNT_dec_NN(msg);
17173 UPDATE_WARNINGS_LOC(RExC_parse);
17176 PERL_STATIC_INLINE Size_t
17177 S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
17179 const U8 * const start = s1;
17180 const U8 * const send = start + max;
17182 PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
17184 while (s1 < send && *s1 == *s2) {
17193 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
17195 /* This adds the string scalar <multi_string> to the array
17196 * <multi_char_matches>. <multi_string> is known to have exactly
17197 * <cp_count> code points in it. This is used when constructing a
17198 * bracketed character class and we find something that needs to match more
17199 * than a single character.
17201 * <multi_char_matches> is actually an array of arrays. Each top-level
17202 * element is an array that contains all the strings known so far that are
17203 * the same length. And that length (in number of code points) is the same
17204 * as the index of the top-level array. Hence, the [2] element is an
17205 * array, each element thereof is a string containing TWO code points;
17206 * while element [3] is for strings of THREE characters, and so on. Since
17207 * this is for multi-char strings there can never be a [0] nor [1] element.
17209 * When we rewrite the character class below, we will do so such that the
17210 * longest strings are written first, so that it prefers the longest
17211 * matching strings first. This is done even if it turns out that any
17212 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
17213 * Christiansen has agreed that this is ok. This makes the test for the
17214 * ligature 'ffi' come before the test for 'ff', for example */
17217 AV** this_array_ptr;
17219 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
17221 if (! multi_char_matches) {
17222 multi_char_matches = newAV();
17225 if (av_exists(multi_char_matches, cp_count)) {
17226 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
17227 this_array = *this_array_ptr;
17230 this_array = newAV();
17231 av_store(multi_char_matches, cp_count,
17234 av_push(this_array, multi_string);
17236 return multi_char_matches;
17239 /* The names of properties whose definitions are not known at compile time are
17240 * stored in this SV, after a constant heading. So if the length has been
17241 * changed since initialization, then there is a run-time definition. */
17242 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
17243 (SvCUR(listsv) != initial_listsv_len)
17245 /* There is a restricted set of white space characters that are legal when
17246 * ignoring white space in a bracketed character class. This generates the
17247 * code to skip them.
17249 * There is a line below that uses the same white space criteria but is outside
17250 * this macro. Both here and there must use the same definition */
17251 #define SKIP_BRACKETED_WHITE_SPACE(do_skip, p, stop_p) \
17254 while (p < stop_p && isBLANK_A(UCHARAT(p))) \
17261 STATIC regnode_offset
17262 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
17263 const bool stop_at_1, /* Just parse the next thing, don't
17264 look for a full character class */
17265 bool allow_mutiple_chars,
17266 const bool silence_non_portable, /* Don't output warnings
17270 bool optimizable, /* ? Allow a non-ANYOF return
17272 SV** ret_invlist /* Return an inversion list, not a node */
17275 /* parse a bracketed class specification. Most of these will produce an
17276 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
17277 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
17278 * under /i with multi-character folds: it will be rewritten following the
17279 * paradigm of this example, where the <multi-fold>s are characters which
17280 * fold to multiple character sequences:
17281 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
17282 * gets effectively rewritten as:
17283 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
17284 * reg() gets called (recursively) on the rewritten version, and this
17285 * function will return what it constructs. (Actually the <multi-fold>s
17286 * aren't physically removed from the [abcdefghi], it's just that they are
17287 * ignored in the recursion by means of a flag:
17288 * <RExC_in_multi_char_class>.)
17290 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
17291 * characters, with the corresponding bit set if that character is in the
17292 * list. For characters above this, an inversion list is used. There
17293 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
17294 * determinable at compile time
17296 * On success, returns the offset at which any next node should be placed
17297 * into the regex engine program being compiled.
17299 * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
17300 * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
17304 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
17306 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
17307 regnode_offset ret = -1; /* Initialized to an illegal value */
17309 int namedclass = OOB_NAMEDCLASS;
17310 char *rangebegin = NULL;
17311 SV *listsv = NULL; /* List of \p{user-defined} whose definitions
17312 aren't available at the time this was called */
17313 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
17314 than just initialized. */
17315 SV* properties = NULL; /* Code points that match \p{} \P{} */
17316 SV* posixes = NULL; /* Code points that match classes like [:word:],
17317 extended beyond the Latin1 range. These have to
17318 be kept separate from other code points for much
17319 of this function because their handling is
17320 different under /i, and for most classes under
17322 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
17323 separate for a while from the non-complemented
17324 versions because of complications with /d
17326 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
17327 treated more simply than the general case,
17328 leading to less compilation and execution
17330 UV element_count = 0; /* Number of distinct elements in the class.
17331 Optimizations may be possible if this is tiny */
17332 AV * multi_char_matches = NULL; /* Code points that fold to more than one
17333 character; used under /i */
17335 char * stop_ptr = RExC_end; /* where to stop parsing */
17337 /* ignore unescaped whitespace? */
17338 const bool skip_white = cBOOL( ret_invlist
17339 || (RExC_flags & RXf_PMf_EXTENDED_MORE));
17341 /* inversion list of code points this node matches only when the target
17342 * string is in UTF-8. These are all non-ASCII, < 256. (Because is under
17344 SV* upper_latin1_only_utf8_matches = NULL;
17346 /* Inversion list of code points this node matches regardless of things
17347 * like locale, folding, utf8ness of the target string */
17348 SV* cp_list = NULL;
17350 /* Like cp_list, but code points on this list need to be checked for things
17351 * that fold to/from them under /i */
17352 SV* cp_foldable_list = NULL;
17354 /* Like cp_list, but code points on this list are valid only when the
17355 * runtime locale is UTF-8 */
17356 SV* only_utf8_locale_list = NULL;
17358 /* In a range, if one of the endpoints is non-character-set portable,
17359 * meaning that it hard-codes a code point that may mean a different
17360 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
17361 * mnemonic '\t' which each mean the same character no matter which
17362 * character set the platform is on. */
17363 unsigned int non_portable_endpoint = 0;
17365 /* Is the range unicode? which means on a platform that isn't 1-1 native
17366 * to Unicode (i.e. non-ASCII), each code point in it should be considered
17367 * to be a Unicode value. */
17368 bool unicode_range = FALSE;
17369 bool invert = FALSE; /* Is this class to be complemented */
17371 bool warn_super = ALWAYS_WARN_SUPER;
17373 const char * orig_parse = RExC_parse;
17375 /* This variable is used to mark where the end in the input is of something
17376 * that looks like a POSIX construct but isn't. During the parse, when
17377 * something looks like it could be such a construct is encountered, it is
17378 * checked for being one, but not if we've already checked this area of the
17379 * input. Only after this position is reached do we check again */
17380 char *not_posix_region_end = RExC_parse - 1;
17382 AV* posix_warnings = NULL;
17383 const bool do_posix_warnings = ckWARN(WARN_REGEXP);
17384 U8 op = END; /* The returned node-type, initialized to an impossible
17386 U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
17387 U32 posixl = 0; /* bit field of posix classes matched under /l */
17390 /* Flags as to what things aren't knowable until runtime. (Note that these are
17391 * mutually exclusive.) */
17392 #define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
17393 haven't been defined as of yet */
17394 #define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
17396 #define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
17397 what gets folded */
17398 U32 has_runtime_dependency = 0; /* OR of the above flags */
17400 DECLARE_AND_GET_RE_DEBUG_FLAGS;
17402 PERL_ARGS_ASSERT_REGCLASS;
17404 PERL_UNUSED_ARG(depth);
17407 assert(! (ret_invlist && allow_mutiple_chars));
17409 /* If wants an inversion list returned, we can't optimize to something
17412 optimizable = FALSE;
17415 DEBUG_PARSE("clas");
17417 #if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
17418 || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
17419 && UNICODE_DOT_DOT_VERSION == 0)
17420 allow_mutiple_chars = FALSE;
17423 /* We include the /i status at the beginning of this so that we can
17424 * know it at runtime */
17425 listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD)));
17426 initial_listsv_len = SvCUR(listsv);
17427 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
17429 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17431 assert(RExC_parse <= RExC_end);
17433 if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
17436 allow_mutiple_chars = FALSE;
17438 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17441 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
17442 if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) {
17443 int maybe_class = handle_possible_posix(pRExC_state,
17445 ¬_posix_region_end,
17447 TRUE /* checking only */);
17448 if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
17449 ckWARN4reg(not_posix_region_end,
17450 "POSIX syntax [%c %c] belongs inside character classes%s",
17451 *RExC_parse, *RExC_parse,
17452 (maybe_class == OOB_NAMEDCLASS)
17453 ? ((POSIXCC_NOTYET(*RExC_parse))
17454 ? " (but this one isn't implemented)"
17455 : " (but this one isn't fully valid)")
17461 /* If the caller wants us to just parse a single element, accomplish this
17462 * by faking the loop ending condition */
17463 if (stop_at_1 && RExC_end > RExC_parse) {
17464 stop_ptr = RExC_parse + 1;
17467 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
17468 if (UCHARAT(RExC_parse) == ']')
17469 goto charclassloop;
17473 if ( posix_warnings
17474 && av_tindex_skip_len_mg(posix_warnings) >= 0
17475 && RExC_parse > not_posix_region_end)
17477 /* Warnings about posix class issues are considered tentative until
17478 * we are far enough along in the parse that we can no longer
17479 * change our mind, at which point we output them. This is done
17480 * each time through the loop so that a later class won't zap them
17481 * before they have been dealt with. */
17482 output_posix_warnings(pRExC_state, posix_warnings);
17485 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
17487 if (RExC_parse >= stop_ptr) {
17491 if (UCHARAT(RExC_parse) == ']') {
17497 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
17498 save_value = value;
17499 save_prevvalue = prevvalue;
17502 rangebegin = RExC_parse;
17504 non_portable_endpoint = 0;
17506 if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) {
17507 value = utf8n_to_uvchr((U8*)RExC_parse,
17508 RExC_end - RExC_parse,
17509 &numlen, UTF8_ALLOW_DEFAULT);
17510 RExC_parse += numlen;
17513 value = UCHARAT(RExC_parse++);
17515 if (value == '[') {
17516 char * posix_class_end;
17517 namedclass = handle_possible_posix(pRExC_state,
17520 do_posix_warnings ? &posix_warnings : NULL,
17521 FALSE /* die if error */);
17522 if (namedclass > OOB_NAMEDCLASS) {
17524 /* If there was an earlier attempt to parse this particular
17525 * posix class, and it failed, it was a false alarm, as this
17526 * successful one proves */
17527 if ( posix_warnings
17528 && av_tindex_skip_len_mg(posix_warnings) >= 0
17529 && not_posix_region_end >= RExC_parse
17530 && not_posix_region_end <= posix_class_end)
17532 av_undef(posix_warnings);
17535 RExC_parse = posix_class_end;
17537 else if (namedclass == OOB_NAMEDCLASS) {
17538 not_posix_region_end = posix_class_end;
17541 namedclass = OOB_NAMEDCLASS;
17544 else if ( RExC_parse - 1 > not_posix_region_end
17545 && MAYBE_POSIXCC(value))
17547 (void) handle_possible_posix(
17549 RExC_parse - 1, /* -1 because parse has already been
17551 ¬_posix_region_end,
17552 do_posix_warnings ? &posix_warnings : NULL,
17553 TRUE /* checking only */);
17555 else if ( strict && ! skip_white
17556 && ( _generic_isCC(value, _CC_VERTSPACE)
17557 || is_VERTWS_cp_high(value)))
17559 vFAIL("Literal vertical space in [] is illegal except under /x");
17561 else if (value == '\\') {
17562 /* Is a backslash; get the code point of the char after it */
17564 if (RExC_parse >= RExC_end) {
17565 vFAIL("Unmatched [");
17568 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
17569 value = utf8n_to_uvchr((U8*)RExC_parse,
17570 RExC_end - RExC_parse,
17571 &numlen, UTF8_ALLOW_DEFAULT);
17572 RExC_parse += numlen;
17575 value = UCHARAT(RExC_parse++);
17577 /* Some compilers cannot handle switching on 64-bit integer
17578 * values, therefore value cannot be an UV. Yes, this will
17579 * be a problem later if we want switch on Unicode.
17580 * A similar issue a little bit later when switching on
17581 * namedclass. --jhi */
17583 /* If the \ is escaping white space when white space is being
17584 * skipped, it means that that white space is wanted literally, and
17585 * is already in 'value'. Otherwise, need to translate the escape
17586 * into what it signifies. */
17587 if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
17588 const char * message;
17592 case 'w': namedclass = ANYOF_WORDCHAR; break;
17593 case 'W': namedclass = ANYOF_NWORDCHAR; break;
17594 case 's': namedclass = ANYOF_SPACE; break;
17595 case 'S': namedclass = ANYOF_NSPACE; break;
17596 case 'd': namedclass = ANYOF_DIGIT; break;
17597 case 'D': namedclass = ANYOF_NDIGIT; break;
17598 case 'v': namedclass = ANYOF_VERTWS; break;
17599 case 'V': namedclass = ANYOF_NVERTWS; break;
17600 case 'h': namedclass = ANYOF_HORIZWS; break;
17601 case 'H': namedclass = ANYOF_NHORIZWS; break;
17602 case 'N': /* Handle \N{NAME} in class */
17604 const char * const backslash_N_beg = RExC_parse - 2;
17607 if (! grok_bslash_N(pRExC_state,
17608 NULL, /* No regnode */
17609 &value, /* Yes single value */
17610 &cp_count, /* Multiple code pt count */
17616 if (*flagp & NEED_UTF8)
17617 FAIL("panic: grok_bslash_N set NEED_UTF8");
17619 RETURN_FAIL_ON_RESTART_FLAGP(flagp);
17621 if (cp_count < 0) {
17622 vFAIL("\\N in a character class must be a named character: \\N{...}");
17624 else if (cp_count == 0) {
17625 ckWARNreg(RExC_parse,
17626 "Ignoring zero length \\N{} in character class");
17628 else { /* cp_count > 1 */
17629 assert(cp_count > 1);
17630 if (! RExC_in_multi_char_class) {
17631 if ( ! allow_mutiple_chars
17634 || *RExC_parse == '-')
17638 vFAIL("\\N{} here is restricted to one character");
17640 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
17641 break; /* <value> contains the first code
17642 point. Drop out of the switch to
17646 SV * multi_char_N = newSVpvn(backslash_N_beg,
17647 RExC_parse - backslash_N_beg);
17649 = add_multi_match(multi_char_matches,
17654 } /* End of cp_count != 1 */
17656 /* This element should not be processed further in this
17659 value = save_value;
17660 prevvalue = save_prevvalue;
17661 continue; /* Back to top of loop to get next char */
17664 /* Here, is a single code point, and <value> contains it */
17665 unicode_range = TRUE; /* \N{} are Unicode */
17673 if (RExC_pm_flags & PMf_WILDCARD) {
17675 /* diag_listed_as: Use of %s is not allowed in Unicode
17676 property wildcard subpatterns in regex; marked by <--
17678 vFAIL3("Use of '\\%c%c' is not allowed in Unicode property"
17679 " wildcard subpatterns", (char) value, *(RExC_parse - 1));
17682 /* \p means they want Unicode semantics */
17683 REQUIRE_UNI_RULES(flagp, 0);
17685 if (RExC_parse >= RExC_end)
17686 vFAIL2("Empty \\%c", (U8)value);
17687 if (*RExC_parse == '{') {
17688 const U8 c = (U8)value;
17689 e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
17692 vFAIL2("Missing right brace on \\%c{}", c);
17697 /* White space is allowed adjacent to the braces and after
17698 * any '^', even when not under /x */
17699 while (isSPACE(*RExC_parse)) {
17703 if (UCHARAT(RExC_parse) == '^') {
17705 /* toggle. (The rhs xor gets the single bit that
17706 * differs between P and p; the other xor inverts just
17708 value ^= 'P' ^ 'p';
17711 while (isSPACE(*RExC_parse)) {
17716 if (e == RExC_parse)
17717 vFAIL2("Empty \\%c{}", c);
17719 n = e - RExC_parse;
17720 while (isSPACE(*(RExC_parse + n - 1)))
17723 } /* The \p isn't immediately followed by a '{' */
17724 else if (! isALPHA(*RExC_parse)) {
17725 RExC_parse += (UTF)
17726 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17728 vFAIL2("Character following \\%c must be '{' or a "
17729 "single-character Unicode property name",
17737 char* name = RExC_parse;
17739 /* Any message returned about expanding the definition */
17740 SV* msg = newSVpvs_flags("", SVs_TEMP);
17742 /* If set TRUE, the property is user-defined as opposed to
17743 * official Unicode */
17744 bool user_defined = FALSE;
17745 AV * strings = NULL;
17747 SV * prop_definition = parse_uniprop_string(
17748 name, n, UTF, FOLD,
17749 FALSE, /* This is compile-time */
17751 /* We can't defer this defn when
17752 * the full result is required in
17754 ! cBOOL(ret_invlist),
17761 if (SvCUR(msg)) { /* Assumes any error causes a msg */
17762 assert(prop_definition == NULL);
17763 RExC_parse = e + 1;
17764 if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole
17765 thing so, or else the display is
17769 /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */
17770 vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg),
17771 SvCUR(msg), SvPVX(msg)));
17774 assert(prop_definition || strings);
17778 if (! prop_definition) {
17779 RExC_parse = e + 1;
17780 vFAIL("Unicode string properties are not implemented in (?[...])");
17784 "Using just the single character results"
17785 " returned by \\p{} in (?[...])");
17788 else if (! RExC_in_multi_char_class) {
17789 if (invert ^ (value == 'P')) {
17790 RExC_parse = e + 1;
17791 vFAIL("Inverting a character class which contains"
17792 " a multi-character sequence is illegal");
17795 /* For each multi-character string ... */
17796 while (av_count(strings) > 0) {
17797 /* ... Each entry is itself an array of code
17799 AV * this_string = (AV *) av_shift( strings);
17800 STRLEN cp_count = av_count(this_string);
17801 SV * final = newSV(cp_count * 4);
17804 /* Create another string of sequences of \x{...} */
17805 while (av_count(this_string) > 0) {
17806 SV * character = av_shift(this_string);
17807 UV cp = SvUV(character);
17810 REQUIRE_UTF8(flagp);
17812 Perl_sv_catpvf(aTHX_ final, "\\x{%" UVXf "}",
17814 SvREFCNT_dec_NN(character);
17816 SvREFCNT_dec_NN(this_string);
17818 /* And add that to the list of such things */
17820 = add_multi_match(multi_char_matches,
17825 SvREFCNT_dec_NN(strings);
17828 if (! prop_definition) { /* If we got only a string,
17829 this iteration didn't really
17830 find a character */
17833 else if (! is_invlist(prop_definition)) {
17835 /* Here, the definition isn't known, so we have gotten
17836 * returned a string that will be evaluated if and when
17837 * encountered at runtime. We add it to the list of
17838 * such properties, along with whether it should be
17839 * complemented or not */
17840 if (value == 'P') {
17841 sv_catpvs(listsv, "!");
17844 sv_catpvs(listsv, "+");
17846 sv_catsv(listsv, prop_definition);
17848 has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
17850 /* We don't know yet what this matches, so have to flag
17852 anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
17855 assert (prop_definition && is_invlist(prop_definition));
17857 /* Here we do have the complete property definition
17859 * Temporary workaround for [perl #133136]. For this
17860 * precise input that is in the .t that is failing,
17861 * load utf8.pm, which is what the test wants, so that
17862 * that .t passes */
17863 if ( memEQs(RExC_start, e + 1 - RExC_start,
17865 && ! hv_common(GvHVn(PL_incgv),
17867 "utf8.pm", sizeof("utf8.pm") - 1,
17868 0, HV_FETCH_ISEXISTS, NULL, 0))
17870 require_pv("utf8.pm");
17873 if (! user_defined &&
17874 /* We warn on matching an above-Unicode code point
17875 * if the match would return true, except don't
17876 * warn for \p{All}, which has exactly one element
17878 (_invlist_contains_cp(prop_definition, 0x110000)
17879 && (! (_invlist_len(prop_definition) == 1
17880 && *invlist_array(prop_definition) == 0))))
17885 /* Invert if asking for the complement */
17886 if (value == 'P') {
17887 _invlist_union_complement_2nd(properties,
17892 _invlist_union(properties, prop_definition, &properties);
17897 RExC_parse = e + 1;
17898 namedclass = ANYOF_UNIPROP; /* no official name, but it's
17902 case 'n': value = '\n'; break;
17903 case 'r': value = '\r'; break;
17904 case 't': value = '\t'; break;
17905 case 'f': value = '\f'; break;
17906 case 'b': value = '\b'; break;
17907 case 'e': value = ESC_NATIVE; break;
17908 case 'a': value = '\a'; break;
17910 RExC_parse--; /* function expects to be pointed at the 'o' */
17911 if (! grok_bslash_o(&RExC_parse,
17917 cBOOL(range), /* MAX_UV allowed for range
17923 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17924 warn_non_literal_string(RExC_parse, packed_warn, message);
17928 non_portable_endpoint++;
17932 RExC_parse--; /* function expects to be pointed at the 'x' */
17933 if (! grok_bslash_x(&RExC_parse,
17939 cBOOL(range), /* MAX_UV allowed for range
17945 else if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17946 warn_non_literal_string(RExC_parse, packed_warn, message);
17950 non_portable_endpoint++;
17954 if (! grok_bslash_c(*RExC_parse, &grok_c_char, &message,
17957 /* going to die anyway; point to exact spot of
17959 RExC_parse += (UTF)
17960 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17965 value = grok_c_char;
17967 if (message && TO_OUTPUT_WARNINGS(RExC_parse)) {
17968 warn_non_literal_string(RExC_parse, packed_warn, message);
17971 non_portable_endpoint++;
17973 case '0': case '1': case '2': case '3': case '4':
17974 case '5': case '6': case '7':
17976 /* Take 1-3 octal digits */
17977 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
17978 | PERL_SCAN_NOTIFY_ILLDIGIT;
17979 numlen = (strict) ? 4 : 3;
17980 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
17981 RExC_parse += numlen;
17984 RExC_parse += (UTF)
17985 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
17987 vFAIL("Need exactly 3 octal digits");
17989 else if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
17990 && RExC_parse < RExC_end
17991 && isDIGIT(*RExC_parse)
17992 && ckWARN(WARN_REGEXP))
17994 reg_warn_non_literal_string(
17996 form_alien_digit_msg(8, numlen, RExC_parse,
17997 RExC_end, UTF, FALSE));
18001 non_portable_endpoint++;
18006 /* Allow \_ to not give an error */
18007 if (isWORDCHAR(value) && value != '_') {
18009 vFAIL2("Unrecognized escape \\%c in character class",
18013 ckWARN2reg(RExC_parse,
18014 "Unrecognized escape \\%c in character class passed through",
18019 } /* End of switch on char following backslash */
18020 } /* end of handling backslash escape sequences */
18022 /* Here, we have the current token in 'value' */
18024 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
18027 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
18028 * literal, as is the character that began the false range, i.e.
18029 * the 'a' in the examples */
18031 const int w = (RExC_parse >= rangebegin)
18032 ? RExC_parse - rangebegin
18036 "False [] range \"%" UTF8f "\"",
18037 UTF8fARG(UTF, w, rangebegin));
18040 ckWARN2reg(RExC_parse,
18041 "False [] range \"%" UTF8f "\"",
18042 UTF8fARG(UTF, w, rangebegin));
18043 cp_list = add_cp_to_invlist(cp_list, '-');
18044 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
18048 range = 0; /* this was not a true range */
18049 element_count += 2; /* So counts for three values */
18052 classnum = namedclass_to_classnum(namedclass);
18054 if (LOC && namedclass < ANYOF_POSIXL_MAX
18055 #ifndef HAS_ISASCII
18056 && classnum != _CC_ASCII
18059 SV* scratch_list = NULL;
18061 /* What the Posix classes (like \w, [:space:]) match isn't
18062 * generally knowable under locale until actual match time. A
18063 * special node is used for these which has extra space for a
18064 * bitmap, with a bit reserved for each named class that is to
18065 * be matched against. (This isn't needed for \p{} and
18066 * pseudo-classes, as they are not affected by locale, and
18067 * hence are dealt with separately.) However, if a named class
18068 * and its complement are both present, then it matches
18069 * everything, and there is no runtime dependency. Odd numbers
18070 * are the complements of the next lower number, so xor works.
18071 * (Note that something like [\w\D] should match everything,
18072 * because \d should be a proper subset of \w. But rather than
18073 * trust that the locale is well behaved, we leave this to
18074 * runtime to sort out) */
18075 if (POSIXL_TEST(posixl, namedclass ^ 1)) {
18076 cp_list = _add_range_to_invlist(cp_list, 0, UV_MAX);
18077 POSIXL_ZERO(posixl);
18078 has_runtime_dependency &= ~HAS_L_RUNTIME_DEPENDENCY;
18079 anyof_flags &= ~ANYOF_MATCHES_POSIXL;
18080 continue; /* We could ignore the rest of the class, but
18081 best to parse it for any errors */
18083 else { /* Here, isn't the complement of any already parsed
18085 POSIXL_SET(posixl, namedclass);
18086 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18087 anyof_flags |= ANYOF_MATCHES_POSIXL;
18089 /* The above-Latin1 characters are not subject to locale
18090 * rules. Just add them to the unconditionally-matched
18093 /* Get the list of the above-Latin1 code points this
18095 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
18096 PL_XPosix_ptrs[classnum],
18098 /* Odd numbers are complements,
18099 * like NDIGIT, NASCII, ... */
18100 namedclass % 2 != 0,
18102 /* Checking if 'cp_list' is NULL first saves an extra
18103 * clone. Its reference count will be decremented at the
18104 * next union, etc, or if this is the only instance, at the
18105 * end of the routine */
18107 cp_list = scratch_list;
18110 _invlist_union(cp_list, scratch_list, &cp_list);
18111 SvREFCNT_dec_NN(scratch_list);
18113 continue; /* Go get next character */
18118 /* Here, is not /l, or is a POSIX class for which /l doesn't
18119 * matter (or is a Unicode property, which is skipped here). */
18120 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
18121 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
18123 /* Here, should be \h, \H, \v, or \V. None of /d, /i
18124 * nor /l make a difference in what these match,
18125 * therefore we just add what they match to cp_list. */
18126 if (classnum != _CC_VERTSPACE) {
18127 assert( namedclass == ANYOF_HORIZWS
18128 || namedclass == ANYOF_NHORIZWS);
18130 /* It turns out that \h is just a synonym for
18132 classnum = _CC_BLANK;
18135 _invlist_union_maybe_complement_2nd(
18137 PL_XPosix_ptrs[classnum],
18138 namedclass % 2 != 0, /* Complement if odd
18139 (NHORIZWS, NVERTWS)
18144 else if ( AT_LEAST_UNI_SEMANTICS
18145 || classnum == _CC_ASCII
18146 || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
18147 || classnum == _CC_XDIGIT)))
18149 /* We usually have to worry about /d affecting what POSIX
18150 * classes match, with special code needed because we won't
18151 * know until runtime what all matches. But there is no
18152 * extra work needed under /u and /a; and [:ascii:] is
18153 * unaffected by /d; and :digit: and :xdigit: don't have
18154 * runtime differences under /d. So we can special case
18155 * these, and avoid some extra work below, and at runtime.
18157 _invlist_union_maybe_complement_2nd(
18159 ((AT_LEAST_ASCII_RESTRICTED)
18160 ? PL_Posix_ptrs[classnum]
18161 : PL_XPosix_ptrs[classnum]),
18162 namedclass % 2 != 0,
18165 else { /* Garden variety class. If is NUPPER, NALPHA, ...
18166 complement and use nposixes */
18167 SV** posixes_ptr = namedclass % 2 == 0
18170 _invlist_union_maybe_complement_2nd(
18172 PL_XPosix_ptrs[classnum],
18173 namedclass % 2 != 0,
18177 } /* end of namedclass \blah */
18179 SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse, RExC_end);
18181 /* If 'range' is set, 'value' is the ending of a range--check its
18182 * validity. (If value isn't a single code point in the case of a
18183 * range, we should have figured that out above in the code that
18184 * catches false ranges). Later, we will handle each individual code
18185 * point in the range. If 'range' isn't set, this could be the
18186 * beginning of a range, so check for that by looking ahead to see if
18187 * the next real character to be processed is the range indicator--the
18192 /* For unicode ranges, we have to test that the Unicode as opposed
18193 * to the native values are not decreasing. (Above 255, there is
18194 * no difference between native and Unicode) */
18195 if (unicode_range && prevvalue < 255 && value < 255) {
18196 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
18197 goto backwards_range;
18202 if (prevvalue > value) /* b-a */ {
18207 w = RExC_parse - rangebegin;
18209 "Invalid [] range \"%" UTF8f "\"",
18210 UTF8fARG(UTF, w, rangebegin));
18211 NOT_REACHED; /* NOTREACHED */
18215 prevvalue = value; /* save the beginning of the potential range */
18216 if (! stop_at_1 /* Can't be a range if parsing just one thing */
18217 && *RExC_parse == '-')
18219 char* next_char_ptr = RExC_parse + 1;
18221 /* Get the next real char after the '-' */
18222 SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr, RExC_end);
18224 /* If the '-' is at the end of the class (just before the ']',
18225 * it is a literal minus; otherwise it is a range */
18226 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
18227 RExC_parse = next_char_ptr;
18229 /* a bad range like \w-, [:word:]- ? */
18230 if (namedclass > OOB_NAMEDCLASS) {
18231 if (strict || ckWARN(WARN_REGEXP)) {
18232 const int w = RExC_parse >= rangebegin
18233 ? RExC_parse - rangebegin
18236 vFAIL4("False [] range \"%*.*s\"",
18241 "False [] range \"%*.*s\"",
18245 cp_list = add_cp_to_invlist(cp_list, '-');
18248 range = 1; /* yeah, it's a range! */
18249 continue; /* but do it the next time */
18254 if (namedclass > OOB_NAMEDCLASS) {
18258 /* Here, we have a single value this time through the loop, and
18259 * <prevvalue> is the beginning of the range, if any; or <value> if
18262 /* non-Latin1 code point implies unicode semantics. */
18264 if (value > MAX_LEGAL_CP && ( value != UV_MAX
18265 || prevvalue > MAX_LEGAL_CP))
18267 vFAIL(form_cp_too_large_msg(16, NULL, 0, value));
18269 REQUIRE_UNI_RULES(flagp, 0);
18270 if ( ! silence_non_portable
18271 && UNICODE_IS_PERL_EXTENDED(value)
18272 && TO_OUTPUT_WARNINGS(RExC_parse))
18274 ckWARN2_non_literal_string(RExC_parse,
18275 packWARN(WARN_PORTABLE),
18276 PL_extended_cp_format,
18281 /* Ready to process either the single value, or the completed range.
18282 * For single-valued non-inverted ranges, we consider the possibility
18283 * of multi-char folds. (We made a conscious decision to not do this
18284 * for the other cases because it can often lead to non-intuitive
18285 * results. For example, you have the peculiar case that:
18286 * "s s" =~ /^[^\xDF]+$/i => Y
18287 * "ss" =~ /^[^\xDF]+$/i => N
18289 * See [perl #89750] */
18290 if (FOLD && allow_mutiple_chars && value == prevvalue) {
18291 if ( value == LATIN_SMALL_LETTER_SHARP_S
18292 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
18295 /* Here <value> is indeed a multi-char fold. Get what it is */
18297 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18300 UV folded = _to_uni_fold_flags(
18304 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
18305 ? FOLD_FLAGS_NOMIX_ASCII
18309 /* Here, <folded> should be the first character of the
18310 * multi-char fold of <value>, with <foldbuf> containing the
18311 * whole thing. But, if this fold is not allowed (because of
18312 * the flags), <fold> will be the same as <value>, and should
18313 * be processed like any other character, so skip the special
18315 if (folded != value) {
18317 /* Skip if we are recursed, currently parsing the class
18318 * again. Otherwise add this character to the list of
18319 * multi-char folds. */
18320 if (! RExC_in_multi_char_class) {
18321 STRLEN cp_count = utf8_length(foldbuf,
18322 foldbuf + foldlen);
18323 SV* multi_fold = sv_2mortal(newSVpvs(""));
18325 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value);
18328 = add_multi_match(multi_char_matches,
18334 /* This element should not be processed further in this
18337 value = save_value;
18338 prevvalue = save_prevvalue;
18344 if (strict && ckWARN(WARN_REGEXP)) {
18347 /* If the range starts above 255, everything is portable and
18348 * likely to be so for any forseeable character set, so don't
18350 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
18351 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
18353 else if (prevvalue != value) {
18355 /* Under strict, ranges that stop and/or end in an ASCII
18356 * printable should have each end point be a portable value
18357 * for it (preferably like 'A', but we don't warn if it is
18358 * a (portable) Unicode name or code point), and the range
18359 * must be all digits or all letters of the same case.
18360 * Otherwise, the range is non-portable and unclear as to
18361 * what it contains */
18362 if ( (isPRINT_A(prevvalue) || isPRINT_A(value))
18363 && ( non_portable_endpoint
18364 || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value))
18365 || (isLOWER_A(prevvalue) && isLOWER_A(value))
18366 || (isUPPER_A(prevvalue) && isUPPER_A(value))
18368 vWARN(RExC_parse, "Ranges of ASCII printables should"
18369 " be some subset of \"0-9\","
18370 " \"A-Z\", or \"a-z\"");
18372 else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
18373 SSize_t index_start;
18374 SSize_t index_final;
18376 /* But the nature of Unicode and languages mean we
18377 * can't do the same checks for above-ASCII ranges,
18378 * except in the case of digit ones. These should
18379 * contain only digits from the same group of 10. The
18380 * ASCII case is handled just above. Hence here, the
18381 * range could be a range of digits. First some
18382 * unlikely special cases. Grandfather in that a range
18383 * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
18384 * if its starting value is one of the 10 digits prior
18385 * to it. This is because it is an alternate way of
18386 * writing 19D1, and some people may expect it to be in
18387 * that group. But it is bad, because it won't give
18388 * the expected results. In Unicode 5.2 it was
18389 * considered to be in that group (of 11, hence), but
18390 * this was fixed in the next version */
18392 if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) {
18393 goto warn_bad_digit_range;
18395 else if (UNLIKELY( prevvalue >= 0x1D7CE
18396 && value <= 0x1D7FF))
18398 /* This is the only other case currently in Unicode
18399 * where the algorithm below fails. The code
18400 * points just above are the end points of a single
18401 * range containing only decimal digits. It is 5
18402 * different series of 0-9. All other ranges of
18403 * digits currently in Unicode are just a single
18404 * series. (And mktables will notify us if a later
18405 * Unicode version breaks this.)
18407 * If the range being checked is at most 9 long,
18408 * and the digit values represented are in
18409 * numerical order, they are from the same series.
18411 if ( value - prevvalue > 9
18412 || ((( value - 0x1D7CE) % 10)
18413 <= (prevvalue - 0x1D7CE) % 10))
18415 goto warn_bad_digit_range;
18420 /* For all other ranges of digits in Unicode, the
18421 * algorithm is just to check if both end points
18422 * are in the same series, which is the same range.
18424 index_start = _invlist_search(
18425 PL_XPosix_ptrs[_CC_DIGIT],
18428 /* Warn if the range starts and ends with a digit,
18429 * and they are not in the same group of 10. */
18430 if ( index_start >= 0
18431 && ELEMENT_RANGE_MATCHES_INVLIST(index_start)
18433 _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
18434 value)) != index_start
18435 && index_final >= 0
18436 && ELEMENT_RANGE_MATCHES_INVLIST(index_final))
18438 warn_bad_digit_range:
18439 vWARN(RExC_parse, "Ranges of digits should be"
18440 " from the same group of"
18447 if ((! range || prevvalue == value) && non_portable_endpoint) {
18448 if (isPRINT_A(value)) {
18451 if (isBACKSLASHED_PUNCT(value)) {
18452 literal[d++] = '\\';
18454 literal[d++] = (char) value;
18455 literal[d++] = '\0';
18458 "\"%.*s\" is more clearly written simply as \"%s\"",
18459 (int) (RExC_parse - rangebegin),
18464 else if (isMNEMONIC_CNTRL(value)) {
18466 "\"%.*s\" is more clearly written simply as \"%s\"",
18467 (int) (RExC_parse - rangebegin),
18469 cntrl_to_mnemonic((U8) value)
18475 /* Deal with this element of the class */
18478 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18481 /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
18482 * that don't require special handling, we can just add the range like
18483 * we do for ASCII platforms */
18484 if ((UNLIKELY(prevvalue == 0) && value >= 255)
18485 || ! (prevvalue < 256
18487 || (! non_portable_endpoint
18488 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
18489 || (isUPPER_A(prevvalue)
18490 && isUPPER_A(value)))))))
18492 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18496 /* Here, requires special handling. This can be because it is a
18497 * range whose code points are considered to be Unicode, and so
18498 * must be individually translated into native, or because its a
18499 * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
18500 * EBCDIC, but we have defined them to include only the "expected"
18501 * upper or lower case ASCII alphabetics. Subranges above 255 are
18502 * the same in native and Unicode, so can be added as a range */
18503 U8 start = NATIVE_TO_LATIN1(prevvalue);
18505 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
18506 for (j = start; j <= end; j++) {
18507 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
18510 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
18516 range = 0; /* this range (if it was one) is done now */
18517 } /* End of loop through all the text within the brackets */
18519 if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
18520 output_posix_warnings(pRExC_state, posix_warnings);
18523 /* If anything in the class expands to more than one character, we have to
18524 * deal with them by building up a substitute parse string, and recursively
18525 * calling reg() on it, instead of proceeding */
18526 if (multi_char_matches) {
18527 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
18530 char *save_end = RExC_end;
18531 char *save_parse = RExC_parse;
18532 char *save_start = RExC_start;
18533 Size_t constructed_prefix_len = 0; /* This gives the length of the
18534 constructed portion of the
18535 substitute parse. */
18536 bool first_time = TRUE; /* First multi-char occurrence doesn't get
18541 /* Only one level of recursion allowed */
18542 assert(RExC_copy_start_in_constructed == RExC_precomp);
18544 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
18545 because too confusing */
18547 sv_catpvs(substitute_parse, "(?:");
18551 /* Look at the longest strings first */
18552 for (cp_count = av_tindex_skip_len_mg(multi_char_matches);
18557 if (av_exists(multi_char_matches, cp_count)) {
18558 AV** this_array_ptr;
18561 this_array_ptr = (AV**) av_fetch(multi_char_matches,
18563 while ((this_sequence = av_pop(*this_array_ptr)) !=
18566 if (! first_time) {
18567 sv_catpvs(substitute_parse, "|");
18569 first_time = FALSE;
18571 sv_catpv(substitute_parse, SvPVX(this_sequence));
18576 /* If the character class contains anything else besides these
18577 * multi-character strings, have to include it in recursive parsing */
18578 if (element_count) {
18579 bool has_l_bracket = orig_parse > RExC_start && *(orig_parse - 1) == '[';
18581 sv_catpvs(substitute_parse, "|");
18582 if (has_l_bracket) { /* Add an [ if the original had one */
18583 sv_catpvs(substitute_parse, "[");
18585 constructed_prefix_len = SvCUR(substitute_parse);
18586 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
18588 /* Put in a closing ']' to match any opening one, but not if going
18589 * off the end, as otherwise we are adding something that really
18591 if (has_l_bracket && RExC_parse < RExC_end) {
18592 sv_catpvs(substitute_parse, "]");
18596 sv_catpvs(substitute_parse, ")");
18599 /* This is a way to get the parse to skip forward a whole named
18600 * sequence instead of matching the 2nd character when it fails the
18602 sv_catpvs(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
18606 /* Set up the data structure so that any errors will be properly
18607 * reported. See the comments at the definition of
18608 * REPORT_LOCATION_ARGS for details */
18609 RExC_copy_start_in_input = (char *) orig_parse;
18610 RExC_start = RExC_parse = SvPV(substitute_parse, len);
18611 RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
18612 RExC_end = RExC_parse + len;
18613 RExC_in_multi_char_class = 1;
18615 ret = reg(pRExC_state, 1, ®_flags, depth+1);
18617 *flagp |= reg_flags & (HASWIDTH|SIMPLE|POSTPONED|RESTART_PARSE|NEED_UTF8);
18619 /* And restore so can parse the rest of the pattern */
18620 RExC_parse = save_parse;
18621 RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
18622 RExC_end = save_end;
18623 RExC_in_multi_char_class = 0;
18624 SvREFCNT_dec_NN(multi_char_matches);
18628 /* If folding, we calculate all characters that could fold to or from the
18629 * ones already on the list */
18630 if (cp_foldable_list) {
18632 UV start, end; /* End points of code point ranges */
18634 SV* fold_intersection = NULL;
18637 /* Our calculated list will be for Unicode rules. For locale
18638 * matching, we have to keep a separate list that is consulted at
18639 * runtime only when the locale indicates Unicode rules (and we
18640 * don't include potential matches in the ASCII/Latin1 range, as
18641 * any code point could fold to any other, based on the run-time
18642 * locale). For non-locale, we just use the general list */
18644 use_list = &only_utf8_locale_list;
18647 use_list = &cp_list;
18650 /* Only the characters in this class that participate in folds need
18651 * be checked. Get the intersection of this class and all the
18652 * possible characters that are foldable. This can quickly narrow
18653 * down a large class */
18654 _invlist_intersection(PL_in_some_fold, cp_foldable_list,
18655 &fold_intersection);
18657 /* Now look at the foldable characters in this class individually */
18658 invlist_iterinit(fold_intersection);
18659 while (invlist_iternext(fold_intersection, &start, &end)) {
18663 /* Look at every character in the range */
18664 for (j = start; j <= end; j++) {
18665 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
18668 Size_t folds_count;
18670 const U32 * remaining_folds;
18674 /* Under /l, we don't know what code points below 256
18675 * fold to, except we do know the MICRO SIGN folds to
18676 * an above-255 character if the locale is UTF-8, so we
18677 * add it to the special list (in *use_list) Otherwise
18678 * we know now what things can match, though some folds
18679 * are valid under /d only if the target is UTF-8.
18680 * Those go in a separate list */
18681 if ( IS_IN_SOME_FOLD_L1(j)
18682 && ! (LOC && j != MICRO_SIGN))
18685 /* ASCII is always matched; non-ASCII is matched
18686 * only under Unicode rules (which could happen
18687 * under /l if the locale is a UTF-8 one */
18688 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
18689 *use_list = add_cp_to_invlist(*use_list,
18690 PL_fold_latin1[j]);
18692 else if (j != PL_fold_latin1[j]) {
18693 upper_latin1_only_utf8_matches
18694 = add_cp_to_invlist(
18695 upper_latin1_only_utf8_matches,
18696 PL_fold_latin1[j]);
18700 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
18701 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
18703 add_above_Latin1_folds(pRExC_state,
18710 /* Here is an above Latin1 character. We don't have the
18711 * rules hard-coded for it. First, get its fold. This is
18712 * the simple fold, as the multi-character folds have been
18713 * handled earlier and separated out */
18714 folded = _to_uni_fold_flags(j, foldbuf, &foldlen,
18715 (ASCII_FOLD_RESTRICTED)
18716 ? FOLD_FLAGS_NOMIX_ASCII
18719 /* Single character fold of above Latin1. Add everything
18720 * in its fold closure to the list that this node should
18722 folds_count = _inverse_folds(folded, &first_fold,
18724 for (k = 0; k <= folds_count; k++) {
18725 UV c = (k == 0) /* First time through use itself */
18727 : (k == 1) /* 2nd time use, the first fold */
18730 /* Then the remaining ones */
18731 : remaining_folds[k-2];
18733 /* /aa doesn't allow folds between ASCII and non- */
18734 if (( ASCII_FOLD_RESTRICTED
18735 && (isASCII(c) != isASCII(j))))
18740 /* Folds under /l which cross the 255/256 boundary are
18741 * added to a separate list. (These are valid only
18742 * when the locale is UTF-8.) */
18743 if (c < 256 && LOC) {
18744 *use_list = add_cp_to_invlist(*use_list, c);
18748 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
18750 cp_list = add_cp_to_invlist(cp_list, c);
18753 /* Similarly folds involving non-ascii Latin1
18754 * characters under /d are added to their list */
18755 upper_latin1_only_utf8_matches
18756 = add_cp_to_invlist(
18757 upper_latin1_only_utf8_matches,
18763 SvREFCNT_dec_NN(fold_intersection);
18766 /* Now that we have finished adding all the folds, there is no reason
18767 * to keep the foldable list separate */
18768 _invlist_union(cp_list, cp_foldable_list, &cp_list);
18769 SvREFCNT_dec_NN(cp_foldable_list);
18772 /* And combine the result (if any) with any inversion lists from posix
18773 * classes. The lists are kept separate up to now because we don't want to
18774 * fold the classes */
18775 if (simple_posixes) { /* These are the classes known to be unaffected by
18778 _invlist_union(cp_list, simple_posixes, &cp_list);
18779 SvREFCNT_dec_NN(simple_posixes);
18782 cp_list = simple_posixes;
18785 if (posixes || nposixes) {
18786 if (! DEPENDS_SEMANTICS) {
18788 /* For everything but /d, we can just add the current 'posixes' and
18789 * 'nposixes' to the main list */
18792 _invlist_union(cp_list, posixes, &cp_list);
18793 SvREFCNT_dec_NN(posixes);
18801 _invlist_union(cp_list, nposixes, &cp_list);
18802 SvREFCNT_dec_NN(nposixes);
18805 cp_list = nposixes;
18810 /* Under /d, things like \w match upper Latin1 characters only if
18811 * the target string is in UTF-8. But things like \W match all the
18812 * upper Latin1 characters if the target string is not in UTF-8.
18814 * Handle the case with something like \W separately */
18816 SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
18818 /* A complemented posix class matches all upper Latin1
18819 * characters if not in UTF-8. And it matches just certain
18820 * ones when in UTF-8. That means those certain ones are
18821 * matched regardless, so can just be added to the
18822 * unconditional list */
18824 _invlist_union(cp_list, nposixes, &cp_list);
18825 SvREFCNT_dec_NN(nposixes);
18829 cp_list = nposixes;
18832 /* Likewise for 'posixes' */
18833 _invlist_union(posixes, cp_list, &cp_list);
18834 SvREFCNT_dec(posixes);
18836 /* Likewise for anything else in the range that matched only
18838 if (upper_latin1_only_utf8_matches) {
18839 _invlist_union(cp_list,
18840 upper_latin1_only_utf8_matches,
18842 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18843 upper_latin1_only_utf8_matches = NULL;
18846 /* If we don't match all the upper Latin1 characters regardless
18847 * of UTF-8ness, we have to set a flag to match the rest when
18849 _invlist_subtract(only_non_utf8_list, cp_list,
18850 &only_non_utf8_list);
18851 if (_invlist_len(only_non_utf8_list) != 0) {
18852 anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18854 SvREFCNT_dec_NN(only_non_utf8_list);
18857 /* Here there were no complemented posix classes. That means
18858 * the upper Latin1 characters in 'posixes' match only when the
18859 * target string is in UTF-8. So we have to add them to the
18860 * list of those types of code points, while adding the
18861 * remainder to the unconditional list.
18863 * First calculate what they are */
18864 SV* nonascii_but_latin1_properties = NULL;
18865 _invlist_intersection(posixes, PL_UpperLatin1,
18866 &nonascii_but_latin1_properties);
18868 /* And add them to the final list of such characters. */
18869 _invlist_union(upper_latin1_only_utf8_matches,
18870 nonascii_but_latin1_properties,
18871 &upper_latin1_only_utf8_matches);
18873 /* Remove them from what now becomes the unconditional list */
18874 _invlist_subtract(posixes, nonascii_but_latin1_properties,
18877 /* And add those unconditional ones to the final list */
18879 _invlist_union(cp_list, posixes, &cp_list);
18880 SvREFCNT_dec_NN(posixes);
18887 SvREFCNT_dec(nonascii_but_latin1_properties);
18889 /* Get rid of any characters from the conditional list that we
18890 * now know are matched unconditionally, which may make that
18892 _invlist_subtract(upper_latin1_only_utf8_matches,
18894 &upper_latin1_only_utf8_matches);
18895 if (_invlist_len(upper_latin1_only_utf8_matches) == 0) {
18896 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
18897 upper_latin1_only_utf8_matches = NULL;
18903 /* And combine the result (if any) with any inversion list from properties.
18904 * The lists are kept separate up to now so that we can distinguish the two
18905 * in regards to matching above-Unicode. A run-time warning is generated
18906 * if a Unicode property is matched against a non-Unicode code point. But,
18907 * we allow user-defined properties to match anything, without any warning,
18908 * and we also suppress the warning if there is a portion of the character
18909 * class that isn't a Unicode property, and which matches above Unicode, \W
18910 * or [\x{110000}] for example.
18911 * (Note that in this case, unlike the Posix one above, there is no
18912 * <upper_latin1_only_utf8_matches>, because having a Unicode property
18913 * forces Unicode semantics */
18917 /* If it matters to the final outcome, see if a non-property
18918 * component of the class matches above Unicode. If so, the
18919 * warning gets suppressed. This is true even if just a single
18920 * such code point is specified, as, though not strictly correct if
18921 * another such code point is matched against, the fact that they
18922 * are using above-Unicode code points indicates they should know
18923 * the issues involved */
18925 warn_super = ! (invert
18926 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
18929 _invlist_union(properties, cp_list, &cp_list);
18930 SvREFCNT_dec_NN(properties);
18933 cp_list = properties;
18938 |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
18940 /* Because an ANYOF node is the only one that warns, this node
18941 * can't be optimized into something else */
18942 optimizable = FALSE;
18946 /* Here, we have calculated what code points should be in the character
18949 * Now we can see about various optimizations. Fold calculation (which we
18950 * did above) needs to take place before inversion. Otherwise /[^k]/i
18951 * would invert to include K, which under /i would match k, which it
18952 * shouldn't. Therefore we can't invert folded locale now, as it won't be
18953 * folded until runtime */
18955 /* If we didn't do folding, it's because some information isn't available
18956 * until runtime; set the run-time fold flag for these We know to set the
18957 * flag if we have a non-NULL list for UTF-8 locales, or the class matches
18958 * at least one 0-255 range code point */
18961 /* Some things on the list might be unconditionally included because of
18962 * other components. Remove them, and clean up the list if it goes to
18964 if (only_utf8_locale_list && cp_list) {
18965 _invlist_subtract(only_utf8_locale_list, cp_list,
18966 &only_utf8_locale_list);
18968 if (_invlist_len(only_utf8_locale_list) == 0) {
18969 SvREFCNT_dec_NN(only_utf8_locale_list);
18970 only_utf8_locale_list = NULL;
18973 if ( only_utf8_locale_list
18974 || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
18975 || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
18977 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18980 | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
18982 else if (cp_list && invlist_lowest(cp_list) < 256) {
18983 /* If nothing is below 256, has no locale dependency; otherwise it
18985 anyof_flags |= ANYOFL_FOLD;
18986 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
18989 else if ( DEPENDS_SEMANTICS
18990 && ( upper_latin1_only_utf8_matches
18991 || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
18993 RExC_seen_d_op = TRUE;
18994 has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
18997 /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
19001 && ! has_runtime_dependency)
19003 _invlist_invert(cp_list);
19005 /* Clear the invert flag since have just done it here */
19009 /* All possible optimizations below still have these characteristics.
19010 * (Multi-char folds aren't SIMPLE, but they don't get this far in this
19012 *flagp |= HASWIDTH|SIMPLE;
19015 *ret_invlist = cp_list;
19017 return (cp_list) ? RExC_emit : 0;
19020 if (anyof_flags & ANYOF_LOCALE_FLAGS) {
19021 RExC_contains_locale = 1;
19024 /* Some character classes are equivalent to other nodes. Such nodes take
19025 * up less room, and some nodes require fewer operations to execute, than
19026 * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
19027 * improve efficiency. */
19030 PERL_UINT_FAST8_T i;
19031 UV partial_cp_count = 0;
19032 UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
19033 UV end[MAX_FOLD_FROMS+1] = { 0 };
19034 bool single_range = FALSE;
19036 if (cp_list) { /* Count the code points in enough ranges that we would
19037 see all the ones possible in any fold in this version
19040 invlist_iterinit(cp_list);
19041 for (i = 0; i <= MAX_FOLD_FROMS; i++) {
19042 if (! invlist_iternext(cp_list, &start[i], &end[i])) {
19045 partial_cp_count += end[i] - start[i] + 1;
19049 single_range = TRUE;
19051 invlist_iterfinish(cp_list);
19054 /* If we know at compile time that this matches every possible code
19055 * point, any run-time dependencies don't matter */
19056 if (start[0] == 0 && end[0] == UV_MAX) {
19058 ret = reganode(pRExC_state, OPFAIL, 0);
19061 ret = reg_node(pRExC_state, SANY);
19067 /* Similarly, for /l posix classes, if both a class and its
19068 * complement match, any run-time dependencies don't matter */
19070 for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
19073 if ( POSIXL_TEST(posixl, namedclass) /* class */
19074 && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
19077 ret = reganode(pRExC_state, OPFAIL, 0);
19080 ret = reg_node(pRExC_state, SANY);
19087 /* For well-behaved locales, some classes are subsets of others,
19088 * so complementing the subset and including the non-complemented
19089 * superset should match everything, like [\D[:alnum:]], and
19090 * [[:^alpha:][:alnum:]], but some implementations of locales are
19091 * buggy, and khw thinks its a bad idea to have optimization change
19092 * behavior, even if it avoids an OS bug in a given case */
19094 #define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
19096 /* If is a single posix /l class, can optimize to just that op.
19097 * Such a node will not match anything in the Latin1 range, as that
19098 * is not determinable until runtime, but will match whatever the
19099 * class does outside that range. (Note that some classes won't
19100 * match anything outside the range, like [:ascii:]) */
19101 if ( isSINGLE_BIT_SET(posixl)
19102 && (partial_cp_count == 0 || start[0] > 255))
19105 SV * class_above_latin1 = NULL;
19106 bool already_inverted;
19107 bool are_equivalent;
19109 /* Compute which bit is set, which is the same thing as, e.g.,
19110 * ANYOF_CNTRL. From
19111 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
19113 static const int MultiplyDeBruijnBitPosition2[32] =
19115 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
19116 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
19119 namedclass = MultiplyDeBruijnBitPosition2[(posixl
19120 * 0x077CB531U) >> 27];
19121 classnum = namedclass_to_classnum(namedclass);
19123 /* The named classes are such that the inverted number is one
19124 * larger than the non-inverted one */
19125 already_inverted = namedclass
19126 - classnum_to_namedclass(classnum);
19128 /* Create an inversion list of the official property, inverted
19129 * if the constructed node list is inverted, and restricted to
19130 * only the above latin1 code points, which are the only ones
19131 * known at compile time */
19132 _invlist_intersection_maybe_complement_2nd(
19134 PL_XPosix_ptrs[classnum],
19136 &class_above_latin1);
19137 are_equivalent = _invlistEQ(class_above_latin1, cp_list,
19139 SvREFCNT_dec_NN(class_above_latin1);
19141 if (are_equivalent) {
19143 /* Resolve the run-time inversion flag with this possibly
19144 * inverted class */
19145 invert = invert ^ already_inverted;
19147 ret = reg_node(pRExC_state,
19148 POSIXL + invert * (NPOSIXL - POSIXL));
19149 FLAGS(REGNODE_p(ret)) = classnum;
19155 /* khw can't think of any other possible transformation involving
19157 if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
19161 if (! has_runtime_dependency) {
19163 /* If the list is empty, nothing matches. This happens, for
19164 * example, when a Unicode property that doesn't match anything is
19165 * the only element in the character class (perluniprops.pod notes
19166 * such properties). */
19167 if (partial_cp_count == 0) {
19169 ret = reg_node(pRExC_state, SANY);
19172 ret = reganode(pRExC_state, OPFAIL, 0);
19178 /* If matches everything but \n */
19179 if ( start[0] == 0 && end[0] == '\n' - 1
19180 && start[1] == '\n' + 1 && end[1] == UV_MAX)
19183 ret = reg_node(pRExC_state, REG_ANY);
19189 /* Next see if can optimize classes that contain just a few code points
19190 * into an EXACTish node. The reason to do this is to let the
19191 * optimizer join this node with adjacent EXACTish ones, and ANYOF
19192 * nodes require conversion to code point from UTF-8.
19194 * An EXACTFish node can be generated even if not under /i, and vice
19195 * versa. But care must be taken. An EXACTFish node has to be such
19196 * that it only matches precisely the code points in the class, but we
19197 * want to generate the least restrictive one that does that, to
19198 * increase the odds of being able to join with an adjacent node. For
19199 * example, if the class contains [kK], we have to make it an EXACTFAA
19200 * node to prevent the KELVIN SIGN from matching. Whether we are under
19201 * /i or not is irrelevant in this case. Less obvious is the pattern
19202 * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
19203 * supposed to match the single character U+0149 LATIN SMALL LETTER N
19204 * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
19205 * that includes \X{02BC}, there is a multi-char fold that does, and so
19206 * the node generated for it must be an EXACTFish one. On the other
19207 * hand qr/:/i should generate a plain EXACT node since the colon
19208 * participates in no fold whatsoever, and having it EXACT tells the
19209 * optimizer the target string cannot match unless it has a colon in
19215 /* Only try if there are no more code points in the class than
19216 * in the max possible fold */
19217 && inRANGE(partial_cp_count, 1, MAX_FOLD_FROMS + 1))
19219 if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
19221 /* We can always make a single code point class into an
19222 * EXACTish node. */
19226 /* Here is /l: Use EXACTL, except if there is a fold not
19227 * known until runtime so shows as only a single code point
19228 * here. For code points above 255, we know which can
19229 * cause problems by having a potential fold to the Latin1
19232 || ( start[0] > 255
19233 && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
19241 else if (! FOLD) { /* Not /l and not /i */
19242 op = (start[0] < 256) ? EXACT : EXACT_REQ8;
19244 else if (start[0] < 256) { /* /i, not /l, and the code point is
19247 /* Under /i, it gets a little tricky. A code point that
19248 * doesn't participate in a fold should be an EXACT node.
19249 * We know this one isn't the result of a simple fold, or
19250 * there'd be more than one code point in the list, but it
19251 * could be part of a multi- character fold. In that case
19252 * we better not create an EXACT node, as we would wrongly
19253 * be telling the optimizer that this code point must be in
19254 * the target string, and that is wrong. This is because
19255 * if the sequence around this code point forms a
19256 * multi-char fold, what needs to be in the string could be
19257 * the code point that folds to the sequence.
19259 * This handles the case of below-255 code points, as we
19260 * have an easy look up for those. The next clause handles
19261 * the above-256 one */
19262 op = IS_IN_SOME_FOLD_L1(start[0])
19266 else { /* /i, larger code point. Since we are under /i, and
19267 have just this code point, we know that it can't
19268 fold to something else, so PL_InMultiCharFold
19270 op = _invlist_contains_cp(PL_InMultiCharFold,
19278 else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
19279 && _invlist_contains_cp(PL_in_some_fold, start[0]))
19281 /* Here, the only runtime dependency, if any, is from /d, and
19282 * the class matches more than one code point, and the lowest
19283 * code point participates in some fold. It might be that the
19284 * other code points are /i equivalent to this one, and hence
19285 * they would representable by an EXACTFish node. Above, we
19286 * eliminated classes that contain too many code points to be
19287 * EXACTFish, with the test for MAX_FOLD_FROMS
19289 * First, special case the ASCII fold pairs, like 'B' and 'b'.
19290 * We do this because we have EXACTFAA at our disposal for the
19292 if (partial_cp_count == 2 && isASCII(start[0])) {
19294 /* The only ASCII characters that participate in folds are
19296 assert(isALPHA(start[0]));
19297 if ( end[0] == start[0] /* First range is a single
19298 character, so 2nd exists */
19299 && isALPHA_FOLD_EQ(start[0], start[1]))
19302 /* Here, is part of an ASCII fold pair */
19304 if ( ASCII_FOLD_RESTRICTED
19305 || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
19307 /* If the second clause just above was true, it
19308 * means we can't be under /i, or else the list
19309 * would have included more than this fold pair.
19310 * Therefore we have to exclude the possibility of
19311 * whatever else it is that folds to these, by
19312 * using EXACTFAA */
19315 else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
19317 /* Here, there's no simple fold that start[0] is part
19318 * of, but there is a multi-character one. If we
19319 * are not under /i, we want to exclude that
19320 * possibility; if under /i, we want to include it
19322 op = (FOLD) ? EXACTFU : EXACTFAA;
19326 /* Here, the only possible fold start[0] particpates in
19327 * is with start[1]. /i or not isn't relevant */
19331 value = toFOLD(start[0]);
19334 else if ( ! upper_latin1_only_utf8_matches
19335 || ( _invlist_len(upper_latin1_only_utf8_matches)
19338 invlist_highest(upper_latin1_only_utf8_matches)]
19341 /* Here, the smallest character is non-ascii or there are
19342 * more than 2 code points matched by this node. Also, we
19343 * either don't have /d UTF-8 dependent matches, or if we
19344 * do, they look like they could be a single character that
19345 * is the fold of the lowest one in the always-match list.
19346 * This test quickly excludes most of the false positives
19347 * when there are /d UTF-8 depdendent matches. These are
19348 * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
19349 * SMALL LETTER A WITH GRAVE iff the target string is
19350 * UTF-8. (We don't have to worry above about exceeding
19351 * the array bounds of PL_fold_latin1[] because any code
19352 * point in 'upper_latin1_only_utf8_matches' is below 256.)
19354 * EXACTFAA would apply only to pairs (hence exactly 2 code
19355 * points) in the ASCII range, so we can't use it here to
19356 * artificially restrict the fold domain, so we check if
19357 * the class does or does not match some EXACTFish node.
19358 * Further, if we aren't under /i, and the folded-to
19359 * character is part of a multi-character fold, we can't do
19360 * this optimization, as the sequence around it could be
19361 * that multi-character fold, and we don't here know the
19362 * context, so we have to assume it is that multi-char
19363 * fold, to prevent potential bugs.
19365 * To do the general case, we first find the fold of the
19366 * lowest code point (which may be higher than the lowest
19367 * one), then find everything that folds to it. (The data
19368 * structure we have only maps from the folded code points,
19369 * so we have to do the earlier step.) */
19372 U8 foldbuf[UTF8_MAXBYTES_CASE];
19373 UV folded = _to_uni_fold_flags(start[0],
19374 foldbuf, &foldlen, 0);
19376 const U32 * remaining_folds;
19377 Size_t folds_to_this_cp_count = _inverse_folds(
19381 Size_t folds_count = folds_to_this_cp_count + 1;
19382 SV * fold_list = _new_invlist(folds_count);
19385 /* If there are UTF-8 dependent matches, create a temporary
19386 * list of what this node matches, including them. */
19387 SV * all_cp_list = NULL;
19388 SV ** use_this_list = &cp_list;
19390 if (upper_latin1_only_utf8_matches) {
19391 all_cp_list = _new_invlist(0);
19392 use_this_list = &all_cp_list;
19393 _invlist_union(cp_list,
19394 upper_latin1_only_utf8_matches,
19398 /* Having gotten everything that participates in the fold
19399 * containing the lowest code point, we turn that into an
19400 * inversion list, making sure everything is included. */
19401 fold_list = add_cp_to_invlist(fold_list, start[0]);
19402 fold_list = add_cp_to_invlist(fold_list, folded);
19403 if (folds_to_this_cp_count > 0) {
19404 fold_list = add_cp_to_invlist(fold_list, first_fold);
19405 for (i = 0; i + 1 < folds_to_this_cp_count; i++) {
19406 fold_list = add_cp_to_invlist(fold_list,
19407 remaining_folds[i]);
19411 /* If the fold list is identical to what's in this ANYOF
19412 * node, the node can be represented by an EXACTFish one
19414 if (_invlistEQ(*use_this_list, fold_list,
19415 0 /* Don't complement */ )
19418 /* But, we have to be careful, as mentioned above.
19419 * Just the right sequence of characters could match
19420 * this if it is part of a multi-character fold. That
19421 * IS what we want if we are under /i. But it ISN'T
19422 * what we want if not under /i, as it could match when
19423 * it shouldn't. So, when we aren't under /i and this
19424 * character participates in a multi-char fold, we
19425 * don't optimize into an EXACTFish node. So, for each
19426 * case below we have to check if we are folding
19427 * and if not, if it is not part of a multi-char fold.
19429 if (start[0] > 255) { /* Highish code point */
19430 if (FOLD || ! _invlist_contains_cp(
19431 PL_InMultiCharFold, folded))
19435 : (ASCII_FOLD_RESTRICTED)
19440 } /* Below, the lowest code point < 256 */
19443 && DEPENDS_SEMANTICS)
19444 { /* An EXACTF node containing a single character
19445 's', can be an EXACTFU if it doesn't get
19446 joined with an adjacent 's' */
19447 op = EXACTFU_S_EDGE;
19451 || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
19453 if (upper_latin1_only_utf8_matches) {
19456 /* We can't use the fold, as that only matches
19460 else if ( UNLIKELY(start[0] == MICRO_SIGN)
19462 { /* EXACTFUP is a special node for this
19464 op = (ASCII_FOLD_RESTRICTED)
19467 value = MICRO_SIGN;
19469 else if ( ASCII_FOLD_RESTRICTED
19470 && ! isASCII(start[0]))
19471 { /* For ASCII under /iaa, we can use EXACTFU
19483 SvREFCNT_dec_NN(fold_list);
19484 SvREFCNT_dec(all_cp_list);
19491 /* Here, we have calculated what EXACTish node to use. Have to
19492 * convert to UTF-8 if not already there */
19495 SvREFCNT_dec(cp_list);;
19496 REQUIRE_UTF8(flagp);
19499 /* This is a kludge to the special casing issues with this
19500 * ligature under /aa. FB05 should fold to FB06, but the
19501 * call above to _to_uni_fold_flags() didn't find this, as
19502 * it didn't use the /aa restriction in order to not miss
19503 * other folds that would be affected. This is the only
19504 * instance likely to ever be a problem in all of Unicode.
19505 * So special case it. */
19506 if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
19507 && ASCII_FOLD_RESTRICTED)
19509 value = LATIN_SMALL_LIGATURE_ST;
19513 len = (UTF) ? UVCHR_SKIP(value) : 1;
19515 ret = regnode_guts(pRExC_state, op, len, "exact");
19516 FILL_NODE(ret, op);
19517 RExC_emit += 1 + STR_SZ(len);
19518 setSTR_LEN(REGNODE_p(ret), len);
19520 *STRINGs(REGNODE_p(ret)) = (U8) value;
19523 uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
19529 if (! has_runtime_dependency) {
19531 /* See if this can be turned into an ANYOFM node. Think about the
19532 * bit patterns in two different bytes. In some positions, the
19533 * bits in each will be 1; and in other positions both will be 0;
19534 * and in some positions the bit will be 1 in one byte, and 0 in
19535 * the other. Let 'n' be the number of positions where the bits
19536 * differ. We create a mask which has exactly 'n' 0 bits, each in
19537 * a position where the two bytes differ. Now take the set of all
19538 * bytes that when ANDed with the mask yield the same result. That
19539 * set has 2**n elements, and is representable by just two 8 bit
19540 * numbers: the result and the mask. Importantly, matching the set
19541 * can be vectorized by creating a word full of the result bytes,
19542 * and a word full of the mask bytes, yielding a significant speed
19543 * up. Here, see if this node matches such a set. As a concrete
19544 * example consider [01], and the byte representing '0' which is
19545 * 0x30 on ASCII machines. It has the bits 0011 0000. Take the
19546 * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get
19547 * 0x30. Any other bytes ANDed yield something else. So [01],
19548 * which is a common usage, is optimizable into ANYOFM, and can
19549 * benefit from the speed up. We can only do this on UTF-8
19550 * invariant bytes, because they have the same bit patterns under
19552 PERL_UINT_FAST8_T inverted = 0;
19554 const PERL_UINT_FAST8_T max_permissible = 0xFF;
19556 const PERL_UINT_FAST8_T max_permissible = 0x7F;
19558 /* If doesn't fit the criteria for ANYOFM, invert and try again.
19559 * If that works we will instead later generate an NANYOFM, and
19560 * invert back when through */
19561 if (invlist_highest(cp_list) > max_permissible) {
19562 _invlist_invert(cp_list);
19566 if (invlist_highest(cp_list) <= max_permissible) {
19567 UV this_start, this_end;
19568 UV lowest_cp = UV_MAX; /* init'ed to suppress compiler warn */
19569 U8 bits_differing = 0;
19570 Size_t full_cp_count = 0;
19571 bool first_time = TRUE;
19573 /* Go through the bytes and find the bit positions that differ
19575 invlist_iterinit(cp_list);
19576 while (invlist_iternext(cp_list, &this_start, &this_end)) {
19577 unsigned int i = this_start;
19580 if (! UVCHR_IS_INVARIANT(i)) {
19584 first_time = FALSE;
19585 lowest_cp = this_start;
19587 /* We have set up the code point to compare with.
19588 * Don't compare it with itself */
19592 /* Find the bit positions that differ from the lowest code
19593 * point in the node. Keep track of all such positions by
19595 for (; i <= this_end; i++) {
19596 if (! UVCHR_IS_INVARIANT(i)) {
19600 bits_differing |= i ^ lowest_cp;
19603 full_cp_count += this_end - this_start + 1;
19606 /* At the end of the loop, we count how many bits differ from
19607 * the bits in lowest code point, call the count 'd'. If the
19608 * set we found contains 2**d elements, it is the closure of
19609 * all code points that differ only in those bit positions. To
19610 * convince yourself of that, first note that the number in the
19611 * closure must be a power of 2, which we test for. The only
19612 * way we could have that count and it be some differing set,
19613 * is if we got some code points that don't differ from the
19614 * lowest code point in any position, but do differ from each
19615 * other in some other position. That means one code point has
19616 * a 1 in that position, and another has a 0. But that would
19617 * mean that one of them differs from the lowest code point in
19618 * that position, which possibility we've already excluded. */
19619 if ( (inverted || full_cp_count > 1)
19620 && full_cp_count == 1U << PL_bitcount[bits_differing])
19624 op = ANYOFM + inverted;;
19626 /* We need to make the bits that differ be 0's */
19627 ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
19629 /* The argument is the lowest code point */
19630 ret = reganode(pRExC_state, op, lowest_cp);
19631 FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
19635 invlist_iterfinish(cp_list);
19639 _invlist_invert(cp_list);
19646 /* XXX We could create an ANYOFR_LOW node here if we saved above if
19647 * all were invariants, it wasn't inverted, and there is a single
19648 * range. This would be faster than some of the posix nodes we
19649 * create below like /\d/a, but would be twice the size. Without
19650 * having actually measured the gain, khw doesn't think the
19651 * tradeoff is really worth it */
19654 if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) {
19655 PERL_UINT_FAST8_T type;
19656 SV * intersection = NULL;
19657 SV* d_invlist = NULL;
19659 /* See if this matches any of the POSIX classes. The POSIXA and
19660 * POSIXD ones are about the same speed as ANYOF ops, but take less
19661 * room; the ones that have above-Latin1 code point matches are
19662 * somewhat faster than ANYOF. */
19664 for (type = POSIXA; type >= POSIXD; type--) {
19667 if (type == POSIXL) { /* But not /l posix classes */
19671 for (posix_class = 0;
19672 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
19675 SV** our_code_points = &cp_list;
19676 SV** official_code_points;
19679 if (type == POSIXA) {
19680 official_code_points = &PL_Posix_ptrs[posix_class];
19683 official_code_points = &PL_XPosix_ptrs[posix_class];
19686 /* Skip non-existent classes of this type. e.g. \v only
19687 * has an entry in PL_XPosix_ptrs */
19688 if (! *official_code_points) {
19692 /* Try both the regular class, and its inversion */
19693 for (try_inverted = 0; try_inverted < 2; try_inverted++) {
19694 bool this_inverted = invert ^ try_inverted;
19696 if (type != POSIXD) {
19698 /* This class that isn't /d can't match if we have
19699 * /d dependencies */
19700 if (has_runtime_dependency
19701 & HAS_D_RUNTIME_DEPENDENCY)
19706 else /* is /d */ if (! this_inverted) {
19708 /* /d classes don't match anything non-ASCII below
19709 * 256 unconditionally (which cp_list contains) */
19710 _invlist_intersection(cp_list, PL_UpperLatin1,
19712 if (_invlist_len(intersection) != 0) {
19716 SvREFCNT_dec(d_invlist);
19717 d_invlist = invlist_clone(cp_list, NULL);
19719 /* But under UTF-8 it turns into using /u rules.
19720 * Add the things it matches under these conditions
19721 * so that we check below that these are identical
19722 * to what the tested class should match */
19723 if (upper_latin1_only_utf8_matches) {
19726 upper_latin1_only_utf8_matches,
19729 our_code_points = &d_invlist;
19731 else { /* POSIXD, inverted. If this doesn't have this
19732 flag set, it isn't /d. */
19733 if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
19737 our_code_points = &cp_list;
19740 /* Here, have weeded out some things. We want to see
19741 * if the list of characters this node contains
19742 * ('*our_code_points') precisely matches those of the
19743 * class we are currently checking against
19744 * ('*official_code_points'). */
19745 if (_invlistEQ(*our_code_points,
19746 *official_code_points,
19749 /* Here, they precisely match. Optimize this ANYOF
19750 * node into its equivalent POSIX one of the
19751 * correct type, possibly inverted */
19752 ret = reg_node(pRExC_state, (try_inverted)
19756 FLAGS(REGNODE_p(ret)) = posix_class;
19757 SvREFCNT_dec(d_invlist);
19758 SvREFCNT_dec(intersection);
19764 SvREFCNT_dec(d_invlist);
19765 SvREFCNT_dec(intersection);
19768 /* If it is a single contiguous range, ANYOFR is an efficient regnode,
19769 * both in size and speed. Currently, a 20 bit range base (smallest
19770 * code point in the range), and a 12 bit maximum delta are packed into
19771 * a 32 bit word. This allows for using it on all of the Unicode code
19772 * points except for the highest plane, which is only for private use
19773 * code points. khw doubts that a bigger delta is likely in real world
19776 && ! has_runtime_dependency
19777 && anyof_flags == 0
19778 && start[0] < (1 << ANYOFR_BASE_BITS)
19779 && end[0] - start[0]
19780 < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
19781 * CHARBITS - ANYOFR_BASE_BITS))))
19784 U8 low_utf8[UTF8_MAXBYTES+1];
19785 U8 high_utf8[UTF8_MAXBYTES+1];
19787 ret = reganode(pRExC_state, ANYOFR,
19788 (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
19790 /* Place the lowest UTF-8 start byte in the flags field, so as to
19791 * allow efficient ruling out at run time of many possible inputs.
19793 (void) uvchr_to_utf8(low_utf8, start[0]);
19794 (void) uvchr_to_utf8(high_utf8, end[0]);
19796 /* If all code points share the same first byte, this can be an
19797 * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
19798 * quickly rule out many inputs at run-time without having to
19799 * compute the code point from UTF-8. For EBCDIC, we use I8, as
19800 * not doing that transformation would not rule out nearly so many
19802 if (low_utf8[0] == high_utf8[0]) {
19803 OP(REGNODE_p(ret)) = ANYOFRb;
19804 ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
19807 ANYOF_FLAGS(REGNODE_p(ret))
19808 = NATIVE_UTF8_TO_I8(low_utf8[0]);
19814 /* If didn't find an optimization and there is no need for a bitmap,
19815 * optimize to indicate that */
19816 if ( start[0] >= NUM_ANYOF_CODE_POINTS
19818 && ! upper_latin1_only_utf8_matches
19819 && anyof_flags == 0)
19821 U8 low_utf8[UTF8_MAXBYTES+1];
19822 UV highest_cp = invlist_highest(cp_list);
19824 /* Currently the maximum allowed code point by the system is
19825 * IV_MAX. Higher ones are reserved for future internal use. This
19826 * particular regnode can be used for higher ones, but we can't
19827 * calculate the code point of those. IV_MAX suffices though, as
19828 * it will be a large first byte */
19829 Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
19832 /* We store the lowest possible first byte of the UTF-8
19833 * representation, using the flags field. This allows for quick
19834 * ruling out of some inputs without having to convert from UTF-8
19835 * to code point. For EBCDIC, we use I8, as not doing that
19836 * transformation would not rule out nearly so many things */
19837 anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
19841 /* If the first UTF-8 start byte for the highest code point in the
19842 * range is suitably small, we may be able to get an upper bound as
19844 if (highest_cp <= IV_MAX) {
19845 U8 high_utf8[UTF8_MAXBYTES+1];
19846 Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
19849 /* If the lowest and highest are the same, we can get an exact
19850 * first byte instead of a just minimum or even a sequence of
19851 * exact leading bytes. We signal these with different
19853 if (low_utf8[0] == high_utf8[0]) {
19854 Size_t len = find_first_differing_byte_pos(low_utf8,
19856 MIN(low_len, high_len));
19860 /* No need to convert to I8 for EBCDIC as this is an
19862 anyof_flags = low_utf8[0];
19867 ret = regnode_guts(pRExC_state, op,
19868 regarglen[op] + STR_SZ(len),
19870 FILL_NODE(ret, op);
19871 ((struct regnode_anyofhs *) REGNODE_p(ret))->str_len
19873 Copy(low_utf8, /* Add the common bytes */
19874 ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
19876 RExC_emit += NODE_SZ_STR(REGNODE_p(ret));
19877 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19878 NULL, only_utf8_locale_list);
19882 else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
19885 /* Here, the high byte is not the same as the low, but is
19886 * small enough that its reasonable to have a loose upper
19887 * bound, which is packed in with the strict lower bound.
19888 * See comments at the definition of MAX_ANYOF_HRx_BYTE.
19889 * On EBCDIC platforms, I8 is used. On ASCII platforms I8
19890 * is the same thing as UTF-8 */
19893 U8 max_range_diff = MAX_ANYOF_HRx_BYTE - anyof_flags;
19894 U8 range_diff = NATIVE_UTF8_TO_I8(high_utf8[0])
19897 if (range_diff <= max_range_diff / 8) {
19900 else if (range_diff <= max_range_diff / 4) {
19903 else if (range_diff <= max_range_diff / 2) {
19906 anyof_flags = (anyof_flags - 0xC0) << 2 | bits;
19911 goto done_finding_op;
19913 } /* End of seeing if can optimize it into a different node */
19915 is_anyof: /* It's going to be an ANYOF node. */
19916 op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
19926 ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
19927 FILL_NODE(ret, op); /* We set the argument later */
19928 RExC_emit += 1 + regarglen[op];
19929 ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
19931 /* Here, <cp_list> contains all the code points we can determine at
19932 * compile time that match under all conditions. Go through it, and
19933 * for things that belong in the bitmap, put them there, and delete from
19934 * <cp_list>. While we are at it, see if everything above 255 is in the
19935 * list, and if so, set a flag to speed up execution */
19937 populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
19940 ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
19944 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
19947 /* Here, the bitmap has been populated with all the Latin1 code points that
19948 * always match. Can now add to the overall list those that match only
19949 * when the target string is UTF-8 (<upper_latin1_only_utf8_matches>).
19951 if (upper_latin1_only_utf8_matches) {
19953 _invlist_union(cp_list,
19954 upper_latin1_only_utf8_matches,
19956 SvREFCNT_dec_NN(upper_latin1_only_utf8_matches);
19959 cp_list = upper_latin1_only_utf8_matches;
19961 ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
19964 set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
19965 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
19968 only_utf8_locale_list);
19969 SvREFCNT_dec(cp_list);;
19970 SvREFCNT_dec(only_utf8_locale_list);
19975 /* Here, the node is getting optimized into something that's not an ANYOF
19976 * one. Finish up. */
19978 Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
19979 RExC_parse - orig_parse);;
19980 SvREFCNT_dec(cp_list);;
19981 SvREFCNT_dec(only_utf8_locale_list);
19985 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
19988 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
19989 regnode* const node,
19991 SV* const runtime_defns,
19992 SV* const only_utf8_locale_list)
19994 /* Sets the arg field of an ANYOF-type node 'node', using information about
19995 * the node passed-in. If there is nothing outside the node's bitmap, the
19996 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
19997 * the count returned by add_data(), having allocated and stored an array,
20000 * av[0] stores the inversion list defining this class as far as known at
20001 * this time, or PL_sv_undef if nothing definite is now known.
20002 * av[1] stores the inversion list of code points that match only if the
20003 * current locale is UTF-8, or if none, PL_sv_undef if there is an
20004 * av[2], or no entry otherwise.
20005 * av[2] stores the list of user-defined properties whose subroutine
20006 * definitions aren't known at this time, or no entry if none. */
20010 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
20012 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
20013 assert(! (ANYOF_FLAGS(node)
20014 & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
20015 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
20018 AV * const av = newAV();
20022 av_store(av, INVLIST_INDEX, SvREFCNT_inc_NN(cp_list));
20025 /* (Note that if any of this changes, the size calculations in
20026 * S_optimize_regclass() might need to be updated.) */
20028 if (only_utf8_locale_list) {
20029 av_store(av, ONLY_LOCALE_MATCHES_INDEX,
20030 SvREFCNT_inc_NN(only_utf8_locale_list));
20033 if (runtime_defns) {
20034 av_store(av, DEFERRED_USER_DEFINED_INDEX,
20035 SvREFCNT_inc_NN(runtime_defns));
20038 rv = newRV_noinc(MUTABLE_SV(av));
20039 n = add_data(pRExC_state, STR_WITH_LEN("s"));
20040 RExC_rxi->data->data[n] = (void*)rv;
20047 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20048 Perl_get_regclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20050 Perl_get_re_gclass_nonbitmap_data(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, SV** output_invlist)
20054 /* For internal core use only.
20055 * Returns the inversion list for the input 'node' in the regex 'prog'.
20056 * If <doinit> is 'true', will attempt to create the inversion list if not
20058 * If <listsvp> is non-null, will return the printable contents of the
20059 * property definition. This can be used to get debugging information
20060 * even before the inversion list exists, by calling this function with
20061 * 'doinit' set to false, in which case the components that will be used
20062 * to eventually create the inversion list are returned (in a printable
20064 * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to
20065 * store an inversion list of code points that should match only if the
20066 * execution-time locale is a UTF-8 one.
20067 * If <output_invlist> is not NULL, it is where this routine is to store an
20068 * inversion list of the code points that would be instead returned in
20069 * <listsvp> if this were NULL. Thus, what gets output in <listsvp>
20070 * when this parameter is used, is just the non-code point data that
20071 * will go into creating the inversion list. This currently should be just
20072 * user-defined properties whose definitions were not known at compile
20073 * time. Using this parameter allows for easier manipulation of the
20074 * inversion list's data by the caller. It is illegal to call this
20075 * function with this parameter set, but not <listsvp>
20077 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
20078 * that, in spite of this function's name, the inversion list it returns
20079 * may include the bitmap data as well */
20081 SV *si = NULL; /* Input initialization string */
20082 SV* invlist = NULL;
20084 RXi_GET_DECL(prog, progi);
20085 const struct reg_data * const data = prog ? progi->data : NULL;
20087 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
20088 PERL_ARGS_ASSERT_GET_REGCLASS_NONBITMAP_DATA;
20090 PERL_ARGS_ASSERT_GET_RE_GCLASS_NONBITMAP_DATA;
20092 assert(! output_invlist || listsvp);
20094 if (data && data->count) {
20095 const U32 n = ARG(node);
20097 if (data->what[n] == 's') {
20098 SV * const rv = MUTABLE_SV(data->data[n]);
20099 AV * const av = MUTABLE_AV(SvRV(rv));
20100 SV **const ary = AvARRAY(av);
20102 invlist = ary[INVLIST_INDEX];
20104 if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) {
20105 *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX];
20108 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
20109 si = ary[DEFERRED_USER_DEFINED_INDEX];
20112 if (doinit && (si || invlist)) {
20115 SV * msg = newSVpvs_flags("", SVs_TEMP);
20117 SV * prop_definition = handle_user_defined_property(
20118 "", 0, FALSE, /* There is no \p{}, \P{} */
20119 SvPVX_const(si)[1] - '0', /* /i or not has been
20120 stored here for just
20122 TRUE, /* run time */
20123 FALSE, /* This call must find the defn */
20124 si, /* The property definition */
20127 0 /* base level call */
20131 assert(prop_definition == NULL);
20133 Perl_croak(aTHX_ "%" UTF8f,
20134 UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg)));
20138 _invlist_union(invlist, prop_definition, &invlist);
20139 SvREFCNT_dec_NN(prop_definition);
20142 invlist = prop_definition;
20145 STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX);
20146 STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX);
20148 ary[INVLIST_INDEX] = invlist;
20149 av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX])
20150 ? ONLY_LOCALE_MATCHES_INDEX
20158 /* If requested, return a printable version of what this ANYOF node matches
20161 SV* matches_string = NULL;
20163 /* This function can be called at compile-time, before everything gets
20164 * resolved, in which case we return the currently best available
20165 * information, which is the string that will eventually be used to do
20166 * that resolving, 'si' */
20168 /* Here, we only have 'si' (and possibly some passed-in data in
20169 * 'invlist', which is handled below) If the caller only wants
20170 * 'si', use that. */
20171 if (! output_invlist) {
20172 matches_string = newSVsv(si);
20175 /* But if the caller wants an inversion list of the node, we
20176 * need to parse 'si' and place as much as possible in the
20177 * desired output inversion list, making 'matches_string' only
20178 * contain the currently unresolvable things */
20179 const char *si_string = SvPVX(si);
20180 STRLEN remaining = SvCUR(si);
20184 /* Ignore everything before and including the first new-line */
20185 si_string = (const char *) memchr(si_string, '\n', SvCUR(si));
20186 assert (si_string != NULL);
20188 remaining = SvPVX(si) + SvCUR(si) - si_string;
20190 while (remaining > 0) {
20192 /* The data consists of just strings defining user-defined
20193 * property names, but in prior incarnations, and perhaps
20194 * somehow from pluggable regex engines, it could still
20195 * hold hex code point definitions, all of which should be
20196 * legal (or it wouldn't have gotten this far). Each
20197 * component of a range would be separated by a tab, and
20198 * each range by a new-line. If these are found, instead
20199 * add them to the inversion list */
20200 I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT
20201 |PERL_SCAN_SILENT_NON_PORTABLE;
20202 STRLEN len = remaining;
20203 UV cp = grok_hex(si_string, &len, &grok_flags, NULL);
20205 /* If the hex decode routine found something, it should go
20206 * up to the next \n */
20207 if ( *(si_string + len) == '\n') {
20208 if (count) { /* 2nd code point on line */
20209 *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp);
20212 *output_invlist = add_cp_to_invlist(*output_invlist, cp);
20215 goto prepare_for_next_iteration;
20218 /* If the hex decode was instead for the lower range limit,
20219 * save it, and go parse the upper range limit */
20220 if (*(si_string + len) == '\t') {
20221 assert(count == 0);
20225 prepare_for_next_iteration:
20226 si_string += len + 1;
20227 remaining -= len + 1;
20231 /* Here, didn't find a legal hex number. Just add the text
20232 * from here up to the next \n, omitting any trailing
20236 len = strcspn(si_string,
20237 DEFERRED_COULD_BE_OFFICIAL_MARKERs "\n");
20239 if (matches_string) {
20240 sv_catpvn(matches_string, si_string, len);
20243 matches_string = newSVpvn(si_string, len);
20245 sv_catpvs(matches_string, " ");
20249 && UCHARAT(si_string)
20250 == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
20255 if (remaining && UCHARAT(si_string) == '\n') {
20259 } /* end of loop through the text */
20261 assert(matches_string);
20262 if (SvCUR(matches_string)) { /* Get rid of trailing blank */
20263 SvCUR_set(matches_string, SvCUR(matches_string) - 1);
20265 } /* end of has an 'si' */
20268 /* Add the stuff that's already known */
20271 /* Again, if the caller doesn't want the output inversion list, put
20272 * everything in 'matches-string' */
20273 if (! output_invlist) {
20274 if ( ! matches_string) {
20275 matches_string = newSVpvs("\n");
20277 sv_catsv(matches_string, invlist_contents(invlist,
20278 TRUE /* traditional style */
20281 else if (! *output_invlist) {
20282 *output_invlist = invlist_clone(invlist, NULL);
20285 _invlist_union(*output_invlist, invlist, output_invlist);
20289 *listsvp = matches_string;
20295 /* reg_skipcomment()
20297 Absorbs an /x style # comment from the input stream,
20298 returning a pointer to the first character beyond the comment, or if the
20299 comment terminates the pattern without anything following it, this returns
20300 one past the final character of the pattern (in other words, RExC_end) and
20301 sets the REG_RUN_ON_COMMENT_SEEN flag.
20303 Note it's the callers responsibility to ensure that we are
20304 actually in /x mode
20308 PERL_STATIC_INLINE char*
20309 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
20311 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
20315 while (p < RExC_end) {
20316 if (*(++p) == '\n') {
20321 /* we ran off the end of the pattern without ending the comment, so we have
20322 * to add an \n when wrapping */
20323 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
20328 S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
20330 const bool force_to_xmod
20333 /* If the text at the current parse position '*p' is a '(?#...)' comment,
20334 * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
20335 * is /x whitespace, advance '*p' so that on exit it points to the first
20336 * byte past all such white space and comments */
20338 const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
20340 PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
20342 assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
20345 if (RExC_end - (*p) >= 3
20347 && *(*p + 1) == '?'
20348 && *(*p + 2) == '#')
20350 while (*(*p) != ')') {
20351 if ((*p) == RExC_end)
20352 FAIL("Sequence (?#... not terminated");
20360 const char * save_p = *p;
20361 while ((*p) < RExC_end) {
20363 if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
20366 else if (*(*p) == '#') {
20367 (*p) = reg_skipcomment(pRExC_state, (*p));
20373 if (*p != save_p) {
20386 Advances the parse position by one byte, unless that byte is the beginning
20387 of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
20388 those two cases, the parse position is advanced beyond all such comments and
20391 This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
20395 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
20397 PERL_ARGS_ASSERT_NEXTCHAR;
20399 if (RExC_parse < RExC_end) {
20401 || UTF8_IS_INVARIANT(*RExC_parse)
20402 || UTF8_IS_START(*RExC_parse));
20404 RExC_parse += (UTF)
20405 ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
20408 skip_to_be_ignored_text(pRExC_state, &RExC_parse,
20409 FALSE /* Don't force /x */ );
20414 S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
20416 /* 'size' is the delta number of smallest regnode equivalents to add or
20417 * subtract from the current memory allocated to the regex engine being
20420 PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
20425 sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
20426 /* +1 for REG_MAGIC */
20429 if ( RExC_rxi == NULL )
20430 FAIL("Regexp out of space");
20431 RXi_SET(RExC_rx, RExC_rxi);
20433 RExC_emit_start = RExC_rxi->program;
20435 Zero(REGNODE_p(RExC_emit), size, regnode);
20438 #ifdef RE_TRACK_PATTERN_OFFSETS
20439 Renew(RExC_offsets, 2*RExC_size+1, U32);
20441 Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
20443 RExC_offsets[0] = RExC_size;
20447 STATIC regnode_offset
20448 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
20450 /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
20451 * equivalents space. It aligns and increments RExC_size
20453 * It returns the regnode's offset into the regex engine program */
20455 const regnode_offset ret = RExC_emit;
20457 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20459 PERL_ARGS_ASSERT_REGNODE_GUTS;
20461 SIZE_ALIGN(RExC_size);
20462 change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
20463 NODE_ALIGN_FILL(REGNODE_p(ret));
20464 #ifndef RE_TRACK_PATTERN_OFFSETS
20465 PERL_UNUSED_ARG(name);
20466 PERL_UNUSED_ARG(op);
20468 assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
20470 if (RExC_offsets) { /* MJD */
20472 ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
20475 (UV)(RExC_emit) > RExC_offsets[0]
20476 ? "Overwriting end of array!\n" : "OK",
20478 (UV)(RExC_parse - RExC_start),
20479 (UV)RExC_offsets[0]));
20480 Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END));
20487 - reg_node - emit a node
20489 STATIC regnode_offset /* Location. */
20490 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
20492 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
20493 regnode_offset ptr = ret;
20495 PERL_ARGS_ASSERT_REG_NODE;
20497 assert(regarglen[op] == 0);
20499 FILL_ADVANCE_NODE(ptr, op);
20505 - reganode - emit a node with an argument
20507 STATIC regnode_offset /* Location. */
20508 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20510 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
20511 regnode_offset ptr = ret;
20513 PERL_ARGS_ASSERT_REGANODE;
20515 /* ANYOF are special cased to allow non-length 1 args */
20516 assert(regarglen[op] == 1);
20518 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
20524 - regpnode - emit a temporary node with a SV* argument
20526 STATIC regnode_offset /* Location. */
20527 S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg)
20529 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode");
20530 regnode_offset ptr = ret;
20532 PERL_ARGS_ASSERT_REGPNODE;
20534 FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20539 STATIC regnode_offset
20540 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20542 /* emit a node with U32 and I32 arguments */
20544 const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
20545 regnode_offset ptr = ret;
20547 PERL_ARGS_ASSERT_REG2LANODE;
20549 assert(regarglen[op] == 2);
20551 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
20557 - reginsert - insert an operator in front of already-emitted operand
20559 * That means that on exit 'operand' is the offset of the newly inserted
20560 * operator, and the original operand has been relocated.
20562 * IMPORTANT NOTE - it is the *callers* responsibility to correctly
20563 * set up NEXT_OFF() of the inserted node if needed. Something like this:
20565 * reginsert(pRExC, OPFAIL, orig_emit, depth+1);
20566 * NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
20568 * ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
20571 S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
20572 const regnode_offset operand, const U32 depth)
20577 const int offset = regarglen[(U8)op];
20578 const int size = NODE_STEP_REGNODE + offset;
20579 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20581 PERL_ARGS_ASSERT_REGINSERT;
20582 PERL_UNUSED_CONTEXT;
20583 PERL_UNUSED_ARG(depth);
20584 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
20585 DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
20586 assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
20587 studying. If this is wrong then we need to adjust RExC_recurse
20588 below like we do with RExC_open_parens/RExC_close_parens. */
20589 change_engine_size(pRExC_state, (Ptrdiff_t) size);
20590 src = REGNODE_p(RExC_emit);
20592 dst = REGNODE_p(RExC_emit);
20594 /* If we are in a "count the parentheses" pass, the numbers are unreliable,
20595 * and [perl #133871] shows this can lead to problems, so skip this
20596 * realignment of parens until a later pass when they are reliable */
20597 if (! IN_PARENS_PASS && RExC_open_parens) {
20599 /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
20600 /* remember that RExC_npar is rex->nparens + 1,
20601 * iow it is 1 more than the number of parens seen in
20602 * the pattern so far. */
20603 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
20604 /* note, RExC_open_parens[0] is the start of the
20605 * regex, it can't move. RExC_close_parens[0] is the end
20606 * of the regex, it *can* move. */
20607 if ( paren && RExC_open_parens[paren] >= operand ) {
20608 /*DEBUG_PARSE_FMT("open"," - %d", size);*/
20609 RExC_open_parens[paren] += size;
20611 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
20613 if ( RExC_close_parens[paren] >= operand ) {
20614 /*DEBUG_PARSE_FMT("close"," - %d", size);*/
20615 RExC_close_parens[paren] += size;
20617 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
20622 RExC_end_op += size;
20624 while (src > REGNODE_p(operand)) {
20625 StructCopy(--src, --dst, regnode);
20626 #ifdef RE_TRACK_PATTERN_OFFSETS
20627 if (RExC_offsets) { /* MJD 20010112 */
20629 ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n",
20633 (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0]
20634 ? "Overwriting end of array!\n" : "OK",
20635 (UV)REGNODE_OFFSET(src),
20636 (UV)REGNODE_OFFSET(dst),
20637 (UV)RExC_offsets[0]));
20638 Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src));
20639 Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src));
20644 place = REGNODE_p(operand); /* Op node, where operand used to be. */
20645 #ifdef RE_TRACK_PATTERN_OFFSETS
20646 if (RExC_offsets) { /* MJD */
20648 ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n",
20652 (UV)REGNODE_OFFSET(place) > RExC_offsets[0]
20653 ? "Overwriting end of array!\n" : "OK",
20654 (UV)REGNODE_OFFSET(place),
20655 (UV)(RExC_parse - RExC_start),
20656 (UV)RExC_offsets[0]));
20657 Set_Node_Offset(place, RExC_parse);
20658 Set_Node_Length(place, 1);
20661 src = NEXTOPER(place);
20663 FILL_NODE(operand, op);
20665 /* Zero out any arguments in the new node */
20666 Zero(src, offset, regnode);
20670 - regtail - set the next-pointer at the end of a node chain of p to val. If
20671 that value won't fit in the space available, instead returns FALSE.
20672 (Except asserts if we can't fit in the largest space the regex
20673 engine is designed for.)
20674 - SEE ALSO: regtail_study
20677 S_regtail(pTHX_ RExC_state_t * pRExC_state,
20678 const regnode_offset p,
20679 const regnode_offset val,
20682 regnode_offset scan;
20683 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20685 PERL_ARGS_ASSERT_REGTAIL;
20687 PERL_UNUSED_ARG(depth);
20690 /* The final node in the chain is the first one with a nonzero next pointer
20692 scan = (regnode_offset) p;
20694 regnode * const temp = regnext(REGNODE_p(scan));
20696 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
20697 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20698 Perl_re_printf( aTHX_ "~ %s (%zu) %s %s\n",
20699 SvPV_nolen_const(RExC_mysv), scan,
20700 (temp == NULL ? "->" : ""),
20701 (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
20706 scan = REGNODE_OFFSET(temp);
20709 /* Populate this node's next pointer */
20710 assert(val >= scan);
20711 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20712 assert((UV) (val - scan) <= U32_MAX);
20713 ARG_SET(REGNODE_p(scan), val - scan);
20716 if (val - scan > U16_MAX) {
20717 /* Populate this with something that won't loop and will likely
20718 * lead to a crash if the caller ignores the failure return, and
20719 * execution continues */
20720 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20723 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20731 - regtail_study - set the next-pointer at the end of a node chain of p to val.
20732 - Look for optimizable sequences at the same time.
20733 - currently only looks for EXACT chains.
20735 This is experimental code. The idea is to use this routine to perform
20736 in place optimizations on branches and groups as they are constructed,
20737 with the long term intention of removing optimization from study_chunk so
20738 that it is purely analytical.
20740 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
20741 to control which is which.
20743 This used to return a value that was ignored. It was a problem that it is
20744 #ifdef'd to be another function that didn't return a value. khw has changed it
20745 so both currently return a pass/fail return.
20748 /* TODO: All four parms should be const */
20751 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
20752 const regnode_offset val, U32 depth)
20754 regnode_offset scan;
20756 #ifdef EXPERIMENTAL_INPLACESCAN
20759 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20761 PERL_ARGS_ASSERT_REGTAIL_STUDY;
20764 /* Find last node. */
20768 regnode * const temp = regnext(REGNODE_p(scan));
20769 #ifdef EXPERIMENTAL_INPLACESCAN
20770 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20771 bool unfolded_multi_char; /* Unexamined in this routine */
20772 if (join_exact(pRExC_state, scan, &min,
20773 &unfolded_multi_char, 1, REGNODE_p(val), depth+1))
20774 return TRUE; /* Was return EXACT */
20778 if (PL_regkind[OP(REGNODE_p(scan))] == EXACT) {
20779 if (exact == PSEUDO )
20780 exact= OP(REGNODE_p(scan));
20781 else if (exact != OP(REGNODE_p(scan)) )
20784 else if (OP(REGNODE_p(scan)) != NOTHING) {
20789 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
20790 regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
20791 Perl_re_printf( aTHX_ "~ %s (%zu) -> %s\n",
20792 SvPV_nolen_const(RExC_mysv),
20794 PL_reg_name[exact]);
20798 scan = REGNODE_OFFSET(temp);
20801 DEBUG_PARSE_MSG("");
20802 regprop(RExC_rx, RExC_mysv, REGNODE_p(val), NULL, pRExC_state);
20803 Perl_re_printf( aTHX_
20804 "~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
20805 SvPV_nolen_const(RExC_mysv),
20810 if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
20811 assert((UV) (val - scan) <= U32_MAX);
20812 ARG_SET(REGNODE_p(scan), val - scan);
20815 if (val - scan > U16_MAX) {
20816 /* Populate this with something that won't loop and will likely
20817 * lead to a crash if the caller ignores the failure return, and
20818 * execution continues */
20819 NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
20822 NEXT_OFF(REGNODE_p(scan)) = val - scan;
20825 return TRUE; /* Was 'return exact' */
20830 S_get_ANYOFM_contents(pTHX_ const regnode * n) {
20832 /* Returns an inversion list of all the code points matched by the
20833 * ANYOFM/NANYOFM node 'n' */
20835 SV * cp_list = _new_invlist(-1);
20836 const U8 lowest = (U8) ARG(n);
20839 U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
20841 PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
20843 /* Starting with the lowest code point, any code point that ANDed with the
20844 * mask yields the lowest code point is in the set */
20845 for (i = lowest; i <= 0xFF; i++) {
20846 if ((i & FLAGS(n)) == ARG(n)) {
20847 cp_list = add_cp_to_invlist(cp_list, i);
20850 /* We know how many code points (a power of two) that are in the
20851 * set. No use looking once we've got that number */
20852 if (count >= needed) break;
20856 if (OP(n) == NANYOFM) {
20857 _invlist_invert(cp_list);
20863 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
20868 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
20873 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20875 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
20876 if (flags & (1<<bit)) {
20877 if (!set++ && lead)
20878 Perl_re_printf( aTHX_ "%s", lead);
20879 Perl_re_printf( aTHX_ "%s ", PL_reg_intflags_name[bit]);
20884 Perl_re_printf( aTHX_ "\n");
20886 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20891 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
20897 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
20899 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
20900 if (flags & (1<<bit)) {
20901 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
20904 if (!set++ && lead)
20905 Perl_re_printf( aTHX_ "%s", lead);
20906 Perl_re_printf( aTHX_ "%s ", PL_reg_extflags_name[bit]);
20909 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
20910 if (!set++ && lead) {
20911 Perl_re_printf( aTHX_ "%s", lead);
20914 case REGEX_UNICODE_CHARSET:
20915 Perl_re_printf( aTHX_ "UNICODE");
20917 case REGEX_LOCALE_CHARSET:
20918 Perl_re_printf( aTHX_ "LOCALE");
20920 case REGEX_ASCII_RESTRICTED_CHARSET:
20921 Perl_re_printf( aTHX_ "ASCII-RESTRICTED");
20923 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
20924 Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED");
20927 Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET");
20933 Perl_re_printf( aTHX_ "\n");
20935 Perl_re_printf( aTHX_ "%s[none-set]\n", lead);
20941 Perl_regdump(pTHX_ const regexp *r)
20945 SV * const sv = sv_newmortal();
20946 SV *dsv= sv_newmortal();
20947 RXi_GET_DECL(r, ri);
20948 DECLARE_AND_GET_RE_DEBUG_FLAGS;
20950 PERL_ARGS_ASSERT_REGDUMP;
20952 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
20954 /* Header fields of interest. */
20955 for (i = 0; i < 2; i++) {
20956 if (r->substrs->data[i].substr) {
20957 RE_PV_QUOTED_DECL(s, 0, dsv,
20958 SvPVX_const(r->substrs->data[i].substr),
20959 RE_SV_DUMPLEN(r->substrs->data[i].substr),
20960 PL_dump_re_max_len);
20961 Perl_re_printf( aTHX_
20962 "%s %s%s at %" IVdf "..%" UVuf " ",
20963 i ? "floating" : "anchored",
20965 RE_SV_TAIL(r->substrs->data[i].substr),
20966 (IV)r->substrs->data[i].min_offset,
20967 (UV)r->substrs->data[i].max_offset);
20969 else if (r->substrs->data[i].utf8_substr) {
20970 RE_PV_QUOTED_DECL(s, 1, dsv,
20971 SvPVX_const(r->substrs->data[i].utf8_substr),
20972 RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
20974 Perl_re_printf( aTHX_
20975 "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
20976 i ? "floating" : "anchored",
20978 RE_SV_TAIL(r->substrs->data[i].utf8_substr),
20979 (IV)r->substrs->data[i].min_offset,
20980 (UV)r->substrs->data[i].max_offset);
20984 if (r->check_substr || r->check_utf8)
20985 Perl_re_printf( aTHX_
20987 ( r->check_substr == r->substrs->data[1].substr
20988 && r->check_utf8 == r->substrs->data[1].utf8_substr
20989 ? "(checking floating" : "(checking anchored"));
20990 if (r->intflags & PREGf_NOSCAN)
20991 Perl_re_printf( aTHX_ " noscan");
20992 if (r->extflags & RXf_CHECK_ALL)
20993 Perl_re_printf( aTHX_ " isall");
20994 if (r->check_substr || r->check_utf8)
20995 Perl_re_printf( aTHX_ ") ");
20997 if (ri->regstclass) {
20998 regprop(r, sv, ri->regstclass, NULL, NULL);
20999 Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv));
21001 if (r->intflags & PREGf_ANCH) {
21002 Perl_re_printf( aTHX_ "anchored");
21003 if (r->intflags & PREGf_ANCH_MBOL)
21004 Perl_re_printf( aTHX_ "(MBOL)");
21005 if (r->intflags & PREGf_ANCH_SBOL)
21006 Perl_re_printf( aTHX_ "(SBOL)");
21007 if (r->intflags & PREGf_ANCH_GPOS)
21008 Perl_re_printf( aTHX_ "(GPOS)");
21009 Perl_re_printf( aTHX_ " ");
21011 if (r->intflags & PREGf_GPOS_SEEN)
21012 Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs);
21013 if (r->intflags & PREGf_SKIP)
21014 Perl_re_printf( aTHX_ "plus ");
21015 if (r->intflags & PREGf_IMPLICIT)
21016 Perl_re_printf( aTHX_ "implicit ");
21017 Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen);
21018 if (r->extflags & RXf_EVAL_SEEN)
21019 Perl_re_printf( aTHX_ "with eval ");
21020 Perl_re_printf( aTHX_ "\n");
21022 regdump_extflags("r->extflags: ", r->extflags);
21023 regdump_intflags("r->intflags: ", r->intflags);
21026 PERL_ARGS_ASSERT_REGDUMP;
21027 PERL_UNUSED_CONTEXT;
21028 PERL_UNUSED_ARG(r);
21029 #endif /* DEBUGGING */
21032 /* Should be synchronized with ANYOF_ #defines in regcomp.h */
21035 # if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \
21036 || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \
21037 || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \
21038 || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \
21039 || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \
21040 || _CC_VERTSPACE != 15
21041 # error Need to adjust order of anyofs[]
21043 static const char * const anyofs[] = {
21080 - regprop - printable representation of opcode, with run time support
21084 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
21088 RXi_GET_DECL(prog, progi);
21089 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21091 PERL_ARGS_ASSERT_REGPROP;
21095 if (OP(o) > REGNODE_MAX) { /* regnode.type is unsigned */
21096 if (pRExC_state) { /* This gives more info, if we have it */
21097 FAIL3("panic: corrupted regexp opcode %d > %d",
21098 (int)OP(o), (int)REGNODE_MAX);
21101 Perl_croak(aTHX_ "panic: corrupted regexp opcode %d > %d",
21102 (int)OP(o), (int)REGNODE_MAX);
21105 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
21107 k = PL_regkind[OP(o)];
21110 sv_catpvs(sv, " ");
21111 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
21112 * is a crude hack but it may be the best for now since
21113 * we have no flag "this EXACTish node was UTF-8"
21115 pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
21116 PL_colors[0], PL_colors[1],
21117 PERL_PV_ESCAPE_UNI_DETECT |
21118 PERL_PV_ESCAPE_NONASCII |
21119 PERL_PV_PRETTY_ELLIPSES |
21120 PERL_PV_PRETTY_LTGT |
21121 PERL_PV_PRETTY_NOCLEAR
21123 } else if (k == TRIE) {
21124 /* print the details of the trie in dumpuntil instead, as
21125 * progi->data isn't available here */
21126 const char op = OP(o);
21127 const U32 n = ARG(o);
21128 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
21129 (reg_ac_data *)progi->data->data[n] :
21131 const reg_trie_data * const trie
21132 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
21134 Perl_sv_catpvf(aTHX_ sv, "-%s", PL_reg_name[o->flags]);
21135 DEBUG_TRIE_COMPILE_r({
21137 sv_catpvs(sv, "(JUMP)");
21138 Perl_sv_catpvf(aTHX_ sv,
21139 "<S:%" UVuf "/%" IVdf " W:%" UVuf " L:%" UVuf "/%" UVuf " C:%" UVuf "/%" UVuf ">",
21140 (UV)trie->startstate,
21141 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
21142 (UV)trie->wordcount,
21145 (UV)TRIE_CHARCOUNT(trie),
21146 (UV)trie->uniquecharcount
21149 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
21150 sv_catpvs(sv, "[");
21151 (void) put_charclass_bitmap_innards(sv,
21152 ((IS_ANYOF_TRIE(op))
21154 : TRIE_BITMAP(trie)),
21161 sv_catpvs(sv, "]");
21163 } else if (k == CURLY) {
21164 U32 lo = ARG1(o), hi = ARG2(o);
21165 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
21166 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
21167 Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
21168 if (hi == REG_INFTY)
21169 sv_catpvs(sv, "INFTY");
21171 Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
21172 sv_catpvs(sv, "}");
21174 else if (k == WHILEM && o->flags) /* Ordinal/of */
21175 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
21176 else if (k == REF || k == OPEN || k == CLOSE
21177 || k == GROUPP || OP(o)==ACCEPT)
21179 AV *name_list= NULL;
21180 U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
21181 Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */
21182 if ( RXp_PAREN_NAMES(prog) ) {
21183 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21184 } else if ( pRExC_state ) {
21185 name_list= RExC_paren_name_list;
21188 if ( k != REF || (OP(o) < REFN)) {
21189 SV **name= av_fetch(name_list, parno, 0 );
21191 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21194 SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
21195 I32 *nums=(I32*)SvPVX(sv_dat);
21196 SV **name= av_fetch(name_list, nums[0], 0 );
21199 for ( n=0; n<SvIVX(sv_dat); n++ ) {
21200 Perl_sv_catpvf(aTHX_ sv, "%s%" IVdf,
21201 (n ? "," : ""), (IV)nums[n]);
21203 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21207 if ( k == REF && reginfo) {
21208 U32 n = ARG(o); /* which paren pair */
21209 I32 ln = prog->offs[n].start;
21210 if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
21211 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
21212 else if (ln == prog->offs[n].end)
21213 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
21215 const char *s = reginfo->strbeg + ln;
21216 Perl_sv_catpvf(aTHX_ sv, ": ");
21217 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
21218 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
21221 } else if (k == GOSUB) {
21222 AV *name_list= NULL;
21223 if ( RXp_PAREN_NAMES(prog) ) {
21224 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
21225 } else if ( pRExC_state ) {
21226 name_list= RExC_paren_name_list;
21229 /* Paren and offset */
21230 Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o),
21231 (int)((o + (int)ARG2L(o)) - progi->program) );
21233 SV **name= av_fetch(name_list, ARG(o), 0 );
21235 Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name));
21238 else if (k == LOGICAL)
21239 /* 2: embedded, otherwise 1 */
21240 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
21241 else if (k == ANYOF || k == ANYOFR) {
21245 bool do_sep = FALSE; /* Do we need to separate various components of
21247 /* Set if there is still an unresolved user-defined property */
21248 SV *unresolved = NULL;
21250 /* Things that are ignored except when the runtime locale is UTF-8 */
21251 SV *only_utf8_locale_invlist = NULL;
21253 /* Code points that don't fit in the bitmap */
21254 SV *nonbitmap_invlist = NULL;
21256 /* And things that aren't in the bitmap, but are small enough to be */
21257 SV* bitmap_range_not_in_bitmap = NULL;
21261 if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21267 flags = ANYOF_FLAGS(o);
21268 bitmap = ANYOF_BITMAP(o);
21272 if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
21273 if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
21274 sv_catpvs(sv, "{utf8-locale-reqd}");
21276 if (flags & ANYOFL_FOLD) {
21277 sv_catpvs(sv, "{i}");
21281 inverted = flags & ANYOF_INVERT;
21283 /* If there is stuff outside the bitmap, get it */
21284 if (arg != ANYOF_ONLY_HAS_BITMAP) {
21285 if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
21286 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21288 ANYOFRbase(o) + ANYOFRdelta(o));
21291 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
21292 (void) get_regclass_nonbitmap_data(prog, o, FALSE,
21294 &only_utf8_locale_invlist,
21295 &nonbitmap_invlist);
21297 (void) get_re_gclass_nonbitmap_data(prog, o, FALSE,
21299 &only_utf8_locale_invlist,
21300 &nonbitmap_invlist);
21304 /* The non-bitmap data may contain stuff that could fit in the
21305 * bitmap. This could come from a user-defined property being
21306 * finally resolved when this call was done; or much more likely
21307 * because there are matches that require UTF-8 to be valid, and so
21308 * aren't in the bitmap (or ANYOFR). This is teased apart later */
21309 _invlist_intersection(nonbitmap_invlist,
21311 &bitmap_range_not_in_bitmap);
21312 /* Leave just the things that don't fit into the bitmap */
21313 _invlist_subtract(nonbitmap_invlist,
21315 &nonbitmap_invlist);
21318 /* Obey this flag to add all above-the-bitmap code points */
21319 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
21320 nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
21321 NUM_ANYOF_CODE_POINTS,
21325 /* Ready to start outputting. First, the initial left bracket */
21326 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21328 /* ANYOFH by definition doesn't have anything that will fit inside the
21329 * bitmap; ANYOFR may or may not. */
21330 if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
21331 && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
21332 || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
21334 /* Then all the things that could fit in the bitmap */
21335 do_sep = put_charclass_bitmap_innards(sv,
21337 bitmap_range_not_in_bitmap,
21338 only_utf8_locale_invlist,
21342 /* Can't try inverting for a
21343 * better display if there
21344 * are things that haven't
21347 || inRANGE(OP(o), ANYOFR, ANYOFRb));
21348 SvREFCNT_dec(bitmap_range_not_in_bitmap);
21350 /* If there are user-defined properties which haven't been defined
21351 * yet, output them. If the result is not to be inverted, it is
21352 * clearest to output them in a separate [] from the bitmap range
21353 * stuff. If the result is to be complemented, we have to show
21354 * everything in one [], as the inversion applies to the whole
21355 * thing. Use {braces} to separate them from anything in the
21356 * bitmap and anything above the bitmap. */
21359 if (! do_sep) { /* If didn't output anything in the bitmap
21361 sv_catpvs(sv, "^");
21363 sv_catpvs(sv, "{");
21366 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
21369 sv_catsv(sv, unresolved);
21371 sv_catpvs(sv, "}");
21373 do_sep = ! inverted;
21377 /* And, finally, add the above-the-bitmap stuff */
21378 if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) {
21381 /* See if truncation size is overridden */
21382 const STRLEN dump_len = (PL_dump_re_max_len > 256)
21383 ? PL_dump_re_max_len
21386 /* This is output in a separate [] */
21388 Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
21391 /* And, for easy of understanding, it is shown in the
21392 * uncomplemented form if possible. The one exception being if
21393 * there are unresolved items, where the inversion has to be
21394 * delayed until runtime */
21395 if (inverted && ! unresolved) {
21396 _invlist_invert(nonbitmap_invlist);
21397 _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
21400 contents = invlist_contents(nonbitmap_invlist,
21401 FALSE /* output suitable for catsv */
21404 /* If the output is shorter than the permissible maximum, just do it. */
21405 if (SvCUR(contents) <= dump_len) {
21406 sv_catsv(sv, contents);
21409 const char * contents_string = SvPVX(contents);
21410 STRLEN i = dump_len;
21412 /* Otherwise, start at the permissible max and work back to the
21413 * first break possibility */
21414 while (i > 0 && contents_string[i] != ' ') {
21417 if (i == 0) { /* Fail-safe. Use the max if we couldn't
21418 find a legal break */
21422 sv_catpvn(sv, contents_string, i);
21423 sv_catpvs(sv, "...");
21426 SvREFCNT_dec_NN(contents);
21427 SvREFCNT_dec_NN(nonbitmap_invlist);
21430 /* And finally the matching, closing ']' */
21431 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21433 if (OP(o) == ANYOFHs) {
21434 Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
21436 else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
21437 U8 lowest = (OP(o) != ANYOFHr)
21439 : LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
21440 U8 highest = (OP(o) == ANYOFHr)
21441 ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
21442 : (OP(o) == ANYOFH || OP(o) == ANYOFR)
21446 if (OP(o) != ANYOFR || ! isASCII(ANYOFRbase(o) + ANYOFRdelta(o)))
21449 Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
21450 if (lowest != highest) {
21451 Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
21453 Perl_sv_catpvf(aTHX_ sv, ")");
21457 SvREFCNT_dec(unresolved);
21459 else if (k == ANYOFM) {
21460 SV * cp_list = get_ANYOFM_contents(o);
21462 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
21463 if (OP(o) == NANYOFM) {
21464 _invlist_invert(cp_list);
21467 put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
21468 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
21470 SvREFCNT_dec(cp_list);
21472 else if (k == POSIXD || k == NPOSIXD) {
21473 U8 index = FLAGS(o) * 2;
21474 if (index < C_ARRAY_LENGTH(anyofs)) {
21475 if (*anyofs[index] != '[') {
21476 sv_catpvs(sv, "[");
21478 sv_catpv(sv, anyofs[index]);
21479 if (*anyofs[index] != '[') {
21480 sv_catpvs(sv, "]");
21484 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
21487 else if (k == BOUND || k == NBOUND) {
21488 /* Must be synced with order of 'bound_type' in regcomp.h */
21489 const char * const bounds[] = {
21490 "", /* Traditional */
21496 assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
21497 sv_catpv(sv, bounds[FLAGS(o)]);
21499 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
21500 Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
21502 Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
21504 Perl_sv_catpvf(aTHX_ sv, "]");
21506 else if (OP(o) == SBOL)
21507 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
21509 /* add on the verb argument if there is one */
21510 if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
21512 Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
21513 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
21515 sv_catpvs(sv, ":NULL");
21518 PERL_UNUSED_CONTEXT;
21519 PERL_UNUSED_ARG(sv);
21520 PERL_UNUSED_ARG(o);
21521 PERL_UNUSED_ARG(prog);
21522 PERL_UNUSED_ARG(reginfo);
21523 PERL_UNUSED_ARG(pRExC_state);
21524 #endif /* DEBUGGING */
21530 Perl_re_intuit_string(pTHX_ REGEXP * const r)
21531 { /* Assume that RE_INTUIT is set */
21532 /* Returns an SV containing a string that must appear in the target for it
21533 * to match, or NULL if nothing is known that must match.
21535 * CAUTION: the SV can be freed during execution of the regex engine */
21537 struct regexp *const prog = ReANY(r);
21538 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21540 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
21541 PERL_UNUSED_CONTEXT;
21545 if (prog->maxlen > 0) {
21546 const char * const s = SvPV_nolen_const(RX_UTF8(r)
21547 ? prog->check_utf8 : prog->check_substr);
21549 if (!PL_colorset) reginitcolors();
21550 Perl_re_printf( aTHX_
21551 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
21553 RX_UTF8(r) ? "utf8 " : "",
21554 PL_colors[5], PL_colors[0],
21557 (strlen(s) > PL_dump_re_max_len ? "..." : ""));
21561 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
21562 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
21568 handles refcounting and freeing the perl core regexp structure. When
21569 it is necessary to actually free the structure the first thing it
21570 does is call the 'free' method of the regexp_engine associated to
21571 the regexp, allowing the handling of the void *pprivate; member
21572 first. (This routine is not overridable by extensions, which is why
21573 the extensions free is called first.)
21575 See regdupe and regdupe_internal if you change anything here.
21577 #ifndef PERL_IN_XSUB_RE
21579 Perl_pregfree(pTHX_ REGEXP *r)
21585 Perl_pregfree2(pTHX_ REGEXP *rx)
21587 struct regexp *const r = ReANY(rx);
21588 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21590 PERL_ARGS_ASSERT_PREGFREE2;
21595 if (r->mother_re) {
21596 ReREFCNT_dec(r->mother_re);
21598 CALLREGFREE_PVT(rx); /* free the private data */
21599 SvREFCNT_dec(RXp_PAREN_NAMES(r));
21603 for (i = 0; i < 2; i++) {
21604 SvREFCNT_dec(r->substrs->data[i].substr);
21605 SvREFCNT_dec(r->substrs->data[i].utf8_substr);
21607 Safefree(r->substrs);
21609 RX_MATCH_COPY_FREE(rx);
21610 #ifdef PERL_ANY_COW
21611 SvREFCNT_dec(r->saved_copy);
21614 SvREFCNT_dec(r->qr_anoncv);
21615 if (r->recurse_locinput)
21616 Safefree(r->recurse_locinput);
21622 Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
21623 except that dsv will be created if NULL.
21625 This function is used in two main ways. First to implement
21626 $r = qr/....; $s = $$r;
21628 Secondly, it is used as a hacky workaround to the structural issue of
21630 being stored in the regexp structure which is in turn stored in
21631 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
21632 could be PL_curpm in multiple contexts, and could require multiple
21633 result sets being associated with the pattern simultaneously, such
21634 as when doing a recursive match with (??{$qr})
21636 The solution is to make a lightweight copy of the regexp structure
21637 when a qr// is returned from the code executed by (??{$qr}) this
21638 lightweight copy doesn't actually own any of its data except for
21639 the starp/end and the actual regexp structure itself.
21645 Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
21647 struct regexp *drx;
21648 struct regexp *const srx = ReANY(ssv);
21649 const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
21651 PERL_ARGS_ASSERT_REG_TEMP_COPY;
21654 dsv = (REGEXP*) newSV_type(SVt_REGEXP);
21656 assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
21658 /* our only valid caller, sv_setsv_flags(), should have done
21659 * a SV_CHECK_THINKFIRST_COW_DROP() by now */
21660 assert(!SvOOK(dsv));
21661 assert(!SvIsCOW(dsv));
21662 assert(!SvROK(dsv));
21664 if (SvPVX_const(dsv)) {
21666 Safefree(SvPVX(dsv));
21671 SvOK_off((SV *)dsv);
21674 /* For PVLVs, the head (sv_any) points to an XPVLV, while
21675 * the LV's xpvlenu_rx will point to a regexp body, which
21676 * we allocate here */
21677 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
21678 assert(!SvPVX(dsv));
21679 ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
21680 temp->sv_any = NULL;
21681 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
21682 SvREFCNT_dec_NN(temp);
21683 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
21684 ing below will not set it. */
21685 SvCUR_set(dsv, SvCUR(ssv));
21688 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
21689 sv_force_normal(sv) is called. */
21693 SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
21694 SvPV_set(dsv, RX_WRAPPED(ssv));
21695 /* We share the same string buffer as the original regexp, on which we
21696 hold a reference count, incremented when mother_re is set below.
21697 The string pointer is copied here, being part of the regexp struct.
21699 memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
21700 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
21704 const I32 npar = srx->nparens+1;
21705 Newx(drx->offs, npar, regexp_paren_pair);
21706 Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
21708 if (srx->substrs) {
21710 Newx(drx->substrs, 1, struct reg_substr_data);
21711 StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
21713 for (i = 0; i < 2; i++) {
21714 SvREFCNT_inc_void(drx->substrs->data[i].substr);
21715 SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
21718 /* check_substr and check_utf8, if non-NULL, point to either their
21719 anchored or float namesakes, and don't hold a second reference. */
21721 RX_MATCH_COPIED_off(dsv);
21722 #ifdef PERL_ANY_COW
21723 drx->saved_copy = NULL;
21725 drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
21726 SvREFCNT_inc_void(drx->qr_anoncv);
21727 if (srx->recurse_locinput)
21728 Newx(drx->recurse_locinput, srx->nparens + 1, char *);
21735 /* regfree_internal()
21737 Free the private data in a regexp. This is overloadable by
21738 extensions. Perl takes care of the regexp structure in pregfree(),
21739 this covers the *pprivate pointer which technically perl doesn't
21740 know about, however of course we have to handle the
21741 regexp_internal structure when no extension is in use.
21743 Note this is called before freeing anything in the regexp
21748 Perl_regfree_internal(pTHX_ REGEXP * const rx)
21750 struct regexp *const r = ReANY(rx);
21751 RXi_GET_DECL(r, ri);
21752 DECLARE_AND_GET_RE_DEBUG_FLAGS;
21754 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
21764 SV *dsv= sv_newmortal();
21765 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
21766 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
21767 Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
21768 PL_colors[4], PL_colors[5], s);
21772 #ifdef RE_TRACK_PATTERN_OFFSETS
21774 Safefree(ri->u.offsets); /* 20010421 MJD */
21776 if (ri->code_blocks)
21777 S_free_codeblocks(aTHX_ ri->code_blocks);
21780 int n = ri->data->count;
21783 /* If you add a ->what type here, update the comment in regcomp.h */
21784 switch (ri->data->what[n]) {
21790 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
21793 Safefree(ri->data->data[n]);
21799 { /* Aho Corasick add-on structure for a trie node.
21800 Used in stclass optimization only */
21802 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
21803 #ifdef USE_ITHREADS
21806 refcount = --aho->refcount;
21809 PerlMemShared_free(aho->states);
21810 PerlMemShared_free(aho->fail);
21811 /* do this last!!!! */
21812 PerlMemShared_free(ri->data->data[n]);
21813 /* we should only ever get called once, so
21814 * assert as much, and also guard the free
21815 * which /might/ happen twice. At the least
21816 * it will make code anlyzers happy and it
21817 * doesn't cost much. - Yves */
21818 assert(ri->regstclass);
21819 if (ri->regstclass) {
21820 PerlMemShared_free(ri->regstclass);
21821 ri->regstclass = 0;
21828 /* trie structure. */
21830 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
21831 #ifdef USE_ITHREADS
21834 refcount = --trie->refcount;
21837 PerlMemShared_free(trie->charmap);
21838 PerlMemShared_free(trie->states);
21839 PerlMemShared_free(trie->trans);
21841 PerlMemShared_free(trie->bitmap);
21843 PerlMemShared_free(trie->jump);
21844 PerlMemShared_free(trie->wordinfo);
21845 /* do this last!!!! */
21846 PerlMemShared_free(ri->data->data[n]);
21851 Perl_croak(aTHX_ "panic: regfree data code '%c'",
21852 ri->data->what[n]);
21855 Safefree(ri->data->what);
21856 Safefree(ri->data);
21862 #define av_dup_inc(s, t) MUTABLE_AV(sv_dup_inc((const SV *)s, t))
21863 #define hv_dup_inc(s, t) MUTABLE_HV(sv_dup_inc((const SV *)s, t))
21864 #define SAVEPVN(p, n) ((p) ? savepvn(p, n) : NULL)
21867 =for apidoc_section REGEXP Functions
21868 =for apidoc re_dup_guts
21869 Duplicate a regexp.
21871 This routine is expected to clone a given regexp structure. It is only
21872 compiled under USE_ITHREADS.
21874 After all of the core data stored in struct regexp is duplicated
21875 the C<regexp_engine.dupe> method is used to copy any private data
21876 stored in the *pprivate pointer. This allows extensions to handle
21877 any duplication they need to do.
21881 See pregfree() and regfree_internal() if you change anything here.
21883 #if defined(USE_ITHREADS)
21884 #ifndef PERL_IN_XSUB_RE
21886 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
21889 const struct regexp *r = ReANY(sstr);
21890 struct regexp *ret = ReANY(dstr);
21892 PERL_ARGS_ASSERT_RE_DUP_GUTS;
21894 npar = r->nparens+1;
21895 Newx(ret->offs, npar, regexp_paren_pair);
21896 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
21898 if (ret->substrs) {
21899 /* Do it this way to avoid reading from *r after the StructCopy().
21900 That way, if any of the sv_dup_inc()s dislodge *r from the L1
21901 cache, it doesn't matter. */
21903 const bool anchored = r->check_substr
21904 ? r->check_substr == r->substrs->data[0].substr
21905 : r->check_utf8 == r->substrs->data[0].utf8_substr;
21906 Newx(ret->substrs, 1, struct reg_substr_data);
21907 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
21909 for (i = 0; i < 2; i++) {
21910 ret->substrs->data[i].substr =
21911 sv_dup_inc(ret->substrs->data[i].substr, param);
21912 ret->substrs->data[i].utf8_substr =
21913 sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
21916 /* check_substr and check_utf8, if non-NULL, point to either their
21917 anchored or float namesakes, and don't hold a second reference. */
21919 if (ret->check_substr) {
21921 assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
21923 ret->check_substr = ret->substrs->data[0].substr;
21924 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21926 assert(r->check_substr == r->substrs->data[1].substr);
21927 assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
21929 ret->check_substr = ret->substrs->data[1].substr;
21930 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21932 } else if (ret->check_utf8) {
21934 ret->check_utf8 = ret->substrs->data[0].utf8_substr;
21936 ret->check_utf8 = ret->substrs->data[1].utf8_substr;
21941 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
21942 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
21943 if (r->recurse_locinput)
21944 Newx(ret->recurse_locinput, r->nparens + 1, char *);
21947 RXi_SET(ret, CALLREGDUPE_PVT(dstr, param));
21949 if (RX_MATCH_COPIED(dstr))
21950 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
21952 ret->subbeg = NULL;
21953 #ifdef PERL_ANY_COW
21954 ret->saved_copy = NULL;
21957 /* Whether mother_re be set or no, we need to copy the string. We
21958 cannot refrain from copying it when the storage points directly to
21959 our mother regexp, because that's
21960 1: a buffer in a different thread
21961 2: something we no longer hold a reference on
21962 so we need to copy it locally. */
21963 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
21964 /* set malloced length to a non-zero value so it will be freed
21965 * (otherwise in combination with SVf_FAKE it looks like an alien
21966 * buffer). It doesn't have to be the actual malloced size, since it
21967 * should never be grown */
21968 SvLEN_set(dstr, SvCUR(sstr)+1);
21969 ret->mother_re = NULL;
21971 #endif /* PERL_IN_XSUB_RE */
21976 This is the internal complement to regdupe() which is used to copy
21977 the structure pointed to by the *pprivate pointer in the regexp.
21978 This is the core version of the extension overridable cloning hook.
21979 The regexp structure being duplicated will be copied by perl prior
21980 to this and will be provided as the regexp *r argument, however
21981 with the /old/ structures pprivate pointer value. Thus this routine
21982 may override any copying normally done by perl.
21984 It returns a pointer to the new regexp_internal structure.
21988 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
21990 struct regexp *const r = ReANY(rx);
21991 regexp_internal *reti;
21993 RXi_GET_DECL(r, ri);
21995 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
21999 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
22000 char, regexp_internal);
22001 Copy(ri->program, reti->program, len+1, regnode);
22004 if (ri->code_blocks) {
22006 Newx(reti->code_blocks, 1, struct reg_code_blocks);
22007 Newx(reti->code_blocks->cb, ri->code_blocks->count,
22008 struct reg_code_block);
22009 Copy(ri->code_blocks->cb, reti->code_blocks->cb,
22010 ri->code_blocks->count, struct reg_code_block);
22011 for (n = 0; n < ri->code_blocks->count; n++)
22012 reti->code_blocks->cb[n].src_regex = (REGEXP*)
22013 sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param);
22014 reti->code_blocks->count = ri->code_blocks->count;
22015 reti->code_blocks->refcnt = 1;
22018 reti->code_blocks = NULL;
22020 reti->regstclass = NULL;
22023 struct reg_data *d;
22024 const int count = ri->data->count;
22027 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
22028 char, struct reg_data);
22029 Newx(d->what, count, U8);
22032 for (i = 0; i < count; i++) {
22033 d->what[i] = ri->data->what[i];
22034 switch (d->what[i]) {
22035 /* see also regcomp.h and regfree_internal() */
22036 case 'a': /* actually an AV, but the dup function is identical.
22037 values seem to be "plain sv's" generally. */
22038 case 'r': /* a compiled regex (but still just another SV) */
22039 case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
22040 this use case should go away, the code could have used
22041 'a' instead - see S_set_ANYOF_arg() for array contents. */
22042 case 'S': /* actually an SV, but the dup function is identical. */
22043 case 'u': /* actually an HV, but the dup function is identical.
22044 values are "plain sv's" */
22045 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
22048 /* Synthetic Start Class - "Fake" charclass we generate to optimize
22049 * patterns which could start with several different things. Pre-TRIE
22050 * this was more important than it is now, however this still helps
22051 * in some places, for instance /x?a+/ might produce a SSC equivalent
22052 * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
22055 /* This is cheating. */
22056 Newx(d->data[i], 1, regnode_ssc);
22057 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
22058 reti->regstclass = (regnode*)d->data[i];
22061 /* AHO-CORASICK fail table */
22062 /* Trie stclasses are readonly and can thus be shared
22063 * without duplication. We free the stclass in pregfree
22064 * when the corresponding reg_ac_data struct is freed.
22066 reti->regstclass= ri->regstclass;
22069 /* TRIE transition table */
22071 ((reg_trie_data*)ri->data->data[i])->refcount++;
22074 case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
22075 case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
22076 is not from another regexp */
22077 d->data[i] = ri->data->data[i];
22080 Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'",
22081 ri->data->what[i]);
22090 reti->name_list_idx = ri->name_list_idx;
22092 #ifdef RE_TRACK_PATTERN_OFFSETS
22093 if (ri->u.offsets) {
22094 Newx(reti->u.offsets, 2*len+1, U32);
22095 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
22098 SetProgLen(reti, len);
22101 return (void*)reti;
22104 #endif /* USE_ITHREADS */
22106 #ifndef PERL_IN_XSUB_RE
22109 - regnext - dig the "next" pointer out of a node
22112 Perl_regnext(pTHX_ regnode *p)
22119 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
22120 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
22121 (int)OP(p), (int)REGNODE_MAX);
22124 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
22134 S_re_croak(pTHX_ bool utf8, const char* pat,...)
22137 STRLEN len = strlen(pat);
22140 const char *message;
22142 PERL_ARGS_ASSERT_RE_CROAK;
22146 Copy(pat, buf, len , char);
22148 buf[len + 1] = '\0';
22149 va_start(args, pat);
22150 msv = vmess(buf, &args);
22152 message = SvPV_const(msv, len);
22155 Copy(message, buf, len , char);
22156 /* len-1 to avoid \n */
22157 Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, len-1, buf));
22160 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
22162 #ifndef PERL_IN_XSUB_RE
22164 Perl_save_re_context(pTHX)
22169 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
22172 const REGEXP * const rx = PM_GETRE(PL_curpm);
22174 nparens = RX_NPARENS(rx);
22177 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
22178 * that PL_curpm will be null, but that utf8.pm and the modules it
22179 * loads will only use $1..$3.
22180 * The t/porting/re_context.t test file checks this assumption.
22185 for (i = 1; i <= nparens; i++) {
22186 char digits[TYPE_CHARS(long)];
22187 const STRLEN len = my_snprintf(digits, sizeof(digits),
22189 GV *const *const gvp
22190 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
22193 GV * const gv = *gvp;
22194 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
22204 S_put_code_point(pTHX_ SV *sv, UV c)
22206 PERL_ARGS_ASSERT_PUT_CODE_POINT;
22209 Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c);
22211 else if (isPRINT(c)) {
22212 const char string = (char) c;
22214 /* We use {phrase} as metanotation in the class, so also escape literal
22216 if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}')
22217 sv_catpvs(sv, "\\");
22218 sv_catpvn(sv, &string, 1);
22220 else if (isMNEMONIC_CNTRL(c)) {
22221 Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c));
22224 Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c);
22228 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
22231 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
22233 /* Appends to 'sv' a displayable version of the range of code points from
22234 * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls
22235 * that have them, when they occur at the beginning or end of the range.
22236 * It uses hex to output the remaining code points, unless 'allow_literals'
22237 * is true, in which case the printable ASCII ones are output as-is (though
22238 * some of these will be escaped by put_code_point()).
22240 * NOTE: This is designed only for printing ranges of code points that fit
22241 * inside an ANYOF bitmap. Higher code points are simply suppressed
22244 const unsigned int min_range_count = 3;
22246 assert(start <= end);
22248 PERL_ARGS_ASSERT_PUT_RANGE;
22250 while (start <= end) {
22252 const char * format;
22254 if ( end - start < min_range_count
22255 && (end - start <= 2 || (isPRINT_A(start) && isPRINT_A(end))))
22257 /* Output a range of 1 or 2 chars individually, or longer ranges
22258 * when printable */
22259 for (; start <= end; start++) {
22260 put_code_point(sv, start);
22265 /* If permitted by the input options, and there is a possibility that
22266 * this range contains a printable literal, look to see if there is
22268 if (allow_literals && start <= MAX_PRINT_A) {
22270 /* If the character at the beginning of the range isn't an ASCII
22271 * printable, effectively split the range into two parts:
22272 * 1) the portion before the first such printable,
22274 * and output them separately. */
22275 if (! isPRINT_A(start)) {
22276 UV temp_end = start + 1;
22278 /* There is no point looking beyond the final possible
22279 * printable, in MAX_PRINT_A */
22280 UV max = MIN(end, MAX_PRINT_A);
22282 while (temp_end <= max && ! isPRINT_A(temp_end)) {
22286 /* Here, temp_end points to one beyond the first printable if
22287 * found, or to one beyond 'max' if not. If none found, make
22288 * sure that we use the entire range */
22289 if (temp_end > MAX_PRINT_A) {
22290 temp_end = end + 1;
22293 /* Output the first part of the split range: the part that
22294 * doesn't have printables, with the parameter set to not look
22295 * for literals (otherwise we would infinitely recurse) */
22296 put_range(sv, start, temp_end - 1, FALSE);
22298 /* The 2nd part of the range (if any) starts here. */
22301 /* We do a continue, instead of dropping down, because even if
22302 * the 2nd part is non-empty, it could be so short that we want
22303 * to output it as individual characters, as tested for at the
22304 * top of this loop. */
22308 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
22309 * output a sub-range of just the digits or letters, then process
22310 * the remaining portion as usual. */
22311 if (isALPHANUMERIC_A(start)) {
22312 UV mask = (isDIGIT_A(start))
22317 UV temp_end = start + 1;
22319 /* Find the end of the sub-range that includes just the
22320 * characters in the same class as the first character in it */
22321 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
22326 /* For short ranges, don't duplicate the code above to output
22327 * them; just call recursively */
22328 if (temp_end - start < min_range_count) {
22329 put_range(sv, start, temp_end, FALSE);
22331 else { /* Output as a range */
22332 put_code_point(sv, start);
22333 sv_catpvs(sv, "-");
22334 put_code_point(sv, temp_end);
22336 start = temp_end + 1;
22340 /* We output any other printables as individual characters */
22341 if (isPUNCT_A(start) || isSPACE_A(start)) {
22342 while (start <= end && (isPUNCT_A(start)
22343 || isSPACE_A(start)))
22345 put_code_point(sv, start);
22350 } /* End of looking for literals */
22352 /* Here is not to output as a literal. Some control characters have
22353 * mnemonic names. Split off any of those at the beginning and end of
22354 * the range to print mnemonically. It isn't possible for many of
22355 * these to be in a row, so this won't overwhelm with output */
22357 && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
22359 while (isMNEMONIC_CNTRL(start) && start <= end) {
22360 put_code_point(sv, start);
22364 /* If this didn't take care of the whole range ... */
22365 if (start <= end) {
22367 /* Look backwards from the end to find the final non-mnemonic
22370 while (isMNEMONIC_CNTRL(temp_end)) {
22374 /* And separately output the interior range that doesn't start
22375 * or end with mnemonics */
22376 put_range(sv, start, temp_end, FALSE);
22378 /* Then output the mnemonic trailing controls */
22379 start = temp_end + 1;
22380 while (start <= end) {
22381 put_code_point(sv, start);
22388 /* As a final resort, output the range or subrange as hex. */
22390 if (start >= NUM_ANYOF_CODE_POINTS) {
22393 else { /* Have to split range at the bitmap boundary */
22394 this_end = (end < NUM_ANYOF_CODE_POINTS)
22396 : NUM_ANYOF_CODE_POINTS - 1;
22398 #if NUM_ANYOF_CODE_POINTS > 256
22399 format = (this_end < 256)
22400 ? "\\x%02" UVXf "-\\x%02" UVXf
22401 : "\\x{%04" UVXf "}-\\x{%04" UVXf "}";
22403 format = "\\x%02" UVXf "-\\x%02" UVXf;
22405 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
22406 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
22407 GCC_DIAG_RESTORE_STMT;
22413 S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist)
22415 /* Concatenate onto the PV in 'sv' a displayable form of the inversion list
22419 bool allow_literals = TRUE;
22421 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST;
22423 /* Generally, it is more readable if printable characters are output as
22424 * literals, but if a range (nearly) spans all of them, it's best to output
22425 * it as a single range. This code will use a single range if all but 2
22426 * ASCII printables are in it */
22427 invlist_iterinit(invlist);
22428 while (invlist_iternext(invlist, &start, &end)) {
22430 /* If the range starts beyond the final printable, it doesn't have any
22432 if (start > MAX_PRINT_A) {
22436 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
22437 * all but two, the range must start and end no later than 2 from
22439 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
22440 if (end > MAX_PRINT_A) {
22446 if (end - start >= MAX_PRINT_A - ' ' - 2) {
22447 allow_literals = FALSE;
22452 invlist_iterfinish(invlist);
22454 /* Here we have figured things out. Output each range */
22455 invlist_iterinit(invlist);
22456 while (invlist_iternext(invlist, &start, &end)) {
22457 if (start >= NUM_ANYOF_CODE_POINTS) {
22460 put_range(sv, start, end, allow_literals);
22462 invlist_iterfinish(invlist);
22468 S_put_charclass_bitmap_innards_common(pTHX_
22469 SV* invlist, /* The bitmap */
22470 SV* posixes, /* Under /l, things like [:word:], \S */
22471 SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */
22472 SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */
22473 SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */
22474 const bool invert /* Is the result to be inverted? */
22477 /* Create and return an SV containing a displayable version of the bitmap
22478 * and associated information determined by the input parameters. If the
22479 * output would have been only the inversion indicator '^', NULL is instead
22484 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON;
22487 output = newSVpvs("^");
22490 output = newSVpvs("");
22493 /* First, the code points in the bitmap that are unconditionally there */
22494 put_charclass_bitmap_innards_invlist(output, invlist);
22496 /* Traditionally, these have been placed after the main code points */
22498 sv_catsv(output, posixes);
22501 if (only_utf8 && _invlist_len(only_utf8)) {
22502 Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]);
22503 put_charclass_bitmap_innards_invlist(output, only_utf8);
22506 if (not_utf8 && _invlist_len(not_utf8)) {
22507 Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]);
22508 put_charclass_bitmap_innards_invlist(output, not_utf8);
22511 if (only_utf8_locale && _invlist_len(only_utf8_locale)) {
22512 Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]);
22513 put_charclass_bitmap_innards_invlist(output, only_utf8_locale);
22515 /* This is the only list in this routine that can legally contain code
22516 * points outside the bitmap range. The call just above to
22517 * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so
22518 * output them here. There's about a half-dozen possible, and none in
22519 * contiguous ranges longer than 2 */
22520 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22522 SV* above_bitmap = NULL;
22524 _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap);
22526 invlist_iterinit(above_bitmap);
22527 while (invlist_iternext(above_bitmap, &start, &end)) {
22530 for (i = start; i <= end; i++) {
22531 put_code_point(output, i);
22534 invlist_iterfinish(above_bitmap);
22535 SvREFCNT_dec_NN(above_bitmap);
22539 if (invert && SvCUR(output) == 1) {
22547 S_put_charclass_bitmap_innards(pTHX_ SV *sv,
22549 SV *nonbitmap_invlist,
22550 SV *only_utf8_locale_invlist,
22551 const regnode * const node,
22553 const bool force_as_is_display)
22555 /* Appends to 'sv' a displayable version of the innards of the bracketed
22556 * character class defined by the other arguments:
22557 * 'bitmap' points to the bitmap, or NULL if to ignore that.
22558 * 'nonbitmap_invlist' is an inversion list of the code points that are in
22559 * the bitmap range, but for some reason aren't in the bitmap; NULL if
22560 * none. The reasons for this could be that they require some
22561 * condition such as the target string being or not being in UTF-8
22562 * (under /d), or because they came from a user-defined property that
22563 * was not resolved at the time of the regex compilation (under /u)
22564 * 'only_utf8_locale_invlist' is an inversion list of the code points that
22565 * are valid only if the runtime locale is a UTF-8 one; NULL if none
22566 * 'node' is the regex pattern ANYOF node. It is needed only when the
22567 * above two parameters are not null, and is passed so that this
22568 * routine can tease apart the various reasons for them.
22569 * 'flags' is the flags field of 'node'
22570 * 'force_as_is_display' is TRUE if this routine should definitely NOT try
22571 * to invert things to see if that leads to a cleaner display. If
22572 * FALSE, this routine is free to use its judgment about doing this.
22574 * It returns TRUE if there was actually something output. (It may be that
22575 * the bitmap, etc is empty.)
22577 * When called for outputting the bitmap of a non-ANYOF node, just pass the
22578 * bitmap, with the succeeding parameters set to NULL, and the final one to
22582 /* In general, it tries to display the 'cleanest' representation of the
22583 * innards, choosing whether to display them inverted or not, regardless of
22584 * whether the class itself is to be inverted. However, there are some
22585 * cases where it can't try inverting, as what actually matches isn't known
22586 * until runtime, and hence the inversion isn't either. */
22588 bool inverting_allowed = ! force_as_is_display;
22591 STRLEN orig_sv_cur = SvCUR(sv);
22593 SV* invlist; /* Inversion list we accumulate of code points that
22594 are unconditionally matched */
22595 SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is
22597 SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8
22599 SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */
22600 SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale
22603 SV* as_is_display; /* The output string when we take the inputs
22605 SV* inverted_display; /* The output string when we invert the inputs */
22607 bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
22609 /* We are biased in favor of displaying things without them being inverted,
22610 * as that is generally easier to understand */
22611 const int bias = 5;
22613 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
22615 /* Start off with whatever code points are passed in. (We clone, so we
22616 * don't change the caller's list) */
22617 if (nonbitmap_invlist) {
22618 assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS);
22619 invlist = invlist_clone(nonbitmap_invlist, NULL);
22621 else { /* Worst case size is every other code point is matched */
22622 invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
22626 if (OP(node) == ANYOFD) {
22628 /* This flag indicates that the code points below 0x100 in the
22629 * nonbitmap list are precisely the ones that match only when the
22630 * target is UTF-8 (they should all be non-ASCII). */
22631 if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
22633 _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8);
22634 _invlist_subtract(invlist, only_utf8, &invlist);
22637 /* And this flag for matching all non-ASCII 0xFF and below */
22638 if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
22640 not_utf8 = invlist_clone(PL_UpperLatin1, NULL);
22643 else if (OP(node) == ANYOFL || OP(node) == ANYOFPOSIXL) {
22645 /* If either of these flags are set, what matches isn't
22646 * determinable except during execution, so don't know enough here
22648 if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) {
22649 inverting_allowed = FALSE;
22652 /* What the posix classes match also varies at runtime, so these
22653 * will be output symbolically. */
22654 if (ANYOF_POSIXL_TEST_ANY_SET(node)) {
22657 posixes = newSVpvs("");
22658 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
22659 if (ANYOF_POSIXL_TEST(node, i)) {
22660 sv_catpv(posixes, anyofs[i]);
22667 /* Accumulate the bit map into the unconditional match list */
22669 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
22670 if (BITMAP_TEST(bitmap, i)) {
22673 i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
22676 invlist = _add_range_to_invlist(invlist, start, i-1);
22681 /* Make sure that the conditional match lists don't have anything in them
22682 * that match unconditionally; otherwise the output is quite confusing.
22683 * This could happen if the code that populates these misses some
22686 _invlist_subtract(only_utf8, invlist, &only_utf8);
22689 _invlist_subtract(not_utf8, invlist, ¬_utf8);
22692 if (only_utf8_locale_invlist) {
22694 /* Since this list is passed in, we have to make a copy before
22696 only_utf8_locale = invlist_clone(only_utf8_locale_invlist, NULL);
22698 _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale);
22700 /* And, it can get really weird for us to try outputting an inverted
22701 * form of this list when it has things above the bitmap, so don't even
22703 if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) {
22704 inverting_allowed = FALSE;
22708 /* Calculate what the output would be if we take the input as-is */
22709 as_is_display = put_charclass_bitmap_innards_common(invlist,
22716 /* If have to take the output as-is, just do that */
22717 if (! inverting_allowed) {
22718 if (as_is_display) {
22719 sv_catsv(sv, as_is_display);
22720 SvREFCNT_dec_NN(as_is_display);
22723 else { /* But otherwise, create the output again on the inverted input, and
22724 use whichever version is shorter */
22726 int inverted_bias, as_is_bias;
22728 /* We will apply our bias to whichever of the results doesn't have
22738 inverted_bias = bias;
22741 /* Now invert each of the lists that contribute to the output,
22742 * excluding from the result things outside the possible range */
22744 /* For the unconditional inversion list, we have to add in all the
22745 * conditional code points, so that when inverted, they will be gone
22747 _invlist_union(only_utf8, invlist, &invlist);
22748 _invlist_union(not_utf8, invlist, &invlist);
22749 _invlist_union(only_utf8_locale, invlist, &invlist);
22750 _invlist_invert(invlist);
22751 _invlist_intersection(invlist, PL_InBitmap, &invlist);
22754 _invlist_invert(only_utf8);
22755 _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
22757 else if (not_utf8) {
22759 /* If a code point matches iff the target string is not in UTF-8,
22760 * then complementing the result has it not match iff not in UTF-8,
22761 * which is the same thing as matching iff it is UTF-8. */
22762 only_utf8 = not_utf8;
22766 if (only_utf8_locale) {
22767 _invlist_invert(only_utf8_locale);
22768 _invlist_intersection(only_utf8_locale,
22770 &only_utf8_locale);
22773 inverted_display = put_charclass_bitmap_innards_common(
22778 only_utf8_locale, invert);
22780 /* Use the shortest representation, taking into account our bias
22781 * against showing it inverted */
22782 if ( inverted_display
22783 && ( ! as_is_display
22784 || ( SvCUR(inverted_display) + inverted_bias
22785 < SvCUR(as_is_display) + as_is_bias)))
22787 sv_catsv(sv, inverted_display);
22789 else if (as_is_display) {
22790 sv_catsv(sv, as_is_display);
22793 SvREFCNT_dec(as_is_display);
22794 SvREFCNT_dec(inverted_display);
22797 SvREFCNT_dec_NN(invlist);
22798 SvREFCNT_dec(only_utf8);
22799 SvREFCNT_dec(not_utf8);
22800 SvREFCNT_dec(posixes);
22801 SvREFCNT_dec(only_utf8_locale);
22803 return SvCUR(sv) > orig_sv_cur;
22806 #define CLEAR_OPTSTART \
22807 if (optstart) STMT_START { \
22808 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \
22809 " (%" IVdf " nodes)\n", (IV)(node - optstart))); \
22813 #define DUMPUNTIL(b,e) \
22815 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
22817 STATIC const regnode *
22818 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
22819 const regnode *last, const regnode *plast,
22820 SV* sv, I32 indent, U32 depth)
22822 U8 op = PSEUDO; /* Arbitrary non-END op. */
22823 const regnode *next;
22824 const regnode *optstart= NULL;
22826 RXi_GET_DECL(r, ri);
22827 DECLARE_AND_GET_RE_DEBUG_FLAGS;
22829 PERL_ARGS_ASSERT_DUMPUNTIL;
22831 #ifdef DEBUG_DUMPUNTIL
22832 Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n", indent, node-start,
22833 last ? last-start : 0, plast ? plast-start : 0);
22836 if (plast && plast < last)
22839 while (PL_regkind[op] != END && (!last || node < last)) {
22841 /* While that wasn't END last time... */
22844 if (op == CLOSE || op == SRCLOSE || op == WHILEM)
22846 next = regnext((regnode *)node);
22849 if (OP(node) == OPTIMIZED) {
22850 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
22857 regprop(r, sv, node, NULL, NULL);
22858 Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start),
22859 (int)(2*indent + 1), "", SvPVX_const(sv));
22861 if (OP(node) != OPTIMIZED) {
22862 if (next == NULL) /* Next ptr. */
22863 Perl_re_printf( aTHX_ " (0)");
22864 else if (PL_regkind[(U8)op] == BRANCH
22865 && PL_regkind[OP(next)] != BRANCH )
22866 Perl_re_printf( aTHX_ " (FAIL)");
22868 Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start));
22869 Perl_re_printf( aTHX_ "\n");
22873 if (PL_regkind[(U8)op] == BRANCHJ) {
22876 const regnode *nnode = (OP(next) == LONGJMP
22877 ? regnext((regnode *)next)
22879 if (last && nnode > last)
22881 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
22884 else if (PL_regkind[(U8)op] == BRANCH) {
22886 DUMPUNTIL(NEXTOPER(node), next);
22888 else if ( PL_regkind[(U8)op] == TRIE ) {
22889 const regnode *this_trie = node;
22890 const char op = OP(node);
22891 const U32 n = ARG(node);
22892 const reg_ac_data * const ac = op>=AHOCORASICK ?
22893 (reg_ac_data *)ri->data->data[n] :
22895 const reg_trie_data * const trie =
22896 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
22898 AV *const trie_words
22899 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
22901 const regnode *nextbranch= NULL;
22904 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
22905 SV ** const elem_ptr = av_fetch(trie_words, word_idx, 0);
22907 Perl_re_indentf( aTHX_ "%s ",
22910 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
22911 SvCUR(*elem_ptr), PL_dump_re_max_len,
22912 PL_colors[0], PL_colors[1],
22914 ? PERL_PV_ESCAPE_UNI
22916 | PERL_PV_PRETTY_ELLIPSES
22917 | PERL_PV_PRETTY_LTGT
22922 U16 dist= trie->jump[word_idx+1];
22923 Perl_re_printf( aTHX_ "(%" UVuf ")\n",
22924 (UV)((dist ? this_trie + dist : next) - start));
22927 nextbranch= this_trie + trie->jump[0];
22928 DUMPUNTIL(this_trie + dist, nextbranch);
22930 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
22931 nextbranch= regnext((regnode *)nextbranch);
22933 Perl_re_printf( aTHX_ "\n");
22936 if (last && next > last)
22941 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
22942 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
22943 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
22945 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
22947 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
22949 else if ( op == PLUS || op == STAR) {
22950 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
22952 else if (PL_regkind[(U8)op] == EXACT || op == ANYOFHs) {
22953 /* Literal string, where present. */
22954 node += NODE_SZ_STR(node) - 1;
22955 node = NEXTOPER(node);
22958 node = NEXTOPER(node);
22959 node += regarglen[(U8)op];
22961 if (op == CURLYX || op == OPEN || op == SROPEN)
22965 #ifdef DEBUG_DUMPUNTIL
22966 Perl_re_printf( aTHX_ "--- %d\n", (int)indent);
22971 #endif /* DEBUGGING */
22973 #ifndef PERL_IN_XSUB_RE
22975 # include "uni_keywords.h"
22978 Perl_init_uniprops(pTHX)
22982 char * dump_len_string;
22984 dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22985 if ( ! dump_len_string
22986 || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22988 PL_dump_re_max_len = 60; /* A reasonable default */
22992 PL_user_def_props = newHV();
22994 # ifdef USE_ITHREADS
22996 HvSHAREKEYS_off(PL_user_def_props);
22997 PL_user_def_props_aTHX = aTHX;
23001 /* Set up the inversion list interpreter-level variables */
23003 PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23004 PL_XPosix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALNUM]);
23005 PL_XPosix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXALPHA]);
23006 PL_XPosix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXBLANK]);
23007 PL_XPosix_ptrs[_CC_CASED] = _new_invlist_C_array(uni_prop_ptrs[UNI_CASED]);
23008 PL_XPosix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXCNTRL]);
23009 PL_XPosix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXDIGIT]);
23010 PL_XPosix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXGRAPH]);
23011 PL_XPosix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXLOWER]);
23012 PL_XPosix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPRINT]);
23013 PL_XPosix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXPUNCT]);
23014 PL_XPosix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXSPACE]);
23015 PL_XPosix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXUPPER]);
23016 PL_XPosix_ptrs[_CC_VERTSPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_VERTSPACE]);
23017 PL_XPosix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXWORD]);
23018 PL_XPosix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_XPOSIXXDIGIT]);
23020 PL_Posix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]);
23021 PL_Posix_ptrs[_CC_ALPHANUMERIC] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALNUM]);
23022 PL_Posix_ptrs[_CC_ALPHA] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXALPHA]);
23023 PL_Posix_ptrs[_CC_BLANK] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXBLANK]);
23024 PL_Posix_ptrs[_CC_CASED] = PL_Posix_ptrs[_CC_ALPHA];
23025 PL_Posix_ptrs[_CC_CNTRL] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXCNTRL]);
23026 PL_Posix_ptrs[_CC_DIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXDIGIT]);
23027 PL_Posix_ptrs[_CC_GRAPH] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXGRAPH]);
23028 PL_Posix_ptrs[_CC_LOWER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXLOWER]);
23029 PL_Posix_ptrs[_CC_PRINT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPRINT]);
23030 PL_Posix_ptrs[_CC_PUNCT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXPUNCT]);
23031 PL_Posix_ptrs[_CC_SPACE] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXSPACE]);
23032 PL_Posix_ptrs[_CC_UPPER] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXUPPER]);
23033 PL_Posix_ptrs[_CC_VERTSPACE] = NULL;
23034 PL_Posix_ptrs[_CC_WORDCHAR] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXWORD]);
23035 PL_Posix_ptrs[_CC_XDIGIT] = _new_invlist_C_array(uni_prop_ptrs[UNI_POSIXXDIGIT]);
23037 PL_GCB_invlist = _new_invlist_C_array(_Perl_GCB_invlist);
23038 PL_SB_invlist = _new_invlist_C_array(_Perl_SB_invlist);
23039 PL_WB_invlist = _new_invlist_C_array(_Perl_WB_invlist);
23040 PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
23041 PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
23043 PL_InBitmap = _new_invlist_C_array(InBitmap_invlist);
23044 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
23045 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
23046 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
23048 PL_Assigned_invlist = _new_invlist_C_array(uni_prop_ptrs[UNI_ASSIGNED]);
23050 PL_utf8_perl_idstart = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDSTART]);
23051 PL_utf8_perl_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_IDCONT]);
23053 PL_utf8_charname_begin = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_BEGIN]);
23054 PL_utf8_charname_continue = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_CHARNAME_CONTINUE]);
23056 PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
23057 PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23058 UNI__PERL_FOLDS_TO_MULTI_CHAR]);
23059 PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
23060 UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
23061 PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
23062 PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
23063 PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);
23064 PL_utf8_tofold = _new_invlist_C_array(Case_Folding_invlist);
23065 PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist);
23066 PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
23067 PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
23068 PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
23069 PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
23072 /* The below are used only by deprecated functions. They could be removed */
23073 PL_utf8_xidcont = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDC]);
23074 PL_utf8_idcont = _new_invlist_C_array(uni_prop_ptrs[UNI_IDC]);
23075 PL_utf8_xidstart = _new_invlist_C_array(uni_prop_ptrs[UNI_XIDS]);
23079 /* These four functions are compiled only in regcomp.c, where they have access
23080 * to the data they return. They are a way for re_comp.c to get access to that
23081 * data without having to compile the whole data structures. */
23084 Perl_do_uniprop_match(const char * const key, const U16 key_len)
23086 PERL_ARGS_ASSERT_DO_UNIPROP_MATCH;
23088 return match_uniprop((U8 *) key, key_len);
23092 Perl_get_prop_definition(pTHX_ const int table_index)
23094 PERL_ARGS_ASSERT_GET_PROP_DEFINITION;
23096 /* Create and return the inversion list */
23097 return _new_invlist_C_array(uni_prop_ptrs[table_index]);
23100 const char * const *
23101 Perl_get_prop_values(const int table_index)
23103 PERL_ARGS_ASSERT_GET_PROP_VALUES;
23105 return UNI_prop_value_ptrs[table_index];
23109 Perl_get_deprecated_property_msg(const Size_t warning_offset)
23111 PERL_ARGS_ASSERT_GET_DEPRECATED_PROPERTY_MSG;
23113 return deprecated_property_msgs[warning_offset];
23118 This code was mainly added for backcompat to give a warning for non-portable
23119 code points in user-defined properties. But experiments showed that the
23120 warning in earlier perls were only omitted on overflow, which should be an
23121 error, so there really isnt a backcompat issue, and actually adding the
23122 warning when none was present before might cause breakage, for little gain. So
23123 khw left this code in, but not enabled. Tests were never added.
23126 Ei |const char *|get_extended_utf8_msg|const UV cp
23128 PERL_STATIC_INLINE const char *
23129 S_get_extended_utf8_msg(pTHX_ const UV cp)
23131 U8 dummy[UTF8_MAXBYTES + 1];
23135 uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED,
23138 msg = hv_fetchs(msgs, "text", 0);
23141 (void) sv_2mortal((SV *) msgs);
23143 return SvPVX(*msg);
23147 #endif /* end of ! PERL_IN_XSUB_RE */
23150 S_compile_wildcard(pTHX_ const char * subpattern, const STRLEN len,
23151 const bool ignore_case)
23153 /* Pretends that the input subpattern is qr/subpattern/aam, compiling it
23154 * possibly with /i if the 'ignore_case' parameter is true. Use /aa
23155 * because nothing outside of ASCII will match. Use /m because the input
23156 * string may be a bunch of lines strung together.
23158 * Also sets up the debugging info */
23160 U32 flags = PMf_MULTILINE|PMf_WILDCARD;
23162 SV * subpattern_sv = sv_2mortal(newSVpvn(subpattern, len));
23163 REGEXP * subpattern_re;
23164 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23166 PERL_ARGS_ASSERT_COMPILE_WILDCARD;
23171 set_regex_charset(&flags, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
23173 /* Like in op.c, we copy the compile time pm flags to the rx ones */
23174 rx_flags = flags & RXf_PMf_COMPILETIME;
23176 #ifndef PERL_IN_XSUB_RE
23177 /* Use the core engine if this file is regcomp.c. That means no
23178 * 'use re "Debug ..." is in effect, so the core engine is sufficient */
23179 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23180 &PL_core_reg_engine,
23184 if (isDEBUG_WILDCARD) {
23185 /* Use the special debugging engine if this file is re_comp.c and wants
23186 * to output the wildcard matching. This uses whatever
23187 * 'use re "Debug ..." is in effect */
23188 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23194 /* Use the special wildcard engine if this file is re_comp.c and
23195 * doesn't want to output the wildcard matching. This uses whatever
23196 * 'use re "Debug ..." is in effect for compilation, but this engine
23197 * structure has been set up so that it uses the core engine for
23198 * execution, so no execution debugging as a result of re.pm will be
23200 subpattern_re = Perl_re_op_compile(aTHX_ &subpattern_sv, 1, NULL,
23204 /* XXX The above has the effect that any user-supplied regex engine
23205 * won't be called for matching wildcards. That might be good, or bad.
23206 * It could be changed in several ways. The reason it is done the
23207 * current way is to avoid having to save and restore
23208 * ^{^RE_DEBUG_FLAGS} around the execution. save_scalar() perhaps
23209 * could be used. Another suggestion is to keep the authoritative
23210 * value of the debug flags in a thread-local variable and add set/get
23211 * magic to ${^RE_DEBUG_FLAGS} to keep the C level variable up to date.
23212 * Still another is to pass a flag, say in the engine's intflags that
23213 * would be checked each time before doing the debug output */
23217 assert(subpattern_re); /* Should have died if didn't compile successfully */
23218 return subpattern_re;
23222 S_execute_wildcard(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
23223 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
23226 DECLARE_AND_GET_RE_DEBUG_FLAGS;
23228 PERL_ARGS_ASSERT_EXECUTE_WILDCARD;
23232 /* The compilation has set things up so that if the program doesn't want to
23233 * see the wildcard matching procedure, it will get the core execution
23234 * engine, which is subject only to -Dr. So we have to turn that off
23235 * around this procedure */
23236 if (! isDEBUG_WILDCARD) {
23237 /* Note! Casts away 'volatile' */
23239 PL_debug &= ~ DEBUG_r_FLAG;
23242 result = CALLREGEXEC(prog, stringarg, strend, strbeg, minend, screamer,
23250 S_handle_user_defined_property(pTHX_
23252 /* Parses the contents of a user-defined property definition; returning the
23253 * expanded definition if possible. If so, the return is an inversion
23256 * If there are subroutines that are part of the expansion and which aren't
23257 * known at the time of the call to this function, this returns what
23258 * parse_uniprop_string() returned for the first one encountered.
23260 * If an error was found, NULL is returned, and 'msg' gets a suitable
23261 * message appended to it. (Appending allows the back trace of how we got
23262 * to the faulty definition to be displayed through nested calls of
23263 * user-defined subs.)
23265 * The caller IS responsible for freeing any returned SV.
23267 * The syntax of the contents is pretty much described in perlunicode.pod,
23268 * but we also allow comments on each line */
23270 const char * name, /* Name of property */
23271 const STRLEN name_len, /* The name's length in bytes */
23272 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23273 const bool to_fold, /* ? Is this under /i */
23274 const bool runtime, /* ? Are we in compile- or run-time */
23275 const bool deferrable, /* Is it ok for this property's full definition
23276 to be deferred until later? */
23277 SV* contents, /* The property's definition */
23278 bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
23279 getting called unless this is thought to be
23280 a user-defined property */
23281 SV * msg, /* Any error or warning msg(s) are appended to
23283 const STRLEN level) /* Recursion level of this call */
23286 const char * string = SvPV_const(contents, len);
23287 const char * const e = string + len;
23288 const bool is_contents_utf8 = cBOOL(SvUTF8(contents));
23289 const STRLEN msgs_length_on_entry = SvCUR(msg);
23291 const char * s0 = string; /* Points to first byte in the current line
23292 being parsed in 'string' */
23293 const char overflow_msg[] = "Code point too large in \"";
23294 SV* running_definition = NULL;
23296 PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY;
23298 *user_defined_ptr = TRUE;
23300 /* Look at each line */
23302 const char * s; /* Current byte */
23303 char op = '+'; /* Default operation is 'union' */
23304 IV min = 0; /* range begin code point */
23305 IV max = -1; /* and range end */
23306 SV* this_definition;
23308 /* Skip comment lines */
23310 s0 = strchr(s0, '\n');
23318 /* For backcompat, allow an empty first line */
23324 /* First character in the line may optionally be the operation */
23333 /* If the line is one or two hex digits separated by blank space, its
23334 * a range; otherwise it is either another user-defined property or an
23339 if (! isXDIGIT(*s)) {
23340 goto check_if_property;
23343 do { /* Each new hex digit will add 4 bits. */
23344 if (min > ( (IV) MAX_LEGAL_CP >> 4)) {
23345 s = strchr(s, '\n');
23349 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23350 sv_catpv(msg, overflow_msg);
23351 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23352 UTF8fARG(is_contents_utf8, s - s0, s0));
23353 sv_catpvs(msg, "\"");
23354 goto return_failure;
23357 /* Accumulate this digit into the value */
23358 min = (min << 4) + READ_XDIGIT(s);
23359 } while (isXDIGIT(*s));
23361 while (isBLANK(*s)) { s++; }
23363 /* We allow comments at the end of the line */
23365 s = strchr(s, '\n');
23371 else if (s < e && *s != '\n') {
23372 if (! isXDIGIT(*s)) {
23373 goto check_if_property;
23376 /* Look for the high point of the range */
23379 if (max > ( (IV) MAX_LEGAL_CP >> 4)) {
23380 s = strchr(s, '\n');
23384 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23385 sv_catpv(msg, overflow_msg);
23386 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23387 UTF8fARG(is_contents_utf8, s - s0, s0));
23388 sv_catpvs(msg, "\"");
23389 goto return_failure;
23392 max = (max << 4) + READ_XDIGIT(s);
23393 } while (isXDIGIT(*s));
23395 while (isBLANK(*s)) { s++; }
23398 s = strchr(s, '\n');
23403 else if (s < e && *s != '\n') {
23404 goto check_if_property;
23408 if (max == -1) { /* The line only had one entry */
23411 else if (max < min) {
23412 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23413 sv_catpvs(msg, "Illegal range in \"");
23414 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23415 UTF8fARG(is_contents_utf8, s - s0, s0));
23416 sv_catpvs(msg, "\"");
23417 goto return_failure;
23420 # if 0 /* See explanation at definition above of get_extended_utf8_msg() */
23422 if ( UNICODE_IS_PERL_EXTENDED(min)
23423 || UNICODE_IS_PERL_EXTENDED(max))
23425 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
23427 /* If both code points are non-portable, warn only on the lower
23429 sv_catpv(msg, get_extended_utf8_msg(
23430 (UNICODE_IS_PERL_EXTENDED(min))
23432 sv_catpvs(msg, " in \"");
23433 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
23434 UTF8fARG(is_contents_utf8, s - s0, s0));
23435 sv_catpvs(msg, "\"");
23440 /* Here, this line contains a legal range */
23441 this_definition = sv_2mortal(_new_invlist(2));
23442 this_definition = _add_range_to_invlist(this_definition, min, max);
23447 /* Here it isn't a legal range line. See if it is a legal property
23448 * line. First find the end of the meat of the line */
23449 s = strpbrk(s, "#\n");
23454 /* Ignore trailing blanks in keeping with the requirements of
23455 * parse_uniprop_string() */
23457 while (s > s0 && isBLANK_A(*s)) {
23462 this_definition = parse_uniprop_string(s0, s - s0,
23463 is_utf8, to_fold, runtime,
23466 user_defined_ptr, msg,
23468 ? level /* Don't increase level
23469 if input is empty */
23472 if (this_definition == NULL) {
23473 goto return_failure; /* 'msg' should have had the reason
23474 appended to it by the above call */
23477 if (! is_invlist(this_definition)) { /* Unknown at this time */
23478 return newSVsv(this_definition);
23482 s = strchr(s, '\n');
23492 _invlist_union(running_definition, this_definition,
23493 &running_definition);
23496 _invlist_subtract(running_definition, this_definition,
23497 &running_definition);
23500 _invlist_intersection(running_definition, this_definition,
23501 &running_definition);
23504 _invlist_union_complement_2nd(running_definition,
23505 this_definition, &running_definition);
23508 Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d",
23509 __FILE__, __LINE__, op);
23513 /* Position past the '\n' */
23515 } /* End of loop through the lines of 'contents' */
23517 /* Here, we processed all the lines in 'contents' without error. If we
23518 * didn't add any warnings, simply return success */
23519 if (msgs_length_on_entry == SvCUR(msg)) {
23521 /* If the expansion was empty, the answer isn't nothing: its an empty
23522 * inversion list */
23523 if (running_definition == NULL) {
23524 running_definition = _new_invlist(1);
23527 return running_definition;
23530 /* Otherwise, add some explanatory text, but we will return success */
23534 running_definition = NULL;
23538 if (name_len > 0) {
23539 sv_catpvs(msg, " in expansion of ");
23540 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
23543 return running_definition;
23546 /* As explained below, certain operations need to take place in the first
23547 * thread created. These macros switch contexts */
23548 # ifdef USE_ITHREADS
23549 # define DECLARATION_FOR_GLOBAL_CONTEXT \
23550 PerlInterpreter * save_aTHX = aTHX;
23551 # define SWITCH_TO_GLOBAL_CONTEXT \
23552 PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX))
23553 # define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX));
23554 # define CUR_CONTEXT aTHX
23555 # define ORIGINAL_CONTEXT save_aTHX
23557 # define DECLARATION_FOR_GLOBAL_CONTEXT dNOOP
23558 # define SWITCH_TO_GLOBAL_CONTEXT NOOP
23559 # define RESTORE_CONTEXT NOOP
23560 # define CUR_CONTEXT NULL
23561 # define ORIGINAL_CONTEXT NULL
23565 S_delete_recursion_entry(pTHX_ void *key)
23567 /* Deletes the entry used to detect recursion when expanding user-defined
23568 * properties. This is a function so it can be set up to be called even if
23569 * the program unexpectedly quits */
23571 SV ** current_entry;
23572 const STRLEN key_len = strlen((const char *) key);
23573 DECLARATION_FOR_GLOBAL_CONTEXT;
23575 SWITCH_TO_GLOBAL_CONTEXT;
23577 /* If the entry is one of these types, it is a permanent entry, and not the
23578 * one used to detect recursions. This function should delete only the
23579 * recursion entry */
23580 current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0);
23582 && ! is_invlist(*current_entry)
23583 && ! SvPOK(*current_entry))
23585 (void) hv_delete(PL_user_def_props, (const char *) key, key_len,
23593 S_get_fq_name(pTHX_
23594 const char * const name, /* The first non-blank in the \p{}, \P{} */
23595 const Size_t name_len, /* Its length in bytes, not including any trailing space */
23596 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23597 const bool has_colon_colon
23600 /* Returns a mortal SV containing the fully qualified version of the input
23605 fq_name = newSVpvs_flags("", SVs_TEMP);
23607 /* Use the current package if it wasn't included in our input */
23608 if (! has_colon_colon) {
23609 const HV * pkg = (IN_PERL_COMPILETIME)
23611 : CopSTASH(PL_curcop);
23612 const char* pkgname = HvNAME(pkg);
23614 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23615 UTF8fARG(is_utf8, strlen(pkgname), pkgname));
23616 sv_catpvs(fq_name, "::");
23619 Perl_sv_catpvf(aTHX_ fq_name, "%" UTF8f,
23620 UTF8fARG(is_utf8, name_len, name));
23625 S_parse_uniprop_string(pTHX_
23627 /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable
23628 * now. If so, the return is an inversion list.
23630 * If the property is user-defined, it is a subroutine, which in turn
23631 * may call other subroutines. This function will call the whole nest of
23632 * them to get the definition they return; if some aren't known at the time
23633 * of the call to this function, the fully qualified name of the highest
23634 * level sub is returned. It is an error to call this function at runtime
23635 * without every sub defined.
23637 * If an error was found, NULL is returned, and 'msg' gets a suitable
23638 * message appended to it. (Appending allows the back trace of how we got
23639 * to the faulty definition to be displayed through nested calls of
23640 * user-defined subs.)
23642 * The caller should NOT try to free any returned inversion list.
23644 * Other parameters will be set on return as described below */
23646 const char * const name, /* The first non-blank in the \p{}, \P{} */
23647 Size_t name_len, /* Its length in bytes, not including any
23649 const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
23650 const bool to_fold, /* ? Is this under /i */
23651 const bool runtime, /* TRUE if this is being called at run time */
23652 const bool deferrable, /* TRUE if it's ok for the definition to not be
23653 known at this call */
23654 AV ** strings, /* To return string property values, like named
23656 bool *user_defined_ptr, /* Upon return from this function it will be
23657 set to TRUE if any component is a
23658 user-defined property */
23659 SV * msg, /* Any error or warning msg(s) are appended to
23661 const STRLEN level) /* Recursion level of this call */
23663 char* lookup_name; /* normalized name for lookup in our tables */
23664 unsigned lookup_len; /* Its length */
23665 enum { Not_Strict = 0, /* Some properties have stricter name */
23666 Strict, /* normalization rules, which we decide */
23667 As_Is /* upon based on parsing */
23668 } stricter = Not_Strict;
23670 /* nv= or numeric_value=, or possibly one of the cjk numeric properties
23671 * (though it requires extra effort to download them from Unicode and
23672 * compile perl to know about them) */
23673 bool is_nv_type = FALSE;
23675 unsigned int i, j = 0;
23676 int equals_pos = -1; /* Where the '=' is found, or negative if none */
23677 int slash_pos = -1; /* Where the '/' is found, or negative if none */
23678 int table_index = 0; /* The entry number for this property in the table
23679 of all Unicode property names */
23680 bool starts_with_Is = FALSE; /* ? Does the name start with 'Is' */
23681 Size_t lookup_offset = 0; /* Used to ignore the first few characters of
23682 the normalized name in certain situations */
23683 Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't
23684 part of a package name */
23685 Size_t lun_non_pkg_begin = 0; /* Similarly for 'lookup_name' */
23686 bool could_be_user_defined = TRUE; /* ? Could this be a user-defined
23687 property rather than a Unicode
23689 SV * prop_definition = NULL; /* The returned definition of 'name' or NULL
23690 if an error. If it is an inversion list,
23691 it is the definition. Otherwise it is a
23692 string containing the fully qualified sub
23694 SV * fq_name = NULL; /* For user-defined properties, the fully
23696 bool invert_return = FALSE; /* ? Do we need to complement the result before
23698 bool stripped_utf8_pkg = FALSE; /* Set TRUE if the input includes an
23699 explicit utf8:: package that we strip
23701 /* The expansion of properties that could be either user-defined or
23702 * official unicode ones is deferred until runtime, including a marker for
23703 * those that might be in the latter category. This boolean indicates if
23704 * we've seen that marker. If not, what we're parsing can't be such an
23705 * official Unicode property whose expansion was deferred */
23706 bool could_be_deferred_official = FALSE;
23708 PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING;
23710 /* The input will be normalized into 'lookup_name' */
23711 Newx(lookup_name, name_len, char);
23712 SAVEFREEPV(lookup_name);
23714 /* Parse the input. */
23715 for (i = 0; i < name_len; i++) {
23716 char cur = name[i];
23718 /* Most of the characters in the input will be of this ilk, being parts
23720 if (isIDCONT_A(cur)) {
23722 /* Case differences are ignored. Our lookup routine assumes
23723 * everything is lowercase, so normalize to that */
23724 if (isUPPER_A(cur)) {
23725 lookup_name[j++] = toLOWER_A(cur);
23729 if (cur == '_') { /* Don't include these in the normalized name */
23733 lookup_name[j++] = cur;
23735 /* The first character in a user-defined name must be of this type.
23737 if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) {
23738 could_be_user_defined = FALSE;
23744 /* Here, the character is not something typically in a name, But these
23745 * two types of characters (and the '_' above) can be freely ignored in
23746 * most situations. Later it may turn out we shouldn't have ignored
23747 * them, and we have to reparse, but we don't have enough information
23748 * yet to make that decision */
23749 if (cur == '-' || isSPACE_A(cur)) {
23750 could_be_user_defined = FALSE;
23754 /* An equals sign or single colon mark the end of the first part of
23755 * the property name */
23757 || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':')))
23759 lookup_name[j++] = '='; /* Treat the colon as an '=' */
23760 equals_pos = j; /* Note where it occurred in the input */
23761 could_be_user_defined = FALSE;
23765 /* If this looks like it is a marker we inserted at compile time,
23766 * set a flag and otherwise ignore it. If it isn't in the final
23767 * position, keep it as it would have been user input. */
23768 if ( UNLIKELY(cur == DEFERRED_COULD_BE_OFFICIAL_MARKERc)
23770 && could_be_user_defined
23771 && i == name_len - 1)
23774 could_be_deferred_official = TRUE;
23778 /* Otherwise, this character is part of the name. */
23779 lookup_name[j++] = cur;
23781 /* Here it isn't a single colon, so if it is a colon, it must be a
23785 /* A double colon should be a package qualifier. We note its
23786 * position and continue. Note that one could have
23787 * pkg1::pkg2::...::foo
23788 * so that the position at the end of the loop will be just after
23789 * the final qualifier */
23792 non_pkg_begin = i + 1;
23793 lookup_name[j++] = ':';
23794 lun_non_pkg_begin = j;
23796 else { /* Only word chars (and '::') can be in a user-defined name */
23797 could_be_user_defined = FALSE;
23799 } /* End of parsing through the lhs of the property name (or all of it if
23802 # define STRLENs(s) (sizeof("" s "") - 1)
23804 /* If there is a single package name 'utf8::', it is ambiguous. It could
23805 * be for a user-defined property, or it could be a Unicode property, as
23806 * all of them are considered to be for that package. For the purposes of
23807 * parsing the rest of the property, strip it off */
23808 if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) {
23809 lookup_name += STRLENs("utf8::");
23810 j -= STRLENs("utf8::");
23811 equals_pos -= STRLENs("utf8::");
23812 stripped_utf8_pkg = TRUE;
23815 /* Here, we are either done with the whole property name, if it was simple;
23816 * or are positioned just after the '=' if it is compound. */
23818 if (equals_pos >= 0) {
23819 assert(stricter == Not_Strict); /* We shouldn't have set this yet */
23821 /* Space immediately after the '=' is ignored */
23823 for (; i < name_len; i++) {
23824 if (! isSPACE_A(name[i])) {
23829 /* Most punctuation after the equals indicates a subpattern, like
23831 if ( isPUNCT_A(name[i])
23836 /* A backslash means the real delimitter is the next character,
23837 * but it must be punctuation */
23838 && (name[i] != '\\' || (i < name_len && isPUNCT_A(name[i+1]))))
23840 bool special_property = memEQs(lookup_name, j - 1, "name")
23841 || memEQs(lookup_name, j - 1, "na");
23842 if (! special_property) {
23843 /* Find the property. The table includes the equals sign, so
23844 * we use 'j' as-is */
23845 table_index = do_uniprop_match(lookup_name, j);
23847 if (special_property || table_index) {
23848 REGEXP * subpattern_re;
23849 char open = name[i++];
23851 const char * pos_in_brackets;
23852 const char * const * prop_values;
23855 /* Backslash => delimitter is the character following. We
23856 * already checked that it is punctuation */
23857 if (open == '\\') {
23862 /* This data structure is constructed so that the matching
23863 * closing bracket is 3 past its matching opening. The second
23864 * set of closing is so that if the opening is something like
23865 * ']', the closing will be that as well. Something similar is
23866 * done in toke.c */
23867 pos_in_brackets = memCHRs("([<)]>)]>", open);
23868 close = (pos_in_brackets) ? pos_in_brackets[3] : open;
23871 || name[name_len-1] != close
23872 || (escaped && name[name_len-2] != '\\')
23873 /* Also make sure that there are enough characters.
23874 * e.g., '\\\' would show up incorrectly as legal even
23875 * though it is too short */
23876 || (SSize_t) (name_len - i - 1 - escaped) < 0)
23878 sv_catpvs(msg, "Unicode property wildcard not terminated");
23879 goto append_name_to_msg;
23882 Perl_ck_warner_d(aTHX_
23883 packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
23884 "The Unicode property wildcards feature is experimental");
23886 if (special_property) {
23887 const char * error_msg;
23888 const char * revised_name = name + i;
23889 Size_t revised_name_len = name_len - (i + 1 + escaped);
23891 /* Currently, the only 'special_property' is name, which we
23892 * lookup in _charnames.pm */
23894 if (! load_charnames(newSVpvs("placeholder"),
23895 revised_name, revised_name_len,
23898 sv_catpv(msg, error_msg);
23899 goto append_name_to_msg;
23902 /* Farm this out to a function just to make the current
23903 * function less unwieldy */
23904 if (handle_names_wildcard(revised_name, revised_name_len,
23908 return prop_definition;
23914 prop_values = get_prop_values(table_index);
23916 /* Now create and compile the wildcard subpattern. Use /i
23917 * because the property values are supposed to match with case
23919 subpattern_re = compile_wildcard(name + i,
23920 name_len - i - 1 - escaped,
23924 /* For each legal property value, see if the supplied pattern
23926 while (*prop_values) {
23927 const char * const entry = *prop_values;
23928 const Size_t len = strlen(entry);
23929 SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
23931 if (execute_wildcard(subpattern_re,
23933 (char *) entry + len,
23937 { /* Here, matched. Add to the returned list */
23938 Size_t total_len = j + len;
23939 SV * sub_invlist = NULL;
23940 char * this_string;
23942 /* We know this is a legal \p{property=value}. Call
23943 * the function to return the list of code points that
23945 Newxz(this_string, total_len + 1, char);
23946 Copy(lookup_name, this_string, j, char);
23947 my_strlcat(this_string, entry, total_len + 1);
23948 SAVEFREEPV(this_string);
23949 sub_invlist = parse_uniprop_string(this_string,
23959 _invlist_union(prop_definition, sub_invlist,
23963 prop_values++; /* Next iteration, look at next propvalue */
23964 } /* End of looking through property values; (the data
23965 structure is terminated by a NULL ptr) */
23967 SvREFCNT_dec_NN(subpattern_re);
23969 if (prop_definition) {
23970 return prop_definition;
23973 sv_catpvs(msg, "No Unicode property value wildcard matches:");
23974 goto append_name_to_msg;
23977 /* Here's how khw thinks we should proceed to handle the properties
23978 * not yet done: Bidi Mirroring Glyph can map to ""
23979 Bidi Paired Bracket can map to ""
23980 Case Folding (both full and simple)
23981 Shouldn't /i be good enough for Full
23982 Decomposition Mapping
23983 Equivalent Unified Ideograph can map to ""
23984 Lowercase Mapping (both full and simple)
23985 NFKC Case Fold can map to ""
23986 Titlecase Mapping (both full and simple)
23987 Uppercase Mapping (both full and simple)
23988 * Handle these the same way Name is done, using say, _wild.pm, but
23989 * having both loose and full, like in charclass_invlists.h.
23990 * Perhaps move block and script to that as they are somewhat large
23991 * in charclass_invlists.h.
23992 * For properties where the default is the code point itself, such
23993 * as any of the case changing mappings, the string would otherwise
23994 * consist of all Unicode code points in UTF-8 strung together.
23995 * This would be impractical. So instead, examine their compiled
23996 * pattern, looking at the ssc. If none, reject the pattern as an
23997 * error. Otherwise run the pattern against every code point in
23998 * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
23999 * And it might be good to create an API to return the ssc.
24000 * Or handle them like the algorithmic names are done
24002 } /* End of is a wildcard subppattern */
24004 /* \p{name=...} is handled specially. Instead of using the normal
24005 * mechanism involving charclass_invlists.h, it uses _charnames.pm
24006 * which has the necessary (huge) data accessible to it, and which
24007 * doesn't get loaded unless necessary. The legal syntax for names is
24008 * somewhat different than other properties due both to the vagaries of
24009 * a few outlier official names, and the fact that only a few ASCII
24010 * characters are permitted in them */
24011 if ( memEQs(lookup_name, j - 1, "name")
24012 || memEQs(lookup_name, j - 1, "na"))
24017 const char * error_msg;
24019 SV * character_name;
24020 STRLEN character_len;
24025 /* Since the RHS (after skipping initial space) is passed unchanged
24026 * to charnames, and there are different criteria for what are
24027 * legal characters in the name, just parse it here. A character
24028 * name must begin with an ASCII alphabetic */
24029 if (! isALPHA(name[i])) {
24032 lookup_name[j++] = name[i];
24034 for (++i; i < name_len; i++) {
24035 /* Official names can only be in the ASCII range, and only
24036 * certain characters */
24037 if (! isASCII(name[i]) || ! isCHARNAME_CONT(name[i])) {
24040 lookup_name[j++] = name[i];
24043 /* Finished parsing, save the name into an SV */
24044 character_name = newSVpvn(lookup_name + equals_pos, j - equals_pos);
24046 /* Make sure _charnames is loaded. (The parameters give context
24047 * for any errors generated */
24048 table = load_charnames(character_name, name, name_len, &error_msg);
24049 if (table == NULL) {
24050 sv_catpv(msg, error_msg);
24051 goto append_name_to_msg;
24054 lookup_loose = get_cvs("_charnames::_loose_regcomp_lookup", 0);
24055 if (! lookup_loose) {
24057 "panic: Can't find '_charnames::_loose_regcomp_lookup");
24060 PUSHSTACKi(PERLSI_REGCOMP);
24066 XPUSHs(character_name);
24068 call_sv(MUTABLE_SV(lookup_loose), G_SCALAR);
24073 SvREFCNT_inc_simple_void_NN(character);
24080 if (! SvOK(character)) {
24084 cp = valid_utf8_to_uvchr((U8 *) SvPVX(character), &character_len);
24085 if (character_len == SvCUR(character)) {
24086 prop_definition = add_cp_to_invlist(NULL, cp);
24091 /* First of the remaining characters in the string. */
24092 char * remaining = SvPVX(character) + character_len;
24094 if (strings == NULL) {
24095 goto failed; /* XXX Perhaps a specific msg instead, like
24096 'not available here' */
24099 if (*strings == NULL) {
24100 *strings = newAV();
24103 this_string = newAV();
24104 av_push(this_string, newSVuv(cp));
24107 cp = valid_utf8_to_uvchr((U8 *) remaining, &character_len);
24108 av_push(this_string, newSVuv(cp));
24109 remaining += character_len;
24110 } while (remaining < SvEND(character));
24112 av_push(*strings, (SV *) this_string);
24115 return prop_definition;
24118 /* Certain properties whose values are numeric need special handling.
24119 * They may optionally be prefixed by 'is'. Ignore that prefix for the
24120 * purposes of checking if this is one of those properties */
24121 if (memBEGINPs(lookup_name, j, "is")) {
24125 /* Then check if it is one of these specially-handled properties. The
24126 * possibilities are hard-coded because easier this way, and the list
24127 * is unlikely to change.
24129 * All numeric value type properties are of this ilk, and are also
24130 * special in a different way later on. So find those first. There
24131 * are several numeric value type properties in the Unihan DB (which is
24132 * unlikely to be compiled with perl, but we handle it here in case it
24133 * does get compiled). They all end with 'numeric'. The interiors
24134 * aren't checked for the precise property. This would stop working if
24135 * a cjk property were to be created that ended with 'numeric' and
24136 * wasn't a numeric type */
24137 is_nv_type = memEQs(lookup_name + lookup_offset,
24138 j - 1 - lookup_offset, "numericvalue")
24139 || memEQs(lookup_name + lookup_offset,
24140 j - 1 - lookup_offset, "nv")
24141 || ( memENDPs(lookup_name + lookup_offset,
24142 j - 1 - lookup_offset, "numeric")
24143 && ( memBEGINPs(lookup_name + lookup_offset,
24144 j - 1 - lookup_offset, "cjk")
24145 || memBEGINPs(lookup_name + lookup_offset,
24146 j - 1 - lookup_offset, "k")));
24148 || memEQs(lookup_name + lookup_offset,
24149 j - 1 - lookup_offset, "canonicalcombiningclass")
24150 || memEQs(lookup_name + lookup_offset,
24151 j - 1 - lookup_offset, "ccc")
24152 || memEQs(lookup_name + lookup_offset,
24153 j - 1 - lookup_offset, "age")
24154 || memEQs(lookup_name + lookup_offset,
24155 j - 1 - lookup_offset, "in")
24156 || memEQs(lookup_name + lookup_offset,
24157 j - 1 - lookup_offset, "presentin"))
24161 /* Since the stuff after the '=' is a number, we can't throw away
24162 * '-' willy-nilly, as those could be a minus sign. Other stricter
24163 * rules also apply. However, these properties all can have the
24164 * rhs not be a number, in which case they contain at least one
24165 * alphabetic. In those cases, the stricter rules don't apply.
24166 * But the numeric type properties can have the alphas [Ee] to
24167 * signify an exponent, and it is still a number with stricter
24168 * rules. So look for an alpha that signifies not-strict */
24170 for (k = i; k < name_len; k++) {
24171 if ( isALPHA_A(name[k])
24172 && (! is_nv_type || ! isALPHA_FOLD_EQ(name[k], 'E')))
24174 stricter = Not_Strict;
24182 /* A number may have a leading '+' or '-'. The latter is retained
24184 if (name[i] == '+') {
24187 else if (name[i] == '-') {
24188 lookup_name[j++] = '-';
24192 /* Skip leading zeros including single underscores separating the
24193 * zeros, or between the final leading zero and the first other
24195 for (; i < name_len - 1; i++) {
24196 if ( name[i] != '0'
24197 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24204 else { /* No '=' */
24206 /* Only a few properties without an '=' should be parsed with stricter
24207 * rules. The list is unlikely to change. */
24208 if ( memBEGINPs(lookup_name, j, "perl")
24209 && memNEs(lookup_name + 4, j - 4, "space")
24210 && memNEs(lookup_name + 4, j - 4, "word"))
24214 /* We set the inputs back to 0 and the code below will reparse,
24220 /* Here, we have either finished the property, or are positioned to parse
24221 * the remainder, and we know if stricter rules apply. Finish out, if not
24223 for (; i < name_len; i++) {
24224 char cur = name[i];
24226 /* In all instances, case differences are ignored, and we normalize to
24228 if (isUPPER_A(cur)) {
24229 lookup_name[j++] = toLOWER(cur);
24233 /* An underscore is skipped, but not under strict rules unless it
24234 * separates two digits */
24237 && ( i == 0 || (int) i == equals_pos || i == name_len- 1
24238 || ! isDIGIT_A(name[i-1]) || ! isDIGIT_A(name[i+1])))
24240 lookup_name[j++] = '_';
24245 /* Hyphens are skipped except under strict */
24246 if (cur == '-' && ! stricter) {
24250 /* XXX Bug in documentation. It says white space skipped adjacent to
24251 * non-word char. Maybe we should, but shouldn't skip it next to a dot
24253 if (isSPACE_A(cur) && ! stricter) {
24257 lookup_name[j++] = cur;
24259 /* Unless this is a non-trailing slash, we are done with it */
24260 if (i >= name_len - 1 || cur != '/') {
24266 /* A slash in the 'numeric value' property indicates that what follows
24267 * is a denominator. It can have a leading '+' and '0's that should be
24268 * skipped. But we have never allowed a negative denominator, so treat
24269 * a minus like every other character. (No need to rule out a second
24270 * '/', as that won't match anything anyway */
24273 if (i < name_len && name[i] == '+') {
24277 /* Skip leading zeros including underscores separating digits */
24278 for (; i < name_len - 1; i++) {
24279 if ( name[i] != '0'
24280 && (name[i] != '_' || ! isDIGIT_A(name[i+1])))
24286 /* Store the first real character in the denominator */
24287 if (i < name_len) {
24288 lookup_name[j++] = name[i];
24293 /* Here are completely done parsing the input 'name', and 'lookup_name'
24294 * contains a copy, normalized.
24296 * This special case is grandfathered in: 'L_' and 'GC=L_' are accepted and
24297 * different from without the underscores. */
24298 if ( ( UNLIKELY(memEQs(lookup_name, j, "l"))
24299 || UNLIKELY(memEQs(lookup_name, j, "gc=l")))
24300 && UNLIKELY(name[name_len-1] == '_'))
24302 lookup_name[j++] = '&';
24305 /* If the original input began with 'In' or 'Is', it could be a subroutine
24306 * call to a user-defined property instead of a Unicode property name. */
24307 if ( name_len - non_pkg_begin > 2
24308 && name[non_pkg_begin+0] == 'I'
24309 && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's'))
24311 /* Names that start with In have different characterstics than those
24312 * that start with Is */
24313 if (name[non_pkg_begin+1] == 's') {
24314 starts_with_Is = TRUE;
24318 could_be_user_defined = FALSE;
24321 if (could_be_user_defined) {
24324 /* If the user defined property returns the empty string, it could
24325 * easily be because the pattern is being compiled before the data it
24326 * actually needs to compile is available. This could be argued to be
24327 * a bug in the perl code, but this is a change of behavior for Perl,
24328 * so we handle it. This means that intentionally returning nothing
24329 * will not be resolved until runtime */
24330 bool empty_return = FALSE;
24332 /* Here, the name could be for a user defined property, which are
24333 * implemented as subs. */
24334 user_sub = get_cvn_flags(name, name_len, 0);
24337 /* Here, the property name could be a user-defined one, but there
24338 * is no subroutine to handle it (as of now). Defer handling it
24339 * until runtime. Otherwise, a block defined by Unicode in a later
24340 * release would get the synonym InFoo added for it, and existing
24341 * code that used that name would suddenly break if it referred to
24342 * the property before the sub was declared. See [perl #134146] */
24344 goto definition_deferred;
24347 /* Here, we are at runtime, and didn't find the user property. It
24348 * could be an official property, but only if no package was
24349 * specified, or just the utf8:: package. */
24350 if (could_be_deferred_official) {
24351 lookup_name += lun_non_pkg_begin;
24352 j -= lun_non_pkg_begin;
24354 else if (! stripped_utf8_pkg) {
24355 goto unknown_user_defined;
24358 /* Drop down to look up in the official properties */
24361 const char insecure[] = "Insecure user-defined property";
24363 /* Here, there is a sub by the correct name. Normally we call it
24364 * to get the property definition */
24366 SV * user_sub_sv = MUTABLE_SV(user_sub);
24367 SV * error; /* Any error returned by calling 'user_sub' */
24368 SV * key; /* The key into the hash of user defined sub names
24371 SV ** saved_user_prop_ptr; /* Hash entry for this property */
24373 /* How many times to retry when another thread is in the middle of
24374 * expanding the same definition we want */
24375 PERL_INT_FAST8_T retry_countdown = 10;
24377 DECLARATION_FOR_GLOBAL_CONTEXT;
24379 /* If we get here, we know this property is user-defined */
24380 *user_defined_ptr = TRUE;
24382 /* We refuse to call a potentially tainted subroutine; returning an
24385 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24386 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24387 goto append_name_to_msg;
24390 /* In principal, we only call each subroutine property definition
24391 * once during the life of the program. This guarantees that the
24392 * property definition never changes. The results of the single
24393 * sub call are stored in a hash, which is used instead for future
24394 * references to this property. The property definition is thus
24395 * immutable. But, to allow the user to have a /i-dependent
24396 * definition, we call the sub once for non-/i, and once for /i,
24397 * should the need arise, passing the /i status as a parameter.
24399 * We start by constructing the hash key name, consisting of the
24400 * fully qualified subroutine name, preceded by the /i status, so
24401 * that there is a key for /i and a different key for non-/i */
24402 key = newSVpvn(((to_fold) ? "1" : "0"), 1);
24403 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
24404 non_pkg_begin != 0);
24405 sv_catsv(key, fq_name);
24408 /* We only call the sub once throughout the life of the program
24409 * (with the /i, non-/i exception noted above). That means the
24410 * hash must be global and accessible to all threads. It is
24411 * created at program start-up, before any threads are created, so
24412 * is accessible to all children. But this creates some
24415 * 1) The keys can't be shared, or else problems arise; sharing is
24416 * turned off at hash creation time
24417 * 2) All SVs in it are there for the remainder of the life of the
24418 * program, and must be created in the same interpreter context
24419 * as the hash, or else they will be freed from the wrong pool
24420 * at global destruction time. This is handled by switching to
24421 * the hash's context to create each SV going into it, and then
24422 * immediately switching back
24423 * 3) All accesses to the hash must be controlled by a mutex, to
24424 * prevent two threads from getting an unstable state should
24425 * they simultaneously be accessing it. The code below is
24426 * crafted so that the mutex is locked whenever there is an
24427 * access and unlocked only when the next stable state is
24430 * The hash stores either the definition of the property if it was
24431 * valid, or, if invalid, the error message that was raised. We
24432 * use the type of SV to distinguish.
24434 * There's also the need to guard against the definition expansion
24435 * from infinitely recursing. This is handled by storing the aTHX
24436 * of the expanding thread during the expansion. Again the SV type
24437 * is used to distinguish this from the other two cases. If we
24438 * come to here and the hash entry for this property is our aTHX,
24439 * it means we have recursed, and the code assumes that we would
24440 * infinitely recurse, so instead stops and raises an error.
24441 * (Any recursion has always been treated as infinite recursion in
24444 * If instead, the entry is for a different aTHX, it means that
24445 * that thread has gotten here first, and hasn't finished expanding
24446 * the definition yet. We just have to wait until it is done. We
24447 * sleep and retry a few times, returning an error if the other
24448 * thread doesn't complete. */
24451 USER_PROP_MUTEX_LOCK;
24453 /* If we have an entry for this key, the subroutine has already
24454 * been called once with this /i status. */
24455 saved_user_prop_ptr = hv_fetch(PL_user_def_props,
24456 SvPVX(key), SvCUR(key), 0);
24457 if (saved_user_prop_ptr) {
24459 /* If the saved result is an inversion list, it is the valid
24460 * definition of this property */
24461 if (is_invlist(*saved_user_prop_ptr)) {
24462 prop_definition = *saved_user_prop_ptr;
24464 /* The SV in the hash won't be removed until global
24465 * destruction, so it is stable and we can unlock */
24466 USER_PROP_MUTEX_UNLOCK;
24468 /* The caller shouldn't try to free this SV */
24469 return prop_definition;
24472 /* Otherwise, if it is a string, it is the error message
24473 * that was returned when we first tried to evaluate this
24474 * property. Fail, and append the message */
24475 if (SvPOK(*saved_user_prop_ptr)) {
24476 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24477 sv_catsv(msg, *saved_user_prop_ptr);
24479 /* The SV in the hash won't be removed until global
24480 * destruction, so it is stable and we can unlock */
24481 USER_PROP_MUTEX_UNLOCK;
24486 assert(SvIOK(*saved_user_prop_ptr));
24488 /* Here, we have an unstable entry in the hash. Either another
24489 * thread is in the middle of expanding the property's
24490 * definition, or we are ourselves recursing. We use the aTHX
24491 * in it to distinguish */
24492 if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) {
24494 /* Here, it's another thread doing the expanding. We've
24495 * looked as much as we are going to at the contents of the
24496 * hash entry. It's safe to unlock. */
24497 USER_PROP_MUTEX_UNLOCK;
24499 /* Retry a few times */
24500 if (retry_countdown-- > 0) {
24505 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24506 sv_catpvs(msg, "Timeout waiting for another thread to "
24508 goto append_name_to_msg;
24511 /* Here, we are recursing; don't dig any deeper */
24512 USER_PROP_MUTEX_UNLOCK;
24514 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24516 "Infinite recursion in user-defined property");
24517 goto append_name_to_msg;
24520 /* Here, this thread has exclusive control, and there is no entry
24521 * for this property in the hash. So we have the go ahead to
24522 * expand the definition ourselves. */
24524 PUSHSTACKi(PERLSI_REGCOMP);
24527 /* Create a temporary placeholder in the hash to detect recursion
24529 SWITCH_TO_GLOBAL_CONTEXT;
24530 placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT));
24531 (void) hv_store_ent(PL_user_def_props, key, placeholder, 0);
24534 /* Now that we have a placeholder, we can let other threads
24536 USER_PROP_MUTEX_UNLOCK;
24538 /* Make sure the placeholder always gets destroyed */
24539 SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(key));
24544 /* Call the user's function, with the /i status as a parameter.
24545 * Note that we have gone to a lot of trouble to keep this call
24546 * from being within the locked mutex region. */
24547 XPUSHs(boolSV(to_fold));
24550 /* The following block was taken from swash_init(). Presumably
24551 * they apply to here as well, though we no longer use a swash --
24555 /* We might get here via a subroutine signature which uses a utf8
24556 * parameter name, at which point PL_subname will have been set
24557 * but not yet used. */
24558 save_item(PL_subname);
24560 /* G_SCALAR guarantees a single return value */
24561 (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR);
24566 if (TAINT_get || SvTRUE(error)) {
24567 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24568 if (SvTRUE(error)) {
24569 sv_catpvs(msg, "Error \"");
24570 sv_catsv(msg, error);
24571 sv_catpvs(msg, "\"");
24574 if (SvTRUE(error)) sv_catpvs(msg, "; ");
24575 sv_catpvn(msg, insecure, sizeof(insecure) - 1);
24578 if (name_len > 0) {
24579 sv_catpvs(msg, " in expansion of ");
24580 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8,
24586 prop_definition = NULL;
24589 SV * contents = POPs;
24591 /* The contents is supposed to be the expansion of the property
24592 * definition. If the definition is deferrable, and we got an
24593 * empty string back, set a flag to later defer it (after clean
24596 && (! SvPOK(contents) || SvCUR(contents) == 0))
24598 empty_return = TRUE;
24600 else { /* Otherwise, call a function to check for valid syntax,
24603 prop_definition = handle_user_defined_property(
24605 is_utf8, to_fold, runtime,
24607 contents, user_defined_ptr,
24613 /* Here, we have the results of the expansion. Delete the
24614 * placeholder, and if the definition is now known, replace it with
24615 * that definition. We need exclusive access to the hash, and we
24616 * can't let anyone else in, between when we delete the placeholder
24617 * and add the permanent entry */
24618 USER_PROP_MUTEX_LOCK;
24620 S_delete_recursion_entry(aTHX_ SvPVX(key));
24622 if ( ! empty_return
24623 && (! prop_definition || is_invlist(prop_definition)))
24625 /* If we got success we use the inversion list defining the
24626 * property; otherwise use the error message */
24627 SWITCH_TO_GLOBAL_CONTEXT;
24628 (void) hv_store_ent(PL_user_def_props,
24631 ? newSVsv(prop_definition)
24637 /* All done, and the hash now has a permanent entry for this
24638 * property. Give up exclusive control */
24639 USER_PROP_MUTEX_UNLOCK;
24645 if (empty_return) {
24646 goto definition_deferred;
24649 if (prop_definition) {
24651 /* If the definition is for something not known at this time,
24652 * we toss it, and go return the main property name, as that's
24653 * the one the user will be aware of */
24654 if (! is_invlist(prop_definition)) {
24655 SvREFCNT_dec_NN(prop_definition);
24656 goto definition_deferred;
24659 sv_2mortal(prop_definition);
24663 return prop_definition;
24665 } /* End of calling the subroutine for the user-defined property */
24666 } /* End of it could be a user-defined property */
24668 /* Here it wasn't a user-defined property that is known at this time. See
24669 * if it is a Unicode property */
24671 lookup_len = j; /* This is a more mnemonic name than 'j' */
24673 /* Get the index into our pointer table of the inversion list corresponding
24674 * to the property */
24675 table_index = do_uniprop_match(lookup_name, lookup_len);
24677 /* If it didn't find the property ... */
24678 if (table_index == 0) {
24680 /* Try again stripping off any initial 'Is'. This is because we
24681 * promise that an initial Is is optional. The same isn't true of
24682 * names that start with 'In'. Those can match only blocks, and the
24683 * lookup table already has those accounted for. The lookup table also
24684 * has already accounted for Perl extensions (without and = sign)
24685 * starting with 'i's'. */
24686 if (starts_with_Is && equals_pos >= 0) {
24692 table_index = do_uniprop_match(lookup_name, lookup_len);
24695 if (table_index == 0) {
24698 /* Here, we didn't find it. If not a numeric type property, and
24699 * can't be a user-defined one, it isn't a legal property */
24700 if (! is_nv_type) {
24701 if (! could_be_user_defined) {
24705 /* Here, the property name is legal as a user-defined one. At
24706 * compile time, it might just be that the subroutine for that
24707 * property hasn't been encountered yet, but at runtime, it's
24708 * an error to try to use an undefined one */
24709 if (! deferrable) {
24710 goto unknown_user_defined;;
24713 goto definition_deferred;
24714 } /* End of isn't a numeric type property */
24716 /* The numeric type properties need more work to decide. What we
24717 * do is make sure we have the number in canonical form and look
24720 if (slash_pos < 0) { /* No slash */
24722 /* When it isn't a rational, take the input, convert it to a
24723 * NV, then create a canonical string representation of that
24727 SSize_t value_len = lookup_len - equals_pos;
24729 /* Get the value */
24730 if ( value_len <= 0
24731 || my_atof3(lookup_name + equals_pos, &value,
24733 != lookup_name + lookup_len)
24738 /* If the value is an integer, the canonical value is integral
24740 if (Perl_ceil(value) == value) {
24741 canonical = Perl_form(aTHX_ "%.*s%.0" NVff,
24742 equals_pos, lookup_name, value);
24744 else { /* Otherwise, it is %e with a known precision */
24747 canonical = Perl_form(aTHX_ "%.*s%.*" NVef,
24748 equals_pos, lookup_name,
24749 PL_E_FORMAT_PRECISION, value);
24751 /* The exponent generated is expecting two digits, whereas
24752 * %e on some systems will generate three. Remove leading
24753 * zeros in excess of 2 from the exponent. We start
24754 * looking for them after the '=' */
24755 exp_ptr = strchr(canonical + equals_pos, 'e');
24757 char * cur_ptr = exp_ptr + 2; /* past the 'e[+-]' */
24758 SSize_t excess_exponent_len = strlen(cur_ptr) - 2;
24760 assert(*(cur_ptr - 1) == '-' || *(cur_ptr - 1) == '+');
24762 if (excess_exponent_len > 0) {
24763 SSize_t leading_zeros = strspn(cur_ptr, "0");
24764 SSize_t excess_leading_zeros
24765 = MIN(leading_zeros, excess_exponent_len);
24766 if (excess_leading_zeros > 0) {
24767 Move(cur_ptr + excess_leading_zeros,
24769 strlen(cur_ptr) - excess_leading_zeros
24770 + 1, /* Copy the NUL as well */
24777 else { /* Has a slash. Create a rational in canonical form */
24778 UV numerator, denominator, gcd, trial;
24779 const char * end_ptr;
24780 const char * sign = "";
24782 /* We can't just find the numerator, denominator, and do the
24783 * division, then use the method above, because that is
24784 * inexact. And the input could be a rational that is within
24785 * epsilon (given our precision) of a valid rational, and would
24786 * then incorrectly compare valid.
24788 * We're only interested in the part after the '=' */
24789 const char * this_lookup_name = lookup_name + equals_pos;
24790 lookup_len -= equals_pos;
24791 slash_pos -= equals_pos;
24793 /* Handle any leading minus */
24794 if (this_lookup_name[0] == '-') {
24796 this_lookup_name++;
24801 /* Convert the numerator to numeric */
24802 end_ptr = this_lookup_name + slash_pos;
24803 if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) {
24807 /* It better have included all characters before the slash */
24808 if (*end_ptr != '/') {
24812 /* Set to look at just the denominator */
24813 this_lookup_name += slash_pos;
24814 lookup_len -= slash_pos;
24815 end_ptr = this_lookup_name + lookup_len;
24817 /* Convert the denominator to numeric */
24818 if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) {
24822 /* It better be the rest of the characters, and don't divide by
24824 if ( end_ptr != this_lookup_name + lookup_len
24825 || denominator == 0)
24830 /* Get the greatest common denominator using
24831 http://en.wikipedia.org/wiki/Euclidean_algorithm */
24833 trial = denominator;
24834 while (trial != 0) {
24836 trial = gcd % trial;
24840 /* If already in lowest possible terms, we have already tried
24841 * looking this up */
24846 /* Reduce the rational, which should put it in canonical form
24849 denominator /= gcd;
24851 canonical = Perl_form(aTHX_ "%.*s%s%" UVuf "/%" UVuf,
24852 equals_pos, lookup_name, sign, numerator, denominator);
24855 /* Here, we have the number in canonical form. Try that */
24856 table_index = do_uniprop_match(canonical, strlen(canonical));
24857 if (table_index == 0) {
24860 } /* End of still didn't find the property in our table */
24861 } /* End of didn't find the property in our table */
24863 /* Here, we have a non-zero return, which is an index into a table of ptrs.
24864 * A negative return signifies that the real index is the absolute value,
24865 * but the result needs to be inverted */
24866 if (table_index < 0) {
24867 invert_return = TRUE;
24868 table_index = -table_index;
24871 /* Out-of band indices indicate a deprecated property. The proper index is
24872 * modulo it with the table size. And dividing by the table size yields
24873 * an offset into a table constructed by regen/mk_invlists.pl to contain
24874 * the corresponding warning message */
24875 if (table_index > MAX_UNI_KEYWORD_INDEX) {
24876 Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX;
24877 table_index %= MAX_UNI_KEYWORD_INDEX;
24878 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED),
24879 "Use of '%.*s' in \\p{} or \\P{} is deprecated because: %s",
24880 (int) name_len, name,
24881 get_deprecated_property_msg(warning_offset));
24884 /* In a few properties, a different property is used under /i. These are
24885 * unlikely to change, so are hard-coded here. */
24887 if ( table_index == UNI_XPOSIXUPPER
24888 || table_index == UNI_XPOSIXLOWER
24889 || table_index == UNI_TITLE)
24891 table_index = UNI_CASED;
24893 else if ( table_index == UNI_UPPERCASELETTER
24894 || table_index == UNI_LOWERCASELETTER
24895 # ifdef UNI_TITLECASELETTER /* Missing from early Unicodes */
24896 || table_index == UNI_TITLECASELETTER
24899 table_index = UNI_CASEDLETTER;
24901 else if ( table_index == UNI_POSIXUPPER
24902 || table_index == UNI_POSIXLOWER)
24904 table_index = UNI_POSIXALPHA;
24908 /* Create and return the inversion list */
24909 prop_definition = get_prop_definition(table_index);
24910 sv_2mortal(prop_definition);
24912 /* See if there is a private use override to add to this definition */
24914 COPHH * hinthash = (IN_PERL_COMPILETIME)
24915 ? CopHINTHASH_get(&PL_compiling)
24916 : CopHINTHASH_get(PL_curcop);
24917 SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
24919 if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
24921 /* See if there is an element in the hints hash for this table */
24922 SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
24923 const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
24927 SV * pu_definition;
24929 SV * expanded_prop_definition =
24930 sv_2mortal(invlist_clone(prop_definition, NULL));
24932 /* If so, it's definition is the string from here to the next
24933 * \a character. And its format is the same as a user-defined
24935 pos += SvCUR(pu_lookup);
24936 pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
24937 pu_invlist = handle_user_defined_property(lookup_name,
24940 0, /* Not folded */
24948 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24949 sv_catpvs(msg, "Insecure private-use override");
24950 goto append_name_to_msg;
24953 /* For now, as a safety measure, make sure that it doesn't
24954 * override non-private use code points */
24955 _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
24957 /* Add it to the list to be returned */
24958 _invlist_union(prop_definition, pu_invlist,
24959 &expanded_prop_definition);
24960 prop_definition = expanded_prop_definition;
24961 Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
24966 if (invert_return) {
24967 _invlist_invert(prop_definition);
24969 return prop_definition;
24971 unknown_user_defined:
24972 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24973 sv_catpvs(msg, "Unknown user-defined property name");
24974 goto append_name_to_msg;
24977 if (non_pkg_begin != 0) {
24978 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24979 sv_catpvs(msg, "Illegal user-defined property name");
24982 if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
24983 sv_catpvs(msg, "Can't find Unicode property definition");
24987 append_name_to_msg:
24989 const char * prefix = (runtime && level == 0) ? " \\p{" : " \"";
24990 const char * suffix = (runtime && level == 0) ? "}" : "\"";
24992 sv_catpv(msg, prefix);
24993 Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name));
24994 sv_catpv(msg, suffix);
24999 definition_deferred:
25002 bool is_qualified = non_pkg_begin != 0; /* If has "::" */
25004 /* Here it could yet to be defined, so defer evaluation of this until
25005 * its needed at runtime. We need the fully qualified property name to
25006 * avoid ambiguity */
25008 fq_name = S_get_fq_name(aTHX_ name, name_len, is_utf8,
25012 /* If it didn't come with a package, or the package is utf8::, this
25013 * actually could be an official Unicode property whose inclusion we
25014 * are deferring until runtime to make sure that it isn't overridden by
25015 * a user-defined property of the same name (which we haven't
25016 * encountered yet). Add a marker to indicate this possibility, for
25017 * use at such time when we first need the definition during pattern
25018 * matching execution */
25019 if (! is_qualified || memBEGINPs(name, non_pkg_begin, "utf8::")) {
25020 sv_catpvs(fq_name, DEFERRED_COULD_BE_OFFICIAL_MARKERs);
25023 /* We also need a trailing newline */
25024 sv_catpvs(fq_name, "\n");
25026 *user_defined_ptr = TRUE;
25032 S_handle_names_wildcard(pTHX_ const char * wname, /* wildcard name to match */
25033 const STRLEN wname_len, /* Its length */
25034 SV ** prop_definition,
25037 /* Deal with Name property wildcard subpatterns; returns TRUE if there were
25038 * any matches, adding them to prop_definition */
25042 CV * get_names_info; /* entry to charnames.pm to get info we need */
25043 SV * names_string; /* Contains all character names, except algo */
25044 SV * algorithmic_names; /* Contains info about algorithmically
25045 generated character names */
25046 REGEXP * subpattern_re; /* The user's pattern to match with */
25047 struct regexp * prog; /* The compiled pattern */
25048 char * all_names_start; /* lib/unicore/Name.pl string of every
25049 (non-algorithmic) character name */
25050 char * cur_pos; /* We match, effectively using /gc; this is
25051 where we are now */
25052 bool found_matches = FALSE; /* Did any name match so far? */
25053 SV * empty; /* For matching zero length names */
25054 SV * must_sv; /* Contains the substring, if any, that must be
25055 in a name for the subpattern to match */
25056 const char * must; /* The PV of 'must' */
25057 STRLEN must_len; /* And its length */
25058 SV * syllable_name = NULL; /* For Hangul syllables */
25059 const char hangul_prefix[] = "HANGUL SYLLABLE ";
25060 const STRLEN hangul_prefix_len = sizeof(hangul_prefix) - 1;
25062 /* By inspection, there are a maximum of 7 bytes in the suffix of a hangul
25063 * syllable name, and these are immutable and guaranteed by the Unicode
25064 * standard to never be extended */
25065 const STRLEN syl_max_len = hangul_prefix_len + 7;
25069 PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD;
25071 /* Make sure _charnames is loaded. (The parameters give context
25072 * for any errors generated */
25073 get_names_info = get_cv("_charnames::_get_names_info", 0);
25074 if (! get_names_info) {
25075 Perl_croak(aTHX_ "panic: Can't find '_charnames::_get_names_info");
25078 /* Get the charnames data */
25079 PUSHSTACKi(PERLSI_REGCOMP);
25087 /* Special _charnames entry point that returns the info this routine
25089 call_sv(MUTABLE_SV(get_names_info), G_ARRAY);
25093 /* Data structure for names which end in their very own code points */
25094 algorithmic_names = POPs;
25095 SvREFCNT_inc_simple_void_NN(algorithmic_names);
25097 /* The lib/unicore/Name.pl string */
25098 names_string = POPs;
25099 SvREFCNT_inc_simple_void_NN(names_string);
25106 if ( ! SvROK(names_string)
25107 || ! SvROK(algorithmic_names))
25108 { /* Perhaps should panic instead XXX */
25109 SvREFCNT_dec(names_string);
25110 SvREFCNT_dec(algorithmic_names);
25114 names_string = sv_2mortal(SvRV(names_string));
25115 all_names_start = SvPVX(names_string);
25116 cur_pos = all_names_start;
25118 algorithmic_names= sv_2mortal(SvRV(algorithmic_names));
25120 /* Compile the subpattern consisting of the name being looked for */
25121 subpattern_re = compile_wildcard(wname, wname_len, FALSE /* /-i */ );
25123 must_sv = re_intuit_string(subpattern_re);
25125 /* regexec.c can free the re_intuit_string() return. GH #17734 */
25126 must_sv = sv_2mortal(newSVsv(must_sv));
25127 must = SvPV(must_sv, must_len);
25134 /* (Note: 'must' could contain a NUL. And yet we use strspn() below on it.
25135 * This works because the NUL causes the function to return early, thus
25136 * showing that there are characters in it other than the acceptable ones,
25137 * which is our desired result.) */
25139 prog = ReANY(subpattern_re);
25141 /* If only nothing is matched, skip to where empty names are looked for */
25142 if (prog->maxlen == 0) {
25146 /* And match against the string of all names /gc. Don't even try if it
25147 * must match a character not found in any name. */
25148 if (strspn(must, "\n -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ()") == must_len)
25150 while (execute_wildcard(subpattern_re,
25152 SvEND(names_string),
25153 all_names_start, 0,
25156 { /* Here, matched. */
25158 /* Note the string entries look like
25159 * 00001\nSTART OF HEADING\n\n
25160 * so we could match anywhere in that string. We have to rule out
25161 * matching a code point line */
25162 char * this_name_start = all_names_start
25163 + RX_OFFS(subpattern_re)->start;
25164 char * this_name_end = all_names_start
25165 + RX_OFFS(subpattern_re)->end;
25168 UV cp = 0; /* Silences some compilers */
25169 AV * this_string = NULL;
25170 bool is_multi = FALSE;
25172 /* If matched nothing, advance to next possible match */
25173 if (this_name_start == this_name_end) {
25174 cur_pos = (char *) memchr(this_name_end + 1, '\n',
25175 SvEND(names_string) - this_name_end);
25176 if (cur_pos == NULL) {
25181 /* Position the next match to start beyond the current returned
25183 cur_pos = (char *) memchr(this_name_end, '\n',
25184 SvEND(names_string) - this_name_end);
25187 /* Back up to the \n just before the beginning of the character. */
25188 cp_end = (char *) my_memrchr(all_names_start,
25190 this_name_start - all_names_start);
25192 /* If we didn't find a \n, it means it matched somewhere in the
25193 * initial '00000' in the string, so isn't a real match */
25194 if (cp_end == NULL) {
25198 this_name_start = cp_end + 1; /* The name starts just after */
25199 cp_end--; /* the \n, and the code point */
25200 /* ends just before it */
25202 /* All code points are 5 digits long */
25203 cp_start = cp_end - 4;
25205 /* This shouldn't happen, as we found a \n, and the first \n is
25206 * further along than what we subtracted */
25207 assert(cp_start >= all_names_start);
25209 if (cp_start == all_names_start) {
25210 *prop_definition = add_cp_to_invlist(*prop_definition, 0);
25214 /* If the character is a blank, we either have a named sequence, or
25215 * something is wrong */
25216 if (*(cp_start - 1) == ' ') {
25217 cp_start = (char *) my_memrchr(all_names_start,
25219 cp_start - all_names_start);
25223 assert(cp_start != NULL && cp_start >= all_names_start + 2);
25225 /* Except for the first line in the string, the sequence before the
25226 * code point is \n\n. If that isn't the case here, we didn't
25227 * match the name of a character. (We could have matched a named
25228 * sequence, not currently handled */
25229 if (*(cp_start - 1) != '\n' || *(cp_start - 2) != '\n') {
25233 /* We matched! Add this to the list */
25234 found_matches = TRUE;
25236 /* Loop through all the code points in the sequence */
25237 while (cp_start < cp_end) {
25239 /* Calculate this code point from its 5 digits */
25240 cp = (XDIGIT_VALUE(cp_start[0]) << 16)
25241 + (XDIGIT_VALUE(cp_start[1]) << 12)
25242 + (XDIGIT_VALUE(cp_start[2]) << 8)
25243 + (XDIGIT_VALUE(cp_start[3]) << 4)
25244 + XDIGIT_VALUE(cp_start[4]);
25246 cp_start += 6; /* Go past any blank */
25248 if (cp_start < cp_end || is_multi) {
25249 if (this_string == NULL) {
25250 this_string = newAV();
25254 av_push(this_string, newSVuv(cp));
25258 if (is_multi) { /* Was more than one code point */
25259 if (*strings == NULL) {
25260 *strings = newAV();
25263 av_push(*strings, (SV *) this_string);
25265 else { /* Only a single code point */
25266 *prop_definition = add_cp_to_invlist(*prop_definition, cp);
25268 } /* End of loop through the non-algorithmic names string */
25271 /* There are also character names not in 'names_string'. These are
25272 * algorithmically generatable. Try this pattern on each possible one.
25273 * (khw originally planned to leave this out given the large number of
25274 * matches attempted; but the speed turned out to be quite acceptable
25276 * There are plenty of opportunities to optimize to skip many of the tests.
25277 * beyond the rudimentary ones already here */
25279 /* First see if the subpattern matches any of the algorithmic generatable
25280 * Hangul syllable names.
25282 * We know none of these syllable names will match if the input pattern
25283 * requires more bytes than any syllable has, or if the input pattern only
25284 * matches an empty name, or if the pattern has something it must match and
25285 * one of the characters in that isn't in any Hangul syllable. */
25286 if ( prog->minlen <= (SSize_t) syl_max_len
25287 && prog->maxlen > 0
25288 && (strspn(must, "\n ABCDEGHIJKLMNOPRSTUWY") == must_len))
25290 /* These constants, names, values, and algorithm are adapted from the
25291 * Unicode standard, version 5.1, section 3.12, and should never
25293 const char * JamoL[] = {
25294 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
25295 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
25297 const int LCount = C_ARRAY_LENGTH(JamoL);
25299 const char * JamoV[] = {
25300 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA",
25301 "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI",
25304 const int VCount = C_ARRAY_LENGTH(JamoV);
25306 const char * JamoT[] = {
25307 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L",
25308 "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M", "B",
25309 "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
25311 const int TCount = C_ARRAY_LENGTH(JamoT);
25315 /* This is the initial Hangul syllable code point; each time through the
25316 * inner loop, it maps to the next higher code point. For more info,
25317 * see the Hangul syllable section of the Unicode standard. */
25320 syllable_name = sv_2mortal(newSV(syl_max_len));
25321 sv_setpvn(syllable_name, hangul_prefix, hangul_prefix_len);
25323 for (L = 0; L < LCount; L++) {
25324 for (V = 0; V < VCount; V++) {
25325 for (T = 0; T < TCount; T++) {
25327 /* Truncate back to the prefix, which is unvarying */
25328 SvCUR_set(syllable_name, hangul_prefix_len);
25330 sv_catpv(syllable_name, JamoL[L]);
25331 sv_catpv(syllable_name, JamoV[V]);
25332 sv_catpv(syllable_name, JamoT[T]);
25334 if (execute_wildcard(subpattern_re,
25335 SvPVX(syllable_name),
25336 SvEND(syllable_name),
25337 SvPVX(syllable_name), 0,
25341 *prop_definition = add_cp_to_invlist(*prop_definition,
25343 found_matches = TRUE;
25352 /* The rest of the algorithmically generatable names are of the form
25353 * "PREFIX-code_point". The prefixes and the code point limits of each
25354 * were returned to us in the array 'algorithmic_names' from data in
25355 * lib/unicore/Name.pm. 'code_point' in the name is expressed in hex. */
25356 for (i = 0; i <= av_top_index((AV *) algorithmic_names); i++) {
25359 /* Each element of the array is a hash, giving the details for the
25360 * series of names it covers. There is the base name of the characters
25361 * in the series, and the low and high code points in the series. And,
25362 * for optimization purposes a string containing all the legal
25363 * characters that could possibly be in a name in this series. */
25364 HV * this_series = (HV *) SvRV(* av_fetch((AV *) algorithmic_names, i, 0));
25365 SV * prefix = * hv_fetchs(this_series, "name", 0);
25366 IV low = SvIV(* hv_fetchs(this_series, "low", 0));
25367 IV high = SvIV(* hv_fetchs(this_series, "high", 0));
25368 char * legal = SvPVX(* hv_fetchs(this_series, "legal", 0));
25370 /* Pre-allocate an SV with enough space */
25371 SV * algo_name = sv_2mortal(Perl_newSVpvf(aTHX_ "%s-0000",
25373 if (high >= 0x10000) {
25374 sv_catpvs(algo_name, "0");
25377 /* This series can be skipped entirely if the pattern requires
25378 * something longer than any name in the series, or can only match an
25379 * empty name, or contains a character not found in any name in the
25381 if ( prog->minlen <= (SSize_t) SvCUR(algo_name)
25382 && prog->maxlen > 0
25383 && (strspn(must, legal) == must_len))
25385 for (j = low; j <= high; j++) { /* For each code point in the series */
25387 /* Get its name, and see if it matches the subpattern */
25388 Perl_sv_setpvf(aTHX_ algo_name, "%s-%X", SvPVX(prefix),
25391 if (execute_wildcard(subpattern_re,
25394 SvPVX(algo_name), 0,
25398 *prop_definition = add_cp_to_invlist(*prop_definition, j);
25399 found_matches = TRUE;
25406 /* Finally, see if the subpattern matches an empty string */
25407 empty = newSVpvs("");
25408 if (execute_wildcard(subpattern_re,
25415 /* Many code points have empty names. Currently these are the \p{GC=C}
25416 * ones, minus CC and CF */
25418 SV * empty_names_ref = get_prop_definition(UNI_C);
25419 SV * empty_names = invlist_clone(empty_names_ref, NULL);
25421 SV * subtract = get_prop_definition(UNI_CC);
25423 _invlist_subtract(empty_names, subtract, &empty_names);
25424 SvREFCNT_dec_NN(empty_names_ref);
25425 SvREFCNT_dec_NN(subtract);
25427 subtract = get_prop_definition(UNI_CF);
25428 _invlist_subtract(empty_names, subtract, &empty_names);
25429 SvREFCNT_dec_NN(subtract);
25431 _invlist_union(*prop_definition, empty_names, prop_definition);
25432 found_matches = TRUE;
25433 SvREFCNT_dec_NN(empty_names);
25435 SvREFCNT_dec_NN(empty);
25438 /* If we ever were to accept aliases for, say private use names, we would
25439 * need to do something fancier to find empty names. The code below works
25440 * (at the time it was written), and is slower than the above */
25441 const char empties_pat[] = "^.";
25442 if (strNE(name, empties_pat)) {
25443 SV * empty = newSVpvs("");
25444 if (execute_wildcard(subpattern_re,
25451 SV * empties = NULL;
25453 (void) handle_names_wildcard(empties_pat, strlen(empties_pat), &empties);
25455 _invlist_union_complement_2nd(*prop_definition, empties, prop_definition);
25456 SvREFCNT_dec_NN(empties);
25458 found_matches = TRUE;
25460 SvREFCNT_dec_NN(empty);
25464 SvREFCNT_dec_NN(subpattern_re);
25465 return found_matches;
25469 * ex: set ts=8 sts=4 sw=4 et: